| author | stevel@tonic-gate <none@none> | 2005-06-14 00:00:00 -0700 |
|---|---|---|
| committer | stevel@tonic-gate <none@none> | 2005-06-14 00:00:00 -0700 |
| commit | 7c478bd95313f5f23a4c958a745db2134aa03244 (patch) | |
| tree | c871e58545497667cbb4b0a4f2daf204743e1fe7 | /usr/src/uts/common/vm |
| download | illumos-joyent-7c478bd95313f5f23a4c958a745db2134aa03244.tar.gz | |
OpenSolaris Launch
Diffstat (limited to 'usr/src/uts/common/vm')
39 files changed, 47175 insertions, 0 deletions
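Among the headers added below, vm/anon.h defines the global (vnode, offset) → anon-slot hash used to find the anon structure backing an anonymous page. The following is a minimal user-level sketch of how its ANON_HASH() and AH_LOCK() macros pick a hash bucket and a hash-lock index; PAGESHIFT, ANON_HASH_SIZE, and the sample (vp, off) values are stand-ins for illustration, since the kernel sizes the real table at boot and hashes actual vnode pointers.

```c
/*
 * Minimal sketch of the ANON_HASH()/AH_LOCK() computation from the
 * new vm/anon.h.  PAGESHIFT, ANON_HASH_SIZE, and the sample (vp, off)
 * values are stand-ins chosen for this example only.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGESHIFT	12			/* assume 4K pages */
#define ANON_HASH_SIZE	8192			/* stand-in; a power of two */
#define AH_LOCK_SIZE	64

#define ANON_HASH(VP, OFF) \
	((((uintptr_t)(VP) >> 7) ^ ((OFF) >> PAGESHIFT)) & (ANON_HASH_SIZE - 1))
#define AH_LOCK(vp, off)	(ANON_HASH((vp), (off)) & (AH_LOCK_SIZE - 1))

int
main(void)
{
	void *vp = (void *)(uintptr_t)0x12345600u;	/* pretend vnode address */
	uint64_t off = 37u << PAGESHIFT;		/* pretend anon offset */

	printf("hash bucket = %zu\n", (size_t)ANON_HASH(vp, off));
	printf("hash lock   = %zu\n", (size_t)AH_LOCK(vp, off));
	return (0);
}
```

The vnode pointer is shifted down before the XOR, presumably because heap-allocated vnodes make the low address bits poor hash input; the offset is folded in at page granularity so consecutive pages of the same vnode spread across buckets.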
diff --git a/usr/src/uts/common/vm/Makefile b/usr/src/uts/common/vm/Makefile new file mode 100644 index 0000000000..fcd6582985 --- /dev/null +++ b/usr/src/uts/common/vm/Makefile @@ -0,0 +1,55 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2003 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +#ident "%Z%%M% %I% %E% SMI" +# + +# include global definitions +include ../../../Makefile.master + +HDRS= anon.h as.h faultcode.h hat.h kpm.h page.h pvn.h rm.h seg.h vpage.h \ + seg_dev.h seg_enum.h seg_kmem.h seg_kp.h seg_kpm.h seg_map.h \ + seg_vn.h seg_spt.h + +ROOTDIRS= $(ROOT)/usr/include/vm + +ROOTHDRS= $(HDRS:%=$(ROOTDIRS)/%) + +CHECKHDRS= $(HDRS:%.h=%.check) + +# install rule +$(ROOTDIRS)/%: % + $(INS.file) + +.KEEP_STATE: + +.PARALLEL: $(CHECKHDRS) + +install_h: $(ROOTDIRS) $(ROOTHDRS) + +$(ROOTDIRS): + $(INS.dir) + +check: $(CHECKHDRS) diff --git a/usr/src/uts/common/vm/anon.h b/usr/src/uts/common/vm/anon.h new file mode 100644 index 0000000000..466b939a75 --- /dev/null +++ b/usr/src/uts/common/vm/anon.h @@ -0,0 +1,461 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#ifndef _VM_ANON_H +#define _VM_ANON_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/cred.h> +#include <vm/seg.h> +#include <vm/vpage.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * VM - Anonymous pages. 
+ */ + +typedef unsigned long anoff_t; /* anon offsets */ + +/* + * Each anonymous page, either in memory or in swap, has an anon structure. + * The structure (slot) provides a level of indirection between anonymous pages + * and their backing store. + * + * (an_vp, an_off) names the vnode of the anonymous page for this slot. + * + * (an_pvp, an_poff) names the location of the physical backing store + * for the page this slot represents. If the name is null there is no + * associated physical store. The physical backing store location can + * change while the slot is in use. + * + * an_hash is a hash list of anon slots. The list is hashed by + * (an_vp, an_off) of the associated anonymous page and provides a + * method of going from the name of an anonymous page to its + * associated anon slot. + * + * an_refcnt holds a reference count which is the number of separate + * copies that will need to be created in case of copy-on-write. + * A refcnt > 0 protects the existence of the slot. The refcnt is + * initialized to 1 when the anon slot is created in anon_alloc(). + * If a client obtains an anon slot and allows multiple threads to + * share it, then it is the client's responsibility to insure that + * it does not allow one thread to try to reference the slot at the + * same time as another is trying to decrement the last count and + * destroy the anon slot. E.g., the seg_vn segment type protects + * against this with higher level locks. + */ + +struct anon { + struct vnode *an_vp; /* vnode of anon page */ + struct vnode *an_pvp; /* vnode of physical backing store */ + anoff_t an_off; /* offset of anon page */ + anoff_t an_poff; /* offset in vnode */ + struct anon *an_hash; /* hash table of anon slots */ + int an_refcnt; /* # of people sharing slot */ +}; + +#ifdef _KERNEL +/* + * The swapinfo_lock protects: + * swapinfo list + * individual swapinfo structures + * + * The anoninfo_lock protects: + * anoninfo counters + * + * The anonhash_lock protects: + * anon hash lists + * anon slot fields + * + * Fields in the anon slot which are read-only for the life of the slot + * (an_vp, an_off) do not require the anonhash_lock be held to access them. + * If you access a field without the anonhash_lock held you must be holding + * the slot with an_refcnt to make sure it isn't destroyed. + * To write (an_pvp, an_poff) in a given slot you must also hold the + * p_iolock of the anonymous page for slot. + */ +extern kmutex_t anoninfo_lock; +extern kmutex_t swapinfo_lock; +extern kmutex_t anonhash_lock[]; +extern pad_mutex_t anon_array_lock[]; +extern kcondvar_t anon_array_cv[]; + +/* + * Global hash table to provide a function from (vp, off) -> ap + */ +extern size_t anon_hash_size; +extern struct anon **anon_hash; +#define ANON_HASH_SIZE anon_hash_size +#define ANON_HASHAVELEN 4 +#define ANON_HASH(VP, OFF) \ +((((uintptr_t)(VP) >> 7) ^ ((OFF) >> PAGESHIFT)) & (ANON_HASH_SIZE - 1)) + +#define AH_LOCK_SIZE 64 +#define AH_LOCK(vp, off) (ANON_HASH((vp), (off)) & (AH_LOCK_SIZE -1)) + +#endif /* _KERNEL */ + +/* + * Declaration for the Global counters to accurately + * track the kernel foot print in memory. + */ +extern pgcnt_t segvn_pages_locked; +extern pgcnt_t pages_locked; +extern pgcnt_t pages_claimed; +extern pgcnt_t pages_useclaim; +extern pgcnt_t obp_pages; + +/* + * Anonymous backing store accounting structure for swapctl. 
+ * + * ani_max = maximum amount of swap space + * (including potentially available physical memory) + * ani_free = amount of unallocated anonymous memory + * (some of which might be reserved and including + * potentially available physical memory) + * ani_resv = amount of claimed (reserved) anonymous memory + * + * The swap data can be aquired more efficiently through the + * kstats interface. + * Total slots currently available for reservation = + * MAX(ani_max - ani_resv, 0) + (availrmem - swapfs_minfree) + */ +struct anoninfo { + pgcnt_t ani_max; + pgcnt_t ani_free; + pgcnt_t ani_resv; +}; + +#ifdef _SYSCALL32 +struct anoninfo32 { + size32_t ani_max; + size32_t ani_free; + size32_t ani_resv; +}; +#endif /* _SYSCALL32 */ + +/* + * Define the NCPU pool of the ani_free counters. Update the counter + * of the cpu on which the thread is running and in every clock intr + * sync anoninfo.ani_free with the current total off all the NCPU entries. + */ + +typedef struct ani_free { + kmutex_t ani_lock; + pgcnt_t ani_count; + uchar_t pad[64 - sizeof (kmutex_t) - sizeof (pgcnt_t)]; + /* XXX 64 = cacheline size */ +} ani_free_t; + +#define ANI_MAX_POOL 128 +extern ani_free_t ani_free_pool[]; + +#define ANI_ADD(inc) { \ + ani_free_t *anifp; \ + int index; \ + index = (CPU->cpu_id & (ANI_MAX_POOL - 1)); \ + anifp = &ani_free_pool[index]; \ + mutex_enter(&anifp->ani_lock); \ + anifp->ani_count += inc; \ + mutex_exit(&anifp->ani_lock); \ +} + +/* + * Anon array pointers are allocated in chunks. Each chunk + * has PAGESIZE/sizeof(u_long *) of anon pointers. + * There are two levels of arrays for anon array pointers larger + * than a chunk. The first level points to anon array chunks. + * The second level consists of chunks of anon pointers. + * + * If anon array is smaller than a chunk then the whole anon array + * is created (memory is allocated for whole anon array). + * If anon array is larger than a chunk only first level array is + * allocated. Then other arrays (chunks) are allocated only when + * they are initialized with anon pointers. + */ +struct anon_hdr { + kmutex_t serial_lock; /* serialize array chunk allocation */ + pgcnt_t size; /* number of pointers to (anon) pages */ + void **array_chunk; /* pointers to anon pointers or chunks of */ + /* anon pointers */ + int flags; /* ANON_ALLOC_FORCE force preallocation of */ + /* whole anon array */ +}; + +#ifdef _LP64 +#define ANON_PTRSHIFT 3 +#define ANON_PTRMASK ~7 +#else +#define ANON_PTRSHIFT 2 +#define ANON_PTRMASK ~3 +#endif + +#define ANON_CHUNK_SIZE (PAGESIZE >> ANON_PTRSHIFT) +#define ANON_CHUNK_SHIFT (PAGESHIFT - ANON_PTRSHIFT) +#define ANON_CHUNK_OFF (ANON_CHUNK_SIZE - 1) + +/* + * Anon flags. + */ +#define ANON_SLEEP 0x0 /* ok to block */ +#define ANON_NOSLEEP 0x1 /* non-blocking call */ +#define ANON_ALLOC_FORCE 0x2 /* force single level anon array */ +#define ANON_GROWDOWN 0x4 /* anon array should grow downward */ + +/* + * The anon_map structure is used by various clients of the anon layer to + * manage anonymous memory. When anonymous memory is shared, + * then the different clients sharing it will point to the + * same anon_map structure. Also, if a segment is unmapped + * in the middle where an anon_map structure exists, the + * newly created segment will also share the anon_map structure, + * although the two segments will use different ranges of the + * anon array. 
When mappings are private (or shared with + * a reference count of 1), an unmap operation will free up + * a range of anon slots in the array given by the anon_map + * structure. Because of fragmentation due to this unmapping, + * we have to store the size of the anon array in the anon_map + * structure so that we can free everything when the referernce + * count goes to zero. + * + * A new rangelock scheme is introduced to make the anon layer scale. + * A reader/writer lock per anon_amp and an array of system-wide hash + * locks, anon_array_lock[] are introduced to replace serial_lock and + * anonmap lock. The writer lock is held when we want to singlethreaD + * the reference to the anon array pointers or when references to + * anon_map's members, whereas reader lock and anon_array_lock are + * held to allows multiple threads to reference different part of + * anon array. A global set of condition variables, anon_array_cv, + * are used with anon_array_lock[] to make the hold time of the locks + * short. + * + * szc is used to calculate the index of hash locks and cv's. We + * could've just used seg->s_szc if not for the possible sharing of + * anon_amp between SYSV shared memory and ISM, so now we introduce + * szc in the anon_map structure. For MAP_SHARED, the amp->szc is either + * 0 (base page size) or page_num_pagesizes() - 1, while MAP_PRIVATE + * the amp->szc could be anything in [0, page_num_pagesizes() - 1]. + */ +struct anon_map { + krwlock_t a_rwlock; /* protect anon_map and anon array */ + size_t size; /* size in bytes mapped by the anon array */ + struct anon_hdr *ahp; /* anon array header pointer, containing */ + /* anon pointer array(s) */ + size_t swresv; /* swap space reserved for this anon_map */ + uint_t refcnt; /* reference count on this structure */ + ushort_t a_szc; /* max szc among shared processes */ + void *locality; /* lgroup locality info */ +}; + +#ifdef _KERNEL + +#define ANON_BUSY 0x1 +#define ANON_ISBUSY(slot) (*(slot) & ANON_BUSY) +#define ANON_SETBUSY(slot) (*(slot) |= ANON_BUSY) +#define ANON_CLRBUSY(slot) (*(slot) &= ~ANON_BUSY) + +#define ANON_MAP_SHIFT 6 /* log2(sizeof (struct anon_map)) */ +#define ANON_ARRAY_SHIFT 7 /* log2(ANON_LOCKSIZE) */ +#define ANON_LOCKSIZE 128 + +#define ANON_LOCK_ENTER(lock, type) rw_enter((lock), (type)) +#define ANON_LOCK_EXIT(lock) rw_exit((lock)) + +#define ANON_ARRAY_HASH(amp, idx)\ + ((((idx) + ((idx) >> ANON_ARRAY_SHIFT) +\ + ((idx) >> (ANON_ARRAY_SHIFT << 1)) +\ + ((idx) >> (ANON_ARRAY_SHIFT + (ANON_ARRAY_SHIFT << 1)))) ^\ + ((uintptr_t)(amp) >> ANON_MAP_SHIFT)) & (ANON_LOCKSIZE - 1)) + +typedef struct anon_sync_obj { + kmutex_t *sync_mutex; + kcondvar_t *sync_cv; + ulong_t *sync_data; +} anon_sync_obj_t; + +/* + * Anonymous backing store accounting structure for kernel. 
+ * ani_max = total reservable slots on physical (disk-backed) swap + * ani_phys_resv = total phys slots reserved for use by clients + * ani_mem_resv = total mem slots reserved for use by clients + * ani_free = # unallocated physical slots + # of reserved unallocated + * memory slots + */ + +/* + * Initial total swap slots available for reservation + */ +#define TOTAL_AVAILABLE_SWAP \ + (k_anoninfo.ani_max + MAX((spgcnt_t)(availrmem - swapfs_minfree), 0)) + +/* + * Swap slots currently available for reservation + */ +#define CURRENT_TOTAL_AVAILABLE_SWAP \ + ((k_anoninfo.ani_max - k_anoninfo.ani_phys_resv) + \ + MAX((spgcnt_t)(availrmem - swapfs_minfree), 0)) + +struct k_anoninfo { + pgcnt_t ani_max; /* total reservable slots on phys */ + /* (disk) swap */ + pgcnt_t ani_free; /* # of unallocated phys and mem slots */ + pgcnt_t ani_phys_resv; /* # of reserved phys (disk) slots */ + pgcnt_t ani_mem_resv; /* # of reserved mem slots */ + pgcnt_t ani_locked_swap; /* # of swap slots locked in reserved */ + /* mem swap */ +}; + +extern struct k_anoninfo k_anoninfo; + +extern void anon_init(void); +extern struct anon *anon_alloc(struct vnode *, anoff_t); +extern void anon_dup(struct anon_hdr *, ulong_t, + struct anon_hdr *, ulong_t, size_t); +extern void anon_dup_fill_holes(struct anon_hdr *, ulong_t, + struct anon_hdr *, ulong_t, size_t, uint_t, int); +extern int anon_fill_cow_holes(struct seg *, caddr_t, struct anon_hdr *, + ulong_t, struct vnode *, u_offset_t, size_t, uint_t, + uint_t, struct vpage [], struct cred *); +extern void anon_free(struct anon_hdr *, ulong_t, size_t); +extern void anon_free_pages(struct anon_hdr *, ulong_t, size_t, uint_t); +extern void anon_disclaim(struct anon_map *, ulong_t, size_t, int); +extern int anon_getpage(struct anon **, uint_t *, struct page **, + size_t, struct seg *, caddr_t, enum seg_rw, struct cred *); +extern int swap_getconpage(struct vnode *, u_offset_t, size_t, + uint_t *, page_t *[], size_t, page_t *, + spgcnt_t *, struct seg *, caddr_t, + enum seg_rw, struct cred *); +extern int anon_map_getpages(struct anon_map *, ulong_t, + uint_t, struct seg *, caddr_t, uint_t, + uint_t *, page_t *[], uint_t *, + struct vpage [], enum seg_rw, int, int, struct cred *); +extern int anon_map_privatepages(struct anon_map *, ulong_t, + uint_t, struct seg *, caddr_t, uint_t, + page_t *[], struct vpage [], int, struct cred *); +extern struct page *anon_private(struct anon **, struct seg *, + caddr_t, uint_t, struct page *, + int, struct cred *); +extern struct page *anon_zero(struct seg *, caddr_t, + struct anon **, struct cred *); +extern int anon_map_createpages(struct anon_map *, ulong_t, + size_t, struct page **, + struct seg *, caddr_t, + enum seg_rw, struct cred *); +extern int anon_map_demotepages(struct anon_map *, ulong_t, + struct seg *, caddr_t, uint_t, + struct vpage [], struct cred *); +extern int anon_resvmem(size_t, uint_t); +extern void anon_unresv(size_t); +extern struct anon_map *anonmap_alloc(size_t, size_t); +extern void anonmap_free(struct anon_map *); +extern void anon_decref(struct anon *); +extern int non_anon(struct anon_hdr *, ulong_t, u_offset_t *, size_t *); +extern pgcnt_t anon_pages(struct anon_hdr *, ulong_t, pgcnt_t); +extern int anon_swap_adjust(pgcnt_t); +extern void anon_swap_restore(pgcnt_t); +extern struct anon_hdr *anon_create(pgcnt_t, int); +extern void anon_release(struct anon_hdr *, pgcnt_t); +extern struct anon *anon_get_ptr(struct anon_hdr *, ulong_t); +extern ulong_t *anon_get_slot(struct anon_hdr *, ulong_t); +extern struct 
anon *anon_get_next_ptr(struct anon_hdr *, ulong_t *); +extern int anon_set_ptr(struct anon_hdr *, ulong_t, struct anon *, int); +extern int anon_copy_ptr(struct anon_hdr *, ulong_t, + struct anon_hdr *, ulong_t, pgcnt_t, int); +extern pgcnt_t anon_grow(struct anon_hdr *, ulong_t *, pgcnt_t, pgcnt_t, int); +extern void anon_array_enter(struct anon_map *, ulong_t, + anon_sync_obj_t *); +extern void anon_array_exit(anon_sync_obj_t *); + +/* + * anon_resv checks to see if there is enough swap space to fulfill a + * request and if so, reserves the appropriate anonymous memory resources. + * anon_checkspace just checks to see if there is space to fulfill the request, + * without taking any resources. Both return 1 if successful and 0 if not. + */ +#define anon_resv(size) anon_resvmem((size), 1) +#define anon_checkspace(size) anon_resvmem((size), 0) + +/* + * Flags to anon_private + */ +#define STEAL_PAGE 0x1 /* page can be stolen */ +#define LOCK_PAGE 0x2 /* page must be ``logically'' locked */ + +/* + * Flags to anon_disclaim + */ +#define ANON_PGLOOKUP_BLK 0x1 /* block on locked pages */ + +/* + * SEGKP ANON pages that are locked are assumed to be LWP stack pages + * and thus count towards the user pages locked count. + * This value is protected by the same lock as availrmem. + */ +extern pgcnt_t anon_segkp_pages_locked; + +extern int anon_debug; + +#ifdef ANON_DEBUG + +#define A_ANON 0x01 +#define A_RESV 0x02 +#define A_MRESV 0x04 + +/* vararg-like debugging macro. */ +#define ANON_PRINT(f, printf_args) \ + if (anon_debug & f) \ + printf printf_args + +#else /* ANON_DEBUG */ + +#define ANON_PRINT(f, printf_args) + +#endif /* ANON_DEBUG */ + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_ANON_H */ diff --git a/usr/src/uts/common/vm/as.h b/usr/src/uts/common/vm/as.h new file mode 100644 index 0000000000..c7afefc23c --- /dev/null +++ b/usr/src/uts/common/vm/as.h @@ -0,0 +1,290 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. 
+ */ + +#ifndef _VM_AS_H +#define _VM_AS_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/watchpoint.h> +#include <vm/seg.h> +#include <vm/faultcode.h> +#include <vm/hat.h> +#include <sys/avl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * VM - Address spaces. + */ + +/* + * Each address space consists of a sorted list of segments + * and machine dependent address translation information. + * + * All the hard work is in the segment drivers and the + * hardware address translation code. + * + * The segment list is represented as an AVL tree. + * + * The address space lock (a_lock) is a long term lock which serializes + * access to certain operations (as_map, as_unmap) and protects the + * underlying generic segment data (seg.h) along with some fields in the + * address space structure as shown below: + * + * address space structure segment structure + * + * a_segtree s_base + * a_size s_size + * a_lastgap s_link + * a_seglast s_ops + * s_as + * s_data + * + * The address space contents lock (a_contents) is a short term + * lock that protects most of the data in the address space structure. + * This lock is always acquired after the "a_lock" in all situations + * except while dealing with AS_CLAIMGAP to avoid deadlocks. + * + * The following fields are protected by this lock: + * + * a_flags (AS_PAGLCK, AS_CLAIMGAP, etc.) + * a_unmapwait + * a_seglast + * + * The address space lock (a_lock) is always held prior to any segment + * operation. Some segment drivers use the address space lock to protect + * some or all of their segment private data, provided the version of + * "a_lock" (read vs. write) is consistent with the use of the data. + * + * The following fields are protected by the hat layer lock: + * + * a_vbits + * a_hat + * a_hrm + */ + +struct as { + kmutex_t a_contents; /* protect certain fields in the structure */ + uchar_t a_flags; /* as attributes */ + uchar_t a_vbits; /* used for collecting statistics */ + kcondvar_t a_cv; /* used by as_rangelock */ + struct hat *a_hat; /* hat structure */ + struct hrmstat *a_hrm; /* ref and mod bits */ + caddr_t a_userlimit; /* highest allowable address in this as */ + struct seg *a_seglast; /* last segment hit on the addr space */ + krwlock_t a_lock; /* protects segment related fields */ + size_t a_size; /* size of address space */ + struct seg *a_lastgap; /* last seg found by as_gap() w/ AS_HI (mmap) */ + struct seg *a_lastgaphl; /* last seg saved in as_gap() either for */ + /* AS_HI or AS_LO used in as_addseg() */ + avl_tree_t a_segtree; /* segments in this address space. 
(AVL tree) */ + avl_tree_t a_wpage; /* watched pages (procfs) */ + uchar_t a_updatedir; /* mappings changed, rebuild a_objectdir */ + timespec_t a_updatetime; /* time when mappings last changed */ + vnode_t **a_objectdir; /* object directory (procfs) */ + size_t a_sizedir; /* size of object directory */ + struct as_callback *a_callbacks; /* callback list */ + void *a_xhat; /* list of xhat providers */ +}; + +#define AS_PAGLCK 0x80 +#define AS_CLAIMGAP 0x40 +#define AS_UNMAPWAIT 0x20 +#define AS_NEEDSPURGE 0x10 /* mostly for seg_nf, see as_purge() */ +#define AS_BUSY 0x01 /* needed by XHAT framework */ + +#define AS_ISPGLCK(as) ((as)->a_flags & AS_PAGLCK) +#define AS_ISCLAIMGAP(as) ((as)->a_flags & AS_CLAIMGAP) +#define AS_ISUNMAPWAIT(as) ((as)->a_flags & AS_UNMAPWAIT) +#define AS_ISBUSY(as) ((as)->a_flags & AS_BUSY) + + +#define AS_SETPGLCK(as) ((as)->a_flags |= AS_PAGLCK) +#define AS_SETCLAIMGAP(as) ((as)->a_flags |= AS_CLAIMGAP) +#define AS_SETUNMAPWAIT(as) ((as)->a_flags |= AS_UNMAPWAIT) +#define AS_SETBUSY(as) ((as)->a_flags |= AS_BUSY) + +#define AS_CLRPGLCK(as) ((as)->a_flags &= ~AS_PAGLCK) +#define AS_CLRCLAIMGAP(as) ((as)->a_flags &= ~AS_CLAIMGAP) +#define AS_CLRUNMAPWAIT(as) ((as)->a_flags &= ~AS_UNMAPWAIT) +#define AS_CLRBUSY(as) ((as)->a_flags &= ~AS_BUSY) + +#define AS_TYPE_64BIT(as) \ + (((as)->a_userlimit > (caddr_t)UINT32_MAX) ? 1 : 0) + +/* + * The as_callback is the basic structure which supports the ability to + * inform clients of specific events pertaining to address space management. + * A user calls as_add_callback to register an address space callback + * for a range of pages, specifying the events that need to occur. + * When as_do_callbacks is called and finds a 'matching' entry, the + * callback is called once, and the callback function MUST call + * as_delete_callback when all callback activities are complete. + * The thread calling as_do_callbacks blocks until the as_delete_callback + * is called. This allows for asynchorous events to subside before the + * as_do_callbacks thread continues. + * + * An example of the need for this is a driver which has done long-term + * locking of memory. Address space management operations (events) such + * as as_free, as_umap, and as_setprot will block indefinitely until the + * pertinent memory is unlocked. The callback mechanism provides the + * way to inform the driver of the event so that the driver may do the + * necessary unlocking. + * + * The contents of this structure is protected by a_contents lock + */ +typedef void (*callback_func_t)(struct as *, void *, uint_t); +struct as_callback { + struct as_callback *ascb_next; /* list link */ + uint_t ascb_events; /* event types */ + callback_func_t ascb_func; /* callback function */ + void *ascb_arg; /* callback argument */ + caddr_t ascb_saddr; /* start address */ + size_t ascb_len; /* address range */ +}; +/* + * Callback events + */ +#define AS_FREE_EVENT 0x1 +#define AS_SETPROT_EVENT 0x2 +#define AS_UNMAP_EVENT 0x4 +#define AS_CALLBACK_CALLED ((uint_t)(1U << (8 * sizeof (uint_t) - 1U))) +#define AS_UNMAPWAIT_EVENT \ + (AS_FREE_EVENT | AS_SETPROT_EVENT | AS_UNMAP_EVENT) +#define AS_ALL_EVENT \ + (AS_FREE_EVENT | AS_SETPROT_EVENT | AS_UNMAP_EVENT) + + +/* Return code values for as_callback_delete */ +enum as_cbdelete_rc { + AS_CALLBACK_DELETED, + AS_CALLBACK_NOTFOUND, + AS_CALLBACK_DELETE_DEFERRED +}; + +#ifdef _KERNEL + +/* + * Flags for as_gap. 
+ */ +#define AH_DIR 0x1 /* direction flag mask */ +#define AH_LO 0x0 /* find lowest hole */ +#define AH_HI 0x1 /* find highest hole */ +#define AH_CONTAIN 0x2 /* hole must contain `addr' */ + +extern struct as kas; /* kernel's address space */ + +/* + * Macros for address space locking. + */ +#define AS_LOCK_ENTER(as, lock, type) rw_enter((lock), (type)) +#define AS_LOCK_EXIT(as, lock) rw_exit((lock)) +#define AS_LOCK_DESTROY(as, lock) rw_destroy((lock)) +#define AS_LOCK_TRYENTER(as, lock, type) rw_tryenter((lock), (type)) + +/* + * Macros to test lock states. + */ +#define AS_LOCK_HELD(as, lock) RW_LOCK_HELD((lock)) +#define AS_READ_HELD(as, lock) RW_READ_HELD((lock)) +#define AS_WRITE_HELD(as, lock) RW_WRITE_HELD((lock)) + +/* + * macros to walk thru segment lists + */ +#define AS_SEGFIRST(as) avl_first(&(as)->a_segtree) +#define AS_SEGNEXT(as, seg) AVL_NEXT(&(as)->a_segtree, (seg)) +#define AS_SEGPREV(as, seg) AVL_PREV(&(as)->a_segtree, (seg)) + +void as_init(void); +void as_avlinit(struct as *); +struct seg *as_segat(struct as *as, caddr_t addr); +void as_rangelock(struct as *as); +void as_rangeunlock(struct as *as); +struct as *as_alloc(void); +void as_free(struct as *as); +int as_dup(struct as *as, struct as **outas); +struct seg *as_findseg(struct as *as, caddr_t addr, int tail); +int as_addseg(struct as *as, struct seg *newseg); +struct seg *as_removeseg(struct as *as, struct seg *seg); +faultcode_t as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size, + enum fault_type type, enum seg_rw rw); +faultcode_t as_faulta(struct as *as, caddr_t addr, size_t size); +int as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot); +int as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot); +int as_unmap(struct as *as, caddr_t addr, size_t size); +int as_map(struct as *as, caddr_t addr, size_t size, int ((*crfp)()), + void *argsp); +void as_purge(struct as *as); +int as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, + uint_t flags, caddr_t addr); +int as_memory(struct as *as, caddr_t *basep, size_t *lenp); +size_t as_swapout(struct as *as); +int as_incore(struct as *as, caddr_t addr, size_t size, char *vec, + size_t *sizep); +int as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr, + uintptr_t arg, ulong_t *lock_map, size_t pos); +int as_exec(struct as *oas, caddr_t ostka, size_t stksz, + struct as *nas, caddr_t nstka, uint_t hatflag); +int as_pagelock(struct as *as, struct page ***ppp, caddr_t addr, + size_t size, enum seg_rw rw); +void as_pageunlock(struct as *as, struct page **pp, caddr_t addr, + size_t size, enum seg_rw rw); +void as_pagereclaim(struct as *as, struct page **pp, caddr_t addr, + size_t size, enum seg_rw rw); +int as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc, + boolean_t wait); +void as_setwatch(struct as *as); +void as_clearwatch(struct as *as); +int as_getmemid(struct as *, caddr_t, memid_t *); + +int as_add_callback(struct as *, void (*)(), void *, uint_t, + caddr_t, size_t, int); +uint_t as_delete_callback(struct as *, void *); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_AS_H */ diff --git a/usr/src/uts/common/vm/faultcode.h b/usr/src/uts/common/vm/faultcode.h new file mode 100644 index 0000000000..82f886e00f --- /dev/null +++ b/usr/src/uts/common/vm/faultcode.h @@ -0,0 +1,76 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the 
"License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 1992 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#ifndef _VM_FAULTCODE_H +#define _VM_FAULTCODE_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * This file describes the data type returned by vm routines + * which handle faults. + * + * If FC_CODE(fc) == FC_OBJERR, then FC_ERRNO(fc) contains the errno value + * returned by the underlying object mapped at the fault address. + */ +#define FC_HWERR 0x1 /* misc hardware error (e.g. bus timeout) */ +#define FC_ALIGN 0x2 /* hardware alignment error */ +#define FC_OBJERR 0x3 /* underlying object returned errno value */ +#define FC_PROT 0x4 /* access exceeded current protections */ +#define FC_NOMAP 0x5 /* no mapping at the fault address */ +#define FC_NOSUPPORT 0x6 /* operation not supported by driver */ + +#define FC_MAKE_ERR(e) (((e) << 8) | FC_OBJERR) + +#define FC_CODE(fc) ((fc) & 0xff) +#define FC_ERRNO(fc) ((unsigned)(fc) >> 8) + +#ifndef _ASM +typedef int faultcode_t; /* type returned by vm fault routines */ +#endif /* _ASM */ + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_FAULTCODE_H */ diff --git a/usr/src/uts/common/vm/hat.c b/usr/src/uts/common/vm/hat.c new file mode 100644 index 0000000000..24d6e50b1a --- /dev/null +++ b/usr/src/uts/common/vm/hat.c @@ -0,0 +1,149 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/kmem.h> +#include <sys/sysmacros.h> +#include <sys/cmn_err.h> +#include <sys/systm.h> +#include <sys/modctl.h> +#include <sys/kobj.h> +#include <vm/hat.h> + +/* + * PSARC 2004/405 made hat_getkpfnum(9F) obsolete. As part of the + * obsolecense, the original documented behavior will begin to be + * enforced in the future; namely, hat_getkpfnum(9F) may _only_ + * be called with device-mapped memory virtual addresses. Since + * changing hat_getkpfnum(9F) to return PFN_INVALID on kernel memory + * would break a lot of modules without any warning, we've implemented + * the following mechanism as a stop-gap. In a future release, this + * can all be ripped out and hat_getkpfnum(9F) changed to return + * PFN_INVALID if it isn't called with a device-mapped memory address. + * + * We keep track of each module that has used hat_getkpfnum(9F) + * incorrectly. This allows us to avoid flooding the console/logs + * with too many warnings about a bad module that has already been + * flagged. + * + * On amd64 hat_getkpfnum() is never supported. + */ + +#if !defined(__amd64) + +#define HAT_STACK_MAXDEPTH 15 + +struct badcall_node { + char *bc_modname; + int bc_stackdepth; + pc_t bc_callstack[HAT_STACK_MAXDEPTH]; + struct badcall_node *bc_linkage; +}; + +static struct badcall_node *bad_getkpfnum_callers; + +/* + * Common VM HAT routines. + */ + +static void +printwarn(struct badcall_node *bc) +{ + int sf; + char *ksym; + ulong_t off; + + cmn_err(CE_WARN, "Module %s is using the obsolete hat_getkpfnum(9F)", + bc->bc_modname); + cmn_err(CE_CONT, "interface in a way that will not be supported in\n"); + cmn_err(CE_CONT, "a future release of Solaris. Please contact the\n"); + cmn_err(CE_CONT, "vendor that supplied the module for assistance,\n"); + cmn_err(CE_CONT, "or consult the Writing Device Drivers guide,\n"); + cmn_err(CE_CONT, "available from http://www.sun.com for migration\n"); + cmn_err(CE_CONT, "advice.\n"); + cmn_err(CE_CONT, "---\n"); + cmn_err(CE_CONT, "Callstack of bad caller:\n"); + + for (sf = 0; sf < bc->bc_stackdepth; sf++) { + ksym = kobj_getsymname(bc->bc_callstack[sf], &off); + cmn_err(CE_CONT, "\t%s+%lx\n", ksym? ksym : "?", off); + } +} + + +void +hat_getkpfnum_badcall(void *caller) +{ + struct badcall_node bcs; + char *modname = mod_containing_pc((caddr_t)caller); + struct badcall_node *bc; + +#ifdef __sparc + /* + * This is a hack until the ifb and jfb framebuffer drivers + * are fixed. Right now they use hat_getkpfnum() in a way that + * is really safe but will be incorrectly flagged as being + * buggy. + */ + if (strcmp(modname, "ifb") == 0 || strcmp(modname, "jfb") == 0) + return; +#elif defined(__i386) + /* + * This is a hack until these ethernet drivers can be fixed + * or EOL'd. hat_getkpfnum() will continue to work correctly + * until this list can be removed. + */ + if (strcmp(modname, "dnet") == 0 || strcmp(modname, "pcn") == 0 || + strcmp(modname, "adp") == 0 || strcmp(modname, "chs") == 0) + return; +#endif /* __sparc / __i386 */ + + for (bc = bad_getkpfnum_callers; bc != NULL; bc = bc->bc_linkage) + if (strcmp(bc->bc_modname, modname) == 0) + return; + + /* + * We haven't seen this caller before, so create a log of + * the callstack and module name, and emit a warning to the + * user. 
+ */ + bc = kmem_zalloc(sizeof (struct badcall_node), KM_NOSLEEP); + if (bc != NULL) { + bc->bc_linkage = bad_getkpfnum_callers; + bc->bc_modname = modname; + bad_getkpfnum_callers = bc; + } else { + bc = &bcs; + bc->bc_modname = modname; + } + + bc->bc_stackdepth = getpcstack(bc->bc_callstack, HAT_STACK_MAXDEPTH); + + printwarn(bc); +} +#endif /* __amd64 */ diff --git a/usr/src/uts/common/vm/hat.h b/usr/src/uts/common/vm/hat.h new file mode 100644 index 0000000000..b873f4e06e --- /dev/null +++ b/usr/src/uts/common/vm/hat.h @@ -0,0 +1,598 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#ifndef _VM_HAT_H +#define _VM_HAT_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/t_lock.h> +#include <vm/faultcode.h> +#include <sys/kstat.h> +#include <sys/siginfo.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * VM - Hardware Address Translation management. + * + * This file describes the machine independent interfaces to + * the hardware address translation management routines. Other + * machine specific interfaces and structures are defined + * in <vm/hat_xxx.h>. The hat layer manages the address + * translation hardware as a cache driven by calls from the + * higher levels of the VM system. + */ + +struct hat; +struct kpme; +struct memseg; + +#include <vm/page.h> + +/* + * a callback used with hat_unload_callback() + * start and end mark are set to a range of unloaded addresses + * and the function is invoked with a pointer to this data structure + */ +typedef struct hat_callback { + caddr_t hcb_start_addr; + caddr_t hcb_end_addr; + void (*hcb_function)(struct hat_callback *); + void *hcb_data; +} hat_callback_t; + +#ifdef _KERNEL + +/* + * One time hat initialization + */ +void hat_init(void); + +/* + * Notify hat of a system dump + */ +void hat_dump(void); + +/* + * Operations on an address space: + * + * struct hat *hat_alloc(as) + * allocated a hat structure for as. + * + * void hat_free_start(hat) + * informs hat layer process has finished executing but as has not + * been cleaned up yet. + * + * void hat_free_end(hat) + * informs hat layer as is being destroyed. hat layer cannot use as + * pointer after this call. 
+ * + * void hat_swapin(hat) + * allocate any hat resources required for process being swapped in. + * + * void hat_swapout(hat) + * deallocate hat resources for process being swapped out. + * + * size_t hat_get_mapped_size(hat) + * returns number of bytes that have valid mappings in hat. + * + * void hat_stats_enable(hat) + * void hat_stats_disable(hat) + * enables/disables collection of stats for hat. + * + * int hat_dup(parenthat, childhat, addr, len, flags) + * Duplicate address translations of the parent to the child. Supports + * the entire address range or a range depending on flag, + * zero returned on success, non-zero on error + * + * void hat_thread_exit(thread) + * Notifies the HAT that a thread is exiting, called after it has been + * reassigned to the kernel AS. + */ + +struct hat *hat_alloc(struct as *); +void hat_free_start(struct hat *); +void hat_free_end(struct hat *); +int hat_dup(struct hat *, struct hat *, caddr_t, size_t, uint_t); +void hat_swapin(struct hat *); +void hat_swapout(struct hat *); +size_t hat_get_mapped_size(struct hat *); +int hat_stats_enable(struct hat *); +void hat_stats_disable(struct hat *); +void hat_thread_exit(kthread_t *); + +/* + * Operations on a named address within a segment: + * + * void hat_memload(hat, addr, pp, attr, flags) + * load/lock the given page struct + * + * void hat_memload_array(hat, addr, len, ppa, attr, flags) + * load/lock the given array of page structs + * + * void hat_devload(hat, addr, len, pf, attr, flags) + * load/lock the given page frame number + * + * void hat_unlock(hat, addr, len) + * unlock a given range of addresses + * + * void hat_unload(hat, addr, len, flags) + * void hat_unload_callback(hat, addr, len, flags, callback) + * unload a given range of addresses (has optional callback) + * + * void hat_sync(hat, addr, len, flags) + * synchronize mapping with software data structures + * + * void hat_map(hat, addr, len, flags) + * + * void hat_setattr(hat, addr, len, attr) + * void hat_clrattr(hat, addr, len, attr) + * void hat_chgattr(hat, addr, len, attr) + * modify attributes for a range of addresses. skips any invalid mappings + * + * uint_t hat_getattr(hat, addr, *attr) + * returns attr for <hat,addr> in *attr. returns 0 if there was a + * mapping and *attr is valid, nonzero if there was no mapping and + * *attr is not valid. + * + * size_t hat_getpagesize(hat, addr) + * returns pagesize in bytes for <hat, addr>. returns -1 if there is + * no mapping. This is an advisory call. + * + * pfn_t hat_getpfnum(hat, addr) + * returns pfn for <hat, addr> or PFN_INVALID if mapping is invalid. + * + * pfn_t hat_getkpfnum(addr) + * returns pfn for non-memory mapped addr in kernel address space + * or PFN_INVALID if mapping is invalid or is kernel memory. + * + * int hat_probe(hat, addr) + * return 0 if no valid mapping is present. Faster version + * of hat_getattr in certain architectures. + * + * int hat_share(dhat, daddr, shat, saddr, len, szc) + * + * void hat_unshare(hat, addr, len, szc) + * + * void hat_chgprot(hat, addr, len, vprot) + * This is a deprecated call. New segment drivers should store + * all attributes and use hat_*attr calls. + * Change the protections in the virtual address range + * given to the specified virtual protection. If vprot is ~PROT_WRITE, + * then remove write permission, leaving the other permissions + * unchanged. If vprot is ~PROT_USER, remove user permissions. 
+ */ + +void hat_memload(struct hat *, caddr_t, struct page *, uint_t, uint_t); +void hat_memload_array(struct hat *, caddr_t, size_t, struct page **, + uint_t, uint_t); + +void hat_devload(struct hat *, caddr_t, size_t, pfn_t, uint_t, int); +void hat_unlock(struct hat *, caddr_t, size_t); +void hat_unload(struct hat *, caddr_t, size_t, uint_t); +void hat_unload_callback(struct hat *, caddr_t, size_t, uint_t, + hat_callback_t *); +void hat_sync(struct hat *, caddr_t, size_t, uint_t); +void hat_map(struct hat *, caddr_t, size_t, uint_t); +void hat_setattr(struct hat *, caddr_t, size_t, uint_t); +void hat_clrattr(struct hat *, caddr_t, size_t, uint_t); +void hat_chgattr(struct hat *, caddr_t, size_t, uint_t); +uint_t hat_getattr(struct hat *, caddr_t, uint_t *); +ssize_t hat_getpagesize(struct hat *, caddr_t); +pfn_t hat_getpfnum(struct hat *, caddr_t); +int hat_probe(struct hat *, caddr_t); +int hat_share(struct hat *, caddr_t, struct hat *, caddr_t, size_t, uint_t); +void hat_unshare(struct hat *, caddr_t, size_t, uint_t); +void hat_chgprot(struct hat *, caddr_t, size_t, uint_t); +void hat_reserve(struct as *, caddr_t, size_t); +pfn_t va_to_pfn(void *); +uint64_t va_to_pa(void *); + +/* + * hat_getkpfnum() is never supported on amd64 and will be + * removed from other platforms in future release + */ +#if !defined(__amd64) +pfn_t hat_getkpfnum(caddr_t); +#endif + + +/* + * Kernel Physical Mapping (segkpm) hat interface routines. + */ +caddr_t hat_kpm_mapin(struct page *, struct kpme *); +void hat_kpm_mapout(struct page *, struct kpme *, caddr_t); +caddr_t hat_kpm_page2va(struct page *, int); +struct page *hat_kpm_vaddr2page(caddr_t); +int hat_kpm_fault(struct hat *, caddr_t); +void hat_kpm_mseghash_clear(int); +void hat_kpm_mseghash_update(pgcnt_t, struct memseg *); +void hat_kpm_addmem_mseg_update(struct memseg *, pgcnt_t, offset_t); +void hat_kpm_addmem_mseg_insert(struct memseg *); +void hat_kpm_addmem_memsegs_update(struct memseg *); +caddr_t hat_kpm_mseg_reuse(struct memseg *); +void hat_kpm_delmem_mseg_update(struct memseg *, struct memseg **); +void hat_kpm_split_mseg_update(struct memseg *, struct memseg **, + struct memseg *, struct memseg *, struct memseg *); +void hat_kpm_walk(void (*)(void *, void *, size_t), void *); + +/* + * Operations on all translations for a given page(s) + * + * void hat_page_setattr(pp, flag) + * void hat_page_clrattr(pp, flag) + * used to set/clr red/mod bits. + * + * uint hat_page_getattr(pp, flag) + * If flag is specified, returns 0 if attribute is disabled + * and non zero if enabled. If flag specifes multiple attributs + * then returns 0 if ALL atriibutes are disabled. This is an advisory + * call. + * + * int hat_pageunload(pp, forceflag) + * unload all translations attached to pp. + * + * uint_t hat_pagesync(pp, flags) + * get hw stats from hardware into page struct and reset hw stats + * returns attributes of page + * + * ulong_t hat_page_getshare(pp) + * returns approx number of mappings to this pp. A return of 0 implies + * there are no mappings to the page. + * + * faultcode_t hat_softlock(hat, addr, lenp, ppp, flags); + * called to softlock pages for zero copy tcp + * + * void hat_page_demote(pp); + * unload all large mappings to pp and decrease p_szc of all + * constituent pages according to the remaining mappings. 
+ */ + +void hat_page_setattr(struct page *, uint_t); +void hat_page_clrattr(struct page *, uint_t); +uint_t hat_page_getattr(struct page *, uint_t); +int hat_pageunload(struct page *, uint_t); +uint_t hat_pagesync(struct page *, uint_t); +ulong_t hat_page_getshare(struct page *); +faultcode_t hat_softlock(struct hat *, caddr_t, size_t *, + struct page **, uint_t); +void hat_page_demote(struct page *); + +/* + * Rountine to expose supported HAT features to PIM. + */ +enum hat_features { + HAT_SHARED_PT, /* Shared page tables */ + HAT_DYNAMIC_ISM_UNMAP, /* hat_pageunload() handles ISM pages */ + HAT_VMODSORT /* support for VMODSORT flag of vnode */ +}; + +int hat_supported(enum hat_features, void *); + +/* + * Services provided to the hat: + * + * void as_signal_proc(as, siginfo) + * deliver signal to all processes that have this as. + * + * int hat_setstat(as, addr, len, rmbits) + * informs hatstat layer that ref/mod bits need to be updated for + * address range. Returns 0 on success, 1 for failure. + */ +void as_signal_proc(struct as *, k_siginfo_t *siginfo); +void hat_setstat(struct as *, caddr_t, size_t, uint_t); + +/* + * Flags to pass to hat routines. + * + * Certain flags only apply to some interfaces: + * + * HAT_LOAD Default flags to load a translation to the page. + * HAT_LOAD_LOCK Lock down mapping resources; hat_map(), hat_memload(), + * and hat_devload(). + * HAT_LOAD_ADV Advisory load - Load translation if and only if + * sufficient MMU resources exist (i.e., do not steal). + * HAT_LOAD_SHARE A flag to hat_memload() to indicate h/w page tables + * that map some user pages (not kas) is shared by more + * than one process (eg. ISM). + * HAT_LOAD_CONTIG Pages are contigous + * HAT_LOAD_NOCONSIST Do not add mapping to mapping list. + * HAT_LOAD_REMAP Reload a valid pte with a different page frame. + * HAT_RELOAD_SHARE Reload a shared page table entry. Some platforms + * may require different actions than on the first + * load of a shared mapping. + * HAT_NO_KALLOC Do not kmem_alloc while creating the mapping; at this + * point, it's setting up mapping to allocate internal + * hat layer data structures. This flag forces hat layer + * to tap its reserves in order to prevent infinite + * recursion. + * HAT_LOAD_AUTOLPG Get MMU specific disable_auto_large_pages + */ + +/* + * Flags for hat_memload/hat_devload + */ +#define HAT_FLAGS_RESV 0xFF000000 /* resv for hat impl */ +#define HAT_LOAD 0x00 +#define HAT_LOAD_LOCK 0x01 +#define HAT_LOAD_ADV 0x04 +#define HAT_LOAD_CONTIG 0x10 +#define HAT_LOAD_NOCONSIST 0x20 +#define HAT_LOAD_SHARE 0x40 +#define HAT_LOAD_REMAP 0x80 +#define HAT_RELOAD_SHARE 0x100 +#define HAT_NO_KALLOC 0x200 +#define HAT_LOAD_TEXT 0x400 +#define HAT_LOAD_AUTOLPG 0x800 + +/* + * Attributes for hat_memload/hat_devload/hat_*attr + * are a superset of prot flags defined in mman.h. + */ +#define HAT_PLAT_ATTR_MASK 0xF00000 +#define HAT_PROT_MASK 0x0F + +#define HAT_NOFAULT 0x10 +#define HAT_NOSYNC 0x20 + +/* + * Advisory ordering attributes. Apply only to device mappings. + * + * HAT_STRICTORDER: the CPU must issue the references in order, as the + * programmer specified. This is the default. + * HAT_UNORDERED_OK: the CPU may reorder the references (this is all kinds + * of reordering; store or load with store or load). 
+ * HAT_MERGING_OK: merging and batching: the CPU may merge individual stores + * to consecutive locations (for example, turn two consecutive byte + * stores into one halfword store), and it may batch individual loads + * (for example, turn two consecutive byte loads into one halfword load). + * This also implies re-ordering. + * HAT_LOADCACHING_OK: the CPU may cache the data it fetches and reuse it + * until another store occurs. The default is to fetch new data + * on every load. This also implies merging. + * HAT_STORECACHING_OK: the CPU may keep the data in the cache and push it to + * the device (perhaps with other data) at a later time. The default is + * to push the data right away. This also implies load caching. + */ +#define HAT_STRICTORDER 0x0000 +#define HAT_UNORDERED_OK 0x0100 +#define HAT_MERGING_OK 0x0200 +#define HAT_LOADCACHING_OK 0x0300 +#define HAT_STORECACHING_OK 0x0400 +#define HAT_ORDER_MASK 0x0700 + +/* endian attributes */ +#define HAT_NEVERSWAP 0x0000 +#define HAT_STRUCTURE_BE 0x1000 +#define HAT_STRUCTURE_LE 0x2000 +#define HAT_ENDIAN_MASK 0x3000 + +/* flags for hat_softlock */ +#define HAT_COW 0x0001 + +/* + * Flags for hat_unload + */ +#define HAT_UNLOAD 0x00 +#define HAT_UNLOAD_NOSYNC 0x02 +#define HAT_UNLOAD_UNLOCK 0x04 +#define HAT_UNLOAD_OTHER 0x08 +#define HAT_UNLOAD_UNMAP 0x10 + +/* + * Flags for hat_pagesync, hat_getstat, hat_sync + */ +#define HAT_SYNC_DONTZERO 0x00 +#define HAT_SYNC_ZERORM 0x01 +/* Additional flags for hat_pagesync */ +#define HAT_SYNC_STOPON_REF 0x02 +#define HAT_SYNC_STOPON_MOD 0x04 +#define HAT_SYNC_STOPON_RM (HAT_SYNC_STOPON_REF | HAT_SYNC_STOPON_MOD) +#define HAT_SYNC_STOPON_SHARED 0x08 + +/* + * Flags for hat_dup + * + * HAT_DUP_ALL dup entire address space + * HAT_DUP_COW dup plus hat_clrattr(..PROT_WRITE) on newas + */ +#define HAT_DUP_ALL 1 +#define HAT_DUP_COW 2 + + +/* + * Flags for hat_map + */ +#define HAT_MAP 0x00 + +/* + * Flag for hat_pageunload + */ +#define HAT_ADV_PGUNLOAD 0x00 +#define HAT_FORCE_PGUNLOAD 0x01 + +/* + * Attributes for hat_page_*attr, hat_setstats and + * returned by hat_pagesync. + */ +#define P_MOD 0x1 /* the modified bit */ +#define P_REF 0x2 /* the referenced bit */ +#define P_RO 0x4 /* Read only page */ + +#define hat_ismod(pp) (hat_page_getattr(pp, P_MOD)) +#define hat_isref(pp) (hat_page_getattr(pp, P_REF)) +#define hat_isro(pp) (hat_page_getattr(pp, P_RO)) + +#define hat_setmod(pp) (hat_page_setattr(pp, P_MOD)) +#define hat_setref(pp) (hat_page_setattr(pp, P_REF)) +#define hat_setrefmod(pp) (hat_page_setattr(pp, P_REF|P_MOD)) + +#define hat_clrmod(pp) (hat_page_clrattr(pp, P_MOD)) +#define hat_clrref(pp) (hat_page_clrattr(pp, P_REF)) +#define hat_clrrefmod(pp) (hat_page_clrattr(pp, P_REF|P_MOD)) + +#define hat_page_is_mapped(pp) (hat_page_getshare(pp)) + +/* + * hat_setup is being used in sparc/os/sundep.c + */ +void hat_setup(struct hat *, int); + +/* + * Flags for hat_setup + */ +#define HAT_DONTALLOC 0 +#define HAT_ALLOC 1 +#define HAT_INIT 2 + +/* + * Other routines, for statistics + */ +int hat_startstat(struct as *); +void hat_getstat(struct as *, caddr_t, size_t, uint_t, char *, int); +void hat_freestat(struct as *, int); +void hat_resvstat(size_t, struct as *, caddr_t); + +/* + * Transitionary routine while we still allow hat_getkpfnum(caddr_t) + * to return a pfn for kernel memory, but want to warn the user that + * it isn't supported. + */ +void hat_getkpfnum_badcall(void *caller); + +/* + * Relocation callback routines. Currently only sfmmu HAT supports + * these. 
+ */ +extern int hat_add_callback(id_t, caddr_t, uint_t, uint_t, void *, + pfn_t *); +extern id_t hat_register_callback( + int (*prehandler)(caddr_t, uint_t, uint_t, void *), + int (*posthandler)(caddr_t, uint_t, uint_t, void *, pfn_t), + int (*errhandler)(caddr_t, uint_t, uint_t, void *), int); +extern void hat_delete_callback(caddr_t, uint_t, void *, uint_t); + +/* + * hat_add_callback()/hat_delete_callback() flags. + */ +#define HAC_NOSLEEP 0x0 +#define HAC_SLEEP 0x1 +#define HAC_PAGELOCK 0x2 + +/* + * Suspend/unsuspend handler callback arguments. + */ +#define HAT_SUSPEND 0x0010 +#define HAT_UNSUSPEND 0x0010 +#define HAT_PRESUSPEND 0x0020 +#define HAT_POSTUNSUSPEND 0x0020 + +/* + * Error handler callback arguments. See the block comments + * before the implementation of hat_add_callback() for an + * explanation of what these mean. + */ +#define HAT_CB_ERR_LEAKED 0x1 + +#endif /* _KERNEL */ + +/* + * The size of the bit array for ref and mod bit storage must be a power of 2. + * 2 bits are collected for each page. Below the power used is 4, + * which is 16 8-bit characters = 128 bits, ref and mod bit information + * for 64 pages. + */ +#define HRM_SHIFT 4 +#define HRM_BYTES (1 << HRM_SHIFT) +#define HRM_PAGES ((HRM_BYTES * NBBY) / 2) +#define HRM_PGPERBYTE (NBBY/2) +#define HRM_PGBYTEMASK (HRM_PGPERBYTE-1) + +#define HRM_PGOFFMASK ((HRM_PGPERBYTE-1) << MMU_PAGESHIFT) +#define HRM_BASEOFFSET (((MMU_PAGESIZE * HRM_PAGES) - 1)) +#define HRM_BASEMASK (~(HRM_BASEOFFSET)) + +#define HRM_BASESHIFT (MMU_PAGESHIFT + (HRM_SHIFT + 2)) +#define HRM_PAGEMASK (MMU_PAGEMASK ^ HRM_BASEMASK) + +#define HRM_HASHSIZE 0x200 +#define HRM_HASHMASK (HRM_HASHSIZE - 1) + +#define HRM_BLIST_INCR 0x200 + +/* + * The structure for maintaining referenced and modified information + */ +struct hrmstat { + struct as *hrm_as; /* stat block belongs to this as */ + uintptr_t hrm_base; /* base of block */ + ushort_t hrm_id; /* opaque identifier, one of a_vbits */ + struct hrmstat *hrm_anext; /* as statistics block list */ + struct hrmstat *hrm_hnext; /* list for hashed blocks */ + uchar_t hrm_bits[HRM_BYTES]; /* the ref and mod bits */ +}; + +/* + * For global monitoring of the reference and modified bits + * of all address spaces we reserve one id bit. + */ +#define HRM_SWSMONID 1 + + +#ifdef _KERNEL + +/* + * Hat locking functions + * XXX - these two functions are currently being used by hatstats + * they can be removed by using a per-as mutex for hatstats. + */ +void hat_enter(struct hat *); +void hat_exit(struct hat *); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_HAT_H */ diff --git a/usr/src/uts/common/vm/hat_refmod.c b/usr/src/uts/common/vm/hat_refmod.c new file mode 100644 index 0000000000..1a812bd94f --- /dev/null +++ b/usr/src/uts/common/vm/hat_refmod.c @@ -0,0 +1,544 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * The following routines implement the hat layer's + * recording of the referenced and modified bits. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/debug.h> +#include <sys/kmem.h> + +/* + * Note, usage of cmn_err requires you not hold any hat layer locks. + */ +#include <sys/cmn_err.h> + +#include <vm/as.h> +#include <vm/hat.h> + +kmutex_t hat_statlock; /* protects all hat statistics data */ +struct hrmstat *hrm_memlist; /* tracks memory alloced for hrm_blist blocks */ +struct hrmstat **hrm_hashtab; /* hash table for finding blocks quickly */ +struct hrmstat *hrm_blist; +int hrm_blist_incr = HRM_BLIST_INCR; +int hrm_blist_lowater = HRM_BLIST_INCR/2; +int hrm_blist_num = 0; +int hrm_blist_total = 0; +int hrm_mlockinited = 0; +int hrm_allocfailmsg = 0; /* print a message when allocations fail */ +int hrm_allocfail = 0; + +static struct hrmstat *hrm_balloc(void); +static int hrm_init(void); +static void hrm_link(struct hrmstat *); +static void hrm_setbits(struct hrmstat *, caddr_t, uint_t); +static void hrm_hashout(struct hrmstat *); +static void hrm_getblk(int); + +#define hrm_hash(as, addr) \ + (HRM_HASHMASK & \ + (((uintptr_t)(addr) >> HRM_BASESHIFT) ^ ((uintptr_t)(as) >> 2))) + +#define hrm_match(hrm, as, addr) \ + (((hrm)->hrm_as == (as) && \ + ((hrm)->hrm_base == ((uintptr_t)(addr) & HRM_BASEMASK))) ? 1 : 0) + +/* + * reserve enough statistic blocks for + * chunk of bytes (pages) in a given as. + */ +/* ARGSUSED */ +void +hat_resvstat(size_t chunk, struct as *as, caddr_t addr) +{ + int nhrm = btop(chunk)/HRM_PAGES; + + if (nhrm < HRM_BLIST_INCR) + nhrm = 0; /* preallocate at least HRM_BLIST_INCR */ + hrm_getblk(nhrm); +} + +/* + * Start the statistics gathering for an address space. + * Return -1 if we can't do it, otherwise return an opaque + * identifier to be used when querying for the gathered statistics. + * The identifier is an unused bit in a_vbits. + * Bit 0 is reserved for swsmon. + */ +int +hat_startstat(struct as *as) +{ + uint_t nbits; /* number of bits */ + uint_t bn; /* bit number */ + uint_t id; /* new vbit, identifier */ + uint_t vbits; /* used vbits of address space */ + size_t chunk; /* mapped size for stats */ + /* + * Initialize global data, if needed. + */ + if (hrm_init() == -1) + return (-1); + + /* + * If the refmod saving memory allocator runs out, print + * a warning message about how to fix it, see comment at + * the beginning of hat_setstat. + */ + if (hrm_allocfailmsg) { + cmn_err(CE_WARN, + "hrm_balloc failures occured, increase hrm_blist_incr"); + hrm_allocfailmsg = 0; + } + + /* + * Verify that a buffer of statistics blocks exists + * and allocate more, if needed. + */ + + chunk = hat_get_mapped_size(as->a_hat); + chunk = (btop(chunk)/HRM_PAGES); + if (chunk < HRM_BLIST_INCR) + chunk = 0; + + hrm_getblk((int)chunk); + + /* + * Find a unused id in the given address space. 
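The search that follows walks a_vbits looking for an identifier bit that is still clear, skipping bit 0, which is reserved for swsmon. A user-space sketch of that scan (plain integers stand in for the as fields):

#include <stdio.h>

int
main(void)
{
	unsigned int vbits = 0x7;     /* bits 0x1, 0x2 and 0x4 already in use */
	unsigned int nbits = sizeof (vbits) * 8;
	unsigned int bn, id;

	/* start at bit 1 / id 2, exactly as the loop below does */
	for (bn = 1, id = 2; bn < (nbits - 1); bn++, id <<= 1)
		if ((id & vbits) == 0)
			break;

	if (bn >= (nbits - 1))
		printf("no free id\n");
	else
		printf("new id %#x (bit %u)\n", id, bn);   /* id 0x8, bit 3 */
	return (0);
}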
+ */ + hat_enter(as->a_hat); + vbits = as->a_vbits; + nbits = sizeof (as->a_vbits) * NBBY; + for (bn = 1, id = 2; bn < (nbits - 1); bn++, id <<= 1) + if ((id & vbits) == 0) + break; + if (bn >= (nbits - 1)) { + hat_exit(as->a_hat); + return (-1); + } + as->a_vbits |= id; + hat_exit(as->a_hat); + (void) hat_stats_enable(as->a_hat); + return (id); +} + +/* + * Record referenced and modified information for an address space. + * Rmbits is a word containing the referenced bit in bit position 1 + * and the modified bit in bit position 0. + * + * For current informational uses, one can rerun any program using + * this facility after modifying the hrm_blist_incr to be a larger + * amount so that a larger buffer of blocks will be maintained. + */ +void +hat_setstat(struct as *as, caddr_t addr, size_t len, uint_t rmbits) +{ + struct hrmstat *hrm; + uint_t vbits, newbits, nb; + int h; + + ASSERT(len == PAGESIZE); + ASSERT((rmbits & ~(P_MOD|P_REF)) == 0); + + if (rmbits == 0) + return; + + /* + * Initialize global data, if needed. + */ + if (hrm_init() == -1) + return; + + mutex_enter(&hat_statlock); + + /* + * Search the hash list for the as and addr we are looking for + * and set the ref and mod bits in every block that matches. + */ + vbits = 0; + h = hrm_hash(as, addr); + for (hrm = hrm_hashtab[h]; hrm; hrm = hrm->hrm_hnext) { + if (hrm_match(hrm, as, addr)) { + hrm_setbits(hrm, addr, rmbits); + vbits |= hrm->hrm_id; + } + } + + /* + * If we didn't find a block for all of the enabled + * vpages bits, then allocate and initialize a block + * for each bit that was not found. + */ + if (vbits != as->a_vbits) { + newbits = vbits ^ as->a_vbits; + while (newbits) { + if (ffs(newbits)) + nb = 1 << (ffs(newbits)-1); + hrm = (struct hrmstat *)hrm_balloc(); + if (hrm == NULL) { + hrm_allocfailmsg = 1; + hrm_allocfail++; + mutex_exit(&hat_statlock); + return; + } + hrm->hrm_as = as; + hrm->hrm_base = (uintptr_t)addr & HRM_BASEMASK; + hrm->hrm_id = nb; + hrm_link(hrm); + hrm_setbits(hrm, addr, rmbits); + newbits &= ~nb; + } + } + mutex_exit(&hat_statlock); +} + +/* + * Free the resources used to maintain the referenced and modified + * statistics for the virtual page view of an address space + * identified by id. + */ +void +hat_freestat(struct as *as, int id) +{ + struct hrmstat *hrm, *prev_ahrm; + + hat_stats_disable(as->a_hat); /* tell the hat layer to stop */ + hat_enter(as->a_hat); + if (id == 0) + as->a_vbits = 0; + else + as->a_vbits &= ~id; + + if ((hrm = as->a_hrm) == NULL) { + hat_exit(as->a_hat); + return; + } + hat_exit(as->a_hat); + + mutex_enter(&hat_statlock); + if (hrm_hashtab == NULL) { + /* can't happen? */ + mutex_exit(&hat_statlock); + return; + } + for (prev_ahrm = NULL; hrm; hrm = hrm->hrm_anext) { + if ((id == hrm->hrm_id) || (id == NULL)) { + + hrm_hashout(hrm); + hrm->hrm_hnext = hrm_blist; + hrm_blist = hrm; + hrm_blist_num++; + + if (prev_ahrm == NULL) + as->a_hrm = hrm->hrm_anext; + else + prev_ahrm->hrm_anext = hrm->hrm_anext; + + } else + prev_ahrm = hrm; + } + + /* + * If all statistics blocks are free, + * return the memory to the system. 
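hat_setstat() above peels the unhandled identifier bits off newbits one at a time with ffs(3C), allocating a statistics block for each. The bit manipulation in isolation:

#include <stdio.h>
#include <strings.h>    /* ffs() */

int
main(void)
{
	unsigned int newbits = 0x14;     /* ids 0x4 and 0x10 still unhandled */

	while (newbits) {
		unsigned int nb = 1U << (ffs(newbits) - 1);

		printf("allocate a block for id %#x\n", nb);
		newbits &= ~nb;          /* mark this id as handled */
	}
	return (0);
}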
+ */ + if (hrm_blist_num == hrm_blist_total) { + /* zero the block list since we are giving back its memory */ + hrm_blist = NULL; + hrm_blist_num = 0; + hrm_blist_total = 0; + while (hrm_memlist) { + hrm = hrm_memlist; + hrm_memlist = hrm->hrm_hnext; + kmem_free(hrm, hrm->hrm_base); + } + ASSERT(hrm_memlist == NULL); + kmem_free(hrm_hashtab, HRM_HASHSIZE * sizeof (char *)); + hrm_hashtab = NULL; + } + mutex_exit(&hat_statlock); +} + +/* + * Initialize any global state for the statistics handling. + * Hrm_lock protects the globally allocted memory: + * hrm_memlist and hrm_hashtab. + */ +static int +hrm_init(void) +{ + /* + * Alloacte the hashtable if it doesn't exist yet. + */ + mutex_enter(&hat_statlock); + if (hrm_hashtab == NULL) + hrm_hashtab = + kmem_zalloc(HRM_HASHSIZE * sizeof (char *), KM_SLEEP); + mutex_exit(&hat_statlock); + return (0); +} + +/* + * Grab memory for statistics gathering of the hat layer. + */ +static void +hrm_getblk(int chunk) +{ + struct hrmstat *hrm, *l; + int i; + int hrm_incr; + + mutex_enter(&hat_statlock); + if ((hrm_blist == NULL) || + (hrm_blist_num <= hrm_blist_lowater) || + chunk) { + + mutex_exit(&hat_statlock); + + hrm_incr = chunk? chunk : hrm_blist_incr; + hrm = kmem_zalloc(sizeof (struct hrmstat) * hrm_incr, KM_SLEEP); + hrm->hrm_base = sizeof (struct hrmstat) * hrm_incr; + + /* + * thread the allocated blocks onto a freelist + * using the first block to hold information for + * freeing them all later + */ + mutex_enter(&hat_statlock); + hrm->hrm_hnext = hrm_memlist; + hrm_memlist = hrm; + + hrm_blist_total += (hrm_incr - 1); + for (i = 1; i < hrm_incr; i++) { + l = &hrm[i]; + l->hrm_hnext = hrm_blist; + hrm_blist = l; + hrm_blist_num++; + } + } + mutex_exit(&hat_statlock); +} + +static void +hrm_hashin(struct hrmstat *hrm) +{ + int h; + + ASSERT(MUTEX_HELD(&hat_statlock)); + h = hrm_hash(hrm->hrm_as, hrm->hrm_base); + + hrm->hrm_hnext = hrm_hashtab[h]; + hrm_hashtab[h] = hrm; +} + +static void +hrm_hashout(struct hrmstat *hrm) +{ + struct hrmstat *list, **prev_hrm; + int h; + + ASSERT(MUTEX_HELD(&hat_statlock)); + h = hrm_hash(hrm->hrm_as, hrm->hrm_base); + list = hrm_hashtab[h]; + prev_hrm = &hrm_hashtab[h]; + + while (list) { + if (list == hrm) { + *prev_hrm = list->hrm_hnext; + return; + } + prev_hrm = &list->hrm_hnext; + list = list->hrm_hnext; + } +} + + +/* + * Link a statistic block into an address space and also put it + * on the hash list for future references. + */ +static void +hrm_link(struct hrmstat *hrm) +{ + struct as *as = hrm->hrm_as; + + ASSERT(MUTEX_HELD(&hat_statlock)); + hrm->hrm_anext = as->a_hrm; + as->a_hrm = hrm; + hrm_hashin(hrm); +} + +/* + * Allocate a block for statistics keeping. + * Returns NULL if blocks are unavailable. + */ +static struct hrmstat * +hrm_balloc(void) +{ + struct hrmstat *hrm; + + ASSERT(MUTEX_HELD(&hat_statlock)); + + hrm = hrm_blist; + if (hrm != NULL) { + hrm_blist = hrm->hrm_hnext; + hrm_blist_num--; + hrm->hrm_hnext = NULL; + } + return (hrm); +} + +/* + * Set the ref and mod bits for addr within statistics block hrm. + */ +static void +hrm_setbits(struct hrmstat *hrm, caddr_t addr, uint_t bits) +{ + uint_t po, bo, spb; + uint_t nbits; + + po = ((uintptr_t)addr & HRM_BASEOFFSET) >> MMU_PAGESHIFT; /* pg off */ + bo = po / (NBBY / 2); /* which byte in bit array */ + spb = (3 - (po & 3)) * 2; /* shift position within byte */ + nbits = bits << spb; /* bit mask */ + hrm->hrm_bits[bo] |= nbits; +} + +/* + * Return collected statistics about an address space. 
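hrm_setbits() above packs the P_REF/P_MOD pair for a page into hrm_bits[] at four pages per byte, with the lowest-numbered page in the most significant pair; hat_getstat() below fans such a byte back out into per-page entries (ORing into the caller's array). A standalone sketch of both directions, assuming MMU_PAGESHIFT is 12 (it is platform dependent):

#include <stdio.h>
#include <stdint.h>

#define NBBY           8
#define MMU_PAGESHIFT  12                          /* demo assumption */
#define HRM_BASEOFFSET (((1UL << MMU_PAGESHIFT) * 64) - 1)

#define P_MOD 0x1
#define P_REF 0x2

int
main(void)
{
	unsigned char hrm_bits[16] = { 0 };
	uintptr_t addr = 0x12345000UL;

	/* pack: which byte and which bit pair within it (hrm_setbits) */
	unsigned int po  = (addr & HRM_BASEOFFSET) >> MMU_PAGESHIFT;
	unsigned int bo  = po / (NBBY / 2);            /* byte index */
	unsigned int spb = (3 - (po & 3)) * 2;         /* shift within byte */

	hrm_bits[bo] |= (P_REF | P_MOD) << spb;
	printf("page %u -> byte %u, shift %u, byte now %#x\n",
	    po, bo, spb, hrm_bits[bo]);

	/* unpack: fan one byte out into four entries (hat_getstat) */
	unsigned char dp[4];
	int bits = hrm_bits[bo];

	dp[0] = (bits >> 6) & 3;
	dp[1] = (bits >> 4) & 3;
	dp[2] = (bits >> 2) & 3;
	dp[3] = (bits >> 0) & 3;
	printf("unpacked: %d %d %d %d\n", dp[0], dp[1], dp[2], dp[3]);
	return (0);
}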
+ * If clearflag is set, atomically read and zero the bits. + * + * Fill in the data array supplied with the referenced and + * modified bits collected for address range [addr ... addr + len] + * in address space, as, uniquely identified by id. + * The destination is a byte array. We fill in three bits per byte: + * referenced, modified, and hwmapped bits. + * Kernel only interface, can't fault on destination data array. + * + */ +void +hat_getstat(struct as *as, caddr_t addr, size_t len, uint_t id, + caddr_t datap, int clearflag) +{ + size_t np; /* number of pages */ + caddr_t a; + char *dp; + + np = btop(len); + bzero(datap, np); + + hat_sync(as->a_hat, addr, len, clearflag); + + /* allocate more statistics blocks if needed */ + hrm_getblk(0); + + mutex_enter(&hat_statlock); + if (hrm_hashtab == NULL) { + /* can happen when victim process exits */ + mutex_exit(&hat_statlock); + return; + } + dp = datap; + a = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); + while (a < addr + len) { + struct hrmstat *hrm; + size_t n; /* number of pages, temp */ + int h; /* hash index */ + uint_t po; + + h = hrm_hash(as, a); + n = (HRM_PAGES - + (((uintptr_t)a & HRM_PAGEMASK) >> MMU_PAGESHIFT)); + if (n > np) + n = np; + po = ((uintptr_t)a & HRM_BASEOFFSET) >> MMU_PAGESHIFT; + + for (hrm = hrm_hashtab[h]; hrm; hrm = hrm->hrm_hnext) { + if (hrm->hrm_as == as && + hrm->hrm_base == ((uintptr_t)a & HRM_BASEMASK) && + id == hrm->hrm_id) { + int i, nr; + uint_t bo, spb; + + /* + * Extract leading unaligned bits. + */ + i = 0; + while (i < n && (po & 3)) { + bo = po / (NBBY / 2); + spb = (3 - (po & 3)) * 2; + *dp++ |= (hrm->hrm_bits[bo] >> spb) & 3; + if (clearflag) + hrm->hrm_bits[bo] &= ~(3<<spb); + po++; + i++; + } + /* + * Extract aligned bits. + */ + nr = n/4*4; + bo = po / (NBBY / 2); + while (i < nr) { + int bits = hrm->hrm_bits[bo]; + *dp++ |= (bits >> 6) & 3; + *dp++ |= (bits >> 4) & 3; + *dp++ |= (bits >> 2) & 3; + *dp++ |= (bits >> 0) & 3; + if (clearflag) + hrm->hrm_bits[bo] = 0; + bo++; + po += 4; + i += 4; + } + /* + * Extract trailing unaligned bits. + */ + while (i < n) { + bo = po / (NBBY / 2); + spb = (3 - (po & 3)) * 2; + *dp++ |= (hrm->hrm_bits[bo] >> spb) & 3; + if (clearflag) + hrm->hrm_bits[bo] &= ~(3<<spb); + po++; + i++; + } + + break; + } + } + if (hrm == NULL) + dp += n; + np -= n; + a += n * MMU_PAGESIZE; + } + mutex_exit(&hat_statlock); +} diff --git a/usr/src/uts/common/vm/kpm.h b/usr/src/uts/common/vm/kpm.h new file mode 100644 index 0000000000..edc213b8f8 --- /dev/null +++ b/usr/src/uts/common/vm/kpm.h @@ -0,0 +1,57 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _VM_KPM_H +#define _VM_KPM_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _LP64 +#define SEGKPM_SUPPORT +#endif + +#ifndef _ASM + +/* + * Machine independent per instance kpm mapping structure + */ +struct kpme { + struct kpme *kpe_next; + struct kpme *kpe_prev; + struct page *kpe_page; /* back pointer to (start) page */ +}; + +#endif /* _ASM */ + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_KPM_H */ diff --git a/usr/src/uts/common/vm/page.h b/usr/src/uts/common/vm/page.h new file mode 100644 index 0000000000..9cd32e0ae3 --- /dev/null +++ b/usr/src/uts/common/vm/page.h @@ -0,0 +1,1006 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#ifndef _VM_PAGE_H +#define _VM_PAGE_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <vm/seg.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(_KERNEL) || defined(_KMEMUSER) + +/* + * Shared/Exclusive lock. + */ + +/* + * Types of page locking supported by page_lock & friends. + */ +typedef enum { + SE_SHARED, + SE_EXCL /* exclusive lock (value == -1) */ +} se_t; + +/* + * For requesting that page_lock reclaim the page from the free list. + */ +typedef enum { + P_RECLAIM, /* reclaim page from free list */ + P_NO_RECLAIM /* DON`T reclaim the page */ +} reclaim_t; + +/* + * Callers of page_try_reclaim_lock and page_lock_es can use this flag + * to get SE_EXCL access before reader/writers are given access. + */ +#define SE_EXCL_WANTED 0x02 + +#endif /* _KERNEL | _KMEMUSER */ + +typedef int selock_t; + +/* + * Define VM_STATS to turn on all sorts of statistic gathering about + * the VM layer. By default, it is only turned on when DEBUG is + * also defined. + */ +#ifdef DEBUG +#define VM_STATS +#endif /* DEBUG */ + +#ifdef VM_STATS +#define VM_STAT_ADD(stat) (stat)++ +#define VM_STAT_COND_ADD(cond, stat) ((void) (!(cond) || (stat)++)) +#else +#define VM_STAT_ADD(stat) +#define VM_STAT_COND_ADD(cond, stat) +#endif /* VM_STATS */ + +#ifdef _KERNEL + +/* + * Macros to acquire and release the page logical lock. 
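The VM_STAT_COND_ADD() macro above relies on short-circuit evaluation: the counter is bumped only when the condition is true, and both macros compile away entirely when VM_STATS is not defined. A user-space illustration (the counter names are made up for the demo):

#include <stdio.h>

#define VM_STAT_ADD(stat)            (stat)++
#define VM_STAT_COND_ADD(cond, stat) ((void)(!(cond) || (stat)++))

int
main(void)
{
	int hits = 0, misses = 0;
	int found = 1;

	VM_STAT_ADD(hits);                    /* unconditional */
	VM_STAT_COND_ADD(!found, misses);     /* skipped: found is true */
	VM_STAT_COND_ADD(found == 0, misses); /* also skipped */

	printf("hits=%d misses=%d\n", hits, misses);   /* hits=1 misses=0 */
	return (0);
}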
+ */ +#define page_struct_lock(pp) mutex_enter(&page_llock) +#define page_struct_unlock(pp) mutex_exit(&page_llock) + +#endif /* _KERNEL */ + +#include <sys/t_lock.h> + +struct as; + +/* + * Each physical page has a page structure, which is used to maintain + * these pages as a cache. A page can be found via a hashed lookup + * based on the [vp, offset]. If a page has an [vp, offset] identity, + * then it is entered on a doubly linked circular list off the + * vnode using the vpnext/vpprev pointers. If the p_free bit + * is on, then the page is also on a doubly linked circular free + * list using next/prev pointers. If the "p_selock" and "p_iolock" + * are held, then the page is currently being read in (exclusive p_selock) + * or written back (shared p_selock). In this case, the next/prev pointers + * are used to link the pages together for a consecutive i/o request. If + * the page is being brought in from its backing store, then other processes + * will wait for the i/o to complete before attaching to the page since it + * will have an "exclusive" lock. + * + * Each page structure has the locks described below along with + * the fields they protect: + * + * p_selock This is a per-page shared/exclusive lock that is + * used to implement the logical shared/exclusive + * lock for each page. The "shared" lock is normally + * used in most cases while the "exclusive" lock is + * required to destroy or retain exclusive access to + * a page (e.g., while reading in pages). The appropriate + * lock is always held whenever there is any reference + * to a page structure (e.g., during i/o). + * (Note that with the addition of the "writer-lock-wanted" + * semantics (via SE_EWANTED), threads must not acquire + * multiple reader locks or else a deadly embrace will + * occur in the following situation: thread 1 obtains a + * reader lock; next thread 2 fails to get a writer lock + * but specified SE_EWANTED so it will wait by either + * blocking (when using page_lock_es) or spinning while + * retrying (when using page_try_reclaim_lock) until the + * reader lock is released; then thread 1 attempts to + * get another reader lock but is denied due to + * SE_EWANTED being set, and now both threads are in a + * deadly embrace.) + * + * p_hash + * p_vnode + * p_offset + * + * p_free + * p_age + * + * p_iolock This is a binary semaphore lock that provides + * exclusive access to the i/o list links in each + * page structure. It is always held while the page + * is on an i/o list (i.e., involved in i/o). That is, + * even though a page may be only `shared' locked + * while it is doing a write, the following fields may + * change anyway. Normally, the page must be + * `exclusively' locked to change anything in it. + * + * p_next + * p_prev + * + * The following fields are protected by the global page_llock: + * + * p_lckcnt + * p_cowcnt + * + * The following lists are protected by the global page_freelock: + * + * page_cachelist + * page_freelist + * + * The following, for our purposes, are protected by + * the global freemem_lock: + * + * freemem + * freemem_wait + * freemem_cv + * + * The following fields are protected by hat layer lock(s). When a page + * structure is not mapped and is not associated with a vnode (after a call + * to page_hashout() for example) the p_nrm field may be modified with out + * holding the hat layer lock: + * + * p_nrm + * p_mapping + * p_share + * + * The following field is file system dependent. 
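Returning to the p_selock discussion above: the reason a thread must never hold two reader locks on the same page is that a waiting writer's SE_EWANTED bit causes further shared requests to be refused. A minimal model of that grant rule (the constant matches the SE_EWANTED definition later in this header; shared_grant() is a name invented for the sketch, mirroring the shared-lock branch of page_lock_es()):

#include <stdio.h>

#define SE_EWANTED 0x40000000

static int
shared_grant(int selock)
{
	/* grant only if not write-locked and no writer is waiting */
	return (selock >= 0 && !(selock & SE_EWANTED));
}

int
main(void)
{
	int selock = 1;                 /* thread 1 already holds one reader */

	selock |= SE_EWANTED;           /* thread 2 wants SE_EXCL, must wait */

	/* thread 1 now asks for a second reader lock: denied -> deadlock */
	printf("second reader granted: %s\n",
	    shared_grant(selock) ? "yes" : "no");
	return (0);
}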
How it is used and + * the locking strategies applied are up to the individual file system + * implementation. + * + * p_fsdata + * + * The page structure is used to represent and control the system's + * physical pages. There is one instance of the structure for each + * page that is not permenately allocated. For example, the pages that + * hold the page structures are permanently held by the kernel + * and hence do not need page structures to track them. The array + * of page structures is allocated early on in the kernel's life and + * is based on the amount of available physical memory. + * + * Each page structure may simultaneously appear on several linked lists. + * The lists are: hash list, free or in i/o list, and a vnode's page list. + * Each type of list is protected by a different group of mutexes as described + * below: + * + * The hash list is used to quickly find a page when the page's vnode and + * offset within the vnode are known. Each page that is hashed is + * connected via the `p_hash' field. The anchor for each hash is in the + * array `page_hash'. An array of mutexes, `ph_mutex', protects the + * lists anchored by page_hash[]. To either search or modify a given hash + * list, the appropriate mutex in the ph_mutex array must be held. + * + * The free list contains pages that are `free to be given away'. For + * efficiency reasons, pages on this list are placed in two catagories: + * pages that are still associated with a vnode, and pages that are not + * associated with a vnode. Free pages always have their `p_free' bit set, + * free pages that are still associated with a vnode also have their + * `p_age' bit set. Pages on the free list are connected via their + * `p_next' and `p_prev' fields. When a page is involved in some sort + * of i/o, it is not free and these fields may be used to link associated + * pages together. At the moment, the free list is protected by a + * single mutex `page_freelock'. The list of free pages still associated + * with a vnode is anchored by `page_cachelist' while other free pages + * are anchored in architecture dependent ways (to handle page coloring etc.). + * + * Pages associated with a given vnode appear on a list anchored in the + * vnode by the `v_pages' field. They are linked together with + * `p_vpnext' and `p_vpprev'. The field `p_offset' contains a page's + * offset within the vnode. The pages on this list are not kept in + * offset order. These lists, in a manner similar to the hash lists, + * are protected by an array of mutexes called `vph_hash'. Before + * searching or modifying this chain the appropriate mutex in the + * vph_hash[] array must be held. + * + * Again, each of the lists that a page can appear on is protected by a + * mutex. Before reading or writing any of the fields comprising the + * list, the appropriate lock must be held. These list locks should only + * be held for very short intervals. + * + * In addition to the list locks, each page structure contains a + * shared/exclusive lock that protects various fields within it. + * To modify one of these fields, the `p_selock' must be exclusively held. + * To read a field with a degree of certainty, the lock must be at least + * held shared. + * + * Removing a page structure from one of the lists requires holding + * the appropriate list lock and the page's p_selock. A page may be + * prevented from changing identity, being freed, or otherwise modified + * by acquiring p_selock shared. + * + * To avoid deadlocks, a strict locking protocol must be followed. 
Basically + * there are two cases: In the first case, the page structure in question + * is known ahead of time (e.g., when the page is to be added or removed + * from a list). In the second case, the page structure is not known and + * must be found by searching one of the lists. + * + * When adding or removing a known page to one of the lists, first the + * page must be exclusively locked (since at least one of its fields + * will be modified), second the lock protecting the list must be acquired, + * third the page inserted or deleted, and finally the list lock dropped. + * + * The more interesting case occures when the particular page structure + * is not known ahead of time. For example, when a call is made to + * page_lookup(), it is not known if a page with the desired (vnode and + * offset pair) identity exists. So the appropriate mutex in ph_mutex is + * acquired, the hash list searched, and if the desired page is found + * an attempt is made to lock it. The attempt to acquire p_selock must + * not block while the hash list lock is held. A deadlock could occure + * if some other process was trying to remove the page from the list. + * The removing process (following the above protocol) would have exclusively + * locked the page, and be spinning waiting to acquire the lock protecting + * the hash list. Since the searching process holds the hash list lock + * and is waiting to acquire the page lock, a deadlock occurs. + * + * The proper scheme to follow is: first, lock the appropriate list, + * search the list, and if the desired page is found either use + * page_trylock() (which will not block) or pass the address of the + * list lock to page_lock(). If page_lock() can not acquire the page's + * lock, it will drop the list lock before going to sleep. page_lock() + * returns a value to indicate if the list lock was dropped allowing the + * calling program to react appropriately (i.e., retry the operation). + * + * If the list lock was dropped before the attempt at locking the page + * was made, checks would have to be made to ensure that the page had + * not changed identity before its lock was obtained. This is because + * the interval between dropping the list lock and acquiring the page + * lock is indeterminate. + * + * In addition, when both a hash list lock (ph_mutex[]) and a vnode list + * lock (vph_mutex[]) are needed, the hash list lock must be acquired first. + * The routine page_hashin() is a good example of this sequence. + * This sequence is ASSERTed by checking that the vph_mutex[] is not held + * just before each acquisition of one of the mutexs in ph_mutex[]. + * + * So, as a quick summary: + * + * pse_mutex[]'s protect the p_selock and p_cv fields. + * + * p_selock protects the p_free, p_age, p_vnode, p_offset and p_hash, + * + * ph_mutex[]'s protect the page_hash[] array and its chains. + * + * vph_mutex[]'s protect the v_pages field and the vp page chains. + * + * First lock the page, then the hash chain, then the vnode chain. When + * this is not possible `trylocks' must be used. Sleeping while holding + * any of these mutexes (p_selock is not a mutex) is not allowed. 
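The "lock the list, then only try-lock the page" rule above can be modelled with ordinary mutexes. The sketch below uses pthreads as stand-ins for ph_mutex[] and p_selock; it illustrates the protocol, not the kernel implementation:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t item_lock = PTHREAD_MUTEX_INITIALIZER;

static void
lookup(void)
{
	pthread_mutex_lock(&list_lock);
	/* ... search the hash chain here ... */
	if (pthread_mutex_trylock(&item_lock) != 0) {
		/*
		 * Like page_lock() being handed the hash mutex: drop the
		 * list lock first, then it is safe to block on the item.
		 */
		pthread_mutex_unlock(&list_lock);
		pthread_mutex_lock(&item_lock);
		/* the item may have changed identity; caller must re-check */
	} else {
		pthread_mutex_unlock(&list_lock);
	}
	/* ... use the item ... */
	pthread_mutex_unlock(&item_lock);
}

int
main(void)
{
	lookup();
	printf("lookup protocol exercised\n");
	return (0);
}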
+ * + * + * field reading writing ordering + * ====================================================================== + * p_vnode p_selock(E,S) p_selock(E) + * p_offset + * p_free + * p_age + * ===================================================================== + * p_hash p_selock(E,S) p_selock(E) && p_selock, ph_mutex + * ph_mutex[] + * ===================================================================== + * p_vpnext p_selock(E,S) p_selock(E) && p_selock, vph_mutex + * p_vpprev vph_mutex[] + * ===================================================================== + * When the p_free bit is set: + * + * p_next p_selock(E,S) p_selock(E) && p_selock, + * p_prev page_freelock page_freelock + * + * When the p_free bit is not set: + * + * p_next p_selock(E,S) p_selock(E) && p_selock, p_iolock + * p_prev p_iolock + * ===================================================================== + * p_selock pse_mutex[] pse_mutex[] can`t acquire any + * p_cv other mutexes or + * sleep while holding + * this lock. + * ===================================================================== + * p_lckcnt p_selock(E,S) p_selock(E) && + * p_cowcnt page_llock + * ===================================================================== + * p_nrm hat layer lock hat layer lock + * p_mapping + * p_pagenum + * ===================================================================== + * + * where: + * E----> exclusive version of p_selock. + * S----> shared version of p_selock. + * + * + * Global data structures and variable: + * + * field reading writing ordering + * ===================================================================== + * page_hash[] ph_mutex[] ph_mutex[] can hold this lock + * before acquiring + * a vph_mutex or + * pse_mutex. + * ===================================================================== + * vp->v_pages vph_mutex[] vph_mutex[] can only acquire + * a pse_mutex while + * holding this lock. + * ===================================================================== + * page_cachelist page_freelock page_freelock can't acquire any + * page_freelist page_freelock page_freelock + * ===================================================================== + * freemem freemem_lock freemem_lock can't acquire any + * freemem_wait other mutexes while + * freemem_cv holding this mutex. + * ===================================================================== + * + * Page relocation, PG_NORELOC and P_NORELOC. + * + * Pages may be relocated using the page_relocate() interface. Relocation + * involves moving the contents and identity of a page to another, free page. + * To relocate a page, the SE_EXCL lock must be obtained. The way to prevent + * a page from being relocated is to hold the SE_SHARED lock (the SE_EXCL + * lock must not be held indefinitely). If the page is going to be held + * SE_SHARED indefinitely, then the PG_NORELOC hint should be passed + * to page_create_va so that pages that are prevented from being relocated + * can be managed differently by the platform specific layer. + * + * Pages locked in memory using page_pp_lock (p_lckcnt/p_cowcnt != 0) + * are guaranteed to be held in memory, but can still be relocated + * providing the SE_EXCL lock can be obtained. + * + * The P_NORELOC bit in the page_t.p_state field is provided for use by + * the platform specific code in managing pages when the PG_NORELOC + * hint is used. + * + * Memory delete and page locking. + * + * The set of all usable pages is managed using the global page list as + * implemented by the memseg structure defined below. 
When memory is added + * or deleted this list changes. Additions to this list guarantee that the + * list is never corrupt. In order to avoid the necessity of an additional + * lock to protect against failed accesses to the memseg being deleted and, + * more importantly, the page_ts, the memseg structure is never freed and the + * page_t virtual address space is remapped to a page (or pages) of + * zeros. If a page_t is manipulated while it is p_selock'd, or if it is + * locked indirectly via a hash or freelist lock, it is not possible for + * memory delete to collect the page and so that part of the page list is + * prevented from being deleted. If the page is referenced outside of one + * of these locks, it is possible for the page_t being referenced to be + * deleted. Examples of this are page_t pointers returned by + * page_numtopp_nolock, page_first and page_next. Providing the page_t + * is re-checked after taking the p_selock (for p_vnode != NULL), the + * remapping to the zero pages will be detected. + * + * + * Page size (p_szc field) and page locking. + * + * p_szc field of free pages is changed by free list manager under freelist + * locks and is of no concern to the rest of VM subsystem. + * + * p_szc changes of allocated anonymous (swapfs) can only be done only after + * exclusively locking all constituent pages and calling hat_pageunload() on + * each of them. To prevent p_szc changes of non free anonymous (swapfs) large + * pages it's enough to either lock SHARED any of constituent pages or prevent + * hat_pageunload() by holding hat level lock that protects mapping lists (this + * method is for hat code only) + * + * To increase (promote) p_szc of allocated non anonymous file system pages + * one has to first lock exclusively all involved constituent pages and call + * hat_pageunload() on each of them. To prevent p_szc promote it's enough to + * either lock SHARED any of constituent pages that will be needed to make a + * large page or prevent hat_pageunload() by holding hat level lock that + * protects mapping lists (this method is for hat code only). + * + * To decrease (demote) p_szc of an allocated non anonymous file system large + * page one can either use the same method as used for changeing p_szc of + * anonymous large pages or if it's not possible to lock all constituent pages + * exclusively a different method can be used. In the second method one only + * has to exclusively lock one of constituent pages but then one has to + * acquire further locks by calling page_szc_lock() and + * hat_page_demote(). hat_page_demote() acquires hat level locks and then + * demotes the page. This mechanism relies on the fact that any code that + * needs to prevent p_szc of a file system large page from changeing either + * locks all constituent large pages at least SHARED or locks some pages at + * least SHARED and calls page_szc_lock() or uses hat level page locks. + * Demotion using this method is implemented by page_demote_vp_pages(). + * Please see comments in front of page_demote_vp_pages(), hat_page_demote() + * and page_szc_lock() for more details. + * + * Lock order: p_selock, page_szc_lock, ph_mutex/vph_mutex/freelist, + * hat level locks. 
+ */ + +typedef struct page { + u_offset_t p_offset; /* offset into vnode for this page */ + struct vnode *p_vnode; /* vnode that this page is named by */ + selock_t p_selock; /* shared/exclusive lock on the page */ +#if defined(_LP64) + int p_selockpad; /* pad for growing selock */ +#endif + struct page *p_hash; /* hash by [vnode, offset] */ + struct page *p_vpnext; /* next page in vnode list */ + struct page *p_vpprev; /* prev page in vnode list */ + struct page *p_next; /* next page in free/intrans lists */ + struct page *p_prev; /* prev page in free/intrans lists */ + ushort_t p_lckcnt; /* number of locks on page data */ + ushort_t p_cowcnt; /* number of copy on write lock */ + kcondvar_t p_cv; /* page struct's condition var */ + kcondvar_t p_io_cv; /* for iolock */ + uchar_t p_iolock_state; /* replaces p_iolock */ + volatile uchar_t p_szc; /* page size code */ + uchar_t p_fsdata; /* file system dependent byte */ + uchar_t p_state; /* p_free, p_noreloc */ + uchar_t p_nrm; /* non-cache, ref, mod readonly bits */ +#if defined(__sparc) + uchar_t p_vcolor; /* virtual color */ +#else + uchar_t p_embed; /* x86 - changes p_mapping & p_index */ +#endif + uchar_t p_index; /* MPSS mapping info. Not used on x86 */ + uchar_t p_toxic; /* page has an unrecoverable error */ + void *p_mapping; /* hat specific translation info */ + pfn_t p_pagenum; /* physical page number */ + + uint_t p_share; /* number of translations */ +#if defined(_LP64) + uint_t p_sharepad; /* pad for growing p_share */ +#endif + uint_t p_msresv_1; /* reserved for future use */ +#if defined(__sparc) + uint_t p_kpmref; /* number of kpm mapping sharers */ + struct kpme *p_kpmelist; /* kpm specific mapping info */ +#else + /* index of entry in p_map when p_embed is set */ + uint_t p_mlentry; +#endif + uint64_t p_msresv_2; /* page allocation debugging */ +} page_t; + + +typedef page_t devpage_t; +#define devpage page + + +/* + * Page hash table is a power-of-two in size, externally chained + * through the hash field. PAGE_HASHAVELEN is the average length + * desired for this chain, from which the size of the page_hash + * table is derived at boot time and stored in the kernel variable + * page_hashsz. In the hash function it is given by PAGE_HASHSZ. + * + * PAGE_HASH_FUNC returns an index into the page_hash[] array. This + * index is also used to derive the mutex that protects the chain. + * + * In constructing the hash function, first we dispose of unimportant bits + * (page offset from "off" and the low 3 bits of "vp" which are zero for + * struct alignment). Then shift and sum the remaining bits a couple times + * in order to get as many source bits from the two source values into the + * resulting hashed value. Note that this will perform quickly, since the + * shifting/summing are fast register to register operations with no additional + * memory references). + */ +#if NCPU < 4 +#define PH_TABLE_SIZE 16 +#define VP_SHIFT 7 +#else +#define PH_TABLE_SIZE 128 +#define VP_SHIFT 9 +#endif + +/* + * The amount to use for the successive shifts in the hash function below. + * The actual value is LOG2(PH_TABLE_SIZE), so that as many bits as + * possible will filter thru PAGE_HASH_FUNC() and PAGE_HASH_MUTEX(). 
+ */ +#define PH_SHIFT_SIZE (7) + +#define PAGE_HASHSZ page_hashsz +#define PAGE_HASHAVELEN 4 +#define PAGE_HASH_FUNC(vp, off) \ + ((((uintptr_t)(off) >> PAGESHIFT) + \ + ((uintptr_t)(off) >> (PAGESHIFT + PH_SHIFT_SIZE)) + \ + ((uintptr_t)(vp) >> 3) + \ + ((uintptr_t)(vp) >> (3 + PH_SHIFT_SIZE)) + \ + ((uintptr_t)(vp) >> (3 + 2 * PH_SHIFT_SIZE))) & \ + (PAGE_HASHSZ - 1)) +#ifdef _KERNEL + +/* + * The page hash value is re-hashed to an index for the ph_mutex array. + * + * For 64 bit kernels, the mutex array is padded out to prevent false + * sharing of cache sub-blocks (64 bytes) of adjacent mutexes. + * + * For 32 bit kernels, we don't want to waste kernel address space with + * padding, so instead we rely on the hash function to introduce skew of + * adjacent vnode/offset indexes (the left shift part of the hash function). + * Since sizeof (kmutex_t) is 8, we shift an additional 3 to skew to a different + * 64 byte sub-block. + */ +typedef struct pad_mutex { + kmutex_t pad_mutex; +#ifdef _LP64 + char pad_pad[64 - sizeof (kmutex_t)]; +#endif +} pad_mutex_t; +extern pad_mutex_t ph_mutex[]; + +#define PAGE_HASH_MUTEX(x) \ + &(ph_mutex[((x) + ((x) >> VP_SHIFT) + ((x) << 3)) & \ + (PH_TABLE_SIZE - 1)].pad_mutex) + +/* + * Flags used while creating pages. + */ +#define PG_EXCL 0x0001 +#define PG_WAIT 0x0002 +#define PG_PHYSCONTIG 0x0004 /* NOT SUPPORTED */ +#define PG_MATCH_COLOR 0x0008 /* SUPPORTED by free list routines */ +#define PG_NORELOC 0x0010 /* Non-relocatable alloc hint. */ + /* Page must be PP_ISNORELOC */ +#define PG_PANIC 0x0020 /* system will panic if alloc fails */ +#define PG_PUSHPAGE 0x0040 /* alloc may use reserve */ + +/* + * When p_selock has the SE_EWANTED bit set, threads waiting for SE_EXCL + * access are given priority over all other waiting threads. + */ +#define SE_EWANTED 0x40000000 +#define PAGE_LOCKED(pp) (((pp)->p_selock & ~SE_EWANTED) != 0) +#define PAGE_SHARED(pp) (((pp)->p_selock & ~SE_EWANTED) > 0) +#define PAGE_EXCL(pp) ((pp)->p_selock < 0) +#define PAGE_LOCKED_SE(pp, se) \ + ((se) == SE_EXCL ? PAGE_EXCL(pp) : PAGE_SHARED(pp)) + +extern long page_hashsz; +extern page_t **page_hash; + +extern kmutex_t page_llock; /* page logical lock mutex */ +extern kmutex_t freemem_lock; /* freemem lock */ + +extern pgcnt_t total_pages; /* total pages in the system */ + +/* + * Variables controlling locking of physical memory. + */ +extern pgcnt_t pages_pp_maximum; /* tuning: lock + claim <= max */ +extern void init_pages_pp_maximum(void); + +struct lgrp; + +/* page_list_{add,sub} flags */ + +/* which list */ +#define PG_FREE_LIST 0x0001 +#define PG_CACHE_LIST 0x0002 + +/* where on list */ +#define PG_LIST_TAIL 0x0010 +#define PG_LIST_HEAD 0x0020 + +/* called from */ +#define PG_LIST_ISINIT 0x1000 +#define PG_LIST_ISCAGE 0x2000 + +/* + * Flags for setting the p_toxic flag when a page has errors + * These flags may be OR'ed into the p_toxic page flag to + * indicate that error(s) have occurred on a page, + * (see page_settoxic()). If both PAGE_IS_TOXIC and + * PAGE_IS_FAILING are set, PAGE_IS_FAILING takes precedence. + * + * When an error happens on a page, the trap handler sets + * PAGE_IS_FAULTY on the page to indicate that an error has been + * seen on the page. The error could be really a memory error or + * something else (like a datapath error). When it is determined + * that it is a memory error, the page is marked as PAGE_IS_TOXIC + * or PAGE_IS_FAILING depending on the type of error and then + * retired. 
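Going back to the page hash above: PAGE_HASH_FUNC() folds the vnode pointer and offset into a chain index, and PAGE_HASH_MUTEX() re-hashes that index into the ph_mutex[] array. A user-space rehearsal with the macro bodies copied from this header; page_hashsz is sized at boot, so the 0x10000 bucket count and the NCPU >= 4 table values are demo assumptions, and PAGE_HASH_MUTEX_IDX() computes only the index rather than the mutex address:

#include <stdio.h>
#include <stdint.h>

#define PAGESHIFT      12            /* demo assumption */
#define PH_SHIFT_SIZE  7
#define PAGE_HASHSZ    0x10000       /* demo stand-in for page_hashsz */
#define PH_TABLE_SIZE  128
#define VP_SHIFT       9

#define PAGE_HASH_FUNC(vp, off) \
	((((uintptr_t)(off) >> PAGESHIFT) + \
	((uintptr_t)(off) >> (PAGESHIFT + PH_SHIFT_SIZE)) + \
	((uintptr_t)(vp) >> 3) + \
	((uintptr_t)(vp) >> (3 + PH_SHIFT_SIZE)) + \
	((uintptr_t)(vp) >> (3 + 2 * PH_SHIFT_SIZE))) & \
	(PAGE_HASHSZ - 1))

#define PAGE_HASH_MUTEX_IDX(x) \
	(((x) + ((x) >> VP_SHIFT) + ((x) << 3)) & (PH_TABLE_SIZE - 1))

int
main(void)
{
	uintptr_t vp = 0x12345678UL & ~7UL;   /* 8-byte aligned "vnode" */
	unsigned long off = 3UL << PAGESHIFT; /* third page of the "file" */
	uintptr_t h = PAGE_HASH_FUNC(vp, off);

	printf("hash chain %lu, ph_mutex[%lu]\n",
	    (unsigned long)h, (unsigned long)PAGE_HASH_MUTEX_IDX(h));
	return (0);
}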
+ * + * We use the page's 'toxic' flag to determine whether the page + * has just got a single error - PAGE_IS_TOXIC - or is being + * retired due to multiple soft errors - PAGE_IS_FAILING. In + * page_free(), a page that has been marked PAGE_IS_FAILING will + * not be cleaned, it will always be retired. A page marked + * PAGE_IS_TOXIC is cleaned and is retired only if this attempt at + * cleaning fails. + * + * When a page has been successfully retired, we set PAGE_IS_RETIRED. + */ +#define PAGE_IS_OK 0x0 +#define PAGE_IS_TOXIC 0x1 +#define PAGE_IS_FAILING 0x2 +#define PAGE_IS_RETIRED 0x4 +#define PAGE_IS_FAULTY 0x8 + +/* + * Page frame operations. + */ +page_t *page_lookup(struct vnode *, u_offset_t, se_t); +page_t *page_lookup_create(struct vnode *, u_offset_t, se_t, page_t *, + spgcnt_t *, int); +page_t *page_lookup_nowait(struct vnode *, u_offset_t, se_t); +page_t *page_find(struct vnode *, u_offset_t); +page_t *page_exists(struct vnode *, u_offset_t); +int page_exists_physcontig(vnode_t *, u_offset_t, uint_t, page_t *[]); +int page_exists_forreal(struct vnode *, u_offset_t, uint_t *); +void page_needfree(spgcnt_t); +page_t *page_create(struct vnode *, u_offset_t, size_t, uint_t); +int page_alloc_pages(struct seg *, caddr_t, page_t **, page_t **, + uint_t, int); +page_t *page_create_va_large(vnode_t *vp, u_offset_t off, size_t bytes, + uint_t flags, struct seg *seg, caddr_t vaddr, void *arg); +page_t *page_create_va(struct vnode *, u_offset_t, size_t, uint_t, + struct seg *, caddr_t); +int page_create_wait(size_t npages, uint_t flags); +void page_create_putback(ssize_t npages); +void page_free(page_t *, int); +void page_free_at_startup(page_t *); +void page_free_pages(page_t *); +void free_vp_pages(struct vnode *, u_offset_t, size_t); +int page_reclaim(page_t *, kmutex_t *); +void page_destroy(page_t *, int); +void page_destroy_pages(page_t *); +void page_destroy_free(page_t *); +void page_rename(page_t *, struct vnode *, u_offset_t); +int page_hashin(page_t *, struct vnode *, u_offset_t, kmutex_t *); +void page_hashout(page_t *, kmutex_t *); +int page_num_hashin(pfn_t, struct vnode *, u_offset_t); +void page_add(page_t **, page_t *); +void page_add_common(page_t **, page_t *); +void page_sub(page_t **, page_t *); +void page_sub_common(page_t **, page_t *); +page_t *page_get_freelist(struct vnode *, u_offset_t, struct seg *, + caddr_t, size_t, uint_t, struct lgrp *); + +page_t *page_get_cachelist(struct vnode *, u_offset_t, struct seg *, + caddr_t, uint_t, struct lgrp *); +void page_list_add(page_t *, int); +void page_boot_demote(page_t *); +void page_promote_size(page_t *, uint_t); +void page_list_add_pages(page_t *, int); +void page_list_sub(page_t *, int); +void page_list_break(page_t **, page_t **, size_t); +void page_list_concat(page_t **, page_t **); +void page_vpadd(page_t **, page_t *); +void page_vpsub(page_t **, page_t *); +int page_lock(page_t *, se_t, kmutex_t *, reclaim_t); +int page_lock_es(page_t *, se_t, kmutex_t *, reclaim_t, int); +void page_lock_clr_exclwanted(page_t *); +int page_trylock(page_t *, se_t); +int page_try_reclaim_lock(page_t *, se_t, int); +int page_tryupgrade(page_t *); +void page_downgrade(page_t *); +void page_unlock(page_t *); +void page_lock_delete(page_t *); +int page_pp_lock(page_t *, int, int); +void page_pp_unlock(page_t *, int, int); +int page_resv(pgcnt_t, uint_t); +void page_unresv(pgcnt_t); +void page_pp_useclaim(page_t *, page_t *, uint_t); +int page_addclaim(page_t *); +int page_subclaim(page_t *); +int page_addclaim_pages(page_t **); 
+int page_subclaim_pages(page_t **); +pfn_t page_pptonum(page_t *); +page_t *page_numtopp(pfn_t, se_t); +page_t *page_numtopp_noreclaim(pfn_t, se_t); +page_t *page_numtopp_nolock(pfn_t); +page_t *page_numtopp_nowait(pfn_t, se_t); +page_t *page_first(); +page_t *page_next(page_t *); +page_t *page_nextn_raw(page_t *, ulong_t); /* pp += n */ +#define page_next_raw(PP) page_nextn_raw((PP), 1) +page_t *page_list_next(page_t *); +page_t *page_nextn(page_t *, ulong_t); +page_t *page_next_scan_init(void **); +page_t *page_next_scan_large(page_t *, ulong_t *, void **); +void prefetch_page_r(void *); +void ppcopy(page_t *, page_t *); +void page_relocate_hash(page_t *, page_t *); +void pagezero(page_t *, uint_t, uint_t); +void pagescrub(page_t *, uint_t, uint_t); +void page_io_lock(page_t *); +void page_io_unlock(page_t *); +int page_io_trylock(page_t *); +int page_iolock_assert(page_t *); +void page_iolock_init(page_t *); +pgcnt_t page_busy(int); +void page_lock_init(void); +ulong_t page_share_cnt(page_t *); +int page_isshared(page_t *); +int page_isfree(page_t *); +int page_isref(page_t *); +int page_ismod(page_t *); +int page_release(page_t *, int); +int page_retire(page_t *, uchar_t); +int page_istoxic(page_t *); +int page_isfailing(page_t *); +int page_isretired(page_t *); +int page_deteriorating(page_t *); +void page_settoxic(page_t *, uchar_t); +void page_clrtoxic(page_t *); +void page_clrtoxic_flag(page_t *, uchar_t); +int page_isfaulty(page_t *); +int page_mem_avail(pgcnt_t); + +void page_set_props(page_t *, uint_t); +void page_clr_all_props(page_t *); + +kmutex_t *page_vnode_mutex(struct vnode *); +kmutex_t *page_se_mutex(struct page *); +kmutex_t *page_szc_lock(struct page *); +int page_szc_lock_assert(struct page *pp); + +/* + * Page relocation interfaces. page_relocate() is generic. + * page_get_replacement_page() is provided by the PSM. + * page_free_replacement_page() is generic. + */ +int group_page_trylock(page_t *, se_t); +void group_page_unlock(page_t *); +int page_relocate(page_t **, page_t **, int, int, spgcnt_t *, struct lgrp *); +int do_page_relocate(page_t **, page_t **, int, spgcnt_t *, struct lgrp *); +page_t *page_get_replacement_page(page_t *, struct lgrp *, uint_t); +void page_free_replacement_page(page_t *); +int page_relocate_cage(page_t **, page_t **); + +int page_try_demote_pages(page_t *); +void page_demote_free_pages(page_t *); + +struct anon_map; + +void page_mark_migrate(struct seg *, caddr_t, size_t, struct anon_map *, + ulong_t, vnode_t *, u_offset_t, int); +void page_migrate(struct seg *, caddr_t, page_t **, pgcnt_t); + +/* + * Tell the PIM we are adding physical memory + */ +void add_physmem(page_t *, size_t, pfn_t); +void add_physmem_cb(page_t *, pfn_t); /* callback for page_t part */ + +/* + * hw_page_array[] is configured with hardware supported page sizes by + * platform specific code. 
+ */ +typedef struct { + size_t hp_size; + uint_t hp_shift; + pgcnt_t hp_pgcnt; /* base pagesize cnt */ +} hw_pagesize_t; + +extern hw_pagesize_t hw_page_array[]; +extern uint_t page_colors, page_colors_mask; +extern uint_t page_coloring_shift; +extern int cpu_page_colors; + +uint_t page_num_pagesizes(void); +uint_t page_num_user_pagesizes(void); +size_t page_get_pagesize(uint_t); +size_t page_get_user_pagesize(uint_t n); +pgcnt_t page_get_pagecnt(uint_t); +uint_t page_get_shift(uint_t); +int page_szc(size_t); +int page_user_szc(size_t); + + +/* page_get_replacement page flags */ +#define PGR_SAMESZC 0x1 /* only look for page size same as orig */ +#define PGR_NORELOC 0x2 /* allocate a P_NORELOC page */ + +#endif /* _KERNEL */ + +/* + * Constants used for the p_iolock_state + */ +#define PAGE_IO_INUSE 0x1 +#define PAGE_IO_WANTED 0x2 + +/* + * Constants used for page_release status + */ +#define PGREL_NOTREL 0x1 +#define PGREL_CLEAN 0x2 +#define PGREL_MOD 0x3 + +/* + * The p_state field holds what used to be the p_age and p_free + * bits. These fields are protected by p_selock (see above). + */ +#define P_FREE 0x80 /* Page on free list */ +#define P_NORELOC 0x40 /* Page is non-relocatable */ +#define P_MIGRATE 0x20 /* Migrate page on next touch */ +#define P_SWAP 0x10 /* belongs to vnode that is V_ISSWAP */ + +#define PP_ISFREE(pp) ((pp)->p_state & P_FREE) +#define PP_ISAGED(pp) (((pp)->p_state & P_FREE) && \ + ((pp)->p_vnode == NULL)) +#define PP_ISNORELOC(pp) ((pp)->p_state & P_NORELOC) +#define PP_ISMIGRATE(pp) ((pp)->p_state & P_MIGRATE) +#define PP_ISSWAP(pp) ((pp)->p_state & P_SWAP) + +#define PP_SETFREE(pp) ((pp)->p_state = ((pp)->p_state & ~P_MIGRATE) \ + | P_FREE) +#define PP_SETAGED(pp) ASSERT(PP_ISAGED(pp)) +#define PP_SETNORELOC(pp) ((pp)->p_state |= P_NORELOC) +#define PP_SETMIGRATE(pp) ((pp)->p_state |= P_MIGRATE) +#define PP_SETSWAP(pp) ((pp)->p_state |= P_SWAP) + +#define PP_CLRFREE(pp) ((pp)->p_state &= ~P_FREE) +#define PP_CLRAGED(pp) ASSERT(!PP_ISAGED(pp)) +#define PP_CLRNORELOC(pp) ((pp)->p_state &= ~P_NORELOC) +#define PP_CLRMIGRATE(pp) ((pp)->p_state &= ~P_MIGRATE) +#define PP_CLRSWAP(pp) ((pp)->p_state &= ~P_SWAP) + + + +/* + * kpm large page description. + * The virtual address range of segkpm is divided into chunks of + * kpm_pgsz. Each chunk is controlled by a kpm_page_t. The ushort + * is sufficient for 2^^15 * PAGESIZE, so e.g. the maximum kpm_pgsz + * for 8K is 256M and 2G for 64K pages. It it kept as small as + * possible to save physical memory space. + * + * There are 2 segkpm mapping windows within in the virtual address + * space when we have to prevent VAC alias conflicts. The so called + * Alias window (mappings are always by PAGESIZE) is controlled by + * kp_refcnta. The regular window is controlled by kp_refcnt for the + * normal operation, which is to use the largest available pagesize. + * When VAC alias conflicts are present within a chunk in the regular + * window the large page mapping is broken up into smaller PAGESIZE + * mappings. kp_refcntc is used to control the pages that are invoked + * in the conflict and kp_refcnts holds the active mappings done + * with the small page size. In non vac conflict mode kp_refcntc is + * also used as "go" indication (-1) for the trap level tsbmiss + * handler. 
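On the p_state flag macros above: PP_SETFREE() deliberately clears P_MIGRATE while setting P_FREE, so a page queued for migrate-on-next-touch loses that hint when it is freed. A small illustration using a stripped-down stand-in for page_t (struct toy_page is invented for the demo):

#include <stdio.h>

#define P_FREE    0x80
#define P_NORELOC 0x40
#define P_MIGRATE 0x20
#define P_SWAP    0x10

struct toy_page { unsigned char p_state; };

#define PP_ISFREE(pp)    ((pp)->p_state & P_FREE)
#define PP_ISMIGRATE(pp) ((pp)->p_state & P_MIGRATE)
#define PP_SETFREE(pp)   ((pp)->p_state = \
	((pp)->p_state & ~P_MIGRATE) | P_FREE)

int
main(void)
{
	struct toy_page pg = { P_MIGRATE | P_NORELOC };

	PP_SETFREE(&pg);
	printf("free=%d migrate=%d state=%#x\n",
	    PP_ISFREE(&pg) != 0, PP_ISMIGRATE(&pg) != 0, pg.p_state);
	/* prints: free=1 migrate=0 state=0xc0 */
	return (0);
}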
+ */ +typedef struct kpm_page { + short kp_refcnt; /* pages mapped large */ + short kp_refcnta; /* pages mapped in Alias window */ + short kp_refcntc; /* TL-tsbmiss flag; #vac alias conflict pages */ + short kp_refcnts; /* vac alias: pages mapped small */ +} kpm_page_t; + +/* + * Note: khl_lock offset changes must be reflected in sfmmu_asm.s + */ +typedef struct kpm_hlk { + kmutex_t khl_mutex; /* kpm_page mutex */ + uint_t khl_lock; /* trap level tsbmiss handling */ +} kpm_hlk_t; + +/* + * kpm small page description. + * When kpm_pgsz is equal to PAGESIZE a smaller representation is used + * to save memory space. Alias range mappings and regular segkpm + * mappings are done in units of PAGESIZE and can share the mapping + * information and the mappings are always distinguishable by their + * virtual address. Other information neeeded for VAC conflict prevention + * is already available on a per page basis. There are basically 3 states + * a kpm_spage can have: not mapped (0), mapped in Alias range or virtually + * uncached (1) and mapped in the regular segkpm window (-1). The -1 value + * is also used as "go" indication for the segkpm trap level tsbmiss + * handler for small pages (value is kept the same as it is used for large + * mappings). + */ +typedef struct kpm_spage { + char kp_mapped; /* page mapped small */ +} kpm_spage_t; + +/* + * Note: kshl_lock offset changes must be reflected in sfmmu_asm.s + */ +typedef struct kpm_shlk { + uint_t kshl_lock; /* trap level tsbmiss handling */ +} kpm_shlk_t; + +/* + * Each segment of physical memory is described by a memseg struct. + * Within a segment, memory is considered contiguous. The members + * can be categorized as follows: + * . Platform independent: + * pages, epages, pages_base, pages_end, next, lnext. + * . 64bit only but platform independent: + * kpm_pbase, kpm_nkpmpgs, kpm_pages, kpm_spages. + * . Really platform or mmu specific: + * pagespa, epagespa, nextpa, kpm_pagespa. + * . Mixed: + * msegflags. 
+ */ +struct memseg { + page_t *pages, *epages; /* [from, to] in page array */ + pfn_t pages_base, pages_end; /* [from, to] in page numbers */ + struct memseg *next; /* next segment in list */ +#if defined(__sparc) + struct memseg *lnext; /* next segment in deleted list */ + uint64_t pagespa, epagespa; /* [from, to] page array physical */ + uint64_t nextpa; /* physical next pointer */ + pfn_t kpm_pbase; /* start of kpm range */ + pgcnt_t kpm_nkpmpgs; /* # of kpm_pgsz pages */ + union _mseg_un { + kpm_page_t *kpm_lpgs; /* ptr to kpm_page array */ + kpm_spage_t *kpm_spgs; /* ptr to kpm_spage array */ + } mseg_un; + uint64_t kpm_pagespa; /* physical ptr to kpm (s)pages array */ + uint_t msegflags; /* memseg flags */ +#endif /* __sparc */ +}; + +/* memseg union aliases */ +#define kpm_pages mseg_un.kpm_lpgs +#define kpm_spages mseg_un.kpm_spgs + +/* msegflags */ +#define MEMSEG_DYNAMIC 0x1 /* DR: memory was added dynamically */ + +/* memseg support macros */ +#define MSEG_NPAGES(SEG) ((SEG)->pages_end - (SEG)->pages_base) + +/* memseg hash */ +#define MEM_HASH_SHIFT 0x9 +#define N_MEM_SLOTS 0x200 /* must be a power of 2 */ +#define MEMSEG_PFN_HASH(pfn) (((pfn)/mhash_per_slot) & (N_MEM_SLOTS - 1)) + +/* memseg externals */ +extern struct memseg *memsegs; /* list of memory segments */ +extern ulong_t mhash_per_slot; +extern uint64_t memsegspa; /* memsegs as physical address */ + +void build_pfn_hash(); +extern struct memseg *page_numtomemseg_nolock(pfn_t pfnum); + + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_PAGE_H */ diff --git a/usr/src/uts/common/vm/page_lock.c b/usr/src/uts/common/vm/page_lock.c new file mode 100644 index 0000000000..9a2d12dd8e --- /dev/null +++ b/usr/src/uts/common/vm/page_lock.c @@ -0,0 +1,861 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * VM - page locking primitives + */ +#include <sys/param.h> +#include <sys/t_lock.h> +#include <sys/vtrace.h> +#include <sys/debug.h> +#include <sys/cmn_err.h> +#include <sys/vnode.h> +#include <sys/bitmap.h> +#include <sys/lockstat.h> +#include <sys/condvar_impl.h> +#include <vm/page.h> +#include <vm/seg_enum.h> +#include <vm/vm_dep.h> + +/* + * This global mutex is for logical page locking. + * The following fields in the page structure are protected + * by this lock: + * + * p_lckcnt + * p_cowcnt + */ +kmutex_t page_llock; + +/* + * This is a global lock for the logical page free list. The + * logical free list, in this implementation, is maintained as two + * separate physical lists - the cache list and the free list. 
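On the memseg list declared above: each segment covers the pfn range from pages_base up to pages_end (MSEG_NPAGES is the difference), and page_numtomemseg_nolock() resolves a pfn to its segment; MEMSEG_PFN_HASH only accelerates that kind of lookup. A toy user-space walk of such a list, with types and values invented for the demo:

#include <stdio.h>
#include <stddef.h>

typedef unsigned long pfn_t;

struct toy_memseg {
	pfn_t pages_base, pages_end;       /* pfn range of the segment */
	struct toy_memseg *next;
};

static struct toy_memseg seg1 = { 0x80000, 0xC0000, NULL };
static struct toy_memseg seg0 = { 0x00000, 0x40000, &seg1 };

static struct toy_memseg *
pfn_to_memseg(struct toy_memseg *list, pfn_t pfn)
{
	for (; list != NULL; list = list->next)
		if (pfn >= list->pages_base && pfn < list->pages_end)
			return (list);
	return (NULL);
}

int
main(void)
{
	struct toy_memseg *seg = pfn_to_memseg(&seg0, 0x81234);

	if (seg != NULL)
		printf("pfn 0x81234 lives in segment [%#lx, %#lx)\n",
		    seg->pages_base, seg->pages_end);
	return (0);
}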
+ */ +kmutex_t page_freelock; + +/* + * The hash table, page_hash[], the p_selock fields, and the + * list of pages associated with vnodes are protected by arrays of mutexes. + * + * Unless the hashes are changed radically, the table sizes must be + * a power of two. Also, we typically need more mutexes for the + * vnodes since these locks are occasionally held for long periods. + * And since there seem to be two special vnodes (kvp and swapvp), + * we make room for private mutexes for them. + * + * The pse_mutex[] array holds the mutexes to protect the p_selock + * fields of all page_t structures. + * + * PAGE_SE_MUTEX(pp) returns the address of the appropriate mutex + * when given a pointer to a page_t. + * + * PSE_TABLE_SIZE must be a power of two. One could argue that we + * should go to the trouble of setting it up at run time and base it + * on memory size rather than the number of compile time CPUs. + * + * XX64 We should be using physmem size to calculate PSE_TABLE_SIZE, + * PSE_SHIFT, PIO_SHIFT. + * + * These might break in 64 bit world. + */ +#define PSE_SHIFT 7 /* log2(PSE_TABLE_SIZE) */ + +#define PSE_TABLE_SIZE 128 /* number of mutexes to have */ + +#define PIO_SHIFT PSE_SHIFT /* next power of 2 bigger than page_t */ +#define PIO_TABLE_SIZE PSE_TABLE_SIZE /* number of io mutexes to have */ + +pad_mutex_t ph_mutex[PH_TABLE_SIZE]; +pad_mutex_t pse_mutex[PSE_TABLE_SIZE]; +kmutex_t pio_mutex[PIO_TABLE_SIZE]; + +#define PAGE_SE_MUTEX(pp) \ + &pse_mutex[((((uintptr_t)(pp) >> PSE_SHIFT) ^ \ + ((uintptr_t)(pp) >> (PSE_SHIFT << 1))) & \ + (PSE_TABLE_SIZE - 1))].pad_mutex + +#define PAGE_IO_MUTEX(pp) \ + &pio_mutex[(((uintptr_t)pp) >> PIO_SHIFT) & (PIO_TABLE_SIZE - 1)] + +#define PSZC_MTX_TABLE_SIZE 128 +#define PSZC_MTX_TABLE_SHIFT 7 + +static pad_mutex_t pszc_mutex[PSZC_MTX_TABLE_SIZE]; + +#define PAGE_SZC_MUTEX(_pp) \ + &pszc_mutex[((((uintptr_t)(_pp) >> PSZC_MTX_TABLE_SHIFT) ^ \ + ((uintptr_t)(_pp) >> (PSZC_MTX_TABLE_SHIFT << 1)) ^ \ + ((uintptr_t)(_pp) >> (3 * PSZC_MTX_TABLE_SHIFT))) & \ + (PSZC_MTX_TABLE_SIZE - 1))].pad_mutex + +/* + * The vph_mutex[] array holds the mutexes to protect the vnode chains, + * (i.e., the list of pages anchored by v_pages and connected via p_vpprev + * and p_vpnext). + * + * The page_vnode_mutex(vp) function returns the address of the appropriate + * mutex from this array given a pointer to a vnode. It is complicated + * by the fact that the kernel's vnode and the swapfs vnode are referenced + * frequently enough to warrent their own mutexes. + * + * The VP_HASH_FUNC returns the index into the vph_mutex array given + * an address of a vnode. + */ + +/* + * XX64 VPH_TABLE_SIZE and VP_HASH_FUNC might break in 64 bit world. + * Need to review again. + */ +#define VPH_TABLE_SIZE (2 << VP_SHIFT) + +#define VP_HASH_FUNC(vp) \ + ((((uintptr_t)(vp) >> 6) + \ + ((uintptr_t)(vp) >> 8) + \ + ((uintptr_t)(vp) >> 10) + \ + ((uintptr_t)(vp) >> 12)) \ + & (VPH_TABLE_SIZE - 1)) + +extern struct vnode kvp; + +kmutex_t vph_mutex[VPH_TABLE_SIZE + 2]; + +/* + * Initialize the locks used by the Virtual Memory Management system. + */ +void +page_lock_init() +{ +} + +/* + * At present we only use page ownership to aid debugging, so it's + * OK if the owner field isn't exact. In the 32-bit world two thread ids + * can map to the same owner because we just 'or' in 0x80000000 and + * then clear the second highest bit, so that (for example) 0x2faced00 + * and 0xafaced00 both map to 0xafaced00. 
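The aliasing described above can be reproduced with plain arithmetic: the writer encoding ORs in the sign bit and strips SE_EWANTED, so the two example thread ids collapse to the same owner value. A quick check (se_writer() is a user-space stand-in for the SE_WRITER macro defined just below):

#include <stdio.h>

#define SE_EWANTED 0x40000000

static unsigned int
se_writer(unsigned int tid)
{
	/* set the sign bit (INT_MIN), clear the SE_EWANTED bit */
	return ((tid | 0x80000000u) & ~(unsigned int)SE_EWANTED);
}

int
main(void)
{
	printf("%#x -> %#x\n", 0x2faced00, se_writer(0x2faced00));
	printf("%#x -> %#x\n", 0xafaced00, se_writer(0xafaced00));
	/* both map to 0xafaced00; as a signed selock_t either is negative */
	return (0);
}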
+ * In the 64-bit world, p_selock may not be large enough to hold a full + * thread pointer. If we ever need precise ownership (e.g. if we implement + * priority inheritance for page locks) then p_selock should become a + * uintptr_t and SE_WRITER should be -((uintptr_t)curthread >> 2). + */ +#define SE_WRITER (((selock_t)(ulong_t)curthread | INT_MIN) & ~SE_EWANTED) +#define SE_READER 1 + +/* + * A page that is deleted must be marked as such using the + * page_lock_delete() function. The page must be exclusively locked. + * The SE_DELETED marker is put in p_selock when this function is called. + * SE_DELETED must be distinct from any SE_WRITER value. + */ +#define SE_DELETED (1 | INT_MIN) + +#ifdef VM_STATS +uint_t vph_kvp_count; +uint_t vph_swapfsvp_count; +uint_t vph_other; +#endif /* VM_STATS */ + +#ifdef VM_STATS +uint_t page_lock_count; +uint_t page_lock_miss; +uint_t page_lock_miss_lock; +uint_t page_lock_reclaim; +uint_t page_lock_bad_reclaim; +uint_t page_lock_same_page; +uint_t page_lock_upgrade; +uint_t page_lock_upgrade_failed; +uint_t page_lock_deleted; + +uint_t page_trylock_locked; +uint_t page_trylock_missed; + +uint_t page_try_reclaim_upgrade; +#endif /* VM_STATS */ + + +/* + * Acquire the "shared/exclusive" lock on a page. + * + * Returns 1 on success and locks the page appropriately. + * 0 on failure and does not lock the page. + * + * If `lock' is non-NULL, it will be dropped and reacquired in the + * failure case. This routine can block, and if it does + * it will always return a failure since the page identity [vp, off] + * or state may have changed. + */ + +int +page_lock(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim) +{ + return (page_lock_es(pp, se, lock, reclaim, 0)); +} + +/* + * With the addition of reader-writer lock semantics to page_lock_es, + * callers wanting an exclusive (writer) lock may prevent shared-lock + * (reader) starvation by setting the es parameter to SE_EXCL_WANTED. + * In this case, when an exclusive lock cannot be acquired, p_selock's + * SE_EWANTED bit is set. + * This bit, along with the se and es parameters, are used to decide + * if the requested lock should be granted: + * + * Lock wanted SE_EXCL_WANTED p_selock/SE_EWANTED Action + * ---------- -------------- ------------------- --------- + * SE_EXCL no dont-care/1 deny lock + * SE_EXCL any(see note) unlocked/any grant lock, clear SE_EWANTED + * SE_EXCL yes any lock/any deny, set SE_EWANTED + * SE_EXCL no any lock/any deny + * SE_SHARED not applicable shared/0 grant + * SE_SHARED not applicable unlocked/0 grant + * SE_SHARED not applicable shared/1 deny + * SE_SHARED not applicable unlocked/1 deny + * SE_SHARED not applicable excl/any deny + * + * Note: the code grants an exclusive lock to the caller and clears + * SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED + * bit's value. This was deemed acceptable as we are not concerned about + * exclusive-lock starvation. If this ever becomes an issue, a priority or + * fifo mechanism should also be implemented. + */ +int +page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es) +{ + int retval; + kmutex_t *pse = PAGE_SE_MUTEX(pp); + int upgraded; + int reclaim_it; + + ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1); + + VM_STAT_ADD(page_lock_count); + + upgraded = 0; + reclaim_it = 0; + + mutex_enter(pse); + + /* + * Current uses of 'es': + * es == 1 page_lookup_create will attempt page relocation + * es == SE_EXCL_WANTED caller wants SE_EWANTED set (eg. 
delete + * memory thread); this prevents reader-starvation of waiting + * writer thread(s). + */ + + + ASSERT(((es & SE_EXCL_WANTED) == 0) || + ((es == SE_EXCL_WANTED) && (se == SE_EXCL))); + + if (se == SE_SHARED && es == 1 && pp->p_selock == 0) { + se = SE_EXCL; + } + + if ((reclaim == P_RECLAIM) && (PP_ISFREE(pp))) { + + reclaim_it = 1; + if (se == SE_SHARED) { + /* + * This is an interesting situation. + * + * Remember that p_free can only change if + * p_selock < 0. + * p_free does not depend on our holding `pse'. + * And, since we hold `pse', p_selock can not change. + * So, if p_free changes on us, the page is already + * exclusively held, and we would fail to get p_selock + * regardless. + * + * We want to avoid getting the share + * lock on a free page that needs to be reclaimed. + * It is possible that some other thread has the share + * lock and has left the free page on the cache list. + * pvn_vplist_dirty() does this for brief periods. + * If the se_share is currently SE_EXCL, we will fail + * to acquire p_selock anyway. Blocking is the + * right thing to do. + * If we need to reclaim this page, we must get + * exclusive access to it, force the upgrade now. + * Again, we will fail to acquire p_selock if the + * page is not free and block. + */ + upgraded = 1; + se = SE_EXCL; + VM_STAT_ADD(page_lock_upgrade); + } + } + + if (se == SE_EXCL) { + if ((es != SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) { + /* + * if the caller wants a writer lock (but did not + * specify exclusive access), and there is a pending + * writer that wants exclusive access, return failure + */ + retval = 0; + } else if ((pp->p_selock & ~SE_EWANTED) == 0) { + /* no reader/writer lock held */ + THREAD_KPRI_REQUEST(); + /* this clears our setting of the SE_EWANTED bit */ + pp->p_selock = SE_WRITER; + retval = 1; + } else { + /* page is locked */ + if (es == SE_EXCL_WANTED) { + /* set the SE_EWANTED bit */ + pp->p_selock |= SE_EWANTED; + } + retval = 0; + } + } else { + retval = 0; + if (pp->p_selock >= 0) { + /* readers are not allowed when excl wanted */ + if (!(pp->p_selock & SE_EWANTED)) { + pp->p_selock += SE_READER; + retval = 1; + } + } + } + + if (retval == 0) { + if ((pp->p_selock & ~SE_EWANTED) == SE_DELETED) { + VM_STAT_ADD(page_lock_deleted); + mutex_exit(pse); + return (retval); + } + +#ifdef VM_STATS + VM_STAT_ADD(page_lock_miss); + if (upgraded) { + VM_STAT_ADD(page_lock_upgrade_failed); + } +#endif + if (lock) { + VM_STAT_ADD(page_lock_miss_lock); + mutex_exit(lock); + } + + /* + * Now, wait for the page to be unlocked and + * release the lock protecting p_cv and p_selock. + */ + cv_wait(&pp->p_cv, pse); + mutex_exit(pse); + + /* + * The page identity may have changed while we were + * blocked. If we are willing to depend on "pp" + * still pointing to a valid page structure (i.e., + * assuming page structures are not dynamically allocated + * or freed), we could try to lock the page if its + * identity hasn't changed. + * + * This needs to be measured, since we come back from + * cv_wait holding pse (the expensive part of this + * operation) we might as well try the cheap part. + * Though we would also have to confirm that dropping + * `lock' did not cause any grief to the callers. + */ + if (lock) { + mutex_enter(lock); + } + } else { + /* + * We have the page lock. + * If we needed to reclaim the page, and the page + * needed reclaiming (ie, it was free), then we + * have the page exclusively locked. We may need + * to downgrade the page. + */ + ASSERT((upgraded) ? 
+ ((PP_ISFREE(pp)) && PAGE_EXCL(pp)) : 1); + mutex_exit(pse); + + /* + * We now hold this page's lock, either shared or + * exclusive. This will prevent its identity from changing. + * The page, however, may or may not be free. If the caller + * requested, and it is free, go reclaim it from the + * free list. If the page can't be reclaimed, return failure + * so that the caller can start all over again. + * + * NOTE:page_reclaim() releases the page lock (p_selock) + * if it can't be reclaimed. + */ + if (reclaim_it) { + if (!page_reclaim(pp, lock)) { + VM_STAT_ADD(page_lock_bad_reclaim); + retval = 0; + } else { + VM_STAT_ADD(page_lock_reclaim); + if (upgraded) { + page_downgrade(pp); + } + } + } + } + return (retval); +} + +/* + * Clear the SE_EWANTED bit from p_selock. This function allows + * callers of page_lock_es and page_try_reclaim_lock to clear + * their setting of this bit if they decide they no longer wish + * to gain exclusive access to the page. Currently only + * delete_memory_thread uses this when the delete memory + * operation is cancelled. + */ +void +page_lock_clr_exclwanted(page_t *pp) +{ + kmutex_t *pse = PAGE_SE_MUTEX(pp); + + mutex_enter(pse); + pp->p_selock &= ~SE_EWANTED; + if (CV_HAS_WAITERS(&pp->p_cv)) + cv_broadcast(&pp->p_cv); + mutex_exit(pse); +} + +/* + * Read the comments inside of page_lock_es() carefully. + * + * SE_EXCL callers specifying es == SE_EXCL_WANTED will cause the + * SE_EWANTED bit of p_selock to be set when the lock cannot be obtained. + * This is used by threads subject to reader-starvation (eg. memory delete). + * + * When a thread using SE_EXCL_WANTED does not obtain the SE_EXCL lock, + * it is expected that it will retry at a later time. Threads that will + * not retry the lock *must* call page_lock_clr_exclwanted to clear the + * SE_EWANTED bit. (When a thread using SE_EXCL_WANTED obtains the lock, + * the bit is cleared.) + */ +int +page_try_reclaim_lock(page_t *pp, se_t se, int es) +{ + kmutex_t *pse = PAGE_SE_MUTEX(pp); + selock_t old; + + mutex_enter(pse); + + old = pp->p_selock; + + ASSERT(((es & SE_EXCL_WANTED) == 0) || + ((es == SE_EXCL_WANTED) && (se == SE_EXCL))); + + if (se == SE_SHARED && es == 1 && old == 0) { + se = SE_EXCL; + } + + if (se == SE_SHARED) { + if (!PP_ISFREE(pp)) { + if (old >= 0) { + /* readers are not allowed when excl wanted */ + if (!(old & SE_EWANTED)) { + pp->p_selock = old + SE_READER; + mutex_exit(pse); + return (1); + } + } + mutex_exit(pse); + return (0); + } + /* + * The page is free, so we really want SE_EXCL (below) + */ + VM_STAT_ADD(page_try_reclaim_upgrade); + } + + /* + * The caller wants a writer lock. We try for it only if + * SE_EWANTED is not set, or if the caller specified + * SE_EXCL_WANTED. + */ + if (!(old & SE_EWANTED) || (es == SE_EXCL_WANTED)) { + if ((old & ~SE_EWANTED) == 0) { + /* no reader/writer lock held */ + THREAD_KPRI_REQUEST(); + /* this clears out our setting of the SE_EWANTED bit */ + pp->p_selock = SE_WRITER; + mutex_exit(pse); + return (1); + } + } + if (es == SE_EXCL_WANTED) { + /* page is locked, set the SE_EWANTED bit */ + pp->p_selock |= SE_EWANTED; + } + mutex_exit(pse); + return (0); +} + +/* + * Acquire a page's "shared/exclusive" lock, but never block. + * Returns 1 on success, 0 on failure. 
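 *
 * The non-blocking check below boils down to a small state test on the
 * selock encoding (0 = unlocked, negative = writer, positive = reader
 * count). A reduced user-level model (illustration only; it leaves out
 * SE_EWANTED, SE_DELETED and the kernel priority handling):
 *
 *     #include <pthread.h>
 *
 *     static pthread_mutex_t pse = PTHREAD_MUTEX_INITIALIZER;
 *     static long selock;              // 0 free, <0 writer, >0 readers
 *
 *     static int
 *     model_trylock(int exclusive)
 *     {
 *         int ok = 0;
 *
 *         pthread_mutex_lock(&pse);
 *         if (exclusive) {
 *             if (selock == 0) {
 *                 selock = -1;         // become the writer
 *                 ok = 1;
 *             }
 *         } else if (selock >= 0) {
 *             selock++;                // one more reader
 *             ok = 1;
 *         }
 *         pthread_mutex_unlock(&pse);
 *         return (ok);
 *     }
 *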
+ */ +int +page_trylock(page_t *pp, se_t se) +{ + kmutex_t *pse = PAGE_SE_MUTEX(pp); + + mutex_enter(pse); + if (pp->p_selock & SE_EWANTED) { + /* fail if a thread wants exclusive access */ + mutex_exit(pse); + return (0); + } + + if (se == SE_EXCL) { + if (pp->p_selock == 0) { + THREAD_KPRI_REQUEST(); + pp->p_selock = SE_WRITER; + mutex_exit(pse); + return (1); + } + } else { + if (pp->p_selock >= 0) { + pp->p_selock += SE_READER; + mutex_exit(pse); + return (1); + } + } + mutex_exit(pse); + return (0); +} + +/* + * Release the page's "shared/exclusive" lock and wake up anyone + * who might be waiting for it. + */ +void +page_unlock(page_t *pp) +{ + kmutex_t *pse = PAGE_SE_MUTEX(pp); + selock_t old; + + mutex_enter(pse); + old = pp->p_selock; + if ((old & ~SE_EWANTED) == SE_READER) { + pp->p_selock = old & ~SE_READER; + if (CV_HAS_WAITERS(&pp->p_cv)) + cv_broadcast(&pp->p_cv); + } else if ((old & ~SE_EWANTED) == SE_DELETED) { + panic("page_unlock: page %p is deleted", pp); + } else if (old < 0) { + THREAD_KPRI_RELEASE(); + pp->p_selock &= SE_EWANTED; + if (CV_HAS_WAITERS(&pp->p_cv)) + cv_broadcast(&pp->p_cv); + } else if ((old & ~SE_EWANTED) > SE_READER) { + pp->p_selock = old - SE_READER; + } else { + panic("page_unlock: page %p is not locked", pp); + } + mutex_exit(pse); +} + +/* + * Try to upgrade the lock on the page from a "shared" to an + * "exclusive" lock. Since this upgrade operation is done while + * holding the mutex protecting this page, no one else can acquire this page's + * lock and change the page. Thus, it is safe to drop the "shared" + * lock and attempt to acquire the "exclusive" lock. + * + * Returns 1 on success, 0 on failure. + */ +int +page_tryupgrade(page_t *pp) +{ + kmutex_t *pse = PAGE_SE_MUTEX(pp); + + mutex_enter(pse); + if (!(pp->p_selock & SE_EWANTED)) { + /* no threads want exclusive access, try upgrade */ + if (pp->p_selock == SE_READER) { + THREAD_KPRI_REQUEST(); + /* convert to exclusive lock */ + pp->p_selock = SE_WRITER; + mutex_exit(pse); + return (1); + } + } + mutex_exit(pse); + return (0); +} + +/* + * Downgrade the "exclusive" lock on the page to a "shared" lock + * while holding the mutex protecting this page's p_selock field. + */ +void +page_downgrade(page_t *pp) +{ + kmutex_t *pse = PAGE_SE_MUTEX(pp); + int excl_waiting; + + ASSERT((pp->p_selock & ~SE_EWANTED) != SE_DELETED); + ASSERT(PAGE_EXCL(pp)); + + mutex_enter(pse); + excl_waiting = pp->p_selock & SE_EWANTED; + THREAD_KPRI_RELEASE(); + pp->p_selock = SE_READER | excl_waiting; + if (CV_HAS_WAITERS(&pp->p_cv)) + cv_broadcast(&pp->p_cv); + mutex_exit(pse); +} + +void +page_lock_delete(page_t *pp) +{ + kmutex_t *pse = PAGE_SE_MUTEX(pp); + + ASSERT(PAGE_EXCL(pp)); + ASSERT(pp->p_vnode == NULL); + ASSERT(pp->p_offset == (u_offset_t)-1); + ASSERT(!PP_ISFREE(pp)); + + mutex_enter(pse); + THREAD_KPRI_RELEASE(); + pp->p_selock = SE_DELETED; + if (CV_HAS_WAITERS(&pp->p_cv)) + cv_broadcast(&pp->p_cv); + mutex_exit(pse); +} + +/* + * Implement the io lock for pages + */ +void +page_iolock_init(page_t *pp) +{ + pp->p_iolock_state = 0; + cv_init(&pp->p_io_cv, NULL, CV_DEFAULT, NULL); +} + +/* + * Acquire the i/o lock on a page. + */ +void +page_io_lock(page_t *pp) +{ + kmutex_t *pio; + + pio = PAGE_IO_MUTEX(pp); + mutex_enter(pio); + while (pp->p_iolock_state & PAGE_IO_INUSE) { + cv_wait(&(pp->p_io_cv), pio); + } + pp->p_iolock_state |= PAGE_IO_INUSE; + mutex_exit(pio); +} + +/* + * Release the i/o lock on a page. 
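 *
 * The i/o lock is a single "in use" bit guarded by a hashed mutex, with a
 * condition variable for waiters. A user-level model of the lock/unlock
 * pair (illustration only, hypothetical names):
 *
 *     #include <pthread.h>
 *
 *     static pthread_mutex_t pio = PTHREAD_MUTEX_INITIALIZER;
 *     static pthread_cond_t io_cv = PTHREAD_COND_INITIALIZER;
 *     static int io_inuse;
 *
 *     static void
 *     model_io_lock(void)
 *     {
 *         pthread_mutex_lock(&pio);
 *         while (io_inuse)             // wait for the current owner
 *             pthread_cond_wait(&io_cv, &pio);
 *         io_inuse = 1;
 *         pthread_mutex_unlock(&pio);
 *     }
 *
 *     static void
 *     model_io_unlock(void)
 *     {
 *         pthread_mutex_lock(&pio);
 *         io_inuse = 0;
 *         pthread_cond_signal(&io_cv); // wake one waiter
 *         pthread_mutex_unlock(&pio);
 *     }
 *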
+ */ +void +page_io_unlock(page_t *pp) +{ + kmutex_t *pio; + + pio = PAGE_IO_MUTEX(pp); + mutex_enter(pio); + cv_signal(&pp->p_io_cv); + pp->p_iolock_state &= ~PAGE_IO_INUSE; + mutex_exit(pio); +} + +/* + * Try to acquire the i/o lock on a page without blocking. + * Returns 1 on success, 0 on failure. + */ +int +page_io_trylock(page_t *pp) +{ + kmutex_t *pio; + + if (pp->p_iolock_state & PAGE_IO_INUSE) + return (0); + + pio = PAGE_IO_MUTEX(pp); + mutex_enter(pio); + + if (pp->p_iolock_state & PAGE_IO_INUSE) { + mutex_exit(pio); + return (0); + } + pp->p_iolock_state |= PAGE_IO_INUSE; + mutex_exit(pio); + + return (1); +} + +/* + * Assert that the i/o lock on a page is held. + * Returns 1 on success, 0 on failure. + */ +int +page_iolock_assert(page_t *pp) +{ + return (pp->p_iolock_state & PAGE_IO_INUSE); +} + +/* + * Wrapper exported to kernel routines that are built + * platform-independent (the macro is platform-dependent; + * the size of vph_mutex[] is based on NCPU). + * + * Note that you can do stress testing on this by setting the + * variable page_vnode_mutex_stress to something other than + * zero in a DEBUG kernel in a debugger after loading the kernel. + * Setting it after the kernel is running may not work correctly. + */ +#ifdef DEBUG +static int page_vnode_mutex_stress = 0; +#endif + +kmutex_t * +page_vnode_mutex(vnode_t *vp) +{ + if (vp == &kvp) + return (&vph_mutex[VPH_TABLE_SIZE + 0]); +#ifdef DEBUG + if (page_vnode_mutex_stress != 0) + return (&vph_mutex[0]); +#endif + + return (&vph_mutex[VP_HASH_FUNC(vp)]); +} + +kmutex_t * +page_se_mutex(page_t *pp) +{ + return (PAGE_SE_MUTEX(pp)); +} + +#ifdef VM_STATS +uint_t pszclck_stat[4]; +#endif +/* + * Find, take and return a mutex held by hat_page_demote(). + * Called by page_demote_vp_pages() before hat_page_demote() call and by + * routines that want to block hat_page_demote() but can't do it + * via locking all constituent pages. + * + * Return NULL if p_szc is 0. + * + * It should only be used for pages that can be demoted by hat_page_demote() + * i.e. non swapfs file system pages. The logic here is lifted from + * sfmmu_mlspl_enter() except there's no need to worry about p_szc increase + * since the page is locked and not free. + * + * Hash of the root page is used to find the lock. + * To find the root in the presense of hat_page_demote() chageing the location + * of the root this routine relies on the fact that hat_page_demote() changes + * root last. + * + * If NULL is returned pp's p_szc is guaranteed to be 0. If non NULL is + * returned pp's p_szc may be any value. + */ +kmutex_t * +page_szc_lock(page_t *pp) +{ + kmutex_t *mtx; + page_t *rootpp; + uint_t szc; + uint_t rszc; + uint_t pszc = pp->p_szc; + + ASSERT(pp != NULL); + ASSERT(PAGE_LOCKED(pp)); + ASSERT(!PP_ISFREE(pp)); + ASSERT(pp->p_vnode != NULL); + ASSERT(!IS_SWAPFSVP(pp->p_vnode)); + ASSERT(pp->p_vnode != &kvp); + +again: + if (pszc == 0) { + VM_STAT_ADD(pszclck_stat[0]); + return (NULL); + } + + /* The lock lives in the root page */ + + rootpp = PP_GROUPLEADER(pp, pszc); + mtx = PAGE_SZC_MUTEX(rootpp); + mutex_enter(mtx); + + /* + * since p_szc can only decrease if pp == rootpp + * rootpp will be always the same i.e we have the right root + * regardless of rootpp->p_szc. + * If location of pp's root didn't change after we took + * the lock we have the right root. return mutex hashed off it. 
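 *
 * "Hashed off it" is the same pattern used for pse_mutex[] and pszc_mutex[]
 * above: a fixed, power-of-two sized array of locks indexed by a hash of
 * the pointer. A stand-alone sketch of that slot selection (illustration
 * only, hypothetical table size and shift values):
 *
 *     #include <stdint.h>
 *
 *     #define TBL_SHIFT  7
 *     #define TBL_SIZE   128           // must be a power of two
 *
 *     // pick a slot in a TBL_SIZE-entry lock table for pointer p
 *     static unsigned
 *     hashed_slot(const void *p)
 *     {
 *         uintptr_t a = (uintptr_t)p;
 *
 *         return (unsigned)(((a >> TBL_SHIFT) ^
 *             (a >> (2 * TBL_SHIFT))) & (TBL_SIZE - 1));
 *     }
 *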
+ */ + if (pp == rootpp || (rszc = rootpp->p_szc) == pszc) { + VM_STAT_ADD(pszclck_stat[1]); + return (mtx); + } + + /* + * root location changed because page got demoted. + * locate the new root. + */ + if (rszc < pszc) { + szc = pp->p_szc; + ASSERT(szc < pszc); + mutex_exit(mtx); + pszc = szc; + VM_STAT_ADD(pszclck_stat[2]); + goto again; + } + + VM_STAT_ADD(pszclck_stat[3]); + /* + * current hat_page_demote not done yet. + * wait for it to finish. + */ + mutex_exit(mtx); + rootpp = PP_GROUPLEADER(rootpp, rszc); + mtx = PAGE_SZC_MUTEX(rootpp); + mutex_enter(mtx); + mutex_exit(mtx); + ASSERT(rootpp->p_szc < rszc); + goto again; +} + +int +page_szc_lock_assert(page_t *pp) +{ + page_t *rootpp = PP_PAGEROOT(pp); + kmutex_t *mtx = PAGE_SZC_MUTEX(rootpp); + + return (MUTEX_HELD(mtx)); +} diff --git a/usr/src/uts/common/vm/pvn.h b/usr/src/uts/common/vm/pvn.h new file mode 100644 index 0000000000..0467589ae6 --- /dev/null +++ b/usr/src/uts/common/vm/pvn.h @@ -0,0 +1,117 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2002 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#ifndef _VM_PVN_H +#define _VM_PVN_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/buf.h> +#include <vm/seg.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +/* + * VM - paged vnode. + * + * The VM system manages memory as a cache of paged vnodes. + * This file desribes the interfaces to common subroutines + * used to help implement the VM/file system routines. 
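 *
 * One detail worth calling out: the PVN_GETPAGE_SZ/PVN_GETPAGE_NUM limits
 * defined later in this header take whichever of a byte cap and a
 * page-count cap is smaller. The same choice written as ordinary run-time
 * C (illustration only, with an assumed 8K page size):
 *
 *     #include <stdio.h>
 *     #include <stddef.h>
 *
 *     #define MAX_GETPAGE_SZ   0x10000UL   // byte cap: 64K
 *     #define MAX_GETPAGE_NUM  8UL         // page-count cap
 *
 *     int
 *     main(void)
 *     {
 *         size_t pagesize = 8192;          // assumption for the example
 *         size_t bycount = MAX_GETPAGE_NUM * pagesize;
 *         size_t chunk = (bycount < MAX_GETPAGE_SZ) ?
 *             bycount : MAX_GETPAGE_SZ;
 *
 *         printf("kluster up to %zu bytes (%zu pages)\n",
 *             chunk, chunk / pagesize);
 *         return (0);
 *     }
 *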
+ */ + +struct page *pvn_read_kluster(struct vnode *vp, u_offset_t off, + struct seg *seg, caddr_t addr, u_offset_t *offp, + size_t *lenp, u_offset_t vp_off, size_t vp_len, + int isra); +struct page *pvn_write_kluster(struct vnode *vp, struct page *pp, + u_offset_t *offp, size_t *lenp, u_offset_t vp_off, + size_t vp_len, int flags); +void pvn_read_done(struct page *plist, int flags); +void pvn_write_done(struct page *plist, int flags); +void pvn_io_done(struct page *plist); +int pvn_vplist_dirty(struct vnode *vp, u_offset_t off, + int (*putapage)(vnode_t *, struct page *, u_offset_t *, + size_t *, int, cred_t *), + int flags, struct cred *cred); +int pvn_getdirty(struct page *pp, int flags); +void pvn_vpzero(struct vnode *vp, u_offset_t vplen, size_t zbytes); +int pvn_getpages( + int (*getpage)(vnode_t *, u_offset_t, size_t, uint_t *, + struct page *[], size_t, struct seg *, + caddr_t, enum seg_rw, cred_t *), + struct vnode *vp, u_offset_t off, size_t len, + uint_t *protp, struct page **pl, size_t plsz, + struct seg *seg, caddr_t addr, enum seg_rw rw, + struct cred *cred); +void pvn_plist_init(struct page *pp, struct page **pl, size_t plsz, + u_offset_t off, size_t io_len, enum seg_rw rw); +void pvn_init(void); + +/* + * When requesting pages from the getpage routines, pvn_getpages will + * allocate space to return PVN_GETPAGE_NUM pages which map PVN_GETPAGE_SZ + * worth of bytes. These numbers are chosen to be the minimum of the max's + * given in terms of bytes and pages. + */ +#define PVN_MAX_GETPAGE_SZ 0x10000 /* getpage size limit */ +#define PVN_MAX_GETPAGE_NUM 0x8 /* getpage page limit */ + +#if PVN_MAX_GETPAGE_SZ > PVN_MAX_GETPAGE_NUM * PAGESIZE + +#define PVN_GETPAGE_SZ ptob(PVN_MAX_GETPAGE_NUM) +#define PVN_GETPAGE_NUM PVN_MAX_GETPAGE_NUM + +#else + +#define PVN_GETPAGE_SZ PVN_MAX_GETPAGE_SZ +#define PVN_GETPAGE_NUM btop(PVN_MAX_GETPAGE_SZ) + +#endif + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_PVN_H */ diff --git a/usr/src/uts/common/vm/rm.h b/usr/src/uts/common/vm/rm.h new file mode 100644 index 0000000000..9789283993 --- /dev/null +++ b/usr/src/uts/common/vm/rm.h @@ -0,0 +1,61 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2001 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1988 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. 
+ */ + +#ifndef _VM_RM_H +#define _VM_RM_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +extern size_t rm_asrss(struct as *); +extern size_t rm_assize(struct as *); +extern ushort_t rm_pctmemory(struct as *); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_RM_H */ diff --git a/usr/src/uts/common/vm/seg.h b/usr/src/uts/common/vm/seg.h new file mode 100644 index 0000000000..2ada345960 --- /dev/null +++ b/usr/src/uts/common/vm/seg.h @@ -0,0 +1,252 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#ifndef _VM_SEG_H +#define _VM_SEG_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/vnode.h> +#include <sys/avl.h> +#include <vm/seg_enum.h> +#include <vm/faultcode.h> +#include <vm/hat.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * VM - Segments. + */ + +/* + * kstat statistics for segment advise + */ +typedef struct { + kstat_named_t MADV_FREE_hit; + kstat_named_t MADV_FREE_miss; +} segadvstat_t; + +/* + * memory object ids + */ +typedef struct memid { u_longlong_t val[2]; } memid_t; + +/* + * An address space contains a set of segments, managed by drivers. + * Drivers support mapped devices, sharing, copy-on-write, etc. + * + * The seg structure contains a lock to prevent races, the base virtual + * address and size of the segment, a back pointer to the containing + * address space, pointers to maintain an AVL tree of segments in the + * same address space, and procedure and data hooks for the driver. + * The AVL tree of segments for the address space is sorted by + * ascending base addresses and overlapping segments are not allowed. + * + * After a segment is created, faults may occur on pages of the segment. + * When a fault occurs, the fault handling code must get the desired + * object and set up the hardware translation to the object. For some + * objects, the fault handling code also implements copy-on-write. + * + * When the hat wants to unload a translation, it can call the unload + * routine which is responsible for processing reference and modify bits. + * + * Each segment is protected by it's containing address space lock. 
To + * access any field in the segment structure, the "as" must be locked. + * If a segment field is to be modified, the address space lock must be + * write locked. + */ + +struct seg { + caddr_t s_base; /* base virtual address */ + size_t s_size; /* size in bytes */ + uint_t s_szc; /* max page size code */ + uint_t s_flags; /* flags for segment, see below */ + struct as *s_as; /* containing address space */ + avl_node_t s_tree; /* AVL tree links to segs in this as */ + struct seg_ops *s_ops; /* ops vector: see below */ + void *s_data; /* private data for instance */ +}; + +#define S_PURGE (0x01) /* seg should be purged in as_gap() */ + +struct seg_ops { + int (*dup)(struct seg *, struct seg *); + int (*unmap)(struct seg *, caddr_t, size_t); + void (*free)(struct seg *); + faultcode_t (*fault)(struct hat *, struct seg *, caddr_t, size_t, + enum fault_type, enum seg_rw); + faultcode_t (*faulta)(struct seg *, caddr_t); + int (*setprot)(struct seg *, caddr_t, size_t, uint_t); + int (*checkprot)(struct seg *, caddr_t, size_t, uint_t); + int (*kluster)(struct seg *, caddr_t, ssize_t); + size_t (*swapout)(struct seg *); + int (*sync)(struct seg *, caddr_t, size_t, int, uint_t); + size_t (*incore)(struct seg *, caddr_t, size_t, char *); + int (*lockop)(struct seg *, caddr_t, size_t, int, int, ulong_t *, + size_t); + int (*getprot)(struct seg *, caddr_t, size_t, uint_t *); + u_offset_t (*getoffset)(struct seg *, caddr_t); + int (*gettype)(struct seg *, caddr_t); + int (*getvp)(struct seg *, caddr_t, struct vnode **); + int (*advise)(struct seg *, caddr_t, size_t, uint_t); + void (*dump)(struct seg *); + int (*pagelock)(struct seg *, caddr_t, size_t, struct page ***, + enum lock_type, enum seg_rw); + int (*setpagesize)(struct seg *, caddr_t, size_t, uint_t); + int (*getmemid)(struct seg *, caddr_t, memid_t *); + struct lgrp_mem_policy_info *(*getpolicy)(struct seg *, caddr_t); +}; + +#ifdef _KERNEL +/* + * Generic segment operations + */ +extern void seg_init(void); +extern struct seg *seg_alloc(struct as *as, caddr_t base, size_t size); +extern int seg_attach(struct as *as, caddr_t base, size_t size, + struct seg *seg); +extern void seg_unmap(struct seg *seg); +extern void seg_free(struct seg *seg); + +/* + * functions for pagelock cache support + */ +extern void seg_ppurge(struct seg *seg); +extern void seg_ppurge_seg(int (*callback)()); +extern void seg_pinactive(struct seg *seg, caddr_t addr, size_t len, + struct page **pp, enum seg_rw rw, int (*callback)()); +extern int seg_pinsert_check(struct seg *seg, size_t len, uint_t flags); +extern int seg_pinsert(struct seg *seg, caddr_t addr, size_t len, + struct page **pp, enum seg_rw rw, uint_t flags, + int (*callback)()); +extern struct page **seg_plookup(struct seg *seg, caddr_t addr, + size_t len, enum seg_rw rw); +extern void seg_pasync_thread(void); +extern void seg_preap(void); + +extern int seg_preapahead; +extern segadvstat_t segadvstat; +/* + * Flags for pagelock cache support + */ +#define SEGP_ASYNC_FLUSH 0x1 /* flushed by async thread */ +#define SEGP_FORCE_WIRED 0x2 /* skip check against seg_pwindow */ + +/* + * Return values for seg_pinsert and seg_pinsert_check functions. 
+ */ +#define SEGP_SUCCESS 0 /* seg_pinsert() succeeded */ +#define SEGP_FAIL 1 /* seg_pinsert() failed */ + +/* Page status bits for segop_incore */ +#define SEG_PAGE_INCORE 0x01 /* VA has a page backing it */ +#define SEG_PAGE_LOCKED 0x02 /* VA has a page that is locked */ +#define SEG_PAGE_HASCOW 0x04 /* VA has a page with a copy-on-write */ +#define SEG_PAGE_SOFTLOCK 0x08 /* VA has a page with softlock held */ +#define SEG_PAGE_VNODEBACKED 0x10 /* Segment is backed by a vnode */ +#define SEG_PAGE_ANON 0x20 /* VA has an anonymous page */ +#define SEG_PAGE_VNODE 0x40 /* VA has a vnode page backing it */ + +#define SEGOP_DUP(s, n) (*(s)->s_ops->dup)((s), (n)) +#define SEGOP_UNMAP(s, a, l) (*(s)->s_ops->unmap)((s), (a), (l)) +#define SEGOP_FREE(s) (*(s)->s_ops->free)((s)) +#define SEGOP_FAULT(h, s, a, l, t, rw) \ + (*(s)->s_ops->fault)((h), (s), (a), (l), (t), (rw)) +#define SEGOP_FAULTA(s, a) (*(s)->s_ops->faulta)((s), (a)) +#define SEGOP_SETPROT(s, a, l, p) (*(s)->s_ops->setprot)((s), (a), (l), (p)) +#define SEGOP_CHECKPROT(s, a, l, p) (*(s)->s_ops->checkprot)((s), (a), (l), (p)) +#define SEGOP_KLUSTER(s, a, d) (*(s)->s_ops->kluster)((s), (a), (d)) +#define SEGOP_SWAPOUT(s) (*(s)->s_ops->swapout)((s)) +#define SEGOP_SYNC(s, a, l, atr, f) \ + (*(s)->s_ops->sync)((s), (a), (l), (atr), (f)) +#define SEGOP_INCORE(s, a, l, v) (*(s)->s_ops->incore)((s), (a), (l), (v)) +#define SEGOP_LOCKOP(s, a, l, atr, op, b, p) \ + (*(s)->s_ops->lockop)((s), (a), (l), (atr), (op), (b), (p)) +#define SEGOP_GETPROT(s, a, l, p) (*(s)->s_ops->getprot)((s), (a), (l), (p)) +#define SEGOP_GETOFFSET(s, a) (*(s)->s_ops->getoffset)((s), (a)) +#define SEGOP_GETTYPE(s, a) (*(s)->s_ops->gettype)((s), (a)) +#define SEGOP_GETVP(s, a, vpp) (*(s)->s_ops->getvp)((s), (a), (vpp)) +#define SEGOP_ADVISE(s, a, l, b) (*(s)->s_ops->advise)((s), (a), (l), (b)) +#define SEGOP_DUMP(s) (*(s)->s_ops->dump)((s)) +#define SEGOP_PAGELOCK(s, a, l, p, t, rw) \ + (*(s)->s_ops->pagelock)((s), (a), (l), (p), (t), (rw)) +#define SEGOP_SETPAGESIZE(s, a, l, szc) \ + (*(s)->s_ops->setpagesize)((s), (a), (l), (szc)) +#define SEGOP_GETMEMID(s, a, mp) (*(s)->s_ops->getmemid)((s), (a), (mp)) +#define SEGOP_GETPOLICY(s, a) (*(s)->s_ops->getpolicy)((s), (a)) + +#define seg_page(seg, addr) \ + (((uintptr_t)((addr) - (seg)->s_base)) >> PAGESHIFT) + +#define seg_pages(seg) \ + (((uintptr_t)((seg)->s_size + PAGEOFFSET)) >> PAGESHIFT) + +#define IE_NOMEM -1 /* internal to seg layer */ +#define IE_RETRY -2 /* internal to seg layer */ +#define IE_REATTACH -3 /* internal to seg layer */ + +/* Delay/retry factors for seg_p_mem_config_pre_del */ +#define SEGP_PREDEL_DELAY_FACTOR 4 +/* + * As a workaround to being unable to purge the pagelock + * cache during a DR delete memory operation, we use + * a stall threshold that is twice the maximum seen + * during testing. This workaround will be removed + * when a suitable fix is found. 
+ */ +#define SEGP_STALL_SECONDS 25 +#define SEGP_STALL_THRESHOLD \ + (SEGP_STALL_SECONDS * SEGP_PREDEL_DELAY_FACTOR) + +#ifdef VMDEBUG + +uint_t seg_page(struct seg *, caddr_t); +uint_t seg_pages(struct seg *); + +#endif /* VMDEBUG */ + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_SEG_H */ diff --git a/usr/src/uts/common/vm/seg_dev.c b/usr/src/uts/common/vm/seg_dev.c new file mode 100644 index 0000000000..9b3733871f --- /dev/null +++ b/usr/src/uts/common/vm/seg_dev.c @@ -0,0 +1,4073 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * VM - segment of a mapped device. + * + * This segment driver is used when mapping character special devices. + */ + +#include <sys/types.h> +#include <sys/t_lock.h> +#include <sys/sysmacros.h> +#include <sys/vtrace.h> +#include <sys/systm.h> +#include <sys/vmsystm.h> +#include <sys/mman.h> +#include <sys/errno.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> +#include <sys/vnode.h> +#include <sys/proc.h> +#include <sys/conf.h> +#include <sys/debug.h> +#include <sys/ddidevmap.h> +#include <sys/lgrp.h> + +#include <vm/page.h> +#include <vm/hat.h> +#include <vm/as.h> +#include <vm/seg.h> +#include <vm/seg_dev.h> +#include <vm/seg_kp.h> +#include <vm/seg_kmem.h> +#include <vm/vpage.h> + +#include <sys/sunddi.h> +#include <sys/esunddi.h> +#include <sys/fs/snode.h> + +#if DEBUG +int segdev_debug; +#define DEBUGF(level, args) { if (segdev_debug >= (level)) cmn_err args; } +#else +#define DEBUGF(level, args) +#endif + +/* Default timeout for devmap context management */ +#define CTX_TIMEOUT_VALUE 0 + +#define HOLD_DHP_LOCK(dhp) if (dhp->dh_flags & DEVMAP_ALLOW_REMAP) \ + { mutex_enter(&dhp->dh_lock); } + +#define RELE_DHP_LOCK(dhp) if (dhp->dh_flags & DEVMAP_ALLOW_REMAP) \ + { mutex_exit(&dhp->dh_lock); } + +#define round_down_p2(a, s) ((a) & ~((s) - 1)) +#define round_up_p2(a, s) (((a) + (s) - 1) & ~((s) - 1)) + +/* + * VA_PA_ALIGNED checks to see if both VA and PA are on pgsize boundary + * VA_PA_PGSIZE_ALIGNED check to see if VA is aligned with PA w.r.t. 
pgsize + */ +#define VA_PA_ALIGNED(uvaddr, paddr, pgsize) \ + (((uvaddr | paddr) & (pgsize - 1)) == 0) +#define VA_PA_PGSIZE_ALIGNED(uvaddr, paddr, pgsize) \ + (((uvaddr ^ paddr) & (pgsize - 1)) == 0) + +#define vpgtob(n) ((n) * sizeof (struct vpage)) /* For brevity */ + +#define VTOCVP(vp) (VTOS(vp)->s_commonvp) /* we "know" it's an snode */ + +static struct devmap_ctx *devmapctx_list = NULL; +static struct devmap_softlock *devmap_slist = NULL; + +/* + * mutex, vnode and page for the page of zeros we use for the trash mappings. + * One trash page is allocated on the first ddi_umem_setup call that uses it + * XXX Eventually, we may want to combine this with what segnf does when all + * hat layers implement HAT_NOFAULT. + * + * The trash page is used when the backing store for a userland mapping is + * removed but the application semantics do not take kindly to a SIGBUS. + * In that scenario, the applications pages are mapped to some dummy page + * which returns garbage on read and writes go into a common place. + * (Perfect for NO_FAULT semantics) + * The device driver is responsible to communicating to the app with some + * other mechanism that such remapping has happened and the app should take + * corrective action. + * We can also use an anonymous memory page as there is no requirement to + * keep the page locked, however this complicates the fault code. RFE. + */ +static struct vnode trashvp; +static struct page *trashpp; + +/* Non-pageable kernel memory is allocated from the umem_np_arena. */ +static vmem_t *umem_np_arena; + +/* Set the cookie to a value we know will never be a valid umem_cookie */ +#define DEVMAP_DEVMEM_COOKIE ((ddi_umem_cookie_t)0x1) + +/* + * Macros to check if type of devmap handle + */ +#define cookie_is_devmem(c) \ + ((c) == (struct ddi_umem_cookie *)DEVMAP_DEVMEM_COOKIE) + +#define cookie_is_pmem(c) \ + ((c) == (struct ddi_umem_cookie *)DEVMAP_PMEM_COOKIE) + +#define cookie_is_kpmem(c) (!cookie_is_devmem(c) && !cookie_is_pmem(c) &&\ + ((c)->type == KMEM_PAGEABLE)) + +#define dhp_is_devmem(dhp) \ + (cookie_is_devmem((struct ddi_umem_cookie *)((dhp)->dh_cookie))) + +#define dhp_is_pmem(dhp) \ + (cookie_is_pmem((struct ddi_umem_cookie *)((dhp)->dh_cookie))) + +#define dhp_is_kpmem(dhp) \ + (cookie_is_kpmem((struct ddi_umem_cookie *)((dhp)->dh_cookie))) + +/* + * Private seg op routines. 
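 *
 * These routines are wired into the segdev_ops vector declared just below;
 * the generic segment layer never calls them directly, it always dispatches
 * through the s_ops pointer (see the SEGOP_* macros in seg.h). The shape of
 * that dispatch, reduced to a stand-alone user-level sketch (illustration
 * only, hypothetical names and a single op):
 *
 *     #include <stdio.h>
 *
 *     struct xseg;
 *     struct xseg_ops { int (*fault)(struct xseg *, unsigned long); };
 *     struct xseg { struct xseg_ops *s_ops; const char *name; };
 *
 *     #define XSEGOP_FAULT(s, a)  ((*(s)->s_ops->fault)((s), (a)))
 *
 *     static int
 *     xsegdev_fault(struct xseg *seg, unsigned long addr)
 *     {
 *         printf("%s: fault at %#lx\n", seg->name, addr);
 *         return (0);
 *     }
 *
 *     static struct xseg_ops xsegdev_ops = { xsegdev_fault };
 *
 *     int
 *     main(void)
 *     {
 *         struct xseg seg = { &xsegdev_ops, "xsegdev" };
 *
 *         return (XSEGOP_FAULT(&seg, 0x1000UL));
 *     }
 *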
+ */ +static int segdev_dup(struct seg *, struct seg *); +static int segdev_unmap(struct seg *, caddr_t, size_t); +static void segdev_free(struct seg *); +static faultcode_t segdev_fault(struct hat *, struct seg *, caddr_t, size_t, + enum fault_type, enum seg_rw); +static faultcode_t segdev_faulta(struct seg *, caddr_t); +static int segdev_setprot(struct seg *, caddr_t, size_t, uint_t); +static int segdev_checkprot(struct seg *, caddr_t, size_t, uint_t); +static void segdev_badop(void); +static int segdev_sync(struct seg *, caddr_t, size_t, int, uint_t); +static size_t segdev_incore(struct seg *, caddr_t, size_t, char *); +static int segdev_lockop(struct seg *, caddr_t, size_t, int, int, + ulong_t *, size_t); +static int segdev_getprot(struct seg *, caddr_t, size_t, uint_t *); +static u_offset_t segdev_getoffset(struct seg *, caddr_t); +static int segdev_gettype(struct seg *, caddr_t); +static int segdev_getvp(struct seg *, caddr_t, struct vnode **); +static int segdev_advise(struct seg *, caddr_t, size_t, uint_t); +static void segdev_dump(struct seg *); +static int segdev_pagelock(struct seg *, caddr_t, size_t, + struct page ***, enum lock_type, enum seg_rw); +static int segdev_setpagesize(struct seg *, caddr_t, size_t, uint_t); +static int segdev_getmemid(struct seg *, caddr_t, memid_t *); +static lgrp_mem_policy_info_t *segdev_getpolicy(struct seg *, caddr_t); + +/* + * XXX this struct is used by rootnex_map_fault to identify + * the segment it has been passed. So if you make it + * "static" you'll need to fix rootnex_map_fault. + */ +struct seg_ops segdev_ops = { + segdev_dup, + segdev_unmap, + segdev_free, + segdev_fault, + segdev_faulta, + segdev_setprot, + segdev_checkprot, + (int (*)())segdev_badop, /* kluster */ + (size_t (*)(struct seg *))NULL, /* swapout */ + segdev_sync, /* sync */ + segdev_incore, + segdev_lockop, /* lockop */ + segdev_getprot, + segdev_getoffset, + segdev_gettype, + segdev_getvp, + segdev_advise, + segdev_dump, + segdev_pagelock, + segdev_setpagesize, + segdev_getmemid, + segdev_getpolicy, +}; + +/* + * Private segdev support routines + */ +static struct segdev_data *sdp_alloc(void); + +static void segdev_softunlock(struct hat *, struct seg *, caddr_t, + size_t, enum seg_rw); + +static faultcode_t segdev_faultpage(struct hat *, struct seg *, caddr_t, + struct vpage *, enum fault_type, enum seg_rw, devmap_handle_t *); + +static faultcode_t segdev_faultpages(struct hat *, struct seg *, caddr_t, + size_t, enum fault_type, enum seg_rw, devmap_handle_t *); + +static struct devmap_ctx *devmap_ctxinit(dev_t, ulong_t); +static struct devmap_softlock *devmap_softlock_init(dev_t, ulong_t); +static void devmap_softlock_rele(devmap_handle_t *); +static void devmap_ctx_rele(devmap_handle_t *); + +static void devmap_ctxto(void *); + +static devmap_handle_t *devmap_find_handle(devmap_handle_t *dhp_head, + caddr_t addr); + +static ulong_t devmap_roundup(devmap_handle_t *dhp, ulong_t offset, size_t len, + ulong_t *opfn, ulong_t *pagesize); + +static void free_devmap_handle(devmap_handle_t *dhp); + +static int devmap_handle_dup(devmap_handle_t *dhp, devmap_handle_t **new_dhp, + struct seg *newseg); + +static devmap_handle_t *devmap_handle_unmap(devmap_handle_t *dhp); + +static void devmap_handle_unmap_head(devmap_handle_t *dhp, size_t len); + +static void devmap_handle_unmap_tail(devmap_handle_t *dhp, caddr_t addr); + +static int devmap_device(devmap_handle_t *dhp, struct as *as, caddr_t *addr, + offset_t off, size_t len, uint_t flags); + +static void 
devmap_get_large_pgsize(devmap_handle_t *dhp, size_t len, + caddr_t addr, size_t *llen, caddr_t *laddr); + +static void devmap_handle_reduce_len(devmap_handle_t *dhp, size_t len); + +static void *devmap_alloc_pages(vmem_t *vmp, size_t size, int vmflag); +static void devmap_free_pages(vmem_t *vmp, void *inaddr, size_t size); + +static void *devmap_umem_alloc_np(size_t size, size_t flags); +static void devmap_umem_free_np(void *addr, size_t size); + +/* + * routines to lock and unlock underlying segkp segment for + * KMEM_PAGEABLE type cookies. + */ +static faultcode_t acquire_kpmem_lock(struct ddi_umem_cookie *, size_t); +static void release_kpmem_lock(struct ddi_umem_cookie *, size_t); + +/* + * Routines to synchronize F_SOFTLOCK and F_INVAL faults for + * drivers with devmap_access callbacks + */ +static int devmap_softlock_enter(struct devmap_softlock *, size_t, + enum fault_type); +static void devmap_softlock_exit(struct devmap_softlock *, size_t, + enum fault_type); + +static kmutex_t devmapctx_lock; + +static kmutex_t devmap_slock; + +/* + * Initialize the thread callbacks and thread private data. + */ +static struct devmap_ctx * +devmap_ctxinit(dev_t dev, ulong_t id) +{ + struct devmap_ctx *devctx; + struct devmap_ctx *tmp; + dev_info_t *dip; + + tmp = kmem_zalloc(sizeof (struct devmap_ctx), KM_SLEEP); + + mutex_enter(&devmapctx_lock); + + dip = e_ddi_hold_devi_by_dev(dev, 0); + ASSERT(dip != NULL); + ddi_release_devi(dip); + + for (devctx = devmapctx_list; devctx != NULL; devctx = devctx->next) + if ((devctx->dip == dip) && (devctx->id == id)) + break; + + if (devctx == NULL) { + devctx = tmp; + devctx->dip = dip; + devctx->id = id; + mutex_init(&devctx->lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&devctx->cv, NULL, CV_DEFAULT, NULL); + devctx->next = devmapctx_list; + devmapctx_list = devctx; + } else + kmem_free(tmp, sizeof (struct devmap_ctx)); + + mutex_enter(&devctx->lock); + devctx->refcnt++; + mutex_exit(&devctx->lock); + mutex_exit(&devmapctx_lock); + + return (devctx); +} + +/* + * Timeout callback called if a CPU has not given up the device context + * within dhp->dh_timeout_length ticks + */ +static void +devmap_ctxto(void *data) +{ + struct devmap_ctx *devctx = data; + + TRACE_1(TR_FAC_DEVMAP, TR_DEVMAP_CTXTO, + "devmap_ctxto:timeout expired, devctx=%p", (void *)devctx); + mutex_enter(&devctx->lock); + /* + * Set oncpu = 0 so the next mapping trying to get the device context + * can. + */ + devctx->oncpu = 0; + devctx->timeout = 0; + cv_signal(&devctx->cv); + mutex_exit(&devctx->lock); +} + +/* + * Create a device segment. + */ +int +segdev_create(struct seg *seg, void *argsp) +{ + struct segdev_data *sdp; + struct segdev_crargs *a = (struct segdev_crargs *)argsp; + devmap_handle_t *dhp = (devmap_handle_t *)a->devmap_data; + int error; + + /* + * Since the address space is "write" locked, we + * don't need the segment lock to protect "segdev" data. 
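 *
 * The same idea in a user-level sketch (illustration only, hypothetical
 * names; a flag stands in for AS_WRITE_HELD): when every mutation of
 * per-segment data happens with the containing structure's rwlock held for
 * writing, the per-segment fields need no lock of their own.
 *
 *     #include <pthread.h>
 *     #include <assert.h>
 *
 *     struct xas {
 *         pthread_rwlock_t a_lock;
 *         int a_wrheld;                // set only while write-locked
 *     };
 *     struct xsegdata { int prot; int maxprot; };
 *
 *     static void
 *     xseg_setup(struct xas *as, struct xsegdata *sdp, int prot)
 *     {
 *         assert(as->a_wrheld);        // caller holds "as" as writer
 *         sdp->prot = prot;            // no per-segment lock needed
 *         sdp->maxprot = prot;
 *     }
 *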
+ */ + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); + + hat_map(seg->s_as->a_hat, seg->s_base, seg->s_size, HAT_MAP); + + sdp = sdp_alloc(); + + sdp->mapfunc = a->mapfunc; + sdp->offset = a->offset; + sdp->prot = a->prot; + sdp->maxprot = a->maxprot; + sdp->type = a->type; + sdp->pageprot = 0; + sdp->softlockcnt = 0; + sdp->vpage = NULL; + + if (sdp->mapfunc == NULL) + sdp->devmap_data = dhp; + else + sdp->devmap_data = dhp = NULL; + + sdp->hat_flags = a->hat_flags; + sdp->hat_attr = a->hat_attr; + + /* + * Currently, hat_flags supports only HAT_LOAD_NOCONSIST + */ + ASSERT(!(sdp->hat_flags & ~HAT_LOAD_NOCONSIST)); + + /* + * Hold shadow vnode -- segdev only deals with + * character (VCHR) devices. We use the common + * vp to hang pages on. + */ + sdp->vp = specfind(a->dev, VCHR); + ASSERT(sdp->vp != NULL); + + seg->s_ops = &segdev_ops; + seg->s_data = sdp; + + while (dhp != NULL) { + dhp->dh_seg = seg; + dhp = dhp->dh_next; + } + + /* + * Inform the vnode of the new mapping. + */ + /* + * It is ok to use pass sdp->maxprot to ADDMAP rather than to use + * dhp specific maxprot because spec_addmap does not use maxprot. + */ + error = VOP_ADDMAP(VTOCVP(sdp->vp), sdp->offset, + seg->s_as, seg->s_base, seg->s_size, + sdp->prot, sdp->maxprot, sdp->type, CRED()); + + if (error != 0) { + sdp->devmap_data = NULL; + hat_unload(seg->s_as->a_hat, seg->s_base, seg->s_size, + HAT_UNLOAD_UNMAP); + } + + return (error); +} + +static struct segdev_data * +sdp_alloc(void) +{ + struct segdev_data *sdp; + + sdp = kmem_zalloc(sizeof (struct segdev_data), KM_SLEEP); + mutex_init(&sdp->lock, NULL, MUTEX_DEFAULT, NULL); + + return (sdp); +} + +/* + * Duplicate seg and return new segment in newseg. + */ +static int +segdev_dup(struct seg *seg, struct seg *newseg) +{ + struct segdev_data *sdp = (struct segdev_data *)seg->s_data; + struct segdev_data *newsdp; + devmap_handle_t *dhp = (devmap_handle_t *)sdp->devmap_data; + size_t npages; + int ret; + + TRACE_2(TR_FAC_DEVMAP, TR_DEVMAP_DUP, + "segdev_dup:start dhp=%p, seg=%p", (void *)dhp, (void *)seg); + + DEBUGF(3, (CE_CONT, "segdev_dup: dhp %p seg %p\n", + (void *)dhp, (void *)seg)); + + /* + * Since the address space is "write" locked, we + * don't need the segment lock to protect "segdev" data. + */ + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); + + newsdp = sdp_alloc(); + + newseg->s_ops = seg->s_ops; + newseg->s_data = (void *)newsdp; + + VN_HOLD(sdp->vp); + newsdp->vp = sdp->vp; + newsdp->mapfunc = sdp->mapfunc; + newsdp->offset = sdp->offset; + newsdp->pageprot = sdp->pageprot; + newsdp->prot = sdp->prot; + newsdp->maxprot = sdp->maxprot; + newsdp->type = sdp->type; + newsdp->hat_attr = sdp->hat_attr; + newsdp->hat_flags = sdp->hat_flags; + newsdp->softlockcnt = 0; + + /* + * Initialize per page data if the segment we are + * dup'ing has per page information. 
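 *
 * The vpage copy that follows is the usual "duplicate an optional per-page
 * array" step. As a stand-alone sketch (illustration only, user-level
 * allocation in place of kmem_zalloc):
 *
 *     #include <stdlib.h>
 *     #include <string.h>
 *
 *     struct xvpage { unsigned char prot; };
 *
 *     // returns NULL when the source segment had no per-page data
 *     static struct xvpage *
 *     dup_vpage(const struct xvpage *src, size_t npages)
 *     {
 *         struct xvpage *dst;
 *
 *         if (src == NULL)
 *             return (NULL);
 *         dst = calloc(npages, sizeof (struct xvpage));
 *         if (dst != NULL)
 *             memcpy(dst, src, npages * sizeof (struct xvpage));
 *         return (dst);
 *     }
 *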
+ */ + npages = seg_pages(newseg); + + if (sdp->vpage != NULL) { + size_t nbytes = vpgtob(npages); + + newsdp->vpage = kmem_zalloc(nbytes, KM_SLEEP); + bcopy(sdp->vpage, newsdp->vpage, nbytes); + } else + newsdp->vpage = NULL; + + /* + * duplicate devmap handles + */ + if (dhp != NULL) { + ret = devmap_handle_dup(dhp, + (devmap_handle_t **)&newsdp->devmap_data, newseg); + if (ret != 0) { + TRACE_3(TR_FAC_DEVMAP, TR_DEVMAP_DUP_CK1, + "segdev_dup:ret1 ret=%x, dhp=%p seg=%p", + ret, (void *)dhp, (void *)seg); + DEBUGF(1, (CE_CONT, + "segdev_dup: ret %x dhp %p seg %p\n", + ret, (void *)dhp, (void *)seg)); + return (ret); + } + } + + /* + * Inform the common vnode of the new mapping. + */ + return (VOP_ADDMAP(VTOCVP(newsdp->vp), + newsdp->offset, newseg->s_as, + newseg->s_base, newseg->s_size, newsdp->prot, + newsdp->maxprot, sdp->type, CRED())); +} + +/* + * duplicate devmap handles + */ +static int +devmap_handle_dup(devmap_handle_t *dhp, devmap_handle_t **new_dhp, + struct seg *newseg) +{ + devmap_handle_t *newdhp_save = NULL; + devmap_handle_t *newdhp = NULL; + struct devmap_callback_ctl *callbackops; + + while (dhp != NULL) { + newdhp = kmem_alloc(sizeof (devmap_handle_t), KM_SLEEP); + + /* Need to lock the original dhp while copying if REMAP */ + HOLD_DHP_LOCK(dhp); + bcopy(dhp, newdhp, sizeof (devmap_handle_t)); + RELE_DHP_LOCK(dhp); + newdhp->dh_seg = newseg; + newdhp->dh_next = NULL; + if (newdhp_save != NULL) + newdhp_save->dh_next = newdhp; + else + *new_dhp = newdhp; + newdhp_save = newdhp; + + callbackops = &newdhp->dh_callbackops; + + if (dhp->dh_softlock != NULL) + newdhp->dh_softlock = devmap_softlock_init( + newdhp->dh_dev, + (ulong_t)callbackops->devmap_access); + if (dhp->dh_ctx != NULL) + newdhp->dh_ctx = devmap_ctxinit(newdhp->dh_dev, + (ulong_t)callbackops->devmap_access); + + /* + * Initialize dh_lock if we want to do remap. + */ + if (newdhp->dh_flags & DEVMAP_ALLOW_REMAP) { + mutex_init(&newdhp->dh_lock, NULL, MUTEX_DEFAULT, NULL); + newdhp->dh_flags |= DEVMAP_LOCK_INITED; + } + + if (callbackops->devmap_dup != NULL) { + int ret; + + /* + * Call the dup callback so that the driver can + * duplicate its private data. + */ + ret = (*callbackops->devmap_dup)(dhp, dhp->dh_pvtp, + (devmap_cookie_t *)newdhp, &newdhp->dh_pvtp); + + if (ret != 0) { + /* + * We want to free up this segment as the driver + * has indicated that we can't dup it. But we + * don't want to call the drivers, devmap_unmap, + * callback function as the driver does not + * think this segment exists. The caller of + * devmap_dup will call seg_free on newseg + * as it was the caller that allocated the + * segment. + */ + DEBUGF(1, (CE_CONT, "devmap_handle_dup ERROR: " + "newdhp %p dhp %p\n", (void *)newdhp, + (void *)dhp)); + callbackops->devmap_unmap = NULL; + return (ret); + } + } + + dhp = dhp->dh_next; + } + + return (0); +} + +/* + * Split a segment at addr for length len. 
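 *
 * Ignoring the driver callbacks, the address arithmetic below has four
 * outcomes: the unmap covers the whole segment, trims its head, trims its
 * tail, or splits it in two. A stand-alone sketch of just that decision
 * (illustration only, plain integer types in place of caddr_t):
 *
 *     #include <stdio.h>
 *     #include <stdint.h>
 *     #include <stddef.h>
 *
 *     static void
 *     split_ranges(uintptr_t base, size_t size, uintptr_t addr, size_t len)
 *     {
 *         uintptr_t end = base + size, uend = addr + len;
 *
 *         if (addr == base && uend == end)
 *             printf("whole segment goes away\n");
 *         else if (addr == base)
 *             printf("head trim: [%#lx, %#lx) remains\n",
 *                 (unsigned long)uend, (unsigned long)end);
 *         else if (uend == end)
 *             printf("tail trim: [%#lx, %#lx) remains\n",
 *                 (unsigned long)base, (unsigned long)addr);
 *         else
 *             printf("split: [%#lx, %#lx) and [%#lx, %#lx)\n",
 *                 (unsigned long)base, (unsigned long)addr,
 *                 (unsigned long)uend, (unsigned long)end);
 *     }
 *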
+ */ +/*ARGSUSED*/ +static int +segdev_unmap(struct seg *seg, caddr_t addr, size_t len) +{ + register struct segdev_data *sdp = (struct segdev_data *)seg->s_data; + register struct segdev_data *nsdp; + register struct seg *nseg; + register size_t opages; /* old segment size in pages */ + register size_t npages; /* new segment size in pages */ + register size_t dpages; /* pages being deleted (unmapped) */ + register size_t nbytes; + devmap_handle_t *dhp = (devmap_handle_t *)sdp->devmap_data; + devmap_handle_t *dhpp; + devmap_handle_t *newdhp; + struct devmap_callback_ctl *callbackops; + caddr_t nbase; + offset_t off; + ulong_t nsize; + size_t mlen, sz; + + TRACE_4(TR_FAC_DEVMAP, TR_DEVMAP_UNMAP, + "segdev_unmap:start dhp=%p, seg=%p addr=%p len=%lx", + (void *)dhp, (void *)seg, (void *)addr, len); + + DEBUGF(3, (CE_CONT, "segdev_unmap: dhp %p seg %p addr %p len %lx\n", + (void *)dhp, (void *)seg, (void *)addr, len)); + + /* + * Since the address space is "write" locked, we + * don't need the segment lock to protect "segdev" data. + */ + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); + + if ((sz = sdp->softlockcnt) > 0) { + /* + * Fail the unmap if pages are SOFTLOCKed through this mapping. + * softlockcnt is protected from change by the as write lock. + */ + TRACE_1(TR_FAC_DEVMAP, TR_DEVMAP_UNMAP_CK1, + "segdev_unmap:error softlockcnt = %ld", sz); + DEBUGF(1, (CE_CONT, "segdev_unmap: softlockcnt %ld\n", sz)); + return (EAGAIN); + } + + /* + * Check for bad sizes + */ + if (addr < seg->s_base || addr + len > seg->s_base + seg->s_size || + (len & PAGEOFFSET) || ((uintptr_t)addr & PAGEOFFSET)) + panic("segdev_unmap"); + + if (dhp != NULL) { + devmap_handle_t *tdhp; + /* + * If large page size was used in hat_devload(), + * the same page size must be used in hat_unload(). + */ + dhpp = tdhp = devmap_find_handle(dhp, addr); + while (tdhp != NULL) { + if (tdhp->dh_flags & DEVMAP_FLAG_LARGE) { + break; + } + tdhp = tdhp->dh_next; + } + if (tdhp != NULL) { /* found a dhp using large pages */ + size_t slen = len; + size_t mlen; + size_t soff; + + soff = (ulong_t)(addr - dhpp->dh_uvaddr); + while (slen != 0) { + mlen = MIN(slen, (dhpp->dh_len - soff)); + hat_unload(seg->s_as->a_hat, dhpp->dh_uvaddr, + dhpp->dh_len, HAT_UNLOAD_UNMAP); + dhpp = dhpp->dh_next; + ASSERT(slen >= mlen); + slen -= mlen; + soff = 0; + } + } else + hat_unload(seg->s_as->a_hat, addr, len, + HAT_UNLOAD_UNMAP); + } else { + /* + * Unload any hardware translations in the range + * to be taken out. + */ + hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD_UNMAP); + } + + /* + * get the user offset which will used in the driver callbacks + */ + off = sdp->offset + (offset_t)(addr - seg->s_base); + + /* + * Inform the vnode of the unmapping. 
+ */ + ASSERT(sdp->vp != NULL); + (void) VOP_DELMAP(VTOCVP(sdp->vp), off, seg->s_as, addr, len, + sdp->prot, sdp->maxprot, sdp->type, CRED()); + + /* + * Check for entire segment + */ + if (addr == seg->s_base && len == seg->s_size) { + seg_free(seg); + return (0); + } + + opages = seg_pages(seg); + dpages = btop(len); + npages = opages - dpages; + + /* + * Check for beginning of segment + */ + if (addr == seg->s_base) { + if (sdp->vpage != NULL) { + register struct vpage *ovpage; + + ovpage = sdp->vpage; /* keep pointer to vpage */ + + nbytes = vpgtob(npages); + sdp->vpage = kmem_alloc(nbytes, KM_SLEEP); + bcopy(&ovpage[dpages], sdp->vpage, nbytes); + + /* free up old vpage */ + kmem_free(ovpage, vpgtob(opages)); + } + + /* + * free devmap handles from the beginning of the mapping. + */ + if (dhp != NULL) + devmap_handle_unmap_head(dhp, len); + + sdp->offset += (offset_t)len; + + seg->s_base += len; + seg->s_size -= len; + + return (0); + } + + /* + * Check for end of segment + */ + if (addr + len == seg->s_base + seg->s_size) { + if (sdp->vpage != NULL) { + register struct vpage *ovpage; + + ovpage = sdp->vpage; /* keep pointer to vpage */ + + nbytes = vpgtob(npages); + sdp->vpage = kmem_alloc(nbytes, KM_SLEEP); + bcopy(ovpage, sdp->vpage, nbytes); + + /* free up old vpage */ + kmem_free(ovpage, vpgtob(opages)); + } + seg->s_size -= len; + + /* + * free devmap handles from addr to the end of the mapping. + */ + if (dhp != NULL) + devmap_handle_unmap_tail(dhp, addr); + + return (0); + } + + /* + * The section to go is in the middle of the segment, + * have to make it into two segments. nseg is made for + * the high end while seg is cut down at the low end. + */ + nbase = addr + len; /* new seg base */ + nsize = (seg->s_base + seg->s_size) - nbase; /* new seg size */ + seg->s_size = addr - seg->s_base; /* shrink old seg */ + nseg = seg_alloc(seg->s_as, nbase, nsize); + if (nseg == NULL) + panic("segdev_unmap seg_alloc"); + + TRACE_2(TR_FAC_DEVMAP, TR_DEVMAP_UNMAP_CK2, + "segdev_unmap: seg=%p nseg=%p", (void *)seg, (void *)nseg); + DEBUGF(3, (CE_CONT, "segdev_unmap: segdev_dup seg %p nseg %p\n", + (void *)seg, (void *)nseg)); + nsdp = sdp_alloc(); + + nseg->s_ops = seg->s_ops; + nseg->s_data = (void *)nsdp; + + VN_HOLD(sdp->vp); + nsdp->mapfunc = sdp->mapfunc; + nsdp->offset = sdp->offset + (offset_t)(nseg->s_base - seg->s_base); + nsdp->vp = sdp->vp; + nsdp->pageprot = sdp->pageprot; + nsdp->prot = sdp->prot; + nsdp->maxprot = sdp->maxprot; + nsdp->type = sdp->type; + nsdp->hat_attr = sdp->hat_attr; + nsdp->hat_flags = sdp->hat_flags; + nsdp->softlockcnt = 0; + + /* + * Initialize per page data if the segment we are + * dup'ing has per page information. + */ + if (sdp->vpage != NULL) { + /* need to split vpage into two arrays */ + register size_t nnbytes; + register size_t nnpages; + register struct vpage *ovpage; + + ovpage = sdp->vpage; /* keep pointer to vpage */ + + npages = seg_pages(seg); /* seg has shrunk */ + nbytes = vpgtob(npages); + nnpages = seg_pages(nseg); + nnbytes = vpgtob(nnpages); + + sdp->vpage = kmem_alloc(nbytes, KM_SLEEP); + bcopy(ovpage, sdp->vpage, nbytes); + + nsdp->vpage = kmem_alloc(nnbytes, KM_SLEEP); + bcopy(&ovpage[npages + dpages], nsdp->vpage, nnbytes); + + /* free up old vpage */ + kmem_free(ovpage, vpgtob(opages)); + } else + nsdp->vpage = NULL; + + /* + * unmap dhps. 
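 *
 * Each handle is an interval [dh_uvaddr, dh_uvaddr + dh_len) and the loop
 * that follows is an interval comparison against the unmap range
 * [addr, addr + len): a handle may lie entirely before or after it,
 * enclose it, be enclosed by it, or overlap only at its head or tail.
 * A stand-alone classifier (illustration only) makes the cases easier
 * to see:
 *
 *     #include <stdint.h>
 *     #include <stddef.h>
 *
 *     static const char *
 *     classify(uintptr_t h, size_t hlen, uintptr_t u, size_t ulen)
 *     {
 *         uintptr_t hend = h + hlen, uend = u + ulen;
 *
 *         if (hend <= u)
 *             return ("handle entirely before unmap range");
 *         if (h >= uend)
 *             return ("handle entirely after unmap range");
 *         if (u <= h && uend >= hend)
 *             return ("handle enclosed by unmap range: free it");
 *         if (u > h && uend < hend)
 *             return ("unmap range enclosed: split handle in two");
 *         if (u > h)
 *             return ("overlap at handle tail: shorten it");
 *         return ("overlap at handle head: advance it");
 *     }
 *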
+ */ + if (dhp == NULL) { + nsdp->devmap_data = NULL; + return (0); + } + while (dhp != NULL) { + callbackops = &dhp->dh_callbackops; + TRACE_2(TR_FAC_DEVMAP, TR_DEVMAP_UNMAP_CK3, + "segdev_unmap: dhp=%p addr=%p", dhp, addr); + DEBUGF(3, (CE_CONT, "unmap: dhp %p addr %p uvaddr %p len %lx\n", + (void *)dhp, (void *)addr, + (void *)dhp->dh_uvaddr, dhp->dh_len)); + + if (addr == (dhp->dh_uvaddr + dhp->dh_len)) { + dhpp = dhp->dh_next; + dhp->dh_next = NULL; + dhp = dhpp; + } else if (addr > (dhp->dh_uvaddr + dhp->dh_len)) { + dhp = dhp->dh_next; + } else if (addr > dhp->dh_uvaddr && + (addr + len) < (dhp->dh_uvaddr + dhp->dh_len)) { + /* + * <addr, addr+len> is enclosed by dhp. + * create a newdhp that begins at addr+len and + * ends at dhp->dh_uvaddr+dhp->dh_len. + */ + newdhp = kmem_alloc(sizeof (devmap_handle_t), KM_SLEEP); + HOLD_DHP_LOCK(dhp); + bcopy(dhp, newdhp, sizeof (devmap_handle_t)); + RELE_DHP_LOCK(dhp); + newdhp->dh_seg = nseg; + newdhp->dh_next = dhp->dh_next; + if (dhp->dh_softlock != NULL) + newdhp->dh_softlock = devmap_softlock_init( + newdhp->dh_dev, + (ulong_t)callbackops->devmap_access); + if (dhp->dh_ctx != NULL) + newdhp->dh_ctx = devmap_ctxinit(newdhp->dh_dev, + (ulong_t)callbackops->devmap_access); + if (newdhp->dh_flags & DEVMAP_LOCK_INITED) { + mutex_init(&newdhp->dh_lock, + NULL, MUTEX_DEFAULT, NULL); + } + if (callbackops->devmap_unmap != NULL) + (*callbackops->devmap_unmap)(dhp, dhp->dh_pvtp, + off, len, dhp, &dhp->dh_pvtp, + newdhp, &newdhp->dh_pvtp); + mlen = len + (addr - dhp->dh_uvaddr); + devmap_handle_reduce_len(newdhp, mlen); + nsdp->devmap_data = newdhp; + /* XX Changing len should recalculate LARGE flag */ + dhp->dh_len = addr - dhp->dh_uvaddr; + dhpp = dhp->dh_next; + dhp->dh_next = NULL; + dhp = dhpp; + } else if ((addr > dhp->dh_uvaddr) && + ((addr + len) >= (dhp->dh_uvaddr + dhp->dh_len))) { + mlen = dhp->dh_len + dhp->dh_uvaddr - addr; + /* + * <addr, addr+len> spans over dhps. + */ + if (callbackops->devmap_unmap != NULL) + (*callbackops->devmap_unmap)(dhp, dhp->dh_pvtp, + off, mlen, (devmap_cookie_t *)dhp, + &dhp->dh_pvtp, NULL, NULL); + /* XX Changing len should recalculate LARGE flag */ + dhp->dh_len = addr - dhp->dh_uvaddr; + dhpp = dhp->dh_next; + dhp->dh_next = NULL; + dhp = dhpp; + nsdp->devmap_data = dhp; + } else if ((addr + len) >= (dhp->dh_uvaddr + dhp->dh_len)) { + /* + * dhp is enclosed by <addr, addr+len>. + */ + dhp->dh_seg = nseg; + nsdp->devmap_data = dhp; + dhp = devmap_handle_unmap(dhp); + nsdp->devmap_data = dhp; /* XX redundant? */ + } else if (((addr + len) > dhp->dh_uvaddr) && + ((addr + len) < (dhp->dh_uvaddr + dhp->dh_len))) { + mlen = addr + len - dhp->dh_uvaddr; + if (callbackops->devmap_unmap != NULL) + (*callbackops->devmap_unmap)(dhp, dhp->dh_pvtp, + dhp->dh_uoff, mlen, NULL, + NULL, dhp, &dhp->dh_pvtp); + devmap_handle_reduce_len(dhp, mlen); + nsdp->devmap_data = dhp; + dhp->dh_seg = nseg; + dhp = dhp->dh_next; + } else { + dhp->dh_seg = nseg; + dhp = dhp->dh_next; + } + } + return (0); +} + +/* + * Utility function handles reducing the length of a devmap handle during unmap + * Note that is only used for unmapping the front portion of the handler, + * i.e., we are bumping up the offset/pfn etc up by len + * Do not use if reducing length at the tail. 
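+ * For example, trimming "len" bytes from the front advances
+ * dh_uvaddr, dh_uoff and dh_roff by len and shrinks dh_len by len;
+ * for device memory dh_pfn advances by btop(len), for umem cookies
+ * dh_cvaddr advances, and for pmem cookies only the offsets change.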
+ */ +static void +devmap_handle_reduce_len(devmap_handle_t *dhp, size_t len) +{ + struct ddi_umem_cookie *cp; + struct devmap_pmem_cookie *pcp; + /* + * adjust devmap handle fields + */ + ASSERT(len < dhp->dh_len); + + /* Make sure only page-aligned changes are done */ + ASSERT((len & PAGEOFFSET) == 0); + + dhp->dh_len -= len; + dhp->dh_uoff += (offset_t)len; + dhp->dh_roff += (offset_t)len; + dhp->dh_uvaddr += len; + /* Need to grab dhp lock if REMAP */ + HOLD_DHP_LOCK(dhp); + cp = dhp->dh_cookie; + if (!(dhp->dh_flags & DEVMAP_MAPPING_INVALID)) { + if (cookie_is_devmem(cp)) { + dhp->dh_pfn += btop(len); + } else if (cookie_is_pmem(cp)) { + pcp = (struct devmap_pmem_cookie *)dhp->dh_pcookie; + ASSERT((dhp->dh_roff & PAGEOFFSET) == 0 && + dhp->dh_roff < ptob(pcp->dp_npages)); + } else { + ASSERT(dhp->dh_roff < cp->size); + ASSERT(dhp->dh_cvaddr >= cp->cvaddr && + dhp->dh_cvaddr < (cp->cvaddr + cp->size)); + ASSERT((dhp->dh_cvaddr + len) <= + (cp->cvaddr + cp->size)); + + dhp->dh_cvaddr += len; + } + } + /* XXX - Should recalculate the DEVMAP_FLAG_LARGE after changes */ + RELE_DHP_LOCK(dhp); +} + +/* + * Free devmap handle, dhp. + * Return the next devmap handle on the linked list. + */ +static devmap_handle_t * +devmap_handle_unmap(devmap_handle_t *dhp) +{ + struct devmap_callback_ctl *callbackops = &dhp->dh_callbackops; + struct segdev_data *sdp = (struct segdev_data *)dhp->dh_seg->s_data; + devmap_handle_t *dhpp = (devmap_handle_t *)sdp->devmap_data; + + ASSERT(dhp != NULL); + + /* + * before we free up dhp, call the driver's devmap_unmap entry point + * to free resources allocated for this dhp. + */ + if (callbackops->devmap_unmap != NULL) { + (*callbackops->devmap_unmap)(dhp, dhp->dh_pvtp, dhp->dh_uoff, + dhp->dh_len, NULL, NULL, NULL, NULL); + } + + if (dhpp == dhp) { /* releasing first dhp, change sdp data */ + sdp->devmap_data = dhp->dh_next; + } else { + while (dhpp->dh_next != dhp) { + dhpp = dhpp->dh_next; + } + dhpp->dh_next = dhp->dh_next; + } + dhpp = dhp->dh_next; /* return value is next dhp in chain */ + + if (dhp->dh_softlock != NULL) + devmap_softlock_rele(dhp); + + if (dhp->dh_ctx != NULL) + devmap_ctx_rele(dhp); + + if (dhp->dh_flags & DEVMAP_LOCK_INITED) { + mutex_destroy(&dhp->dh_lock); + } + kmem_free(dhp, sizeof (devmap_handle_t)); + + return (dhpp); +} + +/* + * Free complete devmap handles from dhp for len bytes + * dhp can be either the first handle or a subsequent handle + */ +static void +devmap_handle_unmap_head(devmap_handle_t *dhp, size_t len) +{ + struct devmap_callback_ctl *callbackops; + + /* + * free the devmap handles covered by len. + */ + while (len >= dhp->dh_len) { + len -= dhp->dh_len; + dhp = devmap_handle_unmap(dhp); + } + if (len != 0) { /* partial unmap at head of first remaining dhp */ + callbackops = &dhp->dh_callbackops; + + /* + * Call the unmap callback so the drivers can make + * adjustment on its private data. 
+ */ + if (callbackops->devmap_unmap != NULL) + (*callbackops->devmap_unmap)(dhp, dhp->dh_pvtp, + dhp->dh_uoff, len, NULL, NULL, dhp, &dhp->dh_pvtp); + devmap_handle_reduce_len(dhp, len); + } +} + +/* + * Free devmap handles to truncate the mapping after addr + * RFE: Simpler to pass in dhp pointing at correct dhp (avoid find again) + * Also could then use the routine in middle unmap case too + */ +static void +devmap_handle_unmap_tail(devmap_handle_t *dhp, caddr_t addr) +{ + register struct seg *seg = dhp->dh_seg; + register struct segdev_data *sdp = (struct segdev_data *)seg->s_data; + register devmap_handle_t *dhph = (devmap_handle_t *)sdp->devmap_data; + struct devmap_callback_ctl *callbackops; + register devmap_handle_t *dhpp; + size_t maplen; + ulong_t off; + size_t len; + + maplen = (size_t)(addr - dhp->dh_uvaddr); + dhph = devmap_find_handle(dhph, addr); + + while (dhph != NULL) { + if (maplen == 0) { + dhph = devmap_handle_unmap(dhph); + } else { + callbackops = &dhph->dh_callbackops; + len = dhph->dh_len - maplen; + off = (ulong_t)sdp->offset + (addr - seg->s_base); + /* + * Call the unmap callback so the driver + * can make adjustments on its private data. + */ + if (callbackops->devmap_unmap != NULL) + (*callbackops->devmap_unmap)(dhph, + dhph->dh_pvtp, off, len, + (devmap_cookie_t *)dhph, + &dhph->dh_pvtp, NULL, NULL); + /* XXX Reducing len needs to recalculate LARGE flag */ + dhph->dh_len = maplen; + maplen = 0; + dhpp = dhph->dh_next; + dhph->dh_next = NULL; + dhph = dhpp; + } + } /* end while */ +} + +/* + * Free a segment. + */ +static void +segdev_free(struct seg *seg) +{ + register struct segdev_data *sdp = (struct segdev_data *)seg->s_data; + devmap_handle_t *dhp = (devmap_handle_t *)sdp->devmap_data; + + TRACE_2(TR_FAC_DEVMAP, TR_DEVMAP_FREE, + "segdev_free: dhp=%p seg=%p", (void *)dhp, (void *)seg); + DEBUGF(3, (CE_CONT, "segdev_free: dhp %p seg %p\n", + (void *)dhp, (void *)seg)); + + /* + * Since the address space is "write" locked, we + * don't need the segment lock to protect "segdev" data. + */ + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); + + while (dhp != NULL) + dhp = devmap_handle_unmap(dhp); + + VN_RELE(sdp->vp); + if (sdp->vpage != NULL) + kmem_free(sdp->vpage, vpgtob(seg_pages(seg))); + + mutex_destroy(&sdp->lock); + kmem_free(sdp, sizeof (*sdp)); +} + +static void +free_devmap_handle(devmap_handle_t *dhp) +{ + register devmap_handle_t *dhpp; + + /* + * free up devmap handle + */ + while (dhp != NULL) { + dhpp = dhp->dh_next; + if (dhp->dh_flags & DEVMAP_LOCK_INITED) { + mutex_destroy(&dhp->dh_lock); + } + + if (dhp->dh_softlock != NULL) + devmap_softlock_rele(dhp); + + if (dhp->dh_ctx != NULL) + devmap_ctx_rele(dhp); + + kmem_free(dhp, sizeof (devmap_handle_t)); + dhp = dhpp; + } +} + +/* + * routines to lock and unlock underlying segkp segment for + * KMEM_PAGEABLE type cookies. + * segkp only allows a single pending F_SOFTLOCK + * we keep track of number of locks in the cookie so we can + * have multiple pending faults and manage the calls to segkp. + * RFE: if segkp supports either pagelock or can support multiple + * calls to F_SOFTLOCK, then these routines can go away. + * If pagelock, segdev_faultpage can fault on a page by page basis + * and simplifies the code quite a bit. 
+ * if multiple calls allowed but not partial ranges, then need for + * cookie->lock and locked count goes away, code can call as_fault directly + */ +static faultcode_t +acquire_kpmem_lock(struct ddi_umem_cookie *cookie, size_t npages) +{ + int err = 0; + ASSERT(cookie_is_kpmem(cookie)); + /* + * Fault in pages in segkp with F_SOFTLOCK. + * We want to hold the lock until all pages have been loaded. + * segkp only allows single caller to hold SOFTLOCK, so cookie + * holds a count so we dont call into segkp multiple times + */ + mutex_enter(&cookie->lock); + + /* + * Check for overflow in locked field + */ + if ((UINT32_MAX - cookie->locked) < npages) { + err = FC_MAKE_ERR(ENOMEM); + } else if (cookie->locked == 0) { + /* First time locking */ + err = as_fault(kas.a_hat, &kas, cookie->cvaddr, + cookie->size, F_SOFTLOCK, PROT_READ|PROT_WRITE); + } + if (!err) { + cookie->locked += npages; + } + mutex_exit(&cookie->lock); + return (err); +} + +static void +release_kpmem_lock(struct ddi_umem_cookie *cookie, size_t npages) +{ + mutex_enter(&cookie->lock); + ASSERT(cookie_is_kpmem(cookie)); + ASSERT(cookie->locked >= npages); + cookie->locked -= (uint_t)npages; + if (cookie->locked == 0) { + /* Last unlock */ + if (as_fault(kas.a_hat, &kas, cookie->cvaddr, + cookie->size, F_SOFTUNLOCK, PROT_READ|PROT_WRITE)) + panic("segdev releasing kpmem lock %p", (void *)cookie); + } + mutex_exit(&cookie->lock); +} + +/* + * Routines to synchronize F_SOFTLOCK and F_INVAL faults for + * drivers with devmap_access callbacks + * slock->softlocked basically works like a rw lock + * -ve counts => F_SOFTLOCK in progress + * +ve counts => F_INVAL/F_PROT in progress + * We allow only one F_SOFTLOCK at a time + * but can have multiple pending F_INVAL/F_PROT calls + * + * This routine waits using cv_wait_sig so killing processes is more graceful + * Returns EINTR if coming out of this routine due to a signal, 0 otherwise + */ +static int devmap_softlock_enter( + struct devmap_softlock *slock, + size_t npages, + enum fault_type type) +{ + if (npages == 0) + return (0); + mutex_enter(&(slock->lock)); + switch (type) { + case F_SOFTLOCK : + while (slock->softlocked) { + if (cv_wait_sig(&(slock)->cv, &(slock)->lock) == 0) { + /* signalled */ + mutex_exit(&(slock->lock)); + return (EINTR); + } + } + slock->softlocked -= npages; /* -ve count => locked */ + break; + case F_INVAL : + case F_PROT : + while (slock->softlocked < 0) + if (cv_wait_sig(&(slock)->cv, &(slock)->lock) == 0) { + /* signalled */ + mutex_exit(&(slock->lock)); + return (EINTR); + } + slock->softlocked += npages; /* +ve count => f_invals */ + break; + default: + ASSERT(0); + } + mutex_exit(&(slock->lock)); + return (0); +} + +static void devmap_softlock_exit( + struct devmap_softlock *slock, + size_t npages, + enum fault_type type) +{ + if (slock == NULL) + return; + mutex_enter(&(slock->lock)); + switch (type) { + case F_SOFTLOCK : + ASSERT(-slock->softlocked >= npages); + slock->softlocked += npages; /* -ve count is softlocked */ + if (slock->softlocked == 0) + cv_signal(&slock->cv); + break; + case F_INVAL : + case F_PROT: + ASSERT(slock->softlocked >= npages); + slock->softlocked -= npages; + if (slock->softlocked == 0) + cv_signal(&slock->cv); + break; + default: + ASSERT(0); + } + mutex_exit(&(slock->lock)); +} + +/* + * Do a F_SOFTUNLOCK call over the range requested. + * The range must have already been F_SOFTLOCK'ed. + * The segment lock should be held, (but not the segment private lock?) 
+ * The softunlock code below does not adjust for large page sizes + * assumes the caller already did any addr/len adjustments for + * pagesize mappings before calling. + */ +/*ARGSUSED*/ +static void +segdev_softunlock( + struct hat *hat, /* the hat */ + struct seg *seg, /* seg_dev of interest */ + caddr_t addr, /* base address of range */ + size_t len, /* number of bytes */ + enum seg_rw rw) /* type of access at fault */ +{ + struct segdev_data *sdp = (struct segdev_data *)seg->s_data; + devmap_handle_t *dhp_head = (devmap_handle_t *)sdp->devmap_data; + + TRACE_4(TR_FAC_DEVMAP, TR_DEVMAP_SOFTUNLOCK, + "segdev_softunlock:dhp_head=%p sdp=%p addr=%p len=%lx", + dhp_head, sdp, addr, len); + DEBUGF(3, (CE_CONT, "segdev_softunlock: dhp %p lockcnt %lx " + "addr %p len %lx\n", + (void *)dhp_head, sdp->softlockcnt, (void *)addr, len)); + + hat_unlock(hat, addr, len); + + if (dhp_head != NULL) { + devmap_handle_t *dhp; + size_t mlen; + ulong_t off; + + dhp = devmap_find_handle(dhp_head, addr); + ASSERT(dhp != NULL); + + off = (ulong_t)(addr - dhp->dh_uvaddr); + while (len != 0) { + mlen = MIN(len, (dhp->dh_len - off)); + + /* + * unlock segkp memory, locked during F_SOFTLOCK + */ + if (dhp_is_kpmem(dhp)) { + release_kpmem_lock( + (struct ddi_umem_cookie *)dhp->dh_cookie, + btopr(mlen)); + } + + /* + * Do the softlock accounting for devmap_access + */ + if (dhp->dh_callbackops.devmap_access != NULL) { + devmap_softlock_exit(dhp->dh_softlock, + btopr(mlen), F_SOFTLOCK); + } + + len -= mlen; + dhp = dhp->dh_next; + off = 0; + } + } + + mutex_enter(&freemem_lock); + ASSERT(sdp->softlockcnt >= btopr(len)); + sdp->softlockcnt -= btopr(len); + mutex_exit(&freemem_lock); + if (sdp->softlockcnt == 0) { + /* + * All SOFTLOCKS are gone. Wakeup any waiting + * unmappers so they can try again to unmap. + * Check for waiters first without the mutex + * held so we don't always grab the mutex on + * softunlocks. + */ + if (AS_ISUNMAPWAIT(seg->s_as)) { + mutex_enter(&seg->s_as->a_contents); + if (AS_ISUNMAPWAIT(seg->s_as)) { + AS_CLRUNMAPWAIT(seg->s_as); + cv_broadcast(&seg->s_as->a_cv); + } + mutex_exit(&seg->s_as->a_contents); + } + } + +} + +/* + * Handle fault for a single page. + * Done in a separate routine so we can handle errors more easily. + * This routine is called only from segdev_faultpages() + * when looping over the range of addresses requested. The segment lock is held. + */ +static faultcode_t +segdev_faultpage( + struct hat *hat, /* the hat */ + struct seg *seg, /* seg_dev of interest */ + caddr_t addr, /* address in as */ + struct vpage *vpage, /* pointer to vpage for seg, addr */ + enum fault_type type, /* type of fault */ + enum seg_rw rw, /* type of access at fault */ + devmap_handle_t *dhp) /* devmap handle if any for this page */ +{ + struct segdev_data *sdp = (struct segdev_data *)seg->s_data; + uint_t prot; + pfn_t pfnum = PFN_INVALID; + u_offset_t offset; + uint_t hat_flags; + dev_info_t *dip; + + TRACE_3(TR_FAC_DEVMAP, TR_DEVMAP_FAULTPAGE, + "segdev_faultpage: dhp=%p seg=%p addr=%p", dhp, seg, addr); + DEBUGF(8, (CE_CONT, "segdev_faultpage: dhp %p seg %p addr %p \n", + (void *)dhp, (void *)seg, (void *)addr)); + + /* + * Initialize protection value for this page. + * If we have per page protection values check it now. 
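+ * (sdp->pageprot is normally set only after segdev_setprot() has
+ * changed the protections of a sub-range of the segment; the vpage
+ * entry then overrides the segment-wide sdp->prot for this page.)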
+ */ + if (sdp->pageprot) { + uint_t protchk; + + switch (rw) { + case S_READ: + protchk = PROT_READ; + break; + case S_WRITE: + protchk = PROT_WRITE; + break; + case S_EXEC: + protchk = PROT_EXEC; + break; + case S_OTHER: + default: + protchk = PROT_READ | PROT_WRITE | PROT_EXEC; + break; + } + + prot = VPP_PROT(vpage); + if ((prot & protchk) == 0) + return (FC_PROT); /* illegal access type */ + } else { + prot = sdp->prot; + /* caller has already done segment level protection check */ + } + + if (type == F_SOFTLOCK) { + mutex_enter(&freemem_lock); + sdp->softlockcnt++; + mutex_exit(&freemem_lock); + } + + hat_flags = ((type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD); + offset = sdp->offset + (u_offset_t)(addr - seg->s_base); + /* + * In the devmap framework, sdp->mapfunc is set to NULL. we can get + * pfnum from dhp->dh_pfn (at beginning of segment) and offset from + * seg->s_base. + */ + if (dhp == NULL) { + /* If segment has devmap_data, then dhp should be non-NULL */ + ASSERT(sdp->devmap_data == NULL); + pfnum = (pfn_t)cdev_mmap(sdp->mapfunc, sdp->vp->v_rdev, + (off_t)offset, prot); + prot |= sdp->hat_attr; + } else { + ulong_t off; + struct ddi_umem_cookie *cp; + struct devmap_pmem_cookie *pcp; + + /* ensure the dhp passed in contains addr. */ + ASSERT(dhp == devmap_find_handle( + (devmap_handle_t *)sdp->devmap_data, addr)); + + off = addr - dhp->dh_uvaddr; + + /* + * This routine assumes that the caller makes sure that the + * fields in dhp used below are unchanged due to remap during + * this call. Caller does HOLD_DHP_LOCK if neeed + */ + cp = dhp->dh_cookie; + if (dhp->dh_flags & DEVMAP_MAPPING_INVALID) { + pfnum = PFN_INVALID; + } else if (cookie_is_devmem(cp)) { + pfnum = dhp->dh_pfn + btop(off); + } else if (cookie_is_pmem(cp)) { + pcp = (struct devmap_pmem_cookie *)dhp->dh_pcookie; + ASSERT((dhp->dh_roff & PAGEOFFSET) == 0 && + dhp->dh_roff < ptob(pcp->dp_npages)); + pfnum = page_pptonum( + pcp->dp_pparray[btop(off + dhp->dh_roff)]); + } else { + ASSERT(dhp->dh_roff < cp->size); + ASSERT(dhp->dh_cvaddr >= cp->cvaddr && + dhp->dh_cvaddr < (cp->cvaddr + cp->size)); + ASSERT((dhp->dh_cvaddr + off) <= + (cp->cvaddr + cp->size)); + ASSERT((dhp->dh_cvaddr + off + PAGESIZE) <= + (cp->cvaddr + cp->size)); + + switch (cp->type) { + case UMEM_LOCKED : + if (cp->pparray != NULL) { + ASSERT((dhp->dh_roff & PAGEOFFSET) == 0); + pfnum = page_pptonum( + cp->pparray[btop(off + dhp->dh_roff)]); + } else { + pfnum = hat_getpfnum( + ((proc_t *)cp->procp)->p_as->a_hat, + cp->cvaddr + off); + } + break; + case UMEM_TRASH : + pfnum = page_pptonum(trashpp); + /* We should set hat_flags to HAT_NOFAULT also */ + /* However, not all hat layers implement this */ + break; + case KMEM_PAGEABLE: + case KMEM_NON_PAGEABLE: + pfnum = hat_getpfnum(kas.a_hat, + dhp->dh_cvaddr + off); + break; + default : + pfnum = PFN_INVALID; + break; + } + } + prot |= dhp->dh_hat_attr; + } + if (pfnum == PFN_INVALID) { + return (FC_MAKE_ERR(EFAULT)); + } + /* prot should already be OR'ed in with hat_attributes if needed */ + + TRACE_4(TR_FAC_DEVMAP, TR_DEVMAP_FAULTPAGE_CK1, + "segdev_faultpage: pfnum=%lx memory=%x prot=%x flags=%x", + pfnum, pf_is_memory(pfnum), prot, hat_flags); + DEBUGF(9, (CE_CONT, "segdev_faultpage: pfnum %lx memory %x " + "prot %x flags %x\n", pfnum, pf_is_memory(pfnum), prot, hat_flags)); + + if (pf_is_memory(pfnum) || (dhp != NULL)) { + /* + * It's not _really_ required here to pass sdp->hat_flags + * to hat_devload even though we do it. 
+ * This is because hat figures it out DEVMEM mappings + * are non-consistent, anyway. + */ + hat_devload(hat, addr, PAGESIZE, pfnum, + prot, hat_flags | sdp->hat_flags); + return (0); + } + + /* + * Fall through to the case where devmap is not used and need to call + * up the device tree to set up the mapping + */ + + dip = VTOS(VTOCVP(sdp->vp))->s_dip; + ASSERT(dip); + + /* + * When calling ddi_map_fault, we do not OR in sdp->hat_attr + * This is because this calls drivers which may not expect + * prot to have any other values than PROT_ALL + * The root nexus driver has a hack to peek into the segment + * structure and then OR in sdp->hat_attr. + * XX In case the bus_ops interfaces are ever revisited + * we need to fix this. prot should include other hat attributes + */ + if (ddi_map_fault(dip, hat, seg, addr, NULL, pfnum, prot & PROT_ALL, + (uint_t)(type == F_SOFTLOCK)) != DDI_SUCCESS) { + return (FC_MAKE_ERR(EFAULT)); + } + return (0); +} + +static faultcode_t +segdev_fault( + struct hat *hat, /* the hat */ + struct seg *seg, /* the seg_dev of interest */ + caddr_t addr, /* the address of the fault */ + size_t len, /* the length of the range */ + enum fault_type type, /* type of fault */ + enum seg_rw rw) /* type of access at fault */ +{ + struct segdev_data *sdp = (struct segdev_data *)seg->s_data; + devmap_handle_t *dhp_head = (devmap_handle_t *)sdp->devmap_data; + devmap_handle_t *dhp; + struct devmap_softlock *slock = NULL; + ulong_t slpage = 0; + ulong_t off; + caddr_t maddr = addr; + int err; + int err_is_faultcode = 0; + + TRACE_5(TR_FAC_DEVMAP, TR_DEVMAP_FAULT, + "segdev_fault: dhp_head=%p seg=%p addr=%p len=%lx type=%x", + (void *)dhp_head, (void *)seg, (void *)addr, len, type); + DEBUGF(7, (CE_CONT, "segdev_fault: dhp_head %p seg %p " + "addr %p len %lx type %x\n", + (void *)dhp_head, (void *)seg, (void *)addr, len, type)); + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + /* Handle non-devmap case */ + if (dhp_head == NULL) + return (segdev_faultpages(hat, seg, addr, len, type, rw, NULL)); + + /* Find devmap handle */ + if ((dhp = devmap_find_handle(dhp_head, addr)) == NULL) + return (FC_NOMAP); + + /* + * The seg_dev driver does not implement copy-on-write, + * and always loads translations with maximal allowed permissions + * but we got an fault trying to access the device. + * Servicing the fault is not going to result in any better result + * RFE: If we want devmap_access callbacks to be involved in F_PROT + * faults, then the code below is written for that + * Pending resolution of the following: + * - determine if the F_INVAL/F_SOFTLOCK syncing + * is needed for F_PROT also or not. The code below assumes it does + * - If driver sees F_PROT and calls devmap_load with same type, + * then segdev_faultpages will fail with FC_PROT anyway, need to + * change that so calls from devmap_load to segdev_faultpages for + * F_PROT type are retagged to F_INVAL. + * RFE: Today we dont have drivers that use devmap and want to handle + * F_PROT calls. The code in segdev_fault* is written to allow + * this case but is not tested. A driver that needs this capability + * should be able to remove the short-circuit case; resolve the + * above issues and "should" work. + */ + if (type == F_PROT) { + return (FC_PROT); + } + + /* + * Loop through dhp list calling devmap_access or segdev_faultpages for + * each devmap handle. + * drivers which implement devmap_access can interpose on faults and do + * device-appropriate special actions before calling devmap_load. 
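+ * A minimal devmap_access(9E) callback (illustrative sketch only;
+ * the name xx_devmap_access is hypothetical) just forwards to the
+ * framework:
+ *
+ *	static int
+ *	xx_devmap_access(devmap_cookie_t dhp, void *pvtp, offset_t off,
+ *	    size_t len, uint_t type, uint_t rw)
+ *	{
+ *		return (devmap_default_access(dhp, pvtp, off, len,
+ *		    type, rw));
+ *	}
+ *
+ * Drivers that manage device context instead call devmap_do_ctxmgt()
+ * with a context-management routine that does the devmap_unload()/
+ * devmap_load() calls itself.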
+ */ + + /* + * Unfortunately, this simple loop has turned out to expose a variety + * of complex problems which results in the following convoluted code. + * + * First, a desire to handle a serialization of F_SOFTLOCK calls + * to the driver within the framework. + * This results in a dh_softlock structure that is on a per device + * (or device instance) basis and serializes devmap_access calls. + * Ideally we would need to do this for underlying + * memory/device regions that are being faulted on + * but that is hard to identify and with REMAP, harder + * Second, a desire to serialize F_INVAL(and F_PROT) calls w.r.t. + * to F_SOFTLOCK calls to the driver. + * These serializations are to simplify the driver programmer model. + * To support these two features, the code first goes through the + * devmap handles and counts the pages (slpage) that are covered + * by devmap_access callbacks. + * This part ends with a devmap_softlock_enter call + * which allows only one F_SOFTLOCK active on a device instance, + * but multiple F_INVAL/F_PROTs can be active except when a + * F_SOFTLOCK is active + * + * Next, we dont short-circuit the fault code upfront to call + * segdev_softunlock for F_SOFTUNLOCK, because we must use + * the same length when we softlock and softunlock. + * + * -Hat layers may not support softunlocking lengths less than the + * original length when there is large page support. + * -kpmem locking is dependent on keeping the lengths same. + * -if drivers handled F_SOFTLOCK, they probably also expect to + * see an F_SOFTUNLOCK of the same length + * Hence, if extending lengths during softlock, + * softunlock has to make the same adjustments and goes through + * the same loop calling segdev_faultpages/segdev_softunlock + * But some of the synchronization and error handling is different + */ + + if (type != F_SOFTUNLOCK) { + devmap_handle_t *dhpp = dhp; + size_t slen = len; + + /* + * Calculate count of pages that are : + * a) within the (potentially extended) fault region + * b) AND covered by devmap handle with devmap_access + */ + off = (ulong_t)(addr - dhpp->dh_uvaddr); + while (slen != 0) { + size_t mlen; + + /* + * Softlocking on a region that allows remap is + * unsupported due to unresolved locking issues + * XXX: unclear what these are? + * One potential is that if there is a pending + * softlock, then a remap should not be allowed + * until the unlock is done. This is easily + * fixed by returning error in devmap*remap on + * checking the dh->dh_softlock->softlocked value + */ + if ((type == F_SOFTLOCK) && + (dhpp->dh_flags & DEVMAP_ALLOW_REMAP)) { + return (FC_NOSUPPORT); + } + + mlen = MIN(slen, (dhpp->dh_len - off)); + if (dhpp->dh_callbackops.devmap_access) { + size_t llen; + caddr_t laddr; + /* + * use extended length for large page mappings + */ + HOLD_DHP_LOCK(dhpp); + if ((sdp->pageprot == 0) && + (dhpp->dh_flags & DEVMAP_FLAG_LARGE)) { + devmap_get_large_pgsize(dhpp, + mlen, maddr, &llen, &laddr); + } else { + llen = mlen; + } + RELE_DHP_LOCK(dhpp); + + slpage += btopr(llen); + slock = dhpp->dh_softlock; + } + maddr += mlen; + ASSERT(slen >= mlen); + slen -= mlen; + dhpp = dhpp->dh_next; + off = 0; + } + /* + * synchonize with other faulting threads and wait till safe + * devmap_softlock_enter might return due to signal in cv_wait + * + * devmap_softlock_enter has to be called outside of while loop + * to prevent a deadlock if len spans over multiple dhps. 
+ * dh_softlock is based on device instance and if multiple dhps + * use the same device instance, the second dhp's LOCK call + * will hang waiting on the first to complete. + * devmap_setup verifies that slocks in a dhp_chain are same. + * RFE: this deadlock only hold true for F_SOFTLOCK. For + * F_INVAL/F_PROT, since we now allow multiple in parallel, + * we could have done the softlock_enter inside the loop + * and supported multi-dhp mappings with dissimilar devices + */ + if (err = devmap_softlock_enter(slock, slpage, type)) + return (FC_MAKE_ERR(err)); + } + + /* reset 'maddr' to the start addr of the range of fault. */ + maddr = addr; + + /* calculate the offset corresponds to 'addr' in the first dhp. */ + off = (ulong_t)(addr - dhp->dh_uvaddr); + + /* + * The fault length may span over multiple dhps. + * Loop until the total length is satisfied. + */ + while (len != 0) { + size_t llen; + size_t mlen; + caddr_t laddr; + + /* + * mlen is the smaller of 'len' and the length + * from addr to the end of mapping defined by dhp. + */ + mlen = MIN(len, (dhp->dh_len - off)); + + HOLD_DHP_LOCK(dhp); + /* + * Pass the extended length and address to devmap_access + * if large pagesize is used for loading address translations. + */ + if ((sdp->pageprot == 0) && + (dhp->dh_flags & DEVMAP_FLAG_LARGE)) { + devmap_get_large_pgsize(dhp, mlen, maddr, + &llen, &laddr); + ASSERT(maddr == addr || laddr == maddr); + } else { + llen = mlen; + laddr = maddr; + } + + if (dhp->dh_callbackops.devmap_access != NULL) { + offset_t aoff; + + aoff = sdp->offset + (offset_t)(laddr - seg->s_base); + + /* + * call driver's devmap_access entry point which will + * call devmap_load/contextmgmt to load the translations + * + * We drop the dhp_lock before calling access so + * drivers can call devmap_*_remap within access + */ + RELE_DHP_LOCK(dhp); + + err = (*dhp->dh_callbackops.devmap_access)( + dhp, (void *)dhp->dh_pvtp, aoff, llen, type, rw); + } else { + /* + * If no devmap_access entry point, then load mappings + * hold dhp_lock across faultpages if REMAP + */ + err = segdev_faultpages(hat, seg, laddr, llen, + type, rw, dhp); + err_is_faultcode = 1; + RELE_DHP_LOCK(dhp); + } + + if (err) { + if ((type == F_SOFTLOCK) && (maddr > addr)) { + /* + * If not first dhp, use + * segdev_fault(F_SOFTUNLOCK) for prior dhps + * While this is recursion, it is incorrect to + * call just segdev_softunlock + * if we are using either large pages + * or devmap_access. It will be more right + * to go through the same loop as above + * rather than call segdev_softunlock directly + * It will use the right lenghths as well as + * call into the driver devmap_access routines. + */ + size_t done = (size_t)(maddr - addr); + (void) segdev_fault(hat, seg, addr, done, + F_SOFTUNLOCK, S_OTHER); + /* + * reduce slpage by number of pages + * released by segdev_softunlock + */ + ASSERT(slpage >= btopr(done)); + devmap_softlock_exit(slock, + slpage - btopr(done), type); + } else { + devmap_softlock_exit(slock, slpage, type); + } + + + /* + * Segdev_faultpages() already returns a faultcode, + * hence, result from segdev_faultpages() should be + * returned directly. 
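+ * A nonzero return from the driver's devmap_access callback, by
+ * contrast, is treated as an errno and wrapped with FC_MAKE_ERR().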
+ */ + if (err_is_faultcode) + return (err); + return (FC_MAKE_ERR(err)); + } + + maddr += mlen; + ASSERT(len >= mlen); + len -= mlen; + dhp = dhp->dh_next; + off = 0; + + ASSERT(!dhp || len == 0 || maddr == dhp->dh_uvaddr); + } + /* + * release the softlock count at end of fault + * For F_SOFTLOCk this is done in the later F_SOFTUNLOCK + */ + if ((type == F_INVAL) || (type == F_PROT)) + devmap_softlock_exit(slock, slpage, type); + return (0); +} + +/* + * segdev_faultpages + * + * Used to fault in seg_dev segment pages. Called by segdev_fault or devmap_load + * This routine assumes that the callers makes sure that the fields + * in dhp used below are not changed due to remap during this call. + * Caller does HOLD_DHP_LOCK if neeed + * This routine returns a faultcode_t as a return value for segdev_fault. + */ +static faultcode_t +segdev_faultpages( + struct hat *hat, /* the hat */ + struct seg *seg, /* the seg_dev of interest */ + caddr_t addr, /* the address of the fault */ + size_t len, /* the length of the range */ + enum fault_type type, /* type of fault */ + enum seg_rw rw, /* type of access at fault */ + devmap_handle_t *dhp) /* devmap handle */ +{ + register struct segdev_data *sdp = (struct segdev_data *)seg->s_data; + register caddr_t a; + struct vpage *vpage; + struct ddi_umem_cookie *kpmem_cookie = NULL; + int err; + + TRACE_4(TR_FAC_DEVMAP, TR_DEVMAP_FAULTPAGES, + "segdev_faultpages: dhp=%p seg=%p addr=%p len=%lx", + (void *)dhp, (void *)seg, (void *)addr, len); + DEBUGF(5, (CE_CONT, "segdev_faultpages: " + "dhp %p seg %p addr %p len %lx\n", + (void *)dhp, (void *)seg, (void *)addr, len)); + + /* + * The seg_dev driver does not implement copy-on-write, + * and always loads translations with maximal allowed permissions + * but we got an fault trying to access the device. + * Servicing the fault is not going to result in any better result + * XXX: If we want to allow devmap_access to handle F_PROT calls, + * This code should be removed and let the normal fault handling + * take care of finding the error + */ + if (type == F_PROT) { + return (FC_PROT); + } + + if (type == F_SOFTUNLOCK) { + segdev_softunlock(hat, seg, addr, len, rw); + return (0); + } + + /* + * For kernel pageable memory, fault/lock segkp pages + * We hold this until the completion of this + * fault (INVAL/PROT) or till unlock (SOFTLOCK). + */ + if ((dhp != NULL) && dhp_is_kpmem(dhp)) { + kpmem_cookie = (struct ddi_umem_cookie *)dhp->dh_cookie; + if (err = acquire_kpmem_lock(kpmem_cookie, btopr(len))) + return (err); + } + + /* + * If we have the same protections for the entire segment, + * insure that the access being attempted is legitimate. 
+ */ + mutex_enter(&sdp->lock); + if (sdp->pageprot == 0) { + uint_t protchk; + + switch (rw) { + case S_READ: + protchk = PROT_READ; + break; + case S_WRITE: + protchk = PROT_WRITE; + break; + case S_EXEC: + protchk = PROT_EXEC; + break; + case S_OTHER: + default: + protchk = PROT_READ | PROT_WRITE | PROT_EXEC; + break; + } + + if ((sdp->prot & protchk) == 0) { + mutex_exit(&sdp->lock); + /* undo kpmem locking */ + if (kpmem_cookie != NULL) { + release_kpmem_lock(kpmem_cookie, btopr(len)); + } + return (FC_PROT); /* illegal access type */ + } + } + + /* + * we do a single hat_devload for the range if + * - devmap framework (dhp is not NULL), + * - pageprot == 0, i.e., no per-page protection set and + * - is device pages, irrespective of whether we are using large pages + */ + if ((sdp->pageprot == 0) && (dhp != NULL) && dhp_is_devmem(dhp)) { + pfn_t pfnum; + uint_t hat_flags; + + if (dhp->dh_flags & DEVMAP_MAPPING_INVALID) { + mutex_exit(&sdp->lock); + return (FC_NOMAP); + } + + if (type == F_SOFTLOCK) { + mutex_enter(&freemem_lock); + sdp->softlockcnt += btopr(len); + mutex_exit(&freemem_lock); + } + + hat_flags = ((type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD); + pfnum = dhp->dh_pfn + btop((uintptr_t)(addr - dhp->dh_uvaddr)); + ASSERT(!pf_is_memory(pfnum)); + + hat_devload(hat, addr, len, pfnum, sdp->prot | dhp->dh_hat_attr, + hat_flags | sdp->hat_flags); + mutex_exit(&sdp->lock); + return (0); + } + + /* Handle cases where we have to loop through fault handling per-page */ + + if (sdp->vpage == NULL) + vpage = NULL; + else + vpage = &sdp->vpage[seg_page(seg, addr)]; + + /* loop over the address range handling each fault */ + for (a = addr; a < addr + len; a += PAGESIZE) { + if (err = segdev_faultpage(hat, seg, a, vpage, type, rw, dhp)) { + break; + } + if (vpage != NULL) + vpage++; + } + mutex_exit(&sdp->lock); + if (err && (type == F_SOFTLOCK)) { /* error handling for F_SOFTLOCK */ + size_t done = (size_t)(a - addr); /* pages fault successfully */ + if (done > 0) { + /* use softunlock for those pages */ + segdev_softunlock(hat, seg, addr, done, S_OTHER); + } + if (kpmem_cookie != NULL) { + /* release kpmem lock for rest of pages */ + ASSERT(len >= done); + release_kpmem_lock(kpmem_cookie, btopr(len - done)); + } + } else if ((kpmem_cookie != NULL) && (type != F_SOFTLOCK)) { + /* for non-SOFTLOCK cases, release kpmem */ + release_kpmem_lock(kpmem_cookie, btopr(len)); + } + return (err); +} + +/* + * Asynchronous page fault. We simply do nothing since this + * entry point is not supposed to load up the translation. + */ +/*ARGSUSED*/ +static faultcode_t +segdev_faulta(struct seg *seg, caddr_t addr) +{ + TRACE_2(TR_FAC_DEVMAP, TR_DEVMAP_FAULTA, + "segdev_faulta: seg=%p addr=%p", (void *)seg, (void *)addr); + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + return (0); +} + +static int +segdev_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) +{ + register struct segdev_data *sdp = (struct segdev_data *)seg->s_data; + register devmap_handle_t *dhp; + register struct vpage *vp, *evp; + devmap_handle_t *dhp_head = (devmap_handle_t *)sdp->devmap_data; + ulong_t off; + size_t mlen, sz; + + TRACE_4(TR_FAC_DEVMAP, TR_DEVMAP_SETPROT, + "segdev_setprot:start seg=%p addr=%p len=%lx prot=%x", + (void *)seg, (void *)addr, len, prot); + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + if ((sz = sdp->softlockcnt) > 0 && dhp_head != NULL) { + /* + * Fail the setprot if pages are SOFTLOCKed through this + * mapping. 
+ * Softlockcnt is protected from change by the as read lock. + */ + TRACE_1(TR_FAC_DEVMAP, TR_DEVMAP_SETPROT_CK1, + "segdev_setprot:error softlockcnt=%lx", sz); + DEBUGF(1, (CE_CONT, "segdev_setprot: softlockcnt %ld\n", sz)); + return (EAGAIN); + } + + if (dhp_head != NULL) { + if ((dhp = devmap_find_handle(dhp_head, addr)) == NULL) + return (EINVAL); + + /* + * check if violate maxprot. + */ + off = (ulong_t)(addr - dhp->dh_uvaddr); + mlen = len; + while (dhp) { + if ((dhp->dh_maxprot & prot) != prot) + return (EACCES); /* violated maxprot */ + + if (mlen > (dhp->dh_len - off)) { + mlen -= dhp->dh_len - off; + dhp = dhp->dh_next; + off = 0; + } else + break; + } + } else { + if ((sdp->maxprot & prot) != prot) + return (EACCES); + } + + mutex_enter(&sdp->lock); + if (addr == seg->s_base && len == seg->s_size && sdp->pageprot == 0) { + if (sdp->prot == prot) { + mutex_exit(&sdp->lock); + return (0); /* all done */ + } + sdp->prot = (uchar_t)prot; + } else { + sdp->pageprot = 1; + if (sdp->vpage == NULL) { + /* + * First time through setting per page permissions, + * initialize all the vpage structures to prot + */ + sdp->vpage = kmem_zalloc(vpgtob(seg_pages(seg)), + KM_SLEEP); + evp = &sdp->vpage[seg_pages(seg)]; + for (vp = sdp->vpage; vp < evp; vp++) + VPP_SETPROT(vp, sdp->prot); + } + /* + * Now go change the needed vpages protections. + */ + evp = &sdp->vpage[seg_page(seg, addr + len)]; + for (vp = &sdp->vpage[seg_page(seg, addr)]; vp < evp; vp++) + VPP_SETPROT(vp, prot); + } + mutex_exit(&sdp->lock); + + if (dhp_head != NULL) { + devmap_handle_t *tdhp; + /* + * If large page size was used in hat_devload(), + * the same page size must be used in hat_unload(). + */ + dhp = tdhp = devmap_find_handle(dhp_head, addr); + while (tdhp != NULL) { + if (tdhp->dh_flags & DEVMAP_FLAG_LARGE) { + break; + } + tdhp = tdhp->dh_next; + } + if (tdhp) { + size_t slen = len; + size_t mlen; + size_t soff; + + soff = (ulong_t)(addr - dhp->dh_uvaddr); + while (slen != 0) { + mlen = MIN(slen, (dhp->dh_len - soff)); + hat_unload(seg->s_as->a_hat, dhp->dh_uvaddr, + dhp->dh_len, HAT_UNLOAD); + dhp = dhp->dh_next; + ASSERT(slen >= mlen); + slen -= mlen; + soff = 0; + } + return (0); + } + } + + if ((prot & ~PROT_USER) == PROT_NONE) { + hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); + } else { + /* + * RFE: the segment should keep track of all attributes + * allowing us to remove the deprecated hat_chgprot + * and use hat_chgattr. + */ + hat_chgprot(seg->s_as->a_hat, addr, len, prot); + } + + return (0); +} + +static int +segdev_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) +{ + struct segdev_data *sdp = (struct segdev_data *)seg->s_data; + struct vpage *vp, *evp; + + TRACE_4(TR_FAC_DEVMAP, TR_DEVMAP_CHECKPROT, + "segdev_checkprot:start seg=%p addr=%p len=%lx prot=%x", + (void *)seg, (void *)addr, len, prot); + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + /* + * If segment protection can be used, simply check against them + */ + mutex_enter(&sdp->lock); + if (sdp->pageprot == 0) { + register int err; + + err = ((sdp->prot & prot) != prot) ? 
EACCES : 0; + mutex_exit(&sdp->lock); + return (err); + } + + /* + * Have to check down to the vpage level + */ + evp = &sdp->vpage[seg_page(seg, addr + len)]; + for (vp = &sdp->vpage[seg_page(seg, addr)]; vp < evp; vp++) { + if ((VPP_PROT(vp) & prot) != prot) { + mutex_exit(&sdp->lock); + return (EACCES); + } + } + mutex_exit(&sdp->lock); + return (0); +} + +static int +segdev_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv) +{ + struct segdev_data *sdp = (struct segdev_data *)seg->s_data; + size_t pgno; + + TRACE_4(TR_FAC_DEVMAP, TR_DEVMAP_GETPROT, + "segdev_getprot:start seg=%p addr=%p len=%lx protv=%p", + (void *)seg, (void *)addr, len, (void *)protv); + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1; + if (pgno != 0) { + mutex_enter(&sdp->lock); + if (sdp->pageprot == 0) { + do + protv[--pgno] = sdp->prot; + while (pgno != 0); + } else { + size_t pgoff = seg_page(seg, addr); + + do { + pgno--; + protv[pgno] = + VPP_PROT(&sdp->vpage[pgno + pgoff]); + } while (pgno != 0); + } + mutex_exit(&sdp->lock); + } + return (0); +} + +static u_offset_t +segdev_getoffset(register struct seg *seg, caddr_t addr) +{ + register struct segdev_data *sdp = (struct segdev_data *)seg->s_data; + + TRACE_2(TR_FAC_DEVMAP, TR_DEVMAP_GETOFFSET, + "segdev_getoffset:start seg=%p addr=%p", (void *)seg, (void *)addr); + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + return ((u_offset_t)sdp->offset + (addr - seg->s_base)); +} + +/*ARGSUSED*/ +static int +segdev_gettype(register struct seg *seg, caddr_t addr) +{ + register struct segdev_data *sdp = (struct segdev_data *)seg->s_data; + + TRACE_2(TR_FAC_DEVMAP, TR_DEVMAP_GETTYPE, + "segdev_gettype:start seg=%p addr=%p", (void *)seg, (void *)addr); + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + return (sdp->type); +} + + +/*ARGSUSED*/ +static int +segdev_getvp(register struct seg *seg, caddr_t addr, struct vnode **vpp) +{ + register struct segdev_data *sdp = (struct segdev_data *)seg->s_data; + + TRACE_2(TR_FAC_DEVMAP, TR_DEVMAP_GETVP, + "segdev_getvp:start seg=%p addr=%p", (void *)seg, (void *)addr); + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + /* + * Note that this vp is the common_vp of the device, where the + * pages are hung .. + */ + *vpp = VTOCVP(sdp->vp); + + return (0); +} + +static void +segdev_badop(void) +{ + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_SEGDEV_BADOP, + "segdev_badop:start"); + panic("segdev_badop"); + /*NOTREACHED*/ +} + +/* + * segdev pages are not in the cache, and thus can't really be controlled. + * Hence, syncs are simply always successful. + */ +/*ARGSUSED*/ +static int +segdev_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) +{ + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_SYNC, "segdev_sync:start"); + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + return (0); +} + +/* + * segdev pages are always "in core". + */ +/*ARGSUSED*/ +static size_t +segdev_incore(struct seg *seg, caddr_t addr, size_t len, char *vec) +{ + size_t v = 0; + + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_INCORE, "segdev_incore:start"); + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + for (len = (len + PAGEOFFSET) & PAGEMASK; len; len -= PAGESIZE, + v += PAGESIZE) + *vec++ = 1; + return (v); +} + +/* + * segdev pages are not in the cache, and thus can't really be controlled. + * Hence, locks are simply always successful. 
+ */ +/*ARGSUSED*/ +static int +segdev_lockop(struct seg *seg, caddr_t addr, + size_t len, int attr, int op, ulong_t *lockmap, size_t pos) +{ + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_LOCKOP, "segdev_lockop:start"); + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + return (0); +} + +/* + * segdev pages are not in the cache, and thus can't really be controlled. + * Hence, advise is simply always successful. + */ +/*ARGSUSED*/ +static int +segdev_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) +{ + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_ADVISE, "segdev_advise:start"); + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + return (0); +} + +/* + * segdev pages are not dumped, so we just return + */ +/*ARGSUSED*/ +static void +segdev_dump(struct seg *seg) +{} + +/* + * ddi_segmap_setup: Used by drivers who wish specify mapping attributes + * for a segment. Called from a drivers segmap(9E) + * routine. + */ +/*ARGSUSED*/ +int +ddi_segmap_setup(dev_t dev, off_t offset, struct as *as, caddr_t *addrp, + off_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cred, + ddi_device_acc_attr_t *accattrp, uint_t rnumber) +{ + struct segdev_crargs dev_a; + int (*mapfunc)(dev_t dev, off_t off, int prot); + uint_t hat_attr; + pfn_t pfn; + int error, i; + + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_SEGMAP_SETUP, + "ddi_segmap_setup:start"); + + if ((mapfunc = devopsp[getmajor(dev)]->devo_cb_ops->cb_mmap) == nodev) + return (ENODEV); + + /* + * Character devices that support the d_mmap + * interface can only be mmap'ed shared. + */ + if ((flags & MAP_TYPE) != MAP_SHARED) + return (EINVAL); + + /* + * Check that this region is indeed mappable on this platform. + * Use the mapping function. + */ + if (ddi_device_mapping_check(dev, accattrp, rnumber, &hat_attr) == -1) + return (ENXIO); + + /* + * Check to ensure that the entire range is + * legal and we are not trying to map in + * more than the device will let us. + */ + for (i = 0; i < len; i += PAGESIZE) { + if (i == 0) { + /* + * Save the pfn at offset here. This pfn will be + * used later to get user address. + */ + if ((pfn = (pfn_t)cdev_mmap(mapfunc, dev, offset, + maxprot)) == PFN_INVALID) + return (ENXIO); + } else { + if (cdev_mmap(mapfunc, dev, offset + i, maxprot) == + PFN_INVALID) + return (ENXIO); + } + } + + as_rangelock(as); + if ((flags & MAP_FIXED) == 0) { + /* + * Pick an address w/o worrying about + * any vac alignment constraints. + */ + map_addr(addrp, len, ptob(pfn), 0, flags); + if (*addrp == NULL) { + as_rangeunlock(as); + return (ENOMEM); + } + } else { + /* + * User-specified address; blow away any previous mappings. 
+ */ + (void) as_unmap(as, *addrp, len); + } + + dev_a.mapfunc = mapfunc; + dev_a.dev = dev; + dev_a.offset = (offset_t)offset; + dev_a.type = flags & MAP_TYPE; + dev_a.prot = (uchar_t)prot; + dev_a.maxprot = (uchar_t)maxprot; + dev_a.hat_attr = hat_attr; + dev_a.hat_flags = 0; + dev_a.devmap_data = NULL; + + error = as_map(as, *addrp, len, segdev_create, &dev_a); + as_rangeunlock(as); + return (error); + +} + +/*ARGSUSED*/ +static int +segdev_pagelock(struct seg *seg, caddr_t addr, size_t len, + struct page ***ppp, enum lock_type type, enum seg_rw rw) +{ + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_PAGELOCK, + "segdev_pagelock:start"); + return (ENOTSUP); +} + +/*ARGSUSED*/ +static int +segdev_setpagesize(struct seg *seg, caddr_t addr, size_t len, + uint_t szc) +{ + return (ENOTSUP); +} + +/* + * devmap_device: Used by devmap framework to establish mapping + * called by devmap_seup(9F) during map setup time. + */ +/*ARGSUSED*/ +static int +devmap_device(devmap_handle_t *dhp, struct as *as, caddr_t *addr, + offset_t off, size_t len, uint_t flags) +{ + devmap_handle_t *rdhp, *maxdhp; + struct segdev_crargs dev_a; + int err; + uint_t maxprot = PROT_ALL; + offset_t offset = 0; + pfn_t pfn; + struct devmap_pmem_cookie *pcp; + + TRACE_4(TR_FAC_DEVMAP, TR_DEVMAP_DEVICE, + "devmap_device:start dhp=%p addr=%p off=%llx, len=%lx", + (void *)dhp, (void *)addr, off, len); + + DEBUGF(2, (CE_CONT, "devmap_device: dhp %p addr %p off %llx len %lx\n", + (void *)dhp, (void *)addr, off, len)); + + as_rangelock(as); + if ((flags & MAP_FIXED) == 0) { + offset_t aligned_off; + + rdhp = maxdhp = dhp; + while (rdhp != NULL) { + maxdhp = (maxdhp->dh_len > rdhp->dh_len) ? + maxdhp : rdhp; + rdhp = rdhp->dh_next; + maxprot |= dhp->dh_maxprot; + } + offset = maxdhp->dh_uoff - dhp->dh_uoff; + + /* + * Use the dhp that has the + * largest len to get user address. + */ + /* + * If MAPPING_INVALID, cannot use dh_pfn/dh_cvaddr, + * use 0 which is as good as any other. + */ + if (maxdhp->dh_flags & DEVMAP_MAPPING_INVALID) { + aligned_off = (offset_t)0; + } else if (dhp_is_devmem(maxdhp)) { + aligned_off = (offset_t)ptob(maxdhp->dh_pfn) - offset; + } else if (dhp_is_pmem(maxdhp)) { + pcp = (struct devmap_pmem_cookie *)maxdhp->dh_pcookie; + pfn = page_pptonum( + pcp->dp_pparray[btop(maxdhp->dh_roff)]); + aligned_off = (offset_t)ptob(pfn) - offset; + } else { + aligned_off = (offset_t)(uintptr_t)maxdhp->dh_cvaddr - + offset; + } + + /* + * Pick an address aligned to dh_cookie. + * for kernel memory/user memory, cookie is cvaddr. + * for device memory, cookie is physical address. + */ + map_addr(addr, len, aligned_off, 1, flags); + if (*addr == NULL) { + as_rangeunlock(as); + return (ENOMEM); + } + } else { + /* + * User-specified address; blow away any previous mappings. + */ + (void) as_unmap(as, *addr, len); + } + + dev_a.mapfunc = NULL; + dev_a.dev = dhp->dh_dev; + dev_a.type = flags & MAP_TYPE; + dev_a.offset = off; + /* + * sdp->maxprot has the least restrict protection of all dhps. + */ + dev_a.maxprot = maxprot; + dev_a.prot = dhp->dh_prot; + /* + * devmap uses dhp->dh_hat_attr for hat. 
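+ * dev_a.hat_attr is therefore left 0 here; segdev_faultpage() ORs
+ * the per-handle dh_hat_attr into the protections when it loads
+ * each translation.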
+ */ + dev_a.hat_flags = 0; + dev_a.hat_attr = 0; + dev_a.devmap_data = (void *)dhp; + + err = as_map(as, *addr, len, segdev_create, &dev_a); + as_rangeunlock(as); + return (err); +} + +int +devmap_do_ctxmgt(devmap_cookie_t dhc, void *pvtp, offset_t off, size_t len, + uint_t type, uint_t rw, int (*ctxmgt)(devmap_cookie_t, void *, offset_t, + size_t, uint_t, uint_t)) +{ + register devmap_handle_t *dhp = (devmap_handle_t *)dhc; + struct devmap_ctx *devctx; + int do_timeout = 0; + int ret; + +#ifdef lint + pvtp = pvtp; +#endif + + TRACE_3(TR_FAC_DEVMAP, TR_DEVMAP_DO_CTXMGT, + "devmap_do_ctxmgt:start dhp=%p off=%llx, len=%lx", + (void *)dhp, off, len); + DEBUGF(7, (CE_CONT, "devmap_do_ctxmgt: dhp %p off %llx len %lx\n", + (void *)dhp, off, len)); + + if (ctxmgt == NULL) + return (FC_HWERR); + + devctx = dhp->dh_ctx; + + /* + * If we are on an MP system with more than one cpu running + * and if a thread on some CPU already has the context, wait + * for it to finish if there is a hysteresis timeout. + * + * We call cv_wait() instead of cv_wait_sig() because + * it does not matter much if it returned due to a signal + * or due to a cv_signal() or cv_broadcast(). In either event + * we need to complete the mapping otherwise the processes + * will die with a SEGV. + */ + if ((dhp->dh_timeout_length > 0) && (ncpus > 1)) { + TRACE_2(TR_FAC_DEVMAP, TR_DEVMAP_DO_CTXMGT_CK1, + "devmap_do_ctxmgt:doing hysteresis, devctl %p dhp %p", + devctx, dhp); + do_timeout = 1; + mutex_enter(&devctx->lock); + while (devctx->oncpu) + cv_wait(&devctx->cv, &devctx->lock); + devctx->oncpu = 1; + mutex_exit(&devctx->lock); + } + + /* + * Call the contextmgt callback so that the driver can handle + * the fault. + */ + ret = (*ctxmgt)(dhp, dhp->dh_pvtp, off, len, type, rw); + + /* + * If devmap_access() returned -1, then there was a hardware + * error so we need to convert the return value to something + * that trap() will understand. Otherwise, the return value + * is already a fault code generated by devmap_unload() + * or devmap_load(). + */ + if (ret) { + TRACE_3(TR_FAC_DEVMAP, TR_DEVMAP_DO_CTXMGT_CK2, + "devmap_do_ctxmgt: ret=%x dhp=%p devctx=%p", + ret, dhp, devctx); + DEBUGF(1, (CE_CONT, "devmap_do_ctxmgt: ret %x dhp %p\n", + ret, (void *)dhp)); + if (devctx->oncpu) { + mutex_enter(&devctx->lock); + devctx->oncpu = 0; + cv_signal(&devctx->cv); + mutex_exit(&devctx->lock); + } + return (FC_HWERR); + } + + /* + * Setup the timeout if we need to + */ + if (do_timeout) { + mutex_enter(&devctx->lock); + if (dhp->dh_timeout_length > 0) { + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_DO_CTXMGT_CK3, + "devmap_do_ctxmgt:timeout set"); + devctx->timeout = timeout(devmap_ctxto, + devctx, dhp->dh_timeout_length); + } else { + /* + * We don't want to wait so set oncpu to + * 0 and wake up anyone waiting. 
+ */ + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_DO_CTXMGT_CK4, + "devmap_do_ctxmgt:timeout not set"); + devctx->oncpu = 0; + cv_signal(&devctx->cv); + } + mutex_exit(&devctx->lock); + } + + return (DDI_SUCCESS); +} + +/* + * end of mapping + * poff fault_offset | + * base | | | + * | | | | + * V V V V + * +-----------+---------------+-------+---------+-------+ + * ^ ^ ^ ^ + * |<--- offset--->|<-len->| | + * |<--- dh_len(size of mapping) --->| + * |<-- pg -->| + * -->|rlen|<-- + */ +static ulong_t +devmap_roundup(devmap_handle_t *dhp, ulong_t offset, size_t len, + ulong_t *opfn, ulong_t *pagesize) +{ + register int level; + ulong_t pg; + ulong_t poff; + ulong_t base; + caddr_t uvaddr; + long rlen; + + TRACE_3(TR_FAC_DEVMAP, TR_DEVMAP_ROUNDUP, + "devmap_roundup:start dhp=%p off=%lx len=%lx", + (void *)dhp, offset, len); + DEBUGF(2, (CE_CONT, "devmap_roundup: dhp %p off %lx len %lx\n", + (void *)dhp, offset, len)); + + /* + * get the max. pagesize that is aligned within the range + * <dh_pfn, dh_pfn+offset>. + * + * The calculations below use physical address to ddetermine + * the page size to use. The same calculations can use the + * virtual address to determine the page size. + */ + base = (ulong_t)ptob(dhp->dh_pfn); + for (level = dhp->dh_mmulevel; level >= 0; level--) { + pg = page_get_pagesize(level); + poff = ((base + offset) & ~(pg - 1)); + uvaddr = dhp->dh_uvaddr + (poff - base); + if ((poff >= base) && + ((poff + pg) <= (base + dhp->dh_len)) && + VA_PA_ALIGNED((uintptr_t)uvaddr, poff, pg)) + break; + } + + TRACE_3(TR_FAC_DEVMAP, TR_DEVMAP_ROUNDUP_CK1, + "devmap_roundup: base=%lx poff=%lx dhp=%p", + base, poff, dhp); + DEBUGF(2, (CE_CONT, "devmap_roundup: base %lx poff %lx pfn %lx\n", + base, poff, dhp->dh_pfn)); + + ASSERT(VA_PA_ALIGNED((uintptr_t)uvaddr, poff, pg)); + ASSERT(level >= 0); + + *pagesize = pg; + *opfn = dhp->dh_pfn + btop(poff - base); + + rlen = len + offset - (poff - base + pg); + + ASSERT(rlen < (long)len); + + TRACE_5(TR_FAC_DEVMAP, TR_DEVMAP_ROUNDUP_CK2, + "devmap_roundup:ret dhp=%p level=%x rlen=%lx psiz=%p opfn=%p", + (void *)dhp, level, rlen, pagesize, opfn); + DEBUGF(1, (CE_CONT, "devmap_roundup: dhp %p " + "level %x rlen %lx psize %lx opfn %lx\n", + (void *)dhp, level, rlen, *pagesize, *opfn)); + + return ((ulong_t)((rlen > 0) ? rlen : 0)); +} + +/* + * find the dhp that contains addr. + */ +static devmap_handle_t * +devmap_find_handle(devmap_handle_t *dhp_head, caddr_t addr) +{ + devmap_handle_t *dhp; + + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_FIND_HANDLE, + "devmap_find_handle:start"); + + dhp = dhp_head; + while (dhp) { + if (addr >= dhp->dh_uvaddr && + addr < (dhp->dh_uvaddr + dhp->dh_len)) + return (dhp); + dhp = dhp->dh_next; + } + + return ((devmap_handle_t *)NULL); +} + +/* + * devmap_unload: + * Marks a segdev segment or pages if offset->offset+len + * is not the entire segment as intercept and unloads the + * pages in the range offset -> offset+len. + */ +int +devmap_unload(devmap_cookie_t dhc, offset_t offset, size_t len) +{ + register devmap_handle_t *dhp = (devmap_handle_t *)dhc; + caddr_t addr; + ulong_t size; + ssize_t soff; + + TRACE_3(TR_FAC_DEVMAP, TR_DEVMAP_UNLOAD, + "devmap_unload:start dhp=%p offset=%llx len=%lx", + (void *)dhp, offset, len); + DEBUGF(7, (CE_CONT, "devmap_unload: dhp %p offset %llx len %lx\n", + (void *)dhp, offset, len)); + + soff = (ssize_t)(offset - dhp->dh_uoff); + soff = round_down_p2(soff, PAGESIZE); + if (soff < 0 || soff >= dhp->dh_len) + return (FC_MAKE_ERR(EINVAL)); + + /* + * Address and size must be page aligned. 
Len is set to the + * number of bytes in the number of pages that are required to + * support len. Offset is set to the byte offset of the first byte + * of the page that contains offset. + */ + len = round_up_p2(len, PAGESIZE); + + /* + * If len is == 0, then calculate the size by getting + * the number of bytes from offset to the end of the segment. + */ + if (len == 0) + size = dhp->dh_len - soff; + else { + size = len; + if ((soff + size) > dhp->dh_len) + return (FC_MAKE_ERR(EINVAL)); + } + + /* + * The address is offset bytes from the base address of + * the dhp. + */ + addr = (caddr_t)(soff + dhp->dh_uvaddr); + + /* + * If large page size was used in hat_devload(), + * the same page size must be used in hat_unload(). + */ + if (dhp->dh_flags & DEVMAP_FLAG_LARGE) { + hat_unload(dhp->dh_seg->s_as->a_hat, dhp->dh_uvaddr, + dhp->dh_len, HAT_UNLOAD|HAT_UNLOAD_OTHER); + } else { + hat_unload(dhp->dh_seg->s_as->a_hat, addr, size, + HAT_UNLOAD|HAT_UNLOAD_OTHER); + } + + return (0); +} + +/* + * calculates the optimal page size that will be used for hat_devload(). + */ +static void +devmap_get_large_pgsize(devmap_handle_t *dhp, size_t len, caddr_t addr, + size_t *llen, caddr_t *laddr) +{ + ulong_t off; + ulong_t pfn; + ulong_t pgsize; + uint_t first = 1; + + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_GET_LARGE_PGSIZE, + "devmap_get_large_pgsize:start"); + + /* + * RFE - Code only supports large page mappings for devmem + * This code could be changed in future if we want to support + * large page mappings for kernel exported memory. + */ + ASSERT(dhp_is_devmem(dhp)); + ASSERT(!(dhp->dh_flags & DEVMAP_MAPPING_INVALID)); + + *llen = 0; + off = (ulong_t)(addr - dhp->dh_uvaddr); + while ((long)len > 0) { + /* + * get the optimal pfn to minimize address translations. + * devmap_roundup() returns residue bytes for next round + * calculations. + */ + len = devmap_roundup(dhp, off, len, &pfn, &pgsize); + + if (first) { + *laddr = dhp->dh_uvaddr + ptob(pfn - dhp->dh_pfn); + first = 0; + } + + *llen += pgsize; + off = ptob(pfn - dhp->dh_pfn) + pgsize; + } + /* Large page mapping len/addr cover more range than orginal fault */ + ASSERT(*llen >= len && *laddr <= addr); + ASSERT((*laddr + *llen) >= (addr + len)); +} + +/* + * Initialize the devmap_softlock structure. + */ +static struct devmap_softlock * +devmap_softlock_init(dev_t dev, ulong_t id) +{ + struct devmap_softlock *slock; + struct devmap_softlock *tmp; + + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_SOFTLOCK_INIT, + "devmap_softlock_init:start"); + + tmp = kmem_zalloc(sizeof (struct devmap_softlock), KM_SLEEP); + mutex_enter(&devmap_slock); + + for (slock = devmap_slist; slock != NULL; slock = slock->next) + if ((slock->dev == dev) && (slock->id == id)) + break; + + if (slock == NULL) { + slock = tmp; + slock->dev = dev; + slock->id = id; + mutex_init(&slock->lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&slock->cv, NULL, CV_DEFAULT, NULL); + slock->next = devmap_slist; + devmap_slist = slock; + } else + kmem_free(tmp, sizeof (struct devmap_softlock)); + + mutex_enter(&slock->lock); + slock->refcnt++; + mutex_exit(&slock->lock); + mutex_exit(&devmap_slock); + + return (slock); +} + +/* + * Wake up processes that sleep on softlocked. + * Free dh_softlock if refcnt is 0. 
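+ * This drops the reference taken in devmap_softlock_init(); since a
+ * softlock structure is shared by every handle created with the same
+ * (dev, id) pair, it is destroyed only when the last such handle
+ * goes away.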
+ */ +static void +devmap_softlock_rele(devmap_handle_t *dhp) +{ + struct devmap_softlock *slock = dhp->dh_softlock; + struct devmap_softlock *tmp; + struct devmap_softlock *parent; + + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_SOFTLOCK_RELE, + "devmap_softlock_rele:start"); + + mutex_enter(&devmap_slock); + mutex_enter(&slock->lock); + + ASSERT(slock->refcnt > 0); + + slock->refcnt--; + + /* + * If no one is using the device, free up the slock data. + */ + if (slock->refcnt == 0) { + slock->softlocked = 0; + cv_signal(&slock->cv); + + if (devmap_slist == slock) + devmap_slist = slock->next; + else { + parent = devmap_slist; + for (tmp = devmap_slist->next; tmp != NULL; + tmp = tmp->next) { + if (tmp == slock) { + parent->next = tmp->next; + break; + } + parent = tmp; + } + } + mutex_exit(&slock->lock); + mutex_destroy(&slock->lock); + cv_destroy(&slock->cv); + kmem_free(slock, sizeof (struct devmap_softlock)); + } else + mutex_exit(&slock->lock); + + mutex_exit(&devmap_slock); +} + +/* + * Wake up processes that sleep on dh_ctx->locked. + * Free dh_ctx if refcnt is 0. + */ +static void +devmap_ctx_rele(devmap_handle_t *dhp) +{ + struct devmap_ctx *devctx = dhp->dh_ctx; + struct devmap_ctx *tmp; + struct devmap_ctx *parent; + timeout_id_t tid; + + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_CTX_RELE, + "devmap_ctx_rele:start"); + + mutex_enter(&devmapctx_lock); + mutex_enter(&devctx->lock); + + ASSERT(devctx->refcnt > 0); + + devctx->refcnt--; + + /* + * If no one is using the device, free up the devctx data. + */ + if (devctx->refcnt == 0) { + /* + * Untimeout any threads using this mapping as they are about + * to go away. + */ + if (devctx->timeout != 0) { + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_CTX_RELE_CK1, + "devmap_ctx_rele:untimeout ctx->timeout"); + + tid = devctx->timeout; + mutex_exit(&devctx->lock); + (void) untimeout(tid); + mutex_enter(&devctx->lock); + } + + devctx->oncpu = 0; + cv_signal(&devctx->cv); + + if (devmapctx_list == devctx) + devmapctx_list = devctx->next; + else { + parent = devmapctx_list; + for (tmp = devmapctx_list->next; tmp != NULL; + tmp = tmp->next) { + if (tmp == devctx) { + parent->next = tmp->next; + break; + } + parent = tmp; + } + } + mutex_exit(&devctx->lock); + mutex_destroy(&devctx->lock); + cv_destroy(&devctx->cv); + kmem_free(devctx, sizeof (struct devmap_ctx)); + } else + mutex_exit(&devctx->lock); + + mutex_exit(&devmapctx_lock); +} + +/* + * devmap_load: + * Marks a segdev segment or pages if offset->offset+len + * is not the entire segment as nointercept and faults in + * the pages in the range offset -> offset+len. + */ +int +devmap_load(devmap_cookie_t dhc, offset_t offset, size_t len, uint_t type, + uint_t rw) +{ + devmap_handle_t *dhp = (devmap_handle_t *)dhc; + struct as *asp = dhp->dh_seg->s_as; + caddr_t addr; + ulong_t size; + ssize_t soff; /* offset from the beginning of the segment */ + int rc; + + TRACE_3(TR_FAC_DEVMAP, TR_DEVMAP_LOAD, + "devmap_load:start dhp=%p offset=%llx len=%lx", + (void *)dhp, offset, len); + + DEBUGF(7, (CE_CONT, "devmap_load: dhp %p offset %llx len %lx\n", + (void *)dhp, offset, len)); + + /* + * Hat layer only supports devload to process' context for which + * the as lock is held. Verify here and return error if drivers + * inadvertently call devmap_load on a wrong devmap handle. 
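+ *
+ * As a hedged illustration (hypothetical driver code, not part of this
+ * file): a driver that needs no private context management typically
+ * forwards its devmap_access(9E) callback straight to devmap_load(),
+ * just as the framework's own devmap_default_access() does later in
+ * this file:
+ *
+ *	static int
+ *	xx_devmap_access(devmap_cookie_t dhp, void *pvtp, offset_t off,
+ *	    size_t len, uint_t type, uint_t rw)
+ *	{
+ *		return (devmap_load(dhp, off, len, type, rw));
+ *	}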
+ */ + if ((asp != &kas) && !AS_LOCK_HELD(asp, &asp->a_lock)) + return (FC_MAKE_ERR(EINVAL)); + + soff = (ssize_t)(offset - dhp->dh_uoff); + soff = round_down_p2(soff, PAGESIZE); + if (soff < 0 || soff >= dhp->dh_len) + return (FC_MAKE_ERR(EINVAL)); + + /* + * Address and size must be page aligned. Len is set to the + * number of bytes in the number of pages that are required to + * support len. Offset is set to the byte offset of the first byte + * of the page that contains offset. + */ + len = round_up_p2(len, PAGESIZE); + + /* + * If len == 0, then calculate the size by getting + * the number of bytes from offset to the end of the segment. + */ + if (len == 0) + size = dhp->dh_len - soff; + else { + size = len; + if ((soff + size) > dhp->dh_len) + return (FC_MAKE_ERR(EINVAL)); + } + + /* + * The address is offset bytes from the base address of + * the segment. + */ + addr = (caddr_t)(soff + dhp->dh_uvaddr); + + HOLD_DHP_LOCK(dhp); + rc = segdev_faultpages(asp->a_hat, + dhp->dh_seg, addr, size, type, rw, dhp); + RELE_DHP_LOCK(dhp); + return (rc); +} + +int +devmap_setup(dev_t dev, offset_t off, struct as *as, caddr_t *addrp, + size_t len, uint_t prot, uint_t maxprot, uint_t flags, struct cred *cred) +{ + register devmap_handle_t *dhp; + int (*devmap)(dev_t, devmap_cookie_t, offset_t, size_t, + size_t *, uint_t); + int (*mmap)(dev_t, off_t, int); + struct devmap_callback_ctl *callbackops; + devmap_handle_t *dhp_head = NULL; + devmap_handle_t *dhp_prev = NULL; + devmap_handle_t *dhp_curr; + caddr_t addr; + int map_flag; + int ret; + ulong_t total_len; + size_t map_len; + size_t resid_len = len; + offset_t map_off = off; + struct devmap_softlock *slock = NULL; + +#ifdef lint + cred = cred; +#endif + + TRACE_2(TR_FAC_DEVMAP, TR_DEVMAP_SETUP, + "devmap_setup:start off=%llx len=%lx", off, len); + DEBUGF(3, (CE_CONT, "devmap_setup: off %llx len %lx\n", + off, len)); + + devmap = devopsp[getmajor(dev)]->devo_cb_ops->cb_devmap; + mmap = devopsp[getmajor(dev)]->devo_cb_ops->cb_mmap; + + /* + * driver must provide devmap(9E) entry point in cb_ops to use the + * devmap framework. + */ + if (devmap == NULL || devmap == nulldev || devmap == nodev) + return (EINVAL); + + /* + * To protect from an inadvertent entry because the devmap entry point + * is not NULL, return error if D_DEVMAP bit is not set in cb_flag and + * mmap is NULL. + */ + map_flag = devopsp[getmajor(dev)]->devo_cb_ops->cb_flag; + if ((map_flag & D_DEVMAP) == 0 && (mmap == NULL || mmap == nulldev)) + return (EINVAL); + + /* + * devmap allows mmap(2) to map multiple registers. + * one devmap_handle is created for each register mapped. + */ + for (total_len = 0; total_len < len; total_len += map_len) { + dhp = kmem_zalloc(sizeof (devmap_handle_t), KM_SLEEP); + + if (dhp_prev != NULL) + dhp_prev->dh_next = dhp; + else + dhp_head = dhp; + dhp_prev = dhp; + + dhp->dh_prot = prot; + dhp->dh_orig_maxprot = dhp->dh_maxprot = maxprot; + dhp->dh_dev = dev; + dhp->dh_timeout_length = CTX_TIMEOUT_VALUE; + dhp->dh_uoff = map_off; + + /* + * Get mapping specific info from + * the driver, such as rnumber, roff, len, callbackops, + * accattrp and, if the mapping is for kernel memory, + * ddi_umem_cookie. 
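+ *
+ * A minimal sketch of such a driver devmap(9E) entry point (hypothetical
+ * code; xx_dip, XX_RNUMBER and xx_acc_attr are assumptions, not part of
+ * this file):
+ *
+ *	static int
+ *	xx_devmap(dev_t dev, devmap_cookie_t dhp, offset_t off,
+ *	    size_t len, size_t *maplen, uint_t model)
+ *	{
+ *		size_t length = ptob(btopr(len));
+ *
+ *		if (devmap_devmem_setup(dhp, xx_dip, NULL, XX_RNUMBER,
+ *		    off, length, PROT_ALL, 0, &xx_acc_attr) != DDI_SUCCESS)
+ *			return (ENXIO);
+ *		*maplen = length;
+ *		return (0);
+ *	}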
+ */ + if ((ret = cdev_devmap(dev, dhp, map_off, + resid_len, &map_len, get_udatamodel())) != 0) { + free_devmap_handle(dhp_head); + return (ENXIO); + } + + if (map_len & PAGEOFFSET) { + free_devmap_handle(dhp_head); + return (EINVAL); + } + + callbackops = &dhp->dh_callbackops; + + if ((callbackops->devmap_access == NULL) || + (callbackops->devmap_access == nulldev) || + (callbackops->devmap_access == nodev)) { + /* + * Normally devmap does not support MAP_PRIVATE unless + * the drivers provide a valid devmap_access routine. + */ + if ((flags & MAP_PRIVATE) != 0) { + free_devmap_handle(dhp_head); + return (EINVAL); + } + } else { + /* + * Initialize dhp_softlock and dh_ctx if the drivers + * provide devmap_access. + */ + dhp->dh_softlock = devmap_softlock_init(dev, + (ulong_t)callbackops->devmap_access); + dhp->dh_ctx = devmap_ctxinit(dev, + (ulong_t)callbackops->devmap_access); + + /* + * segdev_fault can only work when all + * dh_softlock in a multi-dhp mapping + * are same. see comments in segdev_fault + * This code keeps track of the first + * dh_softlock allocated in slock and + * compares all later allocations and if + * not similar, returns an error. + */ + if (slock == NULL) + slock = dhp->dh_softlock; + if (slock != dhp->dh_softlock) { + free_devmap_handle(dhp_head); + return (ENOTSUP); + } + } + + map_off += map_len; + resid_len -= map_len; + } + + /* + * get the user virtual address and establish the mapping between + * uvaddr and device physical address. + */ + if ((ret = devmap_device(dhp_head, as, addrp, off, len, flags)) + != 0) { + /* + * free devmap handles if error during the mapping. + */ + free_devmap_handle(dhp_head); + + return (ret); + } + + /* + * call the driver's devmap_map callback to do more after the mapping, + * such as to allocate driver private data for context management. + */ + dhp = dhp_head; + map_off = off; + addr = *addrp; + while (dhp != NULL) { + callbackops = &dhp->dh_callbackops; + dhp->dh_uvaddr = addr; + dhp_curr = dhp; + if (callbackops->devmap_map != NULL) { + ret = (*callbackops->devmap_map)((devmap_cookie_t)dhp, + dev, flags, map_off, + dhp->dh_len, &dhp->dh_pvtp); + if (ret != 0) { + struct segdev_data *sdp; + + /* + * call driver's devmap_unmap entry point + * to free driver resources. + */ + dhp = dhp_head; + map_off = off; + while (dhp != dhp_curr) { + callbackops = &dhp->dh_callbackops; + if (callbackops->devmap_unmap != NULL) { + (*callbackops->devmap_unmap)( + dhp, dhp->dh_pvtp, + map_off, dhp->dh_len, + NULL, NULL, NULL, NULL); + } + map_off += dhp->dh_len; + dhp = dhp->dh_next; + } + sdp = dhp_head->dh_seg->s_data; + sdp->devmap_data = NULL; + free_devmap_handle(dhp_head); + return (ENXIO); + } + } + map_off += dhp->dh_len; + addr += dhp->dh_len; + dhp = dhp->dh_next; + } + + return (0); +} + +int +ddi_devmap_segmap(dev_t dev, off_t off, ddi_as_handle_t as, caddr_t *addrp, + off_t len, uint_t prot, uint_t maxprot, uint_t flags, struct cred *cred) +{ + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_SEGMAP, + "devmap_segmap:start"); + return (devmap_setup(dev, (offset_t)off, (struct as *)as, addrp, + (size_t)len, prot, maxprot, flags, cred)); +} + +/* + * Called from devmap_devmem_setup/remap to see if can use large pages for + * this device mapping. + * Also calculate the max. page size for this mapping. + * this page size will be used in fault routine for + * optimal page size calculations. 
+ */ +static void +devmap_devmem_large_page_setup(devmap_handle_t *dhp) +{ + ASSERT(dhp_is_devmem(dhp)); + dhp->dh_mmulevel = 0; + + /* + * use large page size only if: + * 1. device memory. + * 2. mmu supports multiple page sizes, + * 3. Driver did not disallow it + * 4. dhp length is at least as big as the large pagesize + * 5. the uvaddr and pfn are large pagesize aligned + */ + if (page_num_pagesizes() > 1 && + !(dhp->dh_flags & (DEVMAP_USE_PAGESIZE | DEVMAP_MAPPING_INVALID))) { + ulong_t base; + int level; + + base = (ulong_t)ptob(dhp->dh_pfn); + for (level = 1; level < page_num_pagesizes(); level++) { + size_t pgsize = page_get_pagesize(level); + if ((dhp->dh_len < pgsize) || + (!VA_PA_PGSIZE_ALIGNED((uintptr_t)dhp->dh_uvaddr, + base, pgsize))) { + break; + } + } + dhp->dh_mmulevel = level - 1; + } + if (dhp->dh_mmulevel > 0) { + dhp->dh_flags |= DEVMAP_FLAG_LARGE; + } else { + dhp->dh_flags &= ~DEVMAP_FLAG_LARGE; + } +} + +/* + * Called by driver devmap routine to pass device specific info to + * the framework. used for device memory mapping only. + */ +int +devmap_devmem_setup(devmap_cookie_t dhc, dev_info_t *dip, + struct devmap_callback_ctl *callbackops, uint_t rnumber, offset_t roff, + size_t len, uint_t maxprot, uint_t flags, ddi_device_acc_attr_t *accattrp) +{ + devmap_handle_t *dhp = (devmap_handle_t *)dhc; + ddi_acc_handle_t handle; + ddi_map_req_t mr; + ddi_acc_hdl_t *hp; + int err; + + TRACE_4(TR_FAC_DEVMAP, TR_DEVMAP_DEVMEM_SETUP, + "devmap_devmem_setup:start dhp=%p offset=%llx rnum=%d len=%lx", + (void *)dhp, roff, rnumber, (uint_t)len); + DEBUGF(2, (CE_CONT, "devmap_devmem_setup: dhp %p offset %llx " + "rnum %d len %lx\n", (void *)dhp, roff, rnumber, len)); + + /* + * First to check if this function has been called for this dhp. + */ + if (dhp->dh_flags & DEVMAP_SETUP_DONE) + return (DDI_FAILURE); + + if ((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) != dhp->dh_prot) + return (DDI_FAILURE); + + if (flags & DEVMAP_MAPPING_INVALID) { + /* + * Don't go up the tree to get pfn if the driver specifies + * DEVMAP_MAPPING_INVALID in flags. + * + * If DEVMAP_MAPPING_INVALID is specified, we have to grant + * remap permission. + */ + if (!(flags & DEVMAP_ALLOW_REMAP)) { + return (DDI_FAILURE); + } + dhp->dh_pfn = PFN_INVALID; + } else { + handle = impl_acc_hdl_alloc(KM_SLEEP, NULL); + if (handle == NULL) + return (DDI_FAILURE); + + hp = impl_acc_hdl_get(handle); + hp->ah_vers = VERS_ACCHDL; + hp->ah_dip = dip; + hp->ah_rnumber = rnumber; + hp->ah_offset = roff; + hp->ah_len = len; + if (accattrp != NULL) + hp->ah_acc = *accattrp; + + mr.map_op = DDI_MO_MAP_LOCKED; + mr.map_type = DDI_MT_RNUMBER; + mr.map_obj.rnumber = rnumber; + mr.map_prot = maxprot & dhp->dh_orig_maxprot; + mr.map_flags = DDI_MF_DEVICE_MAPPING; + mr.map_handlep = hp; + mr.map_vers = DDI_MAP_VERSION; + + /* + * up the device tree to get pfn. + * The rootnex_map_regspec() routine in nexus drivers has been + * modified to return pfn if map_flags is DDI_MF_DEVICE_MAPPING. 
+ */ + err = ddi_map(dip, &mr, roff, len, (caddr_t *)&dhp->dh_pfn); + dhp->dh_hat_attr = hp->ah_hat_flags; + impl_acc_hdl_free(handle); + + if (err) + return (DDI_FAILURE); + } + /* Should not be using devmem setup for memory pages */ + ASSERT(!pf_is_memory(dhp->dh_pfn)); + + /* Only some of the flags bits are settable by the driver */ + dhp->dh_flags |= (flags & DEVMAP_SETUP_FLAGS); + dhp->dh_len = ptob(btopr(len)); + + dhp->dh_cookie = DEVMAP_DEVMEM_COOKIE; + dhp->dh_roff = ptob(btop(roff)); + + /* setup the dh_mmulevel and DEVMAP_FLAG_LARGE */ + devmap_devmem_large_page_setup(dhp); + dhp->dh_maxprot = maxprot & dhp->dh_orig_maxprot; + ASSERT((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) == dhp->dh_prot); + + + if (callbackops != NULL) { + bcopy(callbackops, &dhp->dh_callbackops, + sizeof (struct devmap_callback_ctl)); + } + + /* + * Initialize dh_lock if we want to do remap. + */ + if (dhp->dh_flags & DEVMAP_ALLOW_REMAP) { + mutex_init(&dhp->dh_lock, NULL, MUTEX_DEFAULT, NULL); + dhp->dh_flags |= DEVMAP_LOCK_INITED; + } + + dhp->dh_flags |= DEVMAP_SETUP_DONE; + + return (DDI_SUCCESS); +} + +int +devmap_devmem_remap(devmap_cookie_t dhc, dev_info_t *dip, + uint_t rnumber, offset_t roff, size_t len, uint_t maxprot, + uint_t flags, ddi_device_acc_attr_t *accattrp) +{ + devmap_handle_t *dhp = (devmap_handle_t *)dhc; + ddi_acc_handle_t handle; + ddi_map_req_t mr; + ddi_acc_hdl_t *hp; + pfn_t pfn; + uint_t hat_flags; + int err; + + TRACE_4(TR_FAC_DEVMAP, TR_DEVMAP_DEVMEM_REMAP, + "devmap_devmem_setup:start dhp=%p offset=%llx rnum=%d len=%lx", + (void *)dhp, roff, rnumber, (uint_t)len); + DEBUGF(2, (CE_CONT, "devmap_devmem_remap: dhp %p offset %llx " + "rnum %d len %lx\n", (void *)dhp, roff, rnumber, len)); + + /* + * Return failure if setup has not been done or no remap permission + * has been granted during the setup. + */ + if ((dhp->dh_flags & DEVMAP_SETUP_DONE) == 0 || + (dhp->dh_flags & DEVMAP_ALLOW_REMAP) == 0) + return (DDI_FAILURE); + + /* Only DEVMAP_MAPPING_INVALID flag supported for remap */ + if ((flags != 0) && (flags != DEVMAP_MAPPING_INVALID)) + return (DDI_FAILURE); + + if ((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) != dhp->dh_prot) + return (DDI_FAILURE); + + if (!(flags & DEVMAP_MAPPING_INVALID)) { + handle = impl_acc_hdl_alloc(KM_SLEEP, NULL); + if (handle == NULL) + return (DDI_FAILURE); + } + + HOLD_DHP_LOCK(dhp); + + /* + * Unload the old mapping, so next fault will setup the new mappings + * Do this while holding the dhp lock so other faults dont reestablish + * the mappings + */ + hat_unload(dhp->dh_seg->s_as->a_hat, dhp->dh_uvaddr, + dhp->dh_len, HAT_UNLOAD|HAT_UNLOAD_OTHER); + + if (flags & DEVMAP_MAPPING_INVALID) { + dhp->dh_flags |= DEVMAP_MAPPING_INVALID; + dhp->dh_pfn = PFN_INVALID; + } else { + /* clear any prior DEVMAP_MAPPING_INVALID flag */ + dhp->dh_flags &= ~DEVMAP_MAPPING_INVALID; + hp = impl_acc_hdl_get(handle); + hp->ah_vers = VERS_ACCHDL; + hp->ah_dip = dip; + hp->ah_rnumber = rnumber; + hp->ah_offset = roff; + hp->ah_len = len; + if (accattrp != NULL) + hp->ah_acc = *accattrp; + + mr.map_op = DDI_MO_MAP_LOCKED; + mr.map_type = DDI_MT_RNUMBER; + mr.map_obj.rnumber = rnumber; + mr.map_prot = maxprot & dhp->dh_orig_maxprot; + mr.map_flags = DDI_MF_DEVICE_MAPPING; + mr.map_handlep = hp; + mr.map_vers = DDI_MAP_VERSION; + + /* + * up the device tree to get pfn. + * The rootnex_map_regspec() routine in nexus drivers has been + * modified to return pfn if map_flags is DDI_MF_DEVICE_MAPPING. 
+ */ + err = ddi_map(dip, &mr, roff, len, (caddr_t *)&pfn); + hat_flags = hp->ah_hat_flags; + impl_acc_hdl_free(handle); + if (err) { + RELE_DHP_LOCK(dhp); + return (DDI_FAILURE); + } + /* + * Store result of ddi_map first in local variables, as we do + * not want to overwrite the existing dhp with wrong data. + */ + dhp->dh_pfn = pfn; + dhp->dh_hat_attr = hat_flags; + } + + /* clear the large page size flag */ + dhp->dh_flags &= ~DEVMAP_FLAG_LARGE; + + dhp->dh_cookie = DEVMAP_DEVMEM_COOKIE; + dhp->dh_roff = ptob(btop(roff)); + + /* setup the dh_mmulevel and DEVMAP_FLAG_LARGE */ + devmap_devmem_large_page_setup(dhp); + dhp->dh_maxprot = maxprot & dhp->dh_orig_maxprot; + ASSERT((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) == dhp->dh_prot); + + RELE_DHP_LOCK(dhp); + return (DDI_SUCCESS); +} + +/* + * called by driver devmap routine to pass kernel virtual address mapping + * info to the framework. used only for kernel memory + * allocated from ddi_umem_alloc(). + */ +int +devmap_umem_setup(devmap_cookie_t dhc, dev_info_t *dip, + struct devmap_callback_ctl *callbackops, ddi_umem_cookie_t cookie, + offset_t off, size_t len, uint_t maxprot, uint_t flags, + ddi_device_acc_attr_t *accattrp) +{ + devmap_handle_t *dhp = (devmap_handle_t *)dhc; + struct ddi_umem_cookie *cp = (struct ddi_umem_cookie *)cookie; + +#ifdef lint + dip = dip; + accattrp = accattrp; +#endif + + TRACE_4(TR_FAC_DEVMAP, TR_DEVMAP_UMEM_SETUP, + "devmap_umem_setup:start dhp=%p offset=%llx cookie=%p len=%lx", + (void *)dhp, off, cookie, len); + DEBUGF(2, (CE_CONT, "devmap_umem_setup: dhp %p offset %llx " + "cookie %p len %lx\n", (void *)dhp, off, (void *)cookie, len)); + + if (cookie == NULL) + return (DDI_FAILURE); + + /* For UMEM_TRASH, this restriction is not needed */ + if ((off + len) > cp->size) + return (DDI_FAILURE); + + /* + * First to check if this function has been called for this dhp. + */ + if (dhp->dh_flags & DEVMAP_SETUP_DONE) + return (DDI_FAILURE); + + if ((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) != dhp->dh_prot) + return (DDI_FAILURE); + + if (flags & DEVMAP_MAPPING_INVALID) { + /* + * If DEVMAP_MAPPING_INVALID is specified, we have to grant + * remap permission. + */ + if (!(flags & DEVMAP_ALLOW_REMAP)) { + return (DDI_FAILURE); + } + } else { + dhp->dh_cookie = cookie; + dhp->dh_roff = ptob(btop(off)); + dhp->dh_cvaddr = cp->cvaddr + dhp->dh_roff; + } + + /* + * The default is _not_ to pass HAT_LOAD_NOCONSIST to hat_devload(); + * we pass HAT_LOAD_NOCONSIST _only_ in cases where hat tries to + * create consistent mappings but our intention was to create + * non-consistent mappings. + * + * DEVMEM: hat figures it out it's DEVMEM and creates non-consistent + * mappings. + * + * kernel exported memory: hat figures it out it's memory and always + * creates consistent mappings. + * + * /dev/mem: non-consistent mappings. See comments in common/io/mem.c + * + * /dev/kmem: consistent mappings are created unless they are + * MAP_FIXED. We _explicitly_ tell hat to create non-consistent + * mappings by passing HAT_LOAD_NOCONSIST in case of MAP_FIXED + * mappings of /dev/kmem. 
See common/io/mem.c + */ + + /* Only some of the flags bits are settable by the driver */ + dhp->dh_flags |= (flags & DEVMAP_SETUP_FLAGS); + + dhp->dh_len = ptob(btopr(len)); + dhp->dh_maxprot = maxprot & dhp->dh_orig_maxprot; + ASSERT((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) == dhp->dh_prot); + + if (callbackops != NULL) { + bcopy(callbackops, &dhp->dh_callbackops, + sizeof (struct devmap_callback_ctl)); + } + /* + * Initialize dh_lock if we want to do remap. + */ + if (dhp->dh_flags & DEVMAP_ALLOW_REMAP) { + mutex_init(&dhp->dh_lock, NULL, MUTEX_DEFAULT, NULL); + dhp->dh_flags |= DEVMAP_LOCK_INITED; + } + + dhp->dh_flags |= DEVMAP_SETUP_DONE; + + return (DDI_SUCCESS); +} + +int +devmap_umem_remap(devmap_cookie_t dhc, dev_info_t *dip, + ddi_umem_cookie_t cookie, offset_t off, size_t len, uint_t maxprot, + uint_t flags, ddi_device_acc_attr_t *accattrp) +{ + devmap_handle_t *dhp = (devmap_handle_t *)dhc; + struct ddi_umem_cookie *cp = (struct ddi_umem_cookie *)cookie; + + TRACE_4(TR_FAC_DEVMAP, TR_DEVMAP_UMEM_REMAP, + "devmap_umem_remap:start dhp=%p offset=%llx cookie=%p len=%lx", + (void *)dhp, off, cookie, len); + DEBUGF(2, (CE_CONT, "devmap_umem_remap: dhp %p offset %llx " + "cookie %p len %lx\n", (void *)dhp, off, (void *)cookie, len)); + +#ifdef lint + dip = dip; + accattrp = accattrp; +#endif + /* + * Reture failure if setup has not been done or no remap permission + * has been granted during the setup. + */ + if ((dhp->dh_flags & DEVMAP_SETUP_DONE) == 0 || + (dhp->dh_flags & DEVMAP_ALLOW_REMAP) == 0) + return (DDI_FAILURE); + + /* No flags supported for remap yet */ + if (flags != 0) + return (DDI_FAILURE); + + if ((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) != dhp->dh_prot) + return (DDI_FAILURE); + + /* For UMEM_TRASH, this restriction is not needed */ + if ((off + len) > cp->size) + return (DDI_FAILURE); + + HOLD_DHP_LOCK(dhp); + /* + * Unload the old mapping, so next fault will setup the new mappings + * Do this while holding the dhp lock so other faults dont reestablish + * the mappings + */ + hat_unload(dhp->dh_seg->s_as->a_hat, dhp->dh_uvaddr, + dhp->dh_len, HAT_UNLOAD|HAT_UNLOAD_OTHER); + + dhp->dh_cookie = cookie; + dhp->dh_roff = ptob(btop(off)); + dhp->dh_cvaddr = cp->cvaddr + dhp->dh_roff; + + /* clear the large page size flag */ + dhp->dh_flags &= ~DEVMAP_FLAG_LARGE; + + dhp->dh_maxprot = maxprot & dhp->dh_orig_maxprot; + ASSERT((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) == dhp->dh_prot); + RELE_DHP_LOCK(dhp); + return (DDI_SUCCESS); +} + +/* + * to set timeout value for the driver's context management callback, e.g. + * devmap_access(). + */ +void +devmap_set_ctx_timeout(devmap_cookie_t dhc, clock_t ticks) +{ + devmap_handle_t *dhp = (devmap_handle_t *)dhc; + + TRACE_2(TR_FAC_DEVMAP, TR_DEVMAP_SET_CTX_TIMEOUT, + "devmap_set_ctx_timeout:start dhp=%p ticks=%x", + (void *)dhp, ticks); + dhp->dh_timeout_length = ticks; +} + +int +devmap_default_access(devmap_cookie_t dhp, void *pvtp, offset_t off, + size_t len, uint_t type, uint_t rw) +{ +#ifdef lint + pvtp = pvtp; +#endif + + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_DEFAULT_ACCESS, + "devmap_default_access:start"); + return (devmap_load(dhp, off, len, type, rw)); +} + +/* + * segkmem_alloc() wrapper to allocate memory which is both + * non-relocatable (for DR) and sharelocked, since the rest + * of this segment driver requires it. 
+ */ +static void * +devmap_alloc_pages(vmem_t *vmp, size_t size, int vmflag) +{ + ASSERT(vmp != NULL); + ASSERT(kvseg.s_base != NULL); + vmflag |= (VM_NORELOC | SEGKMEM_SHARELOCKED); + return (segkmem_alloc(vmp, size, vmflag)); +} + +/* + * This is where things are a bit incestrous with seg_kmem: unlike + * seg_kp, seg_kmem does not keep its pages long-term sharelocked, so + * we need to do a bit of a dance around that to prevent duplication of + * code until we decide to bite the bullet and implement a new kernel + * segment for driver-allocated memory that is exported to user space. + */ +static void +devmap_free_pages(vmem_t *vmp, void *inaddr, size_t size) +{ + page_t *pp; + caddr_t addr = inaddr; + caddr_t eaddr; + pgcnt_t npages = btopr(size); + + ASSERT(vmp != NULL); + ASSERT(kvseg.s_base != NULL); + ASSERT(((uintptr_t)addr & PAGEOFFSET) == 0); + + hat_unload(kas.a_hat, addr, size, HAT_UNLOAD_UNLOCK); + + for (eaddr = addr + size; addr < eaddr; addr += PAGESIZE) { + /* + * Use page_find() instead of page_lookup() to find the page + * since we know that it is hashed and has a shared lock. + */ + pp = page_find(&kvp, (u_offset_t)(uintptr_t)addr); + + if (pp == NULL) + panic("devmap_free_pages: page not found"); + if (!page_tryupgrade(pp)) { + page_unlock(pp); + pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)addr, + SE_EXCL); + if (pp == NULL) + panic("devmap_free_pages: page already freed"); + } + /* Clear p_lckcnt so page_destroy() doesn't update availrmem */ + pp->p_lckcnt = 0; + page_destroy(pp, 0); + } + page_unresv(npages); + + if (vmp != NULL) + vmem_free(vmp, inaddr, size); +} + +/* + * devmap_umem_alloc_np() replaces kmem_zalloc() as the method for + * allocating non-pageable kmem in response to a ddi_umem_alloc() + * default request. For now we allocate our own pages and we keep + * them long-term sharelocked, since: A) the fault routines expect the + * memory to already be locked; B) pageable umem is already long-term + * locked; C) it's a lot of work to make it otherwise, particuarly + * since the nexus layer expects the pages to never fault. An RFE is to + * not keep the pages long-term locked, but instead to be able to + * take faults on them and simply look them up in kvp in case we + * fault on them. Even then, we must take care not to let pageout + * steal them from us since the data must remain resident; if we + * do this we must come up with some way to pin the pages to prevent + * faults while a driver is doing DMA to/from them. + */ +static void * +devmap_umem_alloc_np(size_t size, size_t flags) +{ + void *buf; + int vmflags = (flags & DDI_UMEM_NOSLEEP)? VM_NOSLEEP : VM_SLEEP; + + buf = vmem_alloc(umem_np_arena, size, vmflags); + if (buf != NULL) + bzero(buf, size); + return (buf); +} + +static void +devmap_umem_free_np(void *addr, size_t size) +{ + vmem_free(umem_np_arena, addr, size); +} + +/* + * allocate page aligned kernel memory for exporting to user land. + * The devmap framework will use the cookie allocated by ddi_umem_alloc() + * to find a user virtual address that is in same color as the address + * allocated here. 
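+ *
+ * A hedged usage sketch (hypothetical driver code; xx_softc, xx_dip and
+ * the error handling are assumptions, not part of this file). The driver
+ * first allocates the memory and keeps the cookie:
+ *
+ *	xx_softc->xx_kva = ddi_umem_alloc(ptob(btopr(size)),
+ *	    DDI_UMEM_NOSLEEP, &xx_softc->xx_cookie);
+ *	if (xx_softc->xx_kva == NULL)
+ *		return (ENOMEM);
+ *
+ * and later hands the same cookie back to the framework from its
+ * devmap(9E) entry point:
+ *
+ *	if (devmap_umem_setup(dhp, xx_dip, NULL, xx_softc->xx_cookie,
+ *	    off, len, PROT_ALL, 0, NULL) != DDI_SUCCESS)
+ *		return (ENXIO);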
+ */ +void * +ddi_umem_alloc(size_t size, int flags, ddi_umem_cookie_t *cookie) +{ + register size_t len = ptob(btopr(size)); + void *buf = NULL; + struct ddi_umem_cookie *cp; + int iflags = 0; + + *cookie = NULL; + + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_UMEM_ALLOC, + "devmap_umem_alloc:start"); + if (len == 0) + return ((void *)NULL); + + /* + * allocate cookie + */ + if ((cp = kmem_zalloc(sizeof (struct ddi_umem_cookie), + flags & DDI_UMEM_NOSLEEP ? KM_NOSLEEP : KM_SLEEP)) == NULL) { + ASSERT(flags & DDI_UMEM_NOSLEEP); + return ((void *)NULL); + } + + if (flags & DDI_UMEM_PAGEABLE) { + /* Only one of the flags is allowed */ + ASSERT(!(flags & DDI_UMEM_TRASH)); + /* initialize resource with 0 */ + iflags = KPD_ZERO; + + /* + * to allocate unlocked pageable memory, use segkp_get() to + * create a segkp segment. Since segkp can only service kas, + * other segment drivers such as segdev have to do + * as_fault(segkp, SOFTLOCK) in its fault routine, + */ + if (flags & DDI_UMEM_NOSLEEP) + iflags |= KPD_NOWAIT; + + if ((buf = segkp_get(segkp, len, iflags)) == NULL) { + kmem_free(cp, sizeof (struct ddi_umem_cookie)); + return ((void *)NULL); + } + cp->type = KMEM_PAGEABLE; + mutex_init(&cp->lock, NULL, MUTEX_DEFAULT, NULL); + cp->locked = 0; + } else if (flags & DDI_UMEM_TRASH) { + /* Only one of the flags is allowed */ + ASSERT(!(flags & DDI_UMEM_PAGEABLE)); + cp->type = UMEM_TRASH; + buf = NULL; + } else { + if ((buf = devmap_umem_alloc_np(len, flags)) == NULL) { + kmem_free(cp, sizeof (struct ddi_umem_cookie)); + return ((void *)NULL); + } + + cp->type = KMEM_NON_PAGEABLE; + } + + /* + * need to save size here. size will be used when + * we do kmem_free. + */ + cp->size = len; + cp->cvaddr = (caddr_t)buf; + + *cookie = (void *)cp; + return (buf); +} + +void +ddi_umem_free(ddi_umem_cookie_t cookie) +{ + struct ddi_umem_cookie *cp; + + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_UMEM_FREE, + "devmap_umem_free:start"); + + /* + * if cookie is NULL, no effects on the system + */ + if (cookie == NULL) + return; + + cp = (struct ddi_umem_cookie *)cookie; + + switch (cp->type) { + case KMEM_PAGEABLE : + ASSERT(cp->cvaddr != NULL && cp->size != 0); + /* + * Check if there are still any pending faults on the cookie + * while the driver is deleting it, + * XXX - could change to an ASSERT but wont catch errant drivers + */ + mutex_enter(&cp->lock); + if (cp->locked) { + mutex_exit(&cp->lock); + panic("ddi_umem_free for cookie with pending faults %p", + (void *)cp); + return; + } + + segkp_release(segkp, cp->cvaddr); + + /* + * release mutex associated with this cookie. 
+ */ + mutex_destroy(&cp->lock); + break; + case KMEM_NON_PAGEABLE : + ASSERT(cp->cvaddr != NULL && cp->size != 0); + devmap_umem_free_np(cp->cvaddr, cp->size); + break; + case UMEM_TRASH : + break; + case UMEM_LOCKED : + /* Callers should use ddi_umem_unlock for this type */ + ddi_umem_unlock(cookie); + /* Frees the cookie too */ + return; + default: + /* panic so we can diagnose the underlying cause */ + panic("ddi_umem_free: illegal cookie type 0x%x\n", + cp->type); + } + + kmem_free(cookie, sizeof (struct ddi_umem_cookie)); +} + + +static int +segdev_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp) +{ + struct segdev_data *sdp = (struct segdev_data *)seg->s_data; + + /* + * It looks as if it is always mapped shared + */ + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_GETMEMID, + "segdev_getmemid:start"); + memidp->val[0] = (uintptr_t)VTOCVP(sdp->vp); + memidp->val[1] = sdp->offset + (uintptr_t)(addr - seg->s_base); + return (0); +} + +/*ARGSUSED*/ +static lgrp_mem_policy_info_t * +segdev_getpolicy(struct seg *seg, caddr_t addr) +{ + return (NULL); +} + +/* + * ddi_umem_alloc() non-pageable quantum cache max size. + * This is just a SWAG. + */ +#define DEVMAP_UMEM_QUANTUM (8*PAGESIZE) + +/* + * Initialize seg_dev from boot. This routine sets up the trash page + * and creates the umem_np_arena used to back non-pageable memory + * requests. + */ +void +segdev_init(void) +{ + struct seg kseg; + + umem_np_arena = vmem_create("umem_np", NULL, 0, PAGESIZE, + devmap_alloc_pages, devmap_free_pages, heap_arena, + DEVMAP_UMEM_QUANTUM, VM_SLEEP); + + kseg.s_as = &kas; + trashpp = page_create_va(&trashvp, 0, PAGESIZE, + PG_NORELOC | PG_EXCL | PG_WAIT, &kseg, NULL); + if (trashpp == NULL) + panic("segdev_init: failed to create trash page"); + pagezero(trashpp, 0, PAGESIZE); + page_downgrade(trashpp); +} + +/* + * Invoke platform-dependent support routines so that /proc can have + * the platform code deal with curious hardware. + */ +int +segdev_copyfrom(struct seg *seg, + caddr_t uaddr, const void *devaddr, void *kaddr, size_t len) +{ + struct segdev_data *sdp = (struct segdev_data *)seg->s_data; + struct snode *sp = VTOS(VTOCVP(sdp->vp)); + + return (e_ddi_copyfromdev(sp->s_dip, + (off_t)(uaddr - seg->s_base), devaddr, kaddr, len)); +} + +int +segdev_copyto(struct seg *seg, + caddr_t uaddr, const void *kaddr, void *devaddr, size_t len) +{ + struct segdev_data *sdp = (struct segdev_data *)seg->s_data; + struct snode *sp = VTOS(VTOCVP(sdp->vp)); + + return (e_ddi_copytodev(sp->s_dip, + (off_t)(uaddr - seg->s_base), kaddr, devaddr, len)); +} diff --git a/usr/src/uts/common/vm/seg_dev.h b/usr/src/uts/common/vm/seg_dev.h new file mode 100644 index 0000000000..c498c06ecf --- /dev/null +++ b/usr/src/uts/common/vm/seg_dev.h @@ -0,0 +1,131 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#ifndef _VM_SEG_DEV_H +#define _VM_SEG_DEV_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/project.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Structure whose pointer is passed to the segdev_create routine + */ +struct segdev_crargs { + offset_t offset; /* starting offset */ + int (*mapfunc)(dev_t dev, off_t off, int prot); /* map function */ + dev_t dev; /* device number */ + uchar_t type; /* type of sharing done */ + uchar_t prot; /* protection */ + uchar_t maxprot; /* maximum protection */ + uint_t hat_attr; /* hat attr */ + uint_t hat_flags; /* currently, hat_flags is used ONLY for */ + /* HAT_LOAD_NOCONSIST; in future, it can be */ + /* expanded to include any flags that are */ + /* not already part of hat_attr */ + void *devmap_data; /* devmap_handle private data */ +}; + +/* + * (Semi) private data maintained by the seg_dev driver per segment mapping + * + * The segment lock is necessary to protect fields that are modified + * when the "read" version of the address space lock is held. This lock + * is not needed when the segment operation has the "write" version of + * the address space lock (it would be redundant). + * + * The following fields in segdev_data are read-only when the address + * space is "read" locked, and don't require the segment lock: + * + * vp + * offset + * mapfunc + * maxprot + */ +struct segdev_data { + offset_t offset; /* device offset for start of mapping */ + kmutex_t lock; /* protects segdev_data */ + int (*mapfunc)(dev_t dev, off_t off, int prot); + struct vnode *vp; /* vnode associated with device */ + uchar_t pageprot; /* true if per page protections present */ + uchar_t prot; /* current segment prot if pageprot == 0 */ + uchar_t maxprot; /* maximum segment protections */ + uchar_t type; /* type of sharing done */ + struct vpage *vpage; /* per-page information, if needed */ + uint_t hat_attr; /* hat attr - pass to attr in hat_devload */ + uint_t hat_flags; /* set HAT_LOAD_NOCONSIST flag in hat_devload */ + /* see comments above in segdev_crargs */ + size_t softlockcnt; /* # of SOFTLOCKED in seg */ + void *devmap_data; /* devmap_handle private data */ +}; + +/* Direct physical-userland mapping, without occupying kernel address space */ +#define DEVMAP_PMEM_COOKIE ((ddi_umem_cookie_t)0x2) + +/* + * pmem_cookie: + * Records physical memory pages to be exported to userland. 
+ */ +struct devmap_pmem_cookie { + pgcnt_t dp_npages; /* number of allocated mem pages */ + page_t **dp_pparray; /* pages allocated for this cookie */ + vnode_t *dp_vnp; /* vnode associated with this cookie */ + kproject_t *dp_projp; /* project ptr for resource ctl */ +}; + +#ifdef _KERNEL + +extern void segdev_init(void); + +extern int segdev_create(struct seg *, void *); + +extern int segdev_copyto(struct seg *, caddr_t, const void *, void *, size_t); +extern int segdev_copyfrom(struct seg *, caddr_t, const void *, void *, size_t); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_SEG_DEV_H */ diff --git a/usr/src/uts/common/vm/seg_enum.h b/usr/src/uts/common/vm/seg_enum.h new file mode 100644 index 0000000000..25922e7b40 --- /dev/null +++ b/usr/src/uts/common/vm/seg_enum.h @@ -0,0 +1,85 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#ifndef _VM_SEG_ENUM_H +#define _VM_SEG_ENUM_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * These enumerations are needed in both <vm/seg.h> and + * <sys/vnode.h> in order to declare function prototypes. + */ + +/* + * Fault information passed to the seg fault handling routine. + * The F_SOFTLOCK and F_SOFTUNLOCK are used by software + * to lock and unlock pages for physical I/O. + */ +enum fault_type { + F_INVAL, /* invalid page */ + F_PROT, /* protection fault */ + F_SOFTLOCK, /* software requested locking */ + F_SOFTUNLOCK /* software requested unlocking */ +}; + +/* + * Lock information passed to the seg pagelock handling routine. 
+ */ +enum lock_type { + L_PAGELOCK, /* lock pages */ + L_PAGEUNLOCK, /* unlock pages */ + L_PAGERECLAIM /* reclaim pages */ +}; + +/* + * seg_rw gives the access type for a fault operation + */ +enum seg_rw { + S_OTHER, /* unknown or not touched */ + S_READ, /* read access attempted */ + S_WRITE, /* write access attempted */ + S_EXEC, /* execution access attempted */ + S_CREATE, /* create if page doesn't exist */ + S_READ_NOCOW /* read access, don't do a copy on write */ +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_SEG_ENUM_H */ diff --git a/usr/src/uts/common/vm/seg_kmem.c b/usr/src/uts/common/vm/seg_kmem.c new file mode 100644 index 0000000000..6f0c8f5750 --- /dev/null +++ b/usr/src/uts/common/vm/seg_kmem.c @@ -0,0 +1,1516 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/t_lock.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/tuneable.h> +#include <sys/systm.h> +#include <sys/vm.h> +#include <sys/kmem.h> +#include <sys/vmem.h> +#include <sys/mman.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/dumphdr.h> +#include <sys/bootconf.h> +#include <sys/lgrp.h> +#include <vm/seg_kmem.h> +#include <vm/hat.h> +#include <vm/page.h> +#include <vm/vm_dep.h> +#include <vm/faultcode.h> +#include <sys/promif.h> +#include <vm/seg_kp.h> +#include <sys/bitmap.h> +#include <sys/mem_cage.h> + +/* + * seg_kmem is the primary kernel memory segment driver. It + * maps the kernel heap [kernelheap, ekernelheap), module text, + * and all memory which was allocated before the VM was initialized + * into kas. + * + * Pages which belong to seg_kmem are hashed into &kvp vnode at + * an offset equal to (u_offset_t)virt_addr, and have p_lckcnt >= 1. + * They must never be paged out since segkmem_fault() is a no-op to + * prevent recursive faults. + * + * Currently, seg_kmem pages are sharelocked (p_sharelock == 1) on + * __x86 and are unlocked (p_sharelock == 0) on __sparc. Once __x86 + * supports relocation the #ifdef kludges can be removed. + * + * seg_kmem pages may be subject to relocation by page_relocate(), + * provided that the HAT supports it; if this is so, segkmem_reloc + * will be set to a nonzero value. All boot time allocated memory as + * well as static memory is considered off limits to relocation. + * Pages are "relocatable" if p_state does not have P_NORELOC set, so + * we request P_NORELOC pages for memory that isn't safe to relocate. 
+ * + * The kernel heap is logically divided up into four pieces: + * + * heap32_arena is for allocations that require 32-bit absolute + * virtual addresses (e.g. code that uses 32-bit pointers/offsets). + * + * heap_core is for allocations that require 2GB *relative* + * offsets; in other words all memory from heap_core is within + * 2GB of all other memory from the same arena. This is a requirement + * of the addressing modes of some processors in supervisor code. + * + * heap_arena is the general heap arena. + * + * static_arena is the static memory arena. Allocations from it + * are not subject to relocation so it is safe to use the memory + * physical address as well as the virtual address (e.g. the VA to + * PA translations are static). Caches may import from static_arena; + * all other static memory allocations should use static_alloc_arena. + * + * On some platforms which have limited virtual address space, seg_kmem + * may share [kernelheap, ekernelheap) with seg_kp; if this is so, + * segkp_bitmap is non-NULL, and each bit represents a page of virtual + * address space which is actually seg_kp mapped. + */ + +extern ulong_t *segkp_bitmap; /* Is set if segkp is from the kernel heap */ + +char *kernelheap; /* start of primary kernel heap */ +char *ekernelheap; /* end of primary kernel heap */ +struct seg kvseg; /* primary kernel heap segment */ +struct seg kvseg_core; /* "core" kernel heap segment */ +vmem_t *heap_arena; /* primary kernel heap arena */ +vmem_t *heap_core_arena; /* core kernel heap arena */ +char *heap_core_base; /* start of core kernel heap arena */ +char *heap_lp_base; /* start of kernel large page heap arena */ +char *heap_lp_end; /* end of kernel large page heap arena */ +vmem_t *hat_memload_arena; /* HAT translation data */ +struct seg kvseg32; /* 32-bit kernel heap segment */ +vmem_t *heap32_arena; /* 32-bit kernel heap arena */ +vmem_t *heaptext_arena; /* heaptext arena */ +struct as kas; /* kernel address space */ +struct vnode kvp; /* vnode for all segkmem pages */ +int segkmem_reloc; /* enable/disable relocatable segkmem pages */ +vmem_t *static_arena; /* arena for caches to import static memory */ +vmem_t *static_alloc_arena; /* arena for allocating static memory */ + +/* + * seg_kmem driver can map part of the kernel heap with large pages. + * Currently this functionality is implemented for sparc platforms only. + * + * The large page size "segkmem_lpsize" for kernel heap is selected in the + * platform specific code. It can also be modified via /etc/system file. + * Setting segkmem_lpsize to PAGESIZE in /etc/system disables usage of large + * pages for kernel heap. "segkmem_lpshift" is adjusted appropriately to + * match segkmem_lpsize. + * + * At boot time we carve from kernel heap arena a range of virtual addresses + * that will be used for large page mappings. This range [heap_lp_base, + * heap_lp_end) is set up as a separate vmem arena - "heap_lp_arena". We also + * create "kmem_lp_arena" that caches memory already backed up by large + * pages. kmem_lp_arena imports virtual segments from heap_lp_arena. + */ + +size_t segkmem_lpsize; +static uint_t segkmem_lpshift = PAGESHIFT; + +size_t segkmem_kmemlp_quantum = 0x400000; /* 4MB */ +size_t segkmem_heaplp_quantum; +static vmem_t *heap_lp_arena; +static vmem_t *kmem_lp_arena; +static vmem_t *segkmem_ppa_arena; +static segkmem_lpcb_t segkmem_lpcb; + +/* + * We use "segkmem_kmemlp_max" to limit the total amount of physical memory + * consumed by the large page heap. 
By default this parameter is set to 1/4 of + * physmem but can be adjusted through /etc/system either directly or + * indirectly by setting "segkmem_kmemlp_pcnt" to the percent of physmem + * we allow for large page heap. + */ +size_t segkmem_kmemlp_max; +static uint_t segkmem_kmemlp_pcnt; + +/* + * Getting large pages for kernel heap could be problematic due to + * physical memory fragmentation. That's why we allow to preallocate + * "segkmem_kmemlp_min" bytes at boot time. + */ +static size_t segkmem_kmemlp_min; + +/* + * Throttling is used to avoid expensive tries to allocate large pages + * for kernel heap when a lot of succesive attempts to do so fail. + */ +static ulong_t segkmem_lpthrottle_max = 0x400000; +static ulong_t segkmem_lpthrottle_start = 0x40; +static ulong_t segkmem_use_lpthrottle = 1; + +/* + * Freed pages accumulate on a garbage list until segkmem is ready, + * at which point we call segkmem_gc() to free it all. + */ +typedef struct segkmem_gc_list { + struct segkmem_gc_list *gc_next; + vmem_t *gc_arena; + size_t gc_size; +} segkmem_gc_list_t; + +static segkmem_gc_list_t *segkmem_gc_list; + +/* + * Allocations from the hat_memload arena add VM_MEMLOAD to their + * vmflags so that segkmem_xalloc() can inform the hat layer that it needs + * to take steps to prevent infinite recursion. HAT allocations also + * must be non-relocatable to prevent recursive page faults. + */ +static void * +hat_memload_alloc(vmem_t *vmp, size_t size, int flags) +{ + flags |= (VM_MEMLOAD | VM_NORELOC); + return (segkmem_alloc(vmp, size, flags)); +} + +/* + * Allocations from static_arena arena (or any other arena that uses + * segkmem_alloc_permanent()) require non-relocatable (permanently + * wired) memory pages, since these pages are referenced by physical + * as well as virtual address. + */ +void * +segkmem_alloc_permanent(vmem_t *vmp, size_t size, int flags) +{ + return (segkmem_alloc(vmp, size, flags | VM_NORELOC)); +} + +/* + * Initialize kernel heap boundaries. + */ +void +kernelheap_init( + void *heap_start, + void *heap_end, + char *first_avail, + void *core_start, + void *core_end) +{ + uintptr_t textbase; + size_t core_size; + size_t heap_size; + vmem_t *heaptext_parent; + size_t heap_lp_size = 0; + + kernelheap = heap_start; + ekernelheap = heap_end; + +#ifdef __sparc + heap_lp_size = (((uintptr_t)heap_end - (uintptr_t)heap_start) / 4); + heap_lp_base = ekernelheap - heap_lp_size; + heap_lp_end = heap_lp_base + heap_lp_size; +#endif /* __sparc */ + + /* + * If this platform has a 'core' heap area, then the space for + * overflow module text should be carved out of the end of that + * heap. Otherwise, it gets carved out of the general purpose + * heap. + */ + core_size = (uintptr_t)core_end - (uintptr_t)core_start; + if (core_size > 0) { + ASSERT(core_size >= HEAPTEXT_SIZE); + textbase = (uintptr_t)core_end - HEAPTEXT_SIZE; + core_size -= HEAPTEXT_SIZE; + } +#ifndef __sparc + else { + ekernelheap -= HEAPTEXT_SIZE; + textbase = (uintptr_t)ekernelheap; + } +#endif + + heap_size = (uintptr_t)ekernelheap - (uintptr_t)kernelheap; + heap_arena = vmem_init("heap", kernelheap, heap_size, PAGESIZE, + segkmem_alloc, segkmem_free); + + if (core_size > 0) { + heap_core_arena = vmem_create("heap_core", core_start, + core_size, PAGESIZE, NULL, NULL, NULL, 0, VM_SLEEP); + heap_core_base = core_start; + } else { + heap_core_arena = heap_arena; + heap_core_base = kernelheap; + } + + /* + * reserve space for the large page heap. 
If large pages for kernel + * heap is enabled large page heap arean will be created later in the + * boot sequence in segkmem_heap_lp_init(). Otherwise the allocated + * range will be returned back to the heap_arena. + */ + if (heap_lp_size) { + (void) vmem_xalloc(heap_arena, heap_lp_size, PAGESIZE, 0, 0, + heap_lp_base, heap_lp_end, + VM_NOSLEEP | VM_BESTFIT | VM_PANIC); + } + + /* + * Remove the already-spoken-for memory range [kernelheap, first_avail). + */ + (void) vmem_xalloc(heap_arena, first_avail - kernelheap, PAGESIZE, + 0, 0, kernelheap, first_avail, VM_NOSLEEP | VM_BESTFIT | VM_PANIC); + +#ifdef __sparc + heap32_arena = vmem_create("heap32", (void *)SYSBASE32, + SYSLIMIT32 - SYSBASE32 - HEAPTEXT_SIZE, PAGESIZE, NULL, + NULL, NULL, 0, VM_SLEEP); + + textbase = SYSLIMIT32 - HEAPTEXT_SIZE; + heaptext_parent = NULL; +#else /* __sparc */ + heap32_arena = heap_core_arena; + heaptext_parent = heap_core_arena; +#endif /* __sparc */ + + heaptext_arena = vmem_create("heaptext", (void *)textbase, + HEAPTEXT_SIZE, PAGESIZE, NULL, NULL, heaptext_parent, 0, VM_SLEEP); + + /* + * Create a set of arenas for memory with static translations + * (e.g. VA -> PA translations cannot change). Since using + * kernel pages by physical address implies it isn't safe to + * walk across page boundaries, the static_arena quantum must + * be PAGESIZE. Any kmem caches that require static memory + * should source from static_arena, while direct allocations + * should only use static_alloc_arena. + */ + static_arena = vmem_create("static", NULL, 0, PAGESIZE, + segkmem_alloc_permanent, segkmem_free, heap_arena, 0, VM_SLEEP); + static_alloc_arena = vmem_create("static_alloc", NULL, 0, + sizeof (uint64_t), vmem_alloc, vmem_free, static_arena, + 0, VM_SLEEP); + + /* + * Create an arena for translation data (ptes, hmes, or hblks). + * We need an arena for this because hat_memload() is essential + * to vmem_populate() (see comments in common/os/vmem.c). + * + * Note: any kmem cache that allocates from hat_memload_arena + * must be created as a KMC_NOHASH cache (i.e. no external slab + * and bufctl structures to allocate) so that slab creation doesn't + * require anything more than a single vmem_alloc(). + */ + hat_memload_arena = vmem_create("hat_memload", NULL, 0, PAGESIZE, + hat_memload_alloc, segkmem_free, heap_arena, 0, + VM_SLEEP | VMC_POPULATOR); +} + +/* + * Grow kernel heap downward. + */ +void +kernelheap_extend(void *range_start, void *range_end) +{ + size_t len = (uintptr_t)range_end - (uintptr_t)range_start; + + ASSERT(range_start < range_end && range_end == kernelheap); + + if (vmem_add(heap_arena, range_start, len, VM_NOSLEEP) == NULL) { + cmn_err(CE_WARN, "Could not grow kernel heap below 0x%p", + (void *)kernelheap); + } else { + kernelheap = range_start; + } +} + +void +boot_mapin(caddr_t addr, size_t size) +{ + caddr_t eaddr; + page_t *pp; + pfn_t pfnum; + + if (page_resv(btop(size), KM_NOSLEEP) == 0) + panic("boot_mapin: page_resv failed"); + + for (eaddr = addr + size; addr < eaddr; addr += PAGESIZE) { + pfnum = va_to_pfn(addr); + if ((pp = page_numtopp_nolock(pfnum)) == NULL) + panic("boot_mapin(): No pp for pfnum = %lx", pfnum); + + /* + * must break up any large pages that may have constituent + * pages being utilized for BOP_ALLOC()'s before calling + * page_numtopp().The locking code (ie. 
page_reclaim()) + * can't handle them + */ + if (pp->p_szc != 0) + page_boot_demote(pp); + + pp = page_numtopp(pfnum, SE_EXCL); + if (pp == NULL || PP_ISFREE(pp)) + panic("boot_alloc: pp is NULL or free"); + + /* + * If the cage is on but doesn't yet contain this page, + * mark it as non-relocatable. + */ + if (kcage_on && !PP_ISNORELOC(pp)) + PP_SETNORELOC(pp); + + (void) page_hashin(pp, &kvp, (u_offset_t)(uintptr_t)addr, NULL); + pp->p_lckcnt = 1; +#if defined(__x86) + page_downgrade(pp); +#else + page_unlock(pp); +#endif + } +} + +/* + * Get pages from boot and hash them into the kernel's vp. + * Used after page structs have been allocated, but before segkmem is ready. + */ +void * +boot_alloc(void *inaddr, size_t size, uint_t align) +{ + caddr_t addr = inaddr; + + if (bootops == NULL) + prom_panic("boot_alloc: attempt to allocate memory after " + "BOP_GONE"); + + size = ptob(btopr(size)); + if (BOP_ALLOC(bootops, addr, size, align) != addr) + panic("boot_alloc: BOP_ALLOC failed"); + boot_mapin((caddr_t)addr, size); + return (addr); +} + +static void +segkmem_badop() +{ + panic("segkmem_badop"); +} + +#define SEGKMEM_BADOP(t) (t(*)())segkmem_badop + +/*ARGSUSED*/ +static faultcode_t +segkmem_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t size, + enum fault_type type, enum seg_rw rw) +{ + ASSERT(RW_READ_HELD(&seg->s_as->a_lock)); + + if (seg->s_as != &kas || size > seg->s_size || + addr < seg->s_base || addr + size > seg->s_base + seg->s_size) + panic("segkmem_fault: bad args"); + + if (segkp_bitmap && seg == &kvseg) { + + /* + * If it is one of segkp pages, call segkp_fault. + */ + if (BT_TEST(segkp_bitmap, + btop((uintptr_t)(addr - seg->s_base)))) + return (SEGOP_FAULT(hat, segkp, addr, size, type, rw)); + } + + switch (type) { + case F_SOFTLOCK: /* lock down already-loaded translations */ + if (rw == S_OTHER) { + hat_reserve(seg->s_as, addr, size); + return (0); + } + /*FALLTHROUGH*/ + case F_SOFTUNLOCK: + if (rw == S_READ || rw == S_WRITE) + return (0); + /*FALLTHROUGH*/ + default: + break; + } + return (FC_NOSUPPORT); +} + +static int +segkmem_setprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot) +{ + ASSERT(RW_LOCK_HELD(&seg->s_as->a_lock)); + + if (seg->s_as != &kas || size > seg->s_size || + addr < seg->s_base || addr + size > seg->s_base + seg->s_size) + panic("segkmem_setprot: bad args"); + + if (segkp_bitmap && seg == &kvseg) { + + /* + * If it is one of segkp pages, call segkp. + */ + if (BT_TEST(segkp_bitmap, + btop((uintptr_t)(addr - seg->s_base)))) + return (SEGOP_SETPROT(segkp, addr, size, prot)); + } + + if (prot == 0) + hat_unload(kas.a_hat, addr, size, HAT_UNLOAD); + else + hat_chgprot(kas.a_hat, addr, size, prot); + return (0); +} + +/* + * This is a dummy segkmem function overloaded to call segkp + * when segkp is under the heap. + */ +/* ARGSUSED */ +static int +segkmem_checkprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot) +{ + ASSERT(RW_LOCK_HELD(&seg->s_as->a_lock)); + + if (seg->s_as != &kas) + segkmem_badop(); + + if (segkp_bitmap && seg == &kvseg) { + + /* + * If it is one of segkp pages, call into segkp. + */ + if (BT_TEST(segkp_bitmap, + btop((uintptr_t)(addr - seg->s_base)))) + return (SEGOP_CHECKPROT(segkp, addr, size, prot)); + } + segkmem_badop(); + return (0); +} + +/* + * This is a dummy segkmem function overloaded to call segkp + * when segkp is under the heap. 
+ */ +/* ARGSUSED */ +static int +segkmem_kluster(struct seg *seg, caddr_t addr, ssize_t delta) +{ + ASSERT(RW_LOCK_HELD(&seg->s_as->a_lock)); + + if (seg->s_as != &kas) + segkmem_badop(); + + if (segkp_bitmap && seg == &kvseg) { + + /* + * If it is one of segkp pages, call into segkp. + */ + if (BT_TEST(segkp_bitmap, + btop((uintptr_t)(addr - seg->s_base)))) + return (SEGOP_KLUSTER(segkp, addr, delta)); + } + segkmem_badop(); + return (0); +} + +static void +segkmem_xdump_range(void *arg, void *start, size_t size) +{ + struct as *as = arg; + caddr_t addr = start; + caddr_t addr_end = addr + size; + + while (addr < addr_end) { + pfn_t pfn = hat_getpfnum(kas.a_hat, addr); + if (pfn != PFN_INVALID && pfn <= physmax && pf_is_memory(pfn)) + dump_addpage(as, addr, pfn); + addr += PAGESIZE; + dump_timeleft = dump_timeout; + } +} + +static void +segkmem_dump_range(void *arg, void *start, size_t size) +{ + caddr_t addr = start; + caddr_t addr_end = addr + size; + + /* + * If we are about to start dumping the range of addresses we + * carved out of the kernel heap for the large page heap walk + * heap_lp_arena to find what segments are actually populated + */ + if (SEGKMEM_USE_LARGEPAGES && + addr == heap_lp_base && addr_end == heap_lp_end && + vmem_size(heap_lp_arena, VMEM_ALLOC) < size) { + vmem_walk(heap_lp_arena, VMEM_ALLOC | VMEM_REENTRANT, + segkmem_xdump_range, arg); + } else { + segkmem_xdump_range(arg, start, size); + } +} + +static void +segkmem_dump(struct seg *seg) +{ + /* + * The kernel's heap_arena (represented by kvseg) is a very large + * VA space, most of which is typically unused. To speed up dumping + * we use vmem_walk() to quickly find the pieces of heap_arena that + * are actually in use. We do the same for heap32_arena and + * heap_core. + * + * We specify VMEM_REENTRANT to vmem_walk() because dump_addpage() + * may ultimately need to allocate memory. Reentrant walks are + * necessarily imperfect snapshots. The kernel heap continues + * to change during a live crash dump, for example. For a normal + * crash dump, however, we know that there won't be any other threads + * messing with the heap. Therefore, at worst, we may fail to dump + * the pages that get allocated by the act of dumping; but we will + * always dump every page that was allocated when the walk began. + * + * The other segkmem segments are dense (fully populated), so there's + * no need to use this technique when dumping them. + * + * Note: when adding special dump handling for any new sparsely- + * populated segments, be sure to add similar handling to the ::kgrep + * code in mdb. + */ + if (seg == &kvseg) { + vmem_walk(heap_arena, VMEM_ALLOC | VMEM_REENTRANT, + segkmem_dump_range, seg->s_as); +#ifndef __sparc + vmem_walk(heaptext_arena, VMEM_ALLOC | VMEM_REENTRANT, + segkmem_dump_range, seg->s_as); +#endif + } else if (seg == &kvseg_core) { + vmem_walk(heap_core_arena, VMEM_ALLOC | VMEM_REENTRANT, + segkmem_dump_range, seg->s_as); + } else if (seg == &kvseg32) { + vmem_walk(heap32_arena, VMEM_ALLOC | VMEM_REENTRANT, + segkmem_dump_range, seg->s_as); + vmem_walk(heaptext_arena, VMEM_ALLOC | VMEM_REENTRANT, + segkmem_dump_range, seg->s_as); + } else { + segkmem_dump_range(seg->s_as, seg->s_base, seg->s_size); + } +} + +/* + * lock/unlock kmem pages over a given range [addr, addr+len). + * Returns a shadow list of pages in ppp if *ppp is not NULL + * and memory can be allocated to hold the shadow list. 
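+ *
+ * A hedged illustration of the expected calling pattern (hypothetical
+ * caller; the L_PAGELOCK call fills in the shadow list and the same
+ * list is handed back on L_PAGEUNLOCK):
+ *
+ *	page_t **pplist;
+ *
+ *	if (SEGOP_PAGELOCK(seg, addr, len, &pplist, L_PAGELOCK, rw) == 0) {
+ *		... perform the I/O against the locked pages ...
+ *		(void) SEGOP_PAGELOCK(seg, addr, len, &pplist,
+ *		    L_PAGEUNLOCK, rw);
+ *	}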
+ */ +/*ARGSUSED*/ +static int +segkmem_pagelock(struct seg *seg, caddr_t addr, size_t len, + page_t ***ppp, enum lock_type type, enum seg_rw rw) +{ + page_t **pplist, *pp; + pgcnt_t npages; + size_t nb; + + if (segkp_bitmap && seg == &kvseg) { + /* + * If it is one of segkp pages, call into segkp. + */ + if (BT_TEST(segkp_bitmap, + btop((uintptr_t)(addr - seg->s_base)))) + return (SEGOP_PAGELOCK(segkp, addr, len, ppp, + type, rw)); + } + + if (type == L_PAGERECLAIM) + return (ENOTSUP); + + npages = btopr(len); + nb = sizeof (page_t *) * npages; + + if (type == L_PAGEUNLOCK) { + if ((pplist = *ppp) == NULL) { + /* + * No shadow list. Iterate over the range + * using page_find() and unlock the pages + * that we encounter. + */ + while (npages--) { + pp = page_find(&kvp, + (u_offset_t)(uintptr_t)addr); + if (pp) + page_unlock(pp); + addr += PAGESIZE; + } + return (0); + } + + while (npages--) { + pp = *pplist++; + if (pp) + page_unlock(pp); + } + kmem_free(*ppp, nb); + return (0); + } + + ASSERT(type == L_PAGELOCK); + + pplist = NULL; + if (ppp != NULL) + *ppp = pplist = kmem_alloc(nb, KM_NOSLEEP); + + while (npages--) { + pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)addr, SE_SHARED); + /* + * We'd like to ASSERT(pp != NULL) here, but we can't + * because there are legitimate cases where the address + * isn't really mapped -- for instance, attaching a + * kernel debugger and poking at a non-existent address. + */ + if (pplist) + *pplist++ = pp; + addr += PAGESIZE; + } + return (0); +} + +/* + * This is a dummy segkmem function overloaded to call segkp + * when segkp is under the heap. + */ +/* ARGSUSED */ +static int +segkmem_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp) +{ + ASSERT(RW_LOCK_HELD(&seg->s_as->a_lock)); + + if (seg->s_as != &kas) + segkmem_badop(); + + if (segkp_bitmap && seg == &kvseg) { + + /* + * If it is one of segkp pages, call into segkp. 
+ */ + if (BT_TEST(segkp_bitmap, + btop((uintptr_t)(addr - seg->s_base)))) + return (SEGOP_GETMEMID(segkp, addr, memidp)); + } + segkmem_badop(); + return (0); +} + +/*ARGSUSED*/ +static lgrp_mem_policy_info_t * +segkmem_getpolicy(struct seg *seg, caddr_t addr) +{ + return (NULL); +} + + +static struct seg_ops segkmem_ops = { + SEGKMEM_BADOP(int), /* dup */ + SEGKMEM_BADOP(int), /* unmap */ + SEGKMEM_BADOP(void), /* free */ + segkmem_fault, + SEGKMEM_BADOP(faultcode_t), /* faulta */ + segkmem_setprot, + segkmem_checkprot, + segkmem_kluster, + SEGKMEM_BADOP(size_t), /* swapout */ + SEGKMEM_BADOP(int), /* sync */ + SEGKMEM_BADOP(size_t), /* incore */ + SEGKMEM_BADOP(int), /* lockop */ + SEGKMEM_BADOP(int), /* getprot */ + SEGKMEM_BADOP(u_offset_t), /* getoffset */ + SEGKMEM_BADOP(int), /* gettype */ + SEGKMEM_BADOP(int), /* getvp */ + SEGKMEM_BADOP(int), /* advise */ + segkmem_dump, + segkmem_pagelock, + SEGKMEM_BADOP(int), /* setpgsz */ + segkmem_getmemid, + segkmem_getpolicy, /* getpolicy */ +}; + +int +segkmem_create(struct seg *seg) +{ + ASSERT(seg->s_as == &kas && RW_WRITE_HELD(&kas.a_lock)); + seg->s_ops = &segkmem_ops; + seg->s_data = NULL; + kas.a_size += seg->s_size; + return (0); +} + +/*ARGSUSED*/ +page_t * +segkmem_page_create(void *addr, size_t size, int vmflag, void *arg) +{ + struct seg kseg; + int pgflags; + + kseg.s_as = &kas; + pgflags = PG_EXCL; + + if (segkmem_reloc == 0 || (vmflag & VM_NORELOC)) + pgflags |= PG_NORELOC; + if ((vmflag & VM_NOSLEEP) == 0) + pgflags |= PG_WAIT; + if (vmflag & VM_PANIC) + pgflags |= PG_PANIC; + if (vmflag & VM_PUSHPAGE) + pgflags |= PG_PUSHPAGE; + + return (page_create_va(&kvp, (u_offset_t)(uintptr_t)addr, size, + pgflags, &kseg, addr)); +} + +/* + * Allocate pages to back the virtual address range [addr, addr + size). + * If addr is NULL, allocate the virtual address space as well. + */ +void * +segkmem_xalloc(vmem_t *vmp, void *inaddr, size_t size, int vmflag, uint_t attr, + page_t *(*page_create_func)(void *, size_t, int, void *), void *pcarg) +{ + page_t *ppl; + caddr_t addr = inaddr; + pgcnt_t npages = btopr(size); + int allocflag; + + if (inaddr == NULL && (addr = vmem_alloc(vmp, size, vmflag)) == NULL) + return (NULL); + + ASSERT(((uintptr_t)addr & PAGEOFFSET) == 0); + + if (page_resv(npages, vmflag & VM_KMFLAGS) == 0) { + if (inaddr == NULL) + vmem_free(vmp, addr, size); + return (NULL); + } + + ppl = page_create_func(addr, size, vmflag, pcarg); + if (ppl == NULL) { + if (inaddr == NULL) + vmem_free(vmp, addr, size); + page_unresv(npages); + return (NULL); + } + + /* + * Under certain conditions, we need to let the HAT layer know + * that it cannot safely allocate memory. Allocations from + * the hat_memload vmem arena always need this, to prevent + * infinite recursion. + * + * In addition, the x86 hat cannot safely do memory + * allocations while in vmem_populate(), because there + * is no simple bound on its usage. 
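+ *
+ * The flag selection that follows therefore passes HAT_NO_KALLOC to
+ * hat_memload() for VM_MEMLOAD allocations (the hat_memload arena)
+ * and, on x86, whenever the current thread is populating a vmem
+ * arena; all other allocations use the default flags.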
+ */ + if (vmflag & VM_MEMLOAD) + allocflag = HAT_NO_KALLOC; +#if defined(__x86) + else if (vmem_is_populator()) + allocflag = HAT_NO_KALLOC; +#endif + else + allocflag = 0; + + while (ppl != NULL) { + page_t *pp = ppl; + page_sub(&ppl, pp); + ASSERT(page_iolock_assert(pp)); + ASSERT(PAGE_EXCL(pp)); + page_io_unlock(pp); + hat_memload(kas.a_hat, (caddr_t)(uintptr_t)pp->p_offset, pp, + (PROT_ALL & ~PROT_USER) | HAT_NOSYNC | attr, + HAT_LOAD_LOCK | allocflag); + pp->p_lckcnt = 1; +#if defined(__x86) + page_downgrade(pp); +#else + if (vmflag & SEGKMEM_SHARELOCKED) + page_downgrade(pp); + else + page_unlock(pp); +#endif + } + + return (addr); +} + +void * +segkmem_alloc(vmem_t *vmp, size_t size, int vmflag) +{ + void *addr; + segkmem_gc_list_t *gcp, **prev_gcpp; + + if (kvseg.s_base == NULL) { +#ifndef __sparc + if (bootops->bsys_alloc == NULL) + halt("Memory allocation between bop_alloc() and " + "kmem_alloc().\n"); +#endif + + /* + * There's not a lot of memory to go around during boot, + * so recycle it if we can. + */ + for (prev_gcpp = &segkmem_gc_list; (gcp = *prev_gcpp) != NULL; + prev_gcpp = &gcp->gc_next) { + if (gcp->gc_arena == vmp && gcp->gc_size == size) { + *prev_gcpp = gcp->gc_next; + return (gcp); + } + } + + addr = vmem_alloc(vmp, size, vmflag | VM_PANIC); + if (boot_alloc(addr, size, BO_NO_ALIGN) != addr) + panic("segkmem_alloc: boot_alloc failed"); + return (addr); + } + return (segkmem_xalloc(vmp, NULL, size, vmflag, 0, + segkmem_page_create, NULL)); +} + +/* + * Any changes to this routine must also be carried over to + * devmap_free_pages() in the seg_dev driver. This is because + * we currently don't have a special kernel segment for non-paged + * kernel memory that is exported by drivers to user space. + */ +void +segkmem_free(vmem_t *vmp, void *inaddr, size_t size) +{ + page_t *pp; + caddr_t addr = inaddr; + caddr_t eaddr; + pgcnt_t npages = btopr(size); + + ASSERT(((uintptr_t)addr & PAGEOFFSET) == 0); + + if (kvseg.s_base == NULL) { + segkmem_gc_list_t *gc = inaddr; + gc->gc_arena = vmp; + gc->gc_size = size; + gc->gc_next = segkmem_gc_list; + segkmem_gc_list = gc; + return; + } + + hat_unload(kas.a_hat, addr, size, HAT_UNLOAD_UNLOCK); + + for (eaddr = addr + size; addr < eaddr; addr += PAGESIZE) { +#if defined(__x86) + pp = page_find(&kvp, (u_offset_t)(uintptr_t)addr); + if (pp == NULL) + panic("segkmem_free: page not found"); + if (!page_tryupgrade(pp)) { + /* + * Some other thread has a sharelock. Wait for + * it to drop the lock so we can free this page. + */ + page_unlock(pp); + pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)addr, + SE_EXCL); + } +#else + pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)addr, SE_EXCL); +#endif + if (pp == NULL) + panic("segkmem_free: page not found"); + /* Clear p_lckcnt so page_destroy() doesn't update availrmem */ + pp->p_lckcnt = 0; + page_destroy(pp, 0); + } + page_unresv(npages); + + if (vmp != NULL) + vmem_free(vmp, inaddr, size); +} + +void +segkmem_gc(void) +{ + ASSERT(kvseg.s_base != NULL); + while (segkmem_gc_list != NULL) { + segkmem_gc_list_t *gc = segkmem_gc_list; + segkmem_gc_list = gc->gc_next; + segkmem_free(gc->gc_arena, gc, gc->gc_size); + } +} + +/* + * Legacy entry points from here to end of file. 
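+ * These cover segkmem_mapin()/segkmem_mapout(), which establish and
+ * tear down locked device translations, and kmem_getpages()/
+ * kmem_freepages(), which are now simply wrappers around
+ * kmem_alloc()/kmem_free().  For illustration, the older idiom
+ *
+ *	buf = kmem_getpages(btopr(size), KM_SLEEP);
+ *	...
+ *	kmem_freepages(buf, btopr(size));
+ *
+ * is equivalent to a kmem_alloc()/kmem_free() of ptob(btopr(size))
+ * bytes.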
+ */ +void +segkmem_mapin(struct seg *seg, void *addr, size_t size, uint_t vprot, + pfn_t pfn, uint_t flags) +{ + hat_unload(seg->s_as->a_hat, addr, size, HAT_UNLOAD_UNLOCK); + hat_devload(seg->s_as->a_hat, addr, size, pfn, vprot, + flags | HAT_LOAD_LOCK); +} + +void +segkmem_mapout(struct seg *seg, void *addr, size_t size) +{ + hat_unload(seg->s_as->a_hat, addr, size, HAT_UNLOAD_UNLOCK); +} + +void * +kmem_getpages(pgcnt_t npages, int kmflag) +{ + return (kmem_alloc(ptob(npages), kmflag)); +} + +void +kmem_freepages(void *addr, pgcnt_t npages) +{ + kmem_free(addr, ptob(npages)); +} + +/* + * segkmem_page_create_large() allocates a large page to be used for the kmem + * caches. If kpr is enabled we ask for a relocatable page unless requested + * otherwise. If kpr is disabled we have to ask for a non-reloc page + */ +static page_t * +segkmem_page_create_large(void *addr, size_t size, int vmflag, void *arg) +{ + int pgflags; + + pgflags = PG_EXCL; + + if (segkmem_reloc == 0 || (vmflag & VM_NORELOC)) + pgflags |= PG_NORELOC; + if (!(vmflag & VM_NOSLEEP)) + pgflags |= PG_WAIT; + if (vmflag & VM_PUSHPAGE) + pgflags |= PG_PUSHPAGE; + + return (page_create_va_large(&kvp, (u_offset_t)(uintptr_t)addr, size, + pgflags, &kvseg, addr, arg)); +} + +/* + * Allocate a large page to back the virtual address range + * [addr, addr + size). If addr is NULL, allocate the virtual address + * space as well. + */ +static void * +segkmem_xalloc_lp(vmem_t *vmp, void *inaddr, size_t size, int vmflag, + uint_t attr, page_t *(*page_create_func)(void *, size_t, int, void *), + void *pcarg) +{ + caddr_t addr = inaddr, pa; + size_t lpsize = segkmem_lpsize; + pgcnt_t npages = btopr(size); + pgcnt_t nbpages = btop(lpsize); + pgcnt_t nlpages = size >> segkmem_lpshift; + size_t ppasize = nbpages * sizeof (page_t *); + page_t *pp, *rootpp, **ppa, *pplist = NULL; + int i; + + if (page_resv(npages, vmflag & VM_KMFLAGS) == 0) { + return (NULL); + } + + /* + * allocate an array we need for hat_memload_array. + * we use a separate arena to avoid recursion. + * we will not need this array when hat_memload_array learns pp++ + */ + if ((ppa = vmem_alloc(segkmem_ppa_arena, ppasize, vmflag)) == NULL) { + goto fail_array_alloc; + } + + if (inaddr == NULL && (addr = vmem_alloc(vmp, size, vmflag)) == NULL) + goto fail_vmem_alloc; + + ASSERT(((uintptr_t)addr & (lpsize - 1)) == 0); + + /* create all the pages */ + for (pa = addr, i = 0; i < nlpages; i++, pa += lpsize) { + if ((pp = page_create_func(pa, lpsize, vmflag, pcarg)) == NULL) + goto fail_page_create; + page_list_concat(&pplist, &pp); + } + + /* at this point we have all the resource to complete the request */ + while ((rootpp = pplist) != NULL) { + for (i = 0; i < nbpages; i++) { + ASSERT(pplist != NULL); + pp = pplist; + page_sub(&pplist, pp); + ASSERT(page_iolock_assert(pp)); + page_io_unlock(pp); + ppa[i] = pp; + } + /* + * Load the locked entry. It's OK to preload the entry into the + * TSB since we now support large mappings in the kernel TSB. 
+ */ + hat_memload_array(kas.a_hat, + (caddr_t)(uintptr_t)rootpp->p_offset, lpsize, + ppa, (PROT_ALL & ~PROT_USER) | HAT_NOSYNC | attr, + HAT_LOAD_LOCK); + + for (--i; i >= 0; --i) { + ppa[i]->p_lckcnt = 1; + page_unlock(ppa[i]); + } + } + + vmem_free(segkmem_ppa_arena, ppa, ppasize); + return (addr); + +fail_page_create: + while ((rootpp = pplist) != NULL) { + for (i = 0, pp = pplist; i < nbpages; i++, pp = pplist) { + ASSERT(pp != NULL); + page_sub(&pplist, pp); + ASSERT(page_iolock_assert(pp)); + page_io_unlock(pp); + } + page_destroy_pages(rootpp); + } + + if (inaddr == NULL) + vmem_free(vmp, addr, size); + +fail_vmem_alloc: + vmem_free(segkmem_ppa_arena, ppa, ppasize); + +fail_array_alloc: + page_unresv(npages); + + return (NULL); +} + +static void +segkmem_free_one_lp(caddr_t addr, size_t size) +{ + page_t *pp, *rootpp = NULL; + pgcnt_t pgs_left = btopr(size); + + ASSERT(size == segkmem_lpsize); + + hat_unload(kas.a_hat, addr, size, HAT_UNLOAD_UNLOCK); + + for (; pgs_left > 0; addr += PAGESIZE, pgs_left--) { + pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)addr, SE_EXCL); + if (pp == NULL) + panic("segkmem_free_one_lp: page not found"); + ASSERT(PAGE_EXCL(pp)); + pp->p_lckcnt = 0; + if (rootpp == NULL) + rootpp = pp; + } + ASSERT(rootpp != NULL); + page_destroy_pages(rootpp); + + /* page_unresv() is done by the caller */ +} + +/* + * This function is called to import new spans into the vmem arenas like + * kmem_default_arena and kmem_oversize_arena. It first tries to import + * spans from large page arena - kmem_lp_arena. In order to do this it might + * have to "upgrade the requested size" to kmem_lp_arena quantum. If + * it was not able to satisfy the upgraded request it then calls regular + * segkmem_alloc() that satisfies the request by importing from "*vmp" arena + */ +void * +segkmem_alloc_lp(vmem_t *vmp, size_t *sizep, int vmflag) +{ + size_t size; + kthread_t *t = curthread; + segkmem_lpcb_t *lpcb = &segkmem_lpcb; + + ASSERT(sizep != NULL); + + size = *sizep; + + if (lpcb->lp_uselp && !(t->t_flag & T_PANIC) && + !(vmflag & SEGKMEM_SHARELOCKED)) { + + size_t kmemlp_qnt = segkmem_kmemlp_quantum; + size_t asize = P2ROUNDUP(size, kmemlp_qnt); + void *addr = NULL; + ulong_t *lpthrtp = &lpcb->lp_throttle; + ulong_t lpthrt = *lpthrtp; + int dowakeup = 0; + int doalloc = 1; + + ASSERT(kmem_lp_arena != NULL); + ASSERT(asize >= size); + + if (lpthrt != 0) { + /* try to update the throttle value */ + lpthrt = atomic_add_long_nv(lpthrtp, 1); + if (lpthrt >= segkmem_lpthrottle_max) { + lpthrt = atomic_cas_ulong(lpthrtp, lpthrt, + segkmem_lpthrottle_max / 4); + } + + /* + * when we get above throttle start do an exponential + * backoff at trying large pages and reaping + */ + if (lpthrt > segkmem_lpthrottle_start && + (lpthrt & (lpthrt - 1))) { + atomic_add_64(&lpcb->allocs_throttled, 1L); + lpthrt--; + if ((lpthrt & (lpthrt - 1)) == 0) + kmem_reap(); + return (segkmem_alloc(vmp, size, vmflag)); + } + } + + if (!(vmflag & VM_NOSLEEP) && + segkmem_heaplp_quantum >= (8 * kmemlp_qnt) && + vmem_size(kmem_lp_arena, VMEM_FREE) <= kmemlp_qnt && + asize < (segkmem_heaplp_quantum - kmemlp_qnt)) { + + /* + * we are low on free memory in kmem_lp_arena + * we let only one guy to allocate heap_lp + * quantum size chunk that everybody is going to + * share + */ + mutex_enter(&lpcb->lp_lock); + + if (lpcb->lp_wait) { + + /* we are not the first one - wait */ + cv_wait(&lpcb->lp_cv, &lpcb->lp_lock); + if (vmem_size(kmem_lp_arena, VMEM_FREE) < + kmemlp_qnt) { + doalloc = 0; + } + } else if 
(vmem_size(kmem_lp_arena, VMEM_FREE) <= + kmemlp_qnt) { + + /* + * we are the first one, make sure we import + * a large page + */ + if (asize == kmemlp_qnt) + asize += kmemlp_qnt; + dowakeup = 1; + lpcb->lp_wait = 1; + } + + mutex_exit(&lpcb->lp_lock); + } + + /* + * VM_ABORT flag prevents sleeps in vmem_xalloc when + * large pages are not available. In that case this allocation + * attempt will fail and we will retry allocation with small + * pages. We also do not want to panic if this allocation fails + * because we are going to retry. + */ + if (doalloc) { + addr = vmem_alloc(kmem_lp_arena, asize, + (vmflag | VM_ABORT) & ~VM_PANIC); + + if (dowakeup) { + mutex_enter(&lpcb->lp_lock); + ASSERT(lpcb->lp_wait != 0); + lpcb->lp_wait = 0; + cv_broadcast(&lpcb->lp_cv); + mutex_exit(&lpcb->lp_lock); + } + } + + if (addr != NULL) { + *sizep = asize; + *lpthrtp = 0; + return (addr); + } + + if (vmflag & VM_NOSLEEP) + atomic_add_64(&lpcb->nosleep_allocs_failed, 1L); + else + atomic_add_64(&lpcb->sleep_allocs_failed, 1L); + atomic_add_64(&lpcb->alloc_bytes_failed, size); + + /* if large page throttling is not started yet do it */ + if (segkmem_use_lpthrottle && lpthrt == 0) { + lpthrt = atomic_cas_ulong(lpthrtp, lpthrt, 1); + } + } + return (segkmem_alloc(vmp, size, vmflag)); +} + +void +segkmem_free_lp(vmem_t *vmp, void *inaddr, size_t size) +{ + if (kmem_lp_arena == NULL || !IS_KMEM_VA_LARGEPAGE((caddr_t)inaddr)) { + segkmem_free(vmp, inaddr, size); + } else { + vmem_free(kmem_lp_arena, inaddr, size); + } +} + +/* + * segkmem_alloc_lpi() imports virtual memory from large page heap arena + * into kmem_lp arena. In the process it maps the imported segment with + * large pages + */ +static void * +segkmem_alloc_lpi(vmem_t *vmp, size_t size, int vmflag) +{ + segkmem_lpcb_t *lpcb = &segkmem_lpcb; + void *addr; + + ASSERT(size != 0); + ASSERT(vmp == heap_lp_arena); + + /* do not allow large page heap grow beyound limits */ + if (vmem_size(vmp, VMEM_ALLOC) >= segkmem_kmemlp_max) { + atomic_add_64(&lpcb->allocs_limited, 1); + return (NULL); + } + + addr = segkmem_xalloc_lp(vmp, NULL, size, vmflag, 0, + segkmem_page_create_large, NULL); + return (addr); +} + +/* + * segkmem_free_lpi() returns virtual memory back into large page heap arena + * from kmem_lp arena. Beore doing this it unmaps the segment and frees + * large pages used to map it. + */ +static void +segkmem_free_lpi(vmem_t *vmp, void *inaddr, size_t size) +{ + pgcnt_t nlpages = size >> segkmem_lpshift; + size_t lpsize = segkmem_lpsize; + caddr_t addr = inaddr; + pgcnt_t npages = btopr(size); + int i; + + ASSERT(vmp == heap_lp_arena); + ASSERT(IS_KMEM_VA_LARGEPAGE(addr)); + ASSERT(((uintptr_t)inaddr & (lpsize - 1)) == 0); + + for (i = 0; i < nlpages; i++) { + segkmem_free_one_lp(addr, lpsize); + addr += lpsize; + } + + page_unresv(npages); + + vmem_free(vmp, inaddr, size); +} + +/* + * This function is called at system boot time by kmem_init right after + * /etc/system file has been read. It checks based on hardware configuration + * and /etc/system settings if system is going to use large pages. The + * initialiazation necessary to actually start using large pages + * happens later in the process after segkmem_heap_lp_init() is called. 
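+ *
+ * The /etc/system tunables consulted here include segkmem_lpsize,
+ * segkmem_heaplp_quantum, segkmem_kmemlp_quantum, segkmem_kmemlp_max,
+ * segkmem_kmemlp_pcnt and segkmem_kmemlp_min.  For illustration only,
+ * a line such as
+ *
+ *	set segkmem_lpsize=0x400000
+ *
+ * requests 4M kernel heap pages; the value actually used is still
+ * subject to the platform checks below (get_segkmem_lpsize()).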
+ */ +int +segkmem_lpsetup() +{ + int use_large_pages = 0; + +#ifdef __sparc + + size_t memtotal = physmem * PAGESIZE; + + if (heap_lp_base == NULL) { + segkmem_lpsize = PAGESIZE; + return (0); + } + + /* get a platform dependent value of large page size for kernel heap */ + segkmem_lpsize = get_segkmem_lpsize(segkmem_lpsize); + + if (segkmem_lpsize <= PAGESIZE) { + /* + * put virtual space reserved for the large page kernel + * back to the regular heap + */ + vmem_xfree(heap_arena, heap_lp_base, + heap_lp_end - heap_lp_base); + heap_lp_base = NULL; + heap_lp_end = NULL; + segkmem_lpsize = PAGESIZE; + return (0); + } + + /* set heap_lp quantum if necessary */ + if (segkmem_heaplp_quantum == 0 || + (segkmem_heaplp_quantum & (segkmem_heaplp_quantum - 1)) || + P2PHASE(segkmem_heaplp_quantum, segkmem_lpsize)) { + segkmem_heaplp_quantum = segkmem_lpsize; + } + + /* set kmem_lp quantum if necessary */ + if (segkmem_kmemlp_quantum == 0 || + (segkmem_kmemlp_quantum & (segkmem_kmemlp_quantum - 1)) || + segkmem_kmemlp_quantum > segkmem_heaplp_quantum) { + segkmem_kmemlp_quantum = segkmem_heaplp_quantum; + } + + /* set total amount of memory allowed for large page kernel heap */ + if (segkmem_kmemlp_max == 0) { + if (segkmem_kmemlp_pcnt == 0 || segkmem_kmemlp_pcnt > 100) + segkmem_kmemlp_pcnt = 25; + segkmem_kmemlp_max = (memtotal * 100) / segkmem_kmemlp_pcnt; + } + segkmem_kmemlp_max = P2ROUNDUP(segkmem_kmemlp_max, + segkmem_heaplp_quantum); + + /* fix lp kmem preallocation request if necesssary */ + if (segkmem_kmemlp_min) { + segkmem_kmemlp_min = P2ROUNDUP(segkmem_kmemlp_min, + segkmem_heaplp_quantum); + if (segkmem_kmemlp_min > segkmem_kmemlp_max) + segkmem_kmemlp_min = segkmem_kmemlp_max; + } + + use_large_pages = 1; + segkmem_lpshift = page_get_shift(page_szc(segkmem_lpsize)); + +#endif + return (use_large_pages); +} + +#ifdef __sparc + + +static void * +segkmem_alloc_ppa(vmem_t *vmp, size_t size, int vmflag) +{ + size_t ppaquantum = btopr(segkmem_lpsize) * sizeof (page_t *); + void *addr; + + if (ppaquantum <= PAGESIZE) + return (segkmem_alloc(vmp, size, vmflag)); + + ASSERT((size & (ppaquantum - 1)) == 0); + + addr = vmem_xalloc(vmp, size, ppaquantum, 0, 0, NULL, NULL, vmflag); + if (addr != NULL && segkmem_xalloc(vmp, addr, size, vmflag, 0, + segkmem_page_create, NULL) == NULL) { + vmem_xfree(vmp, addr, size); + addr = NULL; + } + + return (addr); +} + +static void +segkmem_free_ppa(vmem_t *vmp, void *addr, size_t size) +{ + size_t ppaquantum = btopr(segkmem_lpsize) * sizeof (page_t *); + + ASSERT(addr != NULL); + + if (ppaquantum <= PAGESIZE) { + segkmem_free(vmp, addr, size); + } else { + segkmem_free(NULL, addr, size); + vmem_xfree(vmp, addr, size); + } +} + +void +segkmem_heap_lp_init() +{ + segkmem_lpcb_t *lpcb = &segkmem_lpcb; + size_t heap_lp_size = heap_lp_end - heap_lp_base; + size_t lpsize = segkmem_lpsize; + size_t ppaquantum; + void *addr; + + if (segkmem_lpsize <= PAGESIZE) { + ASSERT(heap_lp_base == NULL); + ASSERT(heap_lp_end == NULL); + return; + } + + ASSERT(segkmem_heaplp_quantum >= lpsize); + ASSERT((segkmem_heaplp_quantum & (lpsize - 1)) == 0); + ASSERT(lpcb->lp_uselp == 0); + ASSERT(heap_lp_base != NULL); + ASSERT(heap_lp_end != NULL); + ASSERT(heap_lp_base < heap_lp_end); + ASSERT(heap_lp_arena == NULL); + ASSERT(((uintptr_t)heap_lp_base & (lpsize - 1)) == 0); + ASSERT(((uintptr_t)heap_lp_end & (lpsize - 1)) == 0); + + /* create large page heap arena */ + heap_lp_arena = vmem_create("heap_lp", heap_lp_base, heap_lp_size, + segkmem_heaplp_quantum, NULL, NULL, NULL, 0, 
VM_SLEEP); + + ASSERT(heap_lp_arena != NULL); + + /* This arena caches memory already mapped by large pages */ + kmem_lp_arena = vmem_create("kmem_lp", NULL, 0, segkmem_kmemlp_quantum, + segkmem_alloc_lpi, segkmem_free_lpi, heap_lp_arena, 0, VM_SLEEP); + + ASSERT(kmem_lp_arena != NULL); + + mutex_init(&lpcb->lp_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&lpcb->lp_cv, NULL, CV_DEFAULT, NULL); + + /* + * this arena is used for the array of page_t pointers necessary + * to call hat_mem_load_array + */ + ppaquantum = btopr(lpsize) * sizeof (page_t *); + segkmem_ppa_arena = vmem_create("segkmem_ppa", NULL, 0, ppaquantum, + segkmem_alloc_ppa, segkmem_free_ppa, heap_arena, ppaquantum, + VM_SLEEP); + + ASSERT(segkmem_ppa_arena != NULL); + + /* prealloacate some memory for the lp kernel heap */ + if (segkmem_kmemlp_min) { + + ASSERT(P2PHASE(segkmem_kmemlp_min, + segkmem_heaplp_quantum) == 0); + + if ((addr = segkmem_alloc_lpi(heap_lp_arena, + segkmem_kmemlp_min, VM_SLEEP)) != NULL) { + + addr = vmem_add(kmem_lp_arena, addr, + segkmem_kmemlp_min, VM_SLEEP); + ASSERT(addr != NULL); + } + } + + lpcb->lp_uselp = 1; +} + +#endif diff --git a/usr/src/uts/common/vm/seg_kmem.h b/usr/src/uts/common/vm/seg_kmem.h new file mode 100644 index 0000000000..a1fcf43643 --- /dev/null +++ b/usr/src/uts/common/vm/seg_kmem.h @@ -0,0 +1,129 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _VM_SEG_KMEM_H +#define _VM_SEG_KMEM_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/types.h> +#include <sys/vnode.h> +#include <sys/vmem.h> +#include <vm/as.h> +#include <vm/seg.h> +#include <vm/page.h> + +/* + * VM - Kernel Segment Driver + */ + +#if defined(_KERNEL) + +extern char *kernelheap; /* start of primary kernel heap */ +extern char *ekernelheap; /* end of primary kernel heap */ +extern char *heap_lp_base; /* start of kernel large page heap arena */ +extern char *heap_lp_end; /* end of kernel large page heap arena */ +extern struct seg kvseg; /* primary kernel heap segment */ +extern struct seg kvseg_core; /* "core" kernel heap segment */ +extern vmem_t *heap_arena; /* primary kernel heap arena */ +extern vmem_t *hat_memload_arena; /* HAT translation arena */ +extern struct seg kvseg32; /* 32-bit kernel heap segment */ +extern vmem_t *heap32_arena; /* 32-bit kernel heap arena */ +extern vmem_t *heaptext_arena; /* kernel text arena, from heap */ +extern struct ctx *kctx; /* kernel context */ +extern struct as kas; /* kernel address space */ +extern struct vnode kvp; /* vnode for all segkmem pages */ +extern int segkmem_reloc; /* enable/disable segkmem relocatable pages */ +extern vmem_t *static_arena; /* arena for caches to import static memory */ +extern vmem_t *static_alloc_arena; /* arena for allocating static memory */ + +extern int segkmem_create(struct seg *); +extern page_t *segkmem_page_create(void *, size_t, int, void *); +extern void *segkmem_xalloc(vmem_t *, void *, size_t, int, uint_t, + page_t *(*page_create_func)(void *, size_t, int, void *), void *); +extern void *segkmem_alloc(vmem_t *, size_t, int); +extern void *segkmem_alloc_permanent(vmem_t *, size_t, int); +extern void segkmem_free(vmem_t *, void *, size_t); + +extern void *boot_alloc(void *, size_t, uint_t); +extern void boot_mapin(caddr_t addr, size_t size); +extern void kernelheap_init(void *, void *, char *, void *, void *); +extern void kernelheap_extend(void *, void *); +extern void segkmem_gc(void); + +/* + * Flags for segkmem_xalloc(). + * + * SEGKMEM_SHARELOCKED requests pages which are locked SE_SHARED to be + * returned rather than unlocked which is now the default. Note that + * memory returned by SEGKMEM_SHARELOCKED cannot be freed by segkmem_free(). + * This is a hack for seg_dev that should be cleaned up in the future. 
+ */ +#define SEGKMEM_SHARELOCKED 0x20000 + +/* + * Large page for kmem caches support + */ +typedef struct segkmem_lpcb { + kmutex_t lp_lock; + kcondvar_t lp_cv; + uint_t lp_wait; + uint_t lp_uselp; + ulong_t lp_throttle; + + /* stats */ + uint64_t sleep_allocs_failed; + uint64_t nosleep_allocs_failed; + uint64_t allocs_throttled; + uint64_t allocs_limited; + uint64_t alloc_bytes_failed; +} segkmem_lpcb_t; + +extern void *segkmem_alloc_lp(vmem_t *, size_t *, int); +extern void segkmem_free_lp(vmem_t *, void *, size_t); +extern int segkmem_lpsetup(); +extern void segkmem_heap_lp_init(void); + +extern size_t segkmem_lpsize; +extern size_t segkmem_heaplp_quantum; +extern size_t segkmem_kmemlp_max; + +#define SEGKMEM_USE_LARGEPAGES (segkmem_lpsize > PAGESIZE) + +#define IS_KMEM_VA_LARGEPAGE(vaddr) \ + (((vaddr) >= heap_lp_base) && ((vaddr) < heap_lp_end)) + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_SEG_KMEM_H */ diff --git a/usr/src/uts/common/vm/seg_kp.c b/usr/src/uts/common/vm/seg_kp.c new file mode 100644 index 0000000000..9c7b0710f3 --- /dev/null +++ b/usr/src/uts/common/vm/seg_kp.c @@ -0,0 +1,1444 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * segkp is a segment driver that administers the allocation and deallocation + * of pageable variable size chunks of kernel virtual address space. Each + * allocated resource is page-aligned. + * + * The user may specify whether the resource should be initialized to 0, + * include a redzone, or locked in memory. 
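+ *
+ * A typical (purely illustrative) allocation and release sequence,
+ * using the KPD_* flags from seg_kp.h, might look like:
+ *
+ *	caddr_t va;
+ *
+ *	va = segkp_get(segkp, ptob(btopr(len)),
+ *	    KPD_ZERO | KPD_HASREDZONE | KPD_LOCKED);
+ *	if (va != NULL) {
+ *		... use the resource ...
+ *		segkp_release(segkp, va);
+ *	}
+ *
+ * The flag combination shown is only an example; the valid
+ * combinations (e.g. KPD_NO_ANON requires KPD_LOCKED) are enforced in
+ * segkp_get_internal() below.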
+ */ + +#include <sys/types.h> +#include <sys/t_lock.h> +#include <sys/thread.h> +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/buf.h> +#include <sys/mman.h> +#include <sys/vnode.h> +#include <sys/cmn_err.h> +#include <sys/swap.h> +#include <sys/tuneable.h> +#include <sys/kmem.h> +#include <sys/vmem.h> +#include <sys/cred.h> +#include <sys/dumphdr.h> +#include <sys/debug.h> +#include <sys/vtrace.h> +#include <sys/stack.h> +#include <sys/atomic.h> +#include <sys/archsystm.h> +#include <sys/lgrp.h> + +#include <vm/as.h> +#include <vm/seg.h> +#include <vm/seg_kp.h> +#include <vm/seg_kmem.h> +#include <vm/anon.h> +#include <vm/page.h> +#include <vm/hat.h> +#include <sys/bitmap.h> + +/* + * Private seg op routines + */ +static void segkp_badop(void); +static void segkp_dump(struct seg *seg); +static int segkp_checkprot(struct seg *seg, caddr_t addr, size_t len, + uint_t prot); +static int segkp_kluster(struct seg *seg, caddr_t addr, ssize_t delta); +static int segkp_pagelock(struct seg *seg, caddr_t addr, size_t len, + struct page ***page, enum lock_type type, + enum seg_rw rw); +static void segkp_insert(struct seg *seg, struct segkp_data *kpd); +static void segkp_delete(struct seg *seg, struct segkp_data *kpd); +static caddr_t segkp_get_internal(struct seg *seg, size_t len, uint_t flags, + struct segkp_data **tkpd, struct anon_map *amp); +static void segkp_release_internal(struct seg *seg, + struct segkp_data *kpd, size_t len); +static int segkp_unlock(struct hat *hat, struct seg *seg, caddr_t vaddr, + size_t len, struct segkp_data *kpd, uint_t flags); +static int segkp_load(struct hat *hat, struct seg *seg, caddr_t vaddr, + size_t len, struct segkp_data *kpd, uint_t flags); +static struct segkp_data *segkp_find(struct seg *seg, caddr_t vaddr); +static int segkp_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp); +static lgrp_mem_policy_info_t *segkp_getpolicy(struct seg *seg, + caddr_t addr); + +/* + * Lock used to protect the hash table(s) and caches. + */ +static kmutex_t segkp_lock; + +/* + * The segkp caches + */ +static struct segkp_cache segkp_cache[SEGKP_MAX_CACHE]; + +#define SEGKP_BADOP(t) (t(*)())segkp_badop + +/* + * When there are fewer than red_minavail bytes left on the stack, + * segkp_map_red() will map in the redzone (if called). 5000 seems + * to work reasonably well... + */ +long red_minavail = 5000; + +/* + * will be set to 1 for 32 bit x86 systems only, in startup.c + */ +int segkp_fromheap = 0; +ulong_t *segkp_bitmap; + +/* + * If segkp_map_red() is called with the redzone already mapped and + * with less than RED_DEEP_THRESHOLD bytes available on the stack, + * then the stack situation has become quite serious; if much more stack + * is consumed, we have the potential of scrogging the next thread/LWP + * structure. To help debug the "can't happen" panics which may + * result from this condition, we record lbolt and the calling thread + * in red_deep_lbolt and red_deep_thread respectively. 
+ */ +#define RED_DEEP_THRESHOLD 2000 + +clock_t red_deep_lbolt; +kthread_t *red_deep_thread; + +uint32_t red_nmapped; +uint32_t red_closest = UINT_MAX; +uint32_t red_ndoubles; + +pgcnt_t anon_segkp_pages_locked; /* See vm/anon.h */ + +static struct seg_ops segkp_ops = { + SEGKP_BADOP(int), /* dup */ + SEGKP_BADOP(int), /* unmap */ + SEGKP_BADOP(void), /* free */ + segkp_fault, + SEGKP_BADOP(faultcode_t), /* faulta */ + SEGKP_BADOP(int), /* setprot */ + segkp_checkprot, + segkp_kluster, + SEGKP_BADOP(size_t), /* swapout */ + SEGKP_BADOP(int), /* sync */ + SEGKP_BADOP(size_t), /* incore */ + SEGKP_BADOP(int), /* lockop */ + SEGKP_BADOP(int), /* getprot */ + SEGKP_BADOP(u_offset_t), /* getoffset */ + SEGKP_BADOP(int), /* gettype */ + SEGKP_BADOP(int), /* getvp */ + SEGKP_BADOP(int), /* advise */ + segkp_dump, /* dump */ + segkp_pagelock, /* pagelock */ + SEGKP_BADOP(int), /* setpgsz */ + segkp_getmemid, /* getmemid */ + segkp_getpolicy, /* getpolicy */ +}; + + +static void +segkp_badop(void) +{ + panic("segkp_badop"); + /*NOTREACHED*/ +} + +static void segkpinit_mem_config(struct seg *); + +static uint32_t segkp_indel; + +/* + * Allocate the segment specific private data struct and fill it in + * with the per kp segment mutex, anon ptr. array and hash table. + */ +int +segkp_create(struct seg *seg) +{ + struct segkp_segdata *kpsd; + size_t np; + + ASSERT(seg != NULL && seg->s_as == &kas); + ASSERT(RW_WRITE_HELD(&seg->s_as->a_lock)); + + if (seg->s_size & PAGEOFFSET) { + panic("Bad segkp size"); + /*NOTREACHED*/ + } + + kpsd = kmem_zalloc(sizeof (struct segkp_segdata), KM_SLEEP); + + /* + * Allocate the virtual memory for segkp and initialize it + */ + if (segkp_fromheap) { + np = btop(kvseg.s_size); + segkp_bitmap = kmem_zalloc(BT_SIZEOFMAP(np), KM_SLEEP); + kpsd->kpsd_arena = vmem_create("segkp", NULL, 0, PAGESIZE, + vmem_alloc, vmem_free, heap_arena, 5 * PAGESIZE, VM_SLEEP); + } else { + segkp_bitmap = NULL; + np = btop(seg->s_size); + kpsd->kpsd_arena = vmem_create("segkp", seg->s_base, + seg->s_size, PAGESIZE, NULL, NULL, NULL, 5 * PAGESIZE, + VM_SLEEP); + } + + kpsd->kpsd_anon = anon_create(np, ANON_SLEEP | ANON_ALLOC_FORCE); + + kpsd->kpsd_hash = kmem_zalloc(SEGKP_HASHSZ * sizeof (struct segkp *), + KM_SLEEP); + seg->s_data = (void *)kpsd; + seg->s_ops = &segkp_ops; + segkpinit_mem_config(seg); + return (0); +} + + +/* + * Find a free 'freelist' and initialize it with the appropriate attributes + */ +void * +segkp_cache_init(struct seg *seg, int maxsize, size_t len, uint_t flags) +{ + int i; + + if ((flags & KPD_NO_ANON) && !(flags & KPD_LOCKED)) + return ((void *)-1); + + mutex_enter(&segkp_lock); + for (i = 0; i < SEGKP_MAX_CACHE; i++) { + if (segkp_cache[i].kpf_inuse) + continue; + segkp_cache[i].kpf_inuse = 1; + segkp_cache[i].kpf_max = maxsize; + segkp_cache[i].kpf_flags = flags; + segkp_cache[i].kpf_seg = seg; + segkp_cache[i].kpf_len = len; + mutex_exit(&segkp_lock); + return ((void *)(uintptr_t)i); + } + mutex_exit(&segkp_lock); + return ((void *)-1); +} + +/* + * Free all the cache resources. 
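+ * This is invoked from the memory-delete callback
+ * segkp_mem_config_pre_del() below so that cached resources do not
+ * keep pages held while memory is being deleted.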
+ */ +void +segkp_cache_free(void) +{ + struct segkp_data *kpd; + struct seg *seg; + int i; + + mutex_enter(&segkp_lock); + for (i = 0; i < SEGKP_MAX_CACHE; i++) { + if (!segkp_cache[i].kpf_inuse) + continue; + /* + * Disconnect the freelist and process each element + */ + kpd = segkp_cache[i].kpf_list; + seg = segkp_cache[i].kpf_seg; + segkp_cache[i].kpf_list = NULL; + segkp_cache[i].kpf_count = 0; + mutex_exit(&segkp_lock); + + while (kpd != NULL) { + struct segkp_data *next; + + next = kpd->kp_next; + segkp_release_internal(seg, kpd, kpd->kp_len); + kpd = next; + } + mutex_enter(&segkp_lock); + } + mutex_exit(&segkp_lock); +} + +/* + * There are 2 entries into segkp_get_internal. The first includes a cookie + * used to access a pool of cached segkp resources. The second does not + * use the cache. + */ +caddr_t +segkp_get(struct seg *seg, size_t len, uint_t flags) +{ + struct segkp_data *kpd = NULL; + + if (segkp_get_internal(seg, len, flags, &kpd, NULL) != NULL) { + kpd->kp_cookie = -1; + return (stom(kpd->kp_base, flags)); + } + return (NULL); +} + +/* + * Return a 'cached' segkp address + */ +caddr_t +segkp_cache_get(void *cookie) +{ + struct segkp_cache *freelist = NULL; + struct segkp_data *kpd = NULL; + int index = (int)(uintptr_t)cookie; + struct seg *seg; + size_t len; + uint_t flags; + + if (index < 0 || index >= SEGKP_MAX_CACHE) + return (NULL); + freelist = &segkp_cache[index]; + + mutex_enter(&segkp_lock); + seg = freelist->kpf_seg; + flags = freelist->kpf_flags; + if (freelist->kpf_list != NULL) { + kpd = freelist->kpf_list; + freelist->kpf_list = kpd->kp_next; + freelist->kpf_count--; + mutex_exit(&segkp_lock); + kpd->kp_next = NULL; + segkp_insert(seg, kpd); + return (stom(kpd->kp_base, flags)); + } + len = freelist->kpf_len; + mutex_exit(&segkp_lock); + if (segkp_get_internal(seg, len, flags, &kpd, NULL) != NULL) { + kpd->kp_cookie = index; + return (stom(kpd->kp_base, flags)); + } + return (NULL); +} + +caddr_t +segkp_get_withanonmap( + struct seg *seg, + size_t len, + uint_t flags, + struct anon_map *amp) +{ + struct segkp_data *kpd = NULL; + + ASSERT(amp != NULL); + flags |= KPD_HASAMP; + if (segkp_get_internal(seg, len, flags, &kpd, amp) != NULL) { + kpd->kp_cookie = -1; + return (stom(kpd->kp_base, flags)); + } + return (NULL); +} + +/* + * This does the real work of segkp allocation. + * Return to client base addr. len must be page-aligned. A null value is + * returned if there are no more vm resources (e.g. pages, swap). The len + * and base recorded in the private data structure include the redzone + * and the redzone length (if applicable). If the user requests a redzone + * either the first or last page is left unmapped depending whether stacks + * grow to low or high memory. + * + * The client may also specify a no-wait flag. If that is set then the + * request will choose a non-blocking path when requesting resources. + * The default is make the client wait. + */ +static caddr_t +segkp_get_internal( + struct seg *seg, + size_t len, + uint_t flags, + struct segkp_data **tkpd, + struct anon_map *amp) +{ + struct segkp_segdata *kpsd = (struct segkp_segdata *)seg->s_data; + struct segkp_data *kpd; + caddr_t vbase = NULL; /* always first virtual, may not be mapped */ + pgcnt_t np = 0; /* number of pages in the resource */ + pgcnt_t segkpindex; + long i; + caddr_t va; + pgcnt_t pages = 0; + ulong_t anon_idx = 0; + int kmflag = (flags & KPD_NOWAIT) ? KM_NOSLEEP : KM_SLEEP; + caddr_t s_base = (segkp_fromheap) ? 
kvseg.s_base : seg->s_base; + + if (len & PAGEOFFSET) { + panic("segkp_get: len is not page-aligned"); + /*NOTREACHED*/ + } + + ASSERT(((flags & KPD_HASAMP) == 0) == (amp == NULL)); + + /* Only allow KPD_NO_ANON if we are going to lock it down */ + if ((flags & (KPD_LOCKED|KPD_NO_ANON)) == KPD_NO_ANON) + return (NULL); + + if ((kpd = kmem_zalloc(sizeof (struct segkp_data), kmflag)) == NULL) + return (NULL); + /* + * Fix up the len to reflect the REDZONE if applicable + */ + if (flags & KPD_HASREDZONE) + len += PAGESIZE; + np = btop(len); + + vbase = vmem_alloc(SEGKP_VMEM(seg), len, kmflag | VM_BESTFIT); + if (vbase == NULL) { + kmem_free(kpd, sizeof (struct segkp_data)); + return (NULL); + } + + /* If locking, reserve physical memory */ + if (flags & KPD_LOCKED) { + pages = btop(SEGKP_MAPLEN(len, flags)); + if (page_resv(pages, kmflag) == 0) { + vmem_free(SEGKP_VMEM(seg), vbase, len); + kmem_free(kpd, sizeof (struct segkp_data)); + return (NULL); + } + if ((flags & KPD_NO_ANON) == 0) + atomic_add_long(&anon_segkp_pages_locked, pages); + } + + /* + * Reserve sufficient swap space for this vm resource. We'll + * actually allocate it in the loop below, but reserving it + * here allows us to back out more gracefully than if we + * had an allocation failure in the body of the loop. + * + * Note that we don't need swap space for the red zone page. + */ + if (amp != NULL) { + ASSERT((flags & KPD_NO_ANON) == 0); + /* The reserve has been done and the anon_hdr is separate. */ + anon_idx = 0; + kpd->kp_anon_idx = anon_idx; + kpd->kp_anon = amp->ahp; + + TRACE_5(TR_FAC_VM, TR_ANON_SEGKP, "anon segkp:%p %p %lu %u %u", + kpd, vbase, len, flags, 1); + + } else if ((flags & KPD_NO_ANON) == 0) { + if (anon_resv(SEGKP_MAPLEN(len, flags)) == 0) { + if (flags & KPD_LOCKED) { + atomic_add_long(&anon_segkp_pages_locked, + -pages); + page_unresv(pages); + } + vmem_free(SEGKP_VMEM(seg), vbase, len); + kmem_free(kpd, sizeof (struct segkp_data)); + return (NULL); + } + anon_idx = ((uintptr_t)(vbase - s_base)) >> PAGESHIFT; + kpd->kp_anon_idx = anon_idx; + kpd->kp_anon = kpsd->kpsd_anon; + + TRACE_5(TR_FAC_VM, TR_ANON_SEGKP, "anon segkp:%p %p %lu %u %u", + kpd, vbase, len, flags, 1); + } else { + kpd->kp_anon = NULL; + kpd->kp_anon_idx = 0; + } + + /* + * Allocate page and anon resources for the virtual address range + * except the redzone + */ + if (segkp_fromheap) + segkpindex = btop((uintptr_t)(vbase - kvseg.s_base)); + for (i = 0, va = vbase; i < np; i++, va += PAGESIZE) { + page_t *pl[2]; + struct vnode *vp; + anoff_t off; + int err; + page_t *pp = NULL; + + /* + * Mark this page to be a segkp page in the bitmap. + */ + if (segkp_fromheap) { + BT_ATOMIC_SET(segkp_bitmap, segkpindex); + segkpindex++; + } + + /* + * If this page is the red zone page, we don't need swap + * space for it. Note that we skip over the code that + * establishes MMU mappings, so that the page remains + * invalid. + */ + if ((flags & KPD_HASREDZONE) && KPD_REDZONE(kpd) == i) + continue; + + if (kpd->kp_anon != NULL) { + struct anon *ap; + + ASSERT(anon_get_ptr(kpd->kp_anon, anon_idx + i) + == NULL); + /* + * Determine the "vp" and "off" of the anon slot. + */ + ap = anon_alloc(NULL, 0); + if (amp != NULL) + ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); + (void) anon_set_ptr(kpd->kp_anon, anon_idx + i, + ap, ANON_SLEEP); + if (amp != NULL) + ANON_LOCK_EXIT(&->a_rwlock); + swap_xlate(ap, &vp, &off); + + /* + * Create a page with the specified identity. The + * page is returned with the "shared" lock held. 
+ */ + err = VOP_GETPAGE(vp, (offset_t)off, PAGESIZE, + NULL, pl, PAGESIZE, seg, va, S_CREATE, + kcred); + if (err) { + /* + * XXX - This should not fail. + */ + panic("segkp_get: no pages"); + /*NOTREACHED*/ + } + pp = pl[0]; + } else { + ASSERT(page_exists(&kvp, + (u_offset_t)(uintptr_t)va) == NULL); + + if ((pp = page_create_va(&kvp, + (u_offset_t)(uintptr_t)va, PAGESIZE, + (flags & KPD_NOWAIT ? 0 : PG_WAIT) | PG_EXCL | + PG_NORELOC, seg, va)) == NULL) { + /* + * Legitimize resource; then destroy it. + * Easier than trying to unwind here. + */ + kpd->kp_flags = flags; + kpd->kp_base = vbase; + kpd->kp_len = len; + segkp_release_internal(seg, kpd, va - vbase); + return (NULL); + } + page_io_unlock(pp); + } + + if (flags & KPD_ZERO) + pagezero(pp, 0, PAGESIZE); + + /* + * Load and lock an MMU translation for the page. + */ + hat_memload(seg->s_as->a_hat, va, pp, (PROT_READ|PROT_WRITE), + ((flags & KPD_LOCKED) ? HAT_LOAD_LOCK : HAT_LOAD)); + + /* + * Now, release lock on the page. + */ + if (flags & KPD_LOCKED) + page_downgrade(pp); + else + page_unlock(pp); + } + + kpd->kp_flags = flags; + kpd->kp_base = vbase; + kpd->kp_len = len; + segkp_insert(seg, kpd); + *tkpd = kpd; + return (stom(kpd->kp_base, flags)); +} + +/* + * Release the resource to cache if the pool(designate by the cookie) + * has less than the maximum allowable. If inserted in cache, + * segkp_delete insures element is taken off of active list. + */ +void +segkp_release(struct seg *seg, caddr_t vaddr) +{ + struct segkp_cache *freelist; + struct segkp_data *kpd = NULL; + + if ((kpd = segkp_find(seg, vaddr)) == NULL) { + panic("segkp_release: null kpd"); + /*NOTREACHED*/ + } + + if (kpd->kp_cookie != -1) { + freelist = &segkp_cache[kpd->kp_cookie]; + mutex_enter(&segkp_lock); + if (!segkp_indel && freelist->kpf_count < freelist->kpf_max) { + segkp_delete(seg, kpd); + kpd->kp_next = freelist->kpf_list; + freelist->kpf_list = kpd; + freelist->kpf_count++; + mutex_exit(&segkp_lock); + return; + } else { + mutex_exit(&segkp_lock); + kpd->kp_cookie = -1; + } + } + segkp_release_internal(seg, kpd, kpd->kp_len); +} + +/* + * Free the entire resource. segkp_unlock gets called with the start of the + * mapped portion of the resource. The length is the size of the mapped + * portion + */ +static void +segkp_release_internal(struct seg *seg, struct segkp_data *kpd, size_t len) +{ + caddr_t va; + long i; + long redzone; + size_t np; + page_t *pp; + struct vnode *vp; + anoff_t off; + struct anon *ap; + pgcnt_t segkpindex; + + ASSERT(kpd != NULL); + ASSERT((kpd->kp_flags & KPD_HASAMP) == 0 || kpd->kp_cookie == -1); + np = btop(len); + + /* Remove from active hash list */ + if (kpd->kp_cookie == -1) { + mutex_enter(&segkp_lock); + segkp_delete(seg, kpd); + mutex_exit(&segkp_lock); + } + + /* + * Precompute redzone page index. + */ + redzone = -1; + if (kpd->kp_flags & KPD_HASREDZONE) + redzone = KPD_REDZONE(kpd); + + + va = kpd->kp_base; + + hat_unload(seg->s_as->a_hat, va, (np << PAGESHIFT), + ((kpd->kp_flags & KPD_LOCKED) ? HAT_UNLOAD_UNLOCK : HAT_UNLOAD)); + /* + * Free up those anon resources that are quiescent. + */ + if (segkp_fromheap) + segkpindex = btop((uintptr_t)(va - kvseg.s_base)); + for (i = 0; i < np; i++, va += PAGESIZE) { + + /* + * Clear the bit for this page from the bitmap. + */ + if (segkp_fromheap) { + BT_ATOMIC_CLEAR(segkp_bitmap, segkpindex); + segkpindex++; + } + + if (i == redzone) + continue; + if (kpd->kp_anon) { + /* + * Free up anon resources and destroy the + * associated pages. 
+ * + * Release the lock if there is one. Have to get the + * page to do this, unfortunately. + */ + if (kpd->kp_flags & KPD_LOCKED) { + ap = anon_get_ptr(kpd->kp_anon, + kpd->kp_anon_idx + i); + swap_xlate(ap, &vp, &off); + /* Find the shared-locked page. */ + pp = page_find(vp, (u_offset_t)off); + if (pp == NULL) { + panic("segkp_release: " + "kp_anon: no page to unlock "); + /*NOTREACHED*/ + } + page_unlock(pp); + } + if ((kpd->kp_flags & KPD_HASAMP) == 0) { + anon_free(kpd->kp_anon, kpd->kp_anon_idx + i, + PAGESIZE); + anon_unresv(PAGESIZE); + } + TRACE_5(TR_FAC_VM, + TR_ANON_SEGKP, "anon segkp:%p %p %lu %u %u", + kpd, va, PAGESIZE, 0, 0); + } else { + if (kpd->kp_flags & KPD_LOCKED) { + pp = page_find(&kvp, (u_offset_t)(uintptr_t)va); + if (pp == NULL) { + panic("segkp_release: " + "no page to unlock"); + /*NOTREACHED*/ + } + /* + * We should just upgrade the lock here + * but there is no upgrade that waits. + */ + page_unlock(pp); + } + pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)va, + SE_EXCL); + if (pp != NULL) + page_destroy(pp, 0); + } + } + + /* If locked, release physical memory reservation */ + if (kpd->kp_flags & KPD_LOCKED) { + pgcnt_t pages = btop(SEGKP_MAPLEN(kpd->kp_len, kpd->kp_flags)); + if ((kpd->kp_flags & KPD_NO_ANON) == 0) + atomic_add_long(&anon_segkp_pages_locked, -pages); + page_unresv(pages); + } + + vmem_free(SEGKP_VMEM(seg), kpd->kp_base, kpd->kp_len); + kmem_free(kpd, sizeof (struct segkp_data)); +} + +/* + * segkp_map_red() will check the current frame pointer against the + * stack base. If the amount of stack remaining is questionable + * (less than red_minavail), then segkp_map_red() will map in the redzone + * and return 1. Otherwise, it will return 0. segkp_map_red() can + * _only_ be called when: + * + * - it is safe to sleep on page_create_va(). + * - the caller is non-swappable. + * + * It is up to the caller to remember whether segkp_map_red() successfully + * mapped the redzone, and, if so, to call segkp_unmap_red() at a later + * time. Note that the caller must _remain_ non-swappable until after + * calling segkp_unmap_red(). + * + * Currently, this routine is only called from pagefault() (which necessarily + * satisfies the above conditions). + */ +#if defined(STACK_GROWTH_DOWN) +int +segkp_map_red(void) +{ + uintptr_t fp = STACK_BIAS + (uintptr_t)getfp(); +#ifndef _LP64 + caddr_t stkbase; +#endif + + ASSERT(curthread->t_schedflag & TS_DONT_SWAP); + + /* + * Optimize for the common case where we simply return. + */ + if ((curthread->t_red_pp == NULL) && + (fp - (uintptr_t)curthread->t_stkbase >= red_minavail)) + return (0); + +#if defined(_LP64) + /* + * XXX We probably need something better than this. + */ + panic("kernel stack overflow"); + /*NOTREACHED*/ +#else /* _LP64 */ + if (curthread->t_red_pp == NULL) { + page_t *red_pp; + struct seg kseg; + + caddr_t red_va = (caddr_t) + (((uintptr_t)curthread->t_stkbase & (uintptr_t)PAGEMASK) - + PAGESIZE); + + ASSERT(page_exists(&kvp, (u_offset_t)(uintptr_t)red_va) == + NULL); + + /* + * Allocate the physical for the red page. + */ + /* + * No PG_NORELOC here to avoid waits. Unlikely to get + * a relocate happening in the short time the page exists + * and it will be OK anyway. + */ + + kseg.s_as = &kas; + red_pp = page_create_va(&kvp, (u_offset_t)(uintptr_t)red_va, + PAGESIZE, PG_WAIT | PG_EXCL, &kseg, red_va); + ASSERT(red_pp != NULL); + + /* + * So we now have a page to jam into the redzone... 
+ */ + page_io_unlock(red_pp); + + hat_memload(kas.a_hat, red_va, red_pp, + (PROT_READ|PROT_WRITE), HAT_LOAD_LOCK); + page_downgrade(red_pp); + + /* + * The page is left SE_SHARED locked so we can hold on to + * the page_t pointer. + */ + curthread->t_red_pp = red_pp; + + atomic_add_32(&red_nmapped, 1); + while (fp - (uintptr_t)curthread->t_stkbase < red_closest) { + (void) cas32(&red_closest, red_closest, + (uint32_t)(fp - (uintptr_t)curthread->t_stkbase)); + } + return (1); + } + + stkbase = (caddr_t)(((uintptr_t)curthread->t_stkbase & + (uintptr_t)PAGEMASK) - PAGESIZE); + + atomic_add_32(&red_ndoubles, 1); + + if (fp - (uintptr_t)stkbase < RED_DEEP_THRESHOLD) { + /* + * Oh boy. We're already deep within the mapped-in + * redzone page, and the caller is trying to prepare + * for a deep stack run. We're running without a + * redzone right now: if the caller plows off the + * end of the stack, it'll plow another thread or + * LWP structure. That situation could result in + * a very hard-to-debug panic, so, in the spirit of + * recording the name of one's killer in one's own + * blood, we're going to record lbolt and the calling + * thread. + */ + red_deep_lbolt = lbolt; + red_deep_thread = curthread; + } + + /* + * If this is a DEBUG kernel, and we've run too deep for comfort, toss. + */ + ASSERT(fp - (uintptr_t)stkbase >= RED_DEEP_THRESHOLD); + return (0); +#endif /* _LP64 */ +} + +void +segkp_unmap_red(void) +{ + page_t *pp; + caddr_t red_va = (caddr_t)(((uintptr_t)curthread->t_stkbase & + (uintptr_t)PAGEMASK) - PAGESIZE); + + ASSERT(curthread->t_red_pp != NULL); + ASSERT(curthread->t_schedflag & TS_DONT_SWAP); + + /* + * Because we locked the mapping down, we can't simply rely + * on page_destroy() to clean everything up; we need to call + * hat_unload() to explicitly unlock the mapping resources. + */ + hat_unload(kas.a_hat, red_va, PAGESIZE, HAT_UNLOAD_UNLOCK); + + pp = curthread->t_red_pp; + + ASSERT(pp == page_find(&kvp, (u_offset_t)(uintptr_t)red_va)); + + /* + * Need to upgrade the SE_SHARED lock to SE_EXCL. + */ + if (!page_tryupgrade(pp)) { + /* + * As there is now wait for upgrade, release the + * SE_SHARED lock and wait for SE_EXCL. + */ + page_unlock(pp); + pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)red_va, SE_EXCL); + /* pp may be NULL here, hence the test below */ + } + + /* + * Destroy the page, with dontfree set to zero (i.e. free it). + */ + if (pp != NULL) + page_destroy(pp, 0); + curthread->t_red_pp = NULL; +} +#else +#error Red stacks only supported with downwards stack growth. +#endif + +/* + * Handle a fault on an address corresponding to one of the + * resources in the segkp segment. + */ +faultcode_t +segkp_fault( + struct hat *hat, + struct seg *seg, + caddr_t vaddr, + size_t len, + enum fault_type type, + enum seg_rw rw) +{ + struct segkp_data *kpd = NULL; + int err; + + ASSERT(seg->s_as == &kas && RW_READ_HELD(&seg->s_as->a_lock)); + + /* + * Sanity checks. + */ + if (type == F_PROT) { + panic("segkp_fault: unexpected F_PROT fault"); + /*NOTREACHED*/ + } + + if ((kpd = segkp_find(seg, vaddr)) == NULL) + return (FC_NOMAP); + + mutex_enter(&kpd->kp_lock); + + if (type == F_SOFTLOCK) { + ASSERT(!(kpd->kp_flags & KPD_LOCKED)); + /* + * The F_SOFTLOCK case has more stringent + * range requirements: the given range must exactly coincide + * with the resource's mapped portion. 
Note reference to + * redzone is handled since vaddr would not equal base + */ + if (vaddr != stom(kpd->kp_base, kpd->kp_flags) || + len != SEGKP_MAPLEN(kpd->kp_len, kpd->kp_flags)) { + mutex_exit(&kpd->kp_lock); + return (FC_MAKE_ERR(EFAULT)); + } + + if ((err = segkp_load(hat, seg, vaddr, len, kpd, KPD_LOCKED))) { + mutex_exit(&kpd->kp_lock); + return (FC_MAKE_ERR(err)); + } + kpd->kp_flags |= KPD_LOCKED; + mutex_exit(&kpd->kp_lock); + return (0); + } + + if (type == F_INVAL) { + ASSERT(!(kpd->kp_flags & KPD_NO_ANON)); + + /* + * Check if we touched the redzone. Somewhat optimistic + * here if we are touching the redzone of our own stack + * since we wouldn't have a stack to get this far... + */ + if ((kpd->kp_flags & KPD_HASREDZONE) && + btop((uintptr_t)(vaddr - kpd->kp_base)) == KPD_REDZONE(kpd)) + panic("segkp_fault: accessing redzone"); + + /* + * This fault may occur while the page is being F_SOFTLOCK'ed. + * Return since a 2nd segkp_load is unnecessary and also would + * result in the page being locked twice and eventually + * hang the thread_reaper thread. + */ + if (kpd->kp_flags & KPD_LOCKED) { + mutex_exit(&kpd->kp_lock); + return (0); + } + + err = segkp_load(hat, seg, vaddr, len, kpd, kpd->kp_flags); + mutex_exit(&kpd->kp_lock); + return (err ? FC_MAKE_ERR(err) : 0); + } + + if (type == F_SOFTUNLOCK) { + uint_t flags; + + /* + * Make sure the addr is LOCKED and it has anon backing + * before unlocking + */ + if ((kpd->kp_flags & (KPD_LOCKED|KPD_NO_ANON)) == KPD_NO_ANON) { + panic("segkp_fault: bad unlock"); + /*NOTREACHED*/ + } + + if (vaddr != stom(kpd->kp_base, kpd->kp_flags) || + len != SEGKP_MAPLEN(kpd->kp_len, kpd->kp_flags)) { + panic("segkp_fault: bad range"); + /*NOTREACHED*/ + } + + if (rw == S_WRITE) + flags = kpd->kp_flags | KPD_WRITEDIRTY; + else + flags = kpd->kp_flags; + err = segkp_unlock(hat, seg, vaddr, len, kpd, flags); + kpd->kp_flags &= ~KPD_LOCKED; + mutex_exit(&kpd->kp_lock); + return (err ? FC_MAKE_ERR(err) : 0); + } + mutex_exit(&kpd->kp_lock); + panic("segkp_fault: bogus fault type: %d\n", type); + /*NOTREACHED*/ +} + +/* + * Check that the given protections suffice over the range specified by + * vaddr and len. For this segment type, the only issue is whether or + * not the range lies completely within the mapped part of an allocated + * resource. + */ +/* ARGSUSED */ +static int +segkp_checkprot(struct seg *seg, caddr_t vaddr, size_t len, uint_t prot) +{ + struct segkp_data *kpd = NULL; + caddr_t mbase; + size_t mlen; + + if ((kpd = segkp_find(seg, vaddr)) == NULL) + return (EACCES); + + mutex_enter(&kpd->kp_lock); + mbase = stom(kpd->kp_base, kpd->kp_flags); + mlen = SEGKP_MAPLEN(kpd->kp_len, kpd->kp_flags); + if (len > mlen || vaddr < mbase || + ((vaddr + len) > (mbase + mlen))) { + mutex_exit(&kpd->kp_lock); + return (EACCES); + } + mutex_exit(&kpd->kp_lock); + return (0); +} + + +/* + * Check to see if it makes sense to do kluster/read ahead to + * addr + delta relative to the mapping at addr. We assume here + * that delta is a signed PAGESIZE'd multiple (which can be negative). + * + * For seg_u we always "approve" of this action from our standpoint. + */ +/*ARGSUSED*/ +static int +segkp_kluster(struct seg *seg, caddr_t addr, ssize_t delta) +{ + return (0); +} + +/* + * Load and possibly lock intra-slot resources in the range given by + * vaddr and len. 
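+ * The length is rounded up to a PAGESIZE multiple and, for KPD_LOCKED
+ * requests, physical memory is reserved first.  Each page is then
+ * brought in synchronously via VOP_GETPAGE() on its anon slot's swap
+ * vnode before its translation is loaded (and, if requested, locked)
+ * with hat_memload().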
+ */ +static int +segkp_load( + struct hat *hat, + struct seg *seg, + caddr_t vaddr, + size_t len, + struct segkp_data *kpd, + uint_t flags) +{ + caddr_t va; + caddr_t vlim; + ulong_t i; + uint_t lock; + + ASSERT(MUTEX_HELD(&kpd->kp_lock)); + + len = P2ROUNDUP(len, PAGESIZE); + + /* If locking, reserve physical memory */ + if (flags & KPD_LOCKED) { + pgcnt_t pages = btop(len); + if ((kpd->kp_flags & KPD_NO_ANON) == 0) + atomic_add_long(&anon_segkp_pages_locked, pages); + (void) page_resv(pages, KM_SLEEP); + } + + /* + * Loop through the pages in the given range. + */ + va = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK); + vaddr = va; + vlim = va + len; + lock = flags & KPD_LOCKED; + i = ((uintptr_t)(va - kpd->kp_base)) >> PAGESHIFT; + for (; va < vlim; va += PAGESIZE, i++) { + page_t *pl[2]; /* second element NULL terminator */ + struct vnode *vp; + anoff_t off; + int err; + struct anon *ap; + + /* + * Summon the page. If it's not resident, arrange + * for synchronous i/o to pull it in. + */ + ap = anon_get_ptr(kpd->kp_anon, kpd->kp_anon_idx + i); + swap_xlate(ap, &vp, &off); + + /* + * The returned page list will have exactly one entry, + * which is returned to us already kept. + */ + err = VOP_GETPAGE(vp, (offset_t)off, PAGESIZE, NULL, + pl, PAGESIZE, seg, va, S_READ, kcred); + + if (err) { + /* + * Back out of what we've done so far. + */ + (void) segkp_unlock(hat, seg, vaddr, + (va - vaddr), kpd, flags); + return (err); + } + + /* + * Load an MMU translation for the page. + */ + hat_memload(hat, va, pl[0], (PROT_READ|PROT_WRITE), + lock ? HAT_LOAD_LOCK : HAT_LOAD); + + if (!lock) { + /* + * Now, release "shared" lock on the page. + */ + page_unlock(pl[0]); + } + } + return (0); +} + +/* + * At the very least unload the mmu-translations and unlock the range if locked + * Can be called with the following flag value KPD_WRITEDIRTY which specifies + * any dirty pages should be written to disk. + */ +static int +segkp_unlock( + struct hat *hat, + struct seg *seg, + caddr_t vaddr, + size_t len, + struct segkp_data *kpd, + uint_t flags) +{ + caddr_t va; + caddr_t vlim; + ulong_t i; + struct page *pp; + struct vnode *vp; + anoff_t off; + struct anon *ap; + +#ifdef lint + seg = seg; +#endif /* lint */ + + ASSERT(MUTEX_HELD(&kpd->kp_lock)); + + /* + * Loop through the pages in the given range. It is assumed + * segkp_unlock is called with page aligned base + */ + va = vaddr; + vlim = va + len; + i = ((uintptr_t)(va - kpd->kp_base)) >> PAGESHIFT; + hat_unload(hat, va, len, + ((flags & KPD_LOCKED) ? HAT_UNLOAD_UNLOCK : HAT_UNLOAD)); + for (; va < vlim; va += PAGESIZE, i++) { + /* + * Find the page associated with this part of the + * slot, tracking it down through its associated swap + * space. + */ + ap = anon_get_ptr(kpd->kp_anon, kpd->kp_anon_idx + i); + swap_xlate(ap, &vp, &off); + + if (flags & KPD_LOCKED) { + if ((pp = page_find(vp, off)) == NULL) { + if (flags & KPD_LOCKED) { + panic("segkp_softunlock: missing page"); + /*NOTREACHED*/ + } + } + } else { + /* + * Nothing to do if the slot is not locked and the + * page doesn't exist. + */ + if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) + continue; + } + + /* + * If the page doesn't have any translations, is + * dirty and not being shared, then push it out + * asynchronously and avoid waiting for the + * pageout daemon to do it for us. + * + * XXX - Do we really need to get the "exclusive" + * lock via an upgrade? 
+ */ + if ((flags & KPD_WRITEDIRTY) && !hat_page_is_mapped(pp) && + hat_ismod(pp) && page_tryupgrade(pp)) { + /* + * Hold the vnode before releasing the page lock to + * prevent it from being freed and re-used by some + * other thread. + */ + VN_HOLD(vp); + page_unlock(pp); + + /* + * Want most powerful credentials we can get so + * use kcred. + */ + (void) VOP_PUTPAGE(vp, (offset_t)off, PAGESIZE, + B_ASYNC | B_FREE, kcred); + VN_RELE(vp); + } else { + page_unlock(pp); + } + } + + /* If unlocking, release physical memory */ + if (flags & KPD_LOCKED) { + pgcnt_t pages = btopr(len); + if ((kpd->kp_flags & KPD_NO_ANON) == 0) + atomic_add_long(&anon_segkp_pages_locked, -pages); + page_unresv(pages); + } + return (0); +} + +/* + * Insert the kpd in the hash table. + */ +static void +segkp_insert(struct seg *seg, struct segkp_data *kpd) +{ + struct segkp_segdata *kpsd = (struct segkp_segdata *)seg->s_data; + int index; + + /* + * Insert the kpd based on the address that will be returned + * via segkp_release. + */ + index = SEGKP_HASH(stom(kpd->kp_base, kpd->kp_flags)); + mutex_enter(&segkp_lock); + kpd->kp_next = kpsd->kpsd_hash[index]; + kpsd->kpsd_hash[index] = kpd; + mutex_exit(&segkp_lock); +} + +/* + * Remove kpd from the hash table. + */ +static void +segkp_delete(struct seg *seg, struct segkp_data *kpd) +{ + struct segkp_segdata *kpsd = (struct segkp_segdata *)seg->s_data; + struct segkp_data **kpp; + int index; + + ASSERT(MUTEX_HELD(&segkp_lock)); + + index = SEGKP_HASH(stom(kpd->kp_base, kpd->kp_flags)); + for (kpp = &kpsd->kpsd_hash[index]; + *kpp != NULL; kpp = &((*kpp)->kp_next)) { + if (*kpp == kpd) { + *kpp = kpd->kp_next; + return; + } + } + panic("segkp_delete: unable to find element to delete"); + /*NOTREACHED*/ +} + +/* + * Find the kpd associated with a vaddr. + * + * Most of the callers of segkp_find will pass the vaddr that + * hashes to the desired index, but there are cases where + * this is not true in which case we have to (potentially) scan + * the whole table looking for it. This should be very rare + * (e.g. a segkp_fault(F_INVAL) on an address somewhere in the + * middle of the segkp_data region). + */ +static struct segkp_data * +segkp_find(struct seg *seg, caddr_t vaddr) +{ + struct segkp_segdata *kpsd = (struct segkp_segdata *)seg->s_data; + struct segkp_data *kpd; + int i; + int stop; + + i = stop = SEGKP_HASH(vaddr); + mutex_enter(&segkp_lock); + do { + for (kpd = kpsd->kpsd_hash[i]; kpd != NULL; + kpd = kpd->kp_next) { + if (vaddr >= kpd->kp_base && + vaddr < kpd->kp_base + kpd->kp_len) { + mutex_exit(&segkp_lock); + return (kpd); + } + } + if (--i < 0) + i = SEGKP_HASHSZ - 1; /* Wrap */ + } while (i != stop); + mutex_exit(&segkp_lock); + return (NULL); /* Not found */ +} + +/* + * returns size of swappable area. 
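+ * That is, the length of the mapped (non-redzone) part of the segkp
+ * resource containing v -- SEGKP_MAPLEN(kp_len, kp_flags) -- or 0 if v
+ * does not lie within any segkp resource.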
+ */ +size_t +swapsize(caddr_t v) +{ + struct segkp_data *kpd; + + if ((kpd = segkp_find(segkp, v)) != NULL) + return (SEGKP_MAPLEN(kpd->kp_len, kpd->kp_flags)); + else + return (NULL); +} + +/* + * Dump out all the active segkp pages + */ +static void +segkp_dump(struct seg *seg) +{ + int i; + struct segkp_data *kpd; + struct segkp_segdata *kpsd = (struct segkp_segdata *)seg->s_data; + + for (i = 0; i < SEGKP_HASHSZ; i++) { + for (kpd = kpsd->kpsd_hash[i]; + kpd != NULL; kpd = kpd->kp_next) { + pfn_t pfn; + caddr_t addr; + caddr_t eaddr; + + addr = kpd->kp_base; + eaddr = addr + kpd->kp_len; + while (addr < eaddr) { + ASSERT(seg->s_as == &kas); + pfn = hat_getpfnum(seg->s_as->a_hat, addr); + if (pfn != PFN_INVALID) + dump_addpage(seg->s_as, addr, pfn); + addr += PAGESIZE; + dump_timeleft = dump_timeout; + } + } + } +} + +/*ARGSUSED*/ +static int +segkp_pagelock(struct seg *seg, caddr_t addr, size_t len, + struct page ***ppp, enum lock_type type, enum seg_rw rw) +{ + return (ENOTSUP); +} + +/*ARGSUSED*/ +static int +segkp_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp) +{ + return (ENODEV); +} + +/*ARGSUSED*/ +static lgrp_mem_policy_info_t * +segkp_getpolicy(struct seg *seg, caddr_t addr) +{ + return (NULL); +} + +#include <sys/mem_config.h> + +/*ARGSUSED*/ +static void +segkp_mem_config_post_add(void *arg, pgcnt_t delta_pages) +{} + +/* + * During memory delete, turn off caches so that pages are not held. + * A better solution may be to unlock the pages while they are + * in the cache so that they may be collected naturally. + */ + +/*ARGSUSED*/ +static int +segkp_mem_config_pre_del(void *arg, pgcnt_t delta_pages) +{ + atomic_add_32(&segkp_indel, 1); + segkp_cache_free(); + return (0); +} + +/*ARGSUSED*/ +static void +segkp_mem_config_post_del(void *arg, pgcnt_t delta_pages, int cancelled) +{ + atomic_add_32(&segkp_indel, -1); +} + +static kphysm_setup_vector_t segkp_mem_config_vec = { + KPHYSM_SETUP_VECTOR_VERSION, + segkp_mem_config_post_add, + segkp_mem_config_pre_del, + segkp_mem_config_post_del, +}; + +static void +segkpinit_mem_config(struct seg *seg) +{ + int ret; + + ret = kphysm_setup_func_register(&segkp_mem_config_vec, (void *)seg); + ASSERT(ret == 0); +} diff --git a/usr/src/uts/common/vm/seg_kp.h b/usr/src/uts/common/vm/seg_kp.h new file mode 100644 index 0000000000..64fa883cc9 --- /dev/null +++ b/usr/src/uts/common/vm/seg_kp.h @@ -0,0 +1,165 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _VM_SEG_KP_H +#define _VM_SEG_KP_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * segkp (as in kernel pageable) is a segment driver that supports allocation + * of page-aligned variable size of vm resources. + * + * Each vm resource represents a page-aligned range of virtual addresses. + * The caller may specify whether the resource should include a redzone, + * be locked down, or be zero initialized. + */ + +#include <vm/seg.h> +#include <sys/vmem.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +/* + * Private information per overall segkp segment (as opposed + * to per resource within segment). There are as many anon slots + * allocated as there there are pages in the segment. + */ +struct segkp_segdata { + struct anon_hdr *kpsd_anon; /* anon structs */ + vmem_t *kpsd_arena; /* virtual memory descriptor */ + struct segkp_data **kpsd_hash; /* Hash table for lookups */ +}; + +#define SEGKP_VMEM(seg) (((struct segkp_segdata *)(seg)->s_data)->kpsd_arena) + +/* + * A hash table is used to aid in the lookup of a kpd's based on vaddr. + * Since the heaviest use of segkp occurs from segkp_*get and segkp_*release, + * the hashing is based on the vaddr used by these routines. + */ +#define SEGKP_HASHSZ 256 /* power of two */ +#define SEGKP_HASHMASK (SEGKP_HASHSZ - 1) +#define SEGKP_HASH(vaddr) \ + ((int)(((uintptr_t)vaddr >> PAGESHIFT) & SEGKP_HASHMASK)) + +struct segkp_data { + kmutex_t kp_lock; /* per resource lock */ + caddr_t kp_base; /* starting addr of chunk */ + size_t kp_len; /* # of bytes */ + uint_t kp_flags; /* state info */ + int kp_cookie; /* index into cache array */ + ulong_t kp_anon_idx; /* index into main anon array */ + /* in segkp_segdata */ + struct anon_hdr *kp_anon; /* anon structs */ + struct segkp_data *kp_next; /* ptr to next in hash chain */ +}; + +/* + * Flag bits + * + */ +#define KPD_ZERO 0x01 /* initialize resource with 0 */ +#define KPD_LOCKED 0x02 /* resources locked */ +#define KPD_NO_ANON 0x04 /* no swap resources required */ +#define KPD_HASREDZONE 0x08 /* include a redzone */ +#define KPD_NOWAIT 0x10 /* do not wait for res. if unavail. */ +#define KPD_WRITEDIRTY 0x20 /* dirty pages should be flushed */ +#define KPD_HASAMP 0x40 /* anon_hdr managed by caller */ + +/* + * A cache of segkp elements may be created via segkp_cache_init(). + * The elements on the freelist all have the same len and flags value. + * The cookie passed to the client is an index into the freelist array. + */ +struct segkp_cache { + int kpf_max; /* max # of elements allowed */ + int kpf_count; /* current no. of elments */ + int kpf_inuse; /* list inuse */ + uint_t kpf_flags; /* seg_kp flag value */ + size_t kpf_len; /* len of resource */ + struct seg *kpf_seg; /* segment */ + struct segkp_data *kpf_list; /* list of kpd's */ +}; +#define SEGKP_MAX_CACHE 4 /* Number of caches maintained */ + +/* + * Define redzone, and stack_to_memory macros. + * The redzone is PAGESIZE bytes. + */ +#ifdef STACK_GROWTH_DOWN +#define KPD_REDZONE(kpd) (0) +#define stom(v, flags) (((flags) & KPD_HASREDZONE) ? (v) + PAGESIZE : (v)) + +#else /* STACK_GROWTH_DOWN */ + +#define KPD_REDZONE(kpd) (btop(kpd->kp_len) - 1) +#define stom(v) (v) +#endif /* STACK_GROWTH_DOWN */ + +#define SEGKP_MAPLEN(len, flags) \ + (((flags) & KPD_HASREDZONE) ? (len) - PAGESIZE : (len)) + +extern struct seg *segkp; +/* If segkp becomes more than one seg this test will need changing. 
*/ +#define SEG_IS_SEGKP(SEG) ((SEG) == segkp) + +/* + * Public routine declarations not part of the segment ops vector go here. + */ +int segkp_create(struct seg *seg); +caddr_t segkp_get(struct seg *seg, size_t len, uint_t flags); +void segkp_release(struct seg *seg, caddr_t vaddr); +void * segkp_cache_init(struct seg *seg, int maxsize, size_t len, + uint_t flags); +void segkp_cache_free(); +caddr_t segkp_cache_get(void *cookie); +int segkp_map_red(void); +void segkp_unmap_red(void); +size_t swapsize(caddr_t v); + +/* Special currently only used by schedctl. */ +struct anon_map; /* Make the compiler happy about the next line. */ +caddr_t segkp_get_withanonmap(struct seg *, size_t, uint_t, struct anon_map *); + +/* + * We allow explicit calls to segkp_fault, even though it's part + * of the segkp ops vector. + */ +faultcode_t segkp_fault(struct hat *hat, struct seg *seg, caddr_t addr, + size_t len, enum fault_type type, enum seg_rw rw); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_SEG_KP_H */ diff --git a/usr/src/uts/common/vm/seg_kpm.c b/usr/src/uts/common/vm/seg_kpm.c new file mode 100644 index 0000000000..73b7dbe94c --- /dev/null +++ b/usr/src/uts/common/vm/seg_kpm.c @@ -0,0 +1,323 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Kernel Physical Mapping (kpm) segment driver (segkpm). + * + * This driver delivers along with the hat_kpm* interfaces an alternative + * mechanism for kernel mappings within the 64-bit Solaris operating system, + * which allows the mapping of all physical memory into the kernel address + * space at once. This is feasible in 64 bit kernels, e.g. for Ultrasparc II + * and beyond processors, since the available VA range is much larger than + * possible physical memory. Momentarily all physical memory is supported, + * that is represented by the list of memory segments (memsegs). + * + * Segkpm mappings have also very low overhead and large pages are used + * (when possible) to minimize the TLB and TSB footprint. It is also + * extentable for other than Sparc architectures (e.g. AMD64). Main + * advantage is the avoidance of the TLB-shootdown X-calls, which are + * normally needed when a kernel (global) mapping has to be removed. + * + * First example of a kernel facility that uses the segkpm mapping scheme + * is seg_map, where it is used as an alternative to hat_memload(). + * See also hat layer for more information about the hat_kpm* routines. + * The kpm facilty can be turned off at boot time (e.g. /etc/system). 
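+ *
+ * A typical client sequence, in sketch form only (it assumes a page pp
+ * that the caller already holds locked; see the segmap kpm support in
+ * seg_map.c for real usage of the hat_kpm* interface):
+ *
+ *	if (kpm_enable) {
+ *		caddr_t vaddr = hat_kpm_mapin(pp, NULL);
+ *		... access the page contents through vaddr ...
+ *		hat_kpm_mapout(pp, NULL, vaddr);
+ *	}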
+ */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/vnode.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/thread.h> +#include <sys/cpuvar.h> +#include <sys/bitmap.h> +#include <sys/atomic.h> + +#include <vm/seg_kmem.h> +#include <vm/seg_kpm.h> +#include <vm/hat.h> +#include <vm/as.h> +#include <vm/seg.h> +#include <vm/page.h> + +/* + * Global kpm controls. + * See also platform and mmu specific controls. + * + * kpm_enable -- global on/off switch for segkpm. + * . Set by default on 64bit platforms that have kpm support. + * . Will be disabled from platform layer if not supported. + * . Can be disabled via /etc/system. + * + * kpm_smallpages -- use only regular/system pagesize for kpm mappings. + * . Can be useful for critical debugging of kpm clients. + * . Set to zero by default for platforms that support kpm large pages. + * The use of kpm large pages reduces the footprint of kpm meta data + * and has all the other advantages of using large pages (e.g TLB + * miss reduction). + * . Set by default for platforms that don't support kpm large pages or + * where large pages cannot be used for other reasons (e.g. there are + * only few full associative TLB entries available for large pages). + * + * segmap_kpm -- separate on/off switch for segmap using segkpm: + * . Set by default. + * . Will be disabled when kpm_enable is zero. + * . Will be disabled when MAXBSIZE != PAGESIZE. + * . Can be disabled via /etc/system. + * + */ +int kpm_enable = 1; +int kpm_smallpages = 0; +int segmap_kpm = 1; + +/* + * Private seg op routines. + */ +faultcode_t segkpm_fault(struct hat *hat, struct seg *seg, caddr_t addr, + size_t len, enum fault_type type, enum seg_rw rw); +static void segkpm_dump(struct seg *); +static void segkpm_badop(void); +static int segkpm_notsup(void); + +#define SEGKPM_BADOP(t) (t(*)())segkpm_badop +#define SEGKPM_NOTSUP (int(*)())segkpm_notsup + +static struct seg_ops segkpm_ops = { + SEGKPM_BADOP(int), /* dup */ + SEGKPM_BADOP(int), /* unmap */ + SEGKPM_BADOP(void), /* free */ + segkpm_fault, + SEGKPM_BADOP(int), /* faulta */ + SEGKPM_BADOP(int), /* setprot */ + SEGKPM_BADOP(int), /* checkprot */ + SEGKPM_BADOP(int), /* kluster */ + SEGKPM_BADOP(size_t), /* swapout */ + SEGKPM_BADOP(int), /* sync */ + SEGKPM_BADOP(size_t), /* incore */ + SEGKPM_BADOP(int), /* lockop */ + SEGKPM_BADOP(int), /* getprot */ + SEGKPM_BADOP(u_offset_t), /* getoffset */ + SEGKPM_BADOP(int), /* gettype */ + SEGKPM_BADOP(int), /* getvp */ + SEGKPM_BADOP(int), /* advise */ + segkpm_dump, /* dump */ + SEGKPM_NOTSUP, /* pagelock */ + SEGKPM_BADOP(int), /* setpgsz */ + SEGKPM_BADOP(int), /* getmemid */ +}; + +/* + * kpm_pgsz and kpm_pgshft are set by platform layer. 
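+ * (The remaining values follow from those two: kpm_pgoff is the offset
+ * mask kpm_pgsz - 1, kpmp2pshft is kpm_pgshft - PAGESHIFT, and kpmpnpgs
+ * is 1 << kpmp2pshft, i.e. the number of system pages per kpm page.)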
+ */ +size_t kpm_pgsz; /* kpm page size */ +uint_t kpm_pgshft; /* kpm page shift */ +u_offset_t kpm_pgoff; /* kpm page offset mask */ +uint_t kpmp2pshft; /* kpm page to page shift */ +pgcnt_t kpmpnpgs; /* how many pages per kpm page */ + + +#ifdef SEGKPM_SUPPORT + +int +segkpm_create(struct seg *seg, void *argsp) +{ + struct segkpm_data *skd; + struct segkpm_crargs *b = (struct segkpm_crargs *)argsp; + ushort_t *p; + int i, j; + + ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock)); + ASSERT(btokpmp(seg->s_size) >= 1 && + kpmpageoff((uintptr_t)seg->s_base) == 0 && + kpmpageoff((uintptr_t)seg->s_base + seg->s_size) == 0); + + skd = kmem_zalloc(sizeof (struct segkpm_data), KM_SLEEP); + + seg->s_data = (void *)skd; + seg->s_ops = &segkpm_ops; + skd->skd_prot = b->prot; + + /* + * (1) Segkpm virtual addresses are based on physical adresses. + * From this and in opposite to other segment drivers it is + * often required to allocate a page first to be able to + * calculate the final segkpm virtual address. + * (2) Page allocation is done by calling page_create_va(), + * one important input argument is a virtual address (also + * expressed by the "va" in the function name). This function + * is highly optimized to select the right page for an optimal + * processor and platform support (e.g. virtual addressed + * caches (VAC), physical addressed caches, NUMA). + * + * Because of (1) the approach is to generate a faked virtual + * address for calling page_create_va(). In order to exploit + * the abilities of (2), especially to utilize the cache + * hierarchy (3) and to avoid VAC alias conflicts (4) the + * selection has to be done carefully. For each virtual color + * a separate counter is provided (4). The count values are + * used for the utilization of all cache lines (3) and are + * corresponding to the cache bins. + */ + skd->skd_nvcolors = b->nvcolors; + + p = skd->skd_va_select = + kmem_zalloc(NCPU * b->nvcolors * sizeof (ushort_t), KM_SLEEP); + + for (i = 0; i < NCPU; i++) + for (j = 0; j < b->nvcolors; j++, p++) + *p = j; + + return (0); +} + +/* + * This routine is called via a machine specific fault handling + * routine. + */ +/* ARGSUSED */ +faultcode_t +segkpm_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len, + enum fault_type type, enum seg_rw rw) +{ + faultcode_t error; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + error = hat_kpm_fault(hat, addr); + + return (error); +} + +#define addr_to_vcolor(addr, vcolors) \ + ((int)(((uintptr_t)(addr) & ((vcolors << PAGESHIFT) - 1)) >> PAGESHIFT)) + +/* + * Create a virtual address that can be used for invocations of + * page_create_va. Goal is to utilize the cache hierarchy (round + * robin bins) and to select the right color for virtual indexed + * caches. It isn't exact since we also increment the bin counter + * when the caller uses VOP_GETPAGE and gets a hit in the page + * cache, but we keep the bins turning for cache distribution + * (see also segkpm_create block comment). + */ +caddr_t +segkpm_create_va(u_offset_t off) +{ + int vcolor; + ushort_t *p; + struct segkpm_data *skd = (struct segkpm_data *)segkpm->s_data; + int nvcolors = skd->skd_nvcolors; + caddr_t va; + + vcolor = (nvcolors > 1) ? addr_to_vcolor(off, nvcolors) : 0; + p = &skd->skd_va_select[(CPU->cpu_id * nvcolors) + vcolor]; + va = (caddr_t)ptob(*p); + + atomic_add_16(p, nvcolors); + + return (va); +} + +/* + * Unload mapping if the instance has an active kpm mapping. 
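+ * The page is locked SE_SHARED and kpme->kpe_page is then re-checked, so
+ * a mapout that raced with us (and cleared kpe_page) is detected before
+ * the mapping is touched; if the page lock cannot be obtained the lookup
+ * is simply retried.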
+ */ +void +segkpm_mapout_validkpme(struct kpme *kpme) +{ + caddr_t vaddr; + page_t *pp; + +retry: + if ((pp = kpme->kpe_page) == NULL) { + return; + } + + if (page_lock(pp, SE_SHARED, (kmutex_t *)NULL, P_RECLAIM) == 0) + goto retry; + + /* + * Check if segkpm mapping is not unloaded in the meantime + */ + if (kpme->kpe_page == NULL) { + page_unlock(pp); + return; + } + + vaddr = hat_kpm_page2va(pp, 1); + hat_kpm_mapout(pp, kpme, vaddr); + page_unlock(pp); +} + +static void +segkpm_badop() +{ + panic("segkpm_badop"); +} + +#else /* SEGKPM_SUPPORT */ + +/* segkpm stubs */ + +/*ARGSUSED*/ +int segkpm_create(struct seg *seg, void *argsp) { return (0); } + +/* ARGSUSED */ +faultcode_t +segkpm_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len, + enum fault_type type, enum seg_rw rw) +{ + return ((faultcode_t)0); +} + +/* ARGSUSED */ +caddr_t segkpm_create_va(u_offset_t off) { return (NULL); } + +/* ARGSUSED */ +void segkpm_mapout_validkpme(struct kpme *kpme) {} + +static void +segkpm_badop() {} + +#endif /* SEGKPM_SUPPORT */ + +static int +segkpm_notsup() +{ + return (ENOTSUP); +} + +/* + * segkpm pages are not dumped, so we just return + */ +/*ARGSUSED*/ +static void +segkpm_dump(struct seg *seg) +{} diff --git a/usr/src/uts/common/vm/seg_kpm.h b/usr/src/uts/common/vm/seg_kpm.h new file mode 100644 index 0000000000..0b766bbaf4 --- /dev/null +++ b/usr/src/uts/common/vm/seg_kpm.h @@ -0,0 +1,118 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _VM_SEG_KPM_H +#define _VM_SEG_KPM_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Kernel Physical Mapping (segkpm) segment driver. + */ + +#include <vm/kpm.h> + +struct segkpm_data { + ushort_t *skd_va_select; /* page_create_va kpm vaddr bin count */ + short skd_nvcolors; /* VAC colors to deal with */ + uchar_t skd_prot; +}; + +/* + * segkpm create needs some platform knowledge + */ +struct segkpm_crargs { + uint_t prot; + short nvcolors; /* VAC # virtual colors, 0 for PAC. */ +}; + +extern struct seg *segkpm; +extern u_offset_t kpm_pgoff; +extern size_t kpm_pgsz; +extern uint_t kpm_pgshft; +extern uint_t kpmp2pshft; +extern pgcnt_t kpmpnpgs; + +/* kpm controls */ +extern int kpm_enable; +extern int kpm_smallpages; +extern int segmap_kpm; + +/* + * kpm_page_t macros: + * . bytes (b) to kpm pages (kpmp) + * . pages (p) to kpm pages (kpmp), and back (with and without roundup) + * . kpm page offset in bytes + * . 
pages (p) modulo kpm pages (kpmp) + */ +#define btokpmp(x) ((x) >> kpm_pgshft) +#define btokpmpr(x) (((x) + kpm_pgoff) >> kpm_pgshft) +#define ptokpmp(x) ((x) >> kpmp2pshft) +#define ptokpmpr(x) (((x) + (kpmpnpgs - 1)) >> kpmp2pshft) +#define kpmptop(x) ((x) << kpmp2pshft) +#define kpmpageoff(x) ((x) & kpm_pgoff) +#define pmodkpmp(x) ((x) & (kpmpnpgs - 1)) + +#ifdef SEGKPM_SUPPORT + +#define IS_KPM_ADDR(addr) \ + ((addr) >= segkpm->s_base && (addr) < (segkpm->s_base + segkpm->s_size)) + +#define KPMPAGE_T_SZ \ + ((kpm_smallpages == 0) ? sizeof (kpm_page_t) : sizeof (kpm_spage_t)) + +#else /* SEGKPM_SUPPORT */ + +#define IS_KPM_ADDR(addr) (segkpm != NULL) +#define KPMPAGE_T_SZ (0) + +#endif /* SEGKPM_SUPPORT */ + +#ifdef _KERNEL +/* + * Public seg_kpm segment operations. + */ +extern int segkpm_create(struct seg *, void *); +extern faultcode_t segkpm_fault(struct hat *, struct seg *, caddr_t, + size_t, enum fault_type, enum seg_rw); + +/* + * Public seg_kpm interfaces. + */ +extern caddr_t segkpm_create_va(u_offset_t); +extern void segkpm_mapout_validkpme(struct kpme *); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_SEG_KPM_H */ diff --git a/usr/src/uts/common/vm/seg_map.c b/usr/src/uts/common/vm/seg_map.c new file mode 100644 index 0000000000..d4b6a16ca4 --- /dev/null +++ b/usr/src/uts/common/vm/seg_map.c @@ -0,0 +1,2345 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * VM - generic vnode mapping segment. + * + * The segmap driver is used only by the kernel to get faster (than seg_vn) + * mappings [lower routine overhead; more persistent cache] to random + * vnode/offsets. Note than the kernel may (and does) use seg_vn as well. 
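+ *
+ * The canonical consumer pattern, in sketch form (roughly what file
+ * systems do for read(2); mapon, n, error and uio are illustrative
+ * locals, the segmap entry points are real):
+ *
+ *	mapon = off & MAXBOFFSET;
+ *	n = MIN(MAXBSIZE - mapon, uio->uio_resid);
+ *	base = segmap_getmapflt(segkmap, vp, off, n, 1, S_READ);
+ *	error = uiomove(base + mapon, n, UIO_READ, uio);
+ *	if (error == 0)
+ *		error = segmap_release(segkmap, base, 0);
+ *	else
+ *		(void) segmap_release(segkmap, base, 0);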
+ */ + +#include <sys/types.h> +#include <sys/t_lock.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/buf.h> +#include <sys/systm.h> +#include <sys/vnode.h> +#include <sys/mman.h> +#include <sys/errno.h> +#include <sys/cred.h> +#include <sys/kmem.h> +#include <sys/vtrace.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/thread.h> +#include <sys/dumphdr.h> +#include <sys/bitmap.h> +#include <sys/lgrp.h> + +#include <vm/seg_kmem.h> +#include <vm/hat.h> +#include <vm/as.h> +#include <vm/seg.h> +#include <vm/seg_kpm.h> +#include <vm/seg_map.h> +#include <vm/page.h> +#include <vm/pvn.h> +#include <vm/rm.h> + +/* + * Private seg op routines. + */ +static void segmap_free(struct seg *seg); +faultcode_t segmap_fault(struct hat *hat, struct seg *seg, caddr_t addr, + size_t len, enum fault_type type, enum seg_rw rw); +static faultcode_t segmap_faulta(struct seg *seg, caddr_t addr); +static int segmap_checkprot(struct seg *seg, caddr_t addr, size_t len, + uint_t prot); +static int segmap_kluster(struct seg *seg, caddr_t addr, ssize_t); +static int segmap_getprot(struct seg *seg, caddr_t addr, size_t len, + uint_t *protv); +static u_offset_t segmap_getoffset(struct seg *seg, caddr_t addr); +static int segmap_gettype(struct seg *seg, caddr_t addr); +static int segmap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp); +static void segmap_dump(struct seg *seg); +static int segmap_pagelock(struct seg *seg, caddr_t addr, size_t len, + struct page ***ppp, enum lock_type type, + enum seg_rw rw); +static void segmap_badop(void); +static int segmap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp); +static lgrp_mem_policy_info_t *segmap_getpolicy(struct seg *seg, + caddr_t addr); + +/* segkpm support */ +static caddr_t segmap_pagecreate_kpm(struct seg *, vnode_t *, u_offset_t, + struct smap *, enum seg_rw); +struct smap *get_smap_kpm(caddr_t, page_t **); + +#define SEGMAP_BADOP(t) (t(*)())segmap_badop + +static struct seg_ops segmap_ops = { + SEGMAP_BADOP(int), /* dup */ + SEGMAP_BADOP(int), /* unmap */ + segmap_free, + segmap_fault, + segmap_faulta, + SEGMAP_BADOP(int), /* setprot */ + segmap_checkprot, + segmap_kluster, + SEGMAP_BADOP(size_t), /* swapout */ + SEGMAP_BADOP(int), /* sync */ + SEGMAP_BADOP(size_t), /* incore */ + SEGMAP_BADOP(int), /* lockop */ + segmap_getprot, + segmap_getoffset, + segmap_gettype, + segmap_getvp, + SEGMAP_BADOP(int), /* advise */ + segmap_dump, + segmap_pagelock, /* pagelock */ + SEGMAP_BADOP(int), /* setpgsz */ + segmap_getmemid, /* getmemid */ + segmap_getpolicy, /* getpolicy */ +}; + +/* + * Private segmap routines. + */ +static void segmap_unlock(struct hat *hat, struct seg *seg, caddr_t addr, + size_t len, enum seg_rw rw, struct smap *smp); +static void segmap_smapadd(struct smap *smp); +static struct smap *segmap_hashin(struct smap *smp, struct vnode *vp, + u_offset_t off, int hashid); +static void segmap_hashout(struct smap *smp); + + +/* + * Statistics for segmap operations. + * + * No explicit locking to protect these stats. 
+ */ +struct segmapcnt segmapcnt = { + { "fault", KSTAT_DATA_ULONG }, + { "faulta", KSTAT_DATA_ULONG }, + { "getmap", KSTAT_DATA_ULONG }, + { "get_use", KSTAT_DATA_ULONG }, + { "get_reclaim", KSTAT_DATA_ULONG }, + { "get_reuse", KSTAT_DATA_ULONG }, + { "get_unused", KSTAT_DATA_ULONG }, + { "get_nofree", KSTAT_DATA_ULONG }, + { "rel_async", KSTAT_DATA_ULONG }, + { "rel_write", KSTAT_DATA_ULONG }, + { "rel_free", KSTAT_DATA_ULONG }, + { "rel_abort", KSTAT_DATA_ULONG }, + { "rel_dontneed", KSTAT_DATA_ULONG }, + { "release", KSTAT_DATA_ULONG }, + { "pagecreate", KSTAT_DATA_ULONG }, + { "free_notfree", KSTAT_DATA_ULONG }, + { "free_dirty", KSTAT_DATA_ULONG }, + { "free", KSTAT_DATA_ULONG }, + { "stolen", KSTAT_DATA_ULONG }, + { "get_nomtx", KSTAT_DATA_ULONG } +}; + +kstat_named_t *segmapcnt_ptr = (kstat_named_t *)&segmapcnt; +uint_t segmapcnt_ndata = sizeof (segmapcnt) / sizeof (kstat_named_t); + +/* + * Return number of map pages in segment. + */ +#define MAP_PAGES(seg) ((seg)->s_size >> MAXBSHIFT) + +/* + * Translate addr into smap number within segment. + */ +#define MAP_PAGE(seg, addr) (((addr) - (seg)->s_base) >> MAXBSHIFT) + +/* + * Translate addr in seg into struct smap pointer. + */ +#define GET_SMAP(seg, addr) \ + &(((struct segmap_data *)((seg)->s_data))->smd_sm[MAP_PAGE(seg, addr)]) + +/* + * Bit in map (16 bit bitmap). + */ +#define SMAP_BIT_MASK(bitindex) (1 << ((bitindex) & 0xf)) + +static int smd_colormsk = 0; +static int smd_ncolor = 0; +static int smd_nfree = 0; +static int smd_freemsk = 0; +#ifdef DEBUG +static int *colors_used; +#endif +static struct smap *smd_smap; +static struct smaphash *smd_hash; +#ifdef SEGMAP_HASHSTATS +static unsigned int *smd_hash_len; +#endif +static struct smfree *smd_free; +static ulong_t smd_hashmsk = 0; + +#define SEGMAP_MAXCOLOR 2 +#define SEGMAP_CACHE_PAD 64 + +union segmap_cpu { + struct { + uint32_t scpu_free_ndx[SEGMAP_MAXCOLOR]; + struct smap *scpu_last_smap; + ulong_t scpu_getmap; + ulong_t scpu_release; + ulong_t scpu_get_reclaim; + ulong_t scpu_fault; + ulong_t scpu_pagecreate; + ulong_t scpu_get_reuse; + } scpu; + char scpu_pad[SEGMAP_CACHE_PAD]; +}; +static union segmap_cpu *smd_cpu; + +/* + * There are three locks in seg_map: + * - per freelist mutexes + * - per hashchain mutexes + * - per smap mutexes + * + * The lock ordering is to get the smap mutex to lock down the slot + * first then the hash lock (for hash in/out (vp, off) list) or the + * freelist lock to put the slot back on the free list. + * + * The hash search is done by only holding the hashchain lock, when a wanted + * slot is found, we drop the hashchain lock then lock the slot so there + * is no overlapping of hashchain and smap locks. After the slot is + * locked, we verify again if the slot is still what we are looking + * for. + * + * Allocation of a free slot is done by holding the freelist lock, + * then locking the smap slot at the head of the freelist. This is + * in reversed lock order so mutex_tryenter() is used. + * + * The smap lock protects all fields in smap structure except for + * the link fields for hash/free lists which are protected by + * hashchain and freelist locks. 
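+ *
+ * The allocation path in sketch form (cf. get_free_smp below):
+ *
+ *	mutex_enter(&allocq->smq_mtx);
+ *	walk the queue until mutex_tryenter(SMAPMTX(smp)) succeeds
+ *	    (tryenter, never enter, because this is the reverse order);
+ *	take smp off the freelist;
+ *	mutex_exit(&allocq->smq_mtx);
+ *	return smp with SMAPMTX(smp) still held;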
+ */ + +#define SHASHMTX(hashid) (&smd_hash[hashid].sh_mtx) + +#define SMP2SMF(smp) (&smd_free[(smp - smd_smap) & smd_freemsk]) +#define SMP2SMF_NDX(smp) (ushort_t)((smp - smd_smap) & smd_freemsk) + +#define SMAPMTX(smp) (&smp->sm_mtx) + +#define SMAP_HASHFUNC(vp, off, hashid) \ + { \ + hashid = ((((uintptr_t)(vp) >> 6) + ((uintptr_t)(vp) >> 3) + \ + ((off) >> MAXBSHIFT)) & smd_hashmsk); \ + } + +/* + * The most frequently updated kstat counters are kept in the + * per cpu array to avoid hot cache blocks. The update function + * sums the cpu local counters to update the global counters. + */ + +/* ARGSUSED */ +int +segmap_kstat_update(kstat_t *ksp, int rw) +{ + int i; + ulong_t getmap, release, get_reclaim; + ulong_t fault, pagecreate, get_reuse; + + if (rw == KSTAT_WRITE) + return (EACCES); + getmap = release = get_reclaim = (ulong_t)0; + fault = pagecreate = get_reuse = (ulong_t)0; + for (i = 0; i < max_ncpus; i++) { + getmap += smd_cpu[i].scpu.scpu_getmap; + release += smd_cpu[i].scpu.scpu_release; + get_reclaim += smd_cpu[i].scpu.scpu_get_reclaim; + fault += smd_cpu[i].scpu.scpu_fault; + pagecreate += smd_cpu[i].scpu.scpu_pagecreate; + get_reuse += smd_cpu[i].scpu.scpu_get_reuse; + } + segmapcnt.smp_getmap.value.ul = getmap; + segmapcnt.smp_release.value.ul = release; + segmapcnt.smp_get_reclaim.value.ul = get_reclaim; + segmapcnt.smp_fault.value.ul = fault; + segmapcnt.smp_pagecreate.value.ul = pagecreate; + segmapcnt.smp_get_reuse.value.ul = get_reuse; + return (0); +} + +int +segmap_create(struct seg *seg, void *argsp) +{ + struct segmap_data *smd; + struct smap *smp; + struct smfree *sm; + struct segmap_crargs *a = (struct segmap_crargs *)argsp; + struct smaphash *shashp; + union segmap_cpu *scpu; + long i, npages; + size_t hashsz; + uint_t nfreelist; + extern void prefetch_smap_w(void *); + extern int max_ncpus; + + ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock)); + + if (((uintptr_t)seg->s_base | seg->s_size) & MAXBOFFSET) { + panic("segkmap not MAXBSIZE aligned"); + /*NOTREACHED*/ + } + + smd = kmem_zalloc(sizeof (struct segmap_data), KM_SLEEP); + + seg->s_data = (void *)smd; + seg->s_ops = &segmap_ops; + smd->smd_prot = a->prot; + + /* + * Scale the number of smap freelists to be + * proportional to max_ncpus * number of virtual colors. + * The caller can over-ride this scaling by providing + * a non-zero a->nfreelist argument. + */ + nfreelist = a->nfreelist; + if (nfreelist == 0) + nfreelist = max_ncpus; + else if (nfreelist < 0 || nfreelist > 4 * max_ncpus) { + cmn_err(CE_WARN, "segmap_create: nfreelist out of range " + "%d, using %d", nfreelist, max_ncpus); + nfreelist = max_ncpus; + } + if (nfreelist & (nfreelist - 1)) { + /* round up nfreelist to the next power of two. */ + nfreelist = 1 << (highbit(nfreelist)); + } + + /* + * Get the number of virtual colors - must be a power of 2. + */ + if (a->shmsize) + smd_ncolor = a->shmsize >> MAXBSHIFT; + else + smd_ncolor = 1; + ASSERT((smd_ncolor & (smd_ncolor - 1)) == 0); + ASSERT(smd_ncolor <= SEGMAP_MAXCOLOR); + smd_colormsk = smd_ncolor - 1; + smd->smd_nfree = smd_nfree = smd_ncolor * nfreelist; + smd_freemsk = smd_nfree - 1; + + /* + * Allocate and initialize the freelist headers. + * Note that sm_freeq[1] starts out as the release queue. This + * is known when the smap structures are initialized below. 
+ */ + smd_free = smd->smd_free = + kmem_zalloc(smd_nfree * sizeof (struct smfree), KM_SLEEP); + for (i = 0; i < smd_nfree; i++) { + sm = &smd->smd_free[i]; + mutex_init(&sm->sm_freeq[0].smq_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&sm->sm_freeq[1].smq_mtx, NULL, MUTEX_DEFAULT, NULL); + sm->sm_allocq = &sm->sm_freeq[0]; + sm->sm_releq = &sm->sm_freeq[1]; + } + + /* + * Allocate and initialize the smap hash chain headers. + * Compute hash size rounding down to the next power of two. + */ + npages = MAP_PAGES(seg); + smd->smd_npages = npages; + hashsz = npages / SMAP_HASHAVELEN; + hashsz = 1 << (highbit(hashsz)-1); + smd_hashmsk = hashsz - 1; + smd_hash = smd->smd_hash = + kmem_alloc(hashsz * sizeof (struct smaphash), KM_SLEEP); +#ifdef SEGMAP_HASHSTATS + smd_hash_len = + kmem_zalloc(hashsz * sizeof (unsigned int), KM_SLEEP); +#endif + for (i = 0, shashp = smd_hash; i < hashsz; i++, shashp++) { + shashp->sh_hash_list = NULL; + mutex_init(&shashp->sh_mtx, NULL, MUTEX_DEFAULT, NULL); + } + + /* + * Allocate and initialize the smap structures. + * Link all slots onto the appropriate freelist. + * The smap array is large enough to affect boot time + * on large systems, so use memory prefetching and only + * go through the array 1 time. Inline a optimized version + * of segmap_smapadd to add structures to freelists with + * knowledge that no locks are needed here. + */ + smd_smap = smd->smd_sm = + kmem_alloc(sizeof (struct smap) * npages, KM_SLEEP); + + for (smp = &smd->smd_sm[MAP_PAGES(seg) - 1]; + smp >= smd->smd_sm; smp--) { + struct smap *smpfreelist; + struct sm_freeq *releq; + + prefetch_smap_w((char *)smp); + + smp->sm_vp = NULL; + smp->sm_hash = NULL; + smp->sm_off = 0; + smp->sm_bitmap = 0; + smp->sm_refcnt = 0; + mutex_init(&smp->sm_mtx, NULL, MUTEX_DEFAULT, NULL); + smp->sm_free_ndx = SMP2SMF_NDX(smp); + + sm = SMP2SMF(smp); + releq = sm->sm_releq; + + smpfreelist = releq->smq_free; + if (smpfreelist == 0) { + releq->smq_free = smp->sm_next = smp->sm_prev = smp; + } else { + smp->sm_next = smpfreelist; + smp->sm_prev = smpfreelist->sm_prev; + smpfreelist->sm_prev = smp; + smp->sm_prev->sm_next = smp; + releq->smq_free = smp->sm_next; + } + + /* + * sm_flag = 0 (no SM_QNDX_ZERO) implies smap on sm_freeq[1] + */ + smp->sm_flags = 0; + +#ifdef SEGKPM_SUPPORT + /* + * Due to the fragile prefetch loop no + * separate function is used here. + */ + smp->sm_kpme_next = NULL; + smp->sm_kpme_prev = NULL; + smp->sm_kpme_page = NULL; +#endif + } + + /* + * Allocate the per color indices that distribute allocation + * requests over the free lists. Each cpu will have a private + * rotor index to spread the allocations even across the available + * smap freelists. Init the scpu_last_smap field to the first + * smap element so there is no need to check for NULL. + */ + smd_cpu = + kmem_zalloc(sizeof (union segmap_cpu) * max_ncpus, KM_SLEEP); + for (i = 0, scpu = smd_cpu; i < max_ncpus; i++, scpu++) { + int j; + for (j = 0; j < smd_ncolor; j++) + scpu->scpu.scpu_free_ndx[j] = j; + scpu->scpu.scpu_last_smap = smd_smap; + } + +#ifdef DEBUG + /* + * Keep track of which colors are used more often. + */ + colors_used = kmem_zalloc(smd_nfree * sizeof (int), KM_SLEEP); +#endif /* DEBUG */ + + return (0); +} + +static void +segmap_free(seg) + struct seg *seg; +{ + ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock)); +} + +/* + * Do a F_SOFTUNLOCK call over the range requested. + * The range must have already been F_SOFTLOCK'ed. 
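+ * Translations are hat_unlock()ed, ref/mod bits are updated according
+ * to rw, the corresponding bits in sm_bitmap are cleared, and each page
+ * is then unlocked.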
+ */ +static void +segmap_unlock( + struct hat *hat, + struct seg *seg, + caddr_t addr, + size_t len, + enum seg_rw rw, + struct smap *smp) +{ + page_t *pp; + caddr_t adr; + u_offset_t off; + struct vnode *vp; + kmutex_t *smtx; + + ASSERT(smp->sm_refcnt > 0); + +#ifdef lint + seg = seg; +#endif + + if (segmap_kpm && IS_KPM_ADDR(addr)) { + + /* + * We're called only from segmap_fault and this was a + * NOP in case of a kpm based smap, so dangerous things + * must have happened in the meantime. Pages are prefaulted + * and locked in segmap_getmapflt and they will not be + * unlocked until segmap_release. + */ + panic("segmap_unlock: called with kpm addr %p", (void *)addr); + /*NOTREACHED*/ + } + + vp = smp->sm_vp; + off = smp->sm_off + (u_offset_t)((uintptr_t)addr & MAXBOFFSET); + + hat_unlock(hat, addr, P2ROUNDUP(len, PAGESIZE)); + for (adr = addr; adr < addr + len; adr += PAGESIZE, off += PAGESIZE) { + ushort_t bitmask; + + /* + * Use page_find() instead of page_lookup() to + * find the page since we know that it has + * "shared" lock. + */ + pp = page_find(vp, off); + if (pp == NULL) { + panic("segmap_unlock: page not found"); + /*NOTREACHED*/ + } + + if (rw == S_WRITE) { + hat_setrefmod(pp); + } else if (rw != S_OTHER) { + TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT, + "segmap_fault:pp %p vp %p offset %llx", + pp, vp, off); + hat_setref(pp); + } + + /* + * Clear bitmap, if the bit corresponding to "off" is set, + * since the page and translation are being unlocked. + */ + bitmask = SMAP_BIT_MASK((off - smp->sm_off) >> PAGESHIFT); + + /* + * Large Files: Following assertion is to verify + * the correctness of the cast to (int) above. + */ + ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX); + smtx = SMAPMTX(smp); + mutex_enter(smtx); + if (smp->sm_bitmap & bitmask) { + smp->sm_bitmap &= ~bitmask; + } + mutex_exit(smtx); + + page_unlock(pp); + } +} + +#define MAXPPB (MAXBSIZE/4096) /* assumes minimum page size of 4k */ + +/* + * This routine is called via a machine specific fault handling + * routine. It is also called by software routines wishing to + * lock or unlock a range of addresses. + * + * Note that this routine expects a page-aligned "addr". + */ +faultcode_t +segmap_fault( + struct hat *hat, + struct seg *seg, + caddr_t addr, + size_t len, + enum fault_type type, + enum seg_rw rw) +{ + struct segmap_data *smd = (struct segmap_data *)seg->s_data; + struct smap *smp; + page_t *pp, **ppp; + struct vnode *vp; + u_offset_t off; + page_t *pl[MAXPPB + 1]; + uint_t prot; + u_offset_t addroff; + caddr_t adr; + int err; + u_offset_t sm_off; + int hat_flag; + + if (segmap_kpm && IS_KPM_ADDR(addr)) { + int newpage; + kmutex_t *smtx; + + /* + * Pages are successfully prefaulted and locked in + * segmap_getmapflt and can't be unlocked until + * segmap_release. No hat mappings have to be locked + * and they also can't be unlocked as long as the + * caller owns an active kpm addr. + */ +#ifndef DEBUG + if (type != F_SOFTUNLOCK) + return (0); +#endif + + if ((smp = get_smap_kpm(addr, NULL)) == NULL) { + panic("segmap_fault: smap not found " + "for addr %p", (void *)addr); + /*NOTREACHED*/ + } + + smtx = SMAPMTX(smp); +#ifdef DEBUG + newpage = smp->sm_flags & SM_KPM_NEWPAGE; + if (newpage) { + cmn_err(CE_WARN, "segmap_fault: newpage? 
smp %p", + (void *)smp); + } + + if (type != F_SOFTUNLOCK) { + mutex_exit(smtx); + return (0); + } +#endif + mutex_exit(smtx); + vp = smp->sm_vp; + sm_off = smp->sm_off; + + if (vp == NULL) + return (FC_MAKE_ERR(EIO)); + + ASSERT(smp->sm_refcnt > 0); + + addroff = (u_offset_t)((uintptr_t)addr & MAXBOFFSET); + if (addroff + len > MAXBSIZE) + panic("segmap_fault: endaddr %p exceeds MAXBSIZE chunk", + (void *)(addr + len)); + + off = sm_off + addroff; + + pp = page_find(vp, off); + + if (pp == NULL) + panic("segmap_fault: softunlock page not found"); + + /* + * Set ref bit also here in case of S_OTHER to avoid the + * overhead of supporting other cases than F_SOFTUNLOCK + * with segkpm. We can do this because the underlying + * pages are locked anyway. + */ + if (rw == S_WRITE) { + hat_setrefmod(pp); + } else { + TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT, + "segmap_fault:pp %p vp %p offset %llx", + pp, vp, off); + hat_setref(pp); + } + + return (0); + } + + smd_cpu[CPU->cpu_seqid].scpu.scpu_fault++; + smp = GET_SMAP(seg, addr); + vp = smp->sm_vp; + sm_off = smp->sm_off; + + if (vp == NULL) + return (FC_MAKE_ERR(EIO)); + + ASSERT(smp->sm_refcnt > 0); + + addroff = (u_offset_t)((uintptr_t)addr & MAXBOFFSET); + if (addroff + len > MAXBSIZE) { + panic("segmap_fault: endaddr %p " + "exceeds MAXBSIZE chunk", (void *)(addr + len)); + /*NOTREACHED*/ + } + off = sm_off + addroff; + + /* + * First handle the easy stuff + */ + if (type == F_SOFTUNLOCK) { + segmap_unlock(hat, seg, addr, len, rw, smp); + return (0); + } + + TRACE_3(TR_FAC_VM, TR_SEGMAP_GETPAGE, + "segmap_getpage:seg %p addr %p vp %p", seg, addr, vp); + err = VOP_GETPAGE(vp, (offset_t)off, len, &prot, pl, MAXBSIZE, + seg, addr, rw, CRED()); + + if (err) + return (FC_MAKE_ERR(err)); + + prot &= smd->smd_prot; + + /* + * Handle all pages returned in the pl[] array. + * This loop is coded on the assumption that if + * there was no error from the VOP_GETPAGE routine, + * that the page list returned will contain all the + * needed pages for the vp from [off..off + len]. + */ + ppp = pl; + while ((pp = *ppp++) != NULL) { + u_offset_t poff; + ASSERT(pp->p_vnode == vp); + hat_flag = HAT_LOAD; + + /* + * Verify that the pages returned are within the range + * of this segmap region. Note that it is theoretically + * possible for pages outside this range to be returned, + * but it is not very likely. If we cannot use the + * page here, just release it and go on to the next one. + */ + if (pp->p_offset < sm_off || + pp->p_offset >= sm_off + MAXBSIZE) { + (void) page_release(pp, 1); + continue; + } + + ASSERT(hat == kas.a_hat); + poff = pp->p_offset; + adr = addr + (poff - off); + if (adr >= addr && adr < addr + len) { + hat_setref(pp); + TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT, + "segmap_fault:pp %p vp %p offset %llx", + pp, vp, poff); + if (type == F_SOFTLOCK) + hat_flag = HAT_LOAD_LOCK; + } + + /* + * Deal with VMODSORT pages here. If we know this is a write + * do the setmod now and allow write protection. + * As long as it's modified or not S_OTHER, remove write + * protection. With S_OTHER it's up to the FS to deal with this. + */ + if (IS_VMODSORT(vp)) { + if (rw == S_WRITE) + hat_setmod(pp); + else if (rw != S_OTHER && !hat_ismod(pp)) + prot &= ~PROT_WRITE; + } + + hat_memload(hat, adr, pp, prot, hat_flag); + if (hat_flag != HAT_LOAD_LOCK) + page_unlock(pp); + } + return (0); +} + +/* + * This routine is used to start I/O on pages asynchronously. 
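+ * (VOP_GETPAGE is called with a NULL page list and a zero plsz, so the
+ * file system starts the read and returns without waiting for it --
+ * which is what makes segmap_faulta asynchronous.)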
+ */ +static faultcode_t +segmap_faulta(struct seg *seg, caddr_t addr) +{ + struct smap *smp; + struct vnode *vp; + u_offset_t off; + int err; + + if (segmap_kpm && IS_KPM_ADDR(addr)) { + int newpage; + kmutex_t *smtx; + + /* + * Pages are successfully prefaulted and locked in + * segmap_getmapflt and can't be unlocked until + * segmap_release. No hat mappings have to be locked + * and they also can't be unlocked as long as the + * caller owns an active kpm addr. + */ +#ifdef DEBUG + if ((smp = get_smap_kpm(addr, NULL)) == NULL) { + panic("segmap_faulta: smap not found " + "for addr %p", (void *)addr); + /*NOTREACHED*/ + } + + smtx = SMAPMTX(smp); + newpage = smp->sm_flags & SM_KPM_NEWPAGE; + mutex_exit(smtx); + if (newpage) + cmn_err(CE_WARN, "segmap_faulta: newpage? smp %p", + (void *)smp); +#endif + return (0); + } + + segmapcnt.smp_faulta.value.ul++; + smp = GET_SMAP(seg, addr); + + ASSERT(smp->sm_refcnt > 0); + + vp = smp->sm_vp; + off = smp->sm_off; + + if (vp == NULL) { + cmn_err(CE_WARN, "segmap_faulta - no vp"); + return (FC_MAKE_ERR(EIO)); + } + + TRACE_3(TR_FAC_VM, TR_SEGMAP_GETPAGE, + "segmap_getpage:seg %p addr %p vp %p", seg, addr, vp); + + err = VOP_GETPAGE(vp, (offset_t)(off + ((offset_t)((uintptr_t)addr + & MAXBOFFSET))), PAGESIZE, (uint_t *)NULL, (page_t **)NULL, 0, + seg, addr, S_READ, CRED()); + + if (err) + return (FC_MAKE_ERR(err)); + return (0); +} + +/*ARGSUSED*/ +static int +segmap_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) +{ + struct segmap_data *smd = (struct segmap_data *)seg->s_data; + + ASSERT(seg->s_as && RW_LOCK_HELD(&seg->s_as->a_lock)); + + /* + * Need not acquire the segment lock since + * "smd_prot" is a read-only field. + */ + return (((smd->smd_prot & prot) != prot) ? EACCES : 0); +} + +static int +segmap_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv) +{ + struct segmap_data *smd = (struct segmap_data *)seg->s_data; + size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + if (pgno != 0) { + do + protv[--pgno] = smd->smd_prot; + while (pgno != 0); + } + return (0); +} + +static u_offset_t +segmap_getoffset(struct seg *seg, caddr_t addr) +{ + struct segmap_data *smd = (struct segmap_data *)seg->s_data; + + ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock)); + + return ((u_offset_t)smd->smd_sm->sm_off + (addr - seg->s_base)); +} + +/*ARGSUSED*/ +static int +segmap_gettype(struct seg *seg, caddr_t addr) +{ + ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock)); + + return (MAP_SHARED); +} + +/*ARGSUSED*/ +static int +segmap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp) +{ + struct segmap_data *smd = (struct segmap_data *)seg->s_data; + + ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock)); + + /* XXX - This doesn't make any sense */ + *vpp = smd->smd_sm->sm_vp; + return (0); +} + +/* + * Check to see if it makes sense to do kluster/read ahead to + * addr + delta relative to the mapping at addr. We assume here + * that delta is a signed PAGESIZE'd multiple (which can be negative). + * + * For segmap we always "approve" of this action from our standpoint. + */ +/*ARGSUSED*/ +static int +segmap_kluster(struct seg *seg, caddr_t addr, ssize_t delta) +{ + return (0); +} + +static void +segmap_badop() +{ + panic("segmap_badop"); + /*NOTREACHED*/ +} + +/* + * Special private segmap operations + */ + +/* + * Add smap to the appropriate free list. 
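+ * (The slot goes on the tail of the current release queue; if another
+ * thread is sleeping in get_free_smp waiting for a free slot, it is
+ * woken through the sm_want / sm_free_cv handshake.)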
+ */ +static void +segmap_smapadd(struct smap *smp) +{ + struct smfree *sm; + struct smap *smpfreelist; + struct sm_freeq *releq; + + ASSERT(MUTEX_HELD(SMAPMTX(smp))); + + if (smp->sm_refcnt != 0) { + panic("segmap_smapadd"); + /*NOTREACHED*/ + } + + sm = &smd_free[smp->sm_free_ndx]; + /* + * Add to the tail of the release queue + * Note that sm_releq and sm_allocq could toggle + * before we get the lock. This does not affect + * correctness as the 2 queues are only maintained + * to reduce lock pressure. + */ + releq = sm->sm_releq; + if (releq == &sm->sm_freeq[0]) + smp->sm_flags |= SM_QNDX_ZERO; + else + smp->sm_flags &= ~SM_QNDX_ZERO; + mutex_enter(&releq->smq_mtx); + smpfreelist = releq->smq_free; + if (smpfreelist == 0) { + int want; + + releq->smq_free = smp->sm_next = smp->sm_prev = smp; + /* + * Both queue mutexes held to set sm_want; + * snapshot the value before dropping releq mutex. + * If sm_want appears after the releq mutex is dropped, + * then the smap just freed is already gone. + */ + want = sm->sm_want; + mutex_exit(&releq->smq_mtx); + /* + * See if there was a waiter before dropping the releq mutex + * then recheck after obtaining sm_freeq[0] mutex as + * the another thread may have already signaled. + */ + if (want) { + mutex_enter(&sm->sm_freeq[0].smq_mtx); + if (sm->sm_want) + cv_signal(&sm->sm_free_cv); + mutex_exit(&sm->sm_freeq[0].smq_mtx); + } + } else { + smp->sm_next = smpfreelist; + smp->sm_prev = smpfreelist->sm_prev; + smpfreelist->sm_prev = smp; + smp->sm_prev->sm_next = smp; + mutex_exit(&releq->smq_mtx); + } +} + + +static struct smap * +segmap_hashin(struct smap *smp, struct vnode *vp, u_offset_t off, int hashid) +{ + struct smap **hpp; + struct smap *tmp; + kmutex_t *hmtx; + + ASSERT(MUTEX_HELD(SMAPMTX(smp))); + ASSERT(smp->sm_vp == NULL); + ASSERT(smp->sm_hash == NULL); + ASSERT(smp->sm_prev == NULL); + ASSERT(smp->sm_next == NULL); + ASSERT(hashid >= 0 && hashid <= smd_hashmsk); + + hmtx = SHASHMTX(hashid); + + mutex_enter(hmtx); + /* + * First we need to verify that no one has created a smp + * with (vp,off) as its tag before we us. + */ + for (tmp = smd_hash[hashid].sh_hash_list; + tmp != NULL; tmp = tmp->sm_hash) + if (tmp->sm_vp == vp && tmp->sm_off == off) + break; + + if (tmp == NULL) { + /* + * No one created one yet. + * + * Funniness here - we don't increment the ref count on the + * vnode * even though we have another pointer to it here. + * The reason for this is that we don't want the fact that + * a seg_map entry somewhere refers to a vnode to prevent the + * vnode * itself from going away. This is because this + * reference to the vnode is a "soft one". In the case where + * a mapping is being used by a rdwr [or directory routine?] + * there already has to be a non-zero ref count on the vnode. + * In the case where the vp has been freed and the the smap + * structure is on the free list, there are no pages in memory + * that can refer to the vnode. Thus even if we reuse the same + * vnode/smap structure for a vnode which has the same + * address but represents a different object, we are ok. 
+ */ + smp->sm_vp = vp; + smp->sm_off = off; + + hpp = &smd_hash[hashid].sh_hash_list; + smp->sm_hash = *hpp; + *hpp = smp; +#ifdef SEGMAP_HASHSTATS + smd_hash_len[hashid]++; +#endif + } + mutex_exit(hmtx); + + return (tmp); +} + +static void +segmap_hashout(struct smap *smp) +{ + struct smap **hpp, *hp; + struct vnode *vp; + kmutex_t *mtx; + int hashid; + u_offset_t off; + + ASSERT(MUTEX_HELD(SMAPMTX(smp))); + + vp = smp->sm_vp; + off = smp->sm_off; + + SMAP_HASHFUNC(vp, off, hashid); /* macro assigns hashid */ + mtx = SHASHMTX(hashid); + mutex_enter(mtx); + + hpp = &smd_hash[hashid].sh_hash_list; + for (;;) { + hp = *hpp; + if (hp == NULL) { + panic("segmap_hashout"); + /*NOTREACHED*/ + } + if (hp == smp) + break; + hpp = &hp->sm_hash; + } + + *hpp = smp->sm_hash; + smp->sm_hash = NULL; +#ifdef SEGMAP_HASHSTATS + smd_hash_len[hashid]--; +#endif + mutex_exit(mtx); + + smp->sm_vp = NULL; + smp->sm_off = (u_offset_t)0; + +} + +/* + * Attempt to free unmodified, unmapped, and non locked segmap + * pages. + */ +void +segmap_pagefree(struct vnode *vp, u_offset_t off) +{ + u_offset_t pgoff; + page_t *pp; + + for (pgoff = off; pgoff < off + MAXBSIZE; pgoff += PAGESIZE) { + + if ((pp = page_lookup_nowait(vp, pgoff, SE_EXCL)) == NULL) + continue; + + switch (page_release(pp, 1)) { + case PGREL_NOTREL: + segmapcnt.smp_free_notfree.value.ul++; + break; + case PGREL_MOD: + segmapcnt.smp_free_dirty.value.ul++; + break; + case PGREL_CLEAN: + segmapcnt.smp_free.value.ul++; + break; + } + } +} + +/* + * Locks held on entry: smap lock + * Locks held on exit : smap lock. + */ + +static void +grab_smp(struct smap *smp, page_t *pp) +{ + ASSERT(MUTEX_HELD(SMAPMTX(smp))); + ASSERT(smp->sm_refcnt == 0); + + if (smp->sm_vp != (struct vnode *)NULL) { + struct vnode *vp = smp->sm_vp; + u_offset_t off = smp->sm_off; + /* + * Destroy old vnode association and + * unload any hardware translations to + * the old object. + */ + smd_cpu[CPU->cpu_seqid].scpu.scpu_get_reuse++; + segmap_hashout(smp); + + /* + * This node is off freelist and hashlist, + * so there is no reason to drop/reacquire sm_mtx + * across calls to hat_unload. + */ + if (segmap_kpm) { + caddr_t vaddr; + int hat_unload_needed = 0; + + /* + * unload kpm mapping + */ + if (pp != NULL) { + vaddr = hat_kpm_page2va(pp, 1); + hat_kpm_mapout(pp, GET_KPME(smp), vaddr); + page_unlock(pp); + } + + /* + * Check if we have (also) the rare case of a + * non kpm mapping. + */ + if (smp->sm_flags & SM_NOTKPM_RELEASED) { + hat_unload_needed = 1; + smp->sm_flags &= ~SM_NOTKPM_RELEASED; + } + + if (hat_unload_needed) { + hat_unload(kas.a_hat, segkmap->s_base + + ((smp - smd_smap) * MAXBSIZE), + MAXBSIZE, HAT_UNLOAD); + } + + } else { + ASSERT(smp->sm_flags & SM_NOTKPM_RELEASED); + smp->sm_flags &= ~SM_NOTKPM_RELEASED; + hat_unload(kas.a_hat, segkmap->s_base + + ((smp - smd_smap) * MAXBSIZE), + MAXBSIZE, HAT_UNLOAD); + } + segmap_pagefree(vp, off); + } +} + +static struct smap * +get_free_smp(int free_ndx) +{ + struct smfree *sm; + kmutex_t *smtx; + struct smap *smp, *first; + struct sm_freeq *allocq, *releq; + struct kpme *kpme; + page_t *pp = NULL; + int end_ndx, page_locked = 0; + + end_ndx = free_ndx; + sm = &smd_free[free_ndx]; + +retry_queue: + allocq = sm->sm_allocq; + mutex_enter(&allocq->smq_mtx); + + if ((smp = allocq->smq_free) == NULL) { + +skip_queue: + /* + * The alloc list is empty or this queue is being skipped; + * first see if the allocq toggled. 
+ */ + if (sm->sm_allocq != allocq) { + /* queue changed */ + mutex_exit(&allocq->smq_mtx); + goto retry_queue; + } + releq = sm->sm_releq; + if (!mutex_tryenter(&releq->smq_mtx)) { + /* cannot get releq; a free smp may be there now */ + mutex_exit(&allocq->smq_mtx); + + /* + * This loop could spin forever if this thread has + * higher priority than the thread that is holding + * releq->smq_mtx. In order to force the other thread + * to run, we'll lock/unlock the mutex which is safe + * since we just unlocked the allocq mutex. + */ + mutex_enter(&releq->smq_mtx); + mutex_exit(&releq->smq_mtx); + goto retry_queue; + } + if (releq->smq_free == NULL) { + /* + * This freelist is empty. + * This should not happen unless clients + * are failing to release the segmap + * window after accessing the data. + * Before resorting to sleeping, try + * the next list of the same color. + */ + free_ndx = (free_ndx + smd_ncolor) & smd_freemsk; + if (free_ndx != end_ndx) { + mutex_exit(&releq->smq_mtx); + mutex_exit(&allocq->smq_mtx); + sm = &smd_free[free_ndx]; + goto retry_queue; + } + /* + * Tried all freelists of the same color once, + * wait on this list and hope something gets freed. + */ + segmapcnt.smp_get_nofree.value.ul++; + sm->sm_want++; + mutex_exit(&sm->sm_freeq[1].smq_mtx); + cv_wait(&sm->sm_free_cv, + &sm->sm_freeq[0].smq_mtx); + sm->sm_want--; + mutex_exit(&sm->sm_freeq[0].smq_mtx); + sm = &smd_free[free_ndx]; + goto retry_queue; + } else { + /* + * Something on the rele queue; flip the alloc + * and rele queues and retry. + */ + sm->sm_allocq = releq; + sm->sm_releq = allocq; + mutex_exit(&allocq->smq_mtx); + mutex_exit(&releq->smq_mtx); + if (page_locked) { + delay(hz >> 2); + page_locked = 0; + } + goto retry_queue; + } + } else { + /* + * Fastpath the case we get the smap mutex + * on the first try. + */ + first = smp; +next_smap: + smtx = SMAPMTX(smp); + if (!mutex_tryenter(smtx)) { + /* + * Another thread is trying to reclaim this slot. + * Skip to the next queue or smap. + */ + if ((smp = smp->sm_next) == first) { + goto skip_queue; + } else { + goto next_smap; + } + } else { + /* + * if kpme exists, get shared lock on the page + */ + if (segmap_kpm && smp->sm_vp != NULL) { + + kpme = GET_KPME(smp); + pp = kpme->kpe_page; + + if (pp != NULL) { + if (!page_trylock(pp, SE_SHARED)) { + smp = smp->sm_next; + mutex_exit(smtx); + page_locked = 1; + + pp = NULL; + + if (smp == first) { + goto skip_queue; + } else { + goto next_smap; + } + } else { + if (kpme->kpe_page == NULL) { + page_unlock(pp); + pp = NULL; + } + } + } + } + + /* + * At this point, we've selected smp. Remove smp + * from its freelist. If smp is the first one in + * the freelist, update the head of the freelist. + */ + if (first == smp) { + ASSERT(first == allocq->smq_free); + allocq->smq_free = smp->sm_next; + } + + /* + * if the head of the freelist still points to smp, + * then there are no more free smaps in that list. + */ + if (allocq->smq_free == smp) + /* + * Took the last one + */ + allocq->smq_free = NULL; + else { + smp->sm_prev->sm_next = smp->sm_next; + smp->sm_next->sm_prev = smp->sm_prev; + } + mutex_exit(&allocq->smq_mtx); + smp->sm_prev = smp->sm_next = NULL; + + /* + * if pp != NULL, pp must have been locked; + * grab_smp() unlocks pp. + */ + ASSERT((pp == NULL) || PAGE_LOCKED(pp)); + grab_smp(smp, pp); + /* return smp locked. 
*/ + ASSERT(SMAPMTX(smp) == smtx); + ASSERT(MUTEX_HELD(smtx)); + return (smp); + } + } +} + +/* + * Special public segmap operations + */ + +/* + * Create pages (without using VOP_GETPAGE) and load up tranlations to them. + * If softlock is TRUE, then set things up so that it looks like a call + * to segmap_fault with F_SOFTLOCK. + * + * Returns 1, if a page is created by calling page_create_va(), or 0 otherwise. + * + * All fields in the generic segment (struct seg) are considered to be + * read-only for "segmap" even though the kernel address space (kas) may + * not be locked, hence no lock is needed to access them. + */ +int +segmap_pagecreate(struct seg *seg, caddr_t addr, size_t len, int softlock) +{ + struct segmap_data *smd = (struct segmap_data *)seg->s_data; + page_t *pp; + u_offset_t off; + struct smap *smp; + struct vnode *vp; + caddr_t eaddr; + int newpage = 0; + uint_t prot; + kmutex_t *smtx; + int hat_flag; + + ASSERT(seg->s_as == &kas); + + if (segmap_kpm && IS_KPM_ADDR(addr)) { + /* + * Pages are successfully prefaulted and locked in + * segmap_getmapflt and can't be unlocked until + * segmap_release. The SM_KPM_NEWPAGE flag is set + * in segmap_pagecreate_kpm when new pages are created. + * and it is returned as "newpage" indication here. + */ + if ((smp = get_smap_kpm(addr, NULL)) == NULL) { + panic("segmap_pagecreate: smap not found " + "for addr %p", (void *)addr); + /*NOTREACHED*/ + } + + smtx = SMAPMTX(smp); + newpage = smp->sm_flags & SM_KPM_NEWPAGE; + smp->sm_flags &= ~SM_KPM_NEWPAGE; + mutex_exit(smtx); + + return (newpage); + } + + smd_cpu[CPU->cpu_seqid].scpu.scpu_pagecreate++; + + eaddr = addr + len; + addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); + + smp = GET_SMAP(seg, addr); + + /* + * We don't grab smp mutex here since we assume the smp + * has a refcnt set already which prevents the slot from + * changing its id. + */ + ASSERT(smp->sm_refcnt > 0); + + vp = smp->sm_vp; + off = smp->sm_off + ((u_offset_t)((uintptr_t)addr & MAXBOFFSET)); + prot = smd->smd_prot; + + for (; addr < eaddr; addr += PAGESIZE, off += PAGESIZE) { + hat_flag = HAT_LOAD; + pp = page_lookup(vp, off, SE_SHARED); + if (pp == NULL) { + ushort_t bitindex; + + if ((pp = page_create_va(vp, off, + PAGESIZE, PG_WAIT, seg, addr)) == NULL) { + panic("segmap_pagecreate: page_create failed"); + /*NOTREACHED*/ + } + newpage = 1; + page_io_unlock(pp); + + /* + * Since pages created here do not contain valid + * data until the caller writes into them, the + * "exclusive" lock will not be dropped to prevent + * other users from accessing the page. We also + * have to lock the translation to prevent a fault + * from occuring when the virtual address mapped by + * this page is written into. This is necessary to + * avoid a deadlock since we haven't dropped the + * "exclusive" lock. + */ + bitindex = (ushort_t)((off - smp->sm_off) >> PAGESHIFT); + + /* + * Large Files: The following assertion is to + * verify the cast above. 
+ */ + ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX); + smtx = SMAPMTX(smp); + mutex_enter(smtx); + smp->sm_bitmap |= SMAP_BIT_MASK(bitindex); + mutex_exit(smtx); + + hat_flag = HAT_LOAD_LOCK; + } else if (softlock) { + hat_flag = HAT_LOAD_LOCK; + } + + if (IS_VMODSORT(pp->p_vnode) && (prot & PROT_WRITE)) + hat_setmod(pp); + + hat_memload(kas.a_hat, addr, pp, prot, hat_flag); + + if (hat_flag != HAT_LOAD_LOCK) + page_unlock(pp); + + TRACE_5(TR_FAC_VM, TR_SEGMAP_PAGECREATE, + "segmap_pagecreate:seg %p addr %p pp %p vp %p offset %llx", + seg, addr, pp, vp, off); + } + + return (newpage); +} + +void +segmap_pageunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw) +{ + struct smap *smp; + ushort_t bitmask; + page_t *pp; + struct vnode *vp; + u_offset_t off; + caddr_t eaddr; + kmutex_t *smtx; + + ASSERT(seg->s_as == &kas); + + eaddr = addr + len; + addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); + + if (segmap_kpm && IS_KPM_ADDR(addr)) { + /* + * Pages are successfully prefaulted and locked in + * segmap_getmapflt and can't be unlocked until + * segmap_release, so no pages or hat mappings have + * to be unlocked at this point. + */ +#ifdef DEBUG + if ((smp = get_smap_kpm(addr, NULL)) == NULL) { + panic("segmap_pageunlock: smap not found " + "for addr %p", (void *)addr); + /*NOTREACHED*/ + } + + ASSERT(smp->sm_refcnt > 0); + mutex_exit(SMAPMTX(smp)); +#endif + return; + } + + smp = GET_SMAP(seg, addr); + smtx = SMAPMTX(smp); + + ASSERT(smp->sm_refcnt > 0); + + vp = smp->sm_vp; + off = smp->sm_off + ((u_offset_t)((uintptr_t)addr & MAXBOFFSET)); + + for (; addr < eaddr; addr += PAGESIZE, off += PAGESIZE) { + bitmask = SMAP_BIT_MASK((int)(off - smp->sm_off) >> PAGESHIFT); + + /* + * Large Files: Following assertion is to verify + * the correctness of the cast to (int) above. + */ + ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX); + + /* + * If the bit corresponding to "off" is set, + * clear this bit in the bitmap, unlock translations, + * and release the "exclusive" lock on the page. + */ + if (smp->sm_bitmap & bitmask) { + mutex_enter(smtx); + smp->sm_bitmap &= ~bitmask; + mutex_exit(smtx); + + hat_unlock(kas.a_hat, addr, PAGESIZE); + + /* + * Use page_find() instead of page_lookup() to + * find the page since we know that it has + * "exclusive" lock. + */ + pp = page_find(vp, off); + if (pp == NULL) { + panic("segmap_pageunlock: page not found"); + /*NOTREACHED*/ + } + if (rw == S_WRITE) { + hat_setrefmod(pp); + } else if (rw != S_OTHER) { + hat_setref(pp); + } + + page_unlock(pp); + } + } +} + +caddr_t +segmap_getmap(struct seg *seg, struct vnode *vp, u_offset_t off) +{ + return (segmap_getmapflt(seg, vp, off, MAXBSIZE, 0, S_OTHER)); +} + +/* + * This is the magic virtual address that offset 0 of an ELF + * file gets mapped to in user space. This is used to pick + * the vac color on the freelist. + */ +#define ELF_OFFZERO_VA (0x10000) +/* + * segmap_getmap allocates a MAXBSIZE big slot to map the vnode vp + * in the range <off, off + len). off doesn't need to be MAXBSIZE aligned. + * The return address is always MAXBSIZE aligned. + * + * If forcefault is nonzero and the MMU translations haven't yet been created, + * segmap_getmap will call segmap_fault(..., F_INVAL, rw) to create them. 
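+ *
+ * For illustration only (a rough sketch, not a definitive consumer;
+ * the flags passed to segmap_release() differ per file system): a
+ * read path that copies file data out to a uio typically looks like
+ *
+ *	base = segmap_getmapflt(segkmap, vp, uoff, n, 1, S_READ);
+ *	error = uiomove(base + (uoff & MAXBOFFSET), n, UIO_READ, uiop);
+ *	(void) segmap_release(segkmap, base, error ? 0 : SM_FREE);
+ *
+ * i.e. every successful getmap is paired with exactly one release.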
+ */ +caddr_t +segmap_getmapflt( + struct seg *seg, + struct vnode *vp, + u_offset_t off, + size_t len, + int forcefault, + enum seg_rw rw) +{ + struct smap *smp, *nsmp; + extern struct vnode *common_specvp(); + caddr_t baseaddr; /* MAXBSIZE aligned */ + u_offset_t baseoff; + int newslot; + caddr_t vaddr; + int color, hashid; + kmutex_t *hashmtx, *smapmtx; + struct smfree *sm; + page_t *pp; + struct kpme *kpme; + uint_t prot; + caddr_t base; + page_t *pl[MAXPPB + 1]; + int error; + int is_kpm = 1; + + ASSERT(seg->s_as == &kas); + ASSERT(seg == segkmap); + + baseoff = off & (offset_t)MAXBMASK; + if (off + len > baseoff + MAXBSIZE) { + panic("segmap_getmap bad len"); + /*NOTREACHED*/ + } + + /* + * If this is a block device we have to be sure to use the + * "common" block device vnode for the mapping. + */ + if (vp->v_type == VBLK) + vp = common_specvp(vp); + + smd_cpu[CPU->cpu_seqid].scpu.scpu_getmap++; + + if (segmap_kpm == 0 || + (forcefault == SM_PAGECREATE && rw != S_WRITE)) { + is_kpm = 0; + } + + SMAP_HASHFUNC(vp, off, hashid); /* macro assigns hashid */ + hashmtx = SHASHMTX(hashid); + +retry_hash: + mutex_enter(hashmtx); + for (smp = smd_hash[hashid].sh_hash_list; + smp != NULL; smp = smp->sm_hash) + if (smp->sm_vp == vp && smp->sm_off == baseoff) + break; + mutex_exit(hashmtx); + +vrfy_smp: + if (smp != NULL) { + + ASSERT(vp->v_count != 0); + + /* + * Get smap lock and recheck its tag. The hash lock + * is dropped since the hash is based on (vp, off) + * and (vp, off) won't change when we have smap mtx. + */ + smapmtx = SMAPMTX(smp); + mutex_enter(smapmtx); + if (smp->sm_vp != vp || smp->sm_off != baseoff) { + mutex_exit(smapmtx); + goto retry_hash; + } + + if (smp->sm_refcnt == 0) { + + smd_cpu[CPU->cpu_seqid].scpu.scpu_get_reclaim++; + + /* + * Could still be on the free list. However, this + * could also be an smp that is transitioning from + * the free list when we have too much contention + * for the smapmtx's. In this case, we have an + * unlocked smp that is not on the free list any + * longer, but still has a 0 refcnt. The only way + * to be sure is to check the freelist pointers. + * Since we now have the smapmtx, we are guaranteed + * that the (vp, off) won't change, so we are safe + * to reclaim it. get_free_smp() knows that this + * can happen, and it will check the refcnt. + */ + + if ((smp->sm_next != NULL)) { + struct sm_freeq *freeq; + + ASSERT(smp->sm_prev != NULL); + sm = &smd_free[smp->sm_free_ndx]; + + if (smp->sm_flags & SM_QNDX_ZERO) + freeq = &sm->sm_freeq[0]; + else + freeq = &sm->sm_freeq[1]; + + mutex_enter(&freeq->smq_mtx); + if (freeq->smq_free != smp) { + /* + * fastpath normal case + */ + smp->sm_prev->sm_next = smp->sm_next; + smp->sm_next->sm_prev = smp->sm_prev; + } else if (smp == smp->sm_next) { + /* + * Taking the last smap on freelist + */ + freeq->smq_free = NULL; + } else { + /* + * Reclaiming 1st smap on list + */ + freeq->smq_free = smp->sm_next; + smp->sm_prev->sm_next = smp->sm_next; + smp->sm_next->sm_prev = smp->sm_prev; + } + mutex_exit(&freeq->smq_mtx); + smp->sm_prev = smp->sm_next = NULL; + } else { + ASSERT(smp->sm_prev == NULL); + segmapcnt.smp_stolen.value.ul++; + } + + } else { + segmapcnt.smp_get_use.value.ul++; + } + smp->sm_refcnt++; /* another user */ + + /* + * We don't invoke segmap_fault via TLB miss, so we set ref + * and mod bits in advance. For S_OTHER we set them in + * segmap_fault F_SOFTUNLOCK. 
+ */ + if (is_kpm) { + if (rw == S_WRITE) { + smp->sm_flags |= SM_WRITE_DATA; + } else if (rw == S_READ) { + smp->sm_flags |= SM_READ_DATA; + } + } + mutex_exit(smapmtx); + + newslot = 0; + } else { + + uint32_t free_ndx, *free_ndxp; + union segmap_cpu *scpu; + + /* + * On a PAC machine or a machine with anti-alias + * hardware, smd_colormsk will be zero. + * + * On a VAC machine- pick color by offset in the file + * so we won't get VAC conflicts on elf files. + * On data files, color does not matter but we + * don't know what kind of file it is so we always + * pick color by offset. This causes color + * corresponding to file offset zero to be used more + * heavily. + */ + color = (baseoff >> MAXBSHIFT) & smd_colormsk; + scpu = smd_cpu+CPU->cpu_seqid; + free_ndxp = &scpu->scpu.scpu_free_ndx[color]; + free_ndx = (*free_ndxp += smd_ncolor) & smd_freemsk; +#ifdef DEBUG + colors_used[free_ndx]++; +#endif /* DEBUG */ + + /* + * Get a locked smp slot from the free list. + */ + smp = get_free_smp(free_ndx); + smapmtx = SMAPMTX(smp); + + ASSERT(smp->sm_vp == NULL); + + if ((nsmp = segmap_hashin(smp, vp, baseoff, hashid)) != NULL) { + /* + * Failed to hashin, there exists one now. + * Return the smp we just allocated. + */ + segmap_smapadd(smp); + mutex_exit(smapmtx); + + smp = nsmp; + goto vrfy_smp; + } + smp->sm_refcnt++; /* another user */ + + /* + * We don't invoke segmap_fault via TLB miss, so we set ref + * and mod bits in advance. For S_OTHER we set them in + * segmap_fault F_SOFTUNLOCK. + */ + if (is_kpm) { + if (rw == S_WRITE) { + smp->sm_flags |= SM_WRITE_DATA; + } else if (rw == S_READ) { + smp->sm_flags |= SM_READ_DATA; + } + } + mutex_exit(smapmtx); + + newslot = 1; + } + + if (!is_kpm) + goto use_segmap_range; + + /* + * Use segkpm + */ + ASSERT(PAGESIZE == MAXBSIZE); + + /* + * remember the last smp faulted on this cpu. + */ + (smd_cpu+CPU->cpu_seqid)->scpu.scpu_last_smap = smp; + + if (forcefault == SM_PAGECREATE) { + baseaddr = segmap_pagecreate_kpm(seg, vp, baseoff, smp, rw); + return (baseaddr); + } + + if (newslot == 0 && + (pp = GET_KPME(smp)->kpe_page) != NULL) { + + /* fastpath */ + switch (rw) { + case S_READ: + case S_WRITE: + if (page_trylock(pp, SE_SHARED)) { + if (PP_ISFREE(pp) || + !(pp->p_vnode == vp && + pp->p_offset == baseoff)) { + page_unlock(pp); + pp = page_lookup(vp, baseoff, + SE_SHARED); + } + } else { + pp = page_lookup(vp, baseoff, SE_SHARED); + } + + if (pp == NULL) { + ASSERT(GET_KPME(smp)->kpe_page == NULL); + break; + } + + if (rw == S_WRITE && + hat_page_getattr(pp, P_MOD | P_REF) != + (P_MOD | P_REF)) { + page_unlock(pp); + break; + } + + /* + * We have the p_selock as reader, grab_smp + * can't hit us, we have bumped the smap + * refcnt and hat_pageunload needs the + * p_selock exclusive. + */ + kpme = GET_KPME(smp); + if (kpme->kpe_page == pp) { + baseaddr = hat_kpm_page2va(pp, 0); + } else if (kpme->kpe_page == NULL) { + baseaddr = hat_kpm_mapin(pp, kpme); + } else { + panic("segmap_getmapflt: stale " + "kpme page, kpme %p", (void *)kpme); + /*NOTREACHED*/ + } + + /* + * We don't invoke segmap_fault via TLB miss, + * so we set ref and mod bits in advance. + * For S_OTHER and we set them in segmap_fault + * F_SOFTUNLOCK. 
+ */ + if (rw == S_READ && !hat_isref(pp)) + hat_setref(pp); + + return (baseaddr); + default: + break; + } + } + + base = segkpm_create_va(baseoff); + error = VOP_GETPAGE(vp, (offset_t)baseoff, len, &prot, pl, MAXBSIZE, + seg, base, rw, CRED()); + + pp = pl[0]; + if (error || pp == NULL) { + /* + * Use segmap address slot and let segmap_fault deal + * with the error cases. There is no error return + * possible here. + */ + goto use_segmap_range; + } + + ASSERT(pl[1] == NULL); + + /* + * When prot is not returned w/ PROT_ALL the returned pages + * are not backed by fs blocks. For most of the segmap users + * this is no problem, they don't write to the pages in the + * same request and therefore don't rely on a following + * trap driven segmap_fault. With SM_LOCKPROTO users it + * is more secure to use segkmap adresses to allow + * protection segmap_fault's. + */ + if (prot != PROT_ALL && forcefault == SM_LOCKPROTO) { + /* + * Use segmap address slot and let segmap_fault + * do the error return. + */ + ASSERT(rw != S_WRITE); + ASSERT(PAGE_LOCKED(pp)); + page_unlock(pp); + forcefault = 0; + goto use_segmap_range; + } + + /* + * We have the p_selock as reader, grab_smp can't hit us, we + * have bumped the smap refcnt and hat_pageunload needs the + * p_selock exclusive. + */ + kpme = GET_KPME(smp); + if (kpme->kpe_page == pp) { + baseaddr = hat_kpm_page2va(pp, 0); + } else if (kpme->kpe_page == NULL) { + baseaddr = hat_kpm_mapin(pp, kpme); + } else { + panic("segmap_getmapflt: stale kpme page after " + "VOP_GETPAGE, kpme %p", (void *)kpme); + /*NOTREACHED*/ + } + + smd_cpu[CPU->cpu_seqid].scpu.scpu_fault++; + + return (baseaddr); + + +use_segmap_range: + baseaddr = seg->s_base + ((smp - smd_smap) * MAXBSIZE); + TRACE_4(TR_FAC_VM, TR_SEGMAP_GETMAP, + "segmap_getmap:seg %p addr %p vp %p offset %llx", + seg, baseaddr, vp, baseoff); + + /* + * Prefault the translations + */ + vaddr = baseaddr + (off - baseoff); + if (forcefault && (newslot || !hat_probe(kas.a_hat, vaddr))) { + + caddr_t pgaddr = (caddr_t)((uintptr_t)vaddr & + (uintptr_t)PAGEMASK); + + (void) segmap_fault(kas.a_hat, seg, pgaddr, + (vaddr + len - pgaddr + PAGESIZE - 1) & (uintptr_t)PAGEMASK, + F_INVAL, rw); + } + + return (baseaddr); +} + +int +segmap_release(struct seg *seg, caddr_t addr, uint_t flags) +{ + struct smap *smp; + int error; + int bflags = 0; + struct vnode *vp; + u_offset_t offset; + kmutex_t *smtx; + int is_kpm = 0; + page_t *pp; + + if (segmap_kpm && IS_KPM_ADDR(addr)) { + + if (((uintptr_t)addr & MAXBOFFSET) != 0) { + panic("segmap_release: addr %p not " + "MAXBSIZE aligned", (void *)addr); + /*NOTREACHED*/ + } + + if ((smp = get_smap_kpm(addr, &pp)) == NULL) { + panic("segmap_release: smap not found " + "for addr %p", (void *)addr); + /*NOTREACHED*/ + } + + TRACE_3(TR_FAC_VM, TR_SEGMAP_RELMAP, + "segmap_relmap:seg %p addr %p smp %p", + seg, addr, smp); + + smtx = SMAPMTX(smp); + + /* + * For compatibilty reasons segmap_pagecreate_kpm sets this + * flag to allow a following segmap_pagecreate to return + * this as "newpage" flag. When segmap_pagecreate is not + * called at all we clear it now. 
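+ *
+ * In other words the flag never outlives one getmap/release cycle
+ * of the slot:
+ *
+ *	segmap_pagecreate_kpm()	sets SM_KPM_NEWPAGE
+ *	segmap_pagecreate()	consumes it and reports it as its
+ *				"newpage" return value
+ *	segmap_release()	clears it here when segmap_pagecreate()
+ *				was skipped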
+ */ + smp->sm_flags &= ~SM_KPM_NEWPAGE; + is_kpm = 1; + if (smp->sm_flags & SM_WRITE_DATA) { + hat_setrefmod(pp); + } else if (smp->sm_flags & SM_READ_DATA) { + hat_setref(pp); + } + } else { + if (addr < seg->s_base || addr >= seg->s_base + seg->s_size || + ((uintptr_t)addr & MAXBOFFSET) != 0) { + panic("segmap_release: bad addr %p", (void *)addr); + /*NOTREACHED*/ + } + smp = GET_SMAP(seg, addr); + + TRACE_3(TR_FAC_VM, TR_SEGMAP_RELMAP, + "segmap_relmap:seg %p addr %p smp %p", + seg, addr, smp); + + smtx = SMAPMTX(smp); + mutex_enter(smtx); + smp->sm_flags |= SM_NOTKPM_RELEASED; + } + + ASSERT(smp->sm_refcnt > 0); + + /* + * Need to call VOP_PUTPAGE() if any flags (except SM_DONTNEED) + * are set. + */ + if ((flags & ~SM_DONTNEED) != 0) { + if (flags & SM_WRITE) + segmapcnt.smp_rel_write.value.ul++; + if (flags & SM_ASYNC) { + bflags |= B_ASYNC; + segmapcnt.smp_rel_async.value.ul++; + } + if (flags & SM_INVAL) { + bflags |= B_INVAL; + segmapcnt.smp_rel_abort.value.ul++; + } + if (flags & SM_DESTROY) { + bflags |= (B_INVAL|B_TRUNC); + segmapcnt.smp_rel_abort.value.ul++; + } + if (smp->sm_refcnt == 1) { + /* + * We only bother doing the FREE and DONTNEED flags + * if no one else is still referencing this mapping. + */ + if (flags & SM_FREE) { + bflags |= B_FREE; + segmapcnt.smp_rel_free.value.ul++; + } + if (flags & SM_DONTNEED) { + bflags |= B_DONTNEED; + segmapcnt.smp_rel_dontneed.value.ul++; + } + } + } else { + smd_cpu[CPU->cpu_seqid].scpu.scpu_release++; + } + + vp = smp->sm_vp; + offset = smp->sm_off; + + if (--smp->sm_refcnt == 0) { + + if (is_kpm) { + smp->sm_flags &= ~(SM_WRITE_DATA | SM_READ_DATA); + } + if (flags & (SM_INVAL|SM_DESTROY)) { + segmap_hashout(smp); /* remove map info */ + if (is_kpm) { + hat_kpm_mapout(pp, GET_KPME(smp), addr); + if (smp->sm_flags & SM_NOTKPM_RELEASED) { + smp->sm_flags &= ~SM_NOTKPM_RELEASED; + hat_unload(kas.a_hat, addr, MAXBSIZE, + HAT_UNLOAD); + } + + } else { + if (segmap_kpm) + segkpm_mapout_validkpme(GET_KPME(smp)); + + smp->sm_flags &= ~SM_NOTKPM_RELEASED; + hat_unload(kas.a_hat, addr, MAXBSIZE, + HAT_UNLOAD); + } + } + segmap_smapadd(smp); /* add to free list */ + } + + mutex_exit(smtx); + + if (is_kpm) + page_unlock(pp); + /* + * Now invoke VOP_PUTPAGE() if any flags (except SM_DONTNEED) + * are set. + */ + if ((flags & ~SM_DONTNEED) != 0) { + error = VOP_PUTPAGE(vp, offset, MAXBSIZE, + bflags, CRED()); + } else { + error = 0; + } + + return (error); +} + +/* + * Dump the pages belonging to this segmap segment. + */ +static void +segmap_dump(struct seg *seg) +{ + struct segmap_data *smd; + struct smap *smp, *smp_end; + page_t *pp; + pfn_t pfn; + u_offset_t off; + caddr_t addr; + + smd = (struct segmap_data *)seg->s_data; + addr = seg->s_base; + for (smp = smd->smd_sm, smp_end = smp + smd->smd_npages; + smp < smp_end; smp++) { + + if (smp->sm_refcnt) { + for (off = 0; off < MAXBSIZE; off += PAGESIZE) { + int we_own_it = 0; + + /* + * If pp == NULL, the page either does + * not exist or is exclusively locked. + * So determine if it exists before + * searching for it. 
+ */ + if ((pp = page_lookup_nowait(smp->sm_vp, + smp->sm_off + off, SE_SHARED))) + we_own_it = 1; + else + pp = page_exists(smp->sm_vp, + smp->sm_off + off); + + if (pp) { + pfn = page_pptonum(pp); + dump_addpage(seg->s_as, + addr + off, pfn); + if (we_own_it) + page_unlock(pp); + } + dump_timeleft = dump_timeout; + } + } + addr += MAXBSIZE; + } +} + +/*ARGSUSED*/ +static int +segmap_pagelock(struct seg *seg, caddr_t addr, size_t len, + struct page ***ppp, enum lock_type type, enum seg_rw rw) +{ + return (ENOTSUP); +} + +static int +segmap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp) +{ + struct segmap_data *smd = (struct segmap_data *)seg->s_data; + + memidp->val[0] = (uintptr_t)smd->smd_sm->sm_vp; + memidp->val[1] = smd->smd_sm->sm_off + (uintptr_t)(addr - seg->s_base); + return (0); +} + +/*ARGSUSED*/ +static lgrp_mem_policy_info_t * +segmap_getpolicy(struct seg *seg, caddr_t addr) +{ + return (NULL); +} + + +#ifdef SEGKPM_SUPPORT + +/* + * segkpm support routines + */ + +static caddr_t +segmap_pagecreate_kpm(struct seg *seg, vnode_t *vp, u_offset_t off, + struct smap *smp, enum seg_rw rw) +{ + caddr_t base; + page_t *pp; + int newpage = 0; + struct kpme *kpme; + + ASSERT(smp->sm_refcnt > 0); + + if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) { + kmutex_t *smtx; + + base = segkpm_create_va(off); + + if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT, + seg, base)) == NULL) { + panic("segmap_pagecreate_kpm: " + "page_create failed"); + /*NOTREACHED*/ + } + + newpage = 1; + page_io_unlock(pp); + ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX); + + /* + * Mark this here until the following segmap_pagecreate + * or segmap_release. + */ + smtx = SMAPMTX(smp); + mutex_enter(smtx); + smp->sm_flags |= SM_KPM_NEWPAGE; + mutex_exit(smtx); + } + + kpme = GET_KPME(smp); + if (!newpage && kpme->kpe_page == pp) + base = hat_kpm_page2va(pp, 0); + else + base = hat_kpm_mapin(pp, kpme); + + /* + * FS code may decide not to call segmap_pagecreate and we + * don't invoke segmap_fault via TLB miss, so we have to set + * ref and mod bits in advance. + */ + if (rw == S_WRITE) { + hat_setrefmod(pp); + } else { + ASSERT(rw == S_READ); + hat_setref(pp); + } + + smd_cpu[CPU->cpu_seqid].scpu.scpu_pagecreate++; + + return (base); +} + +/* + * Find the smap structure corresponding to the + * KPM addr and return it locked. + */ +struct smap * +get_smap_kpm(caddr_t addr, page_t **ppp) +{ + struct smap *smp; + struct vnode *vp; + u_offset_t offset; + caddr_t baseaddr = (caddr_t)((uintptr_t)addr & MAXBMASK); + int hashid; + kmutex_t *hashmtx; + page_t *pp; + union segmap_cpu *scpu; + + pp = hat_kpm_vaddr2page(baseaddr); + + ASSERT(pp && !PP_ISFREE(pp)); + ASSERT(PAGE_LOCKED(pp)); + ASSERT(((uintptr_t)pp->p_offset & MAXBOFFSET) == 0); + + vp = pp->p_vnode; + offset = pp->p_offset; + ASSERT(vp != NULL); + + /* + * Assume the last smap used on this cpu is the one needed. + */ + scpu = smd_cpu+CPU->cpu_seqid; + smp = scpu->scpu.scpu_last_smap; + mutex_enter(&smp->sm_mtx); + if (smp->sm_vp == vp && smp->sm_off == offset) { + ASSERT(smp->sm_refcnt > 0); + } else { + /* + * Assumption wrong, find the smap on the hash chain. 
+ */ + mutex_exit(&smp->sm_mtx); + SMAP_HASHFUNC(vp, offset, hashid); /* macro assigns hashid */ + hashmtx = SHASHMTX(hashid); + + mutex_enter(hashmtx); + smp = smd_hash[hashid].sh_hash_list; + for (; smp != NULL; smp = smp->sm_hash) { + if (smp->sm_vp == vp && smp->sm_off == offset) + break; + } + mutex_exit(hashmtx); + if (smp) { + mutex_enter(&smp->sm_mtx); + ASSERT(smp->sm_vp == vp && smp->sm_off == offset); + } + } + + if (ppp) + *ppp = smp ? pp : NULL; + + return (smp); +} + +#else /* SEGKPM_SUPPORT */ + +/* segkpm stubs */ + +/*ARGSUSED*/ +static caddr_t +segmap_pagecreate_kpm(struct seg *seg, vnode_t *vp, u_offset_t off, + struct smap *smp, enum seg_rw rw) +{ + return (NULL); +} + +/*ARGSUSED*/ +struct smap * +get_smap_kpm(caddr_t addr, page_t **ppp) +{ + return (NULL); +} + +#endif /* SEGKPM_SUPPORT */ diff --git a/usr/src/uts/common/vm/seg_map.h b/usr/src/uts/common/vm/seg_map.h new file mode 100644 index 0000000000..339dabe674 --- /dev/null +++ b/usr/src/uts/common/vm/seg_map.h @@ -0,0 +1,294 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#ifndef _VM_SEG_MAP_H +#define _VM_SEG_MAP_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * When segmap is created it is possible to program its behavior, + * using the create args [needed for performance reasons]. + * Segmap creates n lists of pages. + * For VAC machines, there will be at least one free list + * per color. If more than one free list per color is needed, + * set nfreelist as needed. + * + * For PAC machines, it will be treated as VAC with only one + * color- every page is of the same color. Again, set nfreelist + * to get more than one free list. + */ +struct segmap_crargs { + uint_t prot; + uint_t shmsize; /* shm_alignment for VAC, 0 for PAC. */ + uint_t nfreelist; /* number of freelist per color, >= 1 */ +}; + +#include <vm/kpm.h> + +/* + * Each smap struct represents a MAXBSIZE sized mapping to the + * <sm_vp, sm_off> given in the structure. The location of the + * the structure in the array gives the virtual address of the + * mapping. Structure rearranged for 64bit sm_off. 
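+ *
+ * For example (illustrative arithmetic only): with MAXBSIZE at 8k,
+ * slot i of the smap array covers the kernel virtual range starting at
+ *
+ *	segkmap->s_base + i * MAXBSIZE
+ *
+ * which is the same expression, seg->s_base + ((smp - smd_smap) *
+ * MAXBSIZE), that grab_smp() and segmap_getmapflt() compute in
+ * seg_map.c; the GET_SMAP() macro there performs the inverse
+ * address-to-slot lookup.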
+ */ +struct smap { + kmutex_t sm_mtx; /* protect non-list fields */ + struct vnode *sm_vp; /* vnode pointer (if mapped) */ + struct smap *sm_hash; /* hash pointer */ + struct smap *sm_next; /* next pointer */ + struct smap *sm_prev; /* previous pointer */ + u_offset_t sm_off; /* file offset for mapping */ + ushort_t sm_bitmap; /* bit map for locked translations */ + ushort_t sm_refcnt; /* reference count for uses */ + ushort_t sm_flags; /* smap flags */ + ushort_t sm_free_ndx; /* freelist */ +#ifdef SEGKPM_SUPPORT + struct kpme sm_kpme; /* segkpm */ +#endif +}; + +#ifdef SEGKPM_SUPPORT +#define GET_KPME(smp) (&(smp)->sm_kpme) +#define sm_kpme_next sm_kpme.kpe_next +#define sm_kpme_prev sm_kpme.kpe_prev +#define sm_kpme_page sm_kpme.kpe_page +#else +#define GET_KPME(smp) ((struct kpme *)NULL) +#endif + +/* sm_flags */ +#define SM_KPM_NEWPAGE 0x00000001 /* page created in segmap_getmapft */ +#define SM_NOTKPM_RELEASED 0x00000002 /* released smap not in segkpm mode */ +#define SM_QNDX_ZERO 0x00000004 /* on the index 0 freelist */ +#define SM_READ_DATA 0x00000010 /* page created for read */ +#define SM_WRITE_DATA 0x00000020 /* page created for write */ + +/* + * Multiple smap free lists are maintained so that allocations + * will scale with cpu count. Each free list is made up of 2 queues + * so that allocations and deallocations can proceed concurrently. + * Each queue structure is padded to 64 bytes to avoid false sharing. + */ +#define SM_FREEQ_PAD (64 - sizeof (struct smap *) - sizeof (kmutex_t)) +struct sm_freeq { + struct smap *smq_free; /* points into freelist */ + kmutex_t smq_mtx; /* protects smq_free */ + char smq_pad[SM_FREEQ_PAD]; +}; + +struct smfree { + struct sm_freeq sm_freeq[2]; /* alloc and release queues */ + struct sm_freeq *sm_allocq; /* current allocq */ + struct sm_freeq *sm_releq; /* current releq */ + kcondvar_t sm_free_cv; + ushort_t sm_want; /* someone wants a slot of this color */ +}; + +/* + * Cached smaps are kept on hash chains to enable fast reclaim lookups. + */ +struct smaphash { + kmutex_t sh_mtx; /* protects this hash chain */ + struct smap *sh_hash_list; /* start of hash chain */ +}; + +/* + * (Semi) private data maintained by the segmap driver per SEGMENT mapping + * All fields in segmap_data are read-only after the segment is created. + * + */ + +struct segmap_data { + struct smap *smd_sm; /* array of smap structures */ + long smd_npages; /* size of smap array */ + struct smfree *smd_free; /* ptr to freelist header array */ + struct smaphash *smd_hash; /* ptr to hash header array */ + int smd_nfree; /* number of free lists */ + uchar_t smd_prot; /* protections for all smap's */ +}; + +/* + * Statistics for segmap operations. + * + * No explicit locking to protect these stats. 
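+ * They are updated with plain unlocked increments, e.g.
+ *
+ *	segmapcnt.smp_get_nofree.value.ul++;
+ *
+ * in get_free_smp(), so an occasional lost update is possible and
+ * tolerable for counters that are only statistics.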
+ */
+struct segmapcnt {
+ kstat_named_t smp_fault; /* number of segmap_faults */
+ kstat_named_t smp_faulta; /* number of segmap_faultas */
+ kstat_named_t smp_getmap; /* number of segmap_getmaps */
+ kstat_named_t smp_get_use; /* getmaps that reuse existing map */
+ kstat_named_t smp_get_reclaim; /* getmaps that do a reclaim */
+ kstat_named_t smp_get_reuse; /* getmaps that reuse a slot */
+ kstat_named_t smp_get_unused; /* getmaps that reuse existing map */
+ kstat_named_t smp_get_nofree; /* getmaps with no free slots */
+ kstat_named_t smp_rel_async; /* releases that are async */
+ kstat_named_t smp_rel_write; /* releases that write */
+ kstat_named_t smp_rel_free; /* releases that free */
+ kstat_named_t smp_rel_abort; /* releases that abort */
+ kstat_named_t smp_rel_dontneed; /* releases with dontneed set */
+ kstat_named_t smp_release; /* releases with no other action */
+ kstat_named_t smp_pagecreate; /* pagecreates */
+ kstat_named_t smp_free_notfree; /* pages not freed in */
+ /* segmap_pagefree */
+ kstat_named_t smp_free_dirty; /* dirty pages freed */
+ /* in segmap_pagefree */
+ kstat_named_t smp_free; /* clean pages freed in */
+ /* segmap_pagefree */
+ kstat_named_t smp_stolen; /* segmap_getmapflt() stole */
+ /* from get_free_smp() */
+ kstat_named_t smp_get_nomtx; /* free smaps but no mutex */
+};
+
+/*
+ * These are flags used on release. Some of these might get handled
+ * by segment operations needed for msync (when we figure them out).
+ * SM_ASYNC modifies SM_WRITE. SM_DONTNEED modifies SM_FREE. SM_FREE
+ * and SM_INVAL as well as SM_FREE and SM_DESTROY are mutually exclusive.
+ * SM_DESTROY behaves like SM_INVAL but also forces the pages to be
+ * destroyed -- this prevents them from being written to the backing
+ * store.
+ */
+#define SM_WRITE 0x01 /* write back the pages upon release */
+#define SM_ASYNC 0x02 /* do the write asynchronously */
+#define SM_FREE 0x04 /* put pages back on free list */
+#define SM_INVAL 0x08 /* invalidate page (no caching) */
+#define SM_DONTNEED 0x10 /* less likely to be needed soon */
+#define SM_DESTROY 0x20 /* invalidate page, don't write back */
+
+/*
+ * These are the forcefault flags used on getmapflt.
+ *
+ * The original semantics were extended to allow using the segkpm mapping
+ * scheme w/o a major segmap interface change for MAXBSIZE == PAGESIZE
+ * (which is required to enable segkpm for MAXBSIZE > PAGESIZE).
+ * Most segmap consumers needn't be changed at all or only need to
+ * be changed slightly to take advantage of segkpm. Because the segkpm
+ * virtual address is based on the physical address of a page, a page is
+ * required to determine the virtual address (return value). Pages mapped
+ * with segkpm are always at least read locked and are hence protected
+ * from pageout or fsflush from segmap_getmap until segmap_release. This
+ * implies that the segkpm mappings are locked within this period too.
+ * No trap driven segmap_fault's are possible in segkpm mode.
+ *
+ * The following combinations of "forcefault" and "rw" allow segkpm mode.
+ * (1) SM_FAULT, S_READ
+ * (2) SM_FAULT, S_WRITE
+ * (3) SM_PAGECREATE, S_WRITE
+ * (4) SM_LOCKPROTO, {S_READ, S_WRITE, S_OTHER}
+ *
+ * The regular additional operations (which come in pairs in most cases):
+ * . segmap_pagecreate/segmap_pageunlock
+ * . segmap_fault(F_SOFTLOCK)/segmap_fault(F_SOFTUNLOCK)
+ *
+ * are mostly a no-op in segkpm mode with the following exceptions:
+ * . 
The "newpage" return value of segmap_pagecreate is still supported + * for zeroout operations needed on newly created pages. + * + * . segmap_fault() must follow when a error could be expected in + * the VOP_GETPAGE. In segkpm mode this error is recognized in + * segmap_getmapflt and returned from the following segmap_fault() + * call. The "hole" optimization (read only after first VOP_GETPAGE + * mapping in segmap_getmapflt followed by a trap driven protection + * fault and a second VOP_GETPAGE via segmap_fault) cannot be used. + * + * . segmap_fault(F_SOFTUNLOCK) must follow when segmap_getmapflt was + * called w/ (SM_LOCKPROTO, S_OTHER). S_WRITE has to be applied, when + * the page should be marked "dirty". Otherwise the page is not + * written to the backing store later (as mentioned above, no page + * or protection faults are possible in segkpm mode). Caller cannot + * use only S_OTHER and rely on a protection fault to force the page + * to become dirty. + * + * . The segmap_pagecreate parameter softlock is ignored, pages and + * mappings are locked anyway. + * + * SM_LOCKPROTO is used in the fbio layer and some special segmap consumers. + */ +#define SM_PAGECREATE 0x00 /* create page in segkpm mode, no I/O */ +#define SM_FAULT 0x01 /* fault in page if necessary */ +#define SM_LOCKPROTO 0x02 /* lock/unlock protocol used */ + +#define MAXBSHIFT 13 /* log2(MAXBSIZE) */ + +#define MAXBOFFSET (MAXBSIZE - 1) +#define MAXBMASK (~MAXBOFFSET) + +/* + * SMAP_HASHAVELEN is the average length desired for this chain, from + * which the size of the smd_hash table is derived at segment create time. + * SMAP_HASHVPSHIFT is defined so that 1 << SMAP_HASHVPSHIFT is the + * approximate size of a vnode struct. + */ +#define SMAP_HASHAVELEN 4 +#define SMAP_HASHVPSHIFT 6 + + +#ifdef _KERNEL +/* + * The kernel generic mapping segment. + */ +extern struct seg *segkmap; + +/* + * Public seg_map segment operations. + */ +extern int segmap_create(struct seg *, void *); +extern int segmap_pagecreate(struct seg *, caddr_t, size_t, int); +extern void segmap_pageunlock(struct seg *, caddr_t, size_t, enum seg_rw); +extern faultcode_t segmap_fault(struct hat *, struct seg *, caddr_t, size_t, + enum fault_type, enum seg_rw); +extern caddr_t segmap_getmap(struct seg *, struct vnode *, u_offset_t); +extern caddr_t segmap_getmapflt(struct seg *, struct vnode *, u_offset_t, + size_t, int, enum seg_rw); +extern int segmap_release(struct seg *, caddr_t, uint_t); +extern void segmap_flush(struct seg *, struct vnode *); +extern void segmap_inval(struct seg *, struct vnode *, u_offset_t); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_SEG_MAP_H */ diff --git a/usr/src/uts/common/vm/seg_spt.c b/usr/src/uts/common/vm/seg_spt.c new file mode 100644 index 0000000000..a97719ad5f --- /dev/null +++ b/usr/src/uts/common/vm/seg_spt.c @@ -0,0 +1,2701 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/user.h> +#include <sys/mman.h> +#include <sys/kmem.h> +#include <sys/sysmacros.h> +#include <sys/cmn_err.h> +#include <sys/systm.h> +#include <sys/tuneable.h> +#include <vm/hat.h> +#include <vm/seg.h> +#include <vm/as.h> +#include <vm/anon.h> +#include <vm/page.h> +#include <sys/buf.h> +#include <sys/swap.h> +#include <sys/atomic.h> +#include <vm/seg_spt.h> +#include <sys/debug.h> +#include <sys/vtrace.h> +#include <sys/shm.h> +#include <sys/lgrp.h> +#include <sys/vmsystm.h> + +#include <sys/tnf_probe.h> + +#define SEGSPTADDR (caddr_t)0x0 + +/* + * # pages used for spt + */ +static size_t spt_used; + +/* + * segspt_minfree is the memory left for system after ISM + * locked its pages; it is set up to 5% of availrmem in + * sptcreate when ISM is created. ISM should not use more + * than ~90% of availrmem; if it does, then the performance + * of the system may decrease. Machines with large memories may + * be able to use up more memory for ISM so we set the default + * segspt_minfree to 5% (which gives ISM max 95% of availrmem. + * If somebody wants even more memory for ISM (risking hanging + * the system) they can patch the segspt_minfree to smaller number. + */ +pgcnt_t segspt_minfree = 0; + +static int segspt_create(struct seg *seg, caddr_t argsp); +static int segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize); +static void segspt_free(struct seg *seg); +static void segspt_free_pages(struct seg *seg, caddr_t addr, size_t len); +static lgrp_mem_policy_info_t *segspt_getpolicy(struct seg *seg, caddr_t addr); + +static void +segspt_badop() +{ + panic("segspt_badop called"); + /*NOTREACHED*/ +} + +#define SEGSPT_BADOP(t) (t(*)())segspt_badop + +struct seg_ops segspt_ops = { + SEGSPT_BADOP(int), /* dup */ + segspt_unmap, + segspt_free, + SEGSPT_BADOP(int), /* fault */ + SEGSPT_BADOP(faultcode_t), /* faulta */ + SEGSPT_BADOP(int), /* setprot */ + SEGSPT_BADOP(int), /* checkprot */ + SEGSPT_BADOP(int), /* kluster */ + SEGSPT_BADOP(size_t), /* swapout */ + SEGSPT_BADOP(int), /* sync */ + SEGSPT_BADOP(size_t), /* incore */ + SEGSPT_BADOP(int), /* lockop */ + SEGSPT_BADOP(int), /* getprot */ + SEGSPT_BADOP(u_offset_t), /* getoffset */ + SEGSPT_BADOP(int), /* gettype */ + SEGSPT_BADOP(int), /* getvp */ + SEGSPT_BADOP(int), /* advise */ + SEGSPT_BADOP(void), /* dump */ + SEGSPT_BADOP(int), /* pagelock */ + SEGSPT_BADOP(int), /* setpgsz */ + SEGSPT_BADOP(int), /* getmemid */ + segspt_getpolicy, /* getpolicy */ +}; + +static int segspt_shmdup(struct seg *seg, struct seg *newseg); +static int segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize); +static void segspt_shmfree(struct seg *seg); +static faultcode_t segspt_shmfault(struct hat *hat, struct seg *seg, + caddr_t addr, size_t len, enum fault_type type, enum seg_rw rw); +static faultcode_t segspt_shmfaulta(struct seg *seg, caddr_t addr); +static int segspt_shmsetprot(register struct seg *seg, register caddr_t addr, + register size_t len, register uint_t prot); +static int segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size, + uint_t prot); +static int segspt_shmkluster(struct seg *seg, caddr_t addr, 
ssize_t delta); +static size_t segspt_shmswapout(struct seg *seg); +static size_t segspt_shmincore(struct seg *seg, caddr_t addr, size_t len, + register char *vec); +static int segspt_shmsync(struct seg *seg, register caddr_t addr, size_t len, + int attr, uint_t flags); +static int segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len, + int attr, int op, ulong_t *lockmap, size_t pos); +static int segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len, + uint_t *protv); +static u_offset_t segspt_shmgetoffset(struct seg *seg, caddr_t addr); +static int segspt_shmgettype(struct seg *seg, caddr_t addr); +static int segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp); +static int segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len, + uint_t behav); +static void segspt_shmdump(struct seg *seg); +static int segspt_shmpagelock(struct seg *, caddr_t, size_t, + struct page ***, enum lock_type, enum seg_rw); +static int segspt_shmsetpgsz(struct seg *, caddr_t, size_t, uint_t); +static int segspt_shmgetmemid(struct seg *, caddr_t, memid_t *); +static lgrp_mem_policy_info_t *segspt_shmgetpolicy(struct seg *, caddr_t); + +struct seg_ops segspt_shmops = { + segspt_shmdup, + segspt_shmunmap, + segspt_shmfree, + segspt_shmfault, + segspt_shmfaulta, + segspt_shmsetprot, + segspt_shmcheckprot, + segspt_shmkluster, + segspt_shmswapout, + segspt_shmsync, + segspt_shmincore, + segspt_shmlockop, + segspt_shmgetprot, + segspt_shmgetoffset, + segspt_shmgettype, + segspt_shmgetvp, + segspt_shmadvise, /* advise */ + segspt_shmdump, + segspt_shmpagelock, + segspt_shmsetpgsz, + segspt_shmgetmemid, + segspt_shmgetpolicy, +}; + +static void segspt_purge(struct seg *seg); +static int segspt_reclaim(struct seg *, caddr_t, size_t, struct page **, + enum seg_rw); +static int spt_anon_getpages(struct seg *seg, caddr_t addr, size_t len, + page_t **ppa); + + + +/*ARGSUSED*/ +int +sptcreate(size_t size, struct seg **sptseg, struct anon_map *amp, + uint_t prot, uint_t flags, uint_t share_szc) +{ + int err; + struct as *newas; + struct segspt_crargs sptcargs; + +#ifdef DEBUG + TNF_PROBE_1(sptcreate, "spt", /* CSTYLED */, + tnf_ulong, size, size ); +#endif + if (segspt_minfree == 0) /* leave min 5% of availrmem for */ + segspt_minfree = availrmem/20; /* for the system */ + + if (!hat_supported(HAT_SHARED_PT, (void *)0)) + return (EINVAL); + + /* + * get a new as for this shared memory segment + */ + newas = as_alloc(); + sptcargs.amp = amp; + sptcargs.prot = prot; + sptcargs.flags = flags; + sptcargs.szc = share_szc; + + /* + * create a shared page table (spt) segment + */ + + if (err = as_map(newas, SEGSPTADDR, size, segspt_create, &sptcargs)) { + as_free(newas); + return (err); + } + *sptseg = sptcargs.seg_spt; + return (0); +} + +void +sptdestroy(struct as *as, struct anon_map *amp) +{ + +#ifdef DEBUG + TNF_PROBE_0(sptdestroy, "spt", /* CSTYLED */); +#endif + (void) as_unmap(as, SEGSPTADDR, amp->size); + as_free(as); +} + +/* + * called from seg_free(). + * free (i.e., unlock, unmap, return to free list) + * all the pages in the given seg. 
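+ *
+ * The usual way to get here is the teardown chain
+ *
+ *	sptdestroy() -> as_unmap() -> segspt_unmap() -> seg_free()
+ *
+ * which mirrors the sptcreate()/as_map()/segspt_create() path that
+ * built the segment.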
+ */ +void +segspt_free(struct seg *seg) +{ + struct spt_data *sptd = (struct spt_data *)seg->s_data; + + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); + + if (sptd != NULL) { + if (sptd->spt_realsize) + segspt_free_pages(seg, seg->s_base, sptd->spt_realsize); + + if (sptd->spt_ppa_lckcnt) + kmem_free(sptd->spt_ppa_lckcnt, + sizeof (*sptd->spt_ppa_lckcnt) + * btopr(sptd->spt_amp->size)); + kmem_free(sptd->spt_vp, sizeof (*sptd->spt_vp)); + mutex_destroy(&sptd->spt_lock); + kmem_free(sptd, sizeof (*sptd)); + } +} + +/*ARGSUSED*/ +static int +segspt_shmsync(struct seg *seg, caddr_t addr, size_t len, int attr, + uint_t flags) +{ + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + return (0); +} + +/*ARGSUSED*/ +static size_t +segspt_shmincore(struct seg *seg, caddr_t addr, size_t len, char *vec) +{ + caddr_t eo_seg; + pgcnt_t npages; + struct shm_data *shmd = (struct shm_data *)seg->s_data; + struct seg *sptseg; + struct spt_data *sptd; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); +#ifdef lint + seg = seg; +#endif + sptseg = shmd->shm_sptseg; + sptd = sptseg->s_data; + + if ((sptd->spt_flags & SHM_PAGEABLE) == 0) { + eo_seg = addr + len; + while (addr < eo_seg) { + /* page exists, and it's locked. */ + *vec++ = SEG_PAGE_INCORE | SEG_PAGE_LOCKED | + SEG_PAGE_ANON; + addr += PAGESIZE; + } + return (len); + } else { + struct anon_map *amp = shmd->shm_amp; + struct anon *ap; + page_t *pp; + pgcnt_t anon_index; + struct vnode *vp; + u_offset_t off; + ulong_t i; + int ret; + anon_sync_obj_t cookie; + + addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); + anon_index = seg_page(seg, addr); + npages = btopr(len); + if (anon_index + npages > btopr(shmd->shm_amp->size)) { + return (EINVAL); + } + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + for (i = 0; i < npages; i++, anon_index++) { + ret = 0; + anon_array_enter(amp, anon_index, &cookie); + ap = anon_get_ptr(amp->ahp, anon_index); + if (ap != NULL) { + swap_xlate(ap, &vp, &off); + anon_array_exit(&cookie); + pp = page_lookup_nowait(vp, off, SE_SHARED); + if (pp != NULL) { + ret |= SEG_PAGE_INCORE | SEG_PAGE_ANON; + page_unlock(pp); + } + } else { + anon_array_exit(&cookie); + } + if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) { + ret |= SEG_PAGE_LOCKED; + } + *vec++ = (char)ret; + } + ANON_LOCK_EXIT(&->a_rwlock); + return (len); + } +} + +static int +segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize) +{ + size_t share_size; + + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); + + /* + * seg.s_size may have been rounded up to the largest page size + * in shmat(). + * XXX This should be cleanedup. sptdestroy should take a length + * argument which should be the same as sptcreate. Then + * this rounding would not be needed (or is done in shm.c) + * Only the check for full segment will be needed. + * + * XXX -- shouldn't raddr == 0 always? These tests don't seem + * to be useful at all. 
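+ *
+ * Illustrative arithmetic only: with a 4M shared page size,
+ * page_get_pagesize(seg->s_szc) below returns 4M, so a raw ssize of
+ * 6M is rounded up by P2ROUNDUP() to 8M, and the unmap succeeds only
+ * if that rounded length equals seg->s_size exactly (and raddr equals
+ * seg->s_base).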
+ */ + share_size = page_get_pagesize(seg->s_szc); + ssize = P2ROUNDUP(ssize, share_size); + + if (raddr == seg->s_base && ssize == seg->s_size) { + seg_free(seg); + return (0); + } else + return (EINVAL); +} + +int +segspt_create(struct seg *seg, caddr_t argsp) +{ + int err; + caddr_t addr = seg->s_base; + struct spt_data *sptd; + struct segspt_crargs *sptcargs = (struct segspt_crargs *)argsp; + struct anon_map *amp = sptcargs->amp; + struct cred *cred = CRED(); + ulong_t i, j, anon_index = 0; + pgcnt_t npages = btopr(amp->size); + struct vnode *vp; + page_t **ppa; + uint_t hat_flags; + + /* + * We are holding the a_lock on the underlying dummy as, + * so we can make calls to the HAT layer. + */ + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); + +#ifdef DEBUG + TNF_PROBE_2(segspt_create, "spt", /* CSTYLED */, + tnf_opaque, addr, addr, + tnf_ulong, len, seg->s_size); +#endif + if ((sptcargs->flags & SHM_PAGEABLE) == 0) { + if (err = anon_swap_adjust(npages)) + return (err); + } + err = ENOMEM; + + if ((sptd = kmem_zalloc(sizeof (*sptd), KM_NOSLEEP)) == NULL) + goto out1; + + if ((sptcargs->flags & SHM_PAGEABLE) == 0) { + if ((ppa = kmem_zalloc(((sizeof (page_t *)) * npages), + KM_NOSLEEP)) == NULL) + goto out2; + } + + mutex_init(&sptd->spt_lock, NULL, MUTEX_DEFAULT, NULL); + + if ((vp = kmem_zalloc(sizeof (*vp), KM_NOSLEEP)) == NULL) + goto out3; + + seg->s_ops = &segspt_ops; + sptd->spt_vp = vp; + sptd->spt_amp = amp; + sptd->spt_prot = sptcargs->prot; + sptd->spt_flags = sptcargs->flags; + seg->s_data = (caddr_t)sptd; + sptd->spt_ppa = NULL; + sptd->spt_ppa_lckcnt = NULL; + seg->s_szc = sptcargs->szc; + + ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); + amp->a_szc = seg->s_szc; + ANON_LOCK_EXIT(&->a_rwlock); + + /* + * Set policy to affect initial allocation of pages in + * anon_map_createpages() + */ + (void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, amp, anon_index, + NULL, 0, ptob(npages)); + + if (sptcargs->flags & SHM_PAGEABLE) { + size_t share_sz; + pgcnt_t new_npgs, more_pgs; + struct anon_hdr *nahp; + + share_sz = page_get_pagesize(seg->s_szc); + if (!IS_P2ALIGNED(amp->size, share_sz)) { + /* + * We are rounding up the size of the anon array + * on 4 M boundary because we always create 4 M + * of page(s) when locking, faulting pages and we + * don't have to check for all corner cases e.g. + * if there is enough space to allocate 4 M + * page. 
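+ *
+ * Illustrative arithmetic only: with a 4M share_sz, an amp->size
+ * of 6M gives new_npgs = btop(P2ROUNDUP(6M, 4M)) = btop(8M), and
+ * swap for the extra 2M worth of anon slots is reserved through
+ * anon_resv() just below.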
+ */ + new_npgs = btop(P2ROUNDUP(amp->size, share_sz)); + more_pgs = new_npgs - npages; + + if (anon_resv(ptob(more_pgs)) == 0) { + err = ENOMEM; + goto out4; + } + nahp = anon_create(new_npgs, ANON_SLEEP); + ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); + (void) anon_copy_ptr(amp->ahp, 0, nahp, 0, npages, + ANON_SLEEP); + anon_release(amp->ahp, npages); + amp->ahp = nahp; + amp->swresv = amp->size = ptob(new_npgs); + ANON_LOCK_EXIT(&->a_rwlock); + npages = new_npgs; + } + + sptd->spt_ppa_lckcnt = kmem_zalloc(npages * + sizeof (*sptd->spt_ppa_lckcnt), KM_SLEEP); + sptd->spt_pcachecnt = 0; + sptd->spt_realsize = ptob(npages); + sptcargs->seg_spt = seg; + return (0); + } + + /* + * get array of pages for each anon slot in amp + */ + if ((err = anon_map_createpages(amp, anon_index, ptob(npages), ppa, + seg, addr, S_CREATE, cred)) != 0) + goto out4; + + /* + * addr is initial address corresponding to the first page on ppa list + */ + for (i = 0; i < npages; i++) { + /* attempt to lock all pages */ + if (!page_pp_lock(ppa[i], 0, 1)) { + /* + * if unable to lock any page, unlock all + * of them and return error + */ + for (j = 0; j < i; j++) + page_pp_unlock(ppa[j], 0, 1); + for (i = 0; i < npages; i++) { + page_unlock(ppa[i]); + } + err = ENOMEM; + goto out4; + } + } + + /* + * Some platforms assume that ISM mappings are HAT_LOAD_LOCK + * for the entire life of the segment. For example platforms + * that do not support Dynamic Reconfiguration. + */ + hat_flags = HAT_LOAD_SHARE; + if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, NULL)) + hat_flags |= HAT_LOAD_LOCK; + + hat_memload_array(seg->s_as->a_hat, addr, ptob(npages), + ppa, sptd->spt_prot, hat_flags); + + /* + * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP, + * we will leave the pages locked SE_SHARED for the life + * of the ISM segment. This will prevent any calls to + * hat_pageunload() on this ISM segment for those platforms. + */ + if (!(hat_flags & HAT_LOAD_LOCK)) { + /* + * On platforms that support HAT_DYNAMIC_ISM_UNMAP, + * we no longer need to hold the SE_SHARED lock on the pages, + * since L_PAGELOCK and F_SOFTLOCK calls will grab the + * SE_SHARED lock on the pages as necessary. 
+ */ + for (i = 0; i < npages; i++) + page_unlock(ppa[i]); + } + sptd->spt_pcachecnt = 0; + kmem_free(ppa, ((sizeof (page_t *)) * npages)); + sptd->spt_realsize = ptob(npages); + atomic_add_long(&spt_used, npages); + sptcargs->seg_spt = seg; + return (0); + +out4: + seg->s_data = NULL; + kmem_free(vp, sizeof (*vp)); +out3: + mutex_destroy(&sptd->spt_lock); + if ((sptcargs->flags & SHM_PAGEABLE) == 0) + kmem_free(ppa, (sizeof (*ppa) * npages)); +out2: + kmem_free(sptd, sizeof (*sptd)); +out1: + if ((sptcargs->flags & SHM_PAGEABLE) == 0) + anon_swap_restore(npages); + return (err); +} + +/*ARGSUSED*/ +void +segspt_free_pages(struct seg *seg, caddr_t addr, size_t len) +{ + struct page *pp; + struct spt_data *sptd = (struct spt_data *)seg->s_data; + pgcnt_t npages; + ulong_t anon_idx; + struct anon_map *amp; + struct anon *ap; + struct vnode *vp; + u_offset_t off; + uint_t hat_flags; + int root = 0; + pgcnt_t pgs, curnpgs = 0; + page_t *rootpp; + + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); + + len = P2ROUNDUP(len, PAGESIZE); + + npages = btop(len); + + hat_flags = HAT_UNLOAD_UNLOCK; + if ((hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) || + (sptd->spt_flags & SHM_PAGEABLE)) { + hat_flags = HAT_UNLOAD; + } + + hat_unload(seg->s_as->a_hat, addr, len, hat_flags); + + amp = sptd->spt_amp; + if (sptd->spt_flags & SHM_PAGEABLE) + npages = btop(amp->size); + + ASSERT(amp); + for (anon_idx = 0; anon_idx < npages; anon_idx++) { + if ((sptd->spt_flags & SHM_PAGEABLE) == 0) { + if ((ap = anon_get_ptr(amp->ahp, anon_idx)) == NULL) { + panic("segspt_free_pages: null app"); + /*NOTREACHED*/ + } + } else { + if ((ap = anon_get_next_ptr(amp->ahp, &anon_idx)) + == NULL) + continue; + } + ASSERT(ANON_ISBUSY(anon_get_slot(amp->ahp, anon_idx)) == 0); + swap_xlate(ap, &vp, &off); + + /* + * If this platform supports HAT_DYNAMIC_ISM_UNMAP, + * the pages won't be having SE_SHARED lock at this + * point. + * + * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP, + * the pages are still held SE_SHARED locked from the + * original segspt_create() + * + * Our goal is to get SE_EXCL lock on each page, remove + * permanent lock on it and invalidate the page. + */ + if ((sptd->spt_flags & SHM_PAGEABLE) == 0) { + if (hat_flags == HAT_UNLOAD) + pp = page_lookup(vp, off, SE_EXCL); + else { + if ((pp = page_find(vp, off)) == NULL) { + panic("segspt_free_pages: " + "page not locked"); + /*NOTREACHED*/ + } + if (!page_tryupgrade(pp)) { + page_unlock(pp); + pp = page_lookup(vp, off, SE_EXCL); + } + } + if (pp == NULL) { + panic("segspt_free_pages: " + "page not in the system"); + /*NOTREACHED*/ + } + page_pp_unlock(pp, 0, 1); + } else { + if ((pp = page_lookup(vp, off, SE_EXCL)) == NULL) + continue; + page_pp_unlock(pp, 0, 0); + } + /* + * It's logical to invalidate the pages here as in most cases + * these were created by segspt. + */ + if (pp->p_szc != 0) { + /* + * For DISM swap is released in shm_rm_amp. 
+ */ + if ((sptd->spt_flags & SHM_PAGEABLE) == 0 && + ap->an_pvp != NULL) { + panic("segspt_free_pages: pvp non NULL"); + /*NOTREACHED*/ + } + if (root == 0) { + ASSERT(curnpgs == 0); + root = 1; + rootpp = pp; + pgs = curnpgs = page_get_pagecnt(pp->p_szc); + ASSERT(pgs > 1); + ASSERT(IS_P2ALIGNED(pgs, pgs)); + ASSERT(!(page_pptonum(pp) & (pgs - 1))); + curnpgs--; + } else if ((page_pptonum(pp) & (pgs - 1)) == pgs - 1) { + ASSERT(curnpgs == 1); + ASSERT(page_pptonum(pp) == + page_pptonum(rootpp) + (pgs - 1)); + page_destroy_pages(rootpp); + root = 0; + curnpgs = 0; + } else { + ASSERT(curnpgs > 1); + ASSERT(page_pptonum(pp) == + page_pptonum(rootpp) + (pgs - curnpgs)); + curnpgs--; + } + } else { + if (root != 0 || curnpgs != 0) { + panic("segspt_free_pages: bad large page"); + /*NOTREACHED*/ + } + /*LINTED: constant in conditional context */ + VN_DISPOSE(pp, B_INVAL, 0, kcred); + } + } + + if (root != 0 || curnpgs != 0) { + panic("segspt_free_pages: bad large page"); + /*NOTREACHED*/ + } + + /* + * mark that pages have been released + */ + sptd->spt_realsize = 0; + + if ((sptd->spt_flags & SHM_PAGEABLE) == 0) { + atomic_add_long(&spt_used, -npages); + anon_swap_restore(npages); + } +} + +/* + * Get memory allocation policy info for specified address in given segment + */ +static lgrp_mem_policy_info_t * +segspt_getpolicy(struct seg *seg, caddr_t addr) +{ + struct anon_map *amp; + ulong_t anon_index; + lgrp_mem_policy_info_t *policy_info; + struct spt_data *spt_data; + + ASSERT(seg != NULL); + + /* + * Get anon_map from segspt + * + * Assume that no lock needs to be held on anon_map, since + * it should be protected by its reference count which must be + * nonzero for an existing segment + * Need to grab readers lock on policy tree though + */ + spt_data = (struct spt_data *)seg->s_data; + if (spt_data == NULL) + return (NULL); + amp = spt_data->spt_amp; + ASSERT(amp->refcnt != 0); + + /* + * Get policy info + * + * Assume starting anon index of 0 + */ + anon_index = seg_page(seg, addr); + policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0); + + return (policy_info); +} + +/* + * DISM only. + * Return locked pages over a given range. + * + * We will cache all DISM locked pages and save the pplist for the + * entire segment in the ppa field of the underlying DISM segment structure. + * Later, during a call to segspt_reclaim() we will use this ppa array + * to page_unlock() all of the pages and then we will free this ppa list. + */ +/*ARGSUSED*/ +static int +segspt_dismpagelock(struct seg *seg, caddr_t addr, size_t len, + struct page ***ppp, enum lock_type type, enum seg_rw rw) +{ + struct shm_data *shmd = (struct shm_data *)seg->s_data; + struct seg *sptseg = shmd->shm_sptseg; + struct spt_data *sptd = sptseg->s_data; + pgcnt_t pg_idx, npages, tot_npages, npgs; + struct page **pplist, **pl, **ppa, *pp; + struct anon_map *amp; + spgcnt_t an_idx; + int ret = ENOTSUP; + uint_t pl_built = 0; + struct anon *ap; + struct vnode *vp; + u_offset_t off; + pgcnt_t claim_availrmem = 0; + uint_t szc; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + /* + * We want to lock/unlock the entire ISM segment. Therefore, + * we will be using the underlying sptseg and it's base address + * and length for the caching arguments. 
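+ *
+ * Consequently every pcache call below describes the whole
+ * segment, e.g.
+ *
+ *	seg_plookup(seg, seg->s_base, sptd->spt_amp->size, ...)
+ *	seg_pinsert(seg, seg->s_base, sptd->spt_amp->size, ...)
+ *
+ * and only the returned *ppp is narrowed to the requested range,
+ * by pointing it at &sptd->spt_ppa[pg_idx].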
+ */ + ASSERT(sptseg); + ASSERT(sptd); + + pg_idx = seg_page(seg, addr); + npages = btopr(len); + + /* + * check if the request is larger than number of pages covered + * by amp + */ + if (pg_idx + npages > btopr(sptd->spt_amp->size)) { + *ppp = NULL; + return (ENOTSUP); + } + + if (type == L_PAGEUNLOCK) { + ASSERT(sptd->spt_ppa != NULL); + + seg_pinactive(seg, seg->s_base, sptd->spt_amp->size, + sptd->spt_ppa, sptd->spt_prot, segspt_reclaim); + + /* + * If someone is blocked while unmapping, we purge + * segment page cache and thus reclaim pplist synchronously + * without waiting for seg_pasync_thread. This speeds up + * unmapping in cases where munmap(2) is called, while + * raw async i/o is still in progress or where a thread + * exits on data fault in a multithreaded application. + */ + if (AS_ISUNMAPWAIT(seg->s_as) && (shmd->shm_softlockcnt > 0)) { + segspt_purge(seg); + } + return (0); + } else if (type == L_PAGERECLAIM) { + ASSERT(sptd->spt_ppa != NULL); + (void) segspt_reclaim(seg, seg->s_base, sptd->spt_amp->size, + sptd->spt_ppa, sptd->spt_prot); + return (0); + } + + if (sptd->spt_flags & DISM_PPA_CHANGED) { + segspt_purge(seg); + /* + * for DISM ppa needs to be rebuild since + * number of locked pages could be changed + */ + *ppp = NULL; + return (ENOTSUP); + } + + /* + * First try to find pages in segment page cache, without + * holding the segment lock. + */ + pplist = seg_plookup(seg, seg->s_base, sptd->spt_amp->size, + sptd->spt_prot); + if (pplist != NULL) { + ASSERT(sptd->spt_ppa != NULL); + ASSERT(sptd->spt_ppa == pplist); + ppa = sptd->spt_ppa; + for (an_idx = pg_idx; an_idx < pg_idx + npages; ) { + if (ppa[an_idx] == NULL) { + seg_pinactive(seg, seg->s_base, + sptd->spt_amp->size, ppa, + sptd->spt_prot, segspt_reclaim); + *ppp = NULL; + return (ENOTSUP); + } + if ((szc = ppa[an_idx]->p_szc) != 0) { + npgs = page_get_pagecnt(szc); + an_idx = P2ROUNDUP(an_idx + 1, npgs); + } else { + an_idx++; + } + } + /* + * Since we cache the entire DISM segment, we want to + * set ppp to point to the first slot that corresponds + * to the requested addr, i.e. pg_idx. + */ + *ppp = &(sptd->spt_ppa[pg_idx]); + return (0); + } + + /* The L_PAGELOCK case... */ + mutex_enter(&sptd->spt_lock); + /* + * try to find pages in segment page cache with mutex + */ + pplist = seg_plookup(seg, seg->s_base, sptd->spt_amp->size, + sptd->spt_prot); + if (pplist != NULL) { + ASSERT(sptd->spt_ppa != NULL); + ASSERT(sptd->spt_ppa == pplist); + ppa = sptd->spt_ppa; + for (an_idx = pg_idx; an_idx < pg_idx + npages; ) { + if (ppa[an_idx] == NULL) { + mutex_exit(&sptd->spt_lock); + seg_pinactive(seg, seg->s_base, + sptd->spt_amp->size, ppa, + sptd->spt_prot, segspt_reclaim); + *ppp = NULL; + return (ENOTSUP); + } + if ((szc = ppa[an_idx]->p_szc) != 0) { + npgs = page_get_pagecnt(szc); + an_idx = P2ROUNDUP(an_idx + 1, npgs); + } else { + an_idx++; + } + } + /* + * Since we cache the entire DISM segment, we want to + * set ppp to point to the first slot that corresponds + * to the requested addr, i.e. pg_idx. + */ + mutex_exit(&sptd->spt_lock); + *ppp = &(sptd->spt_ppa[pg_idx]); + return (0); + } + if (seg_pinsert_check(seg, sptd->spt_amp->size, SEGP_FORCE_WIRED) == + SEGP_FAIL) { + mutex_exit(&sptd->spt_lock); + *ppp = NULL; + return (ENOTSUP); + } + + /* + * No need to worry about protections because DISM pages are always rw. + */ + pl = pplist = NULL; + amp = sptd->spt_amp; + + /* + * Do we need to build the ppa array? 
+ */ + if (sptd->spt_ppa == NULL) { + pgcnt_t lpg_cnt = 0; + + pl_built = 1; + tot_npages = btopr(sptd->spt_amp->size); + + ASSERT(sptd->spt_pcachecnt == 0); + pplist = kmem_zalloc(sizeof (page_t *) * tot_npages, KM_SLEEP); + pl = pplist; + + ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); + for (an_idx = 0; an_idx < tot_npages; ) { + ap = anon_get_ptr(amp->ahp, an_idx); + /* + * Cache only mlocked pages. For large pages + * if one (constituent) page is mlocked + * all pages for that large page + * are cached also. This is for quick + * lookups of ppa array; + */ + if ((ap != NULL) && (lpg_cnt != 0 || + (sptd->spt_ppa_lckcnt[an_idx] != 0))) { + + swap_xlate(ap, &vp, &off); + pp = page_lookup(vp, off, SE_SHARED); + ASSERT(pp != NULL); + if (lpg_cnt == 0) { + npgs = page_get_pagecnt(pp->p_szc); + if (!IS_P2ALIGNED(an_idx, npgs)) { + an_idx = P2ALIGN(an_idx, npgs); + page_unlock(pp); + continue; + } + } + if (++lpg_cnt == npgs) + lpg_cnt = 0; + + /* + * availrmem is decremented only + * for unlocked pages + */ + if (sptd->spt_ppa_lckcnt[an_idx] == 0) + claim_availrmem++; + pplist[an_idx] = pp; + } + an_idx++; + } + ANON_LOCK_EXIT(&->a_rwlock); + + mutex_enter(&freemem_lock); + if (availrmem < tune.t_minarmem + claim_availrmem) { + mutex_exit(&freemem_lock); + ret = FC_MAKE_ERR(ENOMEM); + claim_availrmem = 0; + goto insert_fail; + } else { + availrmem -= claim_availrmem; + } + mutex_exit(&freemem_lock); + + sptd->spt_ppa = pl; + } else { + /* + * We already have a valid ppa[]. + */ + pl = sptd->spt_ppa; + } + + ASSERT(pl != NULL); + + ret = seg_pinsert(seg, seg->s_base, sptd->spt_amp->size, + pl, sptd->spt_prot, SEGP_FORCE_WIRED | SEGP_ASYNC_FLUSH, + segspt_reclaim); + if (ret == SEGP_FAIL) { + /* + * seg_pinsert failed. We return + * ENOTSUP, so that the as_pagelock() code will + * then try the slower F_SOFTLOCK path. + */ + sptd->spt_ppa = NULL; + ret = ENOTSUP; + goto insert_fail; + } + + /* + * In either case, we increment softlockcnt on the 'real' segment. + */ + sptd->spt_pcachecnt++; + atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), 1); + + ppa = sptd->spt_ppa; + for (an_idx = pg_idx; an_idx < pg_idx + npages; ) { + if (ppa[an_idx] == NULL) { + mutex_exit(&sptd->spt_lock); + seg_pinactive(seg, seg->s_base, sptd->spt_amp->size, + pl, sptd->spt_prot, segspt_reclaim); + *ppp = NULL; + return (ENOTSUP); + } + if ((szc = ppa[an_idx]->p_szc) != 0) { + npgs = page_get_pagecnt(szc); + an_idx = P2ROUNDUP(an_idx + 1, npgs); + } else { + an_idx++; + } + } + /* + * We can now drop the sptd->spt_lock since the ppa[] + * exists and he have incremented pacachecnt. + */ + mutex_exit(&sptd->spt_lock); + + /* + * Since we cache the entire segment, we want to + * set ppp to point to the first slot that corresponds + * to the requested addr, i.e. pg_idx. + */ + *ppp = &(sptd->spt_ppa[pg_idx]); + return (ret); + +insert_fail: + /* + * We will only reach this code if we tried and failed. + * + * And we can drop the lock on the dummy seg, once we've failed + * to set up a new ppa[]. + */ + mutex_exit(&sptd->spt_lock); + + if (pl_built) { + mutex_enter(&freemem_lock); + availrmem += claim_availrmem; + mutex_exit(&freemem_lock); + + /* + * We created pl and we need to destroy it. 
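+ * The unwind mirrors what was built: each page entered in pl still
+ * holds the SE_SHARED lock taken by page_lookup() and is unlocked
+ * here, and the availrmem claimed for it was just given back under
+ * freemem_lock above.  The caller then sees *ppp == NULL and falls
+ * back to the slower F_SOFTLOCK path in as_pagelock().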
+ */ + pplist = pl; + for (an_idx = 0; an_idx < tot_npages; an_idx++) { + if (pplist[an_idx] != NULL) + page_unlock(pplist[an_idx]); + } + kmem_free(pl, sizeof (page_t *) * tot_npages); + } + + if (shmd->shm_softlockcnt <= 0) { + if (AS_ISUNMAPWAIT(seg->s_as)) { + mutex_enter(&seg->s_as->a_contents); + if (AS_ISUNMAPWAIT(seg->s_as)) { + AS_CLRUNMAPWAIT(seg->s_as); + cv_broadcast(&seg->s_as->a_cv); + } + mutex_exit(&seg->s_as->a_contents); + } + } + *ppp = NULL; + return (ret); +} + + + +/* + * return locked pages over a given range. + * + * We will cache the entire ISM segment and save the pplist for the + * entire segment in the ppa field of the underlying ISM segment structure. + * Later, during a call to segspt_reclaim() we will use this ppa array + * to page_unlock() all of the pages and then we will free this ppa list. + */ +/*ARGSUSED*/ +static int +segspt_shmpagelock(struct seg *seg, caddr_t addr, size_t len, + struct page ***ppp, enum lock_type type, enum seg_rw rw) +{ + struct shm_data *shmd = (struct shm_data *)seg->s_data; + struct seg *sptseg = shmd->shm_sptseg; + struct spt_data *sptd = sptseg->s_data; + pgcnt_t np, page_index, npages; + caddr_t a, spt_base; + struct page **pplist, **pl, *pp; + struct anon_map *amp; + ulong_t anon_index; + int ret = ENOTSUP; + uint_t pl_built = 0; + struct anon *ap; + struct vnode *vp; + u_offset_t off; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + /* + * We want to lock/unlock the entire ISM segment. Therefore, + * we will be using the underlying sptseg and it's base address + * and length for the caching arguments. + */ + ASSERT(sptseg); + ASSERT(sptd); + + if (sptd->spt_flags & SHM_PAGEABLE) { + return (segspt_dismpagelock(seg, addr, len, ppp, type, rw)); + } + + page_index = seg_page(seg, addr); + npages = btopr(len); + + /* + * check if the request is larger than number of pages covered + * by amp + */ + if (page_index + npages > btopr(sptd->spt_amp->size)) { + *ppp = NULL; + return (ENOTSUP); + } + + if (type == L_PAGEUNLOCK) { + + ASSERT(sptd->spt_ppa != NULL); + + seg_pinactive(seg, seg->s_base, sptd->spt_amp->size, + sptd->spt_ppa, sptd->spt_prot, segspt_reclaim); + + /* + * If someone is blocked while unmapping, we purge + * segment page cache and thus reclaim pplist synchronously + * without waiting for seg_pasync_thread. This speeds up + * unmapping in cases where munmap(2) is called, while + * raw async i/o is still in progress or where a thread + * exits on data fault in a multithreaded application. + */ + if (AS_ISUNMAPWAIT(seg->s_as) && (shmd->shm_softlockcnt > 0)) { + segspt_purge(seg); + } + return (0); + } else if (type == L_PAGERECLAIM) { + ASSERT(sptd->spt_ppa != NULL); + + (void) segspt_reclaim(seg, seg->s_base, sptd->spt_amp->size, + sptd->spt_ppa, sptd->spt_prot); + return (0); + } + + /* + * First try to find pages in segment page cache, without + * holding the segment lock. + */ + pplist = seg_plookup(seg, seg->s_base, sptd->spt_amp->size, + sptd->spt_prot); + if (pplist != NULL) { + ASSERT(sptd->spt_ppa == pplist); + ASSERT(sptd->spt_ppa[page_index]); + /* + * Since we cache the entire ISM segment, we want to + * set ppp to point to the first slot that corresponds + * to the requested addr, i.e. page_index. + */ + *ppp = &(sptd->spt_ppa[page_index]); + return (0); + } + + /* The L_PAGELOCK case... 
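+ *
+ * Under sptd->spt_lock we either find a pplist that a racing thread
+ * installed after our lock-free seg_plookup() above missed, or we
+ * build the whole-segment list ourselves and seg_pinsert() it.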
*/ + mutex_enter(&sptd->spt_lock); + + /* + * try to find pages in segment page cache + */ + pplist = seg_plookup(seg, seg->s_base, sptd->spt_amp->size, + sptd->spt_prot); + if (pplist != NULL) { + ASSERT(sptd->spt_ppa == pplist); + /* + * Since we cache the entire segment, we want to + * set ppp to point to the first slot that corresponds + * to the requested addr, i.e. page_index. + */ + mutex_exit(&sptd->spt_lock); + *ppp = &(sptd->spt_ppa[page_index]); + return (0); + } + + if (seg_pinsert_check(seg, sptd->spt_amp->size, SEGP_FORCE_WIRED) == + SEGP_FAIL) { + mutex_exit(&sptd->spt_lock); + *ppp = NULL; + return (ENOTSUP); + } + + /* + * No need to worry about protections because ISM pages + * are always rw. + */ + pl = pplist = NULL; + + /* + * Do we need to build the ppa array? + */ + if (sptd->spt_ppa == NULL) { + ASSERT(sptd->spt_ppa == pplist); + + spt_base = sptseg->s_base; + pl_built = 1; + + /* + * availrmem is decremented once during anon_swap_adjust() + * and is incremented during the anon_unresv(), which is + * called from shm_rm_amp() when the segment is destroyed. + */ + amp = sptd->spt_amp; + ASSERT(amp != NULL); + + /* pcachecnt is protected by sptd->spt_lock */ + ASSERT(sptd->spt_pcachecnt == 0); + pplist = kmem_zalloc(sizeof (page_t *) + * btopr(sptd->spt_amp->size), KM_SLEEP); + pl = pplist; + + anon_index = seg_page(sptseg, spt_base); + + ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); + for (a = spt_base; a < (spt_base + sptd->spt_amp->size); + a += PAGESIZE, anon_index++, pplist++) { + ap = anon_get_ptr(amp->ahp, anon_index); + ASSERT(ap != NULL); + swap_xlate(ap, &vp, &off); + pp = page_lookup(vp, off, SE_SHARED); + ASSERT(pp != NULL); + *pplist = pp; + } + ANON_LOCK_EXIT(&->a_rwlock); + + if (a < (spt_base + sptd->spt_amp->size)) { + ret = ENOTSUP; + goto insert_fail; + } + sptd->spt_ppa = pl; + } else { + /* + * We already have a valid ppa[]. + */ + pl = sptd->spt_ppa; + } + + ASSERT(pl != NULL); + + ret = seg_pinsert(seg, seg->s_base, sptd->spt_amp->size, + pl, sptd->spt_prot, SEGP_FORCE_WIRED, segspt_reclaim); + if (ret == SEGP_FAIL) { + /* + * seg_pinsert failed. We return + * ENOTSUP, so that the as_pagelock() code will + * then try the slower F_SOFTLOCK path. + */ + if (pl_built) { + /* + * No one else has referenced the ppa[]. + * We created it and we need to destroy it. + */ + sptd->spt_ppa = NULL; + } + ret = ENOTSUP; + goto insert_fail; + } + + /* + * In either case, we increment softlockcnt on the 'real' segment. + */ + sptd->spt_pcachecnt++; + atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), 1); + + /* + * We can now drop the sptd->spt_lock since the ppa[] + * exists and he have incremented pacachecnt. + */ + mutex_exit(&sptd->spt_lock); + + /* + * Since we cache the entire segment, we want to + * set ppp to point to the first slot that corresponds + * to the requested addr, i.e. page_index. + */ + *ppp = &(sptd->spt_ppa[page_index]); + return (ret); + +insert_fail: + /* + * We will only reach this code if we tried and failed. + * + * And we can drop the lock on the dummy seg, once we've failed + * to set up a new ppa[]. + */ + mutex_exit(&sptd->spt_lock); + + if (pl_built) { + /* + * We created pl and we need to destroy it. 
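+ * Only the pages that were actually looked up are unlocked here; the
+ * count is recovered from how far the fill loop advanced, roughly
+ * np = (a - spt_base) >> PAGESHIFT (illustrative restatement of the
+ * code below).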
+ */ + pplist = pl; + np = (((uintptr_t)(a - spt_base)) >> PAGESHIFT); + while (np) { + page_unlock(*pplist); + np--; + pplist++; + } + kmem_free(pl, sizeof (page_t *) * + btopr(sptd->spt_amp->size)); + } + if (shmd->shm_softlockcnt <= 0) { + if (AS_ISUNMAPWAIT(seg->s_as)) { + mutex_enter(&seg->s_as->a_contents); + if (AS_ISUNMAPWAIT(seg->s_as)) { + AS_CLRUNMAPWAIT(seg->s_as); + cv_broadcast(&seg->s_as->a_cv); + } + mutex_exit(&seg->s_as->a_contents); + } + } + *ppp = NULL; + return (ret); +} + +/* + * purge any cached pages in the I/O page cache + */ +static void +segspt_purge(struct seg *seg) +{ + seg_ppurge(seg); +} + +static int +segspt_reclaim(struct seg *seg, caddr_t addr, size_t len, struct page **pplist, + enum seg_rw rw) +{ + struct shm_data *shmd = (struct shm_data *)seg->s_data; + struct seg *sptseg; + struct spt_data *sptd; + pgcnt_t npages, i, free_availrmem = 0; + int done = 0; + +#ifdef lint + addr = addr; +#endif + sptseg = shmd->shm_sptseg; + sptd = sptseg->s_data; + npages = (len >> PAGESHIFT); + ASSERT(npages); + ASSERT(sptd->spt_pcachecnt != 0); + ASSERT(sptd->spt_ppa == pplist); + ASSERT(npages == btopr(sptd->spt_amp->size)); + + /* + * Acquire the lock on the dummy seg and destroy the + * ppa array IF this is the last pcachecnt. + */ + mutex_enter(&sptd->spt_lock); + if (--sptd->spt_pcachecnt == 0) { + for (i = 0; i < npages; i++) { + if (pplist[i] == NULL) { + continue; + } + if (rw == S_WRITE) { + hat_setrefmod(pplist[i]); + } else { + hat_setref(pplist[i]); + } + if ((sptd->spt_flags & SHM_PAGEABLE) && + (sptd->spt_ppa_lckcnt[i] == 0)) + free_availrmem++; + page_unlock(pplist[i]); + } + if (sptd->spt_flags & SHM_PAGEABLE) { + mutex_enter(&freemem_lock); + availrmem += free_availrmem; + mutex_exit(&freemem_lock); + } + /* + * Since we want to cach/uncache the entire ISM segment, + * we will track the pplist in a segspt specific field + * ppa, that is initialized at the time we add an entry to + * the cache. + */ + ASSERT(sptd->spt_pcachecnt == 0); + kmem_free(pplist, sizeof (page_t *) * npages); + sptd->spt_ppa = NULL; + sptd->spt_flags &= ~DISM_PPA_CHANGED; + done = 1; + } + mutex_exit(&sptd->spt_lock); + /* + * Now decrement softlockcnt. + */ + atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), -1); + + if (shmd->shm_softlockcnt <= 0) { + if (AS_ISUNMAPWAIT(seg->s_as)) { + mutex_enter(&seg->s_as->a_contents); + if (AS_ISUNMAPWAIT(seg->s_as)) { + AS_CLRUNMAPWAIT(seg->s_as); + cv_broadcast(&seg->s_as->a_cv); + } + mutex_exit(&seg->s_as->a_contents); + } + } + return (done); +} + +/* + * Do a F_SOFTUNLOCK call over the range requested. + * The range must have already been F_SOFTLOCK'ed. + * + * The calls to acquire and release the anon map lock mutex were + * removed in order to avoid a deadly embrace during a DR + * memory delete operation. (Eg. DR blocks while waiting for a + * exclusive lock on a page that is being used for kaio; the + * thread that will complete the kaio and call segspt_softunlock + * blocks on the anon map lock; another thread holding the anon + * map lock blocks on another page lock via the segspt_shmfault + * -> page_lookup -> page_lookup_create -> page_lock_es code flow.) + * + * The appropriateness of the removal is based upon the following: + * 1. If we are holding a segment's reader lock and the page is held + * shared, then the corresponding element in anonmap which points to + * anon struct cannot change and there is no need to acquire the + * anonymous map lock. + * 2. 
Threads in segspt_softunlock have a reader lock on the segment + * and already have the shared page lock, so we are guaranteed that + * the anon map slot cannot change and therefore can call anon_get_ptr() + * without grabbing the anonymous map lock. + * 3. Threads that softlock a shared page break copy-on-write, even if + * its a read. Thus cow faults can be ignored with respect to soft + * unlocking, since the breaking of cow means that the anon slot(s) will + * not be shared. + */ +static void +segspt_softunlock(struct seg *seg, caddr_t sptseg_addr, + size_t len, enum seg_rw rw) +{ + struct shm_data *shmd = (struct shm_data *)seg->s_data; + struct seg *sptseg; + struct spt_data *sptd; + page_t *pp; + caddr_t adr; + struct vnode *vp; + u_offset_t offset; + ulong_t anon_index; + struct anon_map *amp; /* XXX - for locknest */ + struct anon *ap = NULL; + pgcnt_t npages; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + sptseg = shmd->shm_sptseg; + sptd = sptseg->s_data; + + /* + * Some platforms assume that ISM mappings are HAT_LOAD_LOCK + * and therefore their pages are SE_SHARED locked + * for the entire life of the segment. + */ + if ((!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) && + ((sptd->spt_flags & SHM_PAGEABLE) == 0)) { + goto softlock_decrement; + } + + /* + * Any thread is free to do a page_find and + * page_unlock() on the pages within this seg. + * + * We are already holding the as->a_lock on the user's + * real segment, but we need to hold the a_lock on the + * underlying dummy as. This is mostly to satisfy the + * underlying HAT layer. + */ + AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER); + hat_unlock(sptseg->s_as->a_hat, sptseg_addr, len); + AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock); + + amp = sptd->spt_amp; + ASSERT(amp != NULL); + anon_index = seg_page(sptseg, sptseg_addr); + + for (adr = sptseg_addr; adr < sptseg_addr + len; adr += PAGESIZE) { + ap = anon_get_ptr(amp->ahp, anon_index++); + ASSERT(ap != NULL); + swap_xlate(ap, &vp, &offset); + + /* + * Use page_find() instead of page_lookup() to + * find the page since we know that it has a + * "shared" lock. + */ + pp = page_find(vp, offset); + ASSERT(ap == anon_get_ptr(amp->ahp, anon_index - 1)); + if (pp == NULL) { + panic("segspt_softunlock: " + "addr %p, ap %p, vp %p, off %llx", + (void *)adr, (void *)ap, (void *)vp, offset); + /*NOTREACHED*/ + } + + if (rw == S_WRITE) { + hat_setrefmod(pp); + } else if (rw != S_OTHER) { + hat_setref(pp); + } + page_unlock(pp); + } + +softlock_decrement: + npages = btopr(len); + atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), -npages); + if (shmd->shm_softlockcnt == 0) { + /* + * All SOFTLOCKS are gone. Wakeup any waiting + * unmappers so they can try again to unmap. + * Check for waiters first without the mutex + * held so we don't always grab the mutex on + * softunlocks. 
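+ * Taking as->a_contents only when the flag appears set keeps the
+ * common softunlock path lock-free; the flag is tested again under
+ * the mutex before it is cleared and the waiters are broadcast.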
+ */ + if (AS_ISUNMAPWAIT(seg->s_as)) { + mutex_enter(&seg->s_as->a_contents); + if (AS_ISUNMAPWAIT(seg->s_as)) { + AS_CLRUNMAPWAIT(seg->s_as); + cv_broadcast(&seg->s_as->a_cv); + } + mutex_exit(&seg->s_as->a_contents); + } + } +} + +int +segspt_shmattach(struct seg *seg, caddr_t *argsp) +{ + struct shm_data *shmd_arg = (struct shm_data *)argsp; + struct shm_data *shmd; + struct anon_map *shm_amp = shmd_arg->shm_amp; + struct spt_data *sptd; + int error = 0; + + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); + + shmd = kmem_zalloc((sizeof (*shmd)), KM_NOSLEEP); + if (shmd == NULL) + return (ENOMEM); + + shmd->shm_sptas = shmd_arg->shm_sptas; + shmd->shm_amp = shm_amp; + shmd->shm_sptseg = shmd_arg->shm_sptseg; + + (void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, shm_amp, 0, + NULL, 0, seg->s_size); + + seg->s_data = (void *)shmd; + seg->s_ops = &segspt_shmops; + seg->s_szc = shmd->shm_sptseg->s_szc; + sptd = shmd->shm_sptseg->s_data; + + if (sptd->spt_flags & SHM_PAGEABLE) { + if ((shmd->shm_vpage = kmem_zalloc(btopr(shm_amp->size), + KM_NOSLEEP)) == NULL) { + seg->s_data = (void *)NULL; + kmem_free(shmd, (sizeof (*shmd))); + return (ENOMEM); + } + shmd->shm_lckpgs = 0; + if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) { + if ((error = hat_share(seg->s_as->a_hat, seg->s_base, + shmd_arg->shm_sptas->a_hat, SEGSPTADDR, + seg->s_size, seg->s_szc)) != 0) { + kmem_free(shmd->shm_vpage, + btopr(shm_amp->size)); + } + } + } else { + error = hat_share(seg->s_as->a_hat, seg->s_base, + shmd_arg->shm_sptas->a_hat, SEGSPTADDR, + seg->s_size, seg->s_szc); + } + if (error) { + seg->s_szc = 0; + seg->s_data = (void *)NULL; + kmem_free(shmd, (sizeof (*shmd))); + } else { + ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER); + shm_amp->refcnt++; + ANON_LOCK_EXIT(&shm_amp->a_rwlock); + } + return (error); +} + +int +segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize) +{ + struct shm_data *shmd = (struct shm_data *)seg->s_data; + int reclaim = 1; + + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); +retry: + if (shmd->shm_softlockcnt > 0) { + if (reclaim == 1) { + segspt_purge(seg); + reclaim = 0; + goto retry; + } + return (EAGAIN); + } + + if (ssize != seg->s_size) { +#ifdef DEBUG + cmn_err(CE_WARN, "Incompatible ssize %lx s_size %lx\n", + ssize, seg->s_size); +#endif + return (EINVAL); + } + + (void) segspt_shmlockop(seg, raddr, shmd->shm_amp->size, 0, MC_UNLOCK, + NULL, 0); + hat_unshare(seg->s_as->a_hat, raddr, ssize, seg->s_szc); + + seg_free(seg); + + return (0); +} + +void +segspt_shmfree(struct seg *seg) +{ + struct shm_data *shmd = (struct shm_data *)seg->s_data; + struct anon_map *shm_amp = shmd->shm_amp; + + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); + + (void) segspt_shmlockop(seg, seg->s_base, shm_amp->size, 0, + MC_UNLOCK, NULL, 0); + + /* + * Need to increment refcnt when attaching + * and decrement when detaching because of dup(). + */ + ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER); + shm_amp->refcnt--; + ANON_LOCK_EXIT(&shm_amp->a_rwlock); + + if (shmd->shm_vpage) { /* only for DISM */ + kmem_free(shmd->shm_vpage, btopr(shm_amp->size)); + shmd->shm_vpage = NULL; + } + kmem_free(shmd, sizeof (*shmd)); +} + +/*ARGSUSED*/ +int +segspt_shmsetprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) +{ + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + /* + * Shared page table is more than shared mapping. 
+ * Individual process sharing page tables can't change prot + * because there is only one set of page tables. + * This will be allowed after private page table is + * supported. + */ +/* need to return correct status error? */ + return (0); +} + + +faultcode_t +segspt_dismfault(struct hat *hat, struct seg *seg, caddr_t addr, + size_t len, enum fault_type type, enum seg_rw rw) +{ + struct shm_data *shmd = (struct shm_data *)seg->s_data; + struct seg *sptseg = shmd->shm_sptseg; + struct as *curspt = shmd->shm_sptas; + struct spt_data *sptd = sptseg->s_data; + pgcnt_t npages; + size_t share_sz, size; + caddr_t segspt_addr, shm_addr; + page_t **ppa; + int i; + ulong_t an_idx = 0; + int err = 0; + +#ifdef lint + hat = hat; +#endif + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + /* + * Because of the way spt is implemented + * the realsize of the segment does not have to be + * equal to the segment size itself. The segment size is + * often in multiples of a page size larger than PAGESIZE. + * The realsize is rounded up to the nearest PAGESIZE + * based on what the user requested. This is a bit of + * ungliness that is historical but not easily fixed + * without re-designing the higher levels of ISM. + */ + ASSERT(addr >= seg->s_base); + if (((addr + len) - seg->s_base) > sptd->spt_realsize) + return (FC_NOMAP); + /* + * For all of the following cases except F_PROT, we need to + * make any necessary adjustments to addr and len + * and get all of the necessary page_t's into an array called ppa[]. + * + * The code in shmat() forces base addr and len of ISM segment + * to be aligned to largest page size supported. Therefore, + * we are able to handle F_SOFTLOCK and F_INVAL calls in "large + * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK + * in large pagesize chunks, or else we will screw up the HAT + * layer by calling hat_memload_array() with differing page sizes + * over a given virtual range. + */ + share_sz = page_get_pagesize(sptseg->s_szc); + shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_sz); + size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), share_sz); + npages = btopr(size); + + /* + * Now we need to convert from addr in segshm to addr in segspt. + */ + an_idx = seg_page(seg, shm_addr); + segspt_addr = sptseg->s_base + ptob(an_idx); + + ASSERT((segspt_addr + ptob(npages)) <= + (sptseg->s_base + sptd->spt_realsize)); + ASSERT(segspt_addr < (sptseg->s_base + sptseg->s_size)); + + switch (type) { + + case F_SOFTLOCK: + + mutex_enter(&freemem_lock); + if (availrmem < tune.t_minarmem + npages) { + mutex_exit(&freemem_lock); + return (FC_MAKE_ERR(ENOMEM)); + } else { + availrmem -= npages; + } + mutex_exit(&freemem_lock); + atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages); + /* + * Fall through to the F_INVAL case to load up the hat layer + * entries with the HAT_LOAD_LOCK flag. + */ + /* FALLTHRU */ + case F_INVAL: + + if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC)) + return (FC_NOMAP); + + ppa = kmem_zalloc(npages * sizeof (page_t *), KM_SLEEP); + + err = spt_anon_getpages(sptseg, segspt_addr, size, ppa); + if (err != 0) { + if (type == F_SOFTLOCK) { + mutex_enter(&freemem_lock); + availrmem += npages; + mutex_exit(&freemem_lock); + atomic_add_long((ulong_t *)( + &(shmd->shm_softlockcnt)), -npages); + } + goto dism_err; + } + AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER); + if (type == F_SOFTLOCK) { + + /* + * Load up the translation keeping it + * locked and don't unlock the page. 
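+ * With HAT_LOAD_LOCK the translation stays resident and the pages
+ * stay SE_SHARED locked until the matching F_SOFTUNLOCK (handled by
+ * segspt_softunlock()); the F_INVAL path below loads the same ppa[]
+ * with HAT_LOAD_SHARE only and then drops the page locks.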
+ */ + hat_memload_array(sptseg->s_as->a_hat, segspt_addr, + size, ppa, sptd->spt_prot, + HAT_LOAD_LOCK | HAT_LOAD_SHARE); + } else { + if (hat == seg->s_as->a_hat) { + + /* + * Migrate pages marked for migration + */ + if (lgrp_optimizations()) + page_migrate(seg, shm_addr, ppa, + npages); + + /* CPU HAT */ + hat_memload_array(sptseg->s_as->a_hat, + segspt_addr, size, ppa, sptd->spt_prot, + HAT_LOAD_SHARE); + } else { + /* XHAT. Pass real address */ + hat_memload_array(hat, shm_addr, + size, ppa, sptd->spt_prot, HAT_LOAD_SHARE); + } + + /* + * And now drop the SE_SHARED lock(s). + */ + for (i = 0; i < npages; i++) + page_unlock(ppa[i]); + } + + if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) { + if (hat_share(seg->s_as->a_hat, shm_addr, + curspt->a_hat, segspt_addr, ptob(npages), + seg->s_szc) != 0) { + panic("hat_share err in DISM fault"); + /* NOTREACHED */ + } + } + AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock); +dism_err: + kmem_free(ppa, npages * sizeof (page_t *)); + return (err); + + case F_SOFTUNLOCK: + + mutex_enter(&freemem_lock); + availrmem += npages; + mutex_exit(&freemem_lock); + + /* + * This is a bit ugly, we pass in the real seg pointer, + * but the segspt_addr is the virtual address within the + * dummy seg. + */ + segspt_softunlock(seg, segspt_addr, size, rw); + return (0); + + case F_PROT: + + /* + * This takes care of the unusual case where a user + * allocates a stack in shared memory and a register + * window overflow is written to that stack page before + * it is otherwise modified. + * + * We can get away with this because ISM segments are + * always rw. Other than this unusual case, there + * should be no instances of protection violations. + */ + return (0); + + default: +#ifdef DEBUG + panic("segspt_dismfault default type?"); +#else + return (FC_NOMAP); +#endif + } +} + + +faultcode_t +segspt_shmfault(struct hat *hat, struct seg *seg, caddr_t addr, + size_t len, enum fault_type type, enum seg_rw rw) +{ + struct shm_data *shmd = (struct shm_data *)seg->s_data; + struct seg *sptseg = shmd->shm_sptseg; + struct as *curspt = shmd->shm_sptas; + struct spt_data *sptd = sptseg->s_data; + pgcnt_t npages; + size_t share_size, size; + caddr_t sptseg_addr, shm_addr; + page_t *pp, **ppa; + int i; + u_offset_t offset; + ulong_t anon_index = 0; + struct vnode *vp; + struct anon_map *amp; /* XXX - for locknest */ + struct anon *ap = NULL; + anon_sync_obj_t cookie; + +#ifdef lint + hat = hat; +#endif + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + if (sptd->spt_flags & SHM_PAGEABLE) { + return (segspt_dismfault(hat, seg, addr, len, type, rw)); + } + + /* + * Because of the way spt is implemented + * the realsize of the segment does not have to be + * equal to the segment size itself. The segment size is + * often in multiples of a page size larger than PAGESIZE. + * The realsize is rounded up to the nearest PAGESIZE + * based on what the user requested. This is a bit of + * ungliness that is historical but not easily fixed + * without re-designing the higher levels of ISM. + */ + ASSERT(addr >= seg->s_base); + if (((addr + len) - seg->s_base) > sptd->spt_realsize) + return (FC_NOMAP); + /* + * For all of the following cases except F_PROT, we need to + * make any necessary adjustments to addr and len + * and get all of the necessary page_t's into an array called ppa[]. + * + * The code in shmat() forces base addr and len of ISM segment + * to be aligned to largest page size supported. 
Therefore, + * we are able to handle F_SOFTLOCK and F_INVAL calls in "large + * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK + * in large pagesize chunks, or else we will screw up the HAT + * layer by calling hat_memload_array() with differing page sizes + * over a given virtual range. + */ + share_size = page_get_pagesize(sptseg->s_szc); + shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_size); + size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), share_size); + npages = btopr(size); + + /* + * Now we need to convert from addr in segshm to addr in segspt. + */ + anon_index = seg_page(seg, shm_addr); + sptseg_addr = sptseg->s_base + ptob(anon_index); + + /* + * And now we may have to adjust npages downward if we have + * exceeded the realsize of the segment or initial anon + * allocations. + */ + if ((sptseg_addr + ptob(npages)) > + (sptseg->s_base + sptd->spt_realsize)) + size = (sptseg->s_base + sptd->spt_realsize) - sptseg_addr; + + npages = btopr(size); + + ASSERT(sptseg_addr < (sptseg->s_base + sptseg->s_size)); + ASSERT((sptd->spt_flags & SHM_PAGEABLE) == 0); + + switch (type) { + + case F_SOFTLOCK: + + /* + * availrmem is decremented once during anon_swap_adjust() + * and is incremented during the anon_unresv(), which is + * called from shm_rm_amp() when the segment is destroyed. + */ + atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages); + /* + * Some platforms assume that ISM pages are SE_SHARED + * locked for the entire life of the segment. + */ + if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) + return (0); + /* + * Fall through to the F_INVAL case to load up the hat layer + * entries with the HAT_LOAD_LOCK flag. + */ + + /* FALLTHRU */ + case F_INVAL: + + if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC)) + return (FC_NOMAP); + + /* + * Some platforms that do NOT support DYNAMIC_ISM_UNMAP + * may still rely on this call to hat_share(). That + * would imply that those hat's can fault on a + * HAT_LOAD_LOCK translation, which would seem + * contradictory. + */ + if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) { + if (hat_share(seg->s_as->a_hat, seg->s_base, + curspt->a_hat, sptseg->s_base, + sptseg->s_size, sptseg->s_szc) != 0) { + panic("hat_share error in ISM fault"); + /*NOTREACHED*/ + } + return (0); + } + ppa = kmem_zalloc(sizeof (page_t *) * npages, KM_SLEEP); + + /* + * I see no need to lock the real seg, + * here, because all of our work will be on the underlying + * dummy seg. + * + * sptseg_addr and npages now account for large pages. + */ + amp = sptd->spt_amp; + ASSERT(amp != NULL); + anon_index = seg_page(sptseg, sptseg_addr); + + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + for (i = 0; i < npages; i++) { + anon_array_enter(amp, anon_index, &cookie); + ap = anon_get_ptr(amp->ahp, anon_index++); + ASSERT(ap != NULL); + swap_xlate(ap, &vp, &offset); + anon_array_exit(&cookie); + pp = page_lookup(vp, offset, SE_SHARED); + ASSERT(pp != NULL); + ppa[i] = pp; + } + ANON_LOCK_EXIT(&->a_rwlock); + ASSERT(i == npages); + + /* + * We are already holding the as->a_lock on the user's + * real segment, but we need to hold the a_lock on the + * underlying dummy as. This is mostly to satisfy the + * underlying HAT layer. + */ + AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER); + if (type == F_SOFTLOCK) { + /* + * Load up the translation keeping it + * locked and don't unlock the page. 
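+ * Unlike the pageable (DISM) case above, these pages were found with
+ * page_lookup() and exist for the life of the segment, so only the
+ * translation lock and the SE_SHARED page locks need to be carried
+ * until the matching F_SOFTUNLOCK; availrmem was already accounted
+ * for once at segment creation, as noted above.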
+ */ + hat_memload_array(sptseg->s_as->a_hat, sptseg_addr, + ptob(npages), ppa, sptd->spt_prot, + HAT_LOAD_LOCK | HAT_LOAD_SHARE); + } else { + if (hat == seg->s_as->a_hat) { + + /* + * Migrate pages marked for migration. + */ + if (lgrp_optimizations()) + page_migrate(seg, shm_addr, ppa, + npages); + + /* CPU HAT */ + hat_memload_array(sptseg->s_as->a_hat, + sptseg_addr, ptob(npages), ppa, + sptd->spt_prot, HAT_LOAD_SHARE); + } else { + /* XHAT. Pass real address */ + hat_memload_array(hat, shm_addr, + ptob(npages), ppa, sptd->spt_prot, + HAT_LOAD_SHARE); + } + + /* + * And now drop the SE_SHARED lock(s). + */ + for (i = 0; i < npages; i++) + page_unlock(ppa[i]); + } + AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock); + + kmem_free(ppa, sizeof (page_t *) * npages); + return (0); + case F_SOFTUNLOCK: + + /* + * This is a bit ugly, we pass in the real seg pointer, + * but the sptseg_addr is the virtual address within the + * dummy seg. + */ + segspt_softunlock(seg, sptseg_addr, ptob(npages), rw); + return (0); + + case F_PROT: + + /* + * This takes care of the unusual case where a user + * allocates a stack in shared memory and a register + * window overflow is written to that stack page before + * it is otherwise modified. + * + * We can get away with this because ISM segments are + * always rw. Other than this unusual case, there + * should be no instances of protection violations. + */ + return (0); + + default: +#ifdef DEBUG + cmn_err(CE_WARN, "segspt_shmfault default type?"); +#endif + return (FC_NOMAP); + } +} + +/*ARGSUSED*/ +static faultcode_t +segspt_shmfaulta(struct seg *seg, caddr_t addr) +{ + return (0); +} + +/*ARGSUSED*/ +static int +segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta) +{ + return (0); +} + +/*ARGSUSED*/ +static size_t +segspt_shmswapout(struct seg *seg) +{ + return (0); +} + +/* + * duplicate the shared page tables + */ +int +segspt_shmdup(struct seg *seg, struct seg *newseg) +{ + struct shm_data *shmd = (struct shm_data *)seg->s_data; + struct anon_map *amp = shmd->shm_amp; + struct shm_data *shmd_new; + struct seg *spt_seg = shmd->shm_sptseg; + struct spt_data *sptd = spt_seg->s_data; + + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); + + shmd_new = kmem_zalloc((sizeof (*shmd_new)), KM_SLEEP); + newseg->s_data = (void *)shmd_new; + shmd_new->shm_sptas = shmd->shm_sptas; + shmd_new->shm_amp = amp; + shmd_new->shm_sptseg = shmd->shm_sptseg; + newseg->s_ops = &segspt_shmops; + newseg->s_szc = seg->s_szc; + ASSERT(seg->s_szc == shmd->shm_sptseg->s_szc); + + ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); + amp->refcnt++; + ANON_LOCK_EXIT(&->a_rwlock); + + if (sptd->spt_flags & SHM_PAGEABLE) { + shmd_new->shm_vpage = kmem_zalloc(btopr(amp->size), KM_SLEEP); + shmd_new->shm_lckpgs = 0; + } + return (hat_share(newseg->s_as->a_hat, newseg->s_base, + shmd->shm_sptas->a_hat, SEGSPTADDR, seg->s_size, seg->s_szc)); +} + +/*ARGSUSED*/ +int +segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot) +{ + struct shm_data *shmd = (struct shm_data *)seg->s_data; + struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + /* + * ISM segment is always rw. + */ + return (((sptd->spt_prot & prot) != prot) ? EACCES : 0); +} + +/* + * Return an array of locked large pages, for empty slots allocate + * private zero-filled anon pages. 
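+ * The work is done one large-page chunk at a time through
+ * anon_map_getpages(); when a page of the current size cannot be
+ * obtained the loop resizes and retries rather than failing.  In
+ * outline (illustrative, not verbatim):
+ *
+ *	ierr == -1	size down (e.g. szc - 1 when segvn_anypgsz),
+ *			a large page of this size was not available
+ *	ierr == -2	size up (e.g. szc + 1), another process already
+ *			holds a larger page here
+ *	ierr > 0	hard error: unlock what is held and return
+ *			FC_MAKE_ERR(ierr)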
+ */ +static int +spt_anon_getpages( + struct seg *sptseg, + caddr_t sptaddr, + size_t len, + page_t *ppa[]) +{ + struct spt_data *sptd = sptseg->s_data; + struct anon_map *amp = sptd->spt_amp; + enum seg_rw rw = sptd->spt_prot; + uint_t szc = sptseg->s_szc; + size_t pg_sz, share_sz = page_get_pagesize(szc); + pgcnt_t lp_npgs; + caddr_t lp_addr, e_sptaddr; + uint_t vpprot, ppa_szc = 0; + struct vpage *vpage = NULL; + ulong_t j, ppa_idx; + int err, ierr = 0; + pgcnt_t an_idx; + anon_sync_obj_t cookie; + + ASSERT(IS_P2ALIGNED(sptaddr, share_sz) && IS_P2ALIGNED(len, share_sz)); + ASSERT(len != 0); + + pg_sz = share_sz; + lp_npgs = btop(pg_sz); + lp_addr = sptaddr; + e_sptaddr = sptaddr + len; + an_idx = seg_page(sptseg, sptaddr); + ppa_idx = 0; + + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + /*CONSTCOND*/ + while (1) { + for (; lp_addr < e_sptaddr; + an_idx += lp_npgs, lp_addr += pg_sz, + ppa_idx += lp_npgs) { + + anon_array_enter(amp, an_idx, &cookie); + ppa_szc = (uint_t)-1; + ierr = anon_map_getpages(amp, an_idx, szc, sptseg, + lp_addr, sptd->spt_prot, &vpprot, &ppa[ppa_idx], + &ppa_szc, vpage, rw, 0, segvn_anypgsz, kcred); + anon_array_exit(&cookie); + + if (ierr != 0) { + if (ierr > 0) { + err = FC_MAKE_ERR(ierr); + goto lpgs_err; + } + break; + } + } + if (lp_addr == e_sptaddr) { + break; + } + ASSERT(lp_addr < e_sptaddr); + + /* + * ierr == -1 means we failed to allocate a large page. + * so do a size down operation. + * + * ierr == -2 means some other process that privately shares + * pages with this process has allocated a larger page and we + * need to retry with larger pages. So do a size up + * operation. This relies on the fact that large pages are + * never partially shared i.e. if we share any constituent + * page of a large page with another process we must share the + * entire large page. Note this cannot happen for SOFTLOCK + * case, unless current address (lpaddr) is at the beginning + * of the next page size boundary because the other process + * couldn't have relocated locked pages. + */ + ASSERT(ierr == -1 || ierr == -2); + if (segvn_anypgsz) { + ASSERT(ierr == -2 || szc != 0); + ASSERT(ierr == -1 || szc < sptseg->s_szc); + szc = (ierr == -1) ? szc - 1 : szc + 1; + } else { + /* + * For faults and segvn_anypgsz == 0 + * we need to be careful not to loop forever + * if existing page is found with szc other + * than 0 or seg->s_szc. This could be due + * to page relocations on behalf of DR or + * more likely large page creation. For this + * case simply re-size to existing page's szc + * if returned by anon_map_getpages(). + */ + if (ppa_szc == (uint_t)-1) { + szc = (ierr == -1) ? 
0 : sptseg->s_szc; + } else { + ASSERT(ppa_szc <= sptseg->s_szc); + ASSERT(ierr == -2 || ppa_szc < szc); + ASSERT(ierr == -1 || ppa_szc > szc); + szc = ppa_szc; + } + } + pg_sz = page_get_pagesize(szc); + lp_npgs = btop(pg_sz); + ASSERT(IS_P2ALIGNED(lp_addr, pg_sz)); + } + ANON_LOCK_EXIT(&->a_rwlock); + return (0); + +lpgs_err: + ANON_LOCK_EXIT(&->a_rwlock); + for (j = 0; j < ppa_idx; j++) + page_unlock(ppa[j]); + return (err); +} + +int +spt_lockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages, + page_t **ppa, ulong_t *lockmap, size_t pos) +{ + struct shm_data *shmd = seg->s_data; + struct spt_data *sptd = shmd->shm_sptseg->s_data; + ulong_t i; + int kernel; + + for (i = 0; i < npages; anon_index++, pos++, i++) { + if (!(shmd->shm_vpage[anon_index] & DISM_PG_LOCKED)) { + if (sptd->spt_ppa_lckcnt[anon_index] < + (ushort_t)DISM_LOCK_MAX) { + if (++sptd->spt_ppa_lckcnt[anon_index] == + (ushort_t)DISM_LOCK_MAX) { + cmn_err(CE_WARN, + "DISM page lock limit " + "reached on DISM offset 0x%lx\n", + anon_index << PAGESHIFT); + } + kernel = (sptd->spt_ppa && + sptd->spt_ppa[anon_index]) ? 1 : 0; + if (!page_pp_lock(ppa[i], 0, kernel)) { + /* unlock rest of the pages */ + for (; i < npages; i++) + page_unlock(ppa[i]); + sptd->spt_ppa_lckcnt[anon_index]--; + return (EAGAIN); + } + shmd->shm_lckpgs++; + shmd->shm_vpage[anon_index] |= DISM_PG_LOCKED; + if (lockmap != NULL) + BT_SET(lockmap, pos); + } + } + page_unlock(ppa[i]); + } + return (0); +} + +/*ARGSUSED*/ +static int +segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len, + int attr, int op, ulong_t *lockmap, size_t pos) +{ + struct shm_data *shmd = seg->s_data; + struct seg *sptseg = shmd->shm_sptseg; + struct spt_data *sptd = sptseg->s_data; + pgcnt_t npages, a_npages; + page_t **ppa; + pgcnt_t an_idx, a_an_idx, ppa_idx; + caddr_t spt_addr, a_addr; /* spt and aligned address */ + size_t a_len; /* aligned len */ + size_t share_sz; + ulong_t i; + int sts = 0; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + if ((sptd->spt_flags & SHM_PAGEABLE) == 0) { + return (0); + } + + addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); + an_idx = seg_page(seg, addr); + npages = btopr(len); + + if (an_idx + npages > btopr(shmd->shm_amp->size)) { + return (ENOMEM); + } + + if (op == MC_LOCK) { + /* + * Need to align addr and size request if they are not + * aligned so we can always allocate large page(s) however + * we only lock what was requested in initial request. + */ + share_sz = page_get_pagesize(sptseg->s_szc); + a_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_sz); + a_len = P2ROUNDUP((uintptr_t)(((addr + len) - a_addr)), + share_sz); + a_npages = btop(a_len); + a_an_idx = seg_page(seg, a_addr); + spt_addr = sptseg->s_base + ptob(a_an_idx); + ppa_idx = an_idx - a_an_idx; + + if ((ppa = kmem_zalloc(((sizeof (page_t *)) * a_npages), + KM_NOSLEEP)) == NULL) { + return (ENOMEM); + } + + /* + * Don't cache any new pages for IO and + * flush any cached pages. 
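+ * Setting DISM_PPA_CHANGED (both before and after the lock operation,
+ * whenever spt_ppa exists) makes segspt_dismpagelock() purge the
+ * cached list so it is rebuilt to match the new spt_ppa_lckcnt[]
+ * lock counts.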
+ */ + mutex_enter(&sptd->spt_lock); + if (sptd->spt_ppa != NULL) + sptd->spt_flags |= DISM_PPA_CHANGED; + + sts = spt_anon_getpages(sptseg, spt_addr, a_len, ppa); + if (sts != 0) { + mutex_exit(&sptd->spt_lock); + kmem_free(ppa, ((sizeof (page_t *)) * a_npages)); + return (sts); + } + + sts = spt_lockpages(seg, an_idx, npages, + &ppa[ppa_idx], lockmap, pos); + /* + * unlock remaining pages for requests which are not + * aligned or not in 4 M chunks + */ + for (i = 0; i < ppa_idx; i++) + page_unlock(ppa[i]); + for (i = ppa_idx + npages; i < a_npages; i++) + page_unlock(ppa[i]); + if (sptd->spt_ppa != NULL) + sptd->spt_flags |= DISM_PPA_CHANGED; + mutex_exit(&sptd->spt_lock); + + kmem_free(ppa, ((sizeof (page_t *)) * a_npages)); + + } else if (op == MC_UNLOCK) { /* unlock */ + struct anon_map *amp; + struct anon *ap; + struct vnode *vp; + u_offset_t off; + struct page *pp; + int kernel; + anon_sync_obj_t cookie; + + amp = sptd->spt_amp; + mutex_enter(&sptd->spt_lock); + if (shmd->shm_lckpgs == 0) { + mutex_exit(&sptd->spt_lock); + return (0); + } + /* + * Don't cache new IO pages. + */ + if (sptd->spt_ppa != NULL) + sptd->spt_flags |= DISM_PPA_CHANGED; + + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + for (i = 0; i < npages; i++, an_idx++) { + if (shmd->shm_vpage[an_idx] & DISM_PG_LOCKED) { + anon_array_enter(amp, an_idx, &cookie); + ap = anon_get_ptr(amp->ahp, an_idx); + ASSERT(ap); + ASSERT(sptd->spt_ppa_lckcnt[an_idx] > 0); + + swap_xlate(ap, &vp, &off); + anon_array_exit(&cookie); + pp = page_lookup(vp, off, SE_SHARED); + ASSERT(pp); + /* + * the availrmem is decremented only for + * pages which are not in seg pcache, + * for pages in seg pcache availrmem was + * decremented in _dismpagelock() (if + * they were not locked here) + */ + kernel = (sptd->spt_ppa && + sptd->spt_ppa[an_idx]) ? 1 : 0; + page_pp_unlock(pp, 0, kernel); + page_unlock(pp); + shmd->shm_vpage[an_idx] &= ~DISM_PG_LOCKED; + sptd->spt_ppa_lckcnt[an_idx]--; + shmd->shm_lckpgs--; + } + } + ANON_LOCK_EXIT(&->a_rwlock); + if (sptd->spt_ppa != NULL) + sptd->spt_flags |= DISM_PPA_CHANGED; + mutex_exit(&sptd->spt_lock); + } + return (sts); +} + +/*ARGSUSED*/ +int +segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv) +{ + struct shm_data *shmd = (struct shm_data *)seg->s_data; + struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data; + spgcnt_t pgno = seg_page(seg, addr+len) - seg_page(seg, addr) + 1; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + /* + * ISM segment is always rw. + */ + while (--pgno >= 0) + *protv++ = sptd->spt_prot; + return (0); +} + +/*ARGSUSED*/ +u_offset_t +segspt_shmgetoffset(struct seg *seg, caddr_t addr) +{ + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + /* Offset does not matter in ISM memory */ + + return ((u_offset_t)0); +} + +/* ARGSUSED */ +int +segspt_shmgettype(struct seg *seg, caddr_t addr) +{ + struct shm_data *shmd = (struct shm_data *)seg->s_data; + struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + /* + * The shared memory mapping is always MAP_SHARED, SWAP is only + * reserved for DISM + */ + return (MAP_SHARED | + ((sptd->spt_flags & SHM_PAGEABLE) ? 
0 : MAP_NORESERVE)); +} + +/*ARGSUSED*/ +int +segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp) +{ + struct shm_data *shmd = (struct shm_data *)seg->s_data; + struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + *vpp = sptd->spt_vp; + return (0); +} + +/*ARGSUSED*/ +static int +segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) +{ + struct shm_data *shmd = (struct shm_data *)seg->s_data; + struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data; + struct anon_map *amp; + pgcnt_t pg_idx; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + if (behav == MADV_FREE) { + if ((sptd->spt_flags & SHM_PAGEABLE) == 0) + return (0); + + amp = sptd->spt_amp; + pg_idx = seg_page(seg, addr); + + mutex_enter(&sptd->spt_lock); + if (sptd->spt_ppa != NULL) + sptd->spt_flags |= DISM_PPA_CHANGED; + mutex_exit(&sptd->spt_lock); + + /* + * Purge all DISM cached pages + */ + seg_ppurge_seg(segspt_reclaim); + + mutex_enter(&sptd->spt_lock); + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + anon_disclaim(amp, pg_idx, len, ANON_PGLOOKUP_BLK); + ANON_LOCK_EXIT(&->a_rwlock); + mutex_exit(&sptd->spt_lock); + } else if (lgrp_optimizations() && (behav == MADV_ACCESS_LWP || + behav == MADV_ACCESS_MANY || behav == MADV_ACCESS_DEFAULT)) { + int already_set; + ulong_t anon_index; + lgrp_mem_policy_t policy; + caddr_t shm_addr; + size_t share_size; + size_t size; + struct seg *sptseg = shmd->shm_sptseg; + caddr_t sptseg_addr; + + /* + * Align address and length to page size of underlying segment + */ + share_size = page_get_pagesize(shmd->shm_sptseg->s_szc); + shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_size); + size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), + share_size); + + amp = shmd->shm_amp; + anon_index = seg_page(seg, shm_addr); + + /* + * And now we may have to adjust size downward if we have + * exceeded the realsize of the segment or initial anon + * allocations. + */ + sptseg_addr = sptseg->s_base + ptob(anon_index); + if ((sptseg_addr + size) > + (sptseg->s_base + sptd->spt_realsize)) + size = (sptseg->s_base + sptd->spt_realsize) - + sptseg_addr; + + /* + * Set memory allocation policy for this segment + */ + policy = lgrp_madv_to_policy(behav, len, MAP_SHARED); + already_set = lgrp_shm_policy_set(policy, amp, anon_index, + NULL, 0, len); + + /* + * If random memory allocation policy set already, + * don't bother reapplying it. 
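+ * ("already_set" is what lgrp_shm_policy_set() reported above;
+ * LGRP_MEM_POLICY_REAPPLICABLE() singles out the policies that are
+ * still worth pushing down onto existing pages via the migration
+ * marking below.)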
+ */ + if (already_set && !LGRP_MEM_POLICY_REAPPLICABLE(policy)) + return (0); + + /* + * Mark any existing pages in the given range for + * migration, flushing the I/O page cache, and using + * underlying segment to calculate anon index and get + * anonmap and vnode pointer from + */ + if (shmd->shm_softlockcnt > 0) + segspt_purge(seg); + + page_mark_migrate(seg, shm_addr, size, amp, 0, NULL, 0, 0); + } + + return (0); +} + +/*ARGSUSED*/ +void +segspt_shmdump(struct seg *seg) +{ + /* no-op for ISM segment */ +} + +/*ARGSUSED*/ +static faultcode_t +segspt_shmsetpgsz(struct seg *seg, caddr_t addr, size_t len, uint_t szc) +{ + return (ENOTSUP); +} + +/* + * get a memory ID for an addr in a given segment + */ +static int +segspt_shmgetmemid(struct seg *seg, caddr_t addr, memid_t *memidp) +{ + struct shm_data *shmd = (struct shm_data *)seg->s_data; + struct anon *ap; + size_t anon_index; + struct anon_map *amp = shmd->shm_amp; + struct spt_data *sptd = shmd->shm_sptseg->s_data; + struct seg *sptseg = shmd->shm_sptseg; + anon_sync_obj_t cookie; + + anon_index = seg_page(seg, addr); + + if (addr > (seg->s_base + sptd->spt_realsize)) { + return (EFAULT); + } + + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + anon_array_enter(amp, anon_index, &cookie); + ap = anon_get_ptr(amp->ahp, anon_index); + if (ap == NULL) { + struct page *pp; + caddr_t spt_addr = sptseg->s_base + ptob(anon_index); + + pp = anon_zero(sptseg, spt_addr, &ap, kcred); + if (pp == NULL) { + anon_array_exit(&cookie); + ANON_LOCK_EXIT(&->a_rwlock); + return (ENOMEM); + } + (void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP); + page_unlock(pp); + } + anon_array_exit(&cookie); + ANON_LOCK_EXIT(&->a_rwlock); + memidp->val[0] = (uintptr_t)ap; + memidp->val[1] = (uintptr_t)addr & PAGEOFFSET; + return (0); +} + +/* + * Get memory allocation policy info for specified address in given segment + */ +static lgrp_mem_policy_info_t * +segspt_shmgetpolicy(struct seg *seg, caddr_t addr) +{ + struct anon_map *amp; + ulong_t anon_index; + lgrp_mem_policy_info_t *policy_info; + struct shm_data *shm_data; + + ASSERT(seg != NULL); + + /* + * Get anon_map from segshm + * + * Assume that no lock needs to be held on anon_map, since + * it should be protected by its reference count which must be + * nonzero for an existing segment + * Need to grab readers lock on policy tree though + */ + shm_data = (struct shm_data *)seg->s_data; + if (shm_data == NULL) + return (NULL); + amp = shm_data->shm_amp; + ASSERT(amp->refcnt != 0); + + /* + * Get policy info + * + * Assume starting anon index of 0 + */ + anon_index = seg_page(seg, addr); + policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0); + + return (policy_info); +} diff --git a/usr/src/uts/common/vm/seg_spt.h b/usr/src/uts/common/vm/seg_spt.h new file mode 100644 index 0000000000..fb97c77fcf --- /dev/null +++ b/usr/src/uts/common/vm/seg_spt.h @@ -0,0 +1,155 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _VM_SEG_SPT_H +#define _VM_SEG_SPT_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef _ASM + +#include <sys/types.h> +#include <sys/t_lock.h> +#include <sys/lgrp.h> + +/* + * Passed data when creating spt segment. + */ +struct segspt_crargs { + struct seg *seg_spt; + struct anon_map *amp; + uint_t prot; + uint_t flags; + uint_t szc; +}; + +typedef struct spt_data { + struct vnode *spt_vp; + struct anon_map *spt_amp; + size_t spt_realsize; + struct page **spt_ppa; + ushort_t *spt_ppa_lckcnt; + uint_t spt_prot; + kmutex_t spt_lock; + size_t spt_pcachecnt; /* # of times in pcache */ + uint_t spt_flags; /* Dynamic ISM or regular ISM */ + /* + * Initial memory allocation policy + * used during pre-allocation done in shmat() + */ + lgrp_mem_policy_info_t spt_policy_info; +} spt_data_t; + +/* + * Private data for spt_shm segment. + */ +typedef struct shm_data { + struct as *shm_sptas; + struct anon_map *shm_amp; + size_t shm_softlockcnt; /* # outstanding lock operations */ + struct seg *shm_sptseg; /* pointer to spt segment */ + char *shm_vpage; /* indicating locked pages */ + spgcnt_t shm_lckpgs; /* # of locked pages per attached seg */ + /* + * Memory allocation policy after shmat() + */ + lgrp_mem_policy_info_t shm_policy_info; +} shm_data_t; + +#define DISM_PG_LOCKED 0x1 /* DISM page is locked */ +#define DISM_PPA_CHANGED 0x2 /* DISM new lock, need to rebuild ppa */ + +#define DISM_LOCK_MAX 0xfffe /* max number of locks per DISM page */ +#endif + +#ifdef _KERNEL + +#ifndef _ASM + +/* + * Functions used in shm.c to call ISM. + */ +int sptcreate(size_t size, struct seg **sptseg, struct anon_map *amp, + uint_t prot, uint_t flags, uint_t szc); +void sptdestroy(struct as *, struct anon_map *); +int segspt_shmattach(struct seg *, caddr_t *); + +#define isspt(sp) ((sp)->shm_sptinfo ? (sp)->shm_sptinfo->sptas : NULL) +#define spt_locked(a) ((a) & SHM_SHARE_MMU) +#define spt_pageable(a) ((a) & SHM_PAGEABLE) +#define spt_invalid(a) (spt_locked((a)) && spt_pageable((a))) + +/* + * This can be applied to a segment with seg->s_ops == &segspt_shmops + * to determine the real size of the ISM segment. + */ +#define spt_realsize(seg) (((struct spt_data *)(((struct shm_data *)\ + ((seg)->s_data))->shm_sptseg->s_data))->spt_realsize) + +/* + * This can be applied to a segment with seg->s_ops == &segspt_ops + * to determine the flags of the {D}ISM segment. + */ +#define spt_flags(seg) (((struct spt_data *)((seg)->s_data))->spt_flags) + +/* + * For large page support + */ +extern int segvn_anypgsz; + +#endif + +/* + * In a 64-bit address space, we'll try to put ISM segments between + * PREDISM_BASE and PREDISM_BOUND. The HAT may use these constants to + * predict that a VA is contained by an ISM segment, which may optimize + * translation. The range must _only_ be treated as advisory; ISM segments + * may fall outside of the range, and non-ISM segments may be contained + * within the range. + * In order to avoid collision between ISM/DISM addresses with e.g. + * process heap addresses we will try to put ISM/DISM segments above + * PREDISM_1T_BASESHIFT (1T). 
+ * The HAT is still expecting that any VA larger than PREDISM_BASESHIFT + * may belong to ISM/DISM (so on tlb miss it will probe first for 4M + * translation) + */ +#define PREDISM_BASESHIFT 33 +#define PREDISM_1T_BASESHIFT 40 +#define PREDISM_BASE ((uintptr_t)1 << PREDISM_BASESHIFT) +#define PREDISM_1T_BASE ((uintptr_t)1 << PREDISM_1T_BASESHIFT) +#define PREDISM_BOUND ((uintptr_t)1 << 63) + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_SEG_SPT_H */ diff --git a/usr/src/uts/common/vm/seg_vn.c b/usr/src/uts/common/vm/seg_vn.c new file mode 100644 index 0000000000..86e57227f8 --- /dev/null +++ b/usr/src/uts/common/vm/seg_vn.c @@ -0,0 +1,7745 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * VM - shared or copy-on-write from a vnode/anonymous memory. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/t_lock.h> +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/mman.h> +#include <sys/debug.h> +#include <sys/cred.h> +#include <sys/vmsystm.h> +#include <sys/tuneable.h> +#include <sys/bitmap.h> +#include <sys/swap.h> +#include <sys/kmem.h> +#include <sys/sysmacros.h> +#include <sys/vtrace.h> +#include <sys/cmn_err.h> +#include <sys/vm.h> +#include <sys/dumphdr.h> +#include <sys/lgrp.h> + +#include <vm/hat.h> +#include <vm/as.h> +#include <vm/seg.h> +#include <vm/seg_vn.h> +#include <vm/pvn.h> +#include <vm/anon.h> +#include <vm/page.h> +#include <vm/vpage.h> + +/* + * Private seg op routines. 
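+ *
+ * They are wired into the segvn_ops vector below and are normally
+ * reached via the generic SEGOP_*() wrappers in <vm/seg.h>, e.g.
+ * (illustrative):
+ *
+ *	SEGOP_FAULT(hat, seg, addr, len, type, rw)
+ *		-> (*seg->s_ops->fault)(...) -> segvn_fault(...)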
+ */ +static int segvn_dup(struct seg *seg, struct seg *newseg); +static int segvn_unmap(struct seg *seg, caddr_t addr, size_t len); +static void segvn_free(struct seg *seg); +static faultcode_t segvn_fault(struct hat *hat, struct seg *seg, + caddr_t addr, size_t len, enum fault_type type, + enum seg_rw rw); +static faultcode_t segvn_faulta(struct seg *seg, caddr_t addr); +static int segvn_setprot(struct seg *seg, caddr_t addr, + size_t len, uint_t prot); +static int segvn_checkprot(struct seg *seg, caddr_t addr, + size_t len, uint_t prot); +static int segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta); +static size_t segvn_swapout(struct seg *seg); +static int segvn_sync(struct seg *seg, caddr_t addr, size_t len, + int attr, uint_t flags); +static size_t segvn_incore(struct seg *seg, caddr_t addr, size_t len, + char *vec); +static int segvn_lockop(struct seg *seg, caddr_t addr, size_t len, + int attr, int op, ulong_t *lockmap, size_t pos); +static int segvn_getprot(struct seg *seg, caddr_t addr, size_t len, + uint_t *protv); +static u_offset_t segvn_getoffset(struct seg *seg, caddr_t addr); +static int segvn_gettype(struct seg *seg, caddr_t addr); +static int segvn_getvp(struct seg *seg, caddr_t addr, + struct vnode **vpp); +static int segvn_advise(struct seg *seg, caddr_t addr, size_t len, + uint_t behav); +static void segvn_dump(struct seg *seg); +static int segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, + struct page ***ppp, enum lock_type type, enum seg_rw rw); +static int segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, + uint_t szc); +static int segvn_getmemid(struct seg *seg, caddr_t addr, + memid_t *memidp); +static lgrp_mem_policy_info_t *segvn_getpolicy(struct seg *, caddr_t); + +struct seg_ops segvn_ops = { + segvn_dup, + segvn_unmap, + segvn_free, + segvn_fault, + segvn_faulta, + segvn_setprot, + segvn_checkprot, + segvn_kluster, + segvn_swapout, + segvn_sync, + segvn_incore, + segvn_lockop, + segvn_getprot, + segvn_getoffset, + segvn_gettype, + segvn_getvp, + segvn_advise, + segvn_dump, + segvn_pagelock, + segvn_setpagesize, + segvn_getmemid, + segvn_getpolicy, +}; + +/* + * Common zfod structures, provided as a shorthand for others to use. 
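+ *
+ * A caller that wants anonymous zero-fill-on-demand memory simply
+ * hands one of the *_argsp pointers below to as_map(), e.g.
+ * (illustrative):
+ *
+ *	(void) as_map(as, addr, len, segvn_create, zfod_argsp);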
+ */ +static segvn_crargs_t zfod_segvn_crargs = + SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL); +static segvn_crargs_t kzfod_segvn_crargs = + SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_USER, + PROT_ALL & ~PROT_USER); +static segvn_crargs_t stack_noexec_crargs = + SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_EXEC, PROT_ALL); + +caddr_t zfod_argsp = (caddr_t)&zfod_segvn_crargs; /* user zfod argsp */ +caddr_t kzfod_argsp = (caddr_t)&kzfod_segvn_crargs; /* kernel zfod argsp */ +caddr_t stack_exec_argsp = (caddr_t)&zfod_segvn_crargs; /* executable stack */ +caddr_t stack_noexec_argsp = (caddr_t)&stack_noexec_crargs; /* noexec stack */ + +#define vpgtob(n) ((n) * sizeof (struct vpage)) /* For brevity */ + +size_t segvn_comb_thrshld = UINT_MAX; /* patchable -- see 1196681 */ + +static int segvn_concat(struct seg *, struct seg *, int); +static int segvn_extend_prev(struct seg *, struct seg *, + struct segvn_crargs *, size_t); +static int segvn_extend_next(struct seg *, struct seg *, + struct segvn_crargs *, size_t); +static void segvn_softunlock(struct seg *, caddr_t, size_t, enum seg_rw); +static void segvn_pagelist_rele(page_t **); +static void segvn_setvnode_mpss(vnode_t *); +static void segvn_relocate_pages(page_t **, page_t *); +static int segvn_full_szcpages(page_t **, uint_t, int *, uint_t *); +static int segvn_fill_vp_pages(struct segvn_data *, vnode_t *, u_offset_t, + uint_t, page_t **, page_t **, uint_t *, int *); +static faultcode_t segvn_fault_vnodepages(struct hat *, struct seg *, caddr_t, + caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int); +static faultcode_t segvn_fault_anonpages(struct hat *, struct seg *, caddr_t, + caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int); +static faultcode_t segvn_faultpage(struct hat *, struct seg *, caddr_t, + u_offset_t, struct vpage *, page_t **, uint_t, + enum fault_type, enum seg_rw, int); +static void segvn_vpage(struct seg *); + +static void segvn_purge(struct seg *seg); +static int segvn_reclaim(struct seg *, caddr_t, size_t, struct page **, + enum seg_rw); + +static int sameprot(struct seg *, caddr_t, size_t); + +static int segvn_demote_range(struct seg *, caddr_t, size_t, int); +static int segvn_clrszc(struct seg *); +static struct seg *segvn_split_seg(struct seg *, caddr_t); +static int segvn_claim_pages(struct seg *, struct vpage *, u_offset_t, + ulong_t, uint_t); + +static struct kmem_cache *segvn_cache; + +#ifdef VM_STATS +static struct segvnvmstats_str { + ulong_t fill_vp_pages[31]; + ulong_t fltvnpages[49]; + ulong_t fullszcpages[10]; + ulong_t relocatepages[3]; + ulong_t fltanpages[17]; + ulong_t pagelock[3]; + ulong_t demoterange[3]; +} segvnvmstats; +#endif /* VM_STATS */ + +#define SDR_RANGE 1 /* demote entire range */ +#define SDR_END 2 /* demote non aligned ends only */ + +#define CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr) { \ + if ((len) != 0) { \ + lpgaddr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz); \ + ASSERT(lpgaddr >= (seg)->s_base); \ + lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)((addr) + \ + (len)), pgsz); \ + ASSERT(lpgeaddr > lpgaddr); \ + ASSERT(lpgeaddr <= (seg)->s_base + (seg)->s_size); \ + } else { \ + lpgeaddr = lpgaddr = (addr); \ + } \ + } + +/*ARGSUSED*/ +static int +segvn_cache_constructor(void *buf, void *cdrarg, int kmflags) +{ + struct segvn_data *svd = buf; + + rw_init(&svd->lock, NULL, RW_DEFAULT, NULL); + mutex_init(&svd->segp_slock, NULL, MUTEX_DEFAULT, NULL); + return (0); +} + +/*ARGSUSED1*/ +static void +segvn_cache_destructor(void *buf, void *cdrarg) +{ + struct segvn_data *svd = buf; + + 
rw_destroy(&svd->lock); + mutex_destroy(&svd->segp_slock); +} + +/* + * Patching this variable to non-zero allows the system to run with + * stacks marked as "not executable". It's a bit of a kludge, but is + * provided as a tweakable for platforms that export those ABIs + * (e.g. sparc V8) that have executable stacks enabled by default. + * There are also some restrictions for platforms that don't actually + * implement 'noexec' protections. + * + * Once enabled, the system is (therefore) unable to provide a fully + * ABI-compliant execution environment, though practically speaking, + * most everything works. The exceptions are generally some interpreters + * and debuggers that create executable code on the stack and jump + * into it (without explicitly mprotecting the address range to include + * PROT_EXEC). + * + * One important class of applications that are disabled are those + * that have been transformed into malicious agents using one of the + * numerous "buffer overflow" attacks. See 4007890. + */ +int noexec_user_stack = 0; +int noexec_user_stack_log = 1; + +int segvn_lpg_disable = 0; +uint_t segvn_maxpgszc = 0; + +ulong_t segvn_fltvnpages_clrszc_err; +ulong_t segvn_setpgsz_align_err; +ulong_t segvn_setpgsz_getattr_err; +ulong_t segvn_setpgsz_eof_err; +ulong_t segvn_faultvnmpss_align_err1; +ulong_t segvn_faultvnmpss_align_err2; +ulong_t segvn_faultvnmpss_align_err3; +ulong_t segvn_faultvnmpss_align_err4; +ulong_t segvn_faultvnmpss_align_err5; +ulong_t segvn_vmpss_pageio_deadlk_err; + +/* + * Initialize segvn data structures + */ +void +segvn_init(void) +{ + uint_t maxszc; + uint_t szc; + size_t pgsz; + + segvn_cache = kmem_cache_create("segvn_cache", + sizeof (struct segvn_data), 0, + segvn_cache_constructor, segvn_cache_destructor, NULL, + NULL, NULL, 0); + + if (segvn_lpg_disable != 0) + return; + szc = maxszc = page_num_pagesizes() - 1; + if (szc == 0) { + segvn_lpg_disable = 1; + return; + } + if (page_get_pagesize(0) != PAGESIZE) { + panic("segvn_init: bad szc 0"); + /*NOTREACHED*/ + } + while (szc != 0) { + pgsz = page_get_pagesize(szc); + if (pgsz <= PAGESIZE || !IS_P2ALIGNED(pgsz, pgsz)) { + panic("segvn_init: bad szc %d", szc); + /*NOTREACHED*/ + } + szc--; + } + if (segvn_maxpgszc == 0 || segvn_maxpgszc > maxszc) + segvn_maxpgszc = maxszc; +} + +#define SEGVN_PAGEIO ((void *)0x1) +#define SEGVN_NOPAGEIO ((void *)0x2) + +static void +segvn_setvnode_mpss(vnode_t *vp) +{ + int err; + + ASSERT(vp->v_mpssdata == NULL || + vp->v_mpssdata == SEGVN_PAGEIO || + vp->v_mpssdata == SEGVN_NOPAGEIO); + + if (vp->v_mpssdata == NULL) { + if (vn_vmpss_usepageio(vp)) { + err = VOP_PAGEIO(vp, (page_t *)NULL, + (u_offset_t)0, 0, 0, CRED()); + } else { + err = ENOSYS; + } + /* + * set v_mpssdata just once per vnode life + * so that it never changes. + */ + mutex_enter(&vp->v_lock); + if (vp->v_mpssdata == NULL) { + if (err == EINVAL) { + vp->v_mpssdata = SEGVN_PAGEIO; + } else { + vp->v_mpssdata = SEGVN_NOPAGEIO; + } + } + mutex_exit(&vp->v_lock); + } +} + +int +segvn_create(struct seg *seg, void *argsp) +{ + struct segvn_crargs *a = (struct segvn_crargs *)argsp; + struct segvn_data *svd; + size_t swresv = 0; + struct cred *cred; + struct anon_map *amp; + int error = 0; + size_t pgsz; + lgrp_mem_policy_t mpolicy = LGRP_MEM_POLICY_DEFAULT; + + + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); + + if (a->type != MAP_PRIVATE && a->type != MAP_SHARED) { + panic("segvn_create type"); + /*NOTREACHED*/ + } + + /* + * Check arguments. 
If a shared anon structure is given then + * it is illegal to also specify a vp. + */ + if (a->amp != NULL && a->vp != NULL) { + panic("segvn_create anon_map"); + /*NOTREACHED*/ + } + + /* MAP_NORESERVE on a MAP_SHARED segment is meaningless. */ + if (a->type == MAP_SHARED) + a->flags &= ~MAP_NORESERVE; + + if (a->szc != 0) { + if (segvn_lpg_disable != 0 || a->amp != NULL || + (a->type == MAP_SHARED && a->vp == NULL) || + (a->flags & MAP_NORESERVE) || seg->s_as == &kas) { + a->szc = 0; + } else { + if (a->szc > segvn_maxpgszc) + a->szc = segvn_maxpgszc; + pgsz = page_get_pagesize(a->szc); + if (!IS_P2ALIGNED(seg->s_base, pgsz) || + !IS_P2ALIGNED(seg->s_size, pgsz)) { + a->szc = 0; + } else if (a->vp != NULL) { + extern struct vnode kvp; + if (IS_SWAPFSVP(a->vp) || a->vp == &kvp) { + /* + * paranoid check. + * hat_page_demote() is not supported + * on swapfs pages. + */ + a->szc = 0; + } else if (map_addr_vacalign_check(seg->s_base, + a->offset & PAGEMASK)) { + a->szc = 0; + } + } + } + } + + /* + * If segment may need private pages, reserve them now. + */ + if (!(a->flags & MAP_NORESERVE) && ((a->vp == NULL && a->amp == NULL) || + (a->type == MAP_PRIVATE && (a->prot & PROT_WRITE)))) { + if (anon_resv(seg->s_size) == 0) + return (EAGAIN); + swresv = seg->s_size; + TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", + seg, swresv, 1); + } + + /* + * Reserve any mapping structures that may be required. + */ + hat_map(seg->s_as->a_hat, seg->s_base, seg->s_size, HAT_MAP); + + if (a->cred) { + cred = a->cred; + crhold(cred); + } else { + crhold(cred = CRED()); + } + + /* Inform the vnode of the new mapping */ + if (a->vp) { + error = VOP_ADDMAP(a->vp, a->offset & PAGEMASK, + seg->s_as, seg->s_base, seg->s_size, a->prot, + a->maxprot, a->type, cred); + if (error) { + if (swresv != 0) { + anon_unresv(swresv); + TRACE_3(TR_FAC_VM, TR_ANON_PROC, + "anon proc:%p %lu %u", + seg, swresv, 0); + } + crfree(cred); + hat_unload(seg->s_as->a_hat, seg->s_base, + seg->s_size, HAT_UNLOAD_UNMAP); + return (error); + } + } + + /* + * If more than one segment in the address space, and + * they're adjacent virtually, try to concatenate them. + * Don't concatenate if an explicit anon_map structure + * was supplied (e.g., SystemV shared memory). + */ + if (a->amp == NULL) { + struct seg *pseg, *nseg; + struct segvn_data *psvd, *nsvd; + lgrp_mem_policy_t ppolicy, npolicy; + uint_t lgrp_mem_policy_flags = 0; + extern lgrp_mem_policy_t lgrp_mem_default_policy; + + /* + * Memory policy flags (lgrp_mem_policy_flags) is valid when + * extending stack/heap segments. + */ + if ((a->vp == NULL) && (a->type == MAP_PRIVATE) && + !(a->flags & MAP_NORESERVE) && (seg->s_as != &kas)) { + lgrp_mem_policy_flags = a->lgrp_mem_policy_flags; + } else { + /* + * Get policy when not extending it from another segment + */ + mpolicy = lgrp_mem_policy_default(seg->s_size, a->type); + } + + /* + * First, try to concatenate the previous and new segments + */ + pseg = AS_SEGPREV(seg->s_as, seg); + if (pseg != NULL && + pseg->s_base + pseg->s_size == seg->s_base && + pseg->s_ops == &segvn_ops) { + /* + * Get memory allocation policy from previous segment. + * When extension is specified (e.g. for heap) apply + * this policy to the new segment regardless of the + * outcome of segment concatenation. Extension occurs + * for non-default policy otherwise default policy is + * used and is based on extended segment size. 
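+ * That is, when extending upward (LGRP_MP_FLAG_EXTEND_UP): if the
+ * previous segment carries a non-default policy the new piece inherits
+ * it, otherwise the default policy is recomputed for the combined size
+ * of the two segments.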
+ */ + psvd = (struct segvn_data *)pseg->s_data; + ppolicy = psvd->policy_info.mem_policy; + if (lgrp_mem_policy_flags == + LGRP_MP_FLAG_EXTEND_UP) { + if (ppolicy != lgrp_mem_default_policy) { + mpolicy = ppolicy; + } else { + mpolicy = lgrp_mem_policy_default( + pseg->s_size + seg->s_size, + a->type); + } + } + + if (mpolicy == ppolicy && + (pseg->s_size + seg->s_size <= + segvn_comb_thrshld || psvd->amp == NULL) && + segvn_extend_prev(pseg, seg, a, swresv) == 0) { + /* + * success! now try to concatenate + * with following seg + */ + crfree(cred); + nseg = AS_SEGNEXT(pseg->s_as, pseg); + if (nseg != NULL && + nseg != pseg && + nseg->s_ops == &segvn_ops && + pseg->s_base + pseg->s_size == + nseg->s_base) + (void) segvn_concat(pseg, nseg, 0); + ASSERT(pseg->s_szc == 0 || + (a->szc == pseg->s_szc && + IS_P2ALIGNED(pseg->s_base, pgsz) && + IS_P2ALIGNED(pseg->s_size, pgsz))); + return (0); + } + } + + /* + * Failed, so try to concatenate with following seg + */ + nseg = AS_SEGNEXT(seg->s_as, seg); + if (nseg != NULL && + seg->s_base + seg->s_size == nseg->s_base && + nseg->s_ops == &segvn_ops) { + /* + * Get memory allocation policy from next segment. + * When extension is specified (e.g. for stack) apply + * this policy to the new segment regardless of the + * outcome of segment concatenation. Extension occurs + * for non-default policy otherwise default policy is + * used and is based on extended segment size. + */ + nsvd = (struct segvn_data *)nseg->s_data; + npolicy = nsvd->policy_info.mem_policy; + if (lgrp_mem_policy_flags == + LGRP_MP_FLAG_EXTEND_DOWN) { + if (npolicy != lgrp_mem_default_policy) { + mpolicy = npolicy; + } else { + mpolicy = lgrp_mem_policy_default( + nseg->s_size + seg->s_size, + a->type); + } + } + + if (mpolicy == npolicy && + segvn_extend_next(seg, nseg, a, swresv) == 0) { + crfree(cred); + ASSERT(nseg->s_szc == 0 || + (a->szc == nseg->s_szc && + IS_P2ALIGNED(nseg->s_base, pgsz) && + IS_P2ALIGNED(nseg->s_size, pgsz))); + return (0); + } + } + } + + if (a->vp != NULL) { + VN_HOLD(a->vp); + if (a->type == MAP_SHARED) + lgrp_shm_policy_init(NULL, a->vp); + } + svd = kmem_cache_alloc(segvn_cache, KM_SLEEP); + + seg->s_ops = &segvn_ops; + seg->s_data = (void *)svd; + seg->s_szc = a->szc; + + svd->vp = a->vp; + /* + * Anonymous mappings have no backing file so the offset is meaningless. + */ + svd->offset = a->vp ? (a->offset & PAGEMASK) : 0; + svd->prot = a->prot; + svd->maxprot = a->maxprot; + svd->pageprot = 0; + svd->type = a->type; + svd->vpage = NULL; + svd->cred = cred; + svd->advice = MADV_NORMAL; + svd->pageadvice = 0; + svd->flags = (ushort_t)a->flags; + svd->softlockcnt = 0; + if (a->szc != 0 && a->vp != NULL) { + segvn_setvnode_mpss(a->vp); + } + + amp = a->amp; + if ((svd->amp = amp) == NULL) { + svd->anon_index = 0; + if (svd->type == MAP_SHARED) { + svd->swresv = 0; + /* + * Shared mappings to a vp need no other setup. + * If we have a shared mapping to an anon_map object + * which hasn't been allocated yet, allocate the + * struct now so that it will be properly shared + * by remembering the swap reservation there. + */ + if (a->vp == NULL) { + svd->amp = anonmap_alloc(seg->s_size, swresv); + svd->amp->a_szc = seg->s_szc; + } + } else { + /* + * Private mapping (with or without a vp). + * Allocate anon_map when needed. + */ + svd->swresv = swresv; + } + } else { + pgcnt_t anon_num; + + /* + * Mapping to an existing anon_map structure without a vp. + * For now we will insure that the segment size isn't larger + * than the size - offset gives us. 
Later on we may wish to + * have the anon array dynamically allocated itself so that + * we don't always have to allocate all the anon pointer slots. + * This of course involves adding extra code to check that we + * aren't trying to use an anon pointer slot beyond the end + * of the currently allocated anon array. + */ + if ((amp->size - a->offset) < seg->s_size) { + panic("segvn_create anon_map size"); + /*NOTREACHED*/ + } + + anon_num = btopr(a->offset); + + if (a->type == MAP_SHARED) { + /* + * SHARED mapping to a given anon_map. + */ + ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); + amp->refcnt++; + ANON_LOCK_EXIT(&->a_rwlock); + svd->anon_index = anon_num; + svd->swresv = 0; + } else { + /* + * PRIVATE mapping to a given anon_map. + * Make sure that all the needed anon + * structures are created (so that we will + * share the underlying pages if nothing + * is written by this mapping) and then + * duplicate the anon array as is done + * when a privately mapped segment is dup'ed. + */ + struct anon *ap; + caddr_t addr; + caddr_t eaddr; + ulong_t anon_idx; + int hat_flag = HAT_LOAD; + + if (svd->flags & MAP_TEXT) { + hat_flag |= HAT_LOAD_TEXT; + } + + svd->amp = anonmap_alloc(seg->s_size, 0); + svd->amp->a_szc = seg->s_szc; + svd->anon_index = 0; + svd->swresv = swresv; + + /* + * Prevent 2 threads from allocating anon + * slots simultaneously. + */ + ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); + eaddr = seg->s_base + seg->s_size; + + for (anon_idx = anon_num, addr = seg->s_base; + addr < eaddr; addr += PAGESIZE, anon_idx++) { + page_t *pp; + + if ((ap = anon_get_ptr(amp->ahp, + anon_idx)) != NULL) + continue; + + /* + * Allocate the anon struct now. + * Might as well load up translation + * to the page while we're at it... + */ + pp = anon_zero(seg, addr, &ap, cred); + if (ap == NULL || pp == NULL) { + panic("segvn_create anon_zero"); + /*NOTREACHED*/ + } + + /* + * Re-acquire the anon_map lock and + * initialize the anon array entry. + */ + ASSERT(anon_get_ptr(amp->ahp, + anon_idx) == NULL); + (void) anon_set_ptr(amp->ahp, anon_idx, ap, + ANON_SLEEP); + + ASSERT(seg->s_szc == 0); + ASSERT(!IS_VMODSORT(pp->p_vnode)); + + hat_memload(seg->s_as->a_hat, addr, pp, + svd->prot & ~PROT_WRITE, hat_flag); + + page_unlock(pp); + } + ASSERT(seg->s_szc == 0); + anon_dup(amp->ahp, anon_num, svd->amp->ahp, + 0, seg->s_size); + ANON_LOCK_EXIT(&->a_rwlock); + } + } + + /* + * Set default memory allocation policy for segment + * + * Always set policy for private memory at least for initialization + * even if this is a shared memory segment + */ + (void) lgrp_privm_policy_set(mpolicy, &svd->policy_info, seg->s_size); + + if (svd->type == MAP_SHARED) + (void) lgrp_shm_policy_set(mpolicy, svd->amp, svd->anon_index, + svd->vp, svd->offset, seg->s_size); + + return (0); +} + +/* + * Concatenate two existing segments, if possible. + * Return 0 on success, -1 if two segments are not compatible + * or -2 on memory allocation failure. + * If private == 1 then try and concat segments with private pages. 
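+ * "Compatible" means the segments share the same vnode, protections,
+ * type, credentials, flags, page size and memory policy, and that the
+ * second segment has no SOFTLOCKed pages; see the incompat() checks
+ * below.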
+ */ +static int +segvn_concat(struct seg *seg1, struct seg *seg2, int private) +{ + struct segvn_data *svd1 = seg1->s_data; + struct segvn_data *svd2 = seg2->s_data; + struct anon_map *amp1 = svd1->amp; + struct anon_map *amp2 = svd2->amp; + struct vpage *vpage1 = svd1->vpage; + struct vpage *vpage2 = svd2->vpage, *nvpage = NULL; + size_t size, nvpsize; + pgcnt_t npages1, npages2; + + ASSERT(seg1->s_as && seg2->s_as && seg1->s_as == seg2->s_as); + ASSERT(AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock)); + ASSERT(seg1->s_ops == seg2->s_ops); + + /* both segments exist, try to merge them */ +#define incompat(x) (svd1->x != svd2->x) + if (incompat(vp) || incompat(maxprot) || + (!svd1->pageadvice && !svd2->pageadvice && incompat(advice)) || + (!svd1->pageprot && !svd2->pageprot && incompat(prot)) || + incompat(type) || incompat(cred) || incompat(flags) || + seg1->s_szc != seg2->s_szc || incompat(policy_info.mem_policy) || + (svd2->softlockcnt > 0)) + return (-1); +#undef incompat + + /* + * vp == NULL implies zfod, offset doesn't matter + */ + if (svd1->vp != NULL && + svd1->offset + seg1->s_size != svd2->offset) { + return (-1); + } + + /* + * Fail early if we're not supposed to concatenate + * private pages. + */ + if ((private == 0 || svd1->type != MAP_PRIVATE) && + (amp1 != NULL || amp2 != NULL)) { + return (-1); + } + + /* + * If either seg has vpages, create a new merged vpage array. + */ + if (vpage1 != NULL || vpage2 != NULL) { + struct vpage *vp; + + npages1 = seg_pages(seg1); + npages2 = seg_pages(seg2); + nvpsize = vpgtob(npages1 + npages2); + + if ((nvpage = kmem_zalloc(nvpsize, KM_NOSLEEP)) == NULL) { + return (-2); + } + if (vpage1 != NULL) { + bcopy(vpage1, nvpage, vpgtob(npages1)); + } + if (vpage2 != NULL) { + bcopy(vpage2, nvpage + npages1, vpgtob(npages2)); + } + for (vp = nvpage; vp < nvpage + npages1; vp++) { + if (svd2->pageprot && !svd1->pageprot) { + VPP_SETPROT(vp, svd1->prot); + } + if (svd2->pageadvice && !svd1->pageadvice) { + VPP_SETADVICE(vp, svd1->advice); + } + } + for (vp = nvpage + npages1; + vp < nvpage + npages1 + npages2; vp++) { + if (svd1->pageprot && !svd2->pageprot) { + VPP_SETPROT(vp, svd2->prot); + } + if (svd1->pageadvice && !svd2->pageadvice) { + VPP_SETADVICE(vp, svd2->advice); + } + } + } + + /* + * If either segment has private pages, create a new merged anon + * array. + */ + if (amp1 != NULL || amp2 != NULL) { + struct anon_hdr *nahp; + struct anon_map *namp = NULL; + size_t asize = seg1->s_size + seg2->s_size; + + if ((nahp = anon_create(btop(asize), ANON_NOSLEEP)) == NULL) { + if (nvpage != NULL) { + kmem_free(nvpage, nvpsize); + } + return (-2); + } + if (amp1 != NULL) { + /* + * XXX anon rwlock is not really needed because + * this is a private segment and we are writers. 
+ */ + ANON_LOCK_ENTER(&1->a_rwlock, RW_WRITER); + ASSERT(amp1->refcnt == 1); + if (anon_copy_ptr(amp1->ahp, svd1->anon_index, + nahp, 0, btop(seg1->s_size), ANON_NOSLEEP)) { + anon_release(nahp, btop(asize)); + ANON_LOCK_EXIT(&1->a_rwlock); + if (nvpage != NULL) { + kmem_free(nvpage, nvpsize); + } + return (-2); + } + } + if (amp2 != NULL) { + ANON_LOCK_ENTER(&2->a_rwlock, RW_WRITER); + ASSERT(amp2->refcnt == 1); + if (anon_copy_ptr(amp2->ahp, svd2->anon_index, + nahp, btop(seg1->s_size), btop(seg2->s_size), + ANON_NOSLEEP)) { + anon_release(nahp, btop(asize)); + ANON_LOCK_EXIT(&2->a_rwlock); + if (amp1 != NULL) { + ANON_LOCK_EXIT(&1->a_rwlock); + } + if (nvpage != NULL) { + kmem_free(nvpage, nvpsize); + } + return (-2); + } + } + if (amp1 != NULL) { + namp = amp1; + anon_release(amp1->ahp, btop(amp1->size)); + } + if (amp2 != NULL) { + if (namp == NULL) { + ASSERT(amp1 == NULL); + namp = amp2; + anon_release(amp2->ahp, btop(amp2->size)); + } else { + amp2->refcnt--; + ANON_LOCK_EXIT(&2->a_rwlock); + anonmap_free(amp2); + } + svd2->amp = NULL; /* needed for seg_free */ + } + namp->ahp = nahp; + namp->size = asize; + svd1->amp = namp; + svd1->anon_index = 0; + ANON_LOCK_EXIT(&namp->a_rwlock); + } + /* + * Now free the old vpage structures. + */ + if (nvpage != NULL) { + if (vpage1 != NULL) { + kmem_free(vpage1, vpgtob(npages1)); + } + if (vpage2 != NULL) { + svd2->vpage = NULL; + kmem_free(vpage2, vpgtob(npages2)); + } + if (svd2->pageprot) { + svd1->pageprot = 1; + } + if (svd2->pageadvice) { + svd1->pageadvice = 1; + } + svd1->vpage = nvpage; + } + + /* all looks ok, merge segments */ + svd1->swresv += svd2->swresv; + svd2->swresv = 0; /* so seg_free doesn't release swap space */ + size = seg2->s_size; + seg_free(seg2); + seg1->s_size += size; + return (0); +} + +/* + * Extend the previous segment (seg1) to include the + * new segment (seg2 + a), if possible. + * Return 0 on success. + */ +static int +segvn_extend_prev(seg1, seg2, a, swresv) + struct seg *seg1, *seg2; + struct segvn_crargs *a; + size_t swresv; +{ + struct segvn_data *svd1 = (struct segvn_data *)seg1->s_data; + size_t size; + struct anon_map *amp1; + struct vpage *new_vpage; + + /* + * We don't need any segment level locks for "segvn" data + * since the address space is "write" locked. + */ + ASSERT(seg1->s_as && AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock)); + + /* second segment is new, try to extend first */ + /* XXX - should also check cred */ + if (svd1->vp != a->vp || svd1->maxprot != a->maxprot || + (!svd1->pageprot && (svd1->prot != a->prot)) || + svd1->type != a->type || svd1->flags != a->flags || + seg1->s_szc != a->szc) + return (-1); + + /* vp == NULL implies zfod, offset doesn't matter */ + if (svd1->vp != NULL && + svd1->offset + seg1->s_size != (a->offset & PAGEMASK)) + return (-1); + + amp1 = svd1->amp; + if (amp1) { + pgcnt_t newpgs; + + /* + * Segment has private pages, can data structures + * be expanded? + * + * Acquire the anon_map lock to prevent it from changing, + * if it is shared. This ensures that the anon_map + * will not change while a thread which has a read/write + * lock on an address space references it. + * XXX - Don't need the anon_map lock at all if "refcnt" + * is 1. + * + * Can't grow a MAP_SHARED segment with an anonmap because + * there may be existing anon slots where we want to extend + * the segment and we wouldn't know what to do with them + * (e.g., for tmpfs right thing is to just leave them there, + * for /dev/zero they should be cleared out). 
+ */
+ if (svd1->type == MAP_SHARED)
+ return (-1);
+
+ ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
+ if (amp1->refcnt > 1) {
+ ANON_LOCK_EXIT(&amp1->a_rwlock);
+ return (-1);
+ }
+ newpgs = anon_grow(amp1->ahp, &svd1->anon_index,
+ btop(seg1->s_size), btop(seg2->s_size), ANON_NOSLEEP);
+
+ if (newpgs == 0) {
+ ANON_LOCK_EXIT(&amp1->a_rwlock);
+ return (-1);
+ }
+ amp1->size = ptob(newpgs);
+ ANON_LOCK_EXIT(&amp1->a_rwlock);
+ }
+ if (svd1->vpage != NULL) {
+ new_vpage =
+ kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)),
+ KM_NOSLEEP);
+ if (new_vpage == NULL)
+ return (-1);
+ bcopy(svd1->vpage, new_vpage, vpgtob(seg_pages(seg1)));
+ kmem_free(svd1->vpage, vpgtob(seg_pages(seg1)));
+ svd1->vpage = new_vpage;
+ if (svd1->pageprot) {
+ struct vpage *vp, *evp;
+
+ vp = new_vpage + seg_pages(seg1);
+ evp = vp + seg_pages(seg2);
+ for (; vp < evp; vp++)
+ VPP_SETPROT(vp, a->prot);
+ }
+ }
+ size = seg2->s_size;
+ seg_free(seg2);
+ seg1->s_size += size;
+ svd1->swresv += swresv;
+ return (0);
+}
+
+/*
+ * Extend the next segment (seg2) to include the
+ * new segment (seg1 + a), if possible.
+ * Return 0 on success.
+ */
+static int
+segvn_extend_next(
+ struct seg *seg1,
+ struct seg *seg2,
+ struct segvn_crargs *a,
+ size_t swresv)
+{
+ struct segvn_data *svd2 = (struct segvn_data *)seg2->s_data;
+ size_t size;
+ struct anon_map *amp2;
+ struct vpage *new_vpage;
+
+ /*
+ * We don't need any segment level locks for "segvn" data
+ * since the address space is "write" locked.
+ */
+ ASSERT(seg2->s_as && AS_WRITE_HELD(seg2->s_as, &seg2->s_as->a_lock));
+
+ /* first segment is new, try to extend second */
+ /* XXX - should also check cred */
+ if (svd2->vp != a->vp || svd2->maxprot != a->maxprot ||
+ (!svd2->pageprot && (svd2->prot != a->prot)) ||
+ svd2->type != a->type || svd2->flags != a->flags ||
+ seg2->s_szc != a->szc)
+ return (-1);
+ /* vp == NULL implies zfod, offset doesn't matter */
+ if (svd2->vp != NULL &&
+ (a->offset & PAGEMASK) + seg1->s_size != svd2->offset)
+ return (-1);
+
+ amp2 = svd2->amp;
+ if (amp2) {
+ pgcnt_t newpgs;
+
+ /*
+ * Segment has private pages, can data structures
+ * be expanded?
+ *
+ * Acquire the anon_map lock to prevent it from changing,
+ * if it is shared. This ensures that the anon_map
+ * will not change while a thread which has a read/write
+ * lock on an address space references it.
+ *
+ * XXX - Don't need the anon_map lock at all if "refcnt"
+ * is 1.
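+ *
+ * Since the segment is being extended toward lower addresses, anon_grow()
+ * is called with ANON_GROWDOWN below, which shifts anon_index so the
+ * existing slots keep their positions relative to the new, lower base.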
+ */
+ if (svd2->type == MAP_SHARED)
+ return (-1);
+
+ ANON_LOCK_ENTER(&amp2->a_rwlock, RW_WRITER);
+ if (amp2->refcnt > 1) {
+ ANON_LOCK_EXIT(&amp2->a_rwlock);
+ return (-1);
+ }
+ newpgs = anon_grow(amp2->ahp, &svd2->anon_index,
+ btop(seg2->s_size), btop(seg1->s_size),
+ ANON_NOSLEEP | ANON_GROWDOWN);
+
+ if (newpgs == 0) {
+ ANON_LOCK_EXIT(&amp2->a_rwlock);
+ return (-1);
+ }
+ amp2->size = ptob(newpgs);
+ ANON_LOCK_EXIT(&amp2->a_rwlock);
+ }
+ if (svd2->vpage != NULL) {
+ new_vpage =
+ kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)),
+ KM_NOSLEEP);
+ if (new_vpage == NULL) {
+ /* Not merging segments so adjust anon_index back */
+ if (amp2)
+ svd2->anon_index += seg_pages(seg1);
+ return (-1);
+ }
+ bcopy(svd2->vpage, new_vpage + seg_pages(seg1),
+ vpgtob(seg_pages(seg2)));
+ kmem_free(svd2->vpage, vpgtob(seg_pages(seg2)));
+ svd2->vpage = new_vpage;
+ if (svd2->pageprot) {
+ struct vpage *vp, *evp;
+
+ vp = new_vpage;
+ evp = vp + seg_pages(seg1);
+ for (; vp < evp; vp++)
+ VPP_SETPROT(vp, a->prot);
+ }
+ }
+ size = seg1->s_size;
+ seg_free(seg1);
+ seg2->s_size += size;
+ seg2->s_base -= size;
+ svd2->offset -= size;
+ svd2->swresv += swresv;
+ return (0);
+}
+
+static int
+segvn_dup(struct seg *seg, struct seg *newseg)
+{
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+ struct segvn_data *newsvd;
+ pgcnt_t npages = seg_pages(seg);
+ int error = 0;
+ uint_t prot;
+ size_t len;
+
+ ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ /*
+ * If segment has anon reserved, reserve more for the new seg.
+ * For a MAP_NORESERVE segment swresv will be a count of all the
+ * allocated anon slots; thus we reserve for the child as many slots
+ * as the parent has allocated. This semantic prevents the child or
+ * parent from dying during a copy-on-write fault caused by trying
+ * to write a shared pre-existing anon page.
+ */
+ if ((len = svd->swresv) != 0) {
+ if (anon_resv(svd->swresv) == 0)
+ return (ENOMEM);
+
+ TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
+ seg, len, 0);
+ }
+
+ newsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
+
+ newseg->s_ops = &segvn_ops;
+ newseg->s_data = (void *)newsvd;
+ newseg->s_szc = seg->s_szc;
+
+ if ((newsvd->vp = svd->vp) != NULL) {
+ VN_HOLD(svd->vp);
+ if (svd->type == MAP_SHARED)
+ lgrp_shm_policy_init(NULL, svd->vp);
+ }
+ newsvd->offset = svd->offset;
+ newsvd->prot = svd->prot;
+ newsvd->maxprot = svd->maxprot;
+ newsvd->pageprot = svd->pageprot;
+ newsvd->type = svd->type;
+ newsvd->cred = svd->cred;
+ crhold(newsvd->cred);
+ newsvd->advice = svd->advice;
+ newsvd->pageadvice = svd->pageadvice;
+ newsvd->swresv = svd->swresv;
+ newsvd->flags = svd->flags;
+ newsvd->softlockcnt = 0;
+ newsvd->policy_info = svd->policy_info;
+ if ((newsvd->amp = svd->amp) == NULL) {
+ /*
+ * Not attaching to a shared anon object.
+ */
+ newsvd->anon_index = 0;
+ } else {
+ struct anon_map *amp;
+
+ amp = svd->amp;
+ if (svd->type == MAP_SHARED) {
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
+ amp->refcnt++;
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ newsvd->anon_index = svd->anon_index;
+ } else {
+ int reclaim = 1;
+
+ /*
+ * Allocate and initialize new anon_map structure.
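+ * The copy starts out empty; the code below either breaks COW sharing
+ * page by page (when the parent has SOFTLOCKed pages) or duplicates the
+ * parent's anon pointers wholesale via anon_dup() and
+ * anon_dup_fill_holes().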
+ */ + newsvd->amp = anonmap_alloc(newseg->s_size, 0); + newsvd->amp->a_szc = newseg->s_szc; + newsvd->anon_index = 0; + + /* + * We don't have to acquire the anon_map lock + * for the new segment (since it belongs to an + * address space that is still not associated + * with any process), or the segment in the old + * address space (since all threads in it + * are stopped while duplicating the address space). + */ + + /* + * The goal of the following code is to make sure that + * softlocked pages do not end up as copy on write + * pages. This would cause problems where one + * thread writes to a page that is COW and a different + * thread in the same process has softlocked it. The + * softlock lock would move away from this process + * because the write would cause this process to get + * a copy (without the softlock). + * + * The strategy here is to just break the + * sharing on pages that could possibly be + * softlocked. + */ +retry: + if (svd->softlockcnt) { + struct anon *ap, *newap; + size_t i; + uint_t vpprot; + page_t *anon_pl[1+1], *pp; + caddr_t addr; + ulong_t anon_idx = 0; + + /* + * The softlock count might be non zero + * because some pages are still stuck in the + * cache for lazy reclaim. Flush the cache + * now. This should drop the count to zero. + * [or there is really I/O going on to these + * pages]. Note, we have the writers lock so + * nothing gets inserted during the flush. + */ + if (reclaim == 1) { + segvn_purge(seg); + reclaim = 0; + goto retry; + } + i = btopr(seg->s_size); + addr = seg->s_base; + /* + * XXX break cow sharing using PAGESIZE + * pages. They will be relocated into larger + * pages at fault time. + */ + while (i-- > 0) { + if (ap = anon_get_ptr(amp->ahp, + anon_idx)) { + error = anon_getpage(&ap, + &vpprot, anon_pl, PAGESIZE, + seg, addr, S_READ, + svd->cred); + if (error) { + newsvd->vpage = NULL; + goto out; + } + /* + * prot need not be computed + * below 'cause anon_private is + * going to ignore it anyway + * as child doesn't inherit + * pagelock from parent. + */ + prot = svd->pageprot ? + VPP_PROT( + &svd->vpage[ + seg_page(seg, addr)]) + : svd->prot; + pp = anon_private(&newap, + newseg, addr, prot, + anon_pl[0], 0, + newsvd->cred); + if (pp == NULL) { + /* no mem abort */ + newsvd->vpage = NULL; + error = ENOMEM; + goto out; + } + (void) anon_set_ptr( + newsvd->amp->ahp, anon_idx, + newap, ANON_SLEEP); + page_unlock(pp); + } + addr += PAGESIZE; + anon_idx++; + } + } else { /* common case */ + if (seg->s_szc != 0) { + /* + * If at least one of anon slots of a + * large page exists then make sure + * all anon slots of a large page + * exist to avoid partial cow sharing + * of a large page in the future. + */ + anon_dup_fill_holes(amp->ahp, + svd->anon_index, newsvd->amp->ahp, + 0, seg->s_size, seg->s_szc, + svd->vp != NULL); + } else { + anon_dup(amp->ahp, svd->anon_index, + newsvd->amp->ahp, 0, seg->s_size); + } + + hat_clrattr(seg->s_as->a_hat, seg->s_base, + seg->s_size, PROT_WRITE); + } + } + } + /* + * If necessary, create a vpage structure for the new segment. + * Do not copy any page lock indications. 
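+ * Page locks are not inherited by the child, so VPP_CLRPPLOCK() is
+ * applied to every vpage entry as it is copied.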
+ */ + if (svd->vpage != NULL) { + uint_t i; + struct vpage *ovp = svd->vpage; + struct vpage *nvp; + + nvp = newsvd->vpage = + kmem_alloc(vpgtob(npages), KM_SLEEP); + for (i = 0; i < npages; i++) { + *nvp = *ovp++; + VPP_CLRPPLOCK(nvp++); + } + } else + newsvd->vpage = NULL; + + /* Inform the vnode of the new mapping */ + if (newsvd->vp != NULL) { + error = VOP_ADDMAP(newsvd->vp, (offset_t)newsvd->offset, + newseg->s_as, newseg->s_base, newseg->s_size, newsvd->prot, + newsvd->maxprot, newsvd->type, newsvd->cred); + } +out: + return (error); +} + + +/* + * callback function used by segvn_unmap to invoke free_vp_pages() for only + * those pages actually processed by the HAT + */ +extern int free_pages; + +static void +segvn_hat_unload_callback(hat_callback_t *cb) +{ + struct seg *seg = cb->hcb_data; + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + size_t len; + u_offset_t off; + + ASSERT(svd->vp != NULL); + ASSERT(cb->hcb_end_addr > cb->hcb_start_addr); + ASSERT(cb->hcb_start_addr >= seg->s_base); + + len = cb->hcb_end_addr - cb->hcb_start_addr; + off = cb->hcb_start_addr - seg->s_base; + free_vp_pages(svd->vp, svd->offset + off, len); +} + + +static int +segvn_unmap(struct seg *seg, caddr_t addr, size_t len) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + struct segvn_data *nsvd; + struct seg *nseg; + struct anon_map *amp; + pgcnt_t opages; /* old segment size in pages */ + pgcnt_t npages; /* new segment size in pages */ + pgcnt_t dpages; /* pages being deleted (unmapped) */ + hat_callback_t callback; /* used for free_vp_pages() */ + hat_callback_t *cbp = NULL; + caddr_t nbase; + size_t nsize; + size_t oswresv; + int reclaim = 1; + + /* + * We don't need any segment level locks for "segvn" data + * since the address space is "write" locked. + */ + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); + + /* + * Fail the unmap if pages are SOFTLOCKed through this mapping. + * softlockcnt is protected from change by the as write lock. + */ +retry: + if (svd->softlockcnt > 0) { + /* + * since we do have the writers lock nobody can fill + * the cache during the purge. The flush either succeeds + * or we still have pending I/Os. + */ + if (reclaim == 1) { + segvn_purge(seg); + reclaim = 0; + goto retry; + } + return (EAGAIN); + } + + /* + * Check for bad sizes + */ + if (addr < seg->s_base || addr + len > seg->s_base + seg->s_size || + (len & PAGEOFFSET) || ((uintptr_t)addr & PAGEOFFSET)) { + panic("segvn_unmap"); + /*NOTREACHED*/ + } + + if (seg->s_szc != 0) { + size_t pgsz = page_get_pagesize(seg->s_szc); + int err; + if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { + ASSERT(seg->s_base != addr || seg->s_size != len); + VM_STAT_ADD(segvnvmstats.demoterange[0]); + err = segvn_demote_range(seg, addr, len, SDR_END); + if (err == 0) { + return (IE_RETRY); + } + return (err); + } + } + + /* Inform the vnode of the unmapping. */ + if (svd->vp) { + int error; + + error = VOP_DELMAP(svd->vp, + (offset_t)svd->offset + (uintptr_t)(addr - seg->s_base), + seg->s_as, addr, len, svd->prot, svd->maxprot, + svd->type, svd->cred); + + if (error == EAGAIN) + return (error); + } + /* + * Remove any page locks set through this mapping. + */ + (void) segvn_lockop(seg, addr, len, 0, MC_UNLOCK, NULL, 0); + + /* + * Unload any hardware translations in the range to be taken out. + * Use a callback to invoke free_vp_pages() effectively. 
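+ * The callback (segvn_hat_unload_callback() above) releases the backing
+ * vnode pages only for the ranges the HAT actually processed; it is
+ * skipped for anonymous segments and when free_pages is zero.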
+ */ + if (svd->vp != NULL && free_pages != 0) { + callback.hcb_data = seg; + callback.hcb_function = segvn_hat_unload_callback; + cbp = &callback; + } + hat_unload_callback(seg->s_as->a_hat, addr, len, HAT_UNLOAD_UNMAP, cbp); + + /* + * Check for entire segment + */ + if (addr == seg->s_base && len == seg->s_size) { + seg_free(seg); + return (0); + } + + opages = seg_pages(seg); + dpages = btop(len); + npages = opages - dpages; + amp = svd->amp; + + /* + * Check for beginning of segment + */ + if (addr == seg->s_base) { + if (svd->vpage != NULL) { + size_t nbytes; + struct vpage *ovpage; + + ovpage = svd->vpage; /* keep pointer to vpage */ + + nbytes = vpgtob(npages); + svd->vpage = kmem_alloc(nbytes, KM_SLEEP); + bcopy(&ovpage[dpages], svd->vpage, nbytes); + + /* free up old vpage */ + kmem_free(ovpage, vpgtob(opages)); + } + if (amp != NULL) { + ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); + if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { + /* + * Free up now unused parts of anon_map array. + */ + if (seg->s_szc != 0) { + anon_free_pages(amp->ahp, + svd->anon_index, len, seg->s_szc); + } else { + anon_free(amp->ahp, svd->anon_index, + len); + } + + /* + * Unreserve swap space for the unmapped chunk + * of this segment in case it's MAP_SHARED + */ + if (svd->type == MAP_SHARED) { + anon_unresv(len); + amp->swresv -= len; + } + } + ANON_LOCK_EXIT(&->a_rwlock); + svd->anon_index += dpages; + } + if (svd->vp != NULL) + svd->offset += len; + + if (svd->swresv) { + if (svd->flags & MAP_NORESERVE) { + ASSERT(amp); + oswresv = svd->swresv; + + svd->swresv = ptob(anon_pages(amp->ahp, + svd->anon_index, npages)); + anon_unresv(oswresv - svd->swresv); + } else { + anon_unresv(len); + svd->swresv -= len; + } + TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", + seg, len, 0); + } + + seg->s_base += len; + seg->s_size -= len; + return (0); + } + + /* + * Check for end of segment + */ + if (addr + len == seg->s_base + seg->s_size) { + if (svd->vpage != NULL) { + size_t nbytes; + struct vpage *ovpage; + + ovpage = svd->vpage; /* keep pointer to vpage */ + + nbytes = vpgtob(npages); + svd->vpage = kmem_alloc(nbytes, KM_SLEEP); + bcopy(ovpage, svd->vpage, nbytes); + + /* free up old vpage */ + kmem_free(ovpage, vpgtob(opages)); + + } + if (amp != NULL) { + ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); + if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { + /* + * Free up now unused parts of anon_map array + */ + if (seg->s_szc != 0) { + ulong_t an_idx = svd->anon_index + + npages; + anon_free_pages(amp->ahp, an_idx, + len, seg->s_szc); + } else { + anon_free(amp->ahp, + svd->anon_index + npages, len); + } + /* + * Unreserve swap space for the unmapped chunk + * of this segment in case it's MAP_SHARED + */ + if (svd->type == MAP_SHARED) { + anon_unresv(len); + amp->swresv -= len; + } + } + ANON_LOCK_EXIT(&->a_rwlock); + } + + if (svd->swresv) { + if (svd->flags & MAP_NORESERVE) { + ASSERT(amp); + oswresv = svd->swresv; + svd->swresv = ptob(anon_pages(amp->ahp, + svd->anon_index, npages)); + anon_unresv(oswresv - svd->swresv); + } else { + anon_unresv(len); + svd->swresv -= len; + } + TRACE_3(TR_FAC_VM, TR_ANON_PROC, + "anon proc:%p %lu %u", seg, len, 0); + } + + seg->s_size -= len; + return (0); + } + + /* + * The section to go is in the middle of the segment, + * have to make it into two segments. nseg is made for + * the high end while seg is cut down at the low end. 
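+ *
+ * After the split, seg covers [s_base, addr) and the newly allocated
+ * nseg covers [addr + len, original s_base + s_size), with the vpage
+ * array, anon map and swap reservation divided between the two below.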
+ */ + nbase = addr + len; /* new seg base */ + nsize = (seg->s_base + seg->s_size) - nbase; /* new seg size */ + seg->s_size = addr - seg->s_base; /* shrink old seg */ + nseg = seg_alloc(seg->s_as, nbase, nsize); + if (nseg == NULL) { + panic("segvn_unmap seg_alloc"); + /*NOTREACHED*/ + } + nseg->s_ops = seg->s_ops; + nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); + nseg->s_data = (void *)nsvd; + nseg->s_szc = seg->s_szc; + *nsvd = *svd; + nsvd->offset = svd->offset + (uintptr_t)(nseg->s_base - seg->s_base); + nsvd->swresv = 0; + nsvd->softlockcnt = 0; + + if (svd->vp != NULL) { + VN_HOLD(nsvd->vp); + if (nsvd->type == MAP_SHARED) + lgrp_shm_policy_init(NULL, nsvd->vp); + } + crhold(svd->cred); + + if (svd->vpage == NULL) { + nsvd->vpage = NULL; + } else { + /* need to split vpage into two arrays */ + size_t nbytes; + struct vpage *ovpage; + + ovpage = svd->vpage; /* keep pointer to vpage */ + + npages = seg_pages(seg); /* seg has shrunk */ + nbytes = vpgtob(npages); + svd->vpage = kmem_alloc(nbytes, KM_SLEEP); + + bcopy(ovpage, svd->vpage, nbytes); + + npages = seg_pages(nseg); + nbytes = vpgtob(npages); + nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP); + + bcopy(&ovpage[opages - npages], nsvd->vpage, nbytes); + + /* free up old vpage */ + kmem_free(ovpage, vpgtob(opages)); + } + + if (amp == NULL) { + nsvd->amp = NULL; + nsvd->anon_index = 0; + } else { + /* + * Need to create a new anon map for the new segment. + * We'll also allocate a new smaller array for the old + * smaller segment to save space. + */ + opages = btop((uintptr_t)(addr - seg->s_base)); + ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); + if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { + /* + * Free up now unused parts of anon_map array + */ + if (seg->s_szc != 0) { + ulong_t an_idx = svd->anon_index + opages; + anon_free_pages(amp->ahp, an_idx, len, + seg->s_szc); + } else { + anon_free(amp->ahp, svd->anon_index + opages, + len); + } + + /* + * Unreserve swap space for the unmapped chunk + * of this segment in case it's MAP_SHARED + */ + if (svd->type == MAP_SHARED) { + anon_unresv(len); + amp->swresv -= len; + } + } + + nsvd->anon_index = svd->anon_index + + btop((uintptr_t)(nseg->s_base - seg->s_base)); + if (svd->type == MAP_SHARED) { + ASSERT(seg->s_szc == 0); + amp->refcnt++; + nsvd->amp = amp; + } else { + struct anon_map *namp; + struct anon_hdr *nahp; + + ASSERT(svd->type == MAP_PRIVATE); + nahp = anon_create(btop(seg->s_size), ANON_SLEEP); + namp = anonmap_alloc(nseg->s_size, 0); + namp->a_szc = seg->s_szc; + (void) anon_copy_ptr(amp->ahp, svd->anon_index, nahp, + 0, btop(seg->s_size), ANON_SLEEP); + (void) anon_copy_ptr(amp->ahp, nsvd->anon_index, + namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP); + anon_release(amp->ahp, btop(amp->size)); + svd->anon_index = 0; + nsvd->anon_index = 0; + amp->ahp = nahp; + amp->size = seg->s_size; + nsvd->amp = namp; + } + ANON_LOCK_EXIT(&->a_rwlock); + } + if (svd->swresv) { + if (svd->flags & MAP_NORESERVE) { + ASSERT(amp); + oswresv = svd->swresv; + svd->swresv = ptob(anon_pages(amp->ahp, + svd->anon_index, btop(seg->s_size))); + nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp, + nsvd->anon_index, btop(nseg->s_size))); + ASSERT(oswresv >= (svd->swresv + nsvd->swresv)); + anon_unresv(oswresv - (svd->swresv + nsvd->swresv)); + } else { + if (seg->s_size + nseg->s_size + len != svd->swresv) { + panic("segvn_unmap: " + "cannot split swap reservation"); + /*NOTREACHED*/ + } + anon_unresv(len); + svd->swresv = seg->s_size; + nsvd->swresv = nseg->s_size; + } + TRACE_3(TR_FAC_VM, 
TR_ANON_PROC, "anon proc:%p %lu %u", + seg, len, 0); + } + + return (0); /* I'm glad that's all over with! */ +} + +static void +segvn_free(struct seg *seg) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + pgcnt_t npages = seg_pages(seg); + struct anon_map *amp; + size_t len; + + /* + * We don't need any segment level locks for "segvn" data + * since the address space is "write" locked. + */ + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); + + /* + * Be sure to unlock pages. XXX Why do things get free'ed instead + * of unmapped? XXX + */ + (void) segvn_lockop(seg, seg->s_base, seg->s_size, + 0, MC_UNLOCK, NULL, 0); + + /* + * Deallocate the vpage and anon pointers if necessary and possible. + */ + if (svd->vpage != NULL) { + kmem_free(svd->vpage, vpgtob(npages)); + svd->vpage = NULL; + } + if ((amp = svd->amp) != NULL) { + /* + * If there are no more references to this anon_map + * structure, then deallocate the structure after freeing + * up all the anon slot pointers that we can. + */ + ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); + if (--amp->refcnt == 0) { + if (svd->type == MAP_PRIVATE) { + /* + * Private - we only need to anon_free + * the part that this segment refers to. + */ + if (seg->s_szc != 0) { + anon_free_pages(amp->ahp, + svd->anon_index, seg->s_size, + seg->s_szc); + } else { + anon_free(amp->ahp, svd->anon_index, + seg->s_size); + } + } else { + /* + * Shared - anon_free the entire + * anon_map's worth of stuff and + * release any swap reservation. + */ + ASSERT(seg->s_szc == 0); + anon_free(amp->ahp, 0, amp->size); + if ((len = amp->swresv) != 0) { + anon_unresv(len); + TRACE_3(TR_FAC_VM, TR_ANON_PROC, + "anon proc:%p %lu %u", + seg, len, 0); + } + } + svd->amp = NULL; + ANON_LOCK_EXIT(&->a_rwlock); + anonmap_free(amp); + } else if (svd->type == MAP_PRIVATE) { + /* + * We had a private mapping which still has + * a held anon_map so just free up all the + * anon slot pointers that we were using. + */ + if (seg->s_szc != 0) { + anon_free_pages(amp->ahp, svd->anon_index, + seg->s_size, seg->s_szc); + } else { + anon_free(amp->ahp, svd->anon_index, + seg->s_size); + } + ANON_LOCK_EXIT(&->a_rwlock); + } else { + ANON_LOCK_EXIT(&->a_rwlock); + } + } + + /* + * Release swap reservation. + */ + if ((len = svd->swresv) != 0) { + anon_unresv(svd->swresv); + TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", + seg, len, 0); + svd->swresv = 0; + } + /* + * Release claim on vnode, credentials, and finally free the + * private data. + */ + if (svd->vp != NULL) { + if (svd->type == MAP_SHARED) + lgrp_shm_policy_fini(NULL, svd->vp); + VN_RELE(svd->vp); + svd->vp = NULL; + } + crfree(svd->cred); + svd->cred = NULL; + + seg->s_data = NULL; + kmem_cache_free(segvn_cache, svd); +} + +/* + * Do a F_SOFTUNLOCK call over the range requested. The range must have + * already been F_SOFTLOCK'ed. + * Caller must always match addr and len of a softunlock with a previous + * softlock with exactly the same addr and len. 
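+ *
+ * The accounting below undoes what segvn_faultpage() did for F_SOFTLOCK:
+ * availrmem is given back, and segvn_pages_locked and the per-segment
+ * softlockcnt are decremented.  Once softlockcnt reaches zero, any
+ * unmapper waiting on the address space (AS_ISUNMAPWAIT) is woken so it
+ * can retry.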
+ */ +static void +segvn_softunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + page_t *pp; + caddr_t adr; + struct vnode *vp; + u_offset_t offset; + ulong_t anon_index; + struct anon_map *amp; + struct anon *ap = NULL; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); + + if ((amp = svd->amp) != NULL) + anon_index = svd->anon_index + seg_page(seg, addr); + + hat_unlock(seg->s_as->a_hat, addr, len); + for (adr = addr; adr < addr + len; adr += PAGESIZE) { + if (amp != NULL) { + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + if ((ap = anon_get_ptr(amp->ahp, anon_index++)) + != NULL) { + swap_xlate(ap, &vp, &offset); + } else { + vp = svd->vp; + offset = svd->offset + + (uintptr_t)(adr - seg->s_base); + } + ANON_LOCK_EXIT(&->a_rwlock); + } else { + vp = svd->vp; + offset = svd->offset + + (uintptr_t)(adr - seg->s_base); + } + + /* + * Use page_find() instead of page_lookup() to + * find the page since we know that it is locked. + */ + pp = page_find(vp, offset); + if (pp == NULL) { + panic( + "segvn_softunlock: addr %p, ap %p, vp %p, off %llx", + (void *)adr, (void *)ap, (void *)vp, offset); + /*NOTREACHED*/ + } + + if (rw == S_WRITE) { + hat_setrefmod(pp); + if (seg->s_as->a_vbits) + hat_setstat(seg->s_as, adr, PAGESIZE, + P_REF | P_MOD); + } else if (rw != S_OTHER) { + hat_setref(pp); + if (seg->s_as->a_vbits) + hat_setstat(seg->s_as, adr, PAGESIZE, P_REF); + } + TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT, + "segvn_fault:pp %p vp %p offset %llx", pp, vp, offset); + page_unlock(pp); + } + mutex_enter(&freemem_lock); /* for availrmem */ + availrmem += btop(len); + segvn_pages_locked -= btop(len); + svd->softlockcnt -= btop(len); + mutex_exit(&freemem_lock); + if (svd->softlockcnt == 0) { + /* + * All SOFTLOCKS are gone. Wakeup any waiting + * unmappers so they can try again to unmap. + * Check for waiters first without the mutex + * held so we don't always grab the mutex on + * softunlocks. + */ + if (AS_ISUNMAPWAIT(seg->s_as)) { + mutex_enter(&seg->s_as->a_contents); + if (AS_ISUNMAPWAIT(seg->s_as)) { + AS_CLRUNMAPWAIT(seg->s_as); + cv_broadcast(&seg->s_as->a_cv); + } + mutex_exit(&seg->s_as->a_contents); + } + } +} + +#define PAGE_HANDLED ((page_t *)-1) + +/* + * Release all the pages in the NULL terminated ppp list + * which haven't already been converted to PAGE_HANDLED. + */ +static void +segvn_pagelist_rele(page_t **ppp) +{ + for (; *ppp != NULL; ppp++) { + if (*ppp != PAGE_HANDLED) + page_unlock(*ppp); + } +} + +static int stealcow = 1; + +/* + * Workaround for viking chip bug. See bug id 1220902. + * To fix this down in pagefault() would require importing so + * much as and segvn code as to be unmaintainable. + */ +int enable_mbit_wa = 0; + +/* + * Handles all the dirty work of getting the right + * anonymous pages and loading up the translations. + * This routine is called only from segvn_fault() + * when looping over the range of addresses requested. 
+ * + * The basic algorithm here is: + * If this is an anon_zero case + * Call anon_zero to allocate page + * Load up translation + * Return + * endif + * If this is an anon page + * Use anon_getpage to get the page + * else + * Find page in pl[] list passed in + * endif + * If not a cow + * Load up the translation to the page + * return + * endif + * Call anon_private to handle cow + * Load up (writable) translation to new page + */ +static faultcode_t +segvn_faultpage( + struct hat *hat, /* the hat to use for mapping */ + struct seg *seg, /* seg_vn of interest */ + caddr_t addr, /* address in as */ + u_offset_t off, /* offset in vp */ + struct vpage *vpage, /* pointer to vpage for vp, off */ + page_t *pl[], /* object source page pointer */ + uint_t vpprot, /* access allowed to object pages */ + enum fault_type type, /* type of fault */ + enum seg_rw rw, /* type of access at fault */ + int brkcow) /* we may need to break cow */ +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + page_t *pp, **ppp; + uint_t pageflags = 0; + page_t *anon_pl[1 + 1]; + page_t *opp = NULL; /* original page */ + uint_t prot; + int err; + int cow; + int claim; + int steal = 0; + ulong_t anon_index; + struct anon *ap, *oldap; + struct anon_map *amp; + int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; + int anon_lock = 0; + anon_sync_obj_t cookie; + + if (svd->flags & MAP_TEXT) { + hat_flag |= HAT_LOAD_TEXT; + } + + ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock)); + ASSERT(seg->s_szc == 0); + + /* + * Initialize protection value for this page. + * If we have per page protection values check it now. + */ + if (svd->pageprot) { + uint_t protchk; + + switch (rw) { + case S_READ: + protchk = PROT_READ; + break; + case S_WRITE: + protchk = PROT_WRITE; + break; + case S_EXEC: + protchk = PROT_EXEC; + break; + case S_OTHER: + default: + protchk = PROT_READ | PROT_WRITE | PROT_EXEC; + break; + } + + prot = VPP_PROT(vpage); + if ((prot & protchk) == 0) + return (FC_PROT); /* illegal access type */ + } else { + prot = svd->prot; + } + + if (type == F_SOFTLOCK) { + mutex_enter(&freemem_lock); + if (availrmem <= tune.t_minarmem) { + mutex_exit(&freemem_lock); + return (FC_MAKE_ERR(ENOMEM)); /* out of real memory */ + } else { + svd->softlockcnt++; + availrmem--; + segvn_pages_locked++; + } + mutex_exit(&freemem_lock); + } + + /* + * Always acquire the anon array lock to prevent 2 threads from + * allocating separate anon slots for the same "addr". + */ + + if ((amp = svd->amp) != NULL) { + ASSERT(RW_READ_HELD(&->a_rwlock)); + anon_index = svd->anon_index + seg_page(seg, addr); + anon_array_enter(amp, anon_index, &cookie); + anon_lock = 1; + } + + if (svd->vp == NULL && amp != NULL) { + if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) { + /* + * Allocate a (normally) writable anonymous page of + * zeroes. If no advance reservations, reserve now. + */ + if (svd->flags & MAP_NORESERVE) { + if (anon_resv(ptob(1))) { + svd->swresv += ptob(1); + } else { + err = ENOMEM; + goto out; + } + } + if ((pp = anon_zero(seg, addr, &ap, + svd->cred)) == NULL) { + err = ENOMEM; + goto out; /* out of swap space */ + } + /* + * Re-acquire the anon_map lock and + * initialize the anon array entry. 
+ */ + (void) anon_set_ptr(amp->ahp, anon_index, ap, + ANON_SLEEP); + if (enable_mbit_wa) { + if (rw == S_WRITE) + hat_setmod(pp); + else if (!hat_ismod(pp)) + prot &= ~PROT_WRITE; + } + /* + * If AS_PAGLCK is set in a_flags (via memcntl(2) + * with MC_LOCKAS, MCL_FUTURE) and this is a + * MAP_NORESERVE segment, we may need to + * permanently lock the page as it is being faulted + * for the first time. The following text applies + * only to MAP_NORESERVE segments: + * + * As per memcntl(2), if this segment was created + * after MCL_FUTURE was applied (a "future" + * segment), its pages must be locked. If this + * segment existed at MCL_FUTURE application (a + * "past" segment), the interface is unclear. + * + * We decide to lock only if vpage is present: + * + * - "future" segments will have a vpage array (see + * as_map), and so will be locked as required + * + * - "past" segments may not have a vpage array, + * depending on whether events (such as + * mprotect) have occurred. Locking if vpage + * exists will preserve legacy behavior. Not + * locking if vpage is absent, will not break + * the interface or legacy behavior. Note that + * allocating vpage here if it's absent requires + * upgrading the segvn reader lock, the cost of + * which does not seem worthwhile. + */ + if (AS_ISPGLCK(seg->s_as) && vpage != NULL && + (svd->flags & MAP_NORESERVE)) { + claim = VPP_PROT(vpage) & PROT_WRITE; + ASSERT(svd->type == MAP_PRIVATE); + if (page_pp_lock(pp, claim, 0)) + VPP_SETPPLOCK(vpage); + } + + + /* + * Handle pages that have been marked for migration + */ + if (lgrp_optimizations()) + page_migrate(seg, addr, &pp, 1); + hat_memload(hat, addr, pp, prot, hat_flag); + + if (!(hat_flag & HAT_LOAD_LOCK)) + page_unlock(pp); + + anon_array_exit(&cookie); + return (0); + } + } + + /* + * Obtain the page structure via anon_getpage() if it is + * a private copy of an object (the result of a previous + * copy-on-write). + */ + if (amp != NULL) { + if ((ap = anon_get_ptr(amp->ahp, anon_index)) != NULL) { + err = anon_getpage(&ap, &vpprot, anon_pl, PAGESIZE, + seg, addr, rw, svd->cred); + if (err) + goto out; + + if (svd->type == MAP_SHARED) { + /* + * If this is a shared mapping to an + * anon_map, then ignore the write + * permissions returned by anon_getpage(). + * They apply to the private mappings + * of this anon_map. + */ + vpprot |= PROT_WRITE; + } + opp = anon_pl[0]; + } + } + + /* + * Search the pl[] list passed in if it is from the + * original object (i.e., not a private copy). + */ + if (opp == NULL) { + /* + * Find original page. We must be bringing it in + * from the list in pl[]. + */ + for (ppp = pl; (opp = *ppp) != NULL; ppp++) { + if (opp == PAGE_HANDLED) + continue; + ASSERT(opp->p_vnode == svd->vp); /* XXX */ + if (opp->p_offset == off) + break; + } + if (opp == NULL) { + panic("segvn_faultpage not found"); + /*NOTREACHED*/ + } + *ppp = PAGE_HANDLED; + + } + + ASSERT(PAGE_LOCKED(opp)); + + TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT, + "segvn_fault:pp %p vp %p offset %llx", + opp, NULL, 0); + + /* + * The fault is treated as a copy-on-write fault if a + * write occurs on a private segment and the object + * page (i.e., mapping) is write protected. We assume + * that fatal protection checks have already been made. + */ + + cow = brkcow && ((vpprot & PROT_WRITE) == 0); + + /* + * If not a copy-on-write case load the translation + * and return. 
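+ * cow is zero when the caller did not ask to break COW (brkcow == 0) or
+ * when the object page is already writable, as is the case for shared
+ * anonymous mappings (see above).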
+ */ + if (cow == 0) { + if (IS_VMODSORT(opp->p_vnode) || enable_mbit_wa) { + if (rw == S_WRITE) + hat_setmod(opp); + else if (rw != S_OTHER && !hat_ismod(opp)) + prot &= ~PROT_WRITE; + } + + /* + * Handle pages that have been marked for migration + */ + if (lgrp_optimizations()) + page_migrate(seg, addr, &opp, 1); + + hat_memload(hat, addr, opp, prot & vpprot, hat_flag); + + if (!(hat_flag & HAT_LOAD_LOCK)) + page_unlock(opp); + + if (anon_lock) { + anon_array_exit(&cookie); + } + return (0); + } + + hat_setref(opp); + + ASSERT(amp != NULL && anon_lock); + + /* + * Steal the page only if it isn't a private page + * since stealing a private page is not worth the effort. + */ + if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) + steal = 1; + + /* + * Steal the original page if the following conditions are true: + * + * We are low on memory, the page is not private, page is not + * shared, not modified, not `locked' or if we have it `locked' + * (i.e., p_cowcnt == 1 and p_lckcnt == 0, which also implies + * that the page is not shared) and if it doesn't have any + * translations. page_struct_lock isn't needed to look at p_cowcnt + * and p_lckcnt because we first get exclusive lock on page. + */ + (void) hat_pagesync(opp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD); + + if (stealcow && freemem < minfree && steal && + page_tryupgrade(opp) && !hat_ismod(opp) && + ((opp->p_lckcnt == 0 && opp->p_cowcnt == 0) || + (opp->p_lckcnt == 0 && opp->p_cowcnt == 1 && + vpage != NULL && VPP_ISPPLOCK(vpage)))) { + /* + * Check if this page has other translations + * after unloading our translation. + */ + if (hat_page_is_mapped(opp)) { + hat_unload(seg->s_as->a_hat, addr, PAGESIZE, + HAT_UNLOAD); + } + + /* + * hat_unload() might sync back someone else's recent + * modification, so check again. + */ + if (!hat_ismod(opp) && !hat_page_is_mapped(opp)) + pageflags |= STEAL_PAGE; + } + + /* + * If we have a vpage pointer, see if it indicates that we have + * ``locked'' the page we map -- if so, tell anon_private to + * transfer the locking resource to the new page. + * + * See Statement at the beginning of segvn_lockop regarding + * the way lockcnts/cowcnts are handled during COW. + * + */ + if (vpage != NULL && VPP_ISPPLOCK(vpage)) + pageflags |= LOCK_PAGE; + + /* + * Allocate a private page and perform the copy. + * For MAP_NORESERVE reserve swap space now, unless this + * is a cow fault on an existing anon page in which case + * MAP_NORESERVE will have made advance reservations. + */ + if ((svd->flags & MAP_NORESERVE) && (ap == NULL)) { + if (anon_resv(ptob(1))) { + svd->swresv += ptob(1); + } else { + page_unlock(opp); + err = ENOMEM; + goto out; + } + } + oldap = ap; + pp = anon_private(&ap, seg, addr, prot, opp, pageflags, svd->cred); + if (pp == NULL) { + err = ENOMEM; /* out of swap space */ + goto out; + } + + /* + * If we copied away from an anonymous page, then + * we are one step closer to freeing up an anon slot. + * + * NOTE: The original anon slot must be released while + * holding the "anon_map" lock. This is necessary to prevent + * other threads from obtaining a pointer to the anon slot + * which may be freed if its "refcnt" is 1. 
+ */ + if (oldap != NULL) + anon_decref(oldap); + + (void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP); + + ASSERT(!IS_VMODSORT(pp->p_vnode)); + if (enable_mbit_wa) { + if (rw == S_WRITE) + hat_setmod(pp); + else if (!hat_ismod(pp)) + prot &= ~PROT_WRITE; + } + + + /* + * Handle pages that have been marked for migration + */ + if (lgrp_optimizations()) + page_migrate(seg, addr, &pp, 1); + hat_memload(hat, addr, pp, prot, hat_flag); + + if (!(hat_flag & HAT_LOAD_LOCK)) + page_unlock(pp); + + ASSERT(anon_lock); + anon_array_exit(&cookie); + return (0); +out: + if (anon_lock) + anon_array_exit(&cookie); + + if (type == F_SOFTLOCK) { + mutex_enter(&freemem_lock); + availrmem++; + segvn_pages_locked--; + svd->softlockcnt--; + mutex_exit(&freemem_lock); + } + return (FC_MAKE_ERR(err)); +} + +/* + * relocate a bunch of smaller targ pages into one large repl page. all targ + * pages must be complete pages smaller than replacement pages. + * it's assumed that no page's szc can change since they are all PAGESIZE or + * complete large pages locked SHARED. + */ +static void +segvn_relocate_pages(page_t **targ, page_t *replacement) +{ + page_t *pp; + pgcnt_t repl_npgs, curnpgs; + pgcnt_t i; + uint_t repl_szc = replacement->p_szc; + page_t *first_repl = replacement; + page_t *repl; + spgcnt_t npgs; + + VM_STAT_ADD(segvnvmstats.relocatepages[0]); + + ASSERT(repl_szc != 0); + npgs = repl_npgs = page_get_pagecnt(repl_szc); + + i = 0; + while (repl_npgs) { + spgcnt_t nreloc; + int err; + ASSERT(replacement != NULL); + pp = targ[i]; + ASSERT(pp->p_szc < repl_szc); + ASSERT(PAGE_EXCL(pp)); + ASSERT(!PP_ISFREE(pp)); + curnpgs = page_get_pagecnt(pp->p_szc); + if (curnpgs == 1) { + VM_STAT_ADD(segvnvmstats.relocatepages[1]); + repl = replacement; + page_sub(&replacement, repl); + ASSERT(PAGE_EXCL(repl)); + ASSERT(!PP_ISFREE(repl)); + ASSERT(repl->p_szc == repl_szc); + } else { + page_t *repl_savepp; + int j; + VM_STAT_ADD(segvnvmstats.relocatepages[2]); + repl_savepp = replacement; + for (j = 0; j < curnpgs; j++) { + repl = replacement; + page_sub(&replacement, repl); + ASSERT(PAGE_EXCL(repl)); + ASSERT(!PP_ISFREE(repl)); + ASSERT(repl->p_szc == repl_szc); + ASSERT(page_pptonum(targ[i + j]) == + page_pptonum(targ[i]) + j); + } + repl = repl_savepp; + ASSERT(IS_P2ALIGNED(page_pptonum(repl), curnpgs)); + } + err = page_relocate(&pp, &repl, 0, 1, &nreloc, NULL); + if (err || nreloc != curnpgs) { + panic("segvn_relocate_pages: " + "page_relocate failed err=%d curnpgs=%ld " + "nreloc=%ld", err, curnpgs, nreloc); + } + ASSERT(curnpgs <= repl_npgs); + repl_npgs -= curnpgs; + i += curnpgs; + } + ASSERT(replacement == NULL); + + repl = first_repl; + repl_npgs = npgs; + for (i = 0; i < repl_npgs; i++) { + ASSERT(PAGE_EXCL(repl)); + ASSERT(!PP_ISFREE(repl)); + targ[i] = repl; + page_downgrade(targ[i]); + repl = page_next(repl); + } +} + +/* + * Check if all pages in ppa array are complete smaller than szc pages and + * their roots will still be aligned relative to their current size if the + * entire ppa array is relocated into one szc page. If these conditions are + * not met return 0. + * + * If all pages are properly aligned attempt to upgrade their locks + * to exclusive mode. If it fails set *upgrdfail to 1 and return 0. + * upgrdfail was set to 0 by caller. + * + * Return 1 if all pages are aligned and locked exclusively. 
+ *
+ * If all pages in ppa array happen to be physically contiguous to make one
+ * szc page and all exclusive locks are successfully obtained promote the page
+ * size to szc and set *pszc to szc. Return 1 with pages locked shared.
+ */
+static int
+segvn_full_szcpages(page_t **ppa, uint_t szc, int *upgrdfail, uint_t *pszc)
+{
+ page_t *pp;
+ pfn_t pfn;
+ pgcnt_t totnpgs = page_get_pagecnt(szc);
+ pfn_t first_pfn;
+ int contig = 1;
+ pgcnt_t i;
+ pgcnt_t j;
+ uint_t curszc;
+ pgcnt_t curnpgs;
+ int root = 0;
+
+ ASSERT(szc > 0);
+
+ VM_STAT_ADD(segvnvmstats.fullszcpages[0]);
+
+ for (i = 0; i < totnpgs; i++) {
+ pp = ppa[i];
+ ASSERT(PAGE_SHARED(pp));
+ ASSERT(!PP_ISFREE(pp));
+ pfn = page_pptonum(pp);
+ if (i == 0) {
+ if (!IS_P2ALIGNED(pfn, totnpgs)) {
+ contig = 0;
+ } else {
+ first_pfn = pfn;
+ }
+ } else if (contig && pfn != first_pfn + i) {
+ contig = 0;
+ }
+ if (pp->p_szc == 0) {
+ if (root) {
+ VM_STAT_ADD(segvnvmstats.fullszcpages[1]);
+ return (0);
+ }
+ } else if (!root) {
+ if ((curszc = pp->p_szc) >= szc) {
+ VM_STAT_ADD(segvnvmstats.fullszcpages[2]);
+ return (0);
+ }
+ if (curszc == 0) {
+ /*
+ * p_szc changed means we don't have all pages
+ * locked. Return failure.
+ */
+ VM_STAT_ADD(segvnvmstats.fullszcpages[3]);
+ return (0);
+ }
+ curnpgs = page_get_pagecnt(curszc);
+ if (!IS_P2ALIGNED(pfn, curnpgs) ||
+ !IS_P2ALIGNED(i, curnpgs)) {
+ VM_STAT_ADD(segvnvmstats.fullszcpages[4]);
+ return (0);
+ }
+ root = 1;
+ } else {
+ ASSERT(i > 0);
+ VM_STAT_ADD(segvnvmstats.fullszcpages[5]);
+ if (pp->p_szc != curszc) {
+ VM_STAT_ADD(segvnvmstats.fullszcpages[6]);
+ return (0);
+ }
+ if (pfn - 1 != page_pptonum(ppa[i - 1])) {
+ panic("segvn_full_szcpages: "
+ "large page not physically contiguous");
+ }
+ if (P2PHASE(pfn, curnpgs) == curnpgs - 1) {
+ root = 0;
+ }
+ }
+ }
+
+ for (i = 0; i < totnpgs; i++) {
+ ASSERT(ppa[i]->p_szc < szc);
+ if (!page_tryupgrade(ppa[i])) {
+ for (j = 0; j < i; j++) {
+ page_downgrade(ppa[j]);
+ }
+ *pszc = ppa[i]->p_szc;
+ *upgrdfail = 1;
+ VM_STAT_ADD(segvnvmstats.fullszcpages[7]);
+ return (0);
+ }
+ }
+
+ /*
+ * When a page is put on a free cachelist its szc is set to 0. If the
+ * file system reclaimed pages from the cachelist, targ pages will be
+ * physically contiguous with 0 p_szc. In this case just upgrade the
+ * szc of the targ pages without any relocations.
+ * To avoid any hat issues with previous small mappings
+ * hat_pageunload() the target pages first.
+ */
+ if (contig) {
+ VM_STAT_ADD(segvnvmstats.fullszcpages[8]);
+ for (i = 0; i < totnpgs; i++) {
+ (void) hat_pageunload(ppa[i], HAT_FORCE_PGUNLOAD);
+ }
+ for (i = 0; i < totnpgs; i++) {
+ ppa[i]->p_szc = szc;
+ }
+ for (i = 0; i < totnpgs; i++) {
+ ASSERT(PAGE_EXCL(ppa[i]));
+ page_downgrade(ppa[i]);
+ }
+ if (pszc != NULL) {
+ *pszc = szc;
+ }
+ }
+ VM_STAT_ADD(segvnvmstats.fullszcpages[9]);
+ return (1);
+}
+
+/*
+ * Create physically contiguous pages for [vp, off] - [vp, off +
+ * page_size(szc)) range and for private segment return them in ppa array.
+ * Pages are created either via IO or relocations.
+ *
+ * Return 1 on success and 0 on failure.
+ *
+ * If physically contiguous pages already exist for this range return 1 without
+ * filling ppa array. Caller initializes ppa[0] as NULL to detect that ppa
+ * array wasn't filled. In this case caller fills ppa array via VOP_GETPAGE().
+ */ + +static int +segvn_fill_vp_pages(struct segvn_data *svd, vnode_t *vp, u_offset_t off, + uint_t szc, page_t **ppa, page_t **ppplist, uint_t *ret_pszc, + int *downsize) + +{ + page_t *pplist = *ppplist; + size_t pgsz = page_get_pagesize(szc); + pgcnt_t pages = btop(pgsz); + ulong_t start_off = off; + u_offset_t eoff = off + pgsz; + spgcnt_t nreloc; + u_offset_t io_off = off; + size_t io_len; + page_t *io_pplist = NULL; + page_t *done_pplist = NULL; + pgcnt_t pgidx = 0; + page_t *pp; + page_t *newpp; + page_t *targpp; + int io_err = 0; + int i; + pfn_t pfn; + ulong_t ppages; + page_t *targ_pplist = NULL; + page_t *repl_pplist = NULL; + page_t *tmp_pplist; + int nios = 0; + uint_t pszc; + struct vattr va; + + VM_STAT_ADD(segvnvmstats.fill_vp_pages[0]); + + ASSERT(szc != 0); + ASSERT(pplist->p_szc == szc); + + /* + * downsize will be set to 1 only if we fail to lock pages. this will + * allow subsequent faults to try to relocate the page again. If we + * fail due to misalignment don't downsize and let the caller map the + * whole region with small mappings to avoid more faults into the area + * where we can't get large pages anyway. + */ + *downsize = 0; + + while (off < eoff) { + newpp = pplist; + ASSERT(newpp != NULL); + ASSERT(PAGE_EXCL(newpp)); + ASSERT(!PP_ISFREE(newpp)); + /* + * we pass NULL for nrelocp to page_lookup_create() + * so that it doesn't relocate. We relocate here + * later only after we make sure we can lock all + * pages in the range we handle and they are all + * aligned. + */ + pp = page_lookup_create(vp, off, SE_SHARED, newpp, NULL, 0); + ASSERT(pp != NULL); + ASSERT(!PP_ISFREE(pp)); + ASSERT(pp->p_vnode == vp); + ASSERT(pp->p_offset == off); + if (pp == newpp) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[1]); + page_sub(&pplist, pp); + ASSERT(PAGE_EXCL(pp)); + ASSERT(page_iolock_assert(pp)); + page_list_concat(&io_pplist, &pp); + off += PAGESIZE; + continue; + } + VM_STAT_ADD(segvnvmstats.fill_vp_pages[2]); + pfn = page_pptonum(pp); + pszc = pp->p_szc; + if (pszc >= szc && targ_pplist == NULL && io_pplist == NULL && + IS_P2ALIGNED(pfn, pages)) { + ASSERT(repl_pplist == NULL); + ASSERT(done_pplist == NULL); + ASSERT(pplist == *ppplist); + page_unlock(pp); + page_free_replacement_page(pplist); + page_create_putback(pages); + *ppplist = NULL; + VM_STAT_ADD(segvnvmstats.fill_vp_pages[3]); + return (1); + } + if (pszc >= szc) { + page_unlock(pp); + segvn_faultvnmpss_align_err1++; + goto out; + } + ppages = page_get_pagecnt(pszc); + if (!IS_P2ALIGNED(pfn, ppages)) { + ASSERT(pszc > 0); + /* + * sizing down to pszc won't help. + */ + page_unlock(pp); + segvn_faultvnmpss_align_err2++; + goto out; + } + pfn = page_pptonum(newpp); + if (!IS_P2ALIGNED(pfn, ppages)) { + ASSERT(pszc > 0); + /* + * sizing down to pszc won't help. + */ + page_unlock(pp); + segvn_faultvnmpss_align_err3++; + goto out; + } + if (!PAGE_EXCL(pp)) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[4]); + page_unlock(pp); + *downsize = 1; + *ret_pszc = pp->p_szc; + goto out; + } + targpp = pp; + if (io_pplist != NULL) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[5]); + io_len = off - io_off; + /* + * Some file systems like NFS don't check EOF + * conditions in VOP_PAGEIO(). Check it here + * now that pages are locked SE_EXCL. Any file + * truncation will wait until the pages are + * unlocked so no need to worry that file will + * be truncated after we check its size here. + * XXX fix NFS to remove this check. 
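+ * The check below simply fetches the current file size with
+ * VOP_GETATTR(AT_SIZE) and gives up on the large page (downsize)
+ * if the pending I/O range would extend past EOF.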
+ */ + va.va_mask = AT_SIZE; + if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred) != 0) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[6]); + page_unlock(targpp); + goto out; + } + if (btopr(va.va_size) < btopr(io_off + io_len)) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[7]); + *downsize = 1; + *ret_pszc = 0; + page_unlock(targpp); + goto out; + } + io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len, + B_READ, svd->cred); + if (io_err) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[8]); + page_unlock(targpp); + if (io_err == EDEADLK) { + segvn_vmpss_pageio_deadlk_err++; + } + goto out; + } + nios++; + VM_STAT_ADD(segvnvmstats.fill_vp_pages[9]); + while (io_pplist != NULL) { + pp = io_pplist; + page_sub(&io_pplist, pp); + ASSERT(page_iolock_assert(pp)); + page_io_unlock(pp); + pgidx = (pp->p_offset - start_off) >> + PAGESHIFT; + ASSERT(pgidx < pages); + ppa[pgidx] = pp; + page_list_concat(&done_pplist, &pp); + } + } + pp = targpp; + ASSERT(PAGE_EXCL(pp)); + ASSERT(pp->p_szc <= pszc); + if (pszc != 0 && !group_page_trylock(pp, SE_EXCL)) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[10]); + page_unlock(pp); + *downsize = 1; + *ret_pszc = pp->p_szc; + goto out; + } + VM_STAT_ADD(segvnvmstats.fill_vp_pages[11]); + /* + * page szc chould have changed before the entire group was + * locked. reread page szc. + */ + pszc = pp->p_szc; + ppages = page_get_pagecnt(pszc); + + /* link just the roots */ + page_list_concat(&targ_pplist, &pp); + page_sub(&pplist, newpp); + page_list_concat(&repl_pplist, &newpp); + off += PAGESIZE; + while (--ppages != 0) { + newpp = pplist; + page_sub(&pplist, newpp); + off += PAGESIZE; + } + io_off = off; + } + if (io_pplist != NULL) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[12]); + io_len = eoff - io_off; + va.va_mask = AT_SIZE; + if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred) != 0) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[13]); + goto out; + } + if (btopr(va.va_size) < btopr(io_off + io_len)) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[14]); + *downsize = 1; + *ret_pszc = 0; + goto out; + } + io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len, + B_READ, svd->cred); + if (io_err) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[15]); + if (io_err == EDEADLK) { + segvn_vmpss_pageio_deadlk_err++; + } + goto out; + } + nios++; + while (io_pplist != NULL) { + pp = io_pplist; + page_sub(&io_pplist, pp); + ASSERT(page_iolock_assert(pp)); + page_io_unlock(pp); + pgidx = (pp->p_offset - start_off) >> PAGESHIFT; + ASSERT(pgidx < pages); + ppa[pgidx] = pp; + } + } + /* + * we're now bound to succeed or panic. + * remove pages from done_pplist. it's not needed anymore. 
+ */ + while (done_pplist != NULL) { + pp = done_pplist; + page_sub(&done_pplist, pp); + } + VM_STAT_ADD(segvnvmstats.fill_vp_pages[16]); + ASSERT(pplist == NULL); + *ppplist = NULL; + while (targ_pplist != NULL) { + int ret; + VM_STAT_ADD(segvnvmstats.fill_vp_pages[17]); + ASSERT(repl_pplist); + pp = targ_pplist; + page_sub(&targ_pplist, pp); + pgidx = (pp->p_offset - start_off) >> PAGESHIFT; + newpp = repl_pplist; + page_sub(&repl_pplist, newpp); +#ifdef DEBUG + pfn = page_pptonum(pp); + pszc = pp->p_szc; + ppages = page_get_pagecnt(pszc); + ASSERT(IS_P2ALIGNED(pfn, ppages)); + pfn = page_pptonum(newpp); + ASSERT(IS_P2ALIGNED(pfn, ppages)); + ASSERT(P2PHASE(pfn, pages) == pgidx); +#endif + nreloc = 0; + ret = page_relocate(&pp, &newpp, 0, 1, &nreloc, NULL); + if (ret != 0 || nreloc == 0) { + panic("segvn_fill_vp_pages: " + "page_relocate failed"); + } + pp = newpp; + while (nreloc-- != 0) { + ASSERT(PAGE_EXCL(pp)); + ASSERT(pp->p_vnode == vp); + ASSERT(pgidx == + ((pp->p_offset - start_off) >> PAGESHIFT)); + ppa[pgidx++] = pp; + pp = page_next(pp); + } + } + + if (svd->type == MAP_PRIVATE) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[18]); + for (i = 0; i < pages; i++) { + ASSERT(ppa[i] != NULL); + ASSERT(PAGE_EXCL(ppa[i])); + ASSERT(ppa[i]->p_vnode == vp); + ASSERT(ppa[i]->p_offset == + start_off + (i << PAGESHIFT)); + page_downgrade(ppa[i]); + } + ppa[pages] = NULL; + } else { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[19]); + /* + * the caller will still call VOP_GETPAGE() for shared segments + * to check FS write permissions. For private segments we map + * file read only anyway. so no VOP_GETPAGE is needed. + */ + for (i = 0; i < pages; i++) { + ASSERT(ppa[i] != NULL); + ASSERT(PAGE_EXCL(ppa[i])); + ASSERT(ppa[i]->p_vnode == vp); + ASSERT(ppa[i]->p_offset == + start_off + (i << PAGESHIFT)); + page_unlock(ppa[i]); + } + ppa[0] = NULL; + } + + return (1); +out: + /* + * Do the cleanup. Unlock target pages we didn't relocate. They are + * linked on targ_pplist by root pages. reassemble unused replacement + * and io pages back to pplist. + */ + if (io_pplist != NULL) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[20]); + pp = io_pplist; + do { + ASSERT(pp->p_vnode == vp); + ASSERT(pp->p_offset == io_off); + ASSERT(page_iolock_assert(pp)); + page_io_unlock(pp); + page_hashout(pp, NULL); + io_off += PAGESIZE; + } while ((pp = pp->p_next) != io_pplist); + page_list_concat(&io_pplist, &pplist); + pplist = io_pplist; + } + tmp_pplist = NULL; + while (targ_pplist != NULL) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[21]); + pp = targ_pplist; + ASSERT(PAGE_EXCL(pp)); + page_sub(&targ_pplist, pp); + + pszc = pp->p_szc; + ppages = page_get_pagecnt(pszc); + ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages)); + + if (pszc != 0) { + group_page_unlock(pp); + } + page_unlock(pp); + + pp = repl_pplist; + ASSERT(pp != NULL); + ASSERT(PAGE_EXCL(pp)); + ASSERT(pp->p_szc == szc); + page_sub(&repl_pplist, pp); + + ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages)); + + /* relink replacement page */ + page_list_concat(&tmp_pplist, &pp); + while (--ppages != 0) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[22]); + pp = page_next(pp); + ASSERT(PAGE_EXCL(pp)); + ASSERT(pp->p_szc == szc); + page_list_concat(&tmp_pplist, &pp); + } + } + if (tmp_pplist != NULL) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[23]); + page_list_concat(&tmp_pplist, &pplist); + pplist = tmp_pplist; + } + /* + * at this point all pages are either on done_pplist or + * pplist. They can't be all on done_pplist otherwise + * we'd've been done. 
+ */ + ASSERT(pplist != NULL); + if (nios != 0) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[24]); + pp = pplist; + do { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[25]); + ASSERT(pp->p_szc == szc); + ASSERT(PAGE_EXCL(pp)); + ASSERT(pp->p_vnode != vp); + pp->p_szc = 0; + } while ((pp = pp->p_next) != pplist); + + pp = done_pplist; + do { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[26]); + ASSERT(pp->p_szc == szc); + ASSERT(PAGE_EXCL(pp)); + ASSERT(pp->p_vnode == vp); + pp->p_szc = 0; + } while ((pp = pp->p_next) != done_pplist); + + while (pplist != NULL) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[27]); + pp = pplist; + page_sub(&pplist, pp); + page_free(pp, 0); + } + + while (done_pplist != NULL) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[28]); + pp = done_pplist; + page_sub(&done_pplist, pp); + page_unlock(pp); + } + *ppplist = NULL; + return (0); + } + ASSERT(pplist == *ppplist); + if (io_err) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[29]); + /* + * don't downsize on io error. + * see if vop_getpage succeeds. + * pplist may still be used in this case + * for relocations. + */ + return (0); + } + VM_STAT_ADD(segvnvmstats.fill_vp_pages[30]); + page_free_replacement_page(pplist); + page_create_putback(pages); + *ppplist = NULL; + return (0); +} + +int segvn_anypgsz = 0; + +#define SEGVN_RESTORE_SOFTLOCK(type, pages) \ + if ((type) == F_SOFTLOCK) { \ + mutex_enter(&freemem_lock); \ + availrmem += (pages); \ + segvn_pages_locked -= (pages); \ + svd->softlockcnt -= (pages); \ + mutex_exit(&freemem_lock); \ + } + +#define SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot) \ + if (IS_VMODSORT((ppa)[0]->p_vnode)) { \ + if ((rw) == S_WRITE) { \ + for (i = 0; i < (pages); i++) { \ + ASSERT((ppa)[i]->p_vnode == \ + (ppa)[0]->p_vnode); \ + hat_setmod((ppa)[i]); \ + } \ + } else if ((rw) != S_OTHER && \ + ((prot) & (vpprot) & PROT_WRITE)) { \ + for (i = 0; i < (pages); i++) { \ + ASSERT((ppa)[i]->p_vnode == \ + (ppa)[0]->p_vnode); \ + if (!hat_ismod((ppa)[i])) { \ + prot &= ~PROT_WRITE; \ + break; \ + } \ + } \ + } \ + } + +#ifdef VM_STATS + +#define SEGVN_VMSTAT_FLTVNPAGES(idx) \ + VM_STAT_ADD(segvnvmstats.fltvnpages[(idx)]); + +#else /* VM_STATS */ + +#define SEGVN_VMSTAT_FLTVNPAGES(idx) + +#endif + +static faultcode_t +segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr, + caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr, + caddr_t eaddr, int brkcow) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + struct anon_map *amp = svd->amp; + uchar_t segtype = svd->type; + uint_t szc = seg->s_szc; + size_t pgsz = page_get_pagesize(szc); + size_t maxpgsz = pgsz; + pgcnt_t pages = btop(pgsz); + pgcnt_t maxpages = pages; + size_t ppasize = (pages + 1) * sizeof (page_t *); + caddr_t a = lpgaddr; + caddr_t maxlpgeaddr = lpgeaddr; + u_offset_t off = svd->offset + (uintptr_t)(a - seg->s_base); + ulong_t aindx = svd->anon_index + seg_page(seg, a); + struct vpage *vpage = (svd->vpage != NULL) ? + &svd->vpage[seg_page(seg, a)] : NULL; + vnode_t *vp = svd->vp; + page_t **ppa; + uint_t pszc; + size_t ppgsz; + pgcnt_t ppages; + faultcode_t err = 0; + int ierr; + int vop_size_err = 0; + uint_t protchk, prot, vpprot; + ulong_t i; + int hat_flag = (type == F_SOFTLOCK) ? 
HAT_LOAD_LOCK : HAT_LOAD; + anon_sync_obj_t an_cookie; + enum seg_rw arw; + int alloc_failed = 0; + int adjszc_chk; + struct vattr va; + int xhat = 0; + page_t *pplist; + pfn_t pfn; + int physcontig; + int upgrdfail; + int segvn_anypgsz_vnode = 0; /* for now map vnode with 2 page sizes */ + + ASSERT(szc != 0); + ASSERT(vp != NULL); + ASSERT(brkcow == 0 || amp != NULL); + ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */ + ASSERT(!(svd->flags & MAP_NORESERVE)); + ASSERT(type != F_SOFTUNLOCK); + ASSERT(IS_P2ALIGNED(a, maxpgsz)); + ASSERT(amp == NULL || IS_P2ALIGNED(aindx, maxpages)); + ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); + ASSERT(seg->s_szc < NBBY * sizeof (int)); + + VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltvnpages[0]); + VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltvnpages[1]); + + if (svd->flags & MAP_TEXT) { + hat_flag |= HAT_LOAD_TEXT; + } + + if (svd->pageprot) { + switch (rw) { + case S_READ: + protchk = PROT_READ; + break; + case S_WRITE: + protchk = PROT_WRITE; + break; + case S_EXEC: + protchk = PROT_EXEC; + break; + case S_OTHER: + default: + protchk = PROT_READ | PROT_WRITE | PROT_EXEC; + break; + } + } else { + prot = svd->prot; + /* caller has already done segment level protection check. */ + } + + if (seg->s_as->a_hat != hat) { + xhat = 1; + } + + if (rw == S_WRITE && segtype == MAP_PRIVATE) { + SEGVN_VMSTAT_FLTVNPAGES(2); + arw = S_READ; + } else { + arw = rw; + } + + ppa = kmem_alloc(ppasize, KM_SLEEP); + + VM_STAT_COND_ADD(amp != NULL, segvnvmstats.fltvnpages[3]); + + for (;;) { + adjszc_chk = 0; + for (; a < lpgeaddr; a += pgsz, off += pgsz, aindx += pages) { + if (adjszc_chk) { + while (szc < seg->s_szc) { + uintptr_t e; + uint_t tszc; + tszc = segvn_anypgsz_vnode ? szc + 1 : + seg->s_szc; + ppgsz = page_get_pagesize(tszc); + if (!IS_P2ALIGNED(a, ppgsz) || + ((alloc_failed >> tszc) & + 0x1)) { + break; + } + SEGVN_VMSTAT_FLTVNPAGES(4); + szc = tszc; + pgsz = ppgsz; + pages = btop(pgsz); + e = P2ROUNDUP((uintptr_t)eaddr, pgsz); + lpgeaddr = (caddr_t)e; + } + } + + again: + if (IS_P2ALIGNED(a, maxpgsz) && amp != NULL) { + ASSERT(IS_P2ALIGNED(aindx, maxpages)); + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + anon_array_enter(amp, aindx, &an_cookie); + if (anon_get_ptr(amp->ahp, aindx) != NULL) { + SEGVN_VMSTAT_FLTVNPAGES(5); + if (anon_pages(amp->ahp, aindx, + maxpages) != maxpages) { + panic("segvn_fault_vnodepages:" + " empty anon slots\n"); + } + anon_array_exit(&an_cookie); + ANON_LOCK_EXIT(&->a_rwlock); + err = segvn_fault_anonpages(hat, seg, + a, a + maxpgsz, type, rw, + MAX(a, addr), + MIN(a + maxpgsz, eaddr), brkcow); + if (err != 0) { + SEGVN_VMSTAT_FLTVNPAGES(6); + goto out; + } + if (szc < seg->s_szc) { + szc = seg->s_szc; + pgsz = maxpgsz; + pages = maxpages; + lpgeaddr = maxlpgeaddr; + } + goto next; + } else if (anon_pages(amp->ahp, aindx, + maxpages)) { + panic("segvn_fault_vnodepages:" + " non empty anon slots\n"); + } else { + SEGVN_VMSTAT_FLTVNPAGES(7); + anon_array_exit(&an_cookie); + ANON_LOCK_EXIT(&->a_rwlock); + } + } + ASSERT(!brkcow || IS_P2ALIGNED(a, maxpgsz)); + + if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) { + ASSERT(vpage != NULL); + prot = VPP_PROT(vpage); + ASSERT(sameprot(seg, a, maxpgsz)); + if ((prot & protchk) == 0) { + SEGVN_VMSTAT_FLTVNPAGES(8); + err = FC_PROT; + goto out; + } + } + if (type == F_SOFTLOCK) { + mutex_enter(&freemem_lock); + if (availrmem < tune.t_minarmem + pages) { + mutex_exit(&freemem_lock); + err = FC_MAKE_ERR(ENOMEM); + goto out; + } else { + availrmem -= 
pages; + segvn_pages_locked += pages; + svd->softlockcnt += pages; + } + mutex_exit(&freemem_lock); + } + + pplist = NULL; + physcontig = 0; + ppa[0] = NULL; + if (!brkcow && szc && + !page_exists_physcontig(vp, off, szc, + segtype == MAP_PRIVATE ? ppa : NULL)) { + SEGVN_VMSTAT_FLTVNPAGES(9); + if (page_alloc_pages(seg, a, &pplist, NULL, + szc, 0)) { + SEGVN_RESTORE_SOFTLOCK(type, pages); + SEGVN_VMSTAT_FLTVNPAGES(10); + pszc = 0; + ierr = -1; + alloc_failed |= (1 << szc); + break; + } + if (vp->v_mpssdata == SEGVN_PAGEIO) { + int downsize; + SEGVN_VMSTAT_FLTVNPAGES(11); + physcontig = segvn_fill_vp_pages(svd, + vp, off, szc, ppa, &pplist, + &pszc, &downsize); + ASSERT(!physcontig || pplist == NULL); + if (!physcontig && downsize) { + SEGVN_RESTORE_SOFTLOCK(type, + pages); + ASSERT(pplist == NULL); + SEGVN_VMSTAT_FLTVNPAGES(12); + ierr = -1; + break; + } + ASSERT(!physcontig || + segtype == MAP_PRIVATE || + ppa[0] == NULL); + if (physcontig && ppa[0] == NULL) { + physcontig = 0; + } + } + } else if (!brkcow && szc && ppa[0] != NULL) { + SEGVN_VMSTAT_FLTVNPAGES(13); + ASSERT(segtype == MAP_PRIVATE); + physcontig = 1; + } + + if (!physcontig) { + SEGVN_VMSTAT_FLTVNPAGES(14); + ppa[0] = NULL; + ierr = VOP_GETPAGE(vp, (offset_t)off, pgsz, + &vpprot, ppa, pgsz, seg, a, arw, + svd->cred); + if (segtype == MAP_PRIVATE) { + SEGVN_VMSTAT_FLTVNPAGES(15); + vpprot &= ~PROT_WRITE; + } + } else { + ASSERT(segtype == MAP_PRIVATE); + SEGVN_VMSTAT_FLTVNPAGES(16); + vpprot = PROT_ALL & ~PROT_WRITE; + ierr = 0; + } + + if (ierr != 0) { + SEGVN_VMSTAT_FLTVNPAGES(17); + if (pplist != NULL) { + SEGVN_VMSTAT_FLTVNPAGES(18); + page_free_replacement_page(pplist); + page_create_putback(pages); + } + SEGVN_RESTORE_SOFTLOCK(type, pages); + if (a + pgsz <= eaddr) { + SEGVN_VMSTAT_FLTVNPAGES(19); + err = FC_MAKE_ERR(ierr); + goto out; + } + va.va_mask = AT_SIZE; + if (VOP_GETATTR(vp, &va, 0, svd->cred) != 0) { + SEGVN_VMSTAT_FLTVNPAGES(20); + err = FC_MAKE_ERR(EIO); + goto out; + } + if (btopr(va.va_size) >= btopr(off + pgsz)) { + SEGVN_VMSTAT_FLTVNPAGES(21); + err = FC_MAKE_ERR(EIO); + goto out; + } + if (btopr(va.va_size) < + btopr(off + (eaddr - a))) { + SEGVN_VMSTAT_FLTVNPAGES(22); + err = FC_MAKE_ERR(EIO); + goto out; + } + if (brkcow || type == F_SOFTLOCK) { + /* can't reduce map area */ + SEGVN_VMSTAT_FLTVNPAGES(23); + vop_size_err = 1; + goto out; + } + SEGVN_VMSTAT_FLTVNPAGES(24); + ASSERT(szc != 0); + pszc = 0; + ierr = -1; + break; + } + + if (amp != NULL) { + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + anon_array_enter(amp, aindx, &an_cookie); + } + if (amp != NULL && + anon_get_ptr(amp->ahp, aindx) != NULL) { + ulong_t taindx = P2ALIGN(aindx, maxpages); + + SEGVN_VMSTAT_FLTVNPAGES(25); + if (anon_pages(amp->ahp, taindx, maxpages) != + maxpages) { + panic("segvn_fault_vnodepages:" + " empty anon slots\n"); + } + for (i = 0; i < pages; i++) { + page_unlock(ppa[i]); + } + anon_array_exit(&an_cookie); + ANON_LOCK_EXIT(&->a_rwlock); + if (pplist != NULL) { + page_free_replacement_page(pplist); + page_create_putback(pages); + } + SEGVN_RESTORE_SOFTLOCK(type, pages); + if (szc < seg->s_szc) { + SEGVN_VMSTAT_FLTVNPAGES(26); + /* + * For private segments SOFTLOCK + * either always breaks cow (any rw + * type except S_READ_NOCOW) or + * address space is locked as writer + * (S_READ_NOCOW case) and anon slots + * can't show up on second check. + * Therefore if we are here for + * SOFTLOCK case it must be a cow + * break but cow break never reduces + * szc. Thus the assert below. 
+ */
+ ASSERT(!brkcow && type != F_SOFTLOCK);
+ pszc = seg->s_szc;
+ ierr = -2;
+ break;
+ }
+ ASSERT(IS_P2ALIGNED(a, maxpgsz));
+ goto again;
+ }
+#ifdef DEBUG
+ if (amp != NULL) {
+ ulong_t taindx = P2ALIGN(aindx, maxpages);
+ ASSERT(!anon_pages(amp->ahp, taindx, maxpages));
+ }
+#endif /* DEBUG */
+
+ if (brkcow) {
+ ASSERT(amp != NULL);
+ ASSERT(pplist == NULL);
+ ASSERT(szc == seg->s_szc);
+ ASSERT(IS_P2ALIGNED(a, maxpgsz));
+ ASSERT(IS_P2ALIGNED(aindx, maxpages));
+ SEGVN_VMSTAT_FLTVNPAGES(27);
+ ierr = anon_map_privatepages(amp, aindx, szc,
+ seg, a, prot, ppa, vpage, segvn_anypgsz,
+ svd->cred);
+ if (ierr != 0) {
+ SEGVN_VMSTAT_FLTVNPAGES(28);
+ anon_array_exit(&an_cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ SEGVN_RESTORE_SOFTLOCK(type, pages);
+ err = FC_MAKE_ERR(ierr);
+ goto out;
+ }
+
+ ASSERT(!IS_VMODSORT(ppa[0]->p_vnode));
+ /*
+ * p_szc can't be changed for locked
+ * swapfs pages.
+ */
+ hat_memload_array(hat, a, pgsz, ppa, prot,
+ hat_flag);
+
+ if (!(hat_flag & HAT_LOAD_LOCK)) {
+ SEGVN_VMSTAT_FLTVNPAGES(29);
+ for (i = 0; i < pages; i++) {
+ page_unlock(ppa[i]);
+ }
+ }
+ anon_array_exit(&an_cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ goto next;
+ }
+
+ pfn = page_pptonum(ppa[0]);
+ /*
+ * hat_page_demote() needs an EXCL lock on one of the
+ * constituent page_t's and it decreases the root's p_szc
+ * last. This means that if the root's p_szc is equal to
+ * szc and all its constituent pages are locked, any
+ * hat_page_demote() that could have changed p_szc to
+ * szc has already completed and no new hat_page_demote()
+ * can start for this large page.
+ */
+
+ /*
+ * We need to make sure the same mapping size is used for
+ * the same address range if there's a possibility the
+ * address is already mapped, because the hat layer panics
+ * when a translation is loaded for a range already
+ * mapped with a different page size. We achieve this
+ * by always using the largest page size possible subject
+ * to the constraints of page size, segment page size
+ * and page alignment. Since mappings are invalidated
+ * when those constraints change and make it
+ * impossible to use the previously used mapping size, no
+ * mapping size conflicts should happen.
+ */
+
+ chkszc:
+ if ((pszc = ppa[0]->p_szc) == szc &&
+ IS_P2ALIGNED(pfn, pages)) {
+
+ SEGVN_VMSTAT_FLTVNPAGES(30);
+#ifdef DEBUG
+ for (i = 0; i < pages; i++) {
+ ASSERT(PAGE_LOCKED(ppa[i]));
+ ASSERT(!PP_ISFREE(ppa[i]));
+ ASSERT(page_pptonum(ppa[i]) ==
+ pfn + i);
+ ASSERT(ppa[i]->p_szc == szc);
+ ASSERT(ppa[i]->p_vnode == vp);
+ ASSERT(ppa[i]->p_offset ==
+ off + (i << PAGESHIFT));
+ }
+#endif
+ /*
+ * All pages are of the szc we need and they are
+ * all locked so they can't change szc. Load the
+ * translations.
+ *
+ * If the page got promoted since the last check
+ * we don't need pplist.
+ */
+ if (pplist != NULL) {
+ page_free_replacement_page(pplist);
+ page_create_putback(pages);
+ }
+ if (PP_ISMIGRATE(ppa[0])) {
+ page_migrate(seg, a, ppa, pages);
+ }
+ SEGVN_UPDATE_MODBITS(ppa, pages, rw,
+ prot, vpprot);
+ if (!xhat) {
+ hat_memload_array(hat, a, pgsz, ppa,
+ prot & vpprot, hat_flag);
+ } else {
+ /*
+ * Avoid large xhat mappings to FS
+ * pages so that hat_page_demote()
+ * doesn't need to check for xhat
+ * large mappings.
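+ * Instead each constituent page is loaded below with
+ * an individual PAGESIZE hat_memload().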
+ */ + for (i = 0; i < pages; i++) { + hat_memload(hat, + a + (i << PAGESHIFT), + ppa[i], prot & vpprot, + hat_flag); + } + } + + if (!(hat_flag & HAT_LOAD_LOCK)) { + for (i = 0; i < pages; i++) { + page_unlock(ppa[i]); + } + } + if (amp != NULL) { + anon_array_exit(&an_cookie); + ANON_LOCK_EXIT(&->a_rwlock); + } + goto next; + } + + /* + * See if upsize is possible. + */ + if (pszc > szc && szc < seg->s_szc && + (segvn_anypgsz_vnode || pszc >= seg->s_szc)) { + pgcnt_t aphase; + uint_t pszc1 = MIN(pszc, seg->s_szc); + ppgsz = page_get_pagesize(pszc1); + ppages = btop(ppgsz); + aphase = btop(P2PHASE((uintptr_t)a, ppgsz)); + + SEGVN_VMSTAT_FLTVNPAGES(31); + if (aphase != P2PHASE(pfn, ppages)) { + segvn_faultvnmpss_align_err4++; + } else if (type == F_SOFTLOCK && + a != lpgaddr && + !IS_P2ALIGNED(pfn, + page_get_pagecnt(ppa[0]->p_szc))) { + /* + * if we locked previous offsets for + * smaller szc page larger page can't + * be here since one needs excl locks + * to promote page size. + */ + panic("segvn_fault_vnodepages: " + "unexpected larger than szc page" + " found after SOFTLOCK"); + } else { + SEGVN_VMSTAT_FLTVNPAGES(32); + if (pplist != NULL) { + page_t *pl = pplist; + page_free_replacement_page(pl); + page_create_putback(pages); + } + for (i = 0; i < pages; i++) { + page_unlock(ppa[i]); + } + if (amp != NULL) { + anon_array_exit(&an_cookie); + ANON_LOCK_EXIT(&->a_rwlock); + } + SEGVN_RESTORE_SOFTLOCK(type, pages); + pszc = pszc1; + ierr = -2; + break; + } + } + + /* + * check if we should use smallest mapping size. + */ + upgrdfail = 0; + if (szc == 0 || xhat || + (pszc >= szc && + !IS_P2ALIGNED(pfn, pages)) || + (pszc < szc && + !segvn_full_szcpages(ppa, szc, &upgrdfail, + &pszc))) { + + if (upgrdfail) { + /* + * segvn_full_szcpages failed to lock + * all pages EXCL. Size down. + */ + ASSERT(pszc < szc); + + SEGVN_VMSTAT_FLTVNPAGES(33); + + if (pplist != NULL) { + page_t *pl = pplist; + page_free_replacement_page(pl); + page_create_putback(pages); + } + + for (i = 0; i < pages; i++) { + page_unlock(ppa[i]); + } + if (amp != NULL) { + anon_array_exit(&an_cookie); + ANON_LOCK_EXIT(&->a_rwlock); + } + SEGVN_RESTORE_SOFTLOCK(type, pages); + ierr = -1; + break; + } + if (szc != 0 && !xhat) { + segvn_faultvnmpss_align_err5++; + } + SEGVN_VMSTAT_FLTVNPAGES(34); + if (pplist != NULL) { + page_free_replacement_page(pplist); + page_create_putback(pages); + } + SEGVN_UPDATE_MODBITS(ppa, pages, rw, + prot, vpprot); + for (i = 0; i < pages; i++) { + hat_memload(hat, a + (i << PAGESHIFT), + ppa[i], prot & vpprot, hat_flag); + } + if (!(hat_flag & HAT_LOAD_LOCK)) { + for (i = 0; i < pages; i++) { + page_unlock(ppa[i]); + } + } + if (amp != NULL) { + anon_array_exit(&an_cookie); + ANON_LOCK_EXIT(&->a_rwlock); + } + goto next; + } + + if (pszc == szc) { + /* + * segvn_full_szcpages() upgraded pages szc. + */ + ASSERT(pszc == ppa[0]->p_szc); + ASSERT(IS_P2ALIGNED(pfn, pages)); + goto chkszc; + } + + if (pszc > szc) { + kmutex_t *szcmtx; + SEGVN_VMSTAT_FLTVNPAGES(35); + /* + * p_szc of ppa[0] can change since we haven't + * locked all constituent pages. Call + * page_lock_szc() to prevent szc changes. + * This should be a rare case that happens when + * multiple segments use a different page size + * to map the same file offsets. 
+ */
+ szcmtx = page_szc_lock(ppa[0]);
+ pszc = ppa[0]->p_szc;
+ ASSERT(szcmtx != NULL || pszc == 0);
+ ASSERT(ppa[0]->p_szc <= pszc);
+ if (pszc <= szc) {
+ SEGVN_VMSTAT_FLTVNPAGES(36);
+ if (szcmtx != NULL) {
+ mutex_exit(szcmtx);
+ }
+ goto chkszc;
+ }
+ if (pplist != NULL) {
+ /*
+ * Page got promoted since the last check.
+ * We don't need the preallocated large
+ * page.
+ */
+ SEGVN_VMSTAT_FLTVNPAGES(37);
+ page_free_replacement_page(pplist);
+ page_create_putback(pages);
+ }
+ SEGVN_UPDATE_MODBITS(ppa, pages, rw,
+ prot, vpprot);
+ hat_memload_array(hat, a, pgsz, ppa,
+ prot & vpprot, hat_flag);
+ mutex_exit(szcmtx);
+ if (!(hat_flag & HAT_LOAD_LOCK)) {
+ for (i = 0; i < pages; i++) {
+ page_unlock(ppa[i]);
+ }
+ }
+ if (amp != NULL) {
+ anon_array_exit(&an_cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ }
+ goto next;
+ }
+
+ /*
+ * If the page got demoted since the last check
+ * we may not have allocated a larger page.
+ * Allocate one now.
+ */
+ if (pplist == NULL &&
+ page_alloc_pages(seg, a, &pplist, NULL, szc, 0)) {
+ SEGVN_VMSTAT_FLTVNPAGES(38);
+ for (i = 0; i < pages; i++) {
+ page_unlock(ppa[i]);
+ }
+ if (amp != NULL) {
+ anon_array_exit(&an_cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ }
+ SEGVN_RESTORE_SOFTLOCK(type, pages);
+ ierr = -1;
+ alloc_failed |= (1 << szc);
+ break;
+ }
+
+ SEGVN_VMSTAT_FLTVNPAGES(39);
+
+ segvn_relocate_pages(ppa, pplist);
+
+ SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot);
+ hat_memload_array(hat, a, pgsz, ppa, prot & vpprot,
+ hat_flag);
+ if (!(hat_flag & HAT_LOAD_LOCK)) {
+ for (i = 0; i < pages; i++) {
+ ASSERT(PAGE_SHARED(ppa[i]));
+ page_unlock(ppa[i]);
+ }
+ }
+ if (amp != NULL) {
+ anon_array_exit(&an_cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ }
+
+ next:
+ if (vpage != NULL) {
+ vpage += pages;
+ }
+ adjszc_chk = 1;
+ }
+ if (a == lpgeaddr)
+ break;
+ ASSERT(a < lpgeaddr);
+ /*
+ * ierr == -1 means we failed to map with a large page
+ * (either due to allocation/relocation failures or
+ * misalignment with other mappings to this file).
+ *
+ * ierr == -2 means some other thread allocated a large page
+ * after we gave up trying to map with a large page. Retry
+ * with a larger mapping.
+ */
+ ASSERT(ierr == -1 || ierr == -2);
+ ASSERT(ierr == -2 || szc != 0);
+ ASSERT(ierr == -1 || szc < seg->s_szc);
+ if (ierr == -2) {
+ SEGVN_VMSTAT_FLTVNPAGES(40);
+ ASSERT(pszc > szc && pszc <= seg->s_szc);
+ szc = pszc;
+ } else if (segvn_anypgsz_vnode) {
+ SEGVN_VMSTAT_FLTVNPAGES(41);
+ szc--;
+ } else {
+ SEGVN_VMSTAT_FLTVNPAGES(42);
+ ASSERT(pszc < szc);
+ /*
+ * Another process created a pszc large page,
+ * but we still have to drop to 0 szc.
+ */
+ szc = 0;
+ }
+
+ pgsz = page_get_pagesize(szc);
+ pages = btop(pgsz);
+ ASSERT(type != F_SOFTLOCK || ierr == -1 ||
+ (IS_P2ALIGNED(a, pgsz) && IS_P2ALIGNED(lpgeaddr, pgsz)));
+ if (type == F_SOFTLOCK) {
+ /*
+ * For softlocks we cannot reduce the fault area
+ * (calculated based on the largest page size for this
+ * segment) for size down and a is already next
+ * page size aligned as asserted above for size
+ * ups. Therefore just continue in case of softlock.
+ */
+ SEGVN_VMSTAT_FLTVNPAGES(43);
+ continue; /* keep lint happy */
+ } else if (ierr == -2) {
+
+ /*
+ * Size up case. Note lpgaddr may only be needed for
+ * softlock case so we don't adjust it here.
+ */
+ a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz);
+ ASSERT(a >= lpgaddr);
+ lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz);
+ off = svd->offset + (uintptr_t)(a - seg->s_base);
+ aindx = svd->anon_index + seg_page(seg, a);
+ vpage = (svd->vpage != NULL) ?
+ &svd->vpage[seg_page(seg, a)] : NULL; + } else { + /* + * Size down case. Note lpgaddr may only be needed for + * softlock case so we don't adjust it here. + */ + ASSERT(IS_P2ALIGNED(a, pgsz)); + ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz)); + lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); + ASSERT(a < lpgeaddr); + if (a < addr) { + SEGVN_VMSTAT_FLTVNPAGES(44); + /* + * The beginning of the large page region can + * be pulled to the right to make a smaller + * region. We haven't yet faulted a single + * page. + */ + a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); + ASSERT(a >= lpgaddr); + off = svd->offset + + (uintptr_t)(a - seg->s_base); + aindx = svd->anon_index + seg_page(seg, a); + vpage = (svd->vpage != NULL) ? + &svd->vpage[seg_page(seg, a)] : NULL; + } + } + } +out: + kmem_free(ppa, ppasize); + if (!err && !vop_size_err) { + SEGVN_VMSTAT_FLTVNPAGES(45); + return (0); + } + if (type == F_SOFTLOCK && a > lpgaddr) { + SEGVN_VMSTAT_FLTVNPAGES(46); + segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER); + } + if (!vop_size_err) { + SEGVN_VMSTAT_FLTVNPAGES(47); + return (err); + } + ASSERT(brkcow || type == F_SOFTLOCK); + /* + * Large page end is mapped beyond the end of file and it's a cow + * fault or softlock so we can't reduce the map area. For now just + * demote the segment. This should really only happen if the end of + * the file changed after the mapping was established since when large + * page segments are created we make sure they don't extend beyond the + * end of the file. + */ + SEGVN_VMSTAT_FLTVNPAGES(48); + + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); + err = 0; + if (seg->s_szc != 0) { + err = segvn_clrszc(seg); + if (err != 0) { + segvn_fltvnpages_clrszc_err++; + } + } + ASSERT(err || seg->s_szc == 0); + SEGVN_LOCK_DOWNGRADE(seg->s_as, &svd->lock); + /* segvn_fault will do its job as if szc had been zero to begin with */ + return (err == 0 ? IE_RETRY : FC_MAKE_ERR(err)); +} + +/* + * This routine will attempt to fault in one large page. + * it will use smaller pages if that fails. + * It should only be called for pure anonymous segments. + */ +static faultcode_t +segvn_fault_anonpages(struct hat *hat, struct seg *seg, caddr_t lpgaddr, + caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr, + caddr_t eaddr, int brkcow) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + struct anon_map *amp = svd->amp; + uchar_t segtype = svd->type; + uint_t szc = seg->s_szc; + size_t pgsz = page_get_pagesize(szc); + size_t maxpgsz = pgsz; + pgcnt_t pages = btop(pgsz); + size_t ppasize = pages * sizeof (page_t *); + caddr_t a = lpgaddr; + ulong_t aindx = svd->anon_index + seg_page(seg, a); + struct vpage *vpage = (svd->vpage != NULL) ? + &svd->vpage[seg_page(seg, a)] : NULL; + page_t **ppa; + uint_t ppa_szc; + faultcode_t err; + int ierr; + uint_t protchk, prot, vpprot; + int i; + int hat_flag = (type == F_SOFTLOCK) ? 
HAT_LOAD_LOCK : HAT_LOAD; + anon_sync_obj_t cookie; + + ASSERT(szc != 0); + ASSERT(amp != NULL); + ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */ + ASSERT(!(svd->flags & MAP_NORESERVE)); + ASSERT(type != F_SOFTUNLOCK); + ASSERT(segtype == MAP_PRIVATE); + ASSERT(IS_P2ALIGNED(a, maxpgsz)); + + ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); + + VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltanpages[0]); + VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltanpages[1]); + + if (svd->flags & MAP_TEXT) { + hat_flag |= HAT_LOAD_TEXT; + } + + if (svd->pageprot) { + switch (rw) { + case S_READ: + protchk = PROT_READ; + break; + case S_WRITE: + protchk = PROT_WRITE; + break; + case S_EXEC: + protchk = PROT_EXEC; + break; + case S_OTHER: + default: + protchk = PROT_READ | PROT_WRITE | PROT_EXEC; + break; + } + VM_STAT_ADD(segvnvmstats.fltanpages[2]); + } else { + prot = svd->prot; + /* caller has already done segment level protection check. */ + } + + ppa = kmem_alloc(ppasize, KM_SLEEP); + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + for (;;) { + for (; a < lpgeaddr; a += pgsz, aindx += pages) { + if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) { + VM_STAT_ADD(segvnvmstats.fltanpages[3]); + ASSERT(vpage != NULL); + prot = VPP_PROT(vpage); + ASSERT(sameprot(seg, a, maxpgsz)); + if ((prot & protchk) == 0) { + err = FC_PROT; + goto error; + } + } + if (type == F_SOFTLOCK) { + mutex_enter(&freemem_lock); + if (availrmem < tune.t_minarmem + pages) { + mutex_exit(&freemem_lock); + err = FC_MAKE_ERR(ENOMEM); + goto error; + } else { + availrmem -= pages; + segvn_pages_locked += pages; + svd->softlockcnt += pages; + } + mutex_exit(&freemem_lock); + } + anon_array_enter(amp, aindx, &cookie); + ppa_szc = (uint_t)-1; + ierr = anon_map_getpages(amp, aindx, szc, seg, a, + prot, &vpprot, ppa, &ppa_szc, vpage, rw, brkcow, + segvn_anypgsz, svd->cred); + if (ierr != 0) { + anon_array_exit(&cookie); + VM_STAT_ADD(segvnvmstats.fltanpages[4]); + if (type == F_SOFTLOCK) { + VM_STAT_ADD(segvnvmstats.fltanpages[5]); + mutex_enter(&freemem_lock); + availrmem += pages; + segvn_pages_locked -= pages; + svd->softlockcnt -= pages; + mutex_exit(&freemem_lock); + } + if (ierr > 0) { + VM_STAT_ADD(segvnvmstats.fltanpages[6]); + err = FC_MAKE_ERR(ierr); + goto error; + } + break; + } + + ASSERT(!IS_VMODSORT(ppa[0]->p_vnode)); + + /* + * Handle pages that have been marked for migration + */ + if (lgrp_optimizations()) + page_migrate(seg, a, ppa, pages); + + hat_memload_array(hat, a, pgsz, ppa, + prot & vpprot, hat_flag); + + if (hat_flag & HAT_LOAD_LOCK) { + VM_STAT_ADD(segvnvmstats.fltanpages[7]); + } else { + VM_STAT_ADD(segvnvmstats.fltanpages[8]); + for (i = 0; i < pages; i++) + page_unlock(ppa[i]); + } + if (vpage != NULL) + vpage += pages; + + anon_array_exit(&cookie); + } + if (a == lpgeaddr) + break; + ASSERT(a < lpgeaddr); + /* + * ierr == -1 means we failed to allocate a large page. + * so do a size down operation. + * + * ierr == -2 means some other process that privately shares + * pages with this process has allocated a larger page and we + * need to retry with larger pages. So do a size up + * operation. This relies on the fact that large pages are + * never partially shared i.e. if we share any constituent + * page of a large page with another process we must share the + * entire large page. 
Note this cannot happen for SOFTLOCK + * case, unless current address (a) is at the beginning of the + * next page size boundary because the other process couldn't + * have relocated locked pages. + */ + ASSERT(ierr == -1 || ierr == -2); + if (segvn_anypgsz) { + ASSERT(ierr == -2 || szc != 0); + ASSERT(ierr == -1 || szc < seg->s_szc); + szc = (ierr == -1) ? szc - 1 : szc + 1; + } else { + /* + * For non COW faults and segvn_anypgsz == 0 + * we need to be careful not to loop forever + * if existing page is found with szc other + * than 0 or seg->s_szc. This could be due + * to page relocations on behalf of DR or + * more likely large page creation. For this + * case simply re-size to existing page's szc + * if returned by anon_map_getpages(). + */ + if (ppa_szc == (uint_t)-1) { + szc = (ierr == -1) ? 0 : seg->s_szc; + } else { + ASSERT(ppa_szc <= seg->s_szc); + ASSERT(ierr == -2 || ppa_szc < szc); + ASSERT(ierr == -1 || ppa_szc > szc); + szc = ppa_szc; + } + } + + pgsz = page_get_pagesize(szc); + pages = btop(pgsz); + ASSERT(type != F_SOFTLOCK || ierr == -1 || + (IS_P2ALIGNED(a, pgsz) && IS_P2ALIGNED(lpgeaddr, pgsz))); + if (type == F_SOFTLOCK) { + /* + * For softlocks we cannot reduce the fault area + * (calculated based on the largest page size for this + * segment) for size down and a is already next + * page size aligned as assertted above for size + * ups. Therefore just continue in case of softlock. + */ + VM_STAT_ADD(segvnvmstats.fltanpages[9]); + continue; /* keep lint happy */ + } else if (ierr == -2) { + + /* + * Size up case. Note lpgaddr may only be needed for + * softlock case so we don't adjust it here. + */ + VM_STAT_ADD(segvnvmstats.fltanpages[10]); + a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz); + ASSERT(a >= lpgaddr); + lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); + aindx = svd->anon_index + seg_page(seg, a); + vpage = (svd->vpage != NULL) ? + &svd->vpage[seg_page(seg, a)] : NULL; + } else { + /* + * Size down case. Note lpgaddr may only be needed for + * softlock case so we don't adjust it here. + */ + VM_STAT_ADD(segvnvmstats.fltanpages[11]); + ASSERT(IS_P2ALIGNED(a, pgsz)); + ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz)); + lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); + ASSERT(a < lpgeaddr); + if (a < addr) { + /* + * The beginning of the large page region can + * be pulled to the right to make a smaller + * region. We haven't yet faulted a single + * page. + */ + VM_STAT_ADD(segvnvmstats.fltanpages[12]); + a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); + ASSERT(a >= lpgaddr); + aindx = svd->anon_index + seg_page(seg, a); + vpage = (svd->vpage != NULL) ? + &svd->vpage[seg_page(seg, a)] : NULL; + } + } + } + VM_STAT_ADD(segvnvmstats.fltanpages[13]); + ANON_LOCK_EXIT(&->a_rwlock); + kmem_free(ppa, ppasize); + return (0); +error: + VM_STAT_ADD(segvnvmstats.fltanpages[14]); + ANON_LOCK_EXIT(&->a_rwlock); + kmem_free(ppa, ppasize); + if (type == F_SOFTLOCK && a > lpgaddr) { + VM_STAT_ADD(segvnvmstats.fltanpages[15]); + segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER); + } + return (err); +} + +int fltadvice = 1; /* set to free behind pages for sequential access */ + +/* + * This routine is called via a machine specific fault handling routine. + * It is also called by software routines wishing to lock or unlock + * a range of addresses. 
+ *
+ * Here is the basic algorithm:
+ * If unlocking
+ * Call segvn_softunlock
+ * Return
+ * endif
+ * Checking and set up work
+ * If we will need some non-anonymous pages
+ * Call VOP_GETPAGE over the range of non-anonymous pages
+ * endif
+ * Loop over all addresses requested
+ * Call segvn_faultpage passing in page list
+ * to load up translations and handle anonymous pages
+ * endloop
+ * Load up translation to any additional pages in page list not
+ * already handled that fit into this segment
+ */
+static faultcode_t
+segvn_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len,
+ enum fault_type type, enum seg_rw rw)
+{
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+ page_t **plp, **ppp, *pp;
+ u_offset_t off;
+ caddr_t a;
+ struct vpage *vpage;
+ uint_t vpprot, prot;
+ int err;
+ page_t *pl[PVN_GETPAGE_NUM + 1];
+ size_t plsz, pl_alloc_sz;
+ size_t page;
+ ulong_t anon_index;
+ struct anon_map *amp;
+ int dogetpage = 0;
+ caddr_t lpgaddr, lpgeaddr;
+ size_t pgsz;
+ anon_sync_obj_t cookie;
+ int brkcow = BREAK_COW_SHARE(rw, type, svd->type);
+
+ /*
+ * S_READ_NOCOW is like S_READ
+ * except the caller advises there is no need
+ * to copy-on-write for softlock
+ * because it holds the address space
+ * locked as writer and thus prevents
+ * any copy-on-write of a softlocked
+ * page by another thread.
+ * The S_READ_NOCOW vs S_READ distinction is
+ * only needed for BREAK_COW_SHARE(). After
+ * that we treat S_READ_NOCOW as just S_READ.
+ */
+ if (rw == S_READ_NOCOW) {
+ rw = S_READ;
+ ASSERT(type == F_SOFTLOCK &&
+ AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
+ }
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ /*
+ * First handle the easy stuff
+ */
+ if (type == F_SOFTUNLOCK) {
+ SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
+ pgsz = (seg->s_szc == 0) ? PAGESIZE :
+ page_get_pagesize(seg->s_szc);
+ VM_STAT_COND_ADD(pgsz > PAGESIZE, segvnvmstats.fltanpages[16]);
+ CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr);
+ segvn_softunlock(seg, lpgaddr, lpgeaddr - lpgaddr, rw);
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (0);
+ }
+
+top:
+ SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
+
+ /*
+ * If we have the same protections for the entire segment,
+ * insure that the access being attempted is legitimate.
+ */
+
+ if (svd->pageprot == 0) {
+ uint_t protchk;
+
+ switch (rw) {
+ case S_READ:
+ protchk = PROT_READ;
+ break;
+ case S_WRITE:
+ protchk = PROT_WRITE;
+ break;
+ case S_EXEC:
+ protchk = PROT_EXEC;
+ break;
+ case S_OTHER:
+ default:
+ protchk = PROT_READ | PROT_WRITE | PROT_EXEC;
+ break;
+ }
+
+ if ((svd->prot & protchk) == 0) {
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (FC_PROT); /* illegal access type */
+ }
+ }
+
+ /*
+ * Check to see if we need to allocate an anon_map structure.
+ */
+ if (svd->amp == NULL && (svd->vp == NULL || brkcow)) {
+ /*
+ * Drop the "read" lock on the segment and acquire
+ * the "write" version since we have to allocate the
+ * anon_map.
+ */
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
+
+ if (svd->amp == NULL) {
+ svd->amp = anonmap_alloc(seg->s_size, 0);
+ svd->amp->a_szc = seg->s_szc;
+ }
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+
+ /*
+ * Start all over again since segment protections
+ * may have changed after we dropped the "read" lock.
+ */
+ goto top;
+ }
+
+ amp = svd->amp;
+
+ /*
+ * MADV_SEQUENTIAL work is ignored for large page segments.
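+ * The MADV_SEQUENTIAL freebehind code further below only deals
+ * with PAGESIZE pages; large page faults are handed off to
+ * segvn_fault_anonpages()/segvn_fault_vnodepages() instead.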
+ */ + if (seg->s_szc != 0) { + pgsz = page_get_pagesize(seg->s_szc); + ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); + /* + * We may need to do relocations so purge seg_pcache to allow + * pages to be locked exclusively. + */ + if (svd->softlockcnt != 0) + segvn_purge(seg); + CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); + if (svd->vp == NULL) { + ASSERT(svd->type == MAP_PRIVATE); + err = segvn_fault_anonpages(hat, seg, lpgaddr, + lpgeaddr, type, rw, addr, addr + len, brkcow); + } else { + err = segvn_fault_vnodepages(hat, seg, lpgaddr, + lpgeaddr, type, rw, addr, addr + len, brkcow); + if (err == IE_RETRY) { + ASSERT(seg->s_szc == 0); + ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock)); + goto cont; + } + } + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (err); + } + +cont: + page = seg_page(seg, addr); + if (amp != NULL) { + anon_index = svd->anon_index + page; + + if ((type == F_PROT) && (rw == S_READ) && + svd->type == MAP_PRIVATE && svd->pageprot == 0) { + size_t index = anon_index; + struct anon *ap; + + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + /* + * The fast path could apply to S_WRITE also, except + * that the protection fault could be caused by lazy + * tlb flush when ro->rw. In this case, the pte is + * RW already. But RO in the other cpu's tlb causes + * the fault. Since hat_chgprot won't do anything if + * pte doesn't change, we may end up faulting + * indefinitely until the RO tlb entry gets replaced. + */ + for (a = addr; a < addr + len; a += PAGESIZE, index++) { + anon_array_enter(amp, index, &cookie); + ap = anon_get_ptr(amp->ahp, index); + anon_array_exit(&cookie); + if ((ap == NULL) || (ap->an_refcnt != 1)) { + ANON_LOCK_EXIT(&->a_rwlock); + goto slow; + } + } + hat_chgprot(seg->s_as->a_hat, addr, len, svd->prot); + ANON_LOCK_EXIT(&->a_rwlock); + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (0); + } + } +slow: + + if (svd->vpage == NULL) + vpage = NULL; + else + vpage = &svd->vpage[page]; + + off = svd->offset + (uintptr_t)(addr - seg->s_base); + + /* + * If MADV_SEQUENTIAL has been set for the particular page we + * are faulting on, free behind all pages in the segment and put + * them on the free list. + */ + if ((page != 0) && fltadvice) { /* not if first page in segment */ + struct vpage *vpp; + ulong_t fanon_index; + size_t fpage; + u_offset_t pgoff, fpgoff; + struct vnode *fvp; + struct anon *fap = NULL; + + if (svd->advice == MADV_SEQUENTIAL || + (svd->pageadvice && + VPP_ADVICE(vpage) == MADV_SEQUENTIAL)) { + pgoff = off - PAGESIZE; + fpage = page - 1; + if (vpage != NULL) + vpp = &svd->vpage[fpage]; + if (amp != NULL) + fanon_index = svd->anon_index + fpage; + + while (pgoff > svd->offset) { + if (svd->advice != MADV_SEQUENTIAL && + (!svd->pageadvice || (vpage && + VPP_ADVICE(vpp) != MADV_SEQUENTIAL))) + break; + + /* + * If this is an anon page, we must find the + * correct <vp, offset> for it + */ + fap = NULL; + if (amp != NULL) { + ANON_LOCK_ENTER(&->a_rwlock, + RW_READER); + anon_array_enter(amp, fanon_index, + &cookie); + fap = anon_get_ptr(amp->ahp, + fanon_index); + if (fap != NULL) { + swap_xlate(fap, &fvp, &fpgoff); + } else { + fpgoff = pgoff; + fvp = svd->vp; + } + anon_array_exit(&cookie); + ANON_LOCK_EXIT(&->a_rwlock); + } else { + fpgoff = pgoff; + fvp = svd->vp; + } + if (fvp == NULL) + break; /* XXX */ + /* + * Skip pages that are free or have an + * "exclusive" lock. 
+ */ + pp = page_lookup_nowait(fvp, fpgoff, SE_SHARED); + if (pp == NULL) + break; + /* + * We don't need the page_struct_lock to test + * as this is only advisory; even if we + * acquire it someone might race in and lock + * the page after we unlock and before the + * PUTPAGE, then VOP_PUTPAGE will do nothing. + */ + if (pp->p_lckcnt == 0 && pp->p_cowcnt == 0) { + /* + * Hold the vnode before releasing + * the page lock to prevent it from + * being freed and re-used by some + * other thread. + */ + VN_HOLD(fvp); + page_unlock(pp); + /* + * We should build a page list + * to kluster putpages XXX + */ + (void) VOP_PUTPAGE(fvp, + (offset_t)fpgoff, PAGESIZE, + (B_DONTNEED|B_FREE|B_ASYNC), + svd->cred); + VN_RELE(fvp); + } else { + /* + * XXX - Should the loop terminate if + * the page is `locked'? + */ + page_unlock(pp); + } + --vpp; + --fanon_index; + pgoff -= PAGESIZE; + } + } + } + + plp = pl; + *plp = NULL; + pl_alloc_sz = 0; + + /* + * See if we need to call VOP_GETPAGE for + * *any* of the range being faulted on. + * We can skip all of this work if there + * was no original vnode. + */ + if (svd->vp != NULL) { + u_offset_t vp_off; + size_t vp_len; + struct anon *ap; + vnode_t *vp; + + vp_off = off; + vp_len = len; + + if (amp == NULL) + dogetpage = 1; + else { + /* + * Only acquire reader lock to prevent amp->ahp + * from being changed. It's ok to miss pages, + * hence we don't do anon_array_enter + */ + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + ap = anon_get_ptr(amp->ahp, anon_index); + + if (len <= PAGESIZE) + /* inline non_anon() */ + dogetpage = (ap == NULL); + else + dogetpage = non_anon(amp->ahp, anon_index, + &vp_off, &vp_len); + ANON_LOCK_EXIT(&->a_rwlock); + } + + if (dogetpage) { + enum seg_rw arw; + struct as *as = seg->s_as; + + if (len > ptob((sizeof (pl) / sizeof (pl[0])) - 1)) { + /* + * Page list won't fit in local array, + * allocate one of the needed size. + */ + pl_alloc_sz = + (btop(len) + 1) * sizeof (page_t *); + plp = kmem_alloc(pl_alloc_sz, KM_SLEEP); + plp[0] = NULL; + plsz = len; + } else if (rw == S_WRITE && svd->type == MAP_PRIVATE || + rw == S_OTHER || + (((size_t)(addr + PAGESIZE) < + (size_t)(seg->s_base + seg->s_size)) && + hat_probe(as->a_hat, addr + PAGESIZE))) { + /* + * Ask VOP_GETPAGE to return the exact number + * of pages if + * (a) this is a COW fault, or + * (b) this is a software fault, or + * (c) next page is already mapped. + */ + plsz = len; + } else { + /* + * Ask VOP_GETPAGE to return adjacent pages + * within the segment. + */ + plsz = MIN((size_t)PVN_GETPAGE_SZ, (size_t) + ((seg->s_base + seg->s_size) - addr)); + ASSERT((addr + plsz) <= + (seg->s_base + seg->s_size)); + } + + /* + * Need to get some non-anonymous pages. + * We need to make only one call to GETPAGE to do + * this to prevent certain deadlocking conditions + * when we are doing locking. In this case + * non_anon() should have picked up the smallest + * range which includes all the non-anonymous + * pages in the requested range. We have to + * be careful regarding which rw flag to pass in + * because on a private mapping, the underlying + * object is never allowed to be written. 
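+ * Hence for MAP_PRIVATE segments an S_WRITE fault is passed
+ * down to VOP_GETPAGE() as S_READ below.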
+ */ + if (rw == S_WRITE && svd->type == MAP_PRIVATE) { + arw = S_READ; + } else { + arw = rw; + } + vp = svd->vp; + TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE, + "segvn_getpage:seg %p addr %p vp %p", + seg, addr, vp); + err = VOP_GETPAGE(vp, (offset_t)vp_off, vp_len, + &vpprot, plp, plsz, seg, addr + (vp_off - off), arw, + svd->cred); + if (err) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + segvn_pagelist_rele(plp); + if (pl_alloc_sz) + kmem_free(plp, pl_alloc_sz); + return (FC_MAKE_ERR(err)); + } + if (svd->type == MAP_PRIVATE) + vpprot &= ~PROT_WRITE; + } + } + + /* + * N.B. at this time the plp array has all the needed non-anon + * pages in addition to (possibly) having some adjacent pages. + */ + + /* + * Always acquire the anon_array_lock to prevent + * 2 threads from allocating separate anon slots for + * the same "addr". + * + * If this is a copy-on-write fault and we don't already + * have the anon_array_lock, acquire it to prevent the + * fault routine from handling multiple copy-on-write faults + * on the same "addr" in the same address space. + * + * Only one thread should deal with the fault since after + * it is handled, the other threads can acquire a translation + * to the newly created private page. This prevents two or + * more threads from creating different private pages for the + * same fault. + * + * We grab "serialization" lock here if this is a MAP_PRIVATE segment + * to prevent deadlock between this thread and another thread + * which has soft-locked this page and wants to acquire serial_lock. + * ( bug 4026339 ) + * + * The fix for bug 4026339 becomes unnecessary when using the + * locking scheme with per amp rwlock and a global set of hash + * lock, anon_array_lock. If we steal a vnode page when low + * on memory and upgrad the page lock through page_rename, + * then the page is PAGE_HANDLED, nothing needs to be done + * for this page after returning from segvn_faultpage. + * + * But really, the page lock should be downgraded after + * the stolen page is page_rename'd. + */ + + if (amp != NULL) + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + + /* + * Ok, now loop over the address range and handle faults + */ + for (a = addr; a < addr + len; a += PAGESIZE, off += PAGESIZE) { + err = segvn_faultpage(hat, seg, a, off, vpage, plp, vpprot, + type, rw, brkcow); + if (err) { + if (amp != NULL) + ANON_LOCK_EXIT(&->a_rwlock); + if (type == F_SOFTLOCK && a > addr) + segvn_softunlock(seg, addr, (a - addr), + S_OTHER); + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + segvn_pagelist_rele(plp); + if (pl_alloc_sz) + kmem_free(plp, pl_alloc_sz); + return (err); + } + if (vpage) { + vpage++; + } else if (svd->vpage) { + page = seg_page(seg, addr); + vpage = &svd->vpage[++page]; + } + } + + /* Didn't get pages from the underlying fs so we're done */ + if (!dogetpage) + goto done; + + /* + * Now handle any other pages in the list returned. + * If the page can be used, load up the translations now. + * Note that the for loop will only be entered if "plp" + * is pointing to a non-NULL page pointer which means that + * VOP_GETPAGE() was called and vpprot has been initialized. + */ + if (svd->pageprot == 0) + prot = svd->prot & vpprot; + + + /* + * Large Files: diff should be unsigned value because we started + * supporting > 2GB segment sizes from 2.5.1 and when a + * large file of size > 2GB gets mapped to address space + * the diff value can be > 2GB. 
+ */ + + for (ppp = plp; (pp = *ppp) != NULL; ppp++) { + size_t diff; + struct anon *ap; + int anon_index; + anon_sync_obj_t cookie; + int hat_flag = HAT_LOAD_ADV; + + if (svd->flags & MAP_TEXT) { + hat_flag |= HAT_LOAD_TEXT; + } + + if (pp == PAGE_HANDLED) + continue; + + if (pp->p_offset >= svd->offset && + (pp->p_offset < svd->offset + seg->s_size)) { + + diff = pp->p_offset - svd->offset; + + /* + * Large Files: Following is the assertion + * validating the above cast. + */ + ASSERT(svd->vp == pp->p_vnode); + + page = btop(diff); + if (svd->pageprot) + prot = VPP_PROT(&svd->vpage[page]) & vpprot; + + /* + * Prevent other threads in the address space from + * creating private pages (i.e., allocating anon slots) + * while we are in the process of loading translations + * to additional pages returned by the underlying + * object. + */ + if (amp != NULL) { + anon_index = svd->anon_index + page; + anon_array_enter(amp, anon_index, &cookie); + ap = anon_get_ptr(amp->ahp, anon_index); + } + if ((amp == NULL) || (ap == NULL)) { + if (IS_VMODSORT(pp->p_vnode) || + enable_mbit_wa) { + if (rw == S_WRITE) + hat_setmod(pp); + else if (rw != S_OTHER && + !hat_ismod(pp)) + prot &= ~PROT_WRITE; + } + /* + * Skip mapping read ahead pages marked + * for migration, so they will get migrated + * properly on fault + */ + if ((prot & PROT_READ) && !PP_ISMIGRATE(pp)) { + hat_memload(hat, seg->s_base + diff, + pp, prot, hat_flag); + } + } + if (amp != NULL) + anon_array_exit(&cookie); + } + page_unlock(pp); + } +done: + if (amp != NULL) + ANON_LOCK_EXIT(&->a_rwlock); + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + if (pl_alloc_sz) + kmem_free(plp, pl_alloc_sz); + return (0); +} + +/* + * This routine is used to start I/O on pages asynchronously. XXX it will + * only create PAGESIZE pages. At fault time they will be relocated into + * larger pages. + */ +static faultcode_t +segvn_faulta(struct seg *seg, caddr_t addr) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + int err; + struct anon_map *amp; + vnode_t *vp; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); + if ((amp = svd->amp) != NULL) { + struct anon *ap; + + /* + * Reader lock to prevent amp->ahp from being changed. + * This is advisory, it's ok to miss a page, so + * we don't do anon_array_enter lock. 
+ */ + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + if ((ap = anon_get_ptr(amp->ahp, + svd->anon_index + seg_page(seg, addr))) != NULL) { + + err = anon_getpage(&ap, NULL, NULL, + 0, seg, addr, S_READ, svd->cred); + + ANON_LOCK_EXIT(&->a_rwlock); + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + if (err) + return (FC_MAKE_ERR(err)); + return (0); + } + ANON_LOCK_EXIT(&->a_rwlock); + } + + if (svd->vp == NULL) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (0); /* zfod page - do nothing now */ + } + + vp = svd->vp; + TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE, + "segvn_getpage:seg %p addr %p vp %p", seg, addr, vp); + err = VOP_GETPAGE(vp, + (offset_t)(svd->offset + (uintptr_t)(addr - seg->s_base)), + PAGESIZE, NULL, NULL, 0, seg, addr, + S_OTHER, svd->cred); + + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + if (err) + return (FC_MAKE_ERR(err)); + return (0); +} + +static int +segvn_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + struct vpage *svp, *evp; + struct vnode *vp; + size_t pgsz; + pgcnt_t pgcnt; + anon_sync_obj_t cookie; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + if ((svd->maxprot & prot) != prot) + return (EACCES); /* violated maxprot */ + + SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); + + /* return if prot is the same */ + if (!svd->pageprot && svd->prot == prot) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (0); + } + + /* + * Since we change protections we first have to flush the cache. + * This makes sure all the pagelock calls have to recheck + * protections. + */ + if (svd->softlockcnt > 0) { + /* + * Since we do have the segvn writers lock nobody can fill + * the cache with entries belonging to this seg during + * the purge. The flush either succeeds or we still have + * pending I/Os. + */ + segvn_purge(seg); + if (svd->softlockcnt > 0) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (EAGAIN); + } + } + + if (seg->s_szc != 0) { + int err; + pgsz = page_get_pagesize(seg->s_szc); + pgcnt = pgsz >> PAGESHIFT; + ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); + if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + ASSERT(seg->s_base != addr || seg->s_size != len); + /* + * If we are holding the as lock as a reader then + * we need to return IE_RETRY and let the as + * layer drop and re-aquire the lock as a writer. + */ + if (AS_READ_HELD(seg->s_as, &seg->s_as->a_lock)) + return (IE_RETRY); + VM_STAT_ADD(segvnvmstats.demoterange[1]); + err = segvn_demote_range(seg, addr, len, SDR_END); + if (err == 0) + return (IE_RETRY); + if (err == ENOMEM) + return (IE_NOMEM); + return (err); + } + } + + + /* + * If it's a private mapping and we're making it writable + * and no swap space has been reserved, have to reserve + * it all now. If it's a private mapping to a file (i.e., vp != NULL) + * and we're removing write permission on the entire segment and + * we haven't modified any pages, we can release the swap space. 
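+	 * For example, making an 8MB MAP_PRIVATE file mapping writable
+	 * reserves 8MB of swap up front, since every page could later be
+	 * copied on write; removing PROT_WRITE from the whole, still
+	 * unmodified segment lets that reservation go again.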
+ */ + if (svd->type == MAP_PRIVATE) { + if (prot & PROT_WRITE) { + size_t sz; + if (svd->swresv == 0 && !(svd->flags & MAP_NORESERVE)) { + if (anon_resv(seg->s_size) == 0) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (IE_NOMEM); + } + sz = svd->swresv = seg->s_size; + TRACE_3(TR_FAC_VM, TR_ANON_PROC, + "anon proc:%p %lu %u", + seg, sz, 1); + } + } else { + /* + * Swap space is released only if this segment + * does not map anonymous memory, since read faults + * on such segments still need an anon slot to read + * in the data. + */ + if (svd->swresv != 0 && svd->vp != NULL && + svd->amp == NULL && addr == seg->s_base && + len == seg->s_size && svd->pageprot == 0) { + anon_unresv(svd->swresv); + svd->swresv = 0; + TRACE_3(TR_FAC_VM, TR_ANON_PROC, + "anon proc:%p %lu %u", + seg, 0, 0); + } + } + } + + if (addr == seg->s_base && len == seg->s_size && svd->pageprot == 0) { + if (svd->prot == prot) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (0); /* all done */ + } + svd->prot = (uchar_t)prot; + } else { + struct anon *ap = NULL; + page_t *pp; + u_offset_t offset, off; + struct anon_map *amp; + ulong_t anon_idx = 0; + + /* + * A vpage structure exists or else the change does not + * involve the entire segment. Establish a vpage structure + * if none is there. Then, for each page in the range, + * adjust its individual permissions. Note that write- + * enabling a MAP_PRIVATE page can affect the claims for + * locked down memory. Overcommitting memory terminates + * the operation. + */ + segvn_vpage(seg); + if ((amp = svd->amp) != NULL) { + anon_idx = svd->anon_index + seg_page(seg, addr); + ASSERT(seg->s_szc == 0 || + IS_P2ALIGNED(anon_idx, pgcnt)); + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + } + + offset = svd->offset + (uintptr_t)(addr - seg->s_base); + evp = &svd->vpage[seg_page(seg, addr + len)]; + + /* + * See Statement at the beginning of segvn_lockop regarding + * the way cowcnts and lckcnts are handled. + */ + for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) { + + ASSERT(seg->s_szc == 0 || + (svd->vp != NULL || svd->type == MAP_PRIVATE)); + + if (seg->s_szc != 0 && svd->type == MAP_PRIVATE) { + if (amp != NULL) { + anon_array_enter(amp, anon_idx, + &cookie); + } + if (IS_P2ALIGNED(anon_idx, pgcnt) && + !segvn_claim_pages(seg, svp, offset, + anon_idx, prot)) { + if (amp != NULL) { + anon_array_exit(&cookie); + } + break; + } + if (amp != NULL) { + anon_array_exit(&cookie); + } + anon_idx++; + } else { + if (amp != NULL) { + anon_array_enter(amp, anon_idx, + &cookie); + ap = anon_get_ptr(amp->ahp, anon_idx++); + } + + if (VPP_ISPPLOCK(svp) && + (VPP_PROT(svp) != prot) && + (svd->type == MAP_PRIVATE)) { + + if (amp == NULL || ap == NULL) { + vp = svd->vp; + off = offset; + } else + swap_xlate(ap, &vp, &off); + if (amp != NULL) + anon_array_exit(&cookie); + + if ((pp = page_lookup(vp, off, + SE_SHARED)) == NULL) { + panic("segvn_setprot: no page"); + /*NOTREACHED*/ + } + ASSERT(seg->s_szc == 0); + if ((VPP_PROT(svp) ^ prot) & + PROT_WRITE) { + if (prot & PROT_WRITE) { + if (!page_addclaim(pp)) { + page_unlock(pp); + break; + } + } else { + if (!page_subclaim(pp)) { + page_unlock(pp); + break; + } + } + } + page_unlock(pp); + } else if (amp != NULL) + anon_array_exit(&cookie); + } + VPP_SETPROT(svp, prot); + offset += PAGESIZE; + } + if (amp != NULL) + ANON_LOCK_EXIT(&->a_rwlock); + + /* + * Did we terminate prematurely? If so, simply unload + * the translations to the things we've updated so far. 
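+		 * For example, if page_addclaim() fails part way through
+		 * the loop, len is recomputed to cover only the vpages
+		 * already updated and their translations are unloaded
+		 * before returning IE_NOMEM.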
+ */ + if (svp != evp) { + len = (svp - &svd->vpage[seg_page(seg, addr)]) * + PAGESIZE; + ASSERT(seg->s_szc == 0 || IS_P2ALIGNED(len, pgsz)); + if (len != 0) + hat_unload(seg->s_as->a_hat, addr, + len, HAT_UNLOAD); + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (IE_NOMEM); + } + } + + if ((prot & PROT_WRITE) != 0 || (prot & ~PROT_USER) == PROT_NONE) { + /* + * Either private or shared data with write access (in + * which case we need to throw out all former translations + * so that we get the right translations set up on fault + * and we don't allow write access to any copy-on-write pages + * that might be around or to prevent write access to pages + * representing holes in a file), or we don't have permission + * to access the memory at all (in which case we have to + * unload any current translations that might exist). + */ + hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); + } else { + /* + * A shared mapping or a private mapping in which write + * protection is going to be denied - just change all the + * protections over the range of addresses in question. + * segvn does not support any other attributes other + * than prot so we can use hat_chgattr. + */ + hat_chgattr(seg->s_as->a_hat, addr, len, prot); + } + + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + + return (0); +} + +/* + * segvn_setpagesize is called via SEGOP_SETPAGESIZE from as_setpagesize, + * to determine if the seg is capable of mapping the requested szc. + */ +static int +segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + struct segvn_data *nsvd; + struct anon_map *amp = svd->amp; + struct seg *nseg; + caddr_t eaddr = addr + len, a; + size_t pgsz = page_get_pagesize(szc); + int err; + u_offset_t off = svd->offset + (uintptr_t)(addr - seg->s_base); + extern struct vnode kvp; + + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); + ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size); + + if (seg->s_szc == szc || segvn_lpg_disable != 0) { + return (0); + } + + /* + * addr should always be pgsz aligned but eaddr may be misaligned if + * it's at the end of the segment. + * + * XXX we should assert this condition since as_setpagesize() logic + * guarantees it. + */ + if (!IS_P2ALIGNED(addr, pgsz) || + (!IS_P2ALIGNED(eaddr, pgsz) && + eaddr != seg->s_base + seg->s_size)) { + + segvn_setpgsz_align_err++; + return (EINVAL); + } + + if ((svd->vp == NULL && svd->type == MAP_SHARED) || + (svd->flags & MAP_NORESERVE) || seg->s_as == &kas || + szc > segvn_maxpgszc) { + return (EINVAL); + } + + /* paranoid check */ + if (svd->vp != NULL && + (IS_SWAPFSVP(svd->vp) || svd->vp == &kvp)) { + return (EINVAL); + } + + if (seg->s_szc == 0 && svd->vp != NULL && + map_addr_vacalign_check(addr, off)) { + return (EINVAL); + } + + /* + * Check that protections are the same within new page + * size boundaries. + */ + if (svd->pageprot) { + for (a = addr; a < eaddr; a += pgsz) { + if ((a + pgsz) > eaddr) { + if (!sameprot(seg, a, eaddr - a)) { + return (EINVAL); + } + } else { + if (!sameprot(seg, a, pgsz)) { + return (EINVAL); + } + } + } + } + + /* + * Since we are changing page size we first have to flush + * the cache. This makes sure all the pagelock calls have + * to recheck protections. + */ + if (svd->softlockcnt > 0) { + /* + * Since we do have the segvn writers lock nobody can fill + * the cache with entries belonging to this seg during + * the purge. The flush either succeeds or we still have + * pending I/Os. 
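+		 * A nonzero softlockcnt after the purge means some pages
+		 * of this segment are still SOFTLOCKed for I/O, so the
+		 * page size change fails with EAGAIN and the caller may
+		 * retry later.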
+ */ + segvn_purge(seg); + if (svd->softlockcnt > 0) { + return (EAGAIN); + } + } + + /* + * Operation for sub range of existing segment. + */ + if (addr != seg->s_base || eaddr != (seg->s_base + seg->s_size)) { + if (szc < seg->s_szc) { + VM_STAT_ADD(segvnvmstats.demoterange[2]); + err = segvn_demote_range(seg, addr, len, SDR_RANGE); + if (err == 0) { + return (IE_RETRY); + } + if (err == ENOMEM) { + return (IE_NOMEM); + } + return (err); + } + if (addr != seg->s_base) { + nseg = segvn_split_seg(seg, addr); + if (eaddr != (nseg->s_base + nseg->s_size)) { + /* eaddr is szc aligned */ + (void) segvn_split_seg(nseg, eaddr); + } + return (IE_RETRY); + } + if (eaddr != (seg->s_base + seg->s_size)) { + /* eaddr is szc aligned */ + (void) segvn_split_seg(seg, eaddr); + } + return (IE_RETRY); + } + + /* + * Break any low level sharing and reset seg->s_szc to 0. + */ + if ((err = segvn_clrszc(seg)) != 0) { + if (err == ENOMEM) { + err = IE_NOMEM; + } + return (err); + } + ASSERT(seg->s_szc == 0); + + /* + * If the end of the current segment is not pgsz aligned + * then attempt to concatenate with the next segment. + */ + if (!IS_P2ALIGNED(eaddr, pgsz)) { + nseg = AS_SEGNEXT(seg->s_as, seg); + if (nseg == NULL || nseg == seg || eaddr != nseg->s_base) { + return (ENOMEM); + } + if (nseg->s_ops != &segvn_ops) { + return (EINVAL); + } + nsvd = (struct segvn_data *)nseg->s_data; + if (nsvd->softlockcnt > 0) { + segvn_purge(nseg); + if (nsvd->softlockcnt > 0) { + return (EAGAIN); + } + } + err = segvn_clrszc(nseg); + if (err == ENOMEM) { + err = IE_NOMEM; + } + if (err != 0) { + return (err); + } + err = segvn_concat(seg, nseg, 1); + if (err == -1) { + return (EINVAL); + } + if (err == -2) { + return (IE_NOMEM); + } + return (IE_RETRY); + } + + /* + * May need to re-align anon array to + * new szc. + */ + if (amp != NULL) { + pgcnt_t pgcnt = pgsz >> PAGESHIFT; + if (!IS_P2ALIGNED(svd->anon_index, pgcnt)) { + struct anon_hdr *nahp; + + ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); + ASSERT(amp->refcnt == 1); + nahp = anon_create(btop(amp->size), ANON_NOSLEEP); + if (nahp == NULL) { + ANON_LOCK_EXIT(&->a_rwlock); + return (IE_NOMEM); + } + if (anon_copy_ptr(amp->ahp, svd->anon_index, + nahp, 0, btop(seg->s_size), ANON_NOSLEEP)) { + anon_release(nahp, btop(amp->size)); + ANON_LOCK_EXIT(&->a_rwlock); + return (IE_NOMEM); + } + anon_release(amp->ahp, btop(amp->size)); + amp->ahp = nahp; + svd->anon_index = 0; + ANON_LOCK_EXIT(&->a_rwlock); + } + } + if (svd->vp != NULL && szc != 0) { + struct vattr va; + u_offset_t eoffpage = svd->offset; + va.va_mask = AT_SIZE; + eoffpage += seg->s_size; + eoffpage = btopr(eoffpage); + if (VOP_GETATTR(svd->vp, &va, 0, svd->cred) != 0) { + segvn_setpgsz_getattr_err++; + return (EINVAL); + } + if (btopr(va.va_size) < eoffpage) { + segvn_setpgsz_eof_err++; + return (EINVAL); + } + if (amp != NULL) { + /* + * anon_fill_cow_holes() may call VOP_GETPAGE(). + * don't take anon map lock here to avoid holding it + * across VOP_GETPAGE() calls that may call back into + * segvn for klsutering checks. We don't really need + * anon map lock here since it's a private segment and + * we hold as level lock as writers. 
+ */ + if ((err = anon_fill_cow_holes(seg, seg->s_base, + amp->ahp, svd->anon_index, svd->vp, svd->offset, + seg->s_size, szc, svd->prot, svd->vpage, + svd->cred)) != 0) { + return (EINVAL); + } + } + segvn_setvnode_mpss(svd->vp); + } + + if (amp != NULL) { + ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); + amp->a_szc = szc; + ANON_LOCK_EXIT(&->a_rwlock); + } + + seg->s_szc = szc; + + return (0); +} + +static int +segvn_clrszc(struct seg *seg) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + struct anon_map *amp = svd->amp; + size_t pgsz; + pgcnt_t pages; + int err = 0; + caddr_t a = seg->s_base; + caddr_t ea = a + seg->s_size; + ulong_t an_idx = svd->anon_index; + vnode_t *vp = svd->vp; + struct vpage *vpage = svd->vpage; + page_t *anon_pl[1 + 1], *pp; + struct anon *ap, *oldap; + uint_t prot = svd->prot, vpprot; + + ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) || + SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); + ASSERT(svd->type == MAP_PRIVATE || + (vp != NULL && svd->amp == NULL)); + + if (vp == NULL && amp == NULL) { + seg->s_szc = 0; + return (0); + } + + /* + * do HAT_UNLOAD_UNMAP since we are changing the pagesize. + * unload argument is 0 when we are freeing the segment + * and unload was already done. + */ + hat_unload(seg->s_as->a_hat, seg->s_base, seg->s_size, + HAT_UNLOAD_UNMAP); + + if (amp == NULL) { + seg->s_szc = 0; + return (0); + } + + pgsz = page_get_pagesize(seg->s_szc); + pages = btop(pgsz); + + /* + * XXX anon rwlock is not really needed because this is a + * private segment and we are writers. + */ + ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); + + for (; a < ea; a += pgsz, an_idx += pages) { + if ((oldap = anon_get_ptr(amp->ahp, an_idx)) != NULL) { + if (svd->pageprot != 0) { + ASSERT(vpage != NULL); + prot = VPP_PROT(vpage); + ASSERT(sameprot(seg, a, pgsz)); + } + if (seg->s_szc != 0) { + ASSERT(vp == NULL || anon_pages(amp->ahp, + an_idx, pages) == pages); + if ((err = anon_map_demotepages(amp, an_idx, + seg, a, prot, vpage, svd->cred)) != 0) { + goto out; + } + } else { + if (oldap->an_refcnt == 1) { + continue; + } + if ((err = anon_getpage(&oldap, &vpprot, + anon_pl, PAGESIZE, seg, a, S_READ, + svd->cred))) { + goto out; + } + if ((pp = anon_private(&ap, seg, a, prot, + anon_pl[0], 0, svd->cred)) == NULL) { + err = ENOMEM; + goto out; + } + anon_decref(oldap); + (void) anon_set_ptr(amp->ahp, an_idx, ap, + ANON_SLEEP); + page_unlock(pp); + } + } + vpage = (vpage == NULL) ? NULL : vpage + pages; + } + + amp->a_szc = 0; + seg->s_szc = 0; +out: + ANON_LOCK_EXIT(&->a_rwlock); + return (err); +} + +static int +segvn_claim_pages( + struct seg *seg, + struct vpage *svp, + u_offset_t off, + ulong_t anon_idx, + uint_t prot) +{ + pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc); + size_t ppasize = (pgcnt + 1) * sizeof (page_t *); + page_t **ppa; + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + struct anon_map *amp = svd->amp; + struct vpage *evp = svp + pgcnt; + caddr_t addr = ((uintptr_t)(svp - svd->vpage) << PAGESHIFT) + + seg->s_base; + struct anon *ap; + struct vnode *vp = svd->vp; + page_t *pp; + pgcnt_t pg_idx, i; + int err = 0; + anoff_t aoff; + int anon = (amp != NULL) ? 
1 : 0; + + ASSERT(svd->type == MAP_PRIVATE); + ASSERT(svd->vpage != NULL); + ASSERT(seg->s_szc != 0); + ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); + ASSERT(amp == NULL || IS_P2ALIGNED(anon_idx, pgcnt)); + ASSERT(sameprot(seg, addr, pgcnt << PAGESHIFT)); + + if (VPP_PROT(svp) == prot) + return (1); + if (!((VPP_PROT(svp) ^ prot) & PROT_WRITE)) + return (1); + + ppa = kmem_alloc(ppasize, KM_SLEEP); + if (anon && vp != NULL) { + if (anon_get_ptr(amp->ahp, anon_idx) == NULL) { + anon = 0; + ASSERT(!anon_pages(amp->ahp, anon_idx, pgcnt)); + } + ASSERT(!anon || + anon_pages(amp->ahp, anon_idx, pgcnt) == pgcnt); + } + + for (*ppa = NULL, pg_idx = 0; svp < evp; svp++, anon_idx++) { + if (!VPP_ISPPLOCK(svp)) + continue; + if (anon) { + ap = anon_get_ptr(amp->ahp, anon_idx); + if (ap == NULL) { + panic("segvn_claim_pages: no anon slot"); + } + swap_xlate(ap, &vp, &aoff); + off = (u_offset_t)aoff; + } + ASSERT(vp != NULL); + if ((pp = page_lookup(vp, + (u_offset_t)off, SE_SHARED)) == NULL) { + panic("segvn_claim_pages: no page"); + } + ppa[pg_idx++] = pp; + off += PAGESIZE; + } + + if (ppa[0] == NULL) { + kmem_free(ppa, ppasize); + return (1); + } + + ASSERT(pg_idx <= pgcnt); + ppa[pg_idx] = NULL; + + if (prot & PROT_WRITE) + err = page_addclaim_pages(ppa); + else + err = page_subclaim_pages(ppa); + + for (i = 0; i < pg_idx; i++) { + ASSERT(ppa[i] != NULL); + page_unlock(ppa[i]); + } + + kmem_free(ppa, ppasize); + return (err); +} + +/* + * Returns right (upper address) segment if split occured. + * If the address is equal to the beginning or end of its segment it returns + * the current segment. + */ +static struct seg * +segvn_split_seg(struct seg *seg, caddr_t addr) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + struct seg *nseg; + size_t nsize; + struct segvn_data *nsvd; + + ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); + ASSERT(svd->type == MAP_PRIVATE || svd->amp == NULL); + ASSERT(addr >= seg->s_base); + ASSERT(addr <= seg->s_base + seg->s_size); + + if (addr == seg->s_base || addr == seg->s_base + seg->s_size) + return (seg); + + nsize = seg->s_base + seg->s_size - addr; + seg->s_size = addr - seg->s_base; + nseg = seg_alloc(seg->s_as, addr, nsize); + ASSERT(nseg != NULL); + nseg->s_ops = seg->s_ops; + nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); + nseg->s_data = (void *)nsvd; + nseg->s_szc = seg->s_szc; + *nsvd = *svd; + rw_init(&nsvd->lock, NULL, RW_DEFAULT, NULL); + + if (nsvd->vp != NULL) { + VN_HOLD(nsvd->vp); + nsvd->offset = svd->offset + + (uintptr_t)(nseg->s_base - seg->s_base); + if (nsvd->type == MAP_SHARED) + lgrp_shm_policy_init(NULL, nsvd->vp); + } else { + /* + * The offset for an anonymous segment has no signifigance in + * terms of an offset into a file. If we were to use the above + * calculation instead, the structures read out of + * /proc/<pid>/xmap would be more difficult to decipher since + * it would be unclear whether two seemingly contiguous + * prxmap_t structures represented different segments or a + * single segment that had been split up into multiple prxmap_t + * structures (e.g. if some part of the segment had not yet + * been faulted in). 
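+		 * With the offset reset to 0, each anonymous segment
+		 * instead appears as an independent, zero-based mapping
+		 * in those structures.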
+ */ + nsvd->offset = 0; + } + + ASSERT(svd->softlockcnt == 0); + crhold(svd->cred); + + if (svd->vpage != NULL) { + size_t bytes = vpgtob(seg_pages(seg)); + size_t nbytes = vpgtob(seg_pages(nseg)); + struct vpage *ovpage = svd->vpage; + + svd->vpage = kmem_alloc(bytes, KM_SLEEP); + bcopy(ovpage, svd->vpage, bytes); + nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP); + bcopy(ovpage + seg_pages(seg), nsvd->vpage, nbytes); + kmem_free(ovpage, bytes + nbytes); + } + if (svd->amp != NULL) { + struct anon_map *oamp = svd->amp, *namp; + struct anon_hdr *nahp; + + ANON_LOCK_ENTER(&oamp->a_rwlock, RW_WRITER); + ASSERT(oamp->refcnt == 1); + nahp = anon_create(btop(seg->s_size), ANON_SLEEP); + (void) anon_copy_ptr(oamp->ahp, svd->anon_index, + nahp, 0, btop(seg->s_size), ANON_SLEEP); + + namp = anonmap_alloc(nseg->s_size, 0); + namp->a_szc = nseg->s_szc; + (void) anon_copy_ptr(oamp->ahp, + svd->anon_index + btop(seg->s_size), + namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP); + anon_release(oamp->ahp, btop(oamp->size)); + oamp->ahp = nahp; + oamp->size = seg->s_size; + svd->anon_index = 0; + nsvd->amp = namp; + nsvd->anon_index = 0; + ANON_LOCK_EXIT(&oamp->a_rwlock); + } + + /* + * Split amount of swap reserve + */ + if (svd->swresv) { + /* + * For MAP_NORESERVE, only allocate swap reserve for pages + * being used. Other segments get enough to cover whole + * segment. + */ + if (svd->flags & MAP_NORESERVE) { + size_t oswresv; + + ASSERT(svd->amp); + oswresv = svd->swresv; + svd->swresv = ptob(anon_pages(svd->amp->ahp, + svd->anon_index, btop(seg->s_size))); + nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp, + nsvd->anon_index, btop(nseg->s_size))); + ASSERT(oswresv >= (svd->swresv + nsvd->swresv)); + } else { + ASSERT(svd->swresv == seg->s_size + nseg->s_size); + svd->swresv = seg->s_size; + nsvd->swresv = nseg->s_size; + } + } + + return (nseg); +} + + +/* + * called on memory operations (unmap, setprot, setpagesize) for a subset + * of a large page segment to either demote the memory range (SDR_RANGE) + * or the ends (SDR_END) by addr/len. + * + * returns 0 on success. returns errno, including ENOMEM, on failure. 
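+ * For illustration, assume 4M large pages on a segment mapping [0, 16M)
+ * and a request with addr == 6M and len == 8M (so eaddr == 14M): lpgaddr
+ * is 4M and lpgeaddr is 16M. SDR_END demotes only the large pages that
+ * contain the unaligned ends, [4M, 8M) and [12M, 16M), leaving [8M, 12M)
+ * at the large page size, while SDR_RANGE demotes every large page in
+ * [lpgaddr, lpgeaddr).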
+ */ +static int +segvn_demote_range(struct seg *seg, caddr_t addr, size_t len, int flag) +{ + caddr_t eaddr = addr + len; + caddr_t lpgaddr, lpgeaddr; + struct seg *nseg; + struct seg *badseg1 = NULL; + struct seg *badseg2 = NULL; + size_t pgsz; + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + int err; + + ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); + ASSERT(seg->s_szc != 0); + pgsz = page_get_pagesize(seg->s_szc); + ASSERT(seg->s_base != addr || seg->s_size != len); + ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size); + ASSERT(svd->softlockcnt == 0); + ASSERT(svd->type == MAP_PRIVATE || + (svd->vp != NULL && svd->amp == NULL)); + + CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); + ASSERT(flag == SDR_RANGE || eaddr < lpgeaddr || addr > lpgaddr); + if (flag == SDR_RANGE) { + /* demote entire range */ + badseg1 = nseg = segvn_split_seg(seg, lpgaddr); + (void) segvn_split_seg(nseg, lpgeaddr); + ASSERT(badseg1->s_base == lpgaddr); + ASSERT(badseg1->s_size == lpgeaddr - lpgaddr); + } else if (addr != lpgaddr) { + ASSERT(flag == SDR_END); + badseg1 = nseg = segvn_split_seg(seg, lpgaddr); + if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz && + eaddr < lpgaddr + 2 * pgsz) { + (void) segvn_split_seg(nseg, lpgeaddr); + ASSERT(badseg1->s_base == lpgaddr); + ASSERT(badseg1->s_size == 2 * pgsz); + } else { + nseg = segvn_split_seg(nseg, lpgaddr + pgsz); + ASSERT(badseg1->s_base == lpgaddr); + ASSERT(badseg1->s_size == pgsz); + if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz) { + ASSERT(lpgeaddr - lpgaddr > 2 * pgsz); + nseg = segvn_split_seg(nseg, lpgeaddr - pgsz); + badseg2 = nseg; + (void) segvn_split_seg(nseg, lpgeaddr); + ASSERT(badseg2->s_base == lpgeaddr - pgsz); + ASSERT(badseg2->s_size == pgsz); + } + } + } else { + ASSERT(flag == SDR_END); + ASSERT(eaddr < lpgeaddr); + badseg1 = nseg = segvn_split_seg(seg, lpgeaddr - pgsz); + (void) segvn_split_seg(nseg, lpgeaddr); + ASSERT(badseg1->s_base == lpgeaddr - pgsz); + ASSERT(badseg1->s_size == pgsz); + } + + ASSERT(badseg1 != NULL); + ASSERT(badseg1->s_szc != 0); + ASSERT(page_get_pagesize(badseg1->s_szc) == pgsz); + ASSERT(flag == SDR_RANGE || badseg1->s_size == pgsz || + badseg1->s_size == 2 * pgsz); + if (err = segvn_clrszc(badseg1)) { + return (err); + } + ASSERT(badseg1->s_szc == 0); + + if (badseg2 == NULL) + return (0); + ASSERT(badseg2->s_szc != 0); + ASSERT(page_get_pagesize(badseg2->s_szc) == pgsz); + ASSERT(badseg2->s_size == pgsz); + ASSERT(sameprot(badseg2, badseg2->s_base, badseg2->s_size)); + if (err = segvn_clrszc(badseg2)) { + return (err); + } + ASSERT(badseg2->s_szc == 0); + return (0); +} + +static int +segvn_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + struct vpage *vp, *evp; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); + /* + * If segment protection can be used, simply check against them. + */ + if (svd->pageprot == 0) { + int err; + + err = ((svd->prot & prot) != prot) ? EACCES : 0; + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (err); + } + + /* + * Have to check down to the vpage level. 
+ */ + evp = &svd->vpage[seg_page(seg, addr + len)]; + for (vp = &svd->vpage[seg_page(seg, addr)]; vp < evp; vp++) { + if ((VPP_PROT(vp) & prot) != prot) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (EACCES); + } + } + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (0); +} + +static int +segvn_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + if (pgno != 0) { + SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); + if (svd->pageprot == 0) { + do + protv[--pgno] = svd->prot; + while (pgno != 0); + } else { + size_t pgoff = seg_page(seg, addr); + + do { + pgno--; + protv[pgno] = VPP_PROT(&svd->vpage[pgno+pgoff]); + } while (pgno != 0); + } + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + } + return (0); +} + +static u_offset_t +segvn_getoffset(struct seg *seg, caddr_t addr) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + return (svd->offset + (uintptr_t)(addr - seg->s_base)); +} + +/*ARGSUSED*/ +static int +segvn_gettype(struct seg *seg, caddr_t addr) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + return (svd->type | (svd->flags & MAP_NORESERVE)); +} + +/*ARGSUSED*/ +static int +segvn_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + *vpp = svd->vp; + return (0); +} + +/* + * Check to see if it makes sense to do kluster/read ahead to + * addr + delta relative to the mapping at addr. We assume here + * that delta is a signed PAGESIZE'd multiple (which can be negative). + * + * For segvn, we currently "approve" of the action if we are + * still in the segment and it maps from the same vp/off, + * or if the advice stored in segvn_data or vpages allows it. + * Currently, klustering is not allowed only if MADV_RANDOM is set. + */ +static int +segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + struct anon *oap, *ap; + ssize_t pd; + size_t page; + struct vnode *vp1, *vp2; + u_offset_t off1, off2; + struct anon_map *amp; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) || + SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); + + if (addr + delta < seg->s_base || + addr + delta >= (seg->s_base + seg->s_size)) + return (-1); /* exceeded segment bounds */ + + pd = delta / (ssize_t)PAGESIZE; /* divide to preserve sign bit */ + page = seg_page(seg, addr); + + /* + * Check to see if either of the pages addr or addr + delta + * have advice set that prevents klustering (if MADV_RANDOM advice + * is set for entire segment, or MADV_SEQUENTIAL is set and delta + * is negative). 
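+	 * For example, with MADV_SEQUENTIAL set a request for the page at
+	 * addr - PAGESIZE (delta < 0) is refused, since pages behind the
+	 * current access are presumed to be no longer needed.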
+ */ + if (svd->advice == MADV_RANDOM || + svd->advice == MADV_SEQUENTIAL && delta < 0) + return (-1); + else if (svd->pageadvice && svd->vpage) { + struct vpage *bvpp, *evpp; + + bvpp = &svd->vpage[page]; + evpp = &svd->vpage[page + pd]; + if (VPP_ADVICE(bvpp) == MADV_RANDOM || + VPP_ADVICE(evpp) == MADV_SEQUENTIAL && delta < 0) + return (-1); + if (VPP_ADVICE(bvpp) != VPP_ADVICE(evpp) && + VPP_ADVICE(evpp) == MADV_RANDOM) + return (-1); + } + + if (svd->type == MAP_SHARED) + return (0); /* shared mapping - all ok */ + + if ((amp = svd->amp) == NULL) + return (0); /* off original vnode */ + + page += svd->anon_index; + + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + + oap = anon_get_ptr(amp->ahp, page); + ap = anon_get_ptr(amp->ahp, page + pd); + + ANON_LOCK_EXIT(&->a_rwlock); + + if ((oap == NULL && ap != NULL) || (oap != NULL && ap == NULL)) { + return (-1); /* one with and one without an anon */ + } + + if (oap == NULL) { /* implies that ap == NULL */ + return (0); /* off original vnode */ + } + + /* + * Now we know we have two anon pointers - check to + * see if they happen to be properly allocated. + */ + + /* + * XXX We cheat here and don't lock the anon slots. We can't because + * we may have been called from the anon layer which might already + * have locked them. We are holding a refcnt on the slots so they + * can't disappear. The worst that will happen is we'll get the wrong + * names (vp, off) for the slots and make a poor klustering decision. + */ + swap_xlate(ap, &vp1, &off1); + swap_xlate(oap, &vp2, &off2); + + + if (!VOP_CMP(vp1, vp2) || off1 - off2 != delta) + return (-1); + return (0); +} + +/* + * Swap the pages of seg out to secondary storage, returning the + * number of bytes of storage freed. + * + * The basic idea is first to unload all translations and then to call + * VOP_PUTPAGE() for all newly-unmapped pages, to push them out to the + * swap device. Pages to which other segments have mappings will remain + * mapped and won't be swapped. Our caller (as_swapout) has already + * performed the unloading step. + * + * The value returned is intended to correlate well with the process's + * memory requirements. However, there are some caveats: + * 1) When given a shared segment as argument, this routine will + * only succeed in swapping out pages for the last sharer of the + * segment. (Previous callers will only have decremented mapping + * reference counts.) + * 2) We assume that the hat layer maintains a large enough translation + * cache to capture process reference patterns. + */ +static size_t +segvn_swapout(struct seg *seg) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + struct anon_map *amp; + pgcnt_t pgcnt = 0; + pgcnt_t npages; + pgcnt_t page; + ulong_t anon_index; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); + /* + * Find pages unmapped by our caller and force them + * out to the virtual swap device. + */ + if ((amp = svd->amp) != NULL) + anon_index = svd->anon_index; + npages = seg->s_size >> PAGESHIFT; + for (page = 0; page < npages; page++) { + page_t *pp; + struct anon *ap; + struct vnode *vp; + u_offset_t off; + anon_sync_obj_t cookie; + + /* + * Obtain <vp, off> pair for the page, then look it up. + * + * Note that this code is willing to consider regular + * pages as well as anon pages. Is this appropriate here? 
+ */ + ap = NULL; + if (amp != NULL) { + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + anon_array_enter(amp, anon_index + page, &cookie); + ap = anon_get_ptr(amp->ahp, anon_index + page); + if (ap != NULL) { + swap_xlate(ap, &vp, &off); + } else { + vp = svd->vp; + off = svd->offset + ptob(page); + } + anon_array_exit(&cookie); + ANON_LOCK_EXIT(&->a_rwlock); + } else { + vp = svd->vp; + off = svd->offset + ptob(page); + } + if (vp == NULL) { /* untouched zfod page */ + ASSERT(ap == NULL); + continue; + } + + pp = page_lookup_nowait(vp, off, SE_SHARED); + if (pp == NULL) + continue; + + + /* + * Examine the page to see whether it can be tossed out, + * keeping track of how many we've found. + */ + if (!page_tryupgrade(pp)) { + /* + * If the page has an i/o lock and no mappings, + * it's very likely that the page is being + * written out as a result of klustering. + * Assume this is so and take credit for it here. + */ + if (!page_io_trylock(pp)) { + if (!hat_page_is_mapped(pp)) + pgcnt++; + } else { + page_io_unlock(pp); + } + page_unlock(pp); + continue; + } + ASSERT(!page_iolock_assert(pp)); + + + /* + * Skip if page is locked or has mappings. + * We don't need the page_struct_lock to look at lckcnt + * and cowcnt because the page is exclusive locked. + */ + if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0 || + hat_page_is_mapped(pp)) { + page_unlock(pp); + continue; + } + + /* + * dispose skips large pages so try to demote first. + */ + if (pp->p_szc != 0 && !page_try_demote_pages(pp)) { + page_unlock(pp); + /* + * XXX should skip the remaining page_t's of this + * large page. + */ + continue; + } + + ASSERT(pp->p_szc == 0); + + /* + * No longer mapped -- we can toss it out. How + * we do so depends on whether or not it's dirty. + */ + if (hat_ismod(pp) && pp->p_vnode) { + /* + * We must clean the page before it can be + * freed. Setting B_FREE will cause pvn_done + * to free the page when the i/o completes. + * XXX: This also causes it to be accounted + * as a pageout instead of a swap: need + * B_SWAPOUT bit to use instead of B_FREE. + * + * Hold the vnode before releasing the page lock + * to prevent it from being freed and re-used by + * some other thread. + */ + VN_HOLD(vp); + page_unlock(pp); + + /* + * Queue all i/o requests for the pageout thread + * to avoid saturating the pageout devices. + */ + if (!queue_io_request(vp, off)) + VN_RELE(vp); + } else { + /* + * The page was clean, free it. + * + * XXX: Can we ever encounter modified pages + * with no associated vnode here? + */ + ASSERT(pp->p_vnode != NULL); + /*LINTED: constant in conditional context*/ + VN_DISPOSE(pp, B_FREE, 0, kcred); + } + + /* + * Credit now even if i/o is in progress. + */ + pgcnt++; + } + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + + /* + * Wakeup pageout to initiate i/o on all queued requests. + */ + cv_signal_pageout(); + return (ptob(pgcnt)); +} + +/* + * Synchronize primary storage cache with real object in virtual memory. + * + * XXX - Anonymous pages should not be sync'ed out at all. 
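+ * The caller's MS_* flags map onto pageout flags below: MS_ASYNC becomes
+ * B_ASYNC and MS_INVALIDATE becomes B_INVAL, while MS_SYNC adds neither,
+ * so the VOP_PUTPAGE() calls then run synchronously.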
+ */ +static int +segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + struct vpage *vpp; + page_t *pp; + u_offset_t offset; + struct vnode *vp; + u_offset_t off; + caddr_t eaddr; + int bflags; + int err = 0; + int segtype; + int pageprot; + int prot; + ulong_t anon_index; + struct anon_map *amp; + struct anon *ap; + anon_sync_obj_t cookie; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); + + if (svd->softlockcnt > 0) { + /* + * flush all pages from seg cache + * otherwise we may deadlock in swap_putpage + * for B_INVAL page (4175402). + * + * Even if we grab segvn WRITER's lock or segp_slock + * here, there might be another thread which could've + * successfully performed lookup/insert just before + * we acquired the lock here. So, grabbing either + * lock here is of not much use. Until we devise + * a strategy at upper layers to solve the + * synchronization issues completely, we expect + * applications to handle this appropriately. + */ + segvn_purge(seg); + if (svd->softlockcnt > 0) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (EAGAIN); + } + } + + vpp = svd->vpage; + offset = svd->offset + (uintptr_t)(addr - seg->s_base); + bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) | + ((flags & MS_INVALIDATE) ? B_INVAL : 0); + + if (attr) { + pageprot = attr & ~(SHARED|PRIVATE); + segtype = (attr & SHARED) ? MAP_SHARED : MAP_PRIVATE; + + /* + * We are done if the segment types don't match + * or if we have segment level protections and + * they don't match. + */ + if (svd->type != segtype) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (0); + } + if (vpp == NULL) { + if (svd->prot != pageprot) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (0); + } + prot = svd->prot; + } else + vpp = &svd->vpage[seg_page(seg, addr)]; + + } else if (svd->vp && svd->amp == NULL && + (flags & MS_INVALIDATE) == 0) { + + /* + * No attributes, no anonymous pages and MS_INVALIDATE flag + * is not on, just use one big request. + */ + err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len, + bflags, svd->cred); + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (err); + } + + if ((amp = svd->amp) != NULL) + anon_index = svd->anon_index + seg_page(seg, addr); + + for (eaddr = addr + len; addr < eaddr; addr += PAGESIZE) { + ap = NULL; + if (amp != NULL) { + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + anon_array_enter(amp, anon_index, &cookie); + ap = anon_get_ptr(amp->ahp, anon_index++); + if (ap != NULL) { + swap_xlate(ap, &vp, &off); + } else { + vp = svd->vp; + off = offset; + } + anon_array_exit(&cookie); + ANON_LOCK_EXIT(&->a_rwlock); + } else { + vp = svd->vp; + off = offset; + } + offset += PAGESIZE; + + if (vp == NULL) /* untouched zfod page */ + continue; + + if (attr) { + if (vpp) { + prot = VPP_PROT(vpp); + vpp++; + } + if (prot != pageprot) { + continue; + } + } + + /* + * See if any of these pages are locked -- if so, then we + * will have to truncate an invalidate request at the first + * locked one. We don't need the page_struct_lock to test + * as this is only advisory; even if we acquire it someone + * might race in and lock the page after we unlock and before + * we do the PUTPAGE, then PUTPAGE simply does nothing. 
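+		 * In practice the check below does not truncate the
+		 * request: an MS_INVALIDATE sync that finds a locked page
+		 * fails outright with EBUSY.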
+ */ + if (flags & MS_INVALIDATE) { + if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) { + if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { + page_unlock(pp); + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (EBUSY); + } + if (ap != NULL && pp->p_szc != 0 && + page_tryupgrade(pp)) { + if (pp->p_lckcnt == 0 && + pp->p_cowcnt == 0) { + /* + * swapfs VN_DISPOSE() won't + * invalidate large pages. + * Attempt to demote. + * XXX can't help it if it + * fails. But for swapfs + * pages it is no big deal. + */ + (void) page_try_demote_pages( + pp); + } + } + page_unlock(pp); + } + } else if (svd->type == MAP_SHARED && amp != NULL) { + /* + * Avoid writting out to disk ISM's large pages + * because segspt_free_pages() relies on NULL an_pvp + * of anon slots of such pages. + */ + + ASSERT(svd->vp == NULL); + /* + * swapfs uses page_lookup_nowait if not freeing or + * invalidating and skips a page if + * page_lookup_nowait returns NULL. + */ + pp = page_lookup_nowait(vp, off, SE_SHARED); + if (pp == NULL) { + continue; + } + if (pp->p_szc != 0) { + page_unlock(pp); + continue; + } + + /* + * Note ISM pages are created large so (vp, off)'s + * page cannot suddenly become large after we unlock + * pp. + */ + page_unlock(pp); + } + /* + * XXX - Should ultimately try to kluster + * calls to VOP_PUTPAGE() for performance. + */ + VN_HOLD(vp); + err = VOP_PUTPAGE(vp, (offset_t)off, PAGESIZE, + bflags, svd->cred); + VN_RELE(vp); + if (err) + break; + } + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (err); +} + +/* + * Determine if we have data corresponding to pages in the + * primary storage virtual memory cache (i.e., "in core"). + */ +static size_t +segvn_incore(struct seg *seg, caddr_t addr, size_t len, char *vec) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + struct vnode *vp, *avp; + u_offset_t offset, aoffset; + size_t p, ep; + int ret; + struct vpage *vpp; + page_t *pp; + uint_t start; + struct anon_map *amp; /* XXX - for locknest */ + struct anon *ap; + uint_t attr; + anon_sync_obj_t cookie; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); + if (svd->amp == NULL && svd->vp == NULL) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + bzero(vec, btopr(len)); + return (len); /* no anonymous pages created yet */ + } + + p = seg_page(seg, addr); + ep = seg_page(seg, addr + len); + start = svd->vp ? SEG_PAGE_VNODEBACKED : 0; + + amp = svd->amp; + for (; p < ep; p++, addr += PAGESIZE) { + vpp = (svd->vpage) ? &svd->vpage[p]: NULL; + ret = start; + ap = NULL; + avp = NULL; + /* Grab the vnode/offset for the anon slot */ + if (amp != NULL) { + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + anon_array_enter(amp, svd->anon_index + p, &cookie); + ap = anon_get_ptr(amp->ahp, svd->anon_index + p); + if (ap != NULL) { + swap_xlate(ap, &avp, &aoffset); + } + anon_array_exit(&cookie); + ANON_LOCK_EXIT(&->a_rwlock); + } + if ((avp != NULL) && page_exists(avp, aoffset)) { + /* A page exists for the anon slot */ + ret |= SEG_PAGE_INCORE; + + /* + * If page is mapped and writable + */ + attr = (uint_t)0; + if ((hat_getattr(seg->s_as->a_hat, addr, + &attr) != -1) && (attr & PROT_WRITE)) { + ret |= SEG_PAGE_ANON; + } + /* + * Don't get page_struct lock for lckcnt and cowcnt, + * since this is purely advisory. 
+ */ + if ((pp = page_lookup_nowait(avp, aoffset, + SE_SHARED)) != NULL) { + if (pp->p_lckcnt) + ret |= SEG_PAGE_SOFTLOCK; + if (pp->p_cowcnt) + ret |= SEG_PAGE_HASCOW; + page_unlock(pp); + } + } + + /* Gather vnode statistics */ + vp = svd->vp; + offset = svd->offset + (uintptr_t)(addr - seg->s_base); + + if (vp != NULL) { + /* + * Try to obtain a "shared" lock on the page + * without blocking. If this fails, determine + * if the page is in memory. + */ + pp = page_lookup_nowait(vp, offset, SE_SHARED); + if ((pp == NULL) && (page_exists(vp, offset))) { + /* Page is incore, and is named */ + ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE); + } + /* + * Don't get page_struct lock for lckcnt and cowcnt, + * since this is purely advisory. + */ + if (pp != NULL) { + ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE); + if (pp->p_lckcnt) + ret |= SEG_PAGE_SOFTLOCK; + if (pp->p_cowcnt) + ret |= SEG_PAGE_HASCOW; + page_unlock(pp); + } + } + + /* Gather virtual page information */ + if (vpp) { + if (VPP_ISPPLOCK(vpp)) + ret |= SEG_PAGE_LOCKED; + vpp++; + } + + *vec++ = (char)ret; + } + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (len); +} + +/* + * Statement for p_cowcnts/p_lckcnts. + * + * p_cowcnt is updated while mlock/munlocking MAP_PRIVATE and PROT_WRITE region + * irrespective of the following factors or anything else: + * + * (1) anon slots are populated or not + * (2) cow is broken or not + * (3) refcnt on ap is 1 or greater than 1 + * + * If it's not MAP_PRIVATE and PROT_WRITE, p_lckcnt is updated during mlock + * and munlock. + * + * + * Handling p_cowcnts/p_lckcnts during copy-on-write fault: + * + * if vpage has PROT_WRITE + * transfer cowcnt on the oldpage -> cowcnt on the newpage + * else + * transfer lckcnt on the oldpage -> lckcnt on the newpage + * + * During copy-on-write, decrement p_cowcnt on the oldpage and increment + * p_cowcnt on the newpage *if* the corresponding vpage has PROT_WRITE. + * + * We may also break COW if softlocking on read access in the physio case. + * In this case, vpage may not have PROT_WRITE. So, we need to decrement + * p_lckcnt on the oldpage and increment p_lckcnt on the newpage *if* the + * vpage doesn't have PROT_WRITE. + * + * + * Handling p_cowcnts/p_lckcnts during mprotect on mlocked region: + * + * If a MAP_PRIVATE region loses PROT_WRITE, we decrement p_cowcnt and + * increment p_lckcnt by calling page_subclaim() which takes care of + * availrmem accounting and p_lckcnt overflow. + * + * If a MAP_PRIVATE region gains PROT_WRITE, we decrement p_lckcnt and + * increment p_cowcnt by calling page_addclaim() which takes care of + * availrmem availability and p_cowcnt overflow. + */ + +/* + * Lock down (or unlock) pages mapped by this segment. + * + * XXX only creates PAGESIZE pages if anon slots are not initialized. + * At fault time they will be relocated into larger pages. 
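+ * For example, mlock(3C) of a page in a MAP_PRIVATE, PROT_WRITE mapping
+ * bumps p_cowcnt on whichever page currently backs the address (file
+ * page or private anon copy), while locking a shared or read-only
+ * mapping bumps p_lckcnt instead.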
+ */ +static int +segvn_lockop(struct seg *seg, caddr_t addr, size_t len, + int attr, int op, ulong_t *lockmap, size_t pos) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + struct vpage *vpp; + struct vpage *evp; + page_t *pp; + u_offset_t offset; + u_offset_t off; + int segtype; + int pageprot; + int claim; + struct vnode *vp; + ulong_t anon_index; + struct anon_map *amp; + struct anon *ap; + struct vattr va; + anon_sync_obj_t cookie; + + /* + * Hold write lock on address space because may split or concatenate + * segments + */ + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); + if (attr) { + pageprot = attr & ~(SHARED|PRIVATE); + segtype = attr & SHARED ? MAP_SHARED : MAP_PRIVATE; + + /* + * We are done if the segment types don't match + * or if we have segment level protections and + * they don't match. + */ + if (svd->type != segtype) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (0); + } + if (svd->pageprot == 0 && svd->prot != pageprot) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (0); + } + } + + /* + * If we're locking, then we must create a vpage structure if + * none exists. If we're unlocking, then check to see if there + * is a vpage -- if not, then we could not have locked anything. + */ + + if ((vpp = svd->vpage) == NULL) { + if (op == MC_LOCK) + segvn_vpage(seg); + else { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (0); + } + } + + /* + * The anonymous data vector (i.e., previously + * unreferenced mapping to swap space) can be allocated + * by lazily testing for its existence. + */ + if (op == MC_LOCK && svd->amp == NULL && svd->vp == NULL) { + svd->amp = anonmap_alloc(seg->s_size, 0); + svd->amp->a_szc = seg->s_szc; + } + + if ((amp = svd->amp) != NULL) { + anon_index = svd->anon_index + seg_page(seg, addr); + } + + offset = svd->offset + (uintptr_t)(addr - seg->s_base); + evp = &svd->vpage[seg_page(seg, addr + len)]; + + /* + * Loop over all pages in the range. Process if we're locking and + * page has not already been locked in this mapping; or if we're + * unlocking and the page has been locked. + */ + for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp; + vpp++, pos++, addr += PAGESIZE, offset += PAGESIZE, anon_index++) { + if ((attr == 0 || VPP_PROT(vpp) == pageprot) && + ((op == MC_LOCK && !VPP_ISPPLOCK(vpp)) || + (op == MC_UNLOCK && VPP_ISPPLOCK(vpp)))) { + + if (amp != NULL) + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + /* + * If this isn't a MAP_NORESERVE segment and + * we're locking, allocate anon slots if they + * don't exist. The page is brought in later on. + */ + if (op == MC_LOCK && svd->vp == NULL && + ((svd->flags & MAP_NORESERVE) == 0) && + amp != NULL && + ((ap = anon_get_ptr(amp->ahp, anon_index)) + == NULL)) { + anon_array_enter(amp, anon_index, &cookie); + + if ((ap = anon_get_ptr(amp->ahp, + anon_index)) == NULL) { + pp = anon_zero(seg, addr, &ap, + svd->cred); + if (pp == NULL) { + anon_array_exit(&cookie); + ANON_LOCK_EXIT(&->a_rwlock); + SEGVN_LOCK_EXIT(seg->s_as, + &svd->lock); + return (ENOMEM); + } + ASSERT(anon_get_ptr(amp->ahp, + anon_index) == NULL); + (void) anon_set_ptr(amp->ahp, + anon_index, ap, ANON_SLEEP); + page_unlock(pp); + } + anon_array_exit(&cookie); + } + + /* + * Get name for page, accounting for + * existence of private copy. 
+ */ + ap = NULL; + if (amp != NULL) { + anon_array_enter(amp, anon_index, &cookie); + ap = anon_get_ptr(amp->ahp, anon_index); + if (ap != NULL) { + swap_xlate(ap, &vp, &off); + } else { + if (svd->vp == NULL && + (svd->flags & MAP_NORESERVE)) { + anon_array_exit(&cookie); + ANON_LOCK_EXIT(&->a_rwlock); + continue; + } + vp = svd->vp; + off = offset; + } + anon_array_exit(&cookie); + ANON_LOCK_EXIT(&->a_rwlock); + } else { + vp = svd->vp; + off = offset; + } + + /* + * Get page frame. It's ok if the page is + * not available when we're unlocking, as this + * may simply mean that a page we locked got + * truncated out of existence after we locked it. + * + * Invoke VOP_GETPAGE() to obtain the page struct + * since we may need to read it from disk if its + * been paged out. + */ + if (op != MC_LOCK) + pp = page_lookup(vp, off, SE_SHARED); + else { + page_t *pl[1 + 1]; + int error; + + ASSERT(vp != NULL); + + error = VOP_GETPAGE(vp, (offset_t)off, PAGESIZE, + (uint_t *)NULL, pl, PAGESIZE, seg, addr, + S_OTHER, svd->cred); + + /* + * If the error is EDEADLK then we must bounce + * up and drop all vm subsystem locks and then + * retry the operation later + * This behavior is a temporary measure because + * ufs/sds logging is badly designed and will + * deadlock if we don't allow this bounce to + * happen. The real solution is to re-design + * the logging code to work properly. See bug + * 4125102 for details of the problem. + */ + if (error == EDEADLK) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (error); + } + /* + * Quit if we fail to fault in the page. Treat + * the failure as an error, unless the addr + * is mapped beyond the end of a file. + */ + if (error && svd->vp) { + va.va_mask = AT_SIZE; + if (VOP_GETATTR(svd->vp, &va, 0, + svd->cred) != 0) { + SEGVN_LOCK_EXIT(seg->s_as, + &svd->lock); + return (EIO); + } + if (btopr(va.va_size) >= + btopr(off + 1)) { + SEGVN_LOCK_EXIT(seg->s_as, + &svd->lock); + return (EIO); + } + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (0); + } else if (error) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (EIO); + } + pp = pl[0]; + ASSERT(pp != NULL); + } + + /* + * See Statement at the beginning of this routine. + * + * claim is always set if MAP_PRIVATE and PROT_WRITE + * irrespective of following factors: + * + * (1) anon slots are populated or not + * (2) cow is broken or not + * (3) refcnt on ap is 1 or greater than 1 + * + * See 4140683 for details + */ + claim = ((VPP_PROT(vpp) & PROT_WRITE) && + (svd->type == MAP_PRIVATE)); + + /* + * Perform page-level operation appropriate to + * operation. If locking, undo the SOFTLOCK + * performed to bring the page into memory + * after setting the lock. If unlocking, + * and no page was found, account for the claim + * separately. + */ + if (op == MC_LOCK) { + int ret = 1; /* Assume success */ + + /* + * Make sure another thread didn't lock + * the page after we released the segment + * lock. 
+ */ + if ((attr == 0 || VPP_PROT(vpp) == pageprot) && + !VPP_ISPPLOCK(vpp)) { + ret = page_pp_lock(pp, claim, 0); + if (ret != 0) { + VPP_SETPPLOCK(vpp); + if (lockmap != (ulong_t *)NULL) + BT_SET(lockmap, pos); + } + } + page_unlock(pp); + if (ret == 0) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (EAGAIN); + } + } else { + if (pp != NULL) { + if ((attr == 0 || + VPP_PROT(vpp) == pageprot) && + VPP_ISPPLOCK(vpp)) + page_pp_unlock(pp, claim, 0); + page_unlock(pp); + } + VPP_CLRPPLOCK(vpp); + } + } + } + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (0); +} + +/* + * Set advice from user for specified pages + * There are 5 types of advice: + * MADV_NORMAL - Normal (default) behavior (whatever that is) + * MADV_RANDOM - Random page references + * do not allow readahead or 'klustering' + * MADV_SEQUENTIAL - Sequential page references + * Pages previous to the one currently being + * accessed (determined by fault) are 'not needed' + * and are freed immediately + * MADV_WILLNEED - Pages are likely to be used (fault ahead in mctl) + * MADV_DONTNEED - Pages are not needed (synced out in mctl) + * MADV_FREE - Contents can be discarded + * MADV_ACCESS_DEFAULT- Default access + * MADV_ACCESS_LWP - Next LWP will access heavily + * MADV_ACCESS_MANY- Many LWPs or processes will access heavily + */ +static int +segvn_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + size_t page; + int err = 0; + int already_set; + struct anon_map *amp; + ulong_t anon_index; + struct seg *next; + lgrp_mem_policy_t policy; + struct seg *prev; + struct vnode *vp; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + /* + * In case of MADV_FREE, we won't be modifying any segment private + * data structures; so, we only need to grab READER's lock + */ + if (behav != MADV_FREE) + SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); + else + SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); + + /* + * Large pages are assumed to be only turned on when accesses to the + * segment's address range have spatial and temporal locality. That + * justifies ignoring MADV_SEQUENTIAL for large page segments. + * Also, ignore advice affecting lgroup memory allocation + * if don't need to do lgroup optimizations on this system + */ + + if ((behav == MADV_SEQUENTIAL && seg->s_szc != 0) || + (!lgrp_optimizations() && (behav == MADV_ACCESS_DEFAULT || + behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY))) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (0); + } + + if (behav == MADV_SEQUENTIAL || behav == MADV_ACCESS_DEFAULT || + behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY) { + /* + * Since we are going to unload hat mappings + * we first have to flush the cache. Otherwise + * this might lead to system panic if another + * thread is doing physio on the range whose + * mappings are unloaded by madvise(3C). + */ + if (svd->softlockcnt > 0) { + /* + * Since we do have the segvn writers lock + * nobody can fill the cache with entries + * belonging to this seg during the purge. + * The flush either succeeds or we still + * have pending I/Os. In the later case, + * madvise(3C) fails. + */ + segvn_purge(seg); + if (svd->softlockcnt > 0) { + /* + * Since madvise(3C) is advisory and + * it's not part of UNIX98, madvise(3C) + * failure here doesn't cause any hardship. + * Note that we don't block in "as" layer. 
+ */ + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (EAGAIN); + } + } + } + + amp = svd->amp; + vp = svd->vp; + if (behav == MADV_FREE) { + /* + * MADV_FREE is not supported for segments with + * underlying object; if anonmap is NULL, anon slots + * are not yet populated and there is nothing for + * us to do. As MADV_FREE is advisory, we don't + * return error in either case. + */ + if (vp || amp == NULL) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (0); + } + + page = seg_page(seg, addr); + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + anon_disclaim(amp, svd->anon_index + page, len, 0); + ANON_LOCK_EXIT(&->a_rwlock); + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (0); + } + + /* + * If advice is to be applied to entire segment, + * use advice field in seg_data structure + * otherwise use appropriate vpage entry. + */ + if ((addr == seg->s_base) && (len == seg->s_size)) { + switch (behav) { + case MADV_ACCESS_LWP: + case MADV_ACCESS_MANY: + case MADV_ACCESS_DEFAULT: + /* + * Set memory allocation policy for this segment + */ + policy = lgrp_madv_to_policy(behav, len, svd->type); + if (svd->type == MAP_SHARED) + already_set = lgrp_shm_policy_set(policy, amp, + svd->anon_index, vp, svd->offset, len); + else { + /* + * For private memory, need writers lock on + * address space because the segment may be + * split or concatenated when changing policy + */ + if (AS_READ_HELD(seg->s_as, + &seg->s_as->a_lock)) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (IE_RETRY); + } + + already_set = lgrp_privm_policy_set(policy, + &svd->policy_info, len); + } + + /* + * If policy set already and it shouldn't be reapplied, + * don't do anything. + */ + if (already_set && + !LGRP_MEM_POLICY_REAPPLICABLE(policy)) + break; + + /* + * Mark any existing pages in given range for + * migration + */ + page_mark_migrate(seg, addr, len, amp, svd->anon_index, + vp, svd->offset, 1); + + /* + * If same policy set already or this is a shared + * memory segment, don't need to try to concatenate + * segment with adjacent ones. + */ + if (already_set || svd->type == MAP_SHARED) + break; + + /* + * Try to concatenate this segment with previous + * one and next one, since we changed policy for + * this one and it may be compatible with adjacent + * ones now. 
+ */ + prev = AS_SEGPREV(seg->s_as, seg); + next = AS_SEGNEXT(seg->s_as, seg); + + if (next && next->s_ops == &segvn_ops && + addr + len == next->s_base) + (void) segvn_concat(seg, next, 1); + + if (prev && prev->s_ops == &segvn_ops && + addr == prev->s_base + prev->s_size) { + /* + * Drop lock for private data of current + * segment before concatenating (deleting) it + * and return IE_REATTACH to tell as_ctl() that + * current segment has changed + */ + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + if (!segvn_concat(prev, seg, 1)) + err = IE_REATTACH; + + return (err); + } + break; + + case MADV_SEQUENTIAL: + /* + * unloading mapping guarantees + * detection in segvn_fault + */ + ASSERT(seg->s_szc == 0); + hat_unload(seg->s_as->a_hat, addr, len, + HAT_UNLOAD); + /* FALLTHROUGH */ + case MADV_NORMAL: + case MADV_RANDOM: + svd->advice = (uchar_t)behav; + svd->pageadvice = 0; + break; + case MADV_WILLNEED: /* handled in memcntl */ + case MADV_DONTNEED: /* handled in memcntl */ + case MADV_FREE: /* handled above */ + break; + default: + err = EINVAL; + } + } else { + caddr_t eaddr; + struct seg *new_seg; + struct segvn_data *new_svd; + u_offset_t off; + caddr_t oldeaddr; + + page = seg_page(seg, addr); + + segvn_vpage(seg); + + switch (behav) { + struct vpage *bvpp, *evpp; + + case MADV_ACCESS_LWP: + case MADV_ACCESS_MANY: + case MADV_ACCESS_DEFAULT: + /* + * Set memory allocation policy for portion of this + * segment + */ + + /* + * Align address and length of advice to page + * boundaries for large pages + */ + if (seg->s_szc != 0) { + size_t pgsz; + + pgsz = page_get_pagesize(seg->s_szc); + addr = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); + len = P2ROUNDUP(len, pgsz); + } + + /* + * Check to see whether policy is set already + */ + policy = lgrp_madv_to_policy(behav, len, svd->type); + + anon_index = svd->anon_index + page; + off = svd->offset + (uintptr_t)(addr - seg->s_base); + + if (svd->type == MAP_SHARED) + already_set = lgrp_shm_policy_set(policy, amp, + anon_index, vp, off, len); + else + already_set = + (policy == svd->policy_info.mem_policy); + + /* + * If policy set already and it shouldn't be reapplied, + * don't do anything. + */ + if (already_set && + !LGRP_MEM_POLICY_REAPPLICABLE(policy)) + break; + + /* + * For private memory, need writers lock on + * address space because the segment may be + * split or concatenated when changing policy + */ + if (svd->type == MAP_PRIVATE && + AS_READ_HELD(seg->s_as, &seg->s_as->a_lock)) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (IE_RETRY); + } + + /* + * Mark any existing pages in given range for + * migration + */ + page_mark_migrate(seg, addr, len, amp, svd->anon_index, + vp, svd->offset, 1); + + /* + * Don't need to try to split or concatenate + * segments, since policy is same or this is a shared + * memory segment + */ + if (already_set || svd->type == MAP_SHARED) + break; + + /* + * Split off new segment if advice only applies to a + * portion of existing segment starting in middle + */ + new_seg = NULL; + eaddr = addr + len; + oldeaddr = seg->s_base + seg->s_size; + if (addr > seg->s_base) { + /* + * Must flush I/O page cache + * before splitting segment + */ + if (svd->softlockcnt > 0) + segvn_purge(seg); + + /* + * Split segment and return IE_REATTACH to tell + * as_ctl() that current segment changed + */ + new_seg = segvn_split_seg(seg, addr); + new_svd = (struct segvn_data *)new_seg->s_data; + err = IE_REATTACH; + + /* + * If new segment ends where old one + * did, try to concatenate the new + * segment with next one. 
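[Editorial sketch, not part of this commit] The MADV_ACCESS_* branch above widens the advised range to the segment's large page size with P2ALIGN()/P2ROUNDUP(). A stand-alone rendering of that arithmetic; the macro bodies are assumed to mirror the usual power-of-two helpers from <sys/sysmacros.h>, and the 4M large page and addresses are invented example values:

#include <stdio.h>

/* Assumed to mirror the power-of-two helpers in <sys/sysmacros.h>. */
#define P2ALIGN(x, align)       ((x) & -(align))
#define P2ROUNDUP(x, align)     (-(-(x) & -(align)))

int
main(void)
{
        unsigned long addr = 0x2345678UL;       /* hypothetical advice start */
        unsigned long len = 0x5000UL;           /* hypothetical advice length */
        unsigned long pgsz = 0x400000UL;        /* example 4M large page */

        /* 0x2345678 aligns down to 0x2000000; 0x5000 rounds up to 0x400000. */
        (void) printf("addr 0x%lx -> 0x%lx, len 0x%lx -> 0x%lx\n",
            addr, P2ALIGN(addr, pgsz), len, P2ROUNDUP(len, pgsz));
        return (0);
}

The pagelock path later in this file does the analogous widening (via CALC_LPG_REGION) when it snaps a lock request to enclosing large-page boundaries.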
+ */ + if (eaddr == oldeaddr) { + /* + * Set policy for new segment + */ + (void) lgrp_privm_policy_set(policy, + &new_svd->policy_info, + new_seg->s_size); + + next = AS_SEGNEXT(new_seg->s_as, + new_seg); + + if (next && + next->s_ops == &segvn_ops && + eaddr == next->s_base) + (void) segvn_concat(new_seg, + next, 1); + } + } + + /* + * Split off end of existing segment if advice only + * applies to a portion of segment ending before + * end of the existing segment + */ + if (eaddr < oldeaddr) { + /* + * Must flush I/O page cache + * before splitting segment + */ + if (svd->softlockcnt > 0) + segvn_purge(seg); + + /* + * If beginning of old segment was already + * split off, use new segment to split end off + * from. + */ + if (new_seg != NULL && new_seg != seg) { + /* + * Split segment + */ + (void) segvn_split_seg(new_seg, eaddr); + + /* + * Set policy for new segment + */ + (void) lgrp_privm_policy_set(policy, + &new_svd->policy_info, + new_seg->s_size); + } else { + /* + * Split segment and return IE_REATTACH + * to tell as_ctl() that current + * segment changed + */ + (void) segvn_split_seg(seg, eaddr); + err = IE_REATTACH; + + (void) lgrp_privm_policy_set(policy, + &svd->policy_info, seg->s_size); + + /* + * If new segment starts where old one + * did, try to concatenate it with + * previous segment. + */ + if (addr == seg->s_base) { + prev = AS_SEGPREV(seg->s_as, + seg); + + /* + * Drop lock for private data + * of current segment before + * concatenating (deleting) it + */ + if (prev && + prev->s_ops == + &segvn_ops && + addr == prev->s_base + + prev->s_size) { + SEGVN_LOCK_EXIT( + seg->s_as, + &svd->lock); + (void) segvn_concat( + prev, seg, 1); + return (err); + } + } + } + } + break; + case MADV_SEQUENTIAL: + ASSERT(seg->s_szc == 0); + hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); + /* FALLTHROUGH */ + case MADV_NORMAL: + case MADV_RANDOM: + bvpp = &svd->vpage[page]; + evpp = &svd->vpage[page + (len >> PAGESHIFT)]; + for (; bvpp < evpp; bvpp++) + VPP_SETADVICE(bvpp, behav); + svd->advice = MADV_NORMAL; + break; + case MADV_WILLNEED: /* handled in memcntl */ + case MADV_DONTNEED: /* handled in memcntl */ + case MADV_FREE: /* handled above */ + break; + default: + err = EINVAL; + } + } + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (err); +} + +/* + * Create a vpage structure for this seg. + */ +static void +segvn_vpage(struct seg *seg) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + struct vpage *vp, *evp; + + ASSERT(SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); + + /* + * If no vpage structure exists, allocate one. Copy the protections + * and the advice from the segment itself to the individual pages. + */ + if (svd->vpage == NULL) { + svd->pageprot = 1; + svd->pageadvice = 1; + svd->vpage = kmem_zalloc(seg_pages(seg) * sizeof (struct vpage), + KM_SLEEP); + evp = &svd->vpage[seg_page(seg, seg->s_base + seg->s_size)]; + for (vp = svd->vpage; vp < evp; vp++) { + VPP_SETPROT(vp, svd->prot); + VPP_SETADVICE(vp, svd->advice); + } + } +} + +/* + * Dump the pages belonging to this segvn segment. 
+ */ +static void +segvn_dump(struct seg *seg) +{ + struct segvn_data *svd; + page_t *pp; + struct anon_map *amp; + ulong_t anon_index; + struct vnode *vp; + u_offset_t off, offset; + pfn_t pfn; + pgcnt_t page, npages; + caddr_t addr; + + npages = seg_pages(seg); + svd = (struct segvn_data *)seg->s_data; + vp = svd->vp; + off = offset = svd->offset; + addr = seg->s_base; + + if ((amp = svd->amp) != NULL) { + anon_index = svd->anon_index; + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + } + + for (page = 0; page < npages; page++, offset += PAGESIZE) { + struct anon *ap; + int we_own_it = 0; + + if (amp && (ap = anon_get_ptr(svd->amp->ahp, anon_index++))) { + swap_xlate_nopanic(ap, &vp, &off); + } else { + vp = svd->vp; + off = offset; + } + + /* + * If pp == NULL, the page either does not exist + * or is exclusively locked. So determine if it + * exists before searching for it. + */ + + if ((pp = page_lookup_nowait(vp, off, SE_SHARED))) + we_own_it = 1; + else + pp = page_exists(vp, off); + + if (pp) { + pfn = page_pptonum(pp); + dump_addpage(seg->s_as, addr, pfn); + if (we_own_it) + page_unlock(pp); + } + addr += PAGESIZE; + dump_timeleft = dump_timeout; + } + + if (amp != NULL) + ANON_LOCK_EXIT(&->a_rwlock); +} + +/* + * lock/unlock anon pages over a given range. Return shadow list + */ +static int +segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp, + enum lock_type type, enum seg_rw rw) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + size_t np, adjustpages = 0, npages = (len >> PAGESHIFT); + ulong_t anon_index; + uint_t protchk; + uint_t error; + struct anon_map *amp; + struct page **pplist, **pl, *pp; + caddr_t a; + size_t page; + caddr_t lpgaddr, lpgeaddr; + + TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_START, + "segvn_pagelock: start seg %p addr %p", seg, addr); + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + if (seg->s_szc != 0 && (type == L_PAGELOCK || type == L_PAGEUNLOCK)) { + /* + * We are adjusting the pagelock region to the large page size + * boundary because the unlocked part of a large page cannot + * be freed anyway unless all constituent pages of a large + * page are locked. Therefore this adjustment allows us to + * decrement availrmem by the right value (note we don't want + * to just decrement availrem by the large page size without + * adjusting addr and len because then we may end up + * decrementing availrmem by large page size for every + * constituent page locked by a new as_pagelock call). + * as_pageunlock caller must always match as_pagelock call's + * addr and len. + * + * Note segment's page size cannot change while we are holding + * as lock. And then it cannot change while softlockcnt is + * not 0. This will allow us to correctly recalculate large + * page size region for the matching pageunlock/reclaim call. + * + * for pageunlock *ppp points to the pointer of page_t that + * corresponds to the real unadjusted start address. Similar + * for pagelock *ppp must point to the pointer of page_t that + * corresponds to the real unadjusted start address. + */ + size_t pgsz = page_get_pagesize(seg->s_szc); + CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); + adjustpages = ((uintptr_t)(addr - lpgaddr)) >> PAGESHIFT; + } + + if (type == L_PAGEUNLOCK) { + + /* + * update hat ref bits for /proc. We need to make sure + * that threads tracing the ref and mod bits of the + * address space get the right data. 
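[Editorial sketch, not part of this commit] The adjustpages value computed above is the page offset of the caller's real start address inside the widened large-page region: the shadow list covers the whole region, the caller is handed a pointer offset into it (*ppp = pplist + adjustpages), and the matching unlock subtracts the same offset. A toy illustration using an int array in place of the page_t * shadow list, with invented addresses and an invented 8K base page:

#include <stdio.h>

#define EX_PAGESHIFT    13                      /* example 8K base page */

int
main(void)
{
        unsigned long lpgaddr = 0x2000000UL;    /* large-page-aligned start */
        unsigned long addr = 0x2010000UL;       /* caller's unadjusted start */
        int pages[16];                          /* stands in for the shadow list */
        int *pl = pages;                        /* covers [lpgaddr, lpgeaddr) */
        unsigned long adjustpages = (addr - lpgaddr) >> EX_PAGESHIFT;

        /* What pagelock hands back: the entry for the unadjusted addr... */
        int *caller_list = pl + adjustpages;
        /* ...and what the matching unlock subtracts to recover the full list. */
        int *full_list = caller_list - adjustpages;

        (void) printf("adjustpages %lu, recovered full list: %s\n",
            adjustpages, (full_list == pl) ? "yes" : "no");
        return (0);
}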
+ * Note: page ref and mod bits are updated at reclaim time + */ + if (seg->s_as->a_vbits) { + for (a = addr; a < addr + len; a += PAGESIZE) { + if (rw == S_WRITE) { + hat_setstat(seg->s_as, a, + PAGESIZE, P_REF | P_MOD); + } else { + hat_setstat(seg->s_as, a, + PAGESIZE, P_REF); + } + } + } + SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); + if (seg->s_szc != 0) { + VM_STAT_ADD(segvnvmstats.pagelock[0]); + seg_pinactive(seg, lpgaddr, lpgeaddr - lpgaddr, + *ppp - adjustpages, rw, segvn_reclaim); + } else { + seg_pinactive(seg, addr, len, *ppp, rw, segvn_reclaim); + } + + /* + * If someone is blocked while unmapping, we purge + * segment page cache and thus reclaim pplist synchronously + * without waiting for seg_pasync_thread. This speeds up + * unmapping in cases where munmap(2) is called, while + * raw async i/o is still in progress or where a thread + * exits on data fault in a multithreaded application. + */ + if (AS_ISUNMAPWAIT(seg->s_as) && (svd->softlockcnt > 0)) { + /* + * Even if we grab segvn WRITER's lock or segp_slock + * here, there might be another thread which could've + * successfully performed lookup/insert just before + * we acquired the lock here. So, grabbing either + * lock here is of not much use. Until we devise + * a strategy at upper layers to solve the + * synchronization issues completely, we expect + * applications to handle this appropriately. + */ + segvn_purge(seg); + } + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, + "segvn_pagelock: unlock seg %p addr %p", seg, addr); + return (0); + } else if (type == L_PAGERECLAIM) { + VM_STAT_COND_ADD(seg->s_szc != 0, segvnvmstats.pagelock[1]); + SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); + (void) segvn_reclaim(seg, addr, len, *ppp, rw); + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, + "segvn_pagelock: reclaim seg %p addr %p", seg, addr); + return (0); + } + + if (seg->s_szc != 0) { + VM_STAT_ADD(segvnvmstats.pagelock[2]); + addr = lpgaddr; + len = lpgeaddr - lpgaddr; + npages = (len >> PAGESHIFT); + } + + /* + * for now we only support pagelock to anon memory. We've to check + * protections for vnode objects and call into the vnode driver. + * That's too much for a fast path. Let the fault entry point handle it. + */ + if (svd->vp != NULL) { + TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, + "segvn_pagelock: mapped vnode seg %p addr %p", seg, addr); + *ppp = NULL; + return (ENOTSUP); + } + + /* + * if anonmap is not yet created, let the fault entry point populate it + * with anon ptrs. 
+ */ + if ((amp = svd->amp) == NULL) { + TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, + "segvn_pagelock: anonmap null seg %p addr %p", seg, addr); + *ppp = NULL; + return (EFAULT); + } + + SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); + + /* + * we acquire segp_slock to prevent duplicate entries + * in seg_pcache + */ + mutex_enter(&svd->segp_slock); + + /* + * try to find pages in segment page cache + */ + pplist = seg_plookup(seg, addr, len, rw); + if (pplist != NULL) { + mutex_exit(&svd->segp_slock); + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + *ppp = pplist + adjustpages; + TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_HIT_END, + "segvn_pagelock: cache hit seg %p addr %p", seg, addr); + return (0); + } + + if (rw == S_READ) { + protchk = PROT_READ; + } else { + protchk = PROT_WRITE; + } + + if (svd->pageprot == 0) { + if ((svd->prot & protchk) == 0) { + mutex_exit(&svd->segp_slock); + error = EFAULT; + goto out; + } + } else { + /* + * check page protections + */ + for (a = addr; a < addr + len; a += PAGESIZE) { + struct vpage *vp; + + vp = &svd->vpage[seg_page(seg, a)]; + if ((VPP_PROT(vp) & protchk) == 0) { + mutex_exit(&svd->segp_slock); + error = EFAULT; + goto out; + } + } + } + + mutex_enter(&freemem_lock); + if (availrmem < tune.t_minarmem + npages) { + mutex_exit(&freemem_lock); + mutex_exit(&svd->segp_slock); + error = ENOMEM; + goto out; + } else { + svd->softlockcnt += npages; + availrmem -= npages; + segvn_pages_locked += npages; + } + mutex_exit(&freemem_lock); + + pplist = kmem_alloc(sizeof (page_t *) * npages, KM_SLEEP); + pl = pplist; + *ppp = pplist + adjustpages; + + page = seg_page(seg, addr); + anon_index = svd->anon_index + page; + + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + for (a = addr; a < addr + len; a += PAGESIZE, anon_index++) { + struct anon *ap; + struct vnode *vp; + u_offset_t off; + anon_sync_obj_t cookie; + + anon_array_enter(amp, anon_index, &cookie); + ap = anon_get_ptr(amp->ahp, anon_index); + if (ap == NULL) { + anon_array_exit(&cookie); + break; + } else { + /* + * We must never use seg_pcache for COW pages + * because we might end up with original page still + * lying in seg_pcache even after private page is + * created. This leads to data corruption as + * aio_write refers to the page still in cache + * while all other accesses refer to the private + * page. 
+ */ + if (ap->an_refcnt != 1) { + anon_array_exit(&cookie); + break; + } + } + swap_xlate(ap, &vp, &off); + anon_array_exit(&cookie); + + pp = page_lookup_nowait(vp, off, SE_SHARED); + if (pp == NULL) { + break; + } + *pplist++ = pp; + } + ANON_LOCK_EXIT(&->a_rwlock); + + if (a >= addr + len) { + (void) seg_pinsert(seg, addr, len, pl, rw, SEGP_ASYNC_FLUSH, + segvn_reclaim); + mutex_exit(&svd->segp_slock); + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_FILL_END, + "segvn_pagelock: cache fill seg %p addr %p", seg, addr); + return (0); + } + + mutex_exit(&svd->segp_slock); + error = EFAULT; + pplist = pl; + np = ((uintptr_t)(a - addr)) >> PAGESHIFT; + while (np > (uint_t)0) { + page_unlock(*pplist); + np--; + pplist++; + } + kmem_free(pl, sizeof (page_t *) * npages); + mutex_enter(&freemem_lock); + svd->softlockcnt -= npages; + availrmem += npages; + segvn_pages_locked -= npages; + mutex_exit(&freemem_lock); + if (svd->softlockcnt <= 0) { + if (AS_ISUNMAPWAIT(seg->s_as)) { + mutex_enter(&seg->s_as->a_contents); + if (AS_ISUNMAPWAIT(seg->s_as)) { + AS_CLRUNMAPWAIT(seg->s_as); + cv_broadcast(&seg->s_as->a_cv); + } + mutex_exit(&seg->s_as->a_contents); + } + } + +out: + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + *ppp = NULL; + TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, + "segvn_pagelock: cache miss seg %p addr %p", seg, addr); + return (error); +} + +/* + * purge any cached pages in the I/O page cache + */ +static void +segvn_purge(struct seg *seg) +{ + seg_ppurge(seg); +} + +static int +segvn_reclaim(struct seg *seg, caddr_t addr, size_t len, struct page **pplist, + enum seg_rw rw) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + pgcnt_t np, npages; + struct page **pl; + +#ifdef lint + addr = addr; +#endif + + npages = np = (len >> PAGESHIFT); + ASSERT(npages); + pl = pplist; + if (seg->s_szc != 0) { + size_t pgsz = page_get_pagesize(seg->s_szc); + if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { + panic("segvn_reclaim: unaligned addr or len"); + /*NOTREACHED*/ + } + } + + while (np > (uint_t)0) { + if (rw == S_WRITE) { + hat_setrefmod(*pplist); + } else { + hat_setref(*pplist); + } + page_unlock(*pplist); + np--; + pplist++; + } + kmem_free(pl, sizeof (page_t *) * npages); + + mutex_enter(&freemem_lock); + availrmem += npages; + segvn_pages_locked -= npages; + svd->softlockcnt -= npages; + mutex_exit(&freemem_lock); + if (svd->softlockcnt <= 0) { + if (AS_ISUNMAPWAIT(seg->s_as)) { + mutex_enter(&seg->s_as->a_contents); + if (AS_ISUNMAPWAIT(seg->s_as)) { + AS_CLRUNMAPWAIT(seg->s_as); + cv_broadcast(&seg->s_as->a_cv); + } + mutex_exit(&seg->s_as->a_contents); + } + } + return (0); +} +/* + * get a memory ID for an addr in a given segment + * + * XXX only creates PAGESIZE pages if anon slots are not initialized. + * At fault time they will be relocated into larger pages. 
+ */ +static int +segvn_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + struct anon *ap = NULL; + ulong_t anon_index; + struct anon_map *amp; + anon_sync_obj_t cookie; + + if (svd->type == MAP_PRIVATE) { + memidp->val[0] = (uintptr_t)seg->s_as; + memidp->val[1] = (uintptr_t)addr; + return (0); + } + + if (svd->type == MAP_SHARED) { + if (svd->vp) { + memidp->val[0] = (uintptr_t)svd->vp; + memidp->val[1] = (u_longlong_t)svd->offset + + (uintptr_t)(addr - seg->s_base); + return (0); + } else { + + SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); + if ((amp = svd->amp) != NULL) { + anon_index = svd->anon_index + + seg_page(seg, addr); + } + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + + ASSERT(amp != NULL); + + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + anon_array_enter(amp, anon_index, &cookie); + ap = anon_get_ptr(amp->ahp, anon_index); + if (ap == NULL) { + page_t *pp; + + pp = anon_zero(seg, addr, &ap, svd->cred); + if (pp == NULL) { + anon_array_exit(&cookie); + ANON_LOCK_EXIT(&->a_rwlock); + return (ENOMEM); + } + ASSERT(anon_get_ptr(amp->ahp, anon_index) + == NULL); + (void) anon_set_ptr(amp->ahp, anon_index, + ap, ANON_SLEEP); + page_unlock(pp); + } + + anon_array_exit(&cookie); + ANON_LOCK_EXIT(&->a_rwlock); + + memidp->val[0] = (uintptr_t)ap; + memidp->val[1] = (uintptr_t)addr & PAGEOFFSET; + return (0); + } + } + return (EINVAL); +} + +static int +sameprot(struct seg *seg, caddr_t a, size_t len) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + struct vpage *vpage; + spgcnt_t pages = btop(len); + uint_t prot; + + if (svd->pageprot == 0) + return (1); + + ASSERT(svd->vpage != NULL); + + vpage = &svd->vpage[seg_page(seg, a)]; + prot = VPP_PROT(vpage); + vpage++; + pages--; + while (pages-- > 0) { + if (prot != VPP_PROT(vpage)) + return (0); + vpage++; + } + return (1); +} + +/* + * Get memory allocation policy info for specified address in given segment + */ +static lgrp_mem_policy_info_t * +segvn_getpolicy(struct seg *seg, caddr_t addr) +{ + struct anon_map *amp; + ulong_t anon_index; + lgrp_mem_policy_info_t *policy_info; + struct segvn_data *svn_data; + u_offset_t vn_off; + vnode_t *vp; + + ASSERT(seg != NULL); + + svn_data = (struct segvn_data *)seg->s_data; + if (svn_data == NULL) + return (NULL); + + /* + * Get policy info for private or shared memory + */ + if (svn_data->type != MAP_SHARED) + policy_info = &svn_data->policy_info; + else { + amp = svn_data->amp; + anon_index = svn_data->anon_index + seg_page(seg, addr); + vp = svn_data->vp; + vn_off = svn_data->offset + (uintptr_t)(addr - seg->s_base); + policy_info = lgrp_shm_policy_get(amp, anon_index, vp, vn_off); + } + + return (policy_info); +} diff --git a/usr/src/uts/common/vm/seg_vn.h b/usr/src/uts/common/vm/seg_vn.h new file mode 100644 index 0000000000..4f66d495dd --- /dev/null +++ b/usr/src/uts/common/vm/seg_vn.h @@ -0,0 +1,168 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#ifndef _VM_SEG_VN_H +#define _VM_SEG_VN_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/lgrp.h> +#include <vm/anon.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * A pointer to this structure is passed to segvn_create(). + */ +typedef struct segvn_crargs { + struct vnode *vp; /* vnode mapped from */ + struct cred *cred; /* credentials */ + u_offset_t offset; /* starting offset of vnode for mapping */ + uchar_t type; /* type of sharing done */ + uchar_t prot; /* protections */ + uchar_t maxprot; /* maximum protections */ + uint_t flags; /* flags */ + struct anon_map *amp; /* anon mapping to map to */ + uint_t szc; /* max preferred page size code */ + uint_t lgrp_mem_policy_flags; +} segvn_crargs_t; + +/* + * (Semi) private data maintained by the seg_vn driver per segment mapping. + * + * The read/write segment lock protects all of segvn_data including the + * vpage array. All fields in segvn_data are treated as read-only when + * the "read" version of the address space and the segment locks are held. + * The "write" version of the segment lock, however, is required in order to + * update the following fields: + * + * pageprot + * prot + * amp + * vpage + * + * softlockcnt + * is written by acquiring either the readers lock on the segment and + * freemem lock, or any lock combination which guarantees exclusive use + * of this segment (e.g., adress space writers lock, + * address space readers lock + segment writers lock). + */ +typedef struct segvn_data { + krwlock_t lock; /* protect segvn_data and vpage array */ + kmutex_t segp_slock; /* serialize insertions into seg_pcache */ + uchar_t pageprot; /* true if per page protections present */ + uchar_t prot; /* current segment prot if pageprot == 0 */ + uchar_t maxprot; /* maximum segment protections */ + uchar_t type; /* type of sharing done */ + u_offset_t offset; /* starting offset of vnode for mapping */ + struct vnode *vp; /* vnode that segment mapping is to */ + ulong_t anon_index; /* starting index into anon_map anon array */ + struct anon_map *amp; /* pointer to anon share structure, if needed */ + struct vpage *vpage; /* per-page information, if needed */ + struct cred *cred; /* mapping credentials */ + size_t swresv; /* swap space reserved for this segment */ + uchar_t advice; /* madvise flags for segment */ + uchar_t pageadvice; /* true if per page advice set */ + ushort_t flags; /* flags - from sys/mman.h */ + ssize_t softlockcnt; /* # of pages SOFTLOCKED in seg */ + lgrp_mem_policy_info_t policy_info; /* memory allocation policy */ +} segvn_data_t; + +#ifdef _KERNEL + +/* + * Macros for segvn segment driver locking. 
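[Editorial sketch, not part of this commit] segvn_crargs_t above is filled positionally by callers; the SEGVN_ZFOD_ARGS initializer further below is one such use. A field-by-field rendering of that zero-fill, private-anonymous case; the struct here is a trimmed stand-in so the sketch builds on its own, and the read/write protections are example choices only:

#include <sys/mman.h>
#include <stdio.h>

/* Trimmed stand-in for segvn_crargs_t; the real type is in <vm/seg_vn.h>. */
struct vnode;
struct cred;
struct anon_map;

typedef struct ex_segvn_crargs {
        struct vnode *vp;               /* vnode mapped from */
        struct cred *cred;              /* credentials */
        unsigned long long offset;      /* starting offset of vnode */
        unsigned char type;             /* type of sharing done */
        unsigned char prot;             /* protections */
        unsigned char maxprot;          /* maximum protections */
        unsigned int flags;
        struct anon_map *amp;           /* anon mapping to map to */
        unsigned int szc;               /* max preferred page size code */
        unsigned int lgrp_mem_policy_flags;
} ex_segvn_crargs_t;

int
main(void)
{
        /* Same values, in the same field order, as the positional zfod form. */
        ex_segvn_crargs_t zfod = {
                .vp = NULL,                     /* anonymous: no vnode */
                .cred = NULL,
                .offset = 0,
                .type = MAP_PRIVATE,            /* zfod pages are private */
                .prot = PROT_READ | PROT_WRITE,
                .maxprot = PROT_READ | PROT_WRITE | PROT_EXEC,
                .flags = 0,
                .amp = NULL,                    /* anon map created at fault time */
                .szc = 0,
                .lgrp_mem_policy_flags = 0,
        };

        (void) printf("type %u prot %u maxprot %u\n",
            zfod.type, zfod.prot, zfod.maxprot);
        return (0);
}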
+ */ +#define SEGVN_LOCK_ENTER(as, lock, type) rw_enter((lock), (type)) +#define SEGVN_LOCK_EXIT(as, lock) rw_exit((lock)) +#define SEGVN_LOCK_DOWNGRADE(as, lock) rw_downgrade((lock)) + +/* + * Macros to test lock states. + */ +#define SEGVN_LOCK_HELD(as, lock) RW_LOCK_HELD((lock)) +#define SEGVN_READ_HELD(as, lock) RW_READ_HELD((lock)) +#define SEGVN_WRITE_HELD(as, lock) RW_WRITE_HELD((lock)) + +/* + * Macro used to detect the need to Break the sharing of COW pages + * + * The rw == S_WRITE is for the COW case + * rw == S_READ and type == SOFTLOCK is for the physio case + * We don't want to share a softlocked page because it can cause problems + * with multithreaded apps but if rw == S_READ_NOCOW it's ok to not break + * sharing of COW pages even in SOFTLOCK case. + */ +#define BREAK_COW_SHARE(rw, type, seg_type) ((rw == S_WRITE || \ + (type == F_SOFTLOCK && rw != S_READ_NOCOW)) && \ + seg_type == MAP_PRIVATE) + +#define SEGVN_ZFOD_ARGS(prot, max) \ + { NULL, NULL, 0, MAP_PRIVATE, prot, max, 0, NULL, 0, 0 } + +#define AS_MAP_VNSEGS_USELPGS(crfp, argsp) \ + ((crfp) == (int (*)())segvn_create && \ + (((struct segvn_crargs *)(argsp))->flags & \ + (MAP_TEXT | MAP_INITDATA)) && \ + ((struct segvn_crargs *)(argsp))->vp != NULL && \ + ((struct segvn_crargs *)(argsp))->amp == NULL) + + +extern void segvn_init(void); +extern int segvn_create(struct seg *, void *); + +extern struct seg_ops segvn_ops; + +/* + * Provided as shorthand for creating user zfod segments. + */ +extern caddr_t zfod_argsp; +extern caddr_t kzfod_argsp; +extern caddr_t stack_exec_argsp; +extern caddr_t stack_noexec_argsp; + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_SEG_VN_H */ diff --git a/usr/src/uts/common/vm/vm_anon.c b/usr/src/uts/common/vm/vm_anon.c new file mode 100644 index 0000000000..b8da5c97c2 --- /dev/null +++ b/usr/src/uts/common/vm/vm_anon.c @@ -0,0 +1,3197 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * VM - anonymous pages. + * + * This layer sits immediately above the vm_swap layer. 
It manages + * physical pages that have no permanent identity in the file system + * name space, using the services of the vm_swap layer to allocate + * backing storage for these pages. Since these pages have no external + * identity, they are discarded when the last reference is removed. + * + * An important function of this layer is to manage low-level sharing + * of pages that are logically distinct but that happen to be + * physically identical (e.g., the corresponding pages of the processes + * resulting from a fork before one process or the other changes their + * contents). This pseudo-sharing is present only as an optimization + * and is not to be confused with true sharing in which multiple + * address spaces deliberately contain references to the same object; + * such sharing is managed at a higher level. + * + * The key data structure here is the anon struct, which contains a + * reference count for its associated physical page and a hint about + * the identity of that page. Anon structs typically live in arrays, + * with an instance's position in its array determining where the + * corresponding backing storage is allocated; however, the swap_xlate() + * routine abstracts away this representation information so that the + * rest of the anon layer need not know it. (See the swap layer for + * more details on anon struct layout.) + * + * In the future versions of the system, the association between an + * anon struct and its position on backing store will change so that + * we don't require backing store all anonymous pages in the system. + * This is important for consideration for large memory systems. + * We can also use this technique to delay binding physical locations + * to anonymous pages until pageout/swapout time where we can make + * smarter allocation decisions to improve anonymous klustering. + * + * Many of the routines defined here take a (struct anon **) argument, + * which allows the code at this level to manage anon pages directly, + * so that callers can regard anon structs as opaque objects and not be + * concerned with assigning or inspecting their contents. + * + * Clients of this layer refer to anon pages indirectly. That is, they + * maintain arrays of pointers to anon structs rather than maintaining + * anon structs themselves. The (struct anon **) arguments mentioned + * above are pointers to entries in these arrays. It is these arrays + * that capture the mapping between offsets within a given segment and + * the corresponding anonymous backing storage address. 
+ */ + +#ifdef DEBUG +#define ANON_DEBUG +#endif + +#include <sys/types.h> +#include <sys/t_lock.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/mman.h> +#include <sys/cred.h> +#include <sys/thread.h> +#include <sys/vnode.h> +#include <sys/cpuvar.h> +#include <sys/swap.h> +#include <sys/cmn_err.h> +#include <sys/vtrace.h> +#include <sys/kmem.h> +#include <sys/sysmacros.h> +#include <sys/bitmap.h> +#include <sys/vmsystm.h> +#include <sys/debug.h> +#include <sys/tnf_probe.h> +#include <sys/lgrp.h> +#include <sys/policy.h> +#include <sys/condvar_impl.h> +#include <sys/mutex_impl.h> + +#include <vm/as.h> +#include <vm/hat.h> +#include <vm/anon.h> +#include <vm/page.h> +#include <vm/vpage.h> +#include <vm/seg.h> +#include <vm/rm.h> + +#include <fs/fs_subr.h> + +int anon_debug; + +kmutex_t anoninfo_lock; +struct k_anoninfo k_anoninfo; +ani_free_t ani_free_pool[ANI_MAX_POOL]; +pad_mutex_t anon_array_lock[ANON_LOCKSIZE]; +kcondvar_t anon_array_cv[ANON_LOCKSIZE]; + +/* + * Global hash table for (vp, off) -> anon slot + */ +extern int swap_maxcontig; +size_t anon_hash_size; +struct anon **anon_hash; + +static struct kmem_cache *anon_cache; +static struct kmem_cache *anonmap_cache; + +#ifdef VM_STATS +static struct anonvmstats_str { + ulong_t getpages[30]; + ulong_t privatepages[10]; + ulong_t demotepages[9]; + ulong_t decrefpages[9]; + ulong_t dupfillholes[4]; + ulong_t freepages[1]; +} anonvmstats; +#endif /* VM_STATS */ + + +/*ARGSUSED*/ +static int +anonmap_cache_constructor(void *buf, void *cdrarg, int kmflags) +{ + struct anon_map *amp = buf; + + rw_init(&->a_rwlock, NULL, RW_DEFAULT, NULL); + return (0); +} + +/*ARGSUSED1*/ +static void +anonmap_cache_destructor(void *buf, void *cdrarg) +{ + struct anon_map *amp = buf; + + rw_destroy(&->a_rwlock); +} + +kmutex_t anonhash_lock[AH_LOCK_SIZE]; +kmutex_t anonpages_hash_lock[AH_LOCK_SIZE]; + +void +anon_init(void) +{ + int i; + + anon_hash_size = 1L << highbit(physmem / ANON_HASHAVELEN); + + for (i = 0; i < AH_LOCK_SIZE; i++) { + mutex_init(&anonhash_lock[i], NULL, MUTEX_DEFAULT, NULL); + mutex_init(&anonpages_hash_lock[i], NULL, MUTEX_DEFAULT, NULL); + } + + for (i = 0; i < ANON_LOCKSIZE; i++) { + mutex_init(&anon_array_lock[i].pad_mutex, NULL, + MUTEX_DEFAULT, NULL); + cv_init(&anon_array_cv[i], NULL, CV_DEFAULT, NULL); + } + + anon_hash = (struct anon **) + kmem_zalloc(sizeof (struct anon *) * anon_hash_size, KM_SLEEP); + anon_cache = kmem_cache_create("anon_cache", sizeof (struct anon), + AN_CACHE_ALIGN, NULL, NULL, NULL, NULL, NULL, 0); + anonmap_cache = kmem_cache_create("anonmap_cache", + sizeof (struct anon_map), 0, + anonmap_cache_constructor, anonmap_cache_destructor, NULL, + NULL, NULL, 0); + swap_maxcontig = (1024 * 1024) >> PAGESHIFT; /* 1MB of pages */ +} + +/* + * Global anon slot hash table manipulation. + */ + +static void +anon_addhash(struct anon *ap) +{ + int index; + + ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)])); + index = ANON_HASH(ap->an_vp, ap->an_off); + ap->an_hash = anon_hash[index]; + anon_hash[index] = ap; +} + +static void +anon_rmhash(struct anon *ap) +{ + struct anon **app; + + ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)])); + + for (app = &anon_hash[ANON_HASH(ap->an_vp, ap->an_off)]; + *app; app = &((*app)->an_hash)) { + if (*app == ap) { + *app = ap->an_hash; + break; + } + } +} + +/* + * The anon array interfaces. Functions allocating, + * freeing array of pointers, and returning/setting + * entries in the array of pointers for a given offset. 
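[Editorial sketch, not part of this commit] anon_rmhash() above unlinks a slot from its hash chain by walking a pointer to each an_hash link field rather than keeping a trailing pointer, so the bucket head needs no special case. The same idiom in isolation, with a throwaway node type standing in for struct anon:

#include <stdio.h>

/* Throwaway node; its "next" plays the role of struct anon's an_hash. */
struct node {
        int id;
        struct node *next;
};

static void
unlink_node(struct node **bucket, struct node *target)
{
        struct node **npp;

        /* Walk the link fields themselves, exactly as anon_rmhash() does. */
        for (npp = bucket; *npp != NULL; npp = &((*npp)->next)) {
                if (*npp == target) {
                        *npp = target->next;
                        break;
                }
        }
}

int
main(void)
{
        struct node c = { 3, NULL };
        struct node b = { 2, &c };
        struct node a = { 1, &b };
        struct node *bucket = &a;
        struct node *np;

        unlink_node(&bucket, &b);
        for (np = bucket; np != NULL; np = np->next)
                (void) printf("%d\n", np->id);          /* prints 1 then 3 */
        return (0);
}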
+ * + * Create the list of pointers + */ +struct anon_hdr * +anon_create(pgcnt_t npages, int flags) +{ + struct anon_hdr *ahp; + ulong_t nchunks; + int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; + + if ((ahp = kmem_zalloc(sizeof (struct anon_hdr), kmemflags)) == NULL) { + return (NULL); + } + + mutex_init(&ahp->serial_lock, NULL, MUTEX_DEFAULT, NULL); + /* + * Single level case. + */ + ahp->size = npages; + if (npages <= ANON_CHUNK_SIZE || (flags & ANON_ALLOC_FORCE)) { + + if (flags & ANON_ALLOC_FORCE) + ahp->flags |= ANON_ALLOC_FORCE; + + ahp->array_chunk = kmem_zalloc( + ahp->size * sizeof (struct anon *), kmemflags); + + if (ahp->array_chunk == NULL) { + kmem_free(ahp, sizeof (struct anon_hdr)); + return (NULL); + } + } else { + /* + * 2 Level case. + */ + nchunks = (ahp->size + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT; + + ahp->array_chunk = kmem_zalloc(nchunks * sizeof (ulong_t *), + kmemflags); + + if (ahp->array_chunk == NULL) { + kmem_free(ahp, sizeof (struct anon_hdr)); + return (NULL); + } + } + return (ahp); +} + +/* + * Free the array of pointers + */ +void +anon_release(struct anon_hdr *ahp, pgcnt_t npages) +{ + ulong_t i; + void **ppp; + ulong_t nchunks; + + ASSERT(npages == ahp->size); + + /* + * Single level case. + */ + if (npages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { + kmem_free(ahp->array_chunk, ahp->size * sizeof (struct anon *)); + } else { + /* + * 2 level case. + */ + nchunks = (ahp->size + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT; + for (i = 0; i < nchunks; i++) { + ppp = &ahp->array_chunk[i]; + if (*ppp != NULL) + kmem_free(*ppp, PAGESIZE); + } + kmem_free(ahp->array_chunk, nchunks * sizeof (ulong_t *)); + } + mutex_destroy(&ahp->serial_lock); + kmem_free(ahp, sizeof (struct anon_hdr)); +} + +/* + * Return the pointer from the list for a + * specified anon index. + */ +struct anon * +anon_get_ptr(struct anon_hdr *ahp, ulong_t an_idx) +{ + struct anon **app; + + ASSERT(an_idx < ahp->size); + + /* + * Single level case. + */ + if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { + return ((struct anon *) + ((uintptr_t)ahp->array_chunk[an_idx] & ANON_PTRMASK)); + } else { + + /* + * 2 level case. + */ + app = ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; + if (app) { + return ((struct anon *) + ((uintptr_t)app[an_idx & ANON_CHUNK_OFF] & + ANON_PTRMASK)); + } else { + return (NULL); + } + } +} + +/* + * Return the anon pointer for the first valid entry in the anon list, + * starting from the given index. 
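[Editorial sketch, not part of this commit] anon_get_ptr() above resolves a two-level array by splitting the index into a top-level chunk number and a slot within that chunk; each second-level chunk is a single PAGESIZE allocation of pointers (see anon_release() and anon_set_ptr()). The split in isolation; the EX_* constants are invented, the real ANON_CHUNK_* values come from <vm/anon.h>:

#include <stdio.h>

/* Invented example values; the real ANON_CHUNK_* live in <vm/anon.h>. */
#define EX_CHUNK_SHIFT  10
#define EX_CHUNK_SIZE   (1UL << EX_CHUNK_SHIFT)
#define EX_CHUNK_OFF    (EX_CHUNK_SIZE - 1)

int
main(void)
{
        unsigned long an_idx = 3000;

        /* The same split a two-level anon_get_ptr() lookup performs. */
        unsigned long level1 = an_idx >> EX_CHUNK_SHIFT;        /* which chunk */
        unsigned long level2 = an_idx & EX_CHUNK_OFF;           /* slot in chunk */

        (void) printf("index %lu -> chunk %lu, slot %lu\n",
            an_idx, level1, level2);            /* chunk 2, slot 952 */
        return (0);
}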
+ */ +struct anon * +anon_get_next_ptr(struct anon_hdr *ahp, ulong_t *index) +{ + struct anon *ap; + struct anon **app; + ulong_t chunkoff; + ulong_t i; + ulong_t j; + pgcnt_t size; + + i = *index; + size = ahp->size; + + ASSERT(i < size); + + if ((size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { + /* + * 1 level case + */ + while (i < size) { + ap = (struct anon *) + ((uintptr_t)ahp->array_chunk[i] & ANON_PTRMASK); + if (ap) { + *index = i; + return (ap); + } + i++; + } + } else { + /* + * 2 level case + */ + chunkoff = i & ANON_CHUNK_OFF; + while (i < size) { + app = ahp->array_chunk[i >> ANON_CHUNK_SHIFT]; + if (app) + for (j = chunkoff; j < ANON_CHUNK_SIZE; j++) { + ap = (struct anon *) + ((uintptr_t)app[j] & + ANON_PTRMASK); + if (ap) { + *index = i + (j - chunkoff); + return (ap); + } + } + chunkoff = 0; + i = (i + ANON_CHUNK_SIZE) & ~ANON_CHUNK_OFF; + } + } + *index = size; + return (NULL); +} + +/* + * Set list entry with a given pointer for a specified offset + */ +int +anon_set_ptr(struct anon_hdr *ahp, ulong_t an_idx, struct anon *ap, int flags) +{ + void **ppp; + struct anon **app; + int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; + uintptr_t *ap_addr; + + ASSERT(an_idx < ahp->size); + + /* + * Single level case. + */ + if (ahp->size <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { + ap_addr = (uintptr_t *)&ahp->array_chunk[an_idx]; + } else { + + /* + * 2 level case. + */ + ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; + + ASSERT(ppp != NULL); + if (*ppp == NULL) { + mutex_enter(&ahp->serial_lock); + ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; + if (*ppp == NULL) { + *ppp = kmem_zalloc(PAGESIZE, kmemflags); + if (*ppp == NULL) { + mutex_exit(&ahp->serial_lock); + return (ENOMEM); + } + } + mutex_exit(&ahp->serial_lock); + } + app = *ppp; + ap_addr = (uintptr_t *)&app[an_idx & ANON_CHUNK_OFF]; + } + *ap_addr = (*ap_addr & ~ANON_PTRMASK) | (uintptr_t)ap; + return (0); +} + +/* + * Copy anon array into a given new anon array + */ +int +anon_copy_ptr(struct anon_hdr *sahp, ulong_t s_idx, + struct anon_hdr *dahp, ulong_t d_idx, + pgcnt_t npages, int flags) +{ + void **sapp, **dapp; + void *ap; + int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; + + ASSERT((s_idx < sahp->size) && (d_idx < dahp->size)); + ASSERT((npages <= sahp->size) && (npages <= dahp->size)); + + /* + * Both arrays are 1 level. + */ + if (((sahp->size <= ANON_CHUNK_SIZE) && + (dahp->size <= ANON_CHUNK_SIZE)) || + ((sahp->flags & ANON_ALLOC_FORCE) && + (dahp->flags & ANON_ALLOC_FORCE))) { + + bcopy(&sahp->array_chunk[s_idx], &dahp->array_chunk[d_idx], + npages * sizeof (struct anon *)); + return (0); + } + + /* + * Both arrays are 2 levels. 
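[Editorial sketch, not part of this commit] The two-level copy loop that follows advances in runs bounded by whichever of the source or destination chunk boundaries comes first (chknp). A worked example of that length computation with invented indices and an invented chunk size:

#include <stdio.h>

#define EX_CHUNK_SIZE   1024UL          /* invented example chunk size */
#define EX_CHUNK_OFF    (EX_CHUNK_SIZE - 1)
#define MAX(a, b)       ((a) > (b) ? (a) : (b))

int
main(void)
{
        unsigned long s_idx = 1000;     /* source: slot 1000 of chunk 0 */
        unsigned long d_idx = 2060;     /* destination: slot 12 of chunk 2 */
        unsigned long npages = 500;     /* slots still to copy */

        unsigned long sapidx = s_idx & EX_CHUNK_OFF;
        unsigned long dapidx = d_idx & EX_CHUNK_OFF;
        unsigned long chknp = EX_CHUNK_SIZE - MAX(sapidx, dapidx);

        if (chknp > npages)
                chknp = npages;

        /* Only 24 slots fit before the source chunk boundary is hit. */
        (void) printf("copy %lu slots this pass\n", chknp);
        return (0);
}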
+ */ + if (sahp->size > ANON_CHUNK_SIZE && + dahp->size > ANON_CHUNK_SIZE && + ((sahp->flags & ANON_ALLOC_FORCE) == 0) && + ((dahp->flags & ANON_ALLOC_FORCE) == 0)) { + + ulong_t sapidx, dapidx; + ulong_t *sap, *dap; + ulong_t chknp; + + while (npages != 0) { + + sapidx = s_idx & ANON_CHUNK_OFF; + dapidx = d_idx & ANON_CHUNK_OFF; + chknp = ANON_CHUNK_SIZE - MAX(sapidx, dapidx); + if (chknp > npages) + chknp = npages; + + sapp = &sahp->array_chunk[s_idx >> ANON_CHUNK_SHIFT]; + if ((sap = *sapp) != NULL) { + dapp = &dahp->array_chunk[d_idx + >> ANON_CHUNK_SHIFT]; + if ((dap = *dapp) == NULL) { + *dapp = kmem_zalloc(PAGESIZE, + kmemflags); + if ((dap = *dapp) == NULL) + return (ENOMEM); + } + bcopy((sap + sapidx), (dap + dapidx), + chknp << ANON_PTRSHIFT); + } + s_idx += chknp; + d_idx += chknp; + npages -= chknp; + } + return (0); + } + + /* + * At least one of the arrays is 2 level. + */ + while (npages--) { + if ((ap = anon_get_ptr(sahp, s_idx)) != NULL) { + ASSERT(!ANON_ISBUSY(anon_get_slot(sahp, s_idx))); + if (anon_set_ptr(dahp, d_idx, ap, flags) == ENOMEM) + return (ENOMEM); + } + s_idx++; + d_idx++; + } + return (0); +} + + +/* + * ANON_INITBUF is a convenience macro for anon_grow() below. It + * takes a buffer dst, which is at least as large as buffer src. It + * does a bcopy from src into dst, and then bzeros the extra bytes + * of dst. If tail is set, the data in src is tail aligned within + * dst instead of head aligned. + */ + +#define ANON_INITBUF(src, srclen, dst, dstsize, tail) \ + if (tail) { \ + bzero((dst), (dstsize) - (srclen)); \ + bcopy((src), (char *)(dst) + (dstsize) - (srclen), (srclen)); \ + } else { \ + bcopy((src), (dst), (srclen)); \ + bzero((char *)(dst) + (srclen), (dstsize) - (srclen)); \ + } + +#define ANON_1_LEVEL_INC (ANON_CHUNK_SIZE / 8) +#define ANON_2_LEVEL_INC (ANON_1_LEVEL_INC * ANON_CHUNK_SIZE) + +/* + * anon_grow() is used to efficiently extend an existing anon array. + * startidx_p points to the index into the anon array of the first page + * that is in use. curpages is the number of pages in use, starting at + * *startidx_p. newpages is the number of additional pages desired. + * + * If startidx_p == NULL, startidx is taken to be 0 and cannot be changed. + * + * The growth is done by creating a new top level of the anon array, + * and (if the array is 2-level) reusing the existing second level arrays. + * + * flags can be used to specify ANON_NOSLEEP and ANON_GROWDOWN. + * + * Returns the new number of pages in the anon array. + */ + +pgcnt_t +anon_grow(struct anon_hdr *ahp, ulong_t *startidx_p, pgcnt_t curpages, + pgcnt_t newpages, int flags) +{ + ulong_t startidx = startidx_p ? *startidx_p : 0; + pgcnt_t osz = ahp->size, nsz; + pgcnt_t oelems, nelems, totpages; + void **level1; + int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; + int growdown = (flags & ANON_GROWDOWN); + size_t newarrsz, oldarrsz; + void *level2; + + ASSERT(!(startidx_p == NULL && growdown)); + ASSERT(startidx + curpages <= ahp->size); + + /* + * Determine the total number of pages needed in the new + * anon array. If growing down, totpages is all pages from + * startidx through the end of the array, plus <newpages> + * pages. If growing up, keep all pages from page 0 through + * the last page currently in use, plus <newpages> pages. + */ + + if (growdown) + totpages = osz - startidx + newpages; + else + totpages = startidx + curpages + newpages; + + /* If the array is already large enough, just return. 
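[Editorial sketch, not part of this commit] ANON_INITBUF above copies the old pointer array into the new one either head- or tail-aligned and zeroes the remainder, which is how anon_grow() supports growing in both directions. The same behaviour in user space, with the bcopy/bzero calls rewritten as their libc equivalents:

#include <stdio.h>
#include <string.h>

/* ANON_INITBUF, rewritten with memcpy/memset so it builds in user space. */
#define EX_INITBUF(src, srclen, dst, dstsize, tail)                     \
        if (tail) {                                                     \
                memset((dst), 0, (dstsize) - (srclen));                 \
                memcpy((char *)(dst) + (dstsize) - (srclen),            \
                    (src), (srclen));                                   \
        } else {                                                        \
                memcpy((dst), (src), (srclen));                         \
                memset((char *)(dst) + (srclen), 0,                     \
                    (dstsize) - (srclen));                              \
        }

int
main(void)
{
        char src[4] = { 'a', 'b', 'c', 'd' };
        char up[8], down[8];
        int i;

        EX_INITBUF(src, sizeof (src), up, sizeof (up), 0);      /* grow up */
        EX_INITBUF(src, sizeof (src), down, sizeof (down), 1);  /* grow down */

        for (i = 0; i < 8; i++)                 /* prints abcd.... */
                (void) printf("%c", up[i] != '\0' ? up[i] : '.');
        (void) printf(" ");
        for (i = 0; i < 8; i++)                 /* prints ....abcd */
                (void) printf("%c", down[i] != '\0' ? down[i] : '.');
        (void) printf("\n");
        return (0);
}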
*/ + + if (osz >= totpages) { + nsz = osz; + goto out; + } + + /* + * osz/nsz are the total numbers of pages represented by the array. + * oelems/nelems are the number of pointers in the top level array. + * + * Will the new anon array be one level or two levels? + */ + + if (totpages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { + nsz = P2ROUNDUP(totpages, ANON_1_LEVEL_INC); + oelems = osz; + nelems = nsz; + } else { + nsz = P2ROUNDUP(totpages, ANON_2_LEVEL_INC); + oelems = (osz + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT; + nelems = nsz >> ANON_CHUNK_SHIFT; + } + + newarrsz = nelems * sizeof (void *); + level1 = kmem_alloc(newarrsz, kmemflags); + if (level1 == NULL) + return (0); + + /* Are we converting from a one level to a two level anon array? */ + + if (nsz > ANON_CHUNK_SIZE && osz <= ANON_CHUNK_SIZE && + !(ahp->flags & ANON_ALLOC_FORCE)) { + /* + * Yes, we're converting to a two level. Reuse old level 1 + * as new level 2 if it is exactly PAGESIZE. Otherwise + * alloc a new level 2 and copy the old level 1 data into it. + */ + + if (osz == ANON_CHUNK_SIZE) { + level2 = (void *)ahp->array_chunk; + } else { + level2 = kmem_alloc(PAGESIZE, kmemflags); + if (level2 == NULL) { + kmem_free(level1, newarrsz); + return (0); + } + oldarrsz = osz * sizeof (void *); + + ANON_INITBUF(ahp->array_chunk, oldarrsz, + level2, PAGESIZE, growdown); + kmem_free(ahp->array_chunk, oldarrsz); + } + bzero(level1, newarrsz); + if (growdown) + level1[nelems - 1] = level2; + else + level1[0] = level2; + } else { + oldarrsz = oelems * sizeof (void *); + + ANON_INITBUF(ahp->array_chunk, oldarrsz, + level1, newarrsz, growdown); + kmem_free(ahp->array_chunk, oldarrsz); + } + + ahp->array_chunk = level1; + ahp->size = nsz; +out: + if (growdown) + *startidx_p = nsz - totpages; + return (nsz); +} + +/* + * Called from clock handler to sync ani_free value. + */ + +void +set_anoninfo(void) +{ + int ix; + pgcnt_t total = 0; + + for (ix = 0; ix < ANI_MAX_POOL; ix++) { + total += ani_free_pool[ix].ani_count; + } + k_anoninfo.ani_free = total; +} + +/* + * Reserve anon space. + * + * It's no longer simply a matter of incrementing ani_resv to + * reserve swap space, we need to check memory-based as well + * as disk-backed (physical) swap. The following algorithm + * is used: + * Check the space on physical swap + * i.e. amount needed < ani_max - ani_phys_resv + * If we are swapping on swapfs check + * amount needed < (availrmem - swapfs_minfree) + * Since the algorithm to check for the quantity of swap space is + * almost the same as that for reserving it, we'll just use anon_resvmem + * with a flag to decrement availrmem. + * + * Return non-zero on success. + */ +int +anon_resvmem(size_t size, uint_t takemem) +{ + pgcnt_t npages = btopr(size); + pgcnt_t mswap_pages = 0; + pgcnt_t pswap_pages = 0; + + mutex_enter(&anoninfo_lock); + + /* + * pswap_pages is the number of pages we can take from + * physical (i.e. disk-backed) swap. 
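[Editorial sketch, not part of this commit] ani_free is not maintained as one global counter: ANI_ADD() updates spread across the ani_free_pool[] buckets declared earlier, and set_anoninfo() above folds them into k_anoninfo.ani_free from the clock handler. A generic rendering of that pattern; indexing the buckets by CPU is an assumption here, only the bucket-plus-periodic-sum shape is taken from the code:

#include <stdio.h>

#define EX_NPOOL        16              /* stands in for ANI_MAX_POOL */

static long ex_pool[EX_NPOOL];          /* stands in for ani_free_pool[] */

/* Update one bucket instead of a single contended global counter. */
static void
ex_add(unsigned int bucket, long delta)
{
        ex_pool[bucket & (EX_NPOOL - 1)] += delta;
}

/* Periodic sweep, as set_anoninfo() does from the clock handler. */
static long
ex_sum(void)
{
        long total = 0;
        int i;

        for (i = 0; i < EX_NPOOL; i++)
                total += ex_pool[i];
        return (total);
}

int
main(void)
{
        ex_add(0, 5);
        ex_add(3, -2);
        ex_add(17, 4);                  /* wraps onto bucket 1 */
        (void) printf("ani_free would read %ld\n", ex_sum());   /* 7 */
        return (0);
}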
+ */ + ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); + pswap_pages = k_anoninfo.ani_max - k_anoninfo.ani_phys_resv; + + ANON_PRINT(A_RESV, + ("anon_resvmem: npages %lu takemem %u pswap %lu caller %p\n", + npages, takemem, pswap_pages, (void *)caller())); + + if (npages <= pswap_pages) { + /* + * we have enough space on a physical swap + */ + if (takemem) + k_anoninfo.ani_phys_resv += npages; + mutex_exit(&anoninfo_lock); + return (1); + } else if (pswap_pages != 0) { + /* + * we have some space on a physical swap + */ + if (takemem) { + /* + * use up remainder of phys swap + */ + k_anoninfo.ani_phys_resv += pswap_pages; + ASSERT(k_anoninfo.ani_phys_resv == k_anoninfo.ani_max); + } + } + /* + * since (npages > pswap_pages) we need mem swap + * mswap_pages is the number of pages needed from availrmem + */ + ASSERT(npages > pswap_pages); + mswap_pages = npages - pswap_pages; + + ANON_PRINT(A_RESV, ("anon_resvmem: need %ld pages from memory\n", + mswap_pages)); + + /* + * priv processes can reserve memory as swap as long as availrmem + * remains greater than swapfs_minfree; in the case of non-priv + * processes, memory can be reserved as swap only if availrmem + * doesn't fall below (swapfs_minfree + swapfs_reserve). Thus, + * swapfs_reserve amount of memswap is not available to non-priv + * processes. This protects daemons such as automounter dying + * as a result of application processes eating away almost entire + * membased swap. This safeguard becomes useless if apps are run + * with root access. + * + * swapfs_reserve is minimum of 4Mb or 1/16 of physmem. + * + */ + mutex_enter(&freemem_lock); + if (availrmem > (swapfs_minfree + swapfs_reserve + mswap_pages) || + (availrmem > (swapfs_minfree + mswap_pages) && + secpolicy_resource(CRED()) == 0)) { + + if (takemem) { + /* + * Take the memory from the rest of the system. + */ + availrmem -= mswap_pages; + mutex_exit(&freemem_lock); + k_anoninfo.ani_mem_resv += mswap_pages; + ANI_ADD(mswap_pages); + ANON_PRINT((A_RESV | A_MRESV), + ("anon_resvmem: took %ld pages of availrmem\n", + mswap_pages)); + } else { + mutex_exit(&freemem_lock); + } + + ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); + mutex_exit(&anoninfo_lock); + return (1); + + } else { + /* + * Fail if not enough memory + */ + + if (takemem) { + k_anoninfo.ani_phys_resv -= pswap_pages; + } + + mutex_exit(&freemem_lock); + mutex_exit(&anoninfo_lock); + ANON_PRINT(A_RESV, + ("anon_resvmem: not enough space from swapfs\n")); + return (0); + } +} + + +/* + * Give back an anon reservation. + */ +void +anon_unresv(size_t size) +{ + pgcnt_t npages = btopr(size); + spgcnt_t mem_free_pages = 0; + pgcnt_t phys_free_slots; +#ifdef ANON_DEBUG + pgcnt_t mem_resv; +#endif + + mutex_enter(&anoninfo_lock); + + ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); + /* + * If some of this reservation belonged to swapfs + * give it back to availrmem. + * ani_mem_resv is the amount of availrmem swapfs has reserved. 
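[Editorial sketch, not part of this commit] The reservation check above can be read as a small pure function: physical swap is consumed first, and the memory-backed remainder is granted only while availrmem stays above swapfs_minfree plus, for unprivileged callers, swapfs_reserve as well. A user-space model of just that decision; the page counts are invented and the privileged flag stands in for the secpolicy_resource() check:

#include <stdio.h>

static int
can_reserve(unsigned long npages, unsigned long ani_max,
    unsigned long ani_phys_resv, unsigned long availrmem,
    unsigned long swapfs_minfree, unsigned long swapfs_reserve,
    int privileged)
{
        unsigned long pswap_pages = ani_max - ani_phys_resv;
        unsigned long mswap_pages;

        if (npages <= pswap_pages)
                return (1);     /* fits entirely on physical swap */

        mswap_pages = npages - pswap_pages;     /* rest must come from memory */

        if (availrmem > swapfs_minfree + swapfs_reserve + mswap_pages)
                return (1);     /* anyone may take this much memory swap */
        if (privileged && availrmem > swapfs_minfree + mswap_pages)
                return (1);     /* swapfs_reserve is kept for privileged use */
        return (0);
}

int
main(void)
{
        /* 10000 pages wanted, 4000 left on physical swap, tight memory. */
        (void) printf("unprivileged: %d\n",
            can_reserve(10000, 20000, 16000, 7000, 500, 1024, 0));      /* 0 */
        (void) printf("privileged:   %d\n",
            can_reserve(10000, 20000, 16000, 7000, 500, 1024, 1));      /* 1 */
        return (0);
}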
+ * but some of that memory could be locked by segspt so we can only + * return non locked ani_mem_resv back to availrmem + */ + if (k_anoninfo.ani_mem_resv > k_anoninfo.ani_locked_swap) { + ANON_PRINT((A_RESV | A_MRESV), + ("anon_unresv: growing availrmem by %ld pages\n", + MIN(k_anoninfo.ani_mem_resv, npages))); + + mem_free_pages = MIN((spgcnt_t)(k_anoninfo.ani_mem_resv - + k_anoninfo.ani_locked_swap), npages); + mutex_enter(&freemem_lock); + availrmem += mem_free_pages; + mutex_exit(&freemem_lock); + k_anoninfo.ani_mem_resv -= mem_free_pages; + + ANI_ADD(-mem_free_pages); + } + /* + * The remainder of the pages is returned to phys swap + */ + ASSERT(npages >= mem_free_pages); + phys_free_slots = npages - mem_free_pages; + + if (phys_free_slots) { + k_anoninfo.ani_phys_resv -= phys_free_slots; + } + +#ifdef ANON_DEBUG + mem_resv = k_anoninfo.ani_mem_resv; +#endif + + ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); + ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); + + mutex_exit(&anoninfo_lock); + + ANON_PRINT(A_RESV, ("anon_unresv: %lu, tot %lu, caller %p\n", + npages, mem_resv, (void *)caller())); +} + +/* + * Allocate an anon slot and return it with the lock held. + */ +struct anon * +anon_alloc(struct vnode *vp, anoff_t off) +{ + struct anon *ap; + kmutex_t *ahm; + + ap = kmem_cache_alloc(anon_cache, KM_SLEEP); + if (vp == NULL) { + swap_alloc(ap); + } else { + ap->an_vp = vp; + ap->an_off = off; + } + ap->an_refcnt = 1; + ap->an_pvp = NULL; + ap->an_poff = 0; + ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; + mutex_enter(ahm); + anon_addhash(ap); + mutex_exit(ahm); + ANI_ADD(-1); + ANON_PRINT(A_ANON, ("anon_alloc: returning ap %p, vp %p\n", + (void *)ap, (ap ? (void *)ap->an_vp : NULL))); + return (ap); +} + +/* + * Decrement the reference count of an anon page. + * If reference count goes to zero, free it and + * its associated page (if any). + */ +void +anon_decref(struct anon *ap) +{ + page_t *pp; + struct vnode *vp; + anoff_t off; + kmutex_t *ahm; + + ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; + mutex_enter(ahm); + ASSERT(ap->an_refcnt != 0); + if (ap->an_refcnt == 0) + panic("anon_decref: slot count 0"); + if (--ap->an_refcnt == 0) { + swap_xlate(ap, &vp, &off); + mutex_exit(ahm); + + /* + * If there is a page for this anon slot we will need to + * call VN_DISPOSE to get rid of the vp association and + * put the page back on the free list as really free. + * Acquire the "exclusive" lock to ensure that any + * pending i/o always completes before the swap slot + * is freed. + */ + pp = page_lookup(vp, (u_offset_t)off, SE_EXCL); + + /* + * If there was a page, we've synchronized on it (getting + * the exclusive lock is as good as gettting the iolock) + * so now we can free the physical backing store. Also, this + * is where we would free the name of the anonymous page + * (swap_free(ap)), a no-op in the current implementation. 
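[Editorial sketch, not part of this commit] anon_decref() follows a common release shape: the count is dropped under the hash-bucket lock, and only the caller that takes it to zero continues with teardown (unhash, free the physical swap slot, dispose of the page). A stripped-down user-space rendering of that shape, with a toy object and a pthread mutex in place of the anonhash_lock bucket:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy object standing in for struct anon plus its hash-bucket lock. */
struct obj {
        pthread_mutex_t *bucket_lock;
        int refcnt;
};

static void
obj_rele(struct obj *op)
{
        int last;

        (void) pthread_mutex_lock(op->bucket_lock);
        if (op->refcnt == 0)
                abort();        /* mirrors the "slot count 0" panic above */
        last = (--op->refcnt == 0);
        (void) pthread_mutex_unlock(op->bucket_lock);

        if (last) {
                /* Only the last reference holder performs the teardown. */
                (void) printf("last reference dropped, freeing\n");
        }
}

int
main(void)
{
        pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
        struct obj o = { &lock, 2 };

        obj_rele(&o);           /* object still referenced */
        obj_rele(&o);           /* prints the teardown message */
        return (0);
}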
+ */ + mutex_enter(ahm); + ASSERT(ap->an_refcnt == 0); + anon_rmhash(ap); + if (ap->an_pvp) + swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE); + mutex_exit(ahm); + + if (pp != NULL) { + /*LINTED: constant in conditional context */ + VN_DISPOSE(pp, B_INVAL, 0, kcred); + } + ANON_PRINT(A_ANON, ("anon_decref: free ap %p, vp %p\n", + (void *)ap, (void *)ap->an_vp)); + kmem_cache_free(anon_cache, ap); + + ANI_ADD(1); + } else { + mutex_exit(ahm); + } +} + +static int +anon_share(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots) +{ + struct anon *ap; + + while (nslots-- > 0) { + if ((ap = anon_get_ptr(ahp, anon_index)) != NULL && + ap->an_refcnt > 1) + return (1); + anon_index++; + } + + return (0); +} + +static void +anon_decref_pages( + struct anon_hdr *ahp, + ulong_t an_idx, + uint_t szc) +{ + struct anon *ap = anon_get_ptr(ahp, an_idx); + kmutex_t *ahmpages = NULL; + page_t *pp; + pgcnt_t pgcnt = page_get_pagecnt(szc); + pgcnt_t i; + struct vnode *vp; + anoff_t off; + kmutex_t *ahm; +#ifdef DEBUG + int refcnt = 1; +#endif + + ASSERT(szc != 0); + ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); + ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); + + VM_STAT_ADD(anonvmstats.decrefpages[0]); + + if (ap != NULL) { + ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; + mutex_enter(ahmpages); + ASSERT((refcnt = ap->an_refcnt) != 0); + VM_STAT_ADD(anonvmstats.decrefpages[1]); + if (ap->an_refcnt == 1) { + VM_STAT_ADD(anonvmstats.decrefpages[2]); + ASSERT(!anon_share(ahp, an_idx, pgcnt)); + mutex_exit(ahmpages); + ahmpages = NULL; + } + } + + i = 0; + while (i < pgcnt) { + if ((ap = anon_get_ptr(ahp, an_idx + i)) == NULL) { + ASSERT(refcnt == 1 && ahmpages == NULL); + i++; + continue; + } + ASSERT(ap->an_refcnt == refcnt); + ASSERT(ahmpages != NULL || ap->an_refcnt == 1); + ASSERT(ahmpages == NULL || ap->an_refcnt > 1); + + if (ahmpages == NULL) { + swap_xlate(ap, &vp, &off); + pp = page_lookup(vp, (u_offset_t)off, SE_EXCL); + if (pp == NULL || pp->p_szc == 0) { + VM_STAT_ADD(anonvmstats.decrefpages[3]); + ahm = &anonhash_lock[AH_LOCK(ap->an_vp, + ap->an_off)]; + (void) anon_set_ptr(ahp, an_idx + i, NULL, + ANON_SLEEP); + mutex_enter(ahm); + ap->an_refcnt--; + ASSERT(ap->an_refcnt == 0); + anon_rmhash(ap); + if (ap->an_pvp) + swap_phys_free(ap->an_pvp, ap->an_poff, + PAGESIZE); + mutex_exit(ahm); + if (pp != NULL) { + VM_STAT_ADD(anonvmstats.decrefpages[4]); + /*LINTED*/ + VN_DISPOSE(pp, B_INVAL, 0, kcred); + } + kmem_cache_free(anon_cache, ap); + ANI_ADD(1); + i++; + } else { + pgcnt_t j; + pgcnt_t curpgcnt = + page_get_pagecnt(pp->p_szc); + size_t ppasize = curpgcnt * sizeof (page_t *); + page_t **ppa = kmem_alloc(ppasize, KM_SLEEP); + int dispose = 0; + + VM_STAT_ADD(anonvmstats.decrefpages[5]); + + ASSERT(pp->p_szc <= szc); + ASSERT(IS_P2ALIGNED(curpgcnt, curpgcnt)); + ASSERT(IS_P2ALIGNED(i, curpgcnt)); + ASSERT(i + curpgcnt <= pgcnt); + ASSERT(!(page_pptonum(pp) & (curpgcnt - 1))); + ppa[0] = pp; + for (j = i + 1; j < i + curpgcnt; j++) { + ap = anon_get_ptr(ahp, an_idx + j); + ASSERT(ap != NULL && + ap->an_refcnt == 1); + swap_xlate(ap, &vp, &off); + pp = page_lookup(vp, (u_offset_t)off, + SE_EXCL); + if (pp == NULL) + panic("anon_decref_pages: " + "no page"); + + (void) hat_pageunload(pp, + HAT_FORCE_PGUNLOAD); + ASSERT(pp->p_szc == ppa[0]->p_szc); + ASSERT(page_pptonum(pp) - 1 == + page_pptonum(ppa[j - i - 1])); + ppa[j - i] = pp; + if (ap->an_pvp != NULL && + !vn_matchopval(ap->an_pvp, + VOPNAME_DISPOSE, + (fs_generic_func_p)fs_dispose)) + dispose = 1; + } + if (!dispose) { + 
VM_STAT_ADD(anonvmstats.decrefpages[6]); + page_destroy_pages(ppa[0]); + } else { + VM_STAT_ADD(anonvmstats.decrefpages[7]); + for (j = 0; j < curpgcnt; j++) { + ASSERT(PAGE_EXCL(ppa[j])); + ppa[j]->p_szc = 0; + } + for (j = 0; j < curpgcnt; j++) { + ASSERT(!hat_page_is_mapped( + ppa[j])); + /*LINTED*/ + VN_DISPOSE(ppa[j], B_INVAL, 0, + kcred); + } + } + kmem_free(ppa, ppasize); + for (j = i; j < i + curpgcnt; j++) { + ap = anon_get_ptr(ahp, an_idx + j); + ASSERT(ap != NULL && + ap->an_refcnt == 1); + ahm = &anonhash_lock[AH_LOCK(ap->an_vp, + ap->an_off)]; + (void) anon_set_ptr(ahp, an_idx + j, + NULL, ANON_SLEEP); + mutex_enter(ahm); + ap->an_refcnt--; + ASSERT(ap->an_refcnt == 0); + anon_rmhash(ap); + if (ap->an_pvp) + swap_phys_free(ap->an_pvp, + ap->an_poff, PAGESIZE); + mutex_exit(ahm); + kmem_cache_free(anon_cache, ap); + ANI_ADD(1); + } + i += curpgcnt; + } + } else { + VM_STAT_ADD(anonvmstats.decrefpages[8]); + (void) anon_set_ptr(ahp, an_idx + i, NULL, ANON_SLEEP); + ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; + mutex_enter(ahm); + ap->an_refcnt--; + mutex_exit(ahm); + i++; + } + } + + if (ahmpages != NULL) { + mutex_exit(ahmpages); + } +} + +/* + * Duplicate references to size bytes worth of anon pages. + * Used when duplicating a segment that contains private anon pages. + * This code assumes that procedure calling this one has already used + * hat_chgprot() to disable write access to the range of addresses that + * that *old actually refers to. + */ +void +anon_dup(struct anon_hdr *old, ulong_t old_idx, struct anon_hdr *new, + ulong_t new_idx, size_t size) +{ + spgcnt_t npages; + kmutex_t *ahm; + struct anon *ap; + ulong_t off; + ulong_t index; + + npages = btopr(size); + while (npages > 0) { + index = old_idx; + if ((ap = anon_get_next_ptr(old, &index)) == NULL) + break; + + ASSERT(!ANON_ISBUSY(anon_get_slot(old, index))); + off = index - old_idx; + npages -= off; + if (npages <= 0) + break; + + (void) anon_set_ptr(new, new_idx + off, ap, ANON_SLEEP); + ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; + + mutex_enter(ahm); + ap->an_refcnt++; + mutex_exit(ahm); + + off++; + new_idx += off; + old_idx += off; + npages--; + } +} + +/* + * Just like anon_dup but also guarantees there are no holes (unallocated anon + * slots) within any large page region. That means if a large page region is + * empty in the old array it will skip it. If there are 1 or more valid slots + * in the large page region of the old array it will make sure to fill in any + * unallocated ones and also copy them to the new array. If noalloc is 1 large + * page region should either have no valid anon slots or all slots should be + * valid. + */ +void +anon_dup_fill_holes( + struct anon_hdr *old, + ulong_t old_idx, + struct anon_hdr *new, + ulong_t new_idx, + size_t size, + uint_t szc, + int noalloc) +{ + struct anon *ap; + spgcnt_t npages; + kmutex_t *ahm, *ahmpages = NULL; + pgcnt_t pgcnt, i; + ulong_t index, off; +#ifdef DEBUG + int refcnt; +#endif + + ASSERT(szc != 0); + pgcnt = page_get_pagecnt(szc); + ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); + npages = btopr(size); + ASSERT(IS_P2ALIGNED(npages, pgcnt)); + ASSERT(IS_P2ALIGNED(old_idx, pgcnt)); + + VM_STAT_ADD(anonvmstats.dupfillholes[0]); + + while (npages > 0) { + index = old_idx; + + /* + * Find the next valid slot. + */ + if (anon_get_next_ptr(old, &index) == NULL) + break; + + ASSERT(!ANON_ISBUSY(anon_get_slot(old, index))); + /* + * Now backup index to the beginning of the + * current large page region of the old array. 
+ */ + index = P2ALIGN(index, pgcnt); + off = index - old_idx; + ASSERT(IS_P2ALIGNED(off, pgcnt)); + npages -= off; + if (npages <= 0) + break; + + /* + * Fill and copy a large page regions worth + * of anon slots. + */ + for (i = 0; i < pgcnt; i++) { + if ((ap = anon_get_ptr(old, index + i)) == NULL) { + if (noalloc) { + panic("anon_dup_fill_holes: " + "empty anon slot\n"); + } + VM_STAT_ADD(anonvmstats.dupfillholes[1]); + ap = anon_alloc(NULL, 0); + (void) anon_set_ptr(old, index + i, ap, + ANON_SLEEP); + } else if (i == 0) { + /* + * make the increment of all refcnts of all + * anon slots of a large page appear atomic by + * getting an anonpages_hash_lock for the + * first anon slot of a large page. + */ + int hash = AH_LOCK(ap->an_vp, ap->an_off); + + VM_STAT_ADD(anonvmstats.dupfillholes[2]); + + ahmpages = &anonpages_hash_lock[hash]; + mutex_enter(ahmpages); + /*LINTED*/ + ASSERT(refcnt = ap->an_refcnt); + + VM_STAT_COND_ADD(ap->an_refcnt > 1, + anonvmstats.dupfillholes[3]); + } + (void) anon_set_ptr(new, new_idx + off + i, ap, + ANON_SLEEP); + ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; + mutex_enter(ahm); + ASSERT(ahmpages != NULL || ap->an_refcnt == 1); + ASSERT(i == 0 || ahmpages == NULL || + refcnt == ap->an_refcnt); + ap->an_refcnt++; + mutex_exit(ahm); + } + if (ahmpages != NULL) { + mutex_exit(ahmpages); + ahmpages = NULL; + } + off += pgcnt; + new_idx += off; + old_idx += off; + npages -= pgcnt; + } +} + +/* + * Used when a segment with a vnode changes szc. similarly to + * anon_dup_fill_holes() makes sure each large page region either has no anon + * slots or all of them. but new slots are created by COWing the file + * pages. on entrance no anon slots should be shared. + */ +int +anon_fill_cow_holes( + struct seg *seg, + caddr_t addr, + struct anon_hdr *ahp, + ulong_t an_idx, + struct vnode *vp, + u_offset_t vp_off, + size_t size, + uint_t szc, + uint_t prot, + struct vpage vpage[], + struct cred *cred) +{ + struct anon *ap; + spgcnt_t npages; + pgcnt_t pgcnt, i; + ulong_t index, off; + int err = 0; + int pageflags = 0; + + ASSERT(szc != 0); + pgcnt = page_get_pagecnt(szc); + ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); + npages = btopr(size); + ASSERT(IS_P2ALIGNED(npages, pgcnt)); + ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); + + while (npages > 0) { + index = an_idx; + + /* + * Find the next valid slot. + */ + if (anon_get_next_ptr(ahp, &index) == NULL) { + break; + } + + ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); + /* + * Now backup index to the beginning of the + * current large page region of the anon array. + */ + index = P2ALIGN(index, pgcnt); + off = index - an_idx; + ASSERT(IS_P2ALIGNED(off, pgcnt)); + npages -= off; + if (npages <= 0) + break; + an_idx += off; + vp_off += ptob(off); + addr += ptob(off); + if (vpage != NULL) { + vpage += off; + } + + for (i = 0; i < pgcnt; i++, an_idx++, vp_off += PAGESIZE) { + if ((ap = anon_get_ptr(ahp, an_idx)) == NULL) { + page_t *pl[1 + 1]; + page_t *pp; + + err = VOP_GETPAGE(vp, vp_off, PAGESIZE, NULL, + pl, PAGESIZE, seg, addr, S_READ, cred); + if (err) { + break; + } + if (vpage != NULL) { + prot = VPP_PROT(vpage); + pageflags = VPP_ISPPLOCK(vpage) ? 
+ LOCK_PAGE : 0; + } + pp = anon_private(&ap, seg, addr, prot, pl[0], + pageflags, cred); + if (pp == NULL) { + err = ENOMEM; + break; + } + (void) anon_set_ptr(ahp, an_idx, ap, + ANON_SLEEP); + page_unlock(pp); + } + ASSERT(ap->an_refcnt == 1); + addr += PAGESIZE; + if (vpage != NULL) { + vpage++; + } + } + npages -= pgcnt; + } + + return (err); +} + +/* + * Free a group of "size" anon pages, size in bytes, + * and clear out the pointers to the anon entries. + */ +void +anon_free(struct anon_hdr *ahp, ulong_t index, size_t size) +{ + spgcnt_t npages; + struct anon *ap; + ulong_t old; + + npages = btopr(size); + + while (npages > 0) { + old = index; + if ((ap = anon_get_next_ptr(ahp, &index)) == NULL) + break; + + ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); + npages -= index - old; + if (npages <= 0) + break; + + (void) anon_set_ptr(ahp, index, NULL, ANON_SLEEP); + anon_decref(ap); + /* + * Bump index and decrement page count + */ + index++; + npages--; + } +} + +void +anon_free_pages( + struct anon_hdr *ahp, + ulong_t an_idx, + size_t size, + uint_t szc) +{ + spgcnt_t npages; + pgcnt_t pgcnt; + ulong_t index, off; + + ASSERT(szc != 0); + pgcnt = page_get_pagecnt(szc); + ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); + npages = btopr(size); + ASSERT(IS_P2ALIGNED(npages, pgcnt)); + ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); + + VM_STAT_ADD(anonvmstats.freepages[0]); + + while (npages > 0) { + index = an_idx; + + /* + * Find the next valid slot. + */ + if (anon_get_next_ptr(ahp, &index) == NULL) + break; + + ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); + /* + * Now backup index to the beginning of the + * current large page region of the old array. + */ + index = P2ALIGN(index, pgcnt); + off = index - an_idx; + ASSERT(IS_P2ALIGNED(off, pgcnt)); + npages -= off; + if (npages <= 0) + break; + + anon_decref_pages(ahp, index, szc); + + off += pgcnt; + an_idx += off; + npages -= pgcnt; + } +} + +/* + * Make anonymous pages discardable + */ +void +anon_disclaim(struct anon_map *amp, ulong_t index, size_t size, int flags) +{ + spgcnt_t npages = btopr(size); + struct anon *ap; + struct vnode *vp; + anoff_t off; + page_t *pp, *root_pp; + kmutex_t *ahm; + pgcnt_t pgcnt; + ulong_t old_idx, idx, i; + struct anon_hdr *ahp = amp->ahp; + anon_sync_obj_t cookie; + + ASSERT(RW_READ_HELD(&->a_rwlock)); + pgcnt = 1; + for (; npages > 0; index = (pgcnt == 1) ? index + 1: + P2ROUNDUP(index + 1, pgcnt), npages -= pgcnt) { + + /* + * get anon pointer and index for the first valid entry + * in the anon list, starting from "index" + */ + old_idx = index; + if ((ap = anon_get_next_ptr(ahp, &index)) == NULL) + break; + + /* + * decrement npages by number of NULL anon slots we skipped + */ + npages -= index - old_idx; + if (npages <= 0) + break; + + anon_array_enter(amp, index, &cookie); + ap = anon_get_ptr(ahp, index); + ASSERT(ap != NULL); + + /* + * Get anonymous page and try to lock it SE_EXCL; + * For non blocking case if we couldn't grab the lock + * we skip to next page. + * For blocking case (ANON_PGLOOKUP_BLK) block + * until we grab SE_EXCL lock. + */ + swap_xlate(ap, &vp, &off); + if (flags & ANON_PGLOOKUP_BLK) + pp = page_lookup_create(vp, (u_offset_t)off, + SE_EXCL, NULL, NULL, SE_EXCL_WANTED); + else + pp = page_lookup_nowait(vp, (u_offset_t)off, SE_EXCL); + if (pp == NULL) { + segadvstat.MADV_FREE_miss.value.ul++; + pgcnt = 1; + anon_array_exit(&cookie); + continue; + } + pgcnt = page_get_pagecnt(pp->p_szc); + + /* + * we cannot free a page which is permanently locked. 
+ * The page_struct_lock need not be acquired to examine + * these fields since the page has an "exclusive" lock. + */ + if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { + page_unlock(pp); + segadvstat.MADV_FREE_miss.value.ul++; + anon_array_exit(&cookie); + continue; + } + + ahm = &anonhash_lock[AH_LOCK(vp, off)]; + mutex_enter(ahm); + ASSERT(ap->an_refcnt != 0); + /* + * skip this one if copy-on-write is not yet broken. + */ + if (ap->an_refcnt > 1) { + mutex_exit(ahm); + page_unlock(pp); + segadvstat.MADV_FREE_miss.value.ul++; + anon_array_exit(&cookie); + continue; + } + + if (pp->p_szc == 0) { + pgcnt = 1; + + /* + * free swap slot; + */ + if (ap->an_pvp) { + swap_phys_free(ap->an_pvp, ap->an_poff, + PAGESIZE); + ap->an_pvp = NULL; + ap->an_poff = 0; + } + mutex_exit(ahm); + segadvstat.MADV_FREE_hit.value.ul++; + + /* + * while we are at it, unload all the translations + * and attempt to free the page. + */ + (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); + /*LINTED: constant in conditional context */ + VN_DISPOSE(pp, B_FREE, 0, kcred); + anon_array_exit(&cookie); + continue; + } + + pgcnt = page_get_pagecnt(pp->p_szc); + if (!IS_P2ALIGNED(index, pgcnt)) { + if (!page_try_demote_pages(pp)) { + mutex_exit(ahm); + page_unlock(pp); + segadvstat.MADV_FREE_miss.value.ul++; + anon_array_exit(&cookie); + continue; + } else { + pgcnt = 1; + if (ap->an_pvp) { + swap_phys_free(ap->an_pvp, + ap->an_poff, PAGESIZE); + ap->an_pvp = NULL; + ap->an_poff = 0; + } + mutex_exit(ahm); + (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); + /*LINTED*/ + VN_DISPOSE(pp, B_FREE, 0, kcred); + segadvstat.MADV_FREE_hit.value.ul++; + anon_array_exit(&cookie); + continue; + } + } + mutex_exit(ahm); + root_pp = pp; + + /* + * try to lock remaining pages + */ + for (idx = 1; idx < pgcnt; idx++) { + pp = page_next(pp); + if (!page_trylock(pp, SE_EXCL)) + break; + if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { + page_unlock(pp); + break; + } + } + + if (idx == pgcnt) { + for (i = 0; i < pgcnt; i++) { + ap = anon_get_ptr(ahp, index + i); + if (ap == NULL) + break; + swap_xlate(ap, &vp, &off); + ahm = &anonhash_lock[AH_LOCK(vp, off)]; + mutex_enter(ahm); + ASSERT(ap->an_refcnt != 0); + + /* + * skip this one if copy-on-write + * is not yet broken. + */ + if (ap->an_refcnt > 1) { + mutex_exit(ahm); + goto skiplp; + } + if (ap->an_pvp) { + swap_phys_free(ap->an_pvp, + ap->an_poff, PAGESIZE); + ap->an_pvp = NULL; + ap->an_poff = 0; + } + mutex_exit(ahm); + } + page_destroy_pages(root_pp); + segadvstat.MADV_FREE_hit.value.ul += pgcnt; + anon_array_exit(&cookie); + continue; + } +skiplp: + segadvstat.MADV_FREE_miss.value.ul += pgcnt; + for (i = 0, pp = root_pp; i < idx; pp = page_next(pp), i++) + page_unlock(pp); + anon_array_exit(&cookie); + } +} + +/* + * Return the kept page(s) and protections back to the segment driver. + */ +int +anon_getpage( + struct anon **app, + uint_t *protp, + page_t *pl[], + size_t plsz, + struct seg *seg, + caddr_t addr, + enum seg_rw rw, + struct cred *cred) +{ + page_t *pp; + struct anon *ap = *app; + struct vnode *vp; + anoff_t off; + int err; + kmutex_t *ahm; + + swap_xlate(ap, &vp, &off); + + /* + * Lookup the page. If page is being paged in, + * wait for it to finish as we must return a list of + * pages since this routine acts like the VOP_GETPAGE + * routine does. 
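The rule applied just below (and again after the VOP_GETPAGE call) is that an anon page whose slot refcnt is greater than one must never be handed out writable, so the first store faults and breaks copy-on-write. A tiny self-contained sketch of that protection rule follows; the constants are renamed stand-ins so the example does not collide with system headers.

#include <stdio.h>

#define	XPROT_READ	0x1
#define	XPROT_WRITE	0x2
#define	XPROT_ALL	(XPROT_READ | XPROT_WRITE)

/*
 * A shared (refcnt > 1) anonymous page must be mapped read-only so
 * that the first write triggers a copy-on-write fault.
 */
static unsigned int
anon_prot(unsigned int refcnt)
{
	return (refcnt == 1 ? XPROT_ALL : (XPROT_ALL & ~XPROT_WRITE));
}

int
main(void)
{
	/* Prints: refcnt 1 -> 0x3, refcnt 3 -> 0x1 */
	printf("refcnt 1 -> %#x, refcnt 3 -> %#x\n",
	    anon_prot(1), anon_prot(3));
	return (0);
}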
+ */ + if (pl != NULL && (pp = page_lookup(vp, (u_offset_t)off, SE_SHARED))) { + ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; + mutex_enter(ahm); + if (ap->an_refcnt == 1) + *protp = PROT_ALL; + else + *protp = PROT_ALL & ~PROT_WRITE; + mutex_exit(ahm); + pl[0] = pp; + pl[1] = NULL; + return (0); + } + + /* + * Simply treat it as a vnode fault on the anon vp. + */ + + TRACE_3(TR_FAC_VM, TR_ANON_GETPAGE, + "anon_getpage:seg %x addr %x vp %x", + seg, addr, vp); + + err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, protp, pl, plsz, + seg, addr, rw, cred); + + if (err == 0 && pl != NULL) { + ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; + mutex_enter(ahm); + if (ap->an_refcnt != 1) + *protp &= ~PROT_WRITE; /* make read-only */ + mutex_exit(ahm); + } + return (err); +} + +/* + * Creates or returns kept pages to the segment driver. returns -1 if a large + * page cannot be allocated. returns -2 if some other process has allocated a + * larger page. + * + * For cowfault it will alocate any size pages to fill the requested area to + * avoid partially overwritting anon slots (i.e. sharing only some of the anon + * slots within a large page with other processes). This policy greatly + * simplifies large page freeing (which is only freed when all anon slot + * refcnts are 0). + */ +int +anon_map_getpages( + struct anon_map *amp, + ulong_t start_idx, + uint_t szc, + struct seg *seg, + caddr_t addr, + uint_t prot, + uint_t *protp, + page_t *ppa[], + uint_t *ppa_szc, + struct vpage vpage[], + enum seg_rw rw, + int brkcow, + int anypgsz, + struct cred *cred) +{ + pgcnt_t pgcnt; + struct anon *ap; + struct vnode *vp; + anoff_t off; + page_t *pp, *pl[2], *conpp = NULL; + caddr_t vaddr; + ulong_t pg_idx, an_idx, i; + spgcnt_t nreloc = 0; + int prealloc = 1; + int err, slotcreate; + uint_t vpprot; + +#if !defined(__i386) && !defined(__amd64) + ASSERT(seg->s_szc != 0); +#endif + ASSERT(szc <= seg->s_szc); + ASSERT(ppa_szc != NULL); + ASSERT(rw != S_CREATE); + + *protp = PROT_ALL; + + VM_STAT_ADD(anonvmstats.getpages[0]); + + if (szc == 0) { + VM_STAT_ADD(anonvmstats.getpages[1]); + if ((ap = anon_get_ptr(amp->ahp, start_idx)) != NULL) { + err = anon_getpage(&ap, protp, pl, PAGESIZE, seg, + addr, rw, cred); + if (err) + return (err); + ppa[0] = pl[0]; + if (brkcow == 0 || (*protp & PROT_WRITE)) { + VM_STAT_ADD(anonvmstats.getpages[2]); + if (ppa[0]->p_szc != 0) { + VM_STAT_ADD(anonvmstats.getpages[3]); + *ppa_szc = ppa[0]->p_szc; + page_unlock(ppa[0]); + return (-2); + } + return (0); + } + panic("anon_map_getpages: cowfault for szc 0"); + } else { + VM_STAT_ADD(anonvmstats.getpages[4]); + ppa[0] = anon_zero(seg, addr, &ap, cred); + if (ppa[0] == NULL) + return (ENOMEM); + (void) anon_set_ptr(amp->ahp, start_idx, ap, + ANON_SLEEP); + return (0); + } + } + + pgcnt = page_get_pagecnt(szc); + ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); + ASSERT(IS_P2ALIGNED(start_idx, pgcnt)); + + /* + * First we check for the case that the requtested large + * page or larger page already exists in the system. + * Actually we only check if the first constituent page + * exists and only preallocate if it's not found. 
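The check that follows looks up only the first constituent page and compares its size code against the request before deciding whether to preallocate. A condensed sketch of that decision, with invented enum names standing in for the function's -2 return and its prealloc flag:

#include <stdio.h>

enum getpages_action { PREALLOC, USE_EXISTING, RETRY_LARGER };

/*
 * Only the first constituent page is looked up; its size code decides
 * whether to preallocate a fresh large page.
 */
static enum getpages_action
prealloc_decision(int found, unsigned int existing_szc, unsigned int want_szc)
{
	if (!found)
		return (PREALLOC);	/* nothing cached: take a fresh large page */
	if (existing_szc > want_szc)
		return (RETRY_LARGER);	/* corresponds to the -2 return */
	if (existing_szc == want_szc)
		return (USE_EXISTING);	/* let VOP_GETPAGE return the pages */
	return (PREALLOC);		/* only a smaller page exists */
}

int
main(void)
{
	/* Prints: 0 2 1 */
	printf("%d %d %d\n",
	    prealloc_decision(0, 0, 2),
	    prealloc_decision(1, 3, 2),
	    prealloc_decision(1, 2, 2));
	return (0);
}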
+ */ + ap = anon_get_ptr(amp->ahp, start_idx); + if (ap) { + uint_t pszc; + swap_xlate(ap, &vp, &off); + if (page_exists_forreal(vp, (u_offset_t)off, &pszc)) { + if (pszc > szc) { + *ppa_szc = pszc; + return (-2); + } + if (pszc == szc) { + prealloc = 0; + } + } + } + + VM_STAT_COND_ADD(prealloc == 0, anonvmstats.getpages[5]); + VM_STAT_COND_ADD(prealloc != 0, anonvmstats.getpages[6]); + +top: + /* + * If a smaller page or no page at all was found, + * grab a large page off the freelist. + */ + if (prealloc) { + ASSERT(conpp == NULL); + if (page_alloc_pages(seg, addr, NULL, ppa, szc, 0) != 0) { + VM_STAT_ADD(anonvmstats.getpages[7]); + if (brkcow == 0 || + !anon_share(amp->ahp, start_idx, pgcnt)) { + /* + * If the refcnt's of all anon slots are <= 1 + * they can't increase since we are holding + * the address space's lock. So segvn can + * safely decrease szc without risking to + * generate a cow fault for the region smaller + * than the segment's largest page size. + */ + VM_STAT_ADD(anonvmstats.getpages[8]); + return (-1); + } + docow: + /* + * This is a cow fault. Copy away the entire 1 large + * page region of this segment. + */ + if (szc != seg->s_szc) + panic("anon_map_getpages: cowfault for szc %d", + szc); + vaddr = addr; + for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt; + pg_idx++, an_idx++, vaddr += PAGESIZE) { + if ((ap = anon_get_ptr(amp->ahp, an_idx)) != + NULL) { + err = anon_getpage(&ap, &vpprot, pl, + PAGESIZE, seg, vaddr, rw, cred); + if (err) { + for (i = 0; i < pg_idx; i++) { + if ((pp = ppa[i]) != + NULL) + page_unlock(pp); + } + return (err); + } + ppa[pg_idx] = pl[0]; + } else { + /* + * Since this is a cowfault we know + * that this address space has a + * parent or children which means + * anon_dup_fill_holes() has initialized + * all anon slots within a large page + * region that had at least one anon + * slot at the time of fork(). + */ + panic("anon_map_getpages: " + "cowfault but anon slot is empty"); + } + } + VM_STAT_ADD(anonvmstats.getpages[9]); + *protp = PROT_ALL; + return (anon_map_privatepages(amp, start_idx, szc, seg, + addr, prot, ppa, vpage, anypgsz, cred)); + } + } + + VM_STAT_ADD(anonvmstats.getpages[10]); + + an_idx = start_idx; + pg_idx = 0; + vaddr = addr; + while (pg_idx < pgcnt) { + slotcreate = 0; + if ((ap = anon_get_ptr(amp->ahp, an_idx)) == NULL) { + VM_STAT_ADD(anonvmstats.getpages[11]); + /* + * For us to have decided not to preallocate + * would have meant that a large page + * was found. Which also means that all of the + * anon slots for that page would have been + * already created for us. + */ + if (prealloc == 0) + panic("anon_map_getpages: prealloc = 0"); + + slotcreate = 1; + ap = anon_alloc(NULL, 0); + } + swap_xlate(ap, &vp, &off); + + /* + * Now setup our preallocated page to pass down + * to swap_getpage(). + */ + if (prealloc) { + ASSERT(ppa[pg_idx]->p_szc == szc); + conpp = ppa[pg_idx]; + } + ASSERT(prealloc || conpp == NULL); + + /* + * If we just created this anon slot then call + * with S_CREATE to prevent doing IO on the page. + * Similar to the anon_zero case. + */ + err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, + NULL, pl, PAGESIZE, conpp, &nreloc, seg, vaddr, + slotcreate == 1 ? 
S_CREATE : rw, cred); + + if (err) { + VM_STAT_ADD(anonvmstats.getpages[12]); + ASSERT(slotcreate == 0); + goto io_err; + } + + pp = pl[0]; + + if (pp->p_szc != szc) { + VM_STAT_ADD(anonvmstats.getpages[13]); + ASSERT(slotcreate == 0); + ASSERT(prealloc == 0); + ASSERT(pg_idx == 0); + if (pp->p_szc > szc) { + page_unlock(pp); + VM_STAT_ADD(anonvmstats.getpages[14]); + return (-2); + } + page_unlock(pp); + prealloc = 1; + goto top; + } + + /* + * If we decided to preallocate but VOP_GETPAGE + * found a page in the system that satisfies our + * request then free up our preallocated large page + * and continue looping accross the existing large + * page via VOP_GETPAGE. + */ + if (prealloc && pp != ppa[pg_idx]) { + VM_STAT_ADD(anonvmstats.getpages[15]); + ASSERT(slotcreate == 0); + ASSERT(pg_idx == 0); + conpp = NULL; + prealloc = 0; + page_free_pages(ppa[0]); + } + + if (prealloc && nreloc > 1) { + /* + * we have relocated out of a smaller large page. + * skip npgs - 1 iterations and continue which will + * increment by one the loop indices. + */ + spgcnt_t npgs = nreloc; + + VM_STAT_ADD(anonvmstats.getpages[16]); + + ASSERT(pp == ppa[pg_idx]); + ASSERT(slotcreate == 0); + ASSERT(pg_idx + npgs <= pgcnt); + if ((*protp & PROT_WRITE) && + anon_share(amp->ahp, an_idx, npgs)) { + *protp &= ~PROT_WRITE; + } + pg_idx += npgs; + an_idx += npgs; + vaddr += PAGESIZE * npgs; + continue; + } + + VM_STAT_ADD(anonvmstats.getpages[17]); + + /* + * Anon_zero case. + */ + if (slotcreate) { + ASSERT(prealloc); + pagezero(pp, 0, PAGESIZE); + CPU_STATS_ADD_K(vm, zfod, 1); + hat_setrefmod(pp); + } + + ASSERT(prealloc == 0 || ppa[pg_idx] == pp); + ASSERT(prealloc != 0 || PAGE_SHARED(pp)); + ASSERT(prealloc == 0 || PAGE_EXCL(pp)); + + if (pg_idx > 0 && + ((page_pptonum(pp) != page_pptonum(ppa[pg_idx - 1]) + 1) || + (pp->p_szc != ppa[pg_idx - 1]->p_szc))) + panic("anon_map_getpages: unexpected page"); + + if (prealloc == 0) { + ppa[pg_idx] = pp; + } + + if (ap->an_refcnt > 1) { + VM_STAT_ADD(anonvmstats.getpages[18]); + *protp &= ~PROT_WRITE; + } + + /* + * If this is a new anon slot then initialize + * the anon array entry. + */ + if (slotcreate) { + (void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP); + } + pg_idx++; + an_idx++; + vaddr += PAGESIZE; + } + + /* + * Since preallocated pages come off the freelist + * they are locked SE_EXCL. Simply downgrade and return. + */ + if (prealloc) { + VM_STAT_ADD(anonvmstats.getpages[19]); + conpp = NULL; + for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { + page_downgrade(ppa[pg_idx]); + } + } + ASSERT(conpp == NULL); + + if (brkcow == 0 || (*protp & PROT_WRITE)) { + VM_STAT_ADD(anonvmstats.getpages[20]); + return (0); + } + + if (szc < seg->s_szc) + panic("anon_map_getpages: cowfault for szc %d", szc); + + VM_STAT_ADD(anonvmstats.getpages[21]); + + *protp = PROT_ALL; + return (anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, + ppa, vpage, anypgsz, cred)); +io_err: + /* + * We got an IO error somewhere in our large page. + * If we were using a preallocated page then just demote + * all the constituent pages that we've succeeded with sofar + * to PAGESIZE pages and leave them in the system + * unlocked. 
+ */ + + ASSERT(err != -2 || pg_idx == 0); + + VM_STAT_COND_ADD(err > 0, anonvmstats.getpages[22]); + VM_STAT_COND_ADD(err == -1, anonvmstats.getpages[23]); + VM_STAT_COND_ADD(err == -2, anonvmstats.getpages[24]); + + if (prealloc) { + conpp = NULL; + if (pg_idx > 0) { + VM_STAT_ADD(anonvmstats.getpages[25]); + for (i = 0; i < pgcnt; i++) { + pp = ppa[i]; + ASSERT(PAGE_EXCL(pp)); + ASSERT(pp->p_szc == szc); + pp->p_szc = 0; + } + for (i = 0; i < pg_idx; i++) { + ASSERT(!hat_page_is_mapped(ppa[i])); + page_unlock(ppa[i]); + } + /* + * Now free up the remaining unused constituent + * pages. + */ + while (pg_idx < pgcnt) { + ASSERT(!hat_page_is_mapped(ppa[pg_idx])); + page_free(ppa[pg_idx], 0); + pg_idx++; + } + } else { + VM_STAT_ADD(anonvmstats.getpages[26]); + page_free_pages(ppa[0]); + } + } else { + VM_STAT_ADD(anonvmstats.getpages[27]); + ASSERT(err > 0); + for (i = 0; i < pg_idx; i++) + page_unlock(ppa[i]); + } + ASSERT(conpp == NULL); + if (err != -1) + return (err); + /* + * we are here because we failed to relocate. + */ + ASSERT(prealloc); + if (brkcow == 0 || !anon_share(amp->ahp, start_idx, pgcnt)) { + VM_STAT_ADD(anonvmstats.getpages[28]); + return (-1); + } + VM_STAT_ADD(anonvmstats.getpages[29]); + goto docow; +} + + +/* + * Turn a reference to an object or shared anon page + * into a private page with a copy of the data from the + * original page which is always locked by the caller. + * This routine unloads the translation and unlocks the + * original page, if it isn't being stolen, before returning + * to the caller. + * + * NOTE: The original anon slot is not freed by this routine + * It must be freed by the caller while holding the + * "anon_map" lock to prevent races which can occur if + * a process has multiple lwps in its address space. + */ +page_t * +anon_private( + struct anon **app, + struct seg *seg, + caddr_t addr, + uint_t prot, + page_t *opp, + int oppflags, + struct cred *cred) +{ + struct anon *old = *app; + struct anon *new; + page_t *pp = NULL; + struct vnode *vp; + anoff_t off; + page_t *anon_pl[1 + 1]; + int err; + + if (oppflags & STEAL_PAGE) + ASSERT(PAGE_EXCL(opp)); + else + ASSERT(PAGE_LOCKED(opp)); + + CPU_STATS_ADD_K(vm, cow_fault, 1); + + /* Kernel probe */ + TNF_PROBE_1(anon_private, "vm pagefault", /* CSTYLED */, + tnf_opaque, address, addr); + + *app = new = anon_alloc(NULL, 0); + swap_xlate(new, &vp, &off); + + if (oppflags & STEAL_PAGE) { + page_rename(opp, vp, (u_offset_t)off); + pp = opp; + TRACE_5(TR_FAC_VM, TR_ANON_PRIVATE, + "anon_private:seg %p addr %x pp %p vp %p off %lx", + seg, addr, pp, vp, off); + hat_setmod(pp); + + /* bug 4026339 */ + page_downgrade(pp); + return (pp); + } + + /* + * Call the VOP_GETPAGE routine to create the page, thereby + * enabling the vnode driver to allocate any filesystem + * space (e.g., disk block allocation for UFS). This also + * prevents more than one page from being added to the + * vnode at the same time. + */ + err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, NULL, + anon_pl, PAGESIZE, seg, addr, S_CREATE, cred); + if (err) + goto out; + + pp = anon_pl[0]; + + /* + * If the original page was locked, we need to move the lock + * to the new page by transfering 'cowcnt/lckcnt' of the original + * page to 'cowcnt/lckcnt' of the new page. + * + * See Statement at the beginning of segvn_lockop() and + * comments in page_pp_useclaim() regarding the way + * cowcnts/lckcnts are handled. + * + * Also availrmem must be decremented up front for read only mapping + * before calling page_pp_useclaim. 
page_pp_useclaim will bump it back + * if availrmem did not need to be decremented after all. + */ + if (oppflags & LOCK_PAGE) { + if ((prot & PROT_WRITE) == 0) { + mutex_enter(&freemem_lock); + if (availrmem > pages_pp_maximum) { + availrmem--; + pages_useclaim++; + } else { + mutex_exit(&freemem_lock); + goto out; + } + mutex_exit(&freemem_lock); + } + page_pp_useclaim(opp, pp, prot & PROT_WRITE); + } + + /* + * Now copy the contents from the original page, + * which is locked and loaded in the MMU by + * the caller to prevent yet another page fault. + */ + ppcopy(opp, pp); /* XXX - should set mod bit in here */ + + hat_setrefmod(pp); /* mark as modified */ + + /* + * Unload the old translation. + */ + hat_unload(seg->s_as->a_hat, addr, PAGESIZE, HAT_UNLOAD); + + /* + * Free unmapped, unmodified original page. + * or release the lock on the original page, + * otherwise the process will sleep forever in + * anon_decref() waiting for the "exclusive" lock + * on the page. + */ + (void) page_release(opp, 1); + + /* + * we are done with page creation so downgrade the new + * page's selock to shared, this helps when multiple + * as_fault(...SOFTLOCK...) are done to the same + * page(aio) + */ + page_downgrade(pp); + + /* + * NOTE: The original anon slot must be freed by the + * caller while holding the "anon_map" lock, if we + * copied away from an anonymous page. + */ + return (pp); + +out: + *app = old; + if (pp) + page_unlock(pp); + anon_decref(new); + page_unlock(opp); + return ((page_t *)NULL); +} + +int +anon_map_privatepages( + struct anon_map *amp, + ulong_t start_idx, + uint_t szc, + struct seg *seg, + caddr_t addr, + uint_t prot, + page_t *ppa[], + struct vpage vpage[], + int anypgsz, + struct cred *cred) +{ + pgcnt_t pgcnt; + struct vnode *vp; + anoff_t off; + page_t *pl[2], *conpp = NULL; + int err; + int prealloc = 1; + struct anon *ap, *oldap; + caddr_t vaddr; + page_t *pplist, *pp; + ulong_t pg_idx, an_idx; + spgcnt_t nreloc = 0; + int pagelock = 0; + kmutex_t *ahmpages = NULL; +#ifdef DEBUG + int refcnt; +#endif + + ASSERT(szc != 0); + ASSERT(szc == seg->s_szc); + + VM_STAT_ADD(anonvmstats.privatepages[0]); + + pgcnt = page_get_pagecnt(szc); + ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); + ASSERT(IS_P2ALIGNED(start_idx, pgcnt)); + + ASSERT(amp != NULL); + ap = anon_get_ptr(amp->ahp, start_idx); + ASSERT(ap == NULL || ap->an_refcnt >= 1); + + VM_STAT_COND_ADD(ap == NULL, anonvmstats.privatepages[1]); + + /* + * Now try and allocate the large page. If we fail then just + * let VOP_GETPAGE give us PAGESIZE pages. Normally we let + * the caller make this decision but to avoid added complexity + * it's simplier to handle that case here. + */ + if (anypgsz == -1) { + VM_STAT_ADD(anonvmstats.privatepages[2]); + prealloc = 0; + } else if (page_alloc_pages(seg, addr, &pplist, NULL, szc, + anypgsz) != 0) { + VM_STAT_ADD(anonvmstats.privatepages[3]); + prealloc = 0; + } + + /* + * make the decrement of all refcnts of all + * anon slots of a large page appear atomic by + * getting an anonpages_hash_lock for the + * first anon slot of a large page. 
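The "appear atomic" trick described in the comment above relies on hashing the identity of the first anon slot onto one mutex out of a fixed array, so every thread working on that large page contends on the same lock. A user-space pthreads sketch of that hashed-lock pattern follows; the array size, hash function, and names are invented for illustration and do not reproduce AH_LOCK() or anonpages_hash_lock.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define	NLOCKS	64			/* power of two, like the hash array above */
static pthread_mutex_t hash_lock[NLOCKS];

/*
 * Pick one lock out of a fixed array based on an (object, offset) pair,
 * so that all operations on the same pair serialize on the same mutex.
 * The hash below is arbitrary and only for demonstration.
 */
static pthread_mutex_t *
pick_lock(const void *obj, uint64_t off)
{
	uintptr_t h = (uintptr_t)obj ^ (uintptr_t)(off >> 12);
	return (&hash_lock[h & (NLOCKS - 1)]);
}

int
main(void)
{
	int dummy;
	int i;

	for (i = 0; i < NLOCKS; i++)
		pthread_mutex_init(&hash_lock[i], NULL);

	pthread_mutex_t *m = pick_lock(&dummy, 0x2000);
	pthread_mutex_lock(m);
	/* ... refcnt updates for every slot of the large page go here ... */
	pthread_mutex_unlock(m);
	printf("picked lock index %ld\n", (long)(m - hash_lock));
	return (0);
}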
+ */ + if (ap != NULL) { + ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, + ap->an_off)]; + mutex_enter(ahmpages); + if (ap->an_refcnt == 1) { + VM_STAT_ADD(anonvmstats.privatepages[4]); + ASSERT(!anon_share(amp->ahp, start_idx, pgcnt)); + mutex_exit(ahmpages); + + if (prealloc) { + page_free_replacement_page(pplist); + page_create_putback(pgcnt); + } + ASSERT(ppa[0]->p_szc <= szc); + if (ppa[0]->p_szc == szc) { + VM_STAT_ADD(anonvmstats.privatepages[5]); + return (0); + } + for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { + ASSERT(ppa[pg_idx] != NULL); + page_unlock(ppa[pg_idx]); + } + return (-1); + } + } + + /* + * If we are passed in the vpage array and this is + * not PROT_WRITE then we need to decrement availrmem + * up front before we try anything. If we need to and + * can't decrement availrmem then its better to fail now + * than in the middle of processing the new large page. + * page_pp_usclaim() on behalf of each constituent page + * below will adjust availrmem back for the cases not needed. + */ + if (vpage != NULL && (prot & PROT_WRITE) == 0) { + for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { + if (VPP_ISPPLOCK(&vpage[pg_idx])) { + pagelock = 1; + break; + } + } + if (pagelock) { + VM_STAT_ADD(anonvmstats.privatepages[6]); + mutex_enter(&freemem_lock); + if (availrmem >= pages_pp_maximum + pgcnt) { + availrmem -= pgcnt; + pages_useclaim += pgcnt; + } else { + VM_STAT_ADD(anonvmstats.privatepages[7]); + mutex_exit(&freemem_lock); + if (ahmpages != NULL) { + mutex_exit(ahmpages); + } + if (prealloc) { + page_free_replacement_page(pplist); + page_create_putback(pgcnt); + } + for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) + if (ppa[pg_idx] != NULL) + page_unlock(ppa[pg_idx]); + return (ENOMEM); + } + mutex_exit(&freemem_lock); + } + } + + CPU_STATS_ADD_K(vm, cow_fault, pgcnt); + + VM_STAT_ADD(anonvmstats.privatepages[8]); + + an_idx = start_idx; + pg_idx = 0; + vaddr = addr; + for (; pg_idx < pgcnt; pg_idx++, an_idx++, vaddr += PAGESIZE) { + ASSERT(ppa[pg_idx] != NULL); + oldap = anon_get_ptr(amp->ahp, an_idx); + ASSERT(ahmpages != NULL || oldap == NULL); + ASSERT(ahmpages == NULL || oldap != NULL); + ASSERT(ahmpages == NULL || oldap->an_refcnt > 1); + ASSERT(ahmpages == NULL || pg_idx != 0 || + (refcnt = oldap->an_refcnt)); + ASSERT(ahmpages == NULL || pg_idx == 0 || + refcnt == oldap->an_refcnt); + + ap = anon_alloc(NULL, 0); + + swap_xlate(ap, &vp, &off); + + /* + * Now setup our preallocated page to pass down to + * swap_getpage(). + */ + if (prealloc) { + pp = pplist; + page_sub(&pplist, pp); + conpp = pp; + } + + err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, NULL, pl, + PAGESIZE, conpp, &nreloc, seg, vaddr, S_CREATE, cred); + + /* + * Impossible to fail this is S_CREATE. + */ + if (err) + panic("anon_map_privatepages: VOP_GETPAGE failed"); + + ASSERT(prealloc ? pp == pl[0] : pl[0]->p_szc == 0); + ASSERT(prealloc == 0 || nreloc == 1); + + pp = pl[0]; + + /* + * If the original page was locked, we need to move + * the lock to the new page by transfering + * 'cowcnt/lckcnt' of the original page to 'cowcnt/lckcnt' + * of the new page. pg_idx can be used to index + * into the vpage array since the caller will guarentee + * that vpage struct passed in corresponds to addr + * and forward. + */ + if (vpage != NULL && VPP_ISPPLOCK(&vpage[pg_idx])) { + page_pp_useclaim(ppa[pg_idx], pp, prot & PROT_WRITE); + } else if (pagelock) { + mutex_enter(&freemem_lock); + availrmem++; + pages_useclaim--; + mutex_exit(&freemem_lock); + } + + /* + * Now copy the contents from the original page. 
+ */ + ppcopy(ppa[pg_idx], pp); + + hat_setrefmod(pp); /* mark as modified */ + + /* + * Release the lock on the original page, + * derement the old slot, and down grade the lock + * on the new copy. + */ + page_unlock(ppa[pg_idx]); + + if (!prealloc) + page_downgrade(pp); + + ppa[pg_idx] = pp; + + /* + * Now reflect the copy in the new anon array. + */ + ASSERT(ahmpages == NULL || oldap->an_refcnt > 1); + if (oldap != NULL) + anon_decref(oldap); + (void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP); + } + if (ahmpages != NULL) { + mutex_exit(ahmpages); + } + ASSERT(prealloc == 0 || pplist == NULL); + if (prealloc) { + VM_STAT_ADD(anonvmstats.privatepages[9]); + for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { + page_downgrade(ppa[pg_idx]); + } + } + + /* + * Unload the old large page translation. + */ + hat_unload(seg->s_as->a_hat, addr, pgcnt << PAGESHIFT, HAT_UNLOAD); + return (0); +} + +/* + * Allocate a private zero-filled anon page. + */ +page_t * +anon_zero(struct seg *seg, caddr_t addr, struct anon **app, struct cred *cred) +{ + struct anon *ap; + page_t *pp; + struct vnode *vp; + anoff_t off; + page_t *anon_pl[1 + 1]; + int err; + + /* Kernel probe */ + TNF_PROBE_1(anon_zero, "vm pagefault", /* CSTYLED */, + tnf_opaque, address, addr); + + *app = ap = anon_alloc(NULL, 0); + swap_xlate(ap, &vp, &off); + + /* + * Call the VOP_GETPAGE routine to create the page, thereby + * enabling the vnode driver to allocate any filesystem + * dependent structures (e.g., disk block allocation for UFS). + * This also prevents more than on page from being added to + * the vnode at the same time since it is locked. + */ + err = VOP_GETPAGE(vp, off, PAGESIZE, NULL, + anon_pl, PAGESIZE, seg, addr, S_CREATE, cred); + if (err) { + *app = NULL; + anon_decref(ap); + return (NULL); + } + pp = anon_pl[0]; + + pagezero(pp, 0, PAGESIZE); /* XXX - should set mod bit */ + page_downgrade(pp); + CPU_STATS_ADD_K(vm, zfod, 1); + hat_setrefmod(pp); /* mark as modified so pageout writes back */ + return (pp); +} + + +/* + * Allocate array of private zero-filled anon pages for empty slots + * and kept pages for non empty slots within given range. + * + * NOTE: This rontine will try and use large pages + * if available and supported by underlying platform. + */ +int +anon_map_createpages( + struct anon_map *amp, + ulong_t start_index, + size_t len, + page_t *ppa[], + struct seg *seg, + caddr_t addr, + enum seg_rw rw, + struct cred *cred) +{ + + struct anon *ap; + struct vnode *ap_vp; + page_t *pp, *pplist, *anon_pl[1 + 1], *conpp = NULL; + int err = 0; + ulong_t p_index, index; + pgcnt_t npgs, pg_cnt; + spgcnt_t nreloc = 0; + uint_t l_szc, szc, prot; + anoff_t ap_off; + size_t pgsz; + lgrp_t *lgrp; + + /* + * XXX For now only handle S_CREATE. + */ + ASSERT(rw == S_CREATE); + + index = start_index; + p_index = 0; + npgs = btopr(len); + + /* + * If this platform supports multiple page sizes + * then try and allocate directly from the free + * list for pages larger than PAGESIZE. + * + * NOTE:When we have page_create_ru we can stop + * directly allocating from the freelist. + */ + l_szc = seg->s_szc; + ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); + while (npgs) { + + /* + * if anon slot already exists + * (means page has been created) + * so 1) look up the page + * 2) if the page is still in memory, get it. + * 3) if not, create a page and + * page in from physical swap device. + * These are done in anon_getpage(). 
+ */ + ap = anon_get_ptr(amp->ahp, index); + if (ap) { + err = anon_getpage(&ap, &prot, anon_pl, PAGESIZE, + seg, addr, S_READ, cred); + if (err) { + ANON_LOCK_EXIT(&->a_rwlock); + panic("anon_map_createpages: anon_getpage"); + } + pp = anon_pl[0]; + ppa[p_index++] = pp; + + addr += PAGESIZE; + index++; + npgs--; + continue; + } + /* + * Now try and allocate the largest page possible + * for the current address and range. + * Keep dropping down in page size until: + * + * 1) Properly aligned + * 2) Does not overlap existing anon pages + * 3) Fits in remaining range. + * 4) able to allocate one. + * + * NOTE: XXX When page_create_ru is completed this code + * will change. + */ + szc = l_szc; + pplist = NULL; + pg_cnt = 0; + while (szc) { + pgsz = page_get_pagesize(szc); + pg_cnt = pgsz >> PAGESHIFT; + if (IS_P2ALIGNED(addr, pgsz) && pg_cnt <= npgs && + anon_pages(amp->ahp, index, pg_cnt) == 0) { + /* + * XXX + * Since we are faking page_create() + * we also need to do the freemem and + * pcf accounting. + */ + (void) page_create_wait(pg_cnt, PG_WAIT); + + /* + * Get lgroup to allocate next page of shared + * memory from and use it to specify where to + * allocate the physical memory + */ + lgrp = lgrp_mem_choose(seg, addr, pgsz); + + pplist = page_get_freelist( + (struct vnode *)NULL, (u_offset_t)0, seg, + addr, pgsz, 0, lgrp); + + if (pplist == NULL) { + page_create_putback(pg_cnt); + } + + /* + * If a request for a page of size + * larger than PAGESIZE failed + * then don't try that size anymore. + */ + if (pplist == NULL) { + l_szc = szc - 1; + } else { + break; + } + } + szc--; + } + + /* + * If just using PAGESIZE pages then don't + * directly allocate from the free list. + */ + if (pplist == NULL) { + ASSERT(szc == 0); + pp = anon_zero(seg, addr, &ap, cred); + if (pp == NULL) { + ANON_LOCK_EXIT(&->a_rwlock); + panic("anon_map_createpages: anon_zero"); + } + ppa[p_index++] = pp; + + ASSERT(anon_get_ptr(amp->ahp, index) == NULL); + (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP); + + addr += PAGESIZE; + index++; + npgs--; + continue; + } + + /* + * pplist is a list of pg_cnt PAGESIZE pages. + * These pages are locked SE_EXCL since they + * came directly off the free list. 
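The page-size descent a few lines above keeps trying smaller size codes until the candidate size is aligned at the address, fits in the remaining range, and does not overlap existing anon slots. A simplified sketch of that loop; the per-szc sizes are hypothetical, and the overlap test is collapsed to a flag for brevity.

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define	XPAGESHIFT	12	/* assumed 4K base page for the sketch */

/* Hypothetical per-szc page sizes: szc 0 = 4K, 1 = 64K, 2 = 4M. */
static const size_t pagesize_by_szc[] = { 1 << 12, 1 << 16, 1 << 22 };

/*
 * Starting from the segment's largest size code, keep dropping until
 * the candidate size is aligned, fits, and does not overlap.
 */
static unsigned int
pick_szc(unsigned int l_szc, uintptr_t addr, size_t npgs_left, int overlap)
{
	unsigned int szc;

	for (szc = l_szc; szc > 0; szc--) {
		size_t pgsz = pagesize_by_szc[szc];
		size_t pg_cnt = pgsz >> XPAGESHIFT;

		if ((addr & (pgsz - 1)) == 0 && pg_cnt <= npgs_left && !overlap)
			return (szc);
	}
	return (0);	/* fall back to PAGESIZE pages */
}

int
main(void)
{
	/* 0x40010000 is 64K-aligned but not 4M-aligned, so szc 1 is picked. */
	printf("szc picked: %u\n", pick_szc(2, 0x40010000, 1024, 0));
	return (0);
}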
+ */ + ASSERT(IS_P2ALIGNED(pg_cnt, pg_cnt)); + ASSERT(IS_P2ALIGNED(index, pg_cnt)); + ASSERT(conpp == NULL); + while (pg_cnt--) { + + ap = anon_alloc(NULL, 0); + swap_xlate(ap, &ap_vp, &ap_off); + + ASSERT(pplist != NULL); + pp = pplist; + page_sub(&pplist, pp); + PP_CLRFREE(pp); + PP_CLRAGED(pp); + conpp = pp; + + err = swap_getconpage(ap_vp, ap_off, PAGESIZE, + (uint_t *)NULL, anon_pl, PAGESIZE, conpp, &nreloc, + seg, addr, S_CREATE, cred); + + if (err) { + ANON_LOCK_EXIT(&->a_rwlock); + panic("anon_map_createpages: S_CREATE"); + } + + ASSERT(anon_pl[0] == pp); + ASSERT(nreloc == 1); + pagezero(pp, 0, PAGESIZE); + CPU_STATS_ADD_K(vm, zfod, 1); + hat_setrefmod(pp); + + ASSERT(anon_get_ptr(amp->ahp, index) == NULL); + (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP); + + ppa[p_index++] = pp; + + addr += PAGESIZE; + index++; + npgs--; + } + conpp = NULL; + pg_cnt = pgsz >> PAGESHIFT; + p_index = p_index - pg_cnt; + while (pg_cnt--) { + page_downgrade(ppa[p_index++]); + } + } + ANON_LOCK_EXIT(&->a_rwlock); + return (0); +} + +int +anon_map_demotepages( + struct anon_map *amp, + ulong_t start_idx, + struct seg *seg, + caddr_t addr, + uint_t prot, + struct vpage vpage[], + struct cred *cred) +{ + struct anon *ap; + uint_t szc = seg->s_szc; + pgcnt_t pgcnt = page_get_pagecnt(szc); + size_t ppasize = pgcnt * sizeof (page_t *); + page_t **ppa = kmem_alloc(ppasize, KM_SLEEP); + page_t *pp; + page_t *pl[2]; + pgcnt_t i, pg_idx; + ulong_t an_idx; + caddr_t vaddr; + kmutex_t *ahmpages = NULL; + int err; + int retry = 0; + uint_t vpprot; + + ASSERT(RW_WRITE_HELD(&->a_rwlock)); + ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); + ASSERT(IS_P2ALIGNED(start_idx, pgcnt)); + ASSERT(ppa != NULL); + + VM_STAT_ADD(anonvmstats.demotepages[0]); + + ap = anon_get_ptr(amp->ahp, start_idx); + if (ap != NULL) { + VM_STAT_ADD(anonvmstats.demotepages[1]); + ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; + mutex_enter(ahmpages); + } +top: + if (ap == NULL || ap->an_refcnt <= 1) { + int root = 0; + pgcnt_t npgs, curnpgs = 0; + + VM_STAT_ADD(anonvmstats.demotepages[2]); + + ASSERT(retry == 0 || ap != NULL); + + if (ahmpages != NULL) + mutex_exit(ahmpages); + an_idx = start_idx; + for (i = 0; i < pgcnt; i++, an_idx++) { + ap = anon_get_ptr(amp->ahp, an_idx); + if (ap != NULL) { + ASSERT(ap->an_refcnt == 1); + pp = ppa[i] = page_lookup(ap->an_vp, ap->an_off, + SE_EXCL); + if (pp != NULL) { + (void) hat_pageunload(pp, + HAT_FORCE_PGUNLOAD); + } + } else { + ppa[i] = NULL; + } + } + for (i = 0; i < pgcnt; i++) { + if ((pp = ppa[i]) != NULL && pp->p_szc != 0) { + ASSERT(pp->p_szc <= szc); + if (!root) { + VM_STAT_ADD(anonvmstats.demotepages[3]); + if (curnpgs != 0) + panic("anon_map_demotepages: " + "bad large page"); + + root = 1; + curnpgs = npgs = + page_get_pagecnt(pp->p_szc); + + ASSERT(npgs <= pgcnt); + ASSERT(IS_P2ALIGNED(npgs, npgs)); + ASSERT(!(page_pptonum(pp) & + (npgs - 1))); + } else { + ASSERT(i > 0); + ASSERT(page_pptonum(pp) - 1 == + page_pptonum(ppa[i - 1])); + if ((page_pptonum(pp) & (npgs - 1)) == + npgs - 1) + root = 0; + } + ASSERT(PAGE_EXCL(pp)); + pp->p_szc = 0; + curnpgs--; + } + } + if (root != 0 || curnpgs != 0) + panic("anon_map_demotepages: bad large page"); + + for (i = 0; i < pgcnt; i++) { + if ((pp = ppa[i]) != NULL) { + ASSERT(!hat_page_is_mapped(pp)); + ASSERT(pp->p_szc == 0); + page_unlock(pp); + } + } + kmem_free(ppa, ppasize); + return (0); + } + ASSERT(ahmpages != NULL); + mutex_exit(ahmpages); + ahmpages = NULL; + + VM_STAT_ADD(anonvmstats.demotepages[4]); + + ASSERT(retry == 
0); /* we can be here only once */ + + vaddr = addr; + for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt; + pg_idx++, an_idx++, vaddr += PAGESIZE) { + ap = anon_get_ptr(amp->ahp, an_idx); + if (ap == NULL) + panic("anon_map_demotepages: no anon slot"); + err = anon_getpage(&ap, &vpprot, pl, PAGESIZE, seg, vaddr, + S_READ, cred); + if (err) { + for (i = 0; i < pg_idx; i++) { + if ((pp = ppa[i]) != NULL) + page_unlock(pp); + } + kmem_free(ppa, ppasize); + return (err); + } + ppa[pg_idx] = pl[0]; + } + + err = anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, ppa, + vpage, -1, cred); + if (err > 0) { + VM_STAT_ADD(anonvmstats.demotepages[5]); + kmem_free(ppa, ppasize); + return (err); + } + ASSERT(err == 0 || err == -1); + if (err == -1) { + VM_STAT_ADD(anonvmstats.demotepages[6]); + retry = 1; + goto top; + } + for (i = 0; i < pgcnt; i++) { + ASSERT(ppa[i] != NULL); + if (ppa[i]->p_szc != 0) + retry = 1; + page_unlock(ppa[i]); + } + if (retry) { + VM_STAT_ADD(anonvmstats.demotepages[7]); + goto top; + } + + VM_STAT_ADD(anonvmstats.demotepages[8]); + + kmem_free(ppa, ppasize); + + return (0); +} + +/* + * Allocate and initialize an anon_map structure for seg + * associating the given swap reservation with the new anon_map. + */ +struct anon_map * +anonmap_alloc(size_t size, size_t swresv) +{ + struct anon_map *amp; + + amp = kmem_cache_alloc(anonmap_cache, KM_SLEEP); + + amp->refcnt = 1; + amp->size = size; + + amp->ahp = anon_create(btopr(size), ANON_SLEEP); + amp->swresv = swresv; + amp->locality = 0; + amp->a_szc = 0; + return (amp); +} + +void +anonmap_free(struct anon_map *amp) +{ + ASSERT(amp->ahp); + ASSERT(amp->refcnt == 0); + + lgrp_shm_policy_fini(amp, NULL); + anon_release(amp->ahp, btopr(amp->size)); + kmem_cache_free(anonmap_cache, amp); +} + +/* + * Returns true if the app array has some empty slots. + * The offp and lenp paramters are in/out paramters. On entry + * these values represent the starting offset and length of the + * mapping. When true is returned, these values may be modified + * to be the largest range which includes empty slots. + */ +int +non_anon(struct anon_hdr *ahp, ulong_t anon_idx, u_offset_t *offp, + size_t *lenp) +{ + ulong_t i, el; + ssize_t low, high; + struct anon *ap; + + low = -1; + for (i = 0, el = *lenp; i < el; i += PAGESIZE, anon_idx++) { + ap = anon_get_ptr(ahp, anon_idx); + if (ap == NULL) { + if (low == -1) + low = i; + high = i; + } + } + if (low != -1) { + /* + * Found at least one non-anon page. + * Set up the off and len return values. + */ + if (low != 0) + *offp += low; + *lenp = high - low + PAGESIZE; + return (1); + } + return (0); +} + +/* + * Return a count of the number of existing anon pages in the anon array + * app in the range (off, off+len). The array and slots must be guaranteed + * stable by the caller. + */ +pgcnt_t +anon_pages(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots) +{ + pgcnt_t cnt = 0; + + while (nslots-- > 0) { + if ((anon_get_ptr(ahp, anon_index)) != NULL) + cnt++; + anon_index++; + } + return (cnt); +} + +/* + * Move reserved phys swap into memory swap (unreserve phys swap + * and reserve mem swap by the same amount). 
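anon_swap_adjust() below implements the transfer described in this comment with a handful of counters. The following is a simplified, self-contained sketch of that accounting; the field and parameter names are stand-ins for k_anoninfo, availrmem, and segspt_minfree, and no locking is shown.

#include <stdio.h>

/* Simplified stand-ins for the swap accounting counters used below. */
struct swapinfo {
	long mem_resv;		/* swap reserved against physical memory */
	long phys_resv;		/* swap reserved against swap devices */
	long locked_swap;	/* memory swap locked in core */
	long availrmem;		/* pageable memory still available */
};

/*
 * Ensure npages of memory swap are available to be locked, shifting
 * reservation from the physical pool if necessary.  Returns 0 on
 * success, -1 if that would dip below minfree.
 */
static int
swap_adjust(struct swapinfo *si, long npages, long minfree)
{
	long unlocked = si->mem_resv - si->locked_swap;

	if (npages > unlocked) {
		long adjust = npages - unlocked;

		if (si->availrmem < adjust + minfree)
			return (-1);
		si->availrmem -= adjust;
		si->mem_resv += adjust;
		si->phys_resv -= adjust;
	}
	si->locked_swap += npages;
	return (0);
}

int
main(void)
{
	struct swapinfo si = { 100, 500, 80, 1000 };

	/* Prints: rc=0 mem_resv=130 phys_resv=470 locked=130 availrmem=970 */
	printf("rc=%d mem_resv=%ld phys_resv=%ld locked=%ld availrmem=%ld\n",
	    swap_adjust(&si, 50, 100), si.mem_resv, si.phys_resv,
	    si.locked_swap, si.availrmem);
	return (0);
}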
+ * Used by segspt when it needs to lock resrved swap npages in memory + */ +int +anon_swap_adjust(pgcnt_t npages) +{ + pgcnt_t unlocked_mem_swap; + + mutex_enter(&anoninfo_lock); + + ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); + ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); + + unlocked_mem_swap = k_anoninfo.ani_mem_resv + - k_anoninfo.ani_locked_swap; + if (npages > unlocked_mem_swap) { + spgcnt_t adjusted_swap = npages - unlocked_mem_swap; + + /* + * if there is not enough unlocked mem swap we take missing + * amount from phys swap and give it to mem swap + */ + mutex_enter(&freemem_lock); + if (availrmem < adjusted_swap + segspt_minfree) { + mutex_exit(&freemem_lock); + mutex_exit(&anoninfo_lock); + return (ENOMEM); + } + availrmem -= adjusted_swap; + mutex_exit(&freemem_lock); + + k_anoninfo.ani_mem_resv += adjusted_swap; + ASSERT(k_anoninfo.ani_phys_resv >= adjusted_swap); + k_anoninfo.ani_phys_resv -= adjusted_swap; + + ANI_ADD(adjusted_swap); + } + k_anoninfo.ani_locked_swap += npages; + + ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); + ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); + + mutex_exit(&anoninfo_lock); + + return (0); +} + +/* + * 'unlocked' reserved mem swap so when it is unreserved it + * can be moved back phys (disk) swap + */ +void +anon_swap_restore(pgcnt_t npages) +{ + mutex_enter(&anoninfo_lock); + + ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv); + + ASSERT(k_anoninfo.ani_locked_swap >= npages); + k_anoninfo.ani_locked_swap -= npages; + + ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv); + + mutex_exit(&anoninfo_lock); +} + +/* + * Return the pointer from the list for a + * specified anon index. + */ +ulong_t * +anon_get_slot(struct anon_hdr *ahp, ulong_t an_idx) +{ + struct anon **app; + void **ppp; + + ASSERT(an_idx < ahp->size); + + /* + * Single level case. + */ + if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { + return ((ulong_t *)&ahp->array_chunk[an_idx]); + } else { + + /* + * 2 level case. + */ + ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; + if (*ppp == NULL) { + mutex_enter(&ahp->serial_lock); + ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; + if (*ppp == NULL) + *ppp = kmem_zalloc(PAGESIZE, KM_SLEEP); + mutex_exit(&ahp->serial_lock); + } + app = *ppp; + return ((ulong_t *)&app[an_idx & ANON_CHUNK_OFF]); + } +} + +void +anon_array_enter(struct anon_map *amp, ulong_t an_idx, anon_sync_obj_t *sobj) +{ + ulong_t *ap_slot; + kmutex_t *mtx; + kcondvar_t *cv; + int hash; + + /* + * Use szc to determine anon slot(s) to appear atomic. + * If szc = 0, then lock the anon slot and mark it busy. + * If szc > 0, then lock the range of slots by getting the + * anon_array_lock for the first anon slot, and mark only the + * first anon slot busy to represent whole range being busy. 
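anon_array_enter() and anon_array_exit() below implement a per-slot busy flag guarded by a hashed mutex/condvar pair. A condensed pthreads sketch of that protocol with one explicit slot follows; the hashing by index and the szc-based range handling are omitted, and the names are illustrative only.

#include <pthread.h>
#include <stdio.h>

/* One slot of a simplified "busy" protocol: a mutex/condvar pair guards
 * a per-slot busy flag. */
struct slot_sync {
	pthread_mutex_t	mtx;
	pthread_cond_t	cv;
	int		busy;
};

static void
slot_enter(struct slot_sync *s)
{
	pthread_mutex_lock(&s->mtx);
	while (s->busy)			/* wait for the current holder */
		pthread_cond_wait(&s->cv, &s->mtx);
	s->busy = 1;			/* mark the slot (or range) busy */
	pthread_mutex_unlock(&s->mtx);
}

static void
slot_exit(struct slot_sync *s)
{
	pthread_mutex_lock(&s->mtx);
	s->busy = 0;
	pthread_cond_broadcast(&s->cv);	/* wake any waiters */
	pthread_mutex_unlock(&s->mtx);
}

int
main(void)
{
	struct slot_sync s = { PTHREAD_MUTEX_INITIALIZER,
	    PTHREAD_COND_INITIALIZER, 0 };

	slot_enter(&s);
	/* ... operate on the anon slot ... */
	slot_exit(&s);
	printf("busy=%d\n", s.busy);
	return (0);
}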
+ */ + + ASSERT(RW_READ_HELD(&->a_rwlock)); + an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc)); + hash = ANON_ARRAY_HASH(amp, an_idx); + sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex; + sobj->sync_cv = cv = &anon_array_cv[hash]; + mutex_enter(mtx); + ap_slot = anon_get_slot(amp->ahp, an_idx); + while (ANON_ISBUSY(ap_slot)) + cv_wait(cv, mtx); + ANON_SETBUSY(ap_slot); + sobj->sync_data = ap_slot; + mutex_exit(mtx); +} + +void +anon_array_exit(anon_sync_obj_t *sobj) +{ + mutex_enter(sobj->sync_mutex); + ASSERT(ANON_ISBUSY(sobj->sync_data)); + ANON_CLRBUSY(sobj->sync_data); + if (CV_HAS_WAITERS(sobj->sync_cv)) + cv_broadcast(sobj->sync_cv); + mutex_exit(sobj->sync_mutex); +} diff --git a/usr/src/uts/common/vm/vm_as.c b/usr/src/uts/common/vm/vm_as.c new file mode 100644 index 0000000000..f54ae54359 --- /dev/null +++ b/usr/src/uts/common/vm/vm_as.c @@ -0,0 +1,2898 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * VM - address spaces. + */ + +#include <sys/types.h> +#include <sys/t_lock.h> +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/mman.h> +#include <sys/sysmacros.h> +#include <sys/cpuvar.h> +#include <sys/sysinfo.h> +#include <sys/kmem.h> +#include <sys/vnode.h> +#include <sys/vmsystm.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/tnf_probe.h> +#include <sys/vtrace.h> + +#include <vm/hat.h> +#include <vm/xhat.h> +#include <vm/as.h> +#include <vm/seg.h> +#include <vm/seg_vn.h> +#include <vm/seg_dev.h> +#include <vm/seg_kmem.h> +#include <vm/seg_map.h> +#include <vm/seg_spt.h> +#include <vm/page.h> + +clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */ + +static struct kmem_cache *as_cache; + +static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t); +static void as_clearwatchprot(struct as *, caddr_t, size_t); + + +/* + * Verifying the segment lists is very time-consuming; it may not be + * desirable always to define VERIFY_SEGLIST when DEBUG is set. 
+ */ +#ifdef DEBUG +#define VERIFY_SEGLIST +int do_as_verify = 0; +#endif + +/* + * Allocate a new callback data structure entry and fill in the events of + * interest, the address range of interest, and the callback argument. + * Link the entry on the as->a_callbacks list. A callback entry for the + * entire address space may be specified with vaddr = 0 and size = -1. + * + * CALLERS RESPONSIBILITY: If not calling from within the process context for + * the specified as, the caller must guarantee persistence of the specified as + * for the duration of this function (eg. pages being locked within the as + * will guarantee persistence). + */ +int +as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events, + caddr_t vaddr, size_t size, int sleepflag) +{ + struct as_callback *current_head, *cb; + caddr_t saddr; + size_t rsize; + + /* callback function and an event are mandatory */ + if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0)) + return (EINVAL); + + /* Adding a callback after as_free has been called is not allowed */ + if (as == &kas) + return (ENOMEM); + + /* + * vaddr = 0 and size = -1 is used to indicate that the callback range + * is the entire address space so no rounding is done in that case. + */ + if (size != -1) { + saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK); + rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) - + (size_t)saddr; + /* check for wraparound */ + if (saddr + rsize < saddr) + return (ENOMEM); + } else { + if (vaddr != 0) + return (EINVAL); + saddr = vaddr; + rsize = size; + } + + /* Allocate and initialize a callback entry */ + cb = kmem_zalloc(sizeof (struct as_callback), sleepflag); + if (cb == NULL) + return (EAGAIN); + + cb->ascb_func = cb_func; + cb->ascb_arg = arg; + cb->ascb_events = events; + cb->ascb_saddr = saddr; + cb->ascb_len = rsize; + + /* Add the entry to the list */ + mutex_enter(&as->a_contents); + current_head = as->a_callbacks; + as->a_callbacks = cb; + cb->ascb_next = current_head; + + /* + * The call to this function may lose in a race with + * a pertinent event - eg. a thread does long term memory locking + * but before the callback is added another thread executes as_unmap. + * A broadcast here resolves that. + */ + if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) { + AS_CLRUNMAPWAIT(as); + cv_broadcast(&as->a_cv); + } + + mutex_exit(&as->a_contents); + return (0); +} + +/* + * Search the callback list for an entry which pertains to arg. + * + * This is called from within the client upon completion of the callback. + * RETURN VALUES: + * AS_CALLBACK_DELETED (callback entry found and deleted) + * AS_CALLBACK_NOTFOUND (no callback entry found - this is ok) + * AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this + * entry will be made in as_do_callbacks) + * + * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED + * set, it indicates that as_do_callbacks is processing this entry. The + * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made + * to unblock as_do_callbacks, in case it is blocked. + * + * CALLERS RESPONSIBILITY: If not calling from within the process context for + * the specified as, the caller must guarantee persistence of the specified as + * for the duration of this function (eg. pages being locked within the as + * will guarantee persistence). 
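as_delete_callback() below walks the singly linked a_callbacks list with a pointer-to-pointer, so an entry can be unlinked without tracking a separate previous node, and it defers the free when the callback is mid-flight. A minimal sketch of that walk; the struct fields and return codes are simplified stand-ins for the as_callback fields and AS_CALLBACK_* values.

#include <stdio.h>
#include <stdlib.h>

/* Minimal singly linked callback list to illustrate unlink-by-arg. */
struct cb {
	struct cb	*next;
	void		*arg;
	int		in_progress;	/* stands in for AS_CALLBACK_CALLED */
};

enum { CB_NOTFOUND, CB_DELETED, CB_DEFERRED };

static int
cb_delete(struct cb **headp, void *arg)
{
	struct cb **prev, *c;

	for (prev = headp; (c = *prev) != NULL; prev = &c->next) {
		if (c->arg != arg)
			continue;
		if (c->in_progress)
			return (CB_DEFERRED);	/* the runner frees it later */
		*prev = c->next;		/* unlink without a prev node */
		free(c);
		return (CB_DELETED);
	}
	return (CB_NOTFOUND);
}

int
main(void)
{
	int a, b;
	struct cb *head = NULL;
	struct cb *c1 = calloc(1, sizeof (*c1));
	struct cb *c2 = calloc(1, sizeof (*c2));

	c1->arg = &a; c1->next = head; head = c1;
	c2->arg = &b; c2->next = head; head = c2;

	/* Prints: delete &a: 1, delete &a again: 0 */
	printf("delete &a: %d, delete &a again: %d\n",
	    cb_delete(&head, &a), cb_delete(&head, &a));
	free(head);
	return (0);
}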
+ */ +uint_t +as_delete_callback(struct as *as, void *arg) +{ + struct as_callback **prevcb = &as->a_callbacks; + struct as_callback *cb; + uint_t rc = AS_CALLBACK_NOTFOUND; + + mutex_enter(&as->a_contents); + for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) { + if (cb->ascb_arg != arg) + continue; + + /* + * If the events indicate AS_CALLBACK_CALLED, just clear + * AS_ALL_EVENT in the events field and wakeup the thread + * that may be waiting in as_do_callbacks. as_do_callbacks + * will take care of removing this entry from the list. In + * that case, return AS_CALLBACK_DELETE_DEFERRED. Otherwise + * (AS_CALLBACK_CALLED not set), just remove it from the + * list, return the memory and return AS_CALLBACK_DELETED. + */ + if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) { + /* leave AS_CALLBACK_CALLED */ + cb->ascb_events &= ~AS_ALL_EVENT; + rc = AS_CALLBACK_DELETE_DEFERRED; + cv_broadcast(&as->a_cv); + } else { + *prevcb = cb->ascb_next; + kmem_free(cb, sizeof (struct as_callback)); + rc = AS_CALLBACK_DELETED; + } + break; + } + mutex_exit(&as->a_contents); + return (rc); +} + +/* + * Searches the as callback list for a matching entry. + * Returns a pointer to the first matching callback, or NULL if + * nothing is found. + * This function never sleeps so it is ok to call it with more + * locks held but the (required) a_contents mutex. + * + * See also comment on as_do_callbacks below. + */ +static struct as_callback * +as_find_callback(struct as *as, uint_t events, caddr_t event_addr, + size_t event_len) +{ + struct as_callback *cb; + + ASSERT(MUTEX_HELD(&as->a_contents)); + for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) { + /* + * If the callback has not already been called, then + * check if events or address range pertains. An event_len + * of zero means do an unconditional callback. + */ + if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) || + ((event_len != 0) && (((cb->ascb_events & events) == 0) || + (event_addr + event_len < cb->ascb_saddr) || + (event_addr > (cb->ascb_saddr + cb->ascb_len))))) { + continue; + } + break; + } + return (cb); +} + +/* + * Executes a given callback and removes it from the callback list for + * this address space. + * This function may sleep so the caller must drop all locks except + * a_contents before calling this func. + * + * See also comments on as_do_callbacks below. + */ +static void +as_execute_callback(struct as *as, struct as_callback *cb, + uint_t events) +{ + struct as_callback **prevcb; + void *cb_arg; + + ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events)); + cb->ascb_events |= AS_CALLBACK_CALLED; + mutex_exit(&as->a_contents); + (*cb->ascb_func)(as, cb->ascb_arg, events); + mutex_enter(&as->a_contents); + /* + * the callback function is required to delete the callback + * when the callback function determines it is OK for + * this thread to continue. as_delete_callback will clear + * the AS_ALL_EVENT in the events field when it is deleted. + * If the callback function called as_delete_callback, + * events will already be cleared and there will be no blocking. + */ + while ((cb->ascb_events & events) != 0) { + cv_wait(&as->a_cv, &as->a_contents); + } + /* + * This entry needs to be taken off the list. 
Normally, the + * callback func itself does that, but unfortunately the list + * may have changed while the callback was running because the + * a_contents mutex was dropped and someone else other than the + * callback func itself could have called as_delete_callback, + * so we have to search to find this entry again. The entry + * must have AS_CALLBACK_CALLED, and have the same 'arg'. + */ + cb_arg = cb->ascb_arg; + prevcb = &as->a_callbacks; + for (cb = as->a_callbacks; cb != NULL; + prevcb = &cb->ascb_next, cb = *prevcb) { + if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) || + (cb_arg != cb->ascb_arg)) { + continue; + } + *prevcb = cb->ascb_next; + kmem_free(cb, sizeof (struct as_callback)); + break; + } +} + +/* + * Check the callback list for a matching event and intersection of + * address range. If there is a match invoke the callback. Skip an entry if: + * - a callback is already in progress for this entry (AS_CALLBACK_CALLED) + * - not event of interest + * - not address range of interest + * + * An event_len of zero indicates a request for an unconditional callback + * (regardless of event), only the AS_CALLBACK_CALLED is checked. The + * a_contents lock must be dropped before a callback, so only one callback + * can be done before returning. Return -1 (true) if a callback was + * executed and removed from the list, else return 0 (false). + * + * The logically separate parts, i.e. finding a matching callback and + * executing a given callback have been separated into two functions + * so that they can be called with different sets of locks held beyond + * the always-required a_contents. as_find_callback does not sleep so + * it is ok to call it if more locks than a_contents (i.e. the a_lock + * rwlock) are held. as_execute_callback on the other hand may sleep + * so all locks beyond a_contents must be dropped by the caller if one + * does not want to end comatose. + */ +static int +as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr, + size_t event_len) +{ + struct as_callback *cb; + + if ((cb = as_find_callback(as, events, event_addr, event_len))) { + as_execute_callback(as, cb, events); + return (-1); + } + return (0); +} + +/* + * Search for the segment containing addr. If a segment containing addr + * exists, that segment is returned. If no such segment exists, and + * the list spans addresses greater than addr, then the first segment + * whose base is greater than addr is returned; otherwise, NULL is + * returned unless tail is true, in which case the last element of the + * list is returned. + * + * a_seglast is used to cache the last found segment for repeated + * searches to the same addr (which happens frequently). 
+ */ +struct seg * +as_findseg(struct as *as, caddr_t addr, int tail) +{ + struct seg *seg = as->a_seglast; + avl_index_t where; + + ASSERT(AS_LOCK_HELD(as, &as->a_lock)); + + if (seg != NULL && + seg->s_base <= addr && + addr < seg->s_base + seg->s_size) + return (seg); + + seg = avl_find(&as->a_segtree, &addr, &where); + if (seg != NULL) + return (as->a_seglast = seg); + + seg = avl_nearest(&as->a_segtree, where, AVL_AFTER); + if (seg == NULL && tail) + seg = avl_last(&as->a_segtree); + return (as->a_seglast = seg); +} + +#ifdef VERIFY_SEGLIST +/* + * verify that the linked list is coherent + */ +static void +as_verify(struct as *as) +{ + struct seg *seg, *seglast, *p, *n; + uint_t nsegs = 0; + + if (do_as_verify == 0) + return; + + seglast = as->a_seglast; + + for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { + ASSERT(seg->s_as == as); + p = AS_SEGPREV(as, seg); + n = AS_SEGNEXT(as, seg); + ASSERT(p == NULL || p->s_as == as); + ASSERT(p == NULL || p->s_base < seg->s_base); + ASSERT(n == NULL || n->s_base > seg->s_base); + ASSERT(n != NULL || seg == avl_last(&as->a_segtree)); + if (seg == seglast) + seglast = NULL; + nsegs++; + } + ASSERT(seglast == NULL); + ASSERT(avl_numnodes(&as->a_segtree) == nsegs); +} +#endif /* VERIFY_SEGLIST */ + +/* + * Add a new segment to the address space. The avl_find() + * may be expensive so we attempt to use last segment accessed + * in as_gap() as an insertion point. + */ +int +as_addseg(struct as *as, struct seg *newseg) +{ + struct seg *seg; + caddr_t addr; + caddr_t eaddr; + avl_index_t where; + + ASSERT(AS_WRITE_HELD(as, &as->a_lock)); + + as->a_updatedir = 1; /* inform /proc */ + gethrestime(&as->a_updatetime); + + if (as->a_lastgaphl != NULL) { + struct seg *hseg = NULL; + struct seg *lseg = NULL; + + if (as->a_lastgaphl->s_base > newseg->s_base) { + hseg = as->a_lastgaphl; + lseg = AVL_PREV(&as->a_segtree, hseg); + } else { + lseg = as->a_lastgaphl; + hseg = AVL_NEXT(&as->a_segtree, lseg); + } + + if (hseg && lseg && lseg->s_base < newseg->s_base && + hseg->s_base > newseg->s_base) { + avl_insert_here(&as->a_segtree, newseg, lseg, + AVL_AFTER); + as->a_lastgaphl = NULL; + as->a_seglast = newseg; + return (0); + } + as->a_lastgaphl = NULL; + } + + addr = newseg->s_base; + eaddr = addr + newseg->s_size; +again: + + seg = avl_find(&as->a_segtree, &addr, &where); + + if (seg == NULL) + seg = avl_nearest(&as->a_segtree, where, AVL_AFTER); + + if (seg == NULL) + seg = avl_last(&as->a_segtree); + + if (seg != NULL) { + caddr_t base = seg->s_base; + + /* + * If top of seg is below the requested address, then + * the insertion point is at the end of the linked list, + * and seg points to the tail of the list. Otherwise, + * the insertion point is immediately before seg. + */ + if (base + seg->s_size > addr) { + if (addr >= base || eaddr > base) { +#ifdef __sparc + extern struct seg_ops segnf_ops; + + /* + * no-fault segs must disappear if overlaid. 
+ * XXX need new segment type so + * we don't have to check s_ops + */ + if (seg->s_ops == &segnf_ops) { + seg_unmap(seg); + goto again; + } +#endif + return (-1); /* overlapping segment */ + } + } + } + as->a_seglast = newseg; + avl_insert(&as->a_segtree, newseg, where); + +#ifdef VERIFY_SEGLIST + as_verify(as); +#endif + return (0); +} + +struct seg * +as_removeseg(struct as *as, struct seg *seg) +{ + avl_tree_t *t; + + ASSERT(AS_WRITE_HELD(as, &as->a_lock)); + + as->a_updatedir = 1; /* inform /proc */ + gethrestime(&as->a_updatetime); + + if (seg == NULL) + return (NULL); + + t = &as->a_segtree; + if (as->a_seglast == seg) + as->a_seglast = NULL; + as->a_lastgaphl = NULL; + + /* + * if this segment is at an address higher than + * a_lastgap, set a_lastgap to the next segment (NULL if last segment) + */ + if (as->a_lastgap && + (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base)) + as->a_lastgap = AVL_NEXT(t, seg); + + /* + * remove the segment from the seg tree + */ + avl_remove(t, seg); + +#ifdef VERIFY_SEGLIST + as_verify(as); +#endif + return (seg); +} + +/* + * Find a segment containing addr. + */ +struct seg * +as_segat(struct as *as, caddr_t addr) +{ + struct seg *seg = as->a_seglast; + + ASSERT(AS_LOCK_HELD(as, &as->a_lock)); + + if (seg != NULL && seg->s_base <= addr && + addr < seg->s_base + seg->s_size) + return (seg); + + seg = avl_find(&as->a_segtree, &addr, NULL); + return (seg); +} + +/* + * Serialize all searches for holes in an address space to + * prevent two or more threads from allocating the same virtual + * address range. The address space must not be "read/write" + * locked by the caller since we may block. + */ +void +as_rangelock(struct as *as) +{ + mutex_enter(&as->a_contents); + while (AS_ISCLAIMGAP(as)) + cv_wait(&as->a_cv, &as->a_contents); + AS_SETCLAIMGAP(as); + mutex_exit(&as->a_contents); +} + +/* + * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads. + */ +void +as_rangeunlock(struct as *as) +{ + mutex_enter(&as->a_contents); + AS_CLRCLAIMGAP(as); + cv_signal(&as->a_cv); + mutex_exit(&as->a_contents); +} + +/* + * compar segments (or just an address) by segment address range + */ +static int +as_segcompar(const void *x, const void *y) +{ + struct seg *a = (struct seg *)x; + struct seg *b = (struct seg *)y; + + if (a->s_base < b->s_base) + return (-1); + if (a->s_base >= b->s_base + b->s_size) + return (1); + return (0); +} + + +void +as_avlinit(struct as *as) +{ + avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg), + offsetof(struct seg, s_tree)); + avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page), + offsetof(struct watched_page, wp_link)); +} + +/*ARGSUSED*/ +static int +as_constructor(void *buf, void *cdrarg, int kmflags) +{ + struct as *as = buf; + + mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL); + cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL); + rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL); + as_avlinit(as); + return (0); +} + +/*ARGSUSED1*/ +static void +as_destructor(void *buf, void *cdrarg) +{ + struct as *as = buf; + + avl_destroy(&as->a_segtree); + mutex_destroy(&as->a_contents); + cv_destroy(&as->a_cv); + rw_destroy(&as->a_lock); +} + +void +as_init(void) +{ + as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0, + as_constructor, as_destructor, NULL, NULL, NULL, 0); +} + +/* + * Allocate and initialize an address space data structure. + * We call hat_alloc to allow any machine dependent + * information in the hat structure to be initialized. 
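+ *
+ * Minimal lifecycle sketch (illustrative only; crfp and argsp stand
+ * for whatever segment-create routine and argument the caller uses):
+ *
+ *        struct as *as = as_alloc();
+ *        error = as_map(as, addr, size, crfp, argsp);
+ *        ...
+ *        as_free(as);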
+ */ +struct as * +as_alloc(void) +{ + struct as *as; + + as = kmem_cache_alloc(as_cache, KM_SLEEP); + + as->a_flags = 0; + as->a_vbits = 0; + as->a_hrm = NULL; + as->a_seglast = NULL; + as->a_size = 0; + as->a_updatedir = 0; + gethrestime(&as->a_updatetime); + as->a_objectdir = NULL; + as->a_sizedir = 0; + as->a_userlimit = (caddr_t)USERLIMIT; + as->a_lastgap = NULL; + as->a_lastgaphl = NULL; + as->a_callbacks = NULL; + + AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); + as->a_hat = hat_alloc(as); /* create hat for default system mmu */ + AS_LOCK_EXIT(as, &as->a_lock); + + as->a_xhat = NULL; + + return (as); +} + +/* + * Free an address space data structure. + * Need to free the hat first and then + * all the segments on this as and finally + * the space for the as struct itself. + */ +void +as_free(struct as *as) +{ + struct hat *hat = as->a_hat; + struct seg *seg, *next; + int called = 0; + +top: + /* + * Invoke ALL callbacks. as_do_callbacks will do one callback + * per call, and not return (-1) until the callback has completed. + * When as_do_callbacks returns zero, all callbacks have completed. + */ + mutex_enter(&as->a_contents); + while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0)); + + /* This will prevent new XHATs from attaching to as */ + if (!called) + AS_SETBUSY(as); + mutex_exit(&as->a_contents); + AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); + + if (!called) { + called = 1; + hat_free_start(hat); + if (as->a_xhat != NULL) + xhat_free_start_all(as); + } + for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) { + int err; + + next = AS_SEGNEXT(as, seg); + err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size); + if (err == EAGAIN) { + mutex_enter(&as->a_contents); + if (as->a_callbacks) { + AS_LOCK_EXIT(as, &as->a_lock); + } else { + /* + * Memory is currently locked. Wait for a + * cv_signal that it has been unlocked, then + * try the operation again. + */ + if (AS_ISUNMAPWAIT(as) == 0) + cv_broadcast(&as->a_cv); + AS_SETUNMAPWAIT(as); + AS_LOCK_EXIT(as, &as->a_lock); + while (AS_ISUNMAPWAIT(as)) + cv_wait(&as->a_cv, &as->a_contents); + } + mutex_exit(&as->a_contents); + goto top; + } else { + /* + * We do not expect any other error return at this + * time. This is similar to an ASSERT in seg_unmap() + */ + ASSERT(err == 0); + } + } + hat_free_end(hat); + if (as->a_xhat != NULL) + xhat_free_end_all(as); + AS_LOCK_EXIT(as, &as->a_lock); + + /* /proc stuff */ + ASSERT(avl_numnodes(&as->a_wpage) == 0); + if (as->a_objectdir) { + kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *)); + as->a_objectdir = NULL; + as->a_sizedir = 0; + } + + /* + * Free the struct as back to kmem. Assert it has no segments. 
+ */ + ASSERT(avl_numnodes(&as->a_segtree) == 0); + kmem_cache_free(as_cache, as); +} + +int +as_dup(struct as *as, struct as **outas) +{ + struct as *newas; + struct seg *seg, *newseg; + int error; + + AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); + as_clearwatch(as); + newas = as_alloc(); + newas->a_userlimit = as->a_userlimit; + AS_LOCK_ENTER(newas, &newas->a_lock, RW_WRITER); + + /* This will prevent new XHATs from attaching */ + mutex_enter(&as->a_contents); + AS_SETBUSY(as); + mutex_exit(&as->a_contents); + mutex_enter(&newas->a_contents); + AS_SETBUSY(newas); + mutex_exit(&newas->a_contents); + + + for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { + + if (seg->s_flags & S_PURGE) + continue; + + newseg = seg_alloc(newas, seg->s_base, seg->s_size); + if (newseg == NULL) { + AS_LOCK_EXIT(newas, &newas->a_lock); + as_setwatch(as); + mutex_enter(&as->a_contents); + AS_CLRBUSY(as); + mutex_exit(&as->a_contents); + AS_LOCK_EXIT(as, &as->a_lock); + as_free(newas); + return (-1); + } + if ((error = SEGOP_DUP(seg, newseg)) != 0) { + /* + * We call seg_free() on the new seg + * because the segment is not set up + * completely; i.e. it has no ops. + */ + as_setwatch(as); + mutex_enter(&as->a_contents); + AS_CLRBUSY(as); + mutex_exit(&as->a_contents); + AS_LOCK_EXIT(as, &as->a_lock); + seg_free(newseg); + AS_LOCK_EXIT(newas, &newas->a_lock); + as_free(newas); + return (error); + } + newas->a_size += seg->s_size; + } + + error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL); + if (as->a_xhat != NULL) + error |= xhat_dup_all(as, newas, NULL, 0, HAT_DUP_ALL); + + mutex_enter(&newas->a_contents); + AS_CLRBUSY(newas); + mutex_exit(&newas->a_contents); + AS_LOCK_EXIT(newas, &newas->a_lock); + + as_setwatch(as); + mutex_enter(&as->a_contents); + AS_CLRBUSY(as); + mutex_exit(&as->a_contents); + AS_LOCK_EXIT(as, &as->a_lock); + if (error != 0) { + as_free(newas); + return (error); + } + *outas = newas; + return (0); +} + +/* + * Handle a ``fault'' at addr for size bytes. + */ +faultcode_t +as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size, + enum fault_type type, enum seg_rw rw) +{ + struct seg *seg; + caddr_t raddr; /* rounded down addr */ + size_t rsize; /* rounded up size */ + size_t ssize; + faultcode_t res = 0; + caddr_t addrsav; + struct seg *segsav; + int as_lock_held; + klwp_t *lwp = ttolwp(curthread); + int is_xhat = 0; + int holding_wpage = 0; + extern struct seg_ops segdev_ops; + + + + if (as->a_hat != hat) { + /* This must be an XHAT then */ + is_xhat = 1; + + if ((type != F_INVAL) || (as == &kas)) + return (FC_NOSUPPORT); + } + +retry: + if (!is_xhat) { + /* + * Indicate that the lwp is not to be stopped while waiting + * for a pagefault. This is to avoid deadlock while debugging + * a process via /proc over NFS (in particular). + */ + if (lwp != NULL) + lwp->lwp_nostop++; + + /* + * same length must be used when we softlock and softunlock. + * We don't support softunlocking lengths less than + * the original length when there is largepage support. + * See seg_dev.c for more comments. 
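+ *
+ * Sketch of the expected pairing (illustrative; len and rw are
+ * placeholders); as_pagelock()/as_pageunlock() below follow it:
+ *
+ *        (void) as_fault(hat, as, addr, len, F_SOFTLOCK, rw);
+ *        ...access the pages...
+ *        (void) as_fault(hat, as, addr, len, F_SOFTUNLOCK, rw);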
+ */ + switch (type) { + + case F_SOFTLOCK: + CPU_STATS_ADD_K(vm, softlock, 1); + break; + + case F_SOFTUNLOCK: + break; + + case F_PROT: + CPU_STATS_ADD_K(vm, prot_fault, 1); + break; + + case F_INVAL: + CPU_STATS_ENTER_K(); + CPU_STATS_ADDQ(CPU, vm, as_fault, 1); + if (as == &kas) + CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1); + CPU_STATS_EXIT_K(); + break; + } + } + + /* Kernel probe */ + TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */, + tnf_opaque, address, addr, + tnf_fault_type, fault_type, type, + tnf_seg_access, access, rw); + + raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); + rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - + (size_t)raddr; + + /* + * XXX -- Don't grab the as lock for segkmap. We should grab it for + * correctness, but then we could be stuck holding this lock for + * a LONG time if the fault needs to be resolved on a slow + * filesystem, and then no-one will be able to exec new commands, + * as exec'ing requires the write lock on the as. + */ + if (as == &kas && segkmap && segkmap->s_base <= raddr && + raddr + size < segkmap->s_base + segkmap->s_size) { + /* + * if (as==&kas), this can't be XHAT: we've already returned + * FC_NOSUPPORT. + */ + seg = segkmap; + as_lock_held = 0; + } else { + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + if (is_xhat && avl_numnodes(&as->a_wpage) != 0) { + /* + * Grab and hold the writers' lock on the as + * if the fault is to a watched page. + * This will keep CPUs from "peeking" at the + * address range while we're temporarily boosting + * the permissions for the XHAT device to + * resolve the fault in the segment layer. + * + * We could check whether faulted address + * is within a watched page and only then grab + * the writer lock, but this is simpler. + */ + AS_LOCK_EXIT(as, &as->a_lock); + AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); + } + + seg = as_segat(as, raddr); + if (seg == NULL) { + AS_LOCK_EXIT(as, &as->a_lock); + if ((lwp != NULL) && (!is_xhat)) + lwp->lwp_nostop--; + return (FC_NOMAP); + } + + as_lock_held = 1; + } + + addrsav = raddr; + segsav = seg; + + for (; rsize != 0; rsize -= ssize, raddr += ssize) { + if (raddr >= seg->s_base + seg->s_size) { + seg = AS_SEGNEXT(as, seg); + if (seg == NULL || raddr != seg->s_base) { + res = FC_NOMAP; + break; + } + } + if (raddr + rsize > seg->s_base + seg->s_size) + ssize = seg->s_base + seg->s_size - raddr; + else + ssize = rsize; + + if (!is_xhat || (seg->s_ops != &segdev_ops)) { + + if (is_xhat && avl_numnodes(&as->a_wpage) != 0 && + pr_is_watchpage_as(raddr, rw, as)) { + /* + * Handle watch pages. If we're faulting on a + * watched page from an X-hat, we have to + * restore the original permissions while we + * handle the fault. + */ + as_clearwatch(as); + holding_wpage = 1; + } + + res = SEGOP_FAULT(hat, seg, raddr, ssize, type, rw); + + /* Restore watchpoints */ + if (holding_wpage) { + as_setwatch(as); + holding_wpage = 0; + } + + if (res != 0) + break; + } else { + /* XHAT does not support seg_dev */ + res = FC_NOSUPPORT; + break; + } + } + + /* + * If we were SOFTLOCKing and encountered a failure, + * we must SOFTUNLOCK the range we already did. (Maybe we + * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing + * right here...) 
+ */ + if (res != 0 && type == F_SOFTLOCK) { + for (seg = segsav; addrsav < raddr; addrsav += ssize) { + if (addrsav >= seg->s_base + seg->s_size) + seg = AS_SEGNEXT(as, seg); + ASSERT(seg != NULL); + /* + * Now call the fault routine again to perform the + * unlock using S_OTHER instead of the rw variable + * since we never got a chance to touch the pages. + */ + if (raddr > seg->s_base + seg->s_size) + ssize = seg->s_base + seg->s_size - addrsav; + else + ssize = raddr - addrsav; + (void) SEGOP_FAULT(hat, seg, addrsav, ssize, + F_SOFTUNLOCK, S_OTHER); + } + } + if (as_lock_held) + AS_LOCK_EXIT(as, &as->a_lock); + if ((lwp != NULL) && (!is_xhat)) + lwp->lwp_nostop--; + /* + * If the lower levels returned EDEADLK for a fault, + * It means that we should retry the fault. Let's wait + * a bit also to let the deadlock causing condition clear. + * This is part of a gross hack to work around a design flaw + * in the ufs/sds logging code and should go away when the + * logging code is re-designed to fix the problem. See bug + * 4125102 for details of the problem. + */ + if (FC_ERRNO(res) == EDEADLK) { + delay(deadlk_wait); + res = 0; + goto retry; + } + return (res); +} + + + +/* + * Asynchronous ``fault'' at addr for size bytes. + */ +faultcode_t +as_faulta(struct as *as, caddr_t addr, size_t size) +{ + struct seg *seg; + caddr_t raddr; /* rounded down addr */ + size_t rsize; /* rounded up size */ + faultcode_t res = 0; + klwp_t *lwp = ttolwp(curthread); + +retry: + /* + * Indicate that the lwp is not to be stopped while waiting + * for a pagefault. This is to avoid deadlock while debugging + * a process via /proc over NFS (in particular). + */ + if (lwp != NULL) + lwp->lwp_nostop++; + + raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); + rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - + (size_t)raddr; + + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + seg = as_segat(as, raddr); + if (seg == NULL) { + AS_LOCK_EXIT(as, &as->a_lock); + if (lwp != NULL) + lwp->lwp_nostop--; + return (FC_NOMAP); + } + + for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) { + if (raddr >= seg->s_base + seg->s_size) { + seg = AS_SEGNEXT(as, seg); + if (seg == NULL || raddr != seg->s_base) { + res = FC_NOMAP; + break; + } + } + res = SEGOP_FAULTA(seg, raddr); + if (res != 0) + break; + } + AS_LOCK_EXIT(as, &as->a_lock); + if (lwp != NULL) + lwp->lwp_nostop--; + /* + * If the lower levels returned EDEADLK for a fault, + * It means that we should retry the fault. Let's wait + * a bit also to let the deadlock causing condition clear. + * This is part of a gross hack to work around a design flaw + * in the ufs/sds logging code and should go away when the + * logging code is re-designed to fix the problem. See bug + * 4125102 for details of the problem. + */ + if (FC_ERRNO(res) == EDEADLK) { + delay(deadlk_wait); + res = 0; + goto retry; + } + return (res); +} + +/* + * Set the virtual mapping for the interval from [addr : addr + size) + * in address space `as' to have the specified protection. + * It is ok for the range to cross over several segments, + * as long as they are contiguous. 
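+ *
+ * Hypothetical example call (for illustration only):
+ *
+ *        error = as_setprot(as, addr, len, PROT_READ | PROT_WRITE);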
+ */ +int +as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot) +{ + struct seg *seg; + struct as_callback *cb; + size_t ssize; + caddr_t raddr; /* rounded down addr */ + size_t rsize; /* rounded up size */ + int error = 0, writer = 0; + caddr_t saveraddr; + size_t saversize; + +setprot_top: + raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); + rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - + (size_t)raddr; + + if (raddr + rsize < raddr) /* check for wraparound */ + return (ENOMEM); + + saveraddr = raddr; + saversize = rsize; + + /* + * Normally we only lock the as as a reader. But + * if due to setprot the segment driver needs to split + * a segment it will return IE_RETRY. Therefore we re-aquire + * the as lock as a writer so the segment driver can change + * the seg list. Also the segment driver will return IE_RETRY + * after it has changed the segment list so we therefore keep + * locking as a writer. Since these opeartions should be rare + * want to only lock as a writer when necessary. + */ + if (writer || avl_numnodes(&as->a_wpage) != 0) { + AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); + } else { + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + } + + as_clearwatchprot(as, raddr, rsize); + seg = as_segat(as, raddr); + if (seg == NULL) { + as_setwatch(as); + AS_LOCK_EXIT(as, &as->a_lock); + return (ENOMEM); + } + + for (; rsize != 0; rsize -= ssize, raddr += ssize) { + if (raddr >= seg->s_base + seg->s_size) { + seg = AS_SEGNEXT(as, seg); + if (seg == NULL || raddr != seg->s_base) { + error = ENOMEM; + break; + } + } + if ((raddr + rsize) > (seg->s_base + seg->s_size)) + ssize = seg->s_base + seg->s_size - raddr; + else + ssize = rsize; + error = SEGOP_SETPROT(seg, raddr, ssize, prot); + + if (error == IE_NOMEM) { + error = EAGAIN; + break; + } + + if (error == IE_RETRY) { + AS_LOCK_EXIT(as, &as->a_lock); + writer = 1; + goto setprot_top; + } + + if (error == EAGAIN) { + /* + * Make sure we have a_lock as writer. + */ + if (writer == 0) { + AS_LOCK_EXIT(as, &as->a_lock); + writer = 1; + goto setprot_top; + } + + /* + * Memory is currently locked. It must be unlocked + * before this operation can succeed through a retry. + * The possible reasons for locked memory and + * corresponding strategies for unlocking are: + * (1) Normal I/O + * wait for a signal that the I/O operation + * has completed and the memory is unlocked. + * (2) Asynchronous I/O + * The aio subsystem does not unlock pages when + * the I/O is completed. Those pages are unlocked + * when the application calls aiowait/aioerror. + * So, to prevent blocking forever, cv_broadcast() + * is done to wake up aio_cleanup_thread. + * Subsequently, segvn_reclaim will be called, and + * that will do AS_CLRUNMAPWAIT() and wake us up. + * (3) Long term page locking: + * Drivers intending to have pages locked for a + * period considerably longer than for normal I/O + * (essentially forever) may have registered for a + * callback so they may unlock these pages on + * request. This is needed to allow this operation + * to succeed. Each entry on the callback list is + * examined. If the event or address range pertains + * the callback is invoked (unless it already is in + * progress). The a_contents lock must be dropped + * before the callback, so only one callback can + * be done at a time. Go to the top and do more + * until zero is returned. If zero is returned, + * either there were no callbacks for this event + * or they were already in progress. 
+ */ + mutex_enter(&as->a_contents); + if (as->a_callbacks && + (cb = as_find_callback(as, AS_SETPROT_EVENT, + seg->s_base, seg->s_size))) { + AS_LOCK_EXIT(as, &as->a_lock); + as_execute_callback(as, cb, AS_SETPROT_EVENT); + } else { + if (AS_ISUNMAPWAIT(as) == 0) + cv_broadcast(&as->a_cv); + AS_SETUNMAPWAIT(as); + AS_LOCK_EXIT(as, &as->a_lock); + while (AS_ISUNMAPWAIT(as)) + cv_wait(&as->a_cv, &as->a_contents); + } + mutex_exit(&as->a_contents); + goto setprot_top; + } else if (error != 0) + break; + } + if (error != 0) { + as_setwatch(as); + } else { + as_setwatchprot(as, saveraddr, saversize, prot); + } + AS_LOCK_EXIT(as, &as->a_lock); + return (error); +} + +/* + * Check to make sure that the interval [addr, addr + size) + * in address space `as' has at least the specified protection. + * It is ok for the range to cross over several segments, as long + * as they are contiguous. + */ +int +as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot) +{ + struct seg *seg; + size_t ssize; + caddr_t raddr; /* rounded down addr */ + size_t rsize; /* rounded up size */ + int error = 0; + + raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); + rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - + (size_t)raddr; + + if (raddr + rsize < raddr) /* check for wraparound */ + return (ENOMEM); + + /* + * This is ugly as sin... + * Normally, we only acquire the address space readers lock. + * However, if the address space has watchpoints present, + * we must acquire the writer lock on the address space for + * the benefit of as_clearwatchprot() and as_setwatchprot(). + */ + if (avl_numnodes(&as->a_wpage) != 0) + AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); + else + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + as_clearwatchprot(as, raddr, rsize); + seg = as_segat(as, raddr); + if (seg == NULL) { + as_setwatch(as); + AS_LOCK_EXIT(as, &as->a_lock); + return (ENOMEM); + } + + for (; rsize != 0; rsize -= ssize, raddr += ssize) { + if (raddr >= seg->s_base + seg->s_size) { + seg = AS_SEGNEXT(as, seg); + if (seg == NULL || raddr != seg->s_base) { + error = ENOMEM; + break; + } + } + if ((raddr + rsize) > (seg->s_base + seg->s_size)) + ssize = seg->s_base + seg->s_size - raddr; + else + ssize = rsize; + + error = SEGOP_CHECKPROT(seg, raddr, ssize, prot); + if (error != 0) + break; + } + as_setwatch(as); + AS_LOCK_EXIT(as, &as->a_lock); + return (error); +} + +int +as_unmap(struct as *as, caddr_t addr, size_t size) +{ + struct seg *seg, *seg_next; + struct as_callback *cb; + caddr_t raddr, eaddr; + size_t ssize; + int err; + +top: + raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); + eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) & + (uintptr_t)PAGEMASK); + + AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); + + as->a_updatedir = 1; /* inform /proc */ + gethrestime(&as->a_updatetime); + + /* + * Use as_findseg to find the first segment in the range, then + * step through the segments in order, following s_next. + */ + as_clearwatchprot(as, raddr, eaddr - raddr); + + for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) { + if (eaddr <= seg->s_base) + break; /* eaddr was in a gap; all done */ + + /* this is implied by the test above */ + ASSERT(raddr < eaddr); + + if (raddr < seg->s_base) + raddr = seg->s_base; /* raddr was in a gap */ + + if (eaddr > (seg->s_base + seg->s_size)) + ssize = seg->s_base + seg->s_size - raddr; + else + ssize = eaddr - raddr; + + /* + * Save next segment pointer since seg can be + * destroyed during the segment unmap operation. 
+ */ + seg_next = AS_SEGNEXT(as, seg); + + err = SEGOP_UNMAP(seg, raddr, ssize); + if (err == EAGAIN) { + /* + * Memory is currently locked. It must be unlocked + * before this operation can succeed through a retry. + * The possible reasons for locked memory and + * corresponding strategies for unlocking are: + * (1) Normal I/O + * wait for a signal that the I/O operation + * has completed and the memory is unlocked. + * (2) Asynchronous I/O + * The aio subsystem does not unlock pages when + * the I/O is completed. Those pages are unlocked + * when the application calls aiowait/aioerror. + * So, to prevent blocking forever, cv_broadcast() + * is done to wake up aio_cleanup_thread. + * Subsequently, segvn_reclaim will be called, and + * that will do AS_CLRUNMAPWAIT() and wake us up. + * (3) Long term page locking: + * Drivers intending to have pages locked for a + * period considerably longer than for normal I/O + * (essentially forever) may have registered for a + * callback so they may unlock these pages on + * request. This is needed to allow this operation + * to succeed. Each entry on the callback list is + * examined. If the event or address range pertains + * the callback is invoked (unless it already is in + * progress). The a_contents lock must be dropped + * before the callback, so only one callback can + * be done at a time. Go to the top and do more + * until zero is returned. If zero is returned, + * either there were no callbacks for this event + * or they were already in progress. + */ + as_setwatch(as); + mutex_enter(&as->a_contents); + if (as->a_callbacks && + (cb = as_find_callback(as, AS_UNMAP_EVENT, + seg->s_base, seg->s_size))) { + AS_LOCK_EXIT(as, &as->a_lock); + as_execute_callback(as, cb, AS_UNMAP_EVENT); + } else { + if (AS_ISUNMAPWAIT(as) == 0) + cv_broadcast(&as->a_cv); + AS_SETUNMAPWAIT(as); + AS_LOCK_EXIT(as, &as->a_lock); + while (AS_ISUNMAPWAIT(as)) + cv_wait(&as->a_cv, &as->a_contents); + } + mutex_exit(&as->a_contents); + goto top; + } else if (err == IE_RETRY) { + as_setwatch(as); + AS_LOCK_EXIT(as, &as->a_lock); + goto top; + } else if (err) { + as_setwatch(as); + AS_LOCK_EXIT(as, &as->a_lock); + return (-1); + } + + as->a_size -= ssize; + raddr += ssize; + } + AS_LOCK_EXIT(as, &as->a_lock); + return (0); +} + +static int +as_map_vnsegs(struct as *as, caddr_t addr, size_t size, + int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated) +{ + int text = vn_a->flags & MAP_TEXT; + uint_t szcvec = map_execseg_pgszcvec(text, addr, size); + uint_t szc; + uint_t nszc; + int error; + caddr_t a; + caddr_t eaddr; + size_t segsize; + struct seg *seg; + uint_t save_szcvec; + size_t pgsz; + struct vattr va; + u_offset_t eoff; + size_t save_size = 0; + + ASSERT(AS_WRITE_HELD(as, &as->a_lock)); + ASSERT(IS_P2ALIGNED(addr, PAGESIZE)); + ASSERT(IS_P2ALIGNED(size, PAGESIZE)); + ASSERT(vn_a->vp != NULL); + ASSERT(vn_a->amp == NULL); + +again: + if (szcvec <= 1) { + seg = seg_alloc(as, addr, size); + if (seg == NULL) { + return (ENOMEM); + } + vn_a->szc = 0; + error = (*crfp)(seg, vn_a); + if (error != 0) { + seg_free(seg); + } + return (error); + } + + va.va_mask = AT_SIZE; + if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred) != 0) { + szcvec = 0; + goto again; + } + eoff = vn_a->offset & PAGEMASK; + if (eoff >= va.va_size) { + szcvec = 0; + goto again; + } + eoff += size; + if (btopr(va.va_size) < btopr(eoff)) { + save_size = size; + size = va.va_size - (vn_a->offset & PAGEMASK); + size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t); + szcvec = map_execseg_pgszcvec(text, 
addr, size); + if (szcvec <= 1) { + size = save_size; + goto again; + } + } + + eaddr = addr + size; + save_szcvec = szcvec; + szcvec >>= 1; + szc = 0; + nszc = 0; + while (szcvec) { + if ((szcvec & 0x1) == 0) { + nszc++; + szcvec >>= 1; + continue; + } + nszc++; + pgsz = page_get_pagesize(nszc); + a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz); + if (a != addr) { + ASSERT(a < eaddr); + segsize = a - addr; + seg = seg_alloc(as, addr, segsize); + if (seg == NULL) { + return (ENOMEM); + } + vn_a->szc = szc; + error = (*crfp)(seg, vn_a); + if (error != 0) { + seg_free(seg); + return (error); + } + *segcreated = 1; + vn_a->offset += segsize; + addr = a; + } + szc = nszc; + szcvec >>= 1; + } + + ASSERT(addr < eaddr); + szcvec = save_szcvec | 1; /* add 8K pages */ + while (szcvec) { + a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz); + ASSERT(a >= addr); + if (a != addr) { + segsize = a - addr; + seg = seg_alloc(as, addr, segsize); + if (seg == NULL) { + return (ENOMEM); + } + vn_a->szc = szc; + error = (*crfp)(seg, vn_a); + if (error != 0) { + seg_free(seg); + return (error); + } + *segcreated = 1; + vn_a->offset += segsize; + addr = a; + } + szcvec &= ~(1 << szc); + if (szcvec) { + szc = highbit(szcvec) - 1; + pgsz = page_get_pagesize(szc); + } + } + ASSERT(addr == eaddr); + + if (save_size) { + size = save_size - size; + goto again; + } + + return (0); +} + +int +as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp) +{ + struct seg *seg = NULL; + caddr_t raddr; /* rounded down addr */ + size_t rsize; /* rounded up size */ + int error; + struct proc *p = curproc; + + raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); + rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - + (size_t)raddr; + + AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); + + /* + * check for wrap around + */ + if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) { + AS_LOCK_EXIT(as, &as->a_lock); + return (ENOMEM); + } + + as->a_updatedir = 1; /* inform /proc */ + gethrestime(&as->a_updatetime); + + if (as != &kas && as->a_size + rsize > (size_t)p->p_vmem_ctl) { + AS_LOCK_EXIT(as, &as->a_lock); + + (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p, + RCA_UNSAFE_ALL); + + return (ENOMEM); + } + + if (AS_MAP_VNSEGS_USELPGS(crfp, argsp)) { + int unmap = 0; + error = as_map_vnsegs(as, raddr, rsize, crfp, + (struct segvn_crargs *)argsp, &unmap); + if (error != 0) { + AS_LOCK_EXIT(as, &as->a_lock); + if (unmap) { + (void) as_unmap(as, addr, size); + } + return (error); + } + } else { + seg = seg_alloc(as, addr, size); + if (seg == NULL) { + AS_LOCK_EXIT(as, &as->a_lock); + return (ENOMEM); + } + + error = (*crfp)(seg, argsp); + if (error != 0) { + seg_free(seg); + AS_LOCK_EXIT(as, &as->a_lock); + return (error); + } + } + + /* + * Add size now so as_unmap will work if as_ctl fails. + */ + as->a_size += rsize; + + as_setwatch(as); + + /* + * If the address space is locked, + * establish memory locks for the new segment. + */ + mutex_enter(&as->a_contents); + if (AS_ISPGLCK(as)) { + mutex_exit(&as->a_contents); + AS_LOCK_EXIT(as, &as->a_lock); + error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0); + if (error != 0) + (void) as_unmap(as, addr, size); + } else { + mutex_exit(&as->a_contents); + AS_LOCK_EXIT(as, &as->a_lock); + } + return (error); +} + + +/* + * Delete all segments in the address space marked with S_PURGE. + * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c). 
+ * These segments are deleted as a first step before calls to as_gap(), so + * that they don't affect mmap() or shmat(). + */ +void +as_purge(struct as *as) +{ + struct seg *seg; + struct seg *next_seg; + + /* + * the setting of NEEDSPURGE is protect by as_rangelock(), so + * no need to grab a_contents mutex for this check + */ + if ((as->a_flags & AS_NEEDSPURGE) == 0) + return; + + AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); + next_seg = NULL; + seg = AS_SEGFIRST(as); + while (seg != NULL) { + next_seg = AS_SEGNEXT(as, seg); + if (seg->s_flags & S_PURGE) + SEGOP_UNMAP(seg, seg->s_base, seg->s_size); + seg = next_seg; + } + AS_LOCK_EXIT(as, &as->a_lock); + + mutex_enter(&as->a_contents); + as->a_flags &= ~AS_NEEDSPURGE; + mutex_exit(&as->a_contents); +} + +/* + * Find a hole of at least size minlen within [base, base + len). + * + * If flags specifies AH_HI, the hole will have the highest possible address + * in the range. We use the as->a_lastgap field to figure out where to + * start looking for a gap. + * + * Otherwise, the gap will have the lowest possible address. + * + * If flags specifies AH_CONTAIN, the hole will contain the address addr. + * + * If an adequate hole is found, base and len are set to reflect the part of + * the hole that is within range, and 0 is returned, otherwise, + * -1 is returned. + * + * NOTE: This routine is not correct when base+len overflows caddr_t. + */ +int +as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags, + caddr_t addr) +{ + caddr_t lobound = *basep; + caddr_t hibound = lobound + *lenp; + struct seg *lseg, *hseg; + caddr_t lo, hi; + int forward; + caddr_t save_base; + size_t save_len; + + save_base = *basep; + save_len = *lenp; + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + if (AS_SEGFIRST(as) == NULL) { + if (valid_va_range(basep, lenp, minlen, flags & AH_DIR)) { + AS_LOCK_EXIT(as, &as->a_lock); + return (0); + } else { + AS_LOCK_EXIT(as, &as->a_lock); + *basep = save_base; + *lenp = save_len; + return (-1); + } + } + + /* + * Set up to iterate over all the inter-segment holes in the given + * direction. lseg is NULL for the lowest-addressed hole and hseg is + * NULL for the highest-addressed hole. If moving backwards, we reset + * sseg to denote the highest-addressed segment. + */ + forward = (flags & AH_DIR) == AH_LO; + if (forward) { + hseg = as_findseg(as, lobound, 1); + lseg = AS_SEGPREV(as, hseg); + } else { + + /* + * If allocating at least as much as the last allocation, + * use a_lastgap's base as a better estimate of hibound. + */ + if (as->a_lastgap && + minlen >= as->a_lastgap->s_size && + hibound >= as->a_lastgap->s_base) + hibound = as->a_lastgap->s_base; + + hseg = as_findseg(as, hibound, 1); + if (hseg->s_base + hseg->s_size < hibound) { + lseg = hseg; + hseg = NULL; + } else { + lseg = AS_SEGPREV(as, hseg); + } + } + + for (;;) { + /* + * Set lo and hi to the hole's boundaries. (We should really + * use MAXADDR in place of hibound in the expression below, + * but can't express it easily; using hibound in its place is + * harmless.) + */ + lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size; + hi = (hseg == NULL) ? hibound : hseg->s_base; + /* + * If the iteration has moved past the interval from lobound + * to hibound it's pointless to continue. + */ + if ((forward && lo > hibound) || (!forward && hi < lobound)) + break; + else if (lo > hibound || hi < lobound) + goto cont; + /* + * Candidate hole lies at least partially within the allowable + * range. 
Restrict it to fall completely within that range, + * i.e., to [max(lo, lobound), min(hi, hibound)]. + */ + if (lo < lobound) + lo = lobound; + if (hi > hibound) + hi = hibound; + /* + * Verify that the candidate hole is big enough and meets + * hardware constraints. + */ + *basep = lo; + *lenp = hi - lo; + if (valid_va_range(basep, lenp, minlen, + forward ? AH_LO : AH_HI) && + ((flags & AH_CONTAIN) == 0 || + (*basep <= addr && *basep + *lenp > addr))) { + if (!forward) + as->a_lastgap = hseg; + if (hseg != NULL) + as->a_lastgaphl = hseg; + else + as->a_lastgaphl = lseg; + AS_LOCK_EXIT(as, &as->a_lock); + return (0); + } + cont: + /* + * Move to the next hole. + */ + if (forward) { + lseg = hseg; + if (lseg == NULL) + break; + hseg = AS_SEGNEXT(as, hseg); + } else { + hseg = lseg; + if (hseg == NULL) + break; + lseg = AS_SEGPREV(as, lseg); + } + } + *basep = save_base; + *lenp = save_len; + AS_LOCK_EXIT(as, &as->a_lock); + return (-1); +} + +/* + * Return the next range within [base, base + len) that is backed + * with "real memory". Skip holes and non-seg_vn segments. + * We're lazy and only return one segment at a time. + */ +int +as_memory(struct as *as, caddr_t *basep, size_t *lenp) +{ + extern struct seg_ops segspt_shmops; /* needs a header file */ + struct seg *seg; + caddr_t addr, eaddr; + caddr_t segend; + + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + + addr = *basep; + eaddr = addr + *lenp; + + seg = as_findseg(as, addr, 0); + if (seg != NULL) + addr = MAX(seg->s_base, addr); + + for (;;) { + if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) { + AS_LOCK_EXIT(as, &as->a_lock); + return (EINVAL); + } + + if (seg->s_ops == &segvn_ops) { + segend = seg->s_base + seg->s_size; + break; + } + + /* + * We do ISM by looking into the private data + * to determine the real size of the segment. + */ + if (seg->s_ops == &segspt_shmops) { + segend = seg->s_base + spt_realsize(seg); + if (addr < segend) + break; + } + + seg = AS_SEGNEXT(as, seg); + + if (seg != NULL) + addr = seg->s_base; + } + + *basep = addr; + + if (segend > eaddr) + *lenp = eaddr - addr; + else + *lenp = segend - addr; + + AS_LOCK_EXIT(as, &as->a_lock); + return (0); +} + +/* + * Swap the pages associated with the address space as out to + * secondary storage, returning the number of bytes actually + * swapped. + * + * The value returned is intended to correlate well with the process's + * memory requirements. Its usefulness for this purpose depends on + * how well the segment-level routines do at returning accurate + * information. + */ +size_t +as_swapout(struct as *as) +{ + struct seg *seg; + size_t swpcnt = 0; + + /* + * Kernel-only processes have given up their address + * spaces. Of course, we shouldn't be attempting to + * swap out such processes in the first place... + */ + if (as == NULL) + return (0); + + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + + /* Prevent XHATs from attaching */ + mutex_enter(&as->a_contents); + AS_SETBUSY(as); + mutex_exit(&as->a_contents); + + + /* + * Free all mapping resources associated with the address + * space. The segment-level swapout routines capitalize + * on this unmapping by scavanging pages that have become + * unmapped here. + */ + hat_swapout(as->a_hat); + if (as->a_xhat != NULL) + xhat_swapout_all(as); + + mutex_enter(&as->a_contents); + AS_CLRBUSY(as); + mutex_exit(&as->a_contents); + + /* + * Call the swapout routines of all segments in the address + * space to do the actual work, accumulating the amount of + * space reclaimed. 
+ */ + for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { + struct seg_ops *ov = seg->s_ops; + + /* + * We have to check to see if the seg has + * an ops vector because the seg may have + * been in the middle of being set up when + * the process was picked for swapout. + */ + if ((ov != NULL) && (ov->swapout != NULL)) + swpcnt += SEGOP_SWAPOUT(seg); + } + AS_LOCK_EXIT(as, &as->a_lock); + return (swpcnt); +} + +/* + * Determine whether data from the mappings in interval [addr, addr + size) + * are in the primary memory (core) cache. + */ +int +as_incore(struct as *as, caddr_t addr, + size_t size, char *vec, size_t *sizep) +{ + struct seg *seg; + size_t ssize; + caddr_t raddr; /* rounded down addr */ + size_t rsize; /* rounded up size */ + size_t isize; /* iteration size */ + int error = 0; /* result, assume success */ + + *sizep = 0; + raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); + rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) - + (size_t)raddr; + + if (raddr + rsize < raddr) /* check for wraparound */ + return (ENOMEM); + + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + seg = as_segat(as, raddr); + if (seg == NULL) { + AS_LOCK_EXIT(as, &as->a_lock); + return (-1); + } + + for (; rsize != 0; rsize -= ssize, raddr += ssize) { + if (raddr >= seg->s_base + seg->s_size) { + seg = AS_SEGNEXT(as, seg); + if (seg == NULL || raddr != seg->s_base) { + error = -1; + break; + } + } + if ((raddr + rsize) > (seg->s_base + seg->s_size)) + ssize = seg->s_base + seg->s_size - raddr; + else + ssize = rsize; + *sizep += isize = SEGOP_INCORE(seg, raddr, ssize, vec); + if (isize != ssize) { + error = -1; + break; + } + vec += btopr(ssize); + } + AS_LOCK_EXIT(as, &as->a_lock); + return (error); +} + +static void +as_segunlock(struct seg *seg, caddr_t addr, int attr, + ulong_t *bitmap, size_t position, size_t npages) +{ + caddr_t range_start; + size_t pos1 = position; + size_t pos2; + size_t size; + size_t end_pos = npages + position; + + while (bt_range(bitmap, &pos1, &pos2, end_pos)) { + size = ptob((pos2 - pos1)); + range_start = (caddr_t)((uintptr_t)addr + + ptob(pos1 - position)); + + (void) SEGOP_LOCKOP(seg, range_start, size, attr, MC_UNLOCK, + (ulong_t *)NULL, (size_t)NULL); + pos1 = pos2; + } +} + +static void +as_unlockerr(struct as *as, int attr, ulong_t *mlock_map, + caddr_t raddr, size_t rsize) +{ + struct seg *seg = as_segat(as, raddr); + size_t ssize; + + while (rsize != 0) { + if (raddr >= seg->s_base + seg->s_size) + seg = AS_SEGNEXT(as, seg); + + if ((raddr + rsize) > (seg->s_base + seg->s_size)) + ssize = seg->s_base + seg->s_size - raddr; + else + ssize = rsize; + + as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize)); + + rsize -= ssize; + raddr += ssize; + } +} + +/* + * Cache control operations over the interval [addr, addr + size) in + * address space "as". + */ +/*ARGSUSED*/ +int +as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr, + uintptr_t arg, ulong_t *lock_map, size_t pos) +{ + struct seg *seg; /* working segment */ + caddr_t raddr; /* rounded down addr */ + caddr_t initraddr; /* saved initial rounded down addr */ + size_t rsize; /* rounded up size */ + size_t initrsize; /* saved initial rounded up size */ + size_t ssize; /* size of seg */ + int error = 0; /* result */ + size_t mlock_size; /* size of bitmap */ + ulong_t *mlock_map; /* pointer to bitmap used */ + /* to represent the locked */ + /* pages. 
*/ +retry: + if (error == IE_RETRY) + AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); + else + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + + /* + * If these are address space lock/unlock operations, loop over + * all segments in the address space, as appropriate. + */ + if (func == MC_LOCKAS) { + size_t npages, idx; + size_t rlen = 0; /* rounded as length */ + + idx = pos; + + if (arg & MCL_FUTURE) { + mutex_enter(&as->a_contents); + AS_SETPGLCK(as); + mutex_exit(&as->a_contents); + } + if ((arg & MCL_CURRENT) == 0) { + AS_LOCK_EXIT(as, &as->a_lock); + return (0); + } + + seg = AS_SEGFIRST(as); + if (seg == NULL) { + AS_LOCK_EXIT(as, &as->a_lock); + return (0); + } + + do { + raddr = (caddr_t)((uintptr_t)seg->s_base & + (uintptr_t)PAGEMASK); + rlen += (((uintptr_t)(seg->s_base + seg->s_size) + + PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr; + } while ((seg = AS_SEGNEXT(as, seg)) != NULL); + + mlock_size = BT_BITOUL(btopr(rlen)); + if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size * + sizeof (ulong_t), KM_NOSLEEP)) == NULL) { + AS_LOCK_EXIT(as, &as->a_lock); + return (EAGAIN); + } + + for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) { + error = SEGOP_LOCKOP(seg, seg->s_base, + seg->s_size, attr, MC_LOCK, mlock_map, pos); + if (error != 0) + break; + pos += seg_pages(seg); + } + + if (error) { + for (seg = AS_SEGFIRST(as); seg != NULL; + seg = AS_SEGNEXT(as, seg)) { + + raddr = (caddr_t)((uintptr_t)seg->s_base & + (uintptr_t)PAGEMASK); + npages = seg_pages(seg); + as_segunlock(seg, raddr, attr, mlock_map, + idx, npages); + idx += npages; + } + } + + kmem_free(mlock_map, mlock_size * sizeof (ulong_t)); + AS_LOCK_EXIT(as, &as->a_lock); + goto lockerr; + } else if (func == MC_UNLOCKAS) { + mutex_enter(&as->a_contents); + AS_CLRPGLCK(as); + mutex_exit(&as->a_contents); + + for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) { + error = SEGOP_LOCKOP(seg, seg->s_base, + seg->s_size, attr, MC_UNLOCK, NULL, 0); + if (error != 0) + break; + } + + AS_LOCK_EXIT(as, &as->a_lock); + goto lockerr; + } + + /* + * Normalize addresses and sizes. + */ + initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); + initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - + (size_t)raddr; + + if (raddr + rsize < raddr) { /* check for wraparound */ + AS_LOCK_EXIT(as, &as->a_lock); + return (ENOMEM); + } + + /* + * Get initial segment. + */ + if ((seg = as_segat(as, raddr)) == NULL) { + AS_LOCK_EXIT(as, &as->a_lock); + return (ENOMEM); + } + + if (func == MC_LOCK) { + mlock_size = BT_BITOUL(btopr(rsize)); + if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size * + sizeof (ulong_t), KM_NOSLEEP)) == NULL) { + AS_LOCK_EXIT(as, &as->a_lock); + return (EAGAIN); + } + } + + /* + * Loop over all segments. If a hole in the address range is + * discovered, then fail. For each segment, perform the appropriate + * control operation. + */ + while (rsize != 0) { + + /* + * Make sure there's no hole, calculate the portion + * of the next segment to be operated over. + */ + if (raddr >= seg->s_base + seg->s_size) { + seg = AS_SEGNEXT(as, seg); + if (seg == NULL || raddr != seg->s_base) { + if (func == MC_LOCK) { + as_unlockerr(as, attr, mlock_map, + initraddr, initrsize - rsize); + kmem_free(mlock_map, + mlock_size * sizeof (ulong_t)); + } + AS_LOCK_EXIT(as, &as->a_lock); + return (ENOMEM); + } + } + if ((raddr + rsize) > (seg->s_base + seg->s_size)) + ssize = seg->s_base + seg->s_size - raddr; + else + ssize = rsize; + + /* + * Dispatch on specific function. 
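+ * Only MC_SYNC, MC_LOCK, MC_UNLOCK and MC_ADVISE are expected at
+ * this point; MC_LOCKAS and MC_UNLOCKAS were handled above, and any
+ * other value is a programming error that panics below.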
+ */ + switch (func) { + + /* + * Synchronize cached data from mappings with backing + * objects. + */ + case MC_SYNC: + if (error = SEGOP_SYNC(seg, raddr, ssize, + attr, (uint_t)arg)) { + AS_LOCK_EXIT(as, &as->a_lock); + return (error); + } + break; + + /* + * Lock pages in memory. + */ + case MC_LOCK: + if (error = SEGOP_LOCKOP(seg, raddr, ssize, + attr, func, mlock_map, pos)) { + as_unlockerr(as, attr, mlock_map, initraddr, + initrsize - rsize + ssize); + kmem_free(mlock_map, mlock_size * + sizeof (ulong_t)); + AS_LOCK_EXIT(as, &as->a_lock); + goto lockerr; + } + break; + + /* + * Unlock mapped pages. + */ + case MC_UNLOCK: + (void) SEGOP_LOCKOP(seg, raddr, ssize, attr, func, + (ulong_t *)NULL, (size_t)NULL); + break; + + /* + * Store VM advise for mapped pages in segment layer. + */ + case MC_ADVISE: + error = SEGOP_ADVISE(seg, raddr, ssize, (uint_t)arg); + + /* + * Check for regular errors and special retry error + */ + if (error) { + if (error == IE_RETRY) { + /* + * Need to acquire writers lock, so + * have to drop readers lock and start + * all over again + */ + AS_LOCK_EXIT(as, &as->a_lock); + goto retry; + } else if (error == IE_REATTACH) { + /* + * Find segment for current address + * because current segment just got + * split or concatenated + */ + seg = as_segat(as, raddr); + if (seg == NULL) { + AS_LOCK_EXIT(as, &as->a_lock); + return (ENOMEM); + } + } else { + /* + * Regular error + */ + AS_LOCK_EXIT(as, &as->a_lock); + return (error); + } + } + break; + + /* + * Can't happen. + */ + default: + panic("as_ctl: bad operation %d", func); + /*NOTREACHED*/ + } + + rsize -= ssize; + raddr += ssize; + } + + if (func == MC_LOCK) + kmem_free(mlock_map, mlock_size * sizeof (ulong_t)); + AS_LOCK_EXIT(as, &as->a_lock); + return (0); +lockerr: + + /* + * If the lower levels returned EDEADLK for a segment lockop, + * it means that we should retry the operation. Let's wait + * a bit also to let the deadlock causing condition clear. + * This is part of a gross hack to work around a design flaw + * in the ufs/sds logging code and should go away when the + * logging code is re-designed to fix the problem. See bug + * 4125102 for details of the problem. + */ + if (error == EDEADLK) { + delay(deadlk_wait); + error = 0; + goto retry; + } + return (error); +} + +/* + * Special code for exec to move the stack segment from its interim + * place in the old address to the right place in the new address space. + */ +/*ARGSUSED*/ +int +as_exec(struct as *oas, caddr_t ostka, size_t stksz, + struct as *nas, caddr_t nstka, uint_t hatflag) +{ + struct seg *stkseg; + + AS_LOCK_ENTER(oas, &oas->a_lock, RW_WRITER); + stkseg = as_segat(oas, ostka); + stkseg = as_removeseg(oas, stkseg); + ASSERT(stkseg != NULL); + ASSERT(stkseg->s_base == ostka && stkseg->s_size == stksz); + stkseg->s_as = nas; + stkseg->s_base = nstka; + + /* + * It's ok to lock the address space we are about to exec to. + */ + AS_LOCK_ENTER(nas, &nas->a_lock, RW_WRITER); + ASSERT(avl_numnodes(&nas->a_wpage) == 0); + nas->a_size += stkseg->s_size; + oas->a_size -= stkseg->s_size; + (void) as_addseg(nas, stkseg); + AS_LOCK_EXIT(nas, &nas->a_lock); + AS_LOCK_EXIT(oas, &oas->a_lock); + return (0); +} + +static int +f_decode(faultcode_t fault_err) +{ + int error = 0; + + switch (FC_CODE(fault_err)) { + case FC_OBJERR: + error = FC_ERRNO(fault_err); + break; + case FC_PROT: + error = EACCES; + break; + default: + error = EFAULT; + break; + } + return (error); +} + +/* + * lock pages in a given address space. Return shadow list. 
If + * the list is NULL, the MMU mapping is also locked. + */ +int +as_pagelock(struct as *as, struct page ***ppp, caddr_t addr, + size_t size, enum seg_rw rw) +{ + size_t rsize; + caddr_t base; + caddr_t raddr; + faultcode_t fault_err; + struct seg *seg; + int res; + int prefaulted = 0; + + TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START, + "as_pagelock_start: addr %p size %ld", addr, size); + + raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); + rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - + (size_t)raddr; +top: + /* + * if the request crosses two segments let + * as_fault handle it. + */ + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + seg = as_findseg(as, addr, 0); + if ((seg == NULL) || ((base = seg->s_base) > addr) || + (addr + size) > base + seg->s_size) { + AS_LOCK_EXIT(as, &as->a_lock); + goto slow; + } + + TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START, + "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize); + + /* + * try to lock pages and pass back shadow list + */ + res = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw); + + TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end"); + AS_LOCK_EXIT(as, &as->a_lock); + if (res == 0) { + return (0); + } else if (res == ENOTSUP || prefaulted) { + /* + * (1) segment driver doesn't support PAGELOCK fastpath, or + * (2) we've already tried fast path unsuccessfully after + * faulting in the addr range below; system might be + * thrashing or there may not be enough availrmem. + */ + goto slow; + } + + TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_FAULT_START, + "as_fault_start: addr %p size %ld", addr, size); + + /* + * we might get here because of some COW fault or non + * existing page. Let as_fault deal with it. Just load + * the page, don't lock the MMU mapping. + */ + fault_err = as_fault(as->a_hat, as, addr, size, F_INVAL, rw); + if (fault_err != 0) { + return (f_decode(fault_err)); + } + + prefaulted = 1; + + /* + * try fast path again; since we've dropped a_lock, + * we need to try the dance from the start to see if + * the addr range is still valid. + */ + goto top; +slow: + /* + * load the page and lock the MMU mapping. 
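+ * The caller is expected to undo this with as_pageunlock(); when the
+ * returned shadow list is NULL, that routine falls back to an
+ * F_SOFTUNLOCK as_fault() over the same range.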
+ */ + fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw); + if (fault_err != 0) { + return (f_decode(fault_err)); + } + *ppp = NULL; + + TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end"); + return (0); +} + +/* + * unlock pages in a given address range + */ +void +as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size, + enum seg_rw rw) +{ + struct seg *seg; + size_t rsize; + caddr_t raddr; + + TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START, + "as_pageunlock_start: addr %p size %ld", addr, size); + + /* + * if the shadow list is NULL, as_pagelock was + * falling back to as_fault + */ + if (pp == NULL) { + (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw); + return; + } + raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); + rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - + (size_t)raddr; + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + seg = as_findseg(as, addr, 0); + ASSERT(seg); + TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START, + "seg_unlock_start: raddr %p rsize %ld", raddr, rsize); + SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw); + AS_LOCK_EXIT(as, &as->a_lock); + TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end"); +} + +/* + * reclaim cached pages in a given address range + */ +void +as_pagereclaim(struct as *as, struct page **pp, caddr_t addr, + size_t size, enum seg_rw rw) +{ + struct seg *seg; + size_t rsize; + caddr_t raddr; + + ASSERT(AS_READ_HELD(as, &as->a_lock)); + ASSERT(pp != NULL); + + raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); + rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - + (size_t)raddr; + seg = as_findseg(as, addr, 0); + ASSERT(seg); + SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGERECLAIM, rw); +} + +#define MAXPAGEFLIP 4 +#define MAXPAGEFLIPSIZ MAXPAGEFLIP*PAGESIZE + +int +as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc, + boolean_t wait) +{ + struct seg *seg; + size_t ssize; + caddr_t raddr; /* rounded down addr */ + size_t rsize; /* rounded up size */ + int error = 0; + size_t pgsz = page_get_pagesize(szc); + +setpgsz_top: + if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) { + return (EINVAL); + } + + raddr = addr; + rsize = size; + + if (raddr + rsize < raddr) /* check for wraparound */ + return (ENOMEM); + + AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); + as_clearwatchprot(as, raddr, rsize); + seg = as_segat(as, raddr); + if (seg == NULL) { + as_setwatch(as); + AS_LOCK_EXIT(as, &as->a_lock); + return (ENOMEM); + } + + for (; rsize != 0; rsize -= ssize, raddr += ssize) { + if (raddr >= seg->s_base + seg->s_size) { + seg = AS_SEGNEXT(as, seg); + if (seg == NULL || raddr != seg->s_base) { + error = ENOMEM; + break; + } + } + if ((raddr + rsize) > (seg->s_base + seg->s_size)) { + ssize = seg->s_base + seg->s_size - raddr; + } else { + ssize = rsize; + } + + error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc); + + if (error == IE_NOMEM) { + error = EAGAIN; + break; + } + + if (error == IE_RETRY) { + AS_LOCK_EXIT(as, &as->a_lock); + goto setpgsz_top; + } + + if (error == ENOTSUP) { + error = EINVAL; + break; + } + + if (wait && (error == EAGAIN)) { + /* + * Memory is currently locked. It must be unlocked + * before this operation can succeed through a retry. + * The possible reasons for locked memory and + * corresponding strategies for unlocking are: + * (1) Normal I/O + * wait for a signal that the I/O operation + * has completed and the memory is unlocked. 
+ * (2) Asynchronous I/O + * The aio subsystem does not unlock pages when + * the I/O is completed. Those pages are unlocked + * when the application calls aiowait/aioerror. + * So, to prevent blocking forever, cv_broadcast() + * is done to wake up aio_cleanup_thread. + * Subsequently, segvn_reclaim will be called, and + * that will do AS_CLRUNMAPWAIT() and wake us up. + * (3) Long term page locking: + * This is not relevant for as_setpagesize() + * because we cannot change the page size for + * driver memory. The attempt to do so will + * fail with a different error than EAGAIN so + * there's no need to trigger as callbacks like + * as_unmap, as_setprot or as_free would do. + */ + mutex_enter(&as->a_contents); + if (AS_ISUNMAPWAIT(as) == 0) { + cv_broadcast(&as->a_cv); + } + AS_SETUNMAPWAIT(as); + AS_LOCK_EXIT(as, &as->a_lock); + while (AS_ISUNMAPWAIT(as)) { + cv_wait(&as->a_cv, &as->a_contents); + } + mutex_exit(&as->a_contents); + goto setpgsz_top; + } else if (error != 0) { + break; + } + } + as_setwatch(as); + AS_LOCK_EXIT(as, &as->a_lock); + return (error); +} + +/* + * Setup all of the uninitialized watched pages that we can. + */ +void +as_setwatch(struct as *as) +{ + struct watched_page *pwp; + struct seg *seg; + caddr_t vaddr; + uint_t prot; + int err, retrycnt; + + if (avl_numnodes(&as->a_wpage) == 0) + return; + + ASSERT(AS_WRITE_HELD(as, &as->a_lock)); + + for (pwp = avl_first(&as->a_wpage); pwp != NULL; + pwp = AVL_NEXT(&as->a_wpage, pwp)) { + retrycnt = 0; + retry: + vaddr = pwp->wp_vaddr; + if (pwp->wp_oprot != 0 || /* already set up */ + (seg = as_segat(as, vaddr)) == NULL || + SEGOP_GETPROT(seg, vaddr, 0, &prot) != 0) + continue; + + pwp->wp_oprot = prot; + if (pwp->wp_read) + prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); + if (pwp->wp_write) + prot &= ~PROT_WRITE; + if (pwp->wp_exec) + prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); + if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) { + err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot); + if (err == IE_RETRY) { + pwp->wp_oprot = 0; + ASSERT(retrycnt == 0); + retrycnt++; + goto retry; + } + } + pwp->wp_prot = prot; + } +} + +/* + * Clear all of the watched pages in the address space. + */ +void +as_clearwatch(struct as *as) +{ + struct watched_page *pwp; + struct seg *seg; + caddr_t vaddr; + uint_t prot; + int err, retrycnt; + + if (avl_numnodes(&as->a_wpage) == 0) + return; + + ASSERT(AS_WRITE_HELD(as, &as->a_lock)); + + for (pwp = avl_first(&as->a_wpage); pwp != NULL; + pwp = AVL_NEXT(&as->a_wpage, pwp)) { + retrycnt = 0; + retry: + vaddr = pwp->wp_vaddr; + if (pwp->wp_oprot == 0 || /* not set up */ + (seg = as_segat(as, vaddr)) == NULL) + continue; + + if ((prot = pwp->wp_oprot) != pwp->wp_prot) { + err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot); + if (err == IE_RETRY) { + ASSERT(retrycnt == 0); + retrycnt++; + goto retry; + } + } + pwp->wp_oprot = 0; + pwp->wp_prot = 0; + } +} + +/* + * Force a new setup for all the watched pages in the range. 
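+ * That is, recompute each page's effective protection (wp_prot) from
+ * the supplied base protection and the page's read/write/exec watch
+ * flags, and push it to the segment layer via SEGOP_SETPROT().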
+ */ +static void +as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot) +{ + struct watched_page *pwp; + struct watched_page tpw; + caddr_t eaddr = addr + size; + caddr_t vaddr; + struct seg *seg; + int err, retrycnt; + uint_t wprot; + avl_index_t where; + + if (avl_numnodes(&as->a_wpage) == 0) + return; + + ASSERT(AS_WRITE_HELD(as, &as->a_lock)); + + tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); + if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL) + pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER); + + while (pwp != NULL && pwp->wp_vaddr < eaddr) { + retrycnt = 0; + vaddr = pwp->wp_vaddr; + + wprot = prot; + if (pwp->wp_read) + wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); + if (pwp->wp_write) + wprot &= ~PROT_WRITE; + if (pwp->wp_exec) + wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); + if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) { + retry: + seg = as_segat(as, vaddr); + if (seg == NULL) { + panic("as_setwatchprot: no seg"); + /*NOTREACHED*/ + } + err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, wprot); + if (err == IE_RETRY) { + ASSERT(retrycnt == 0); + retrycnt++; + goto retry; + } + } + pwp->wp_oprot = prot; + pwp->wp_prot = wprot; + + pwp = AVL_NEXT(&as->a_wpage, pwp); + } +} + +/* + * Clear all of the watched pages in the range. + */ +static void +as_clearwatchprot(struct as *as, caddr_t addr, size_t size) +{ + caddr_t eaddr = addr + size; + struct watched_page *pwp; + struct watched_page tpw; + uint_t prot; + struct seg *seg; + int err, retrycnt; + avl_index_t where; + + if (avl_numnodes(&as->a_wpage) == 0) + return; + + tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); + if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL) + pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER); + + ASSERT(AS_WRITE_HELD(as, &as->a_lock)); + + while (pwp != NULL && pwp->wp_vaddr < eaddr) { + ASSERT(addr >= pwp->wp_vaddr); + + if ((prot = pwp->wp_oprot) != 0) { + retrycnt = 0; + + if (prot != pwp->wp_prot) { + retry: + seg = as_segat(as, pwp->wp_vaddr); + if (seg == NULL) + continue; + err = SEGOP_SETPROT(seg, pwp->wp_vaddr, + PAGESIZE, prot); + if (err == IE_RETRY) { + ASSERT(retrycnt == 0); + retrycnt++; + goto retry; + + } + } + pwp->wp_oprot = 0; + pwp->wp_prot = 0; + } + + pwp = AVL_NEXT(&as->a_wpage, pwp); + } +} + +void +as_signal_proc(struct as *as, k_siginfo_t *siginfo) +{ + struct proc *p; + + mutex_enter(&pidlock); + for (p = practive; p; p = p->p_next) { + if (p->p_as == as) { + mutex_enter(&p->p_lock); + if (p->p_as == as) + sigaddq(p, NULL, siginfo, KM_NOSLEEP); + mutex_exit(&p->p_lock); + } + } + mutex_exit(&pidlock); +} + +/* + * return memory object ID + */ +int +as_getmemid(struct as *as, caddr_t addr, memid_t *memidp) +{ + struct seg *seg; + int sts; + + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + seg = as_segat(as, addr); + if (seg == NULL) { + AS_LOCK_EXIT(as, &as->a_lock); + return (EFAULT); + } + /* + * catch old drivers which may not support getmemid + */ + if (seg->s_ops->getmemid == NULL) { + AS_LOCK_EXIT(as, &as->a_lock); + return (ENODEV); + } + + sts = SEGOP_GETMEMID(seg, addr, memidp); + + AS_LOCK_EXIT(as, &as->a_lock); + return (sts); +} diff --git a/usr/src/uts/common/vm/vm_page.c b/usr/src/uts/common/vm/vm_page.c new file mode 100644 index 0000000000..67b4e58f0f --- /dev/null +++ b/usr/src/uts/common/vm/vm_page.c @@ -0,0 +1,6708 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the 
"License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * VM - physical page management. + */ + +#include <sys/types.h> +#include <sys/t_lock.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/time.h> +#include <sys/vnode.h> +#include <sys/vm.h> +#include <sys/vtrace.h> +#include <sys/swap.h> +#include <sys/cmn_err.h> +#include <sys/tuneable.h> +#include <sys/sysmacros.h> +#include <sys/cpuvar.h> +#include <sys/callb.h> +#include <sys/debug.h> +#include <sys/tnf_probe.h> +#include <sys/condvar_impl.h> +#include <sys/mem_config.h> +#include <sys/mem_cage.h> +#include <sys/kmem.h> +#include <sys/atomic.h> +#include <sys/strlog.h> +#include <sys/mman.h> +#include <sys/ontrap.h> +#include <sys/lgrp.h> +#include <sys/vfs.h> + +#include <vm/hat.h> +#include <vm/anon.h> +#include <vm/page.h> +#include <vm/seg.h> +#include <vm/pvn.h> +#include <vm/seg_kmem.h> +#include <vm/vm_dep.h> + +#include <fs/fs_subr.h> + +static int nopageage = 0; + +static pgcnt_t max_page_get; /* max page_get request size in pages */ +pgcnt_t total_pages = 0; /* total number of pages (used by /proc) */ + +/* + * vnode for all pages which are retired from the VM system; + * such as pages with Uncorrectable Errors. + */ +struct vnode retired_ppages; + +static void page_retired_init(void); +static void retired_dispose(vnode_t *vp, page_t *pp, int flag, + int dn, cred_t *cr); +static void retired_inactive(vnode_t *vp, cred_t *cr); +static void page_retired(page_t *pp); +static void retired_page_removed(page_t *pp); +void page_unretire_pages(void); + +/* + * The maximum number of pages that will be unretired in one iteration. + * This number is totally arbitrary. + */ +#define UNRETIRE_PAGES 256 + +/* + * We limit the number of pages that may be retired to + * a percentage of the total physical memory. Note that + * the percentage values are stored as 'basis points', + * ie, 100 basis points is 1%. + */ +#define MAX_PAGES_RETIRED_BPS_DEFAULT 10 /* .1% */ + +uint64_t max_pages_retired_bps = MAX_PAGES_RETIRED_BPS_DEFAULT; + +static int pages_retired_limit_exceeded(void); + +/* + * operations vector for vnode with retired pages. Only VOP_DISPOSE + * and VOP_INACTIVE are intercepted. 
+ */ +struct vnodeops retired_vnodeops = { + "retired_vnodeops", + fs_nosys, /* open */ + fs_nosys, /* close */ + fs_nosys, /* read */ + fs_nosys, /* write */ + fs_nosys, /* ioctl */ + fs_nosys, /* setfl */ + fs_nosys, /* getattr */ + fs_nosys, /* setattr */ + fs_nosys, /* access */ + fs_nosys, /* lookup */ + fs_nosys, /* create */ + fs_nosys, /* remove */ + fs_nosys, /* link */ + fs_nosys, /* rename */ + fs_nosys, /* mkdir */ + fs_nosys, /* rmdir */ + fs_nosys, /* readdir */ + fs_nosys, /* symlink */ + fs_nosys, /* readlink */ + fs_nosys, /* fsync */ + retired_inactive, + fs_nosys, /* fid */ + fs_rwlock, /* rwlock */ + fs_rwunlock, /* rwunlock */ + fs_nosys, /* seek */ + fs_nosys, /* cmp */ + fs_nosys, /* frlock */ + fs_nosys, /* space */ + fs_nosys, /* realvp */ + fs_nosys, /* getpage */ + fs_nosys, /* putpage */ + fs_nosys_map, + fs_nosys_addmap, + fs_nosys, /* delmap */ + fs_nosys_poll, + fs_nosys, /* dump */ + fs_nosys, /* l_pathconf */ + fs_nosys, /* pageio */ + fs_nosys, /* dumpctl */ + retired_dispose, + fs_nosys, /* setsecattr */ + fs_nosys, /* getsecatt */ + fs_nosys, /* shrlock */ + fs_vnevent_nosupport /* vnevent */ +}; + +/* + * freemem_lock protects all freemem variables: + * availrmem. Also this lock protects the globals which track the + * availrmem changes for accurate kernel footprint calculation. + * See below for an explanation of these + * globals. + */ +kmutex_t freemem_lock; +pgcnt_t availrmem; +pgcnt_t availrmem_initial; + +/* + * These globals track availrmem changes to get a more accurate + * estimate of tke kernel size. Historically pp_kernel is used for + * kernel size and is based on availrmem. But availrmem is adjusted for + * locked pages in the system not just for kernel locked pages. + * These new counters will track the pages locked through segvn and + * by explicit user locking. + * + * segvn_pages_locked : This keeps track on a global basis how many pages + * are currently locked because of I/O. + * + * pages_locked : How many pages are locked becuase of user specified + * locking through mlock or plock. + * + * pages_useclaim,pages_claimed : These two variables track the + * cliam adjustments because of the protection changes on a segvn segment. + * + * All these globals are protected by the same lock which protects availrmem. + */ +pgcnt_t segvn_pages_locked; +pgcnt_t pages_locked; +pgcnt_t pages_useclaim; +pgcnt_t pages_claimed; + + +/* + * new_freemem_lock protects freemem, freemem_wait & freemem_cv. + */ +static kmutex_t new_freemem_lock; +static uint_t freemem_wait; /* someone waiting for freemem */ +static kcondvar_t freemem_cv; + +/* + * The logical page free list is maintained as two lists, the 'free' + * and the 'cache' lists. + * The free list contains those pages that should be reused first. + * + * The implementation of the lists is machine dependent. + * page_get_freelist(), page_get_cachelist(), + * page_list_sub(), and page_list_add() + * form the interface to the machine dependent implementation. + * + * Pages with p_free set are on the cache list. + * Pages with p_free and p_age set are on the free list, + * + * A page may be locked while on either list. + */ + +/* + * free list accounting stuff. + * + * + * Spread out the value for the number of pages on the + * page free and page cache lists. If there is just one + * value, then it must be under just one lock. + * The lock contention and cache traffic are a real bother. 
+ *
+ * When we acquire and then drop a single pcf lock
+ * we can start in the middle of the array of pcf structures.
+ * If we acquire more than one pcf lock at a time, we need to
+ * start at the front to avoid deadlocking.
+ *
+ * pcf_count holds the number of pages in each pool.
+ *
+ * pcf_block is set when page_create_get_something() has asked the
+ * PSM page freelist and page cachelist routines without specifying
+ * a color and nothing came back. This is used to block anything
+ * else from moving pages from one list to the other while the
+ * lists are searched again. If a page is freed while pcf_block is
+ * set, then pcf_reserve is incremented. pcgs_unblock() takes care
+ * of clearing pcf_block, doing the wakeups, etc.
+ */
+
+#if NCPU <= 4
+#define PAD 1
+#define PCF_FANOUT 4
+static uint_t pcf_mask = PCF_FANOUT - 1;
+#else
+#define PAD 9
+#ifdef sun4v
+#define PCF_FANOUT 32
+#else
+#define PCF_FANOUT 128
+#endif
+static uint_t pcf_mask = PCF_FANOUT - 1;
+#endif
+
+struct pcf {
+ uint_t pcf_touch; /* just to help the cache */
+ uint_t pcf_count; /* page count */
+ kmutex_t pcf_lock; /* protects the structure */
+ uint_t pcf_wait; /* number of waiters */
+ uint_t pcf_block; /* pcgs flag to page_free() */
+ uint_t pcf_reserve; /* pages freed after pcf_block set */
+ uint_t pcf_fill[PAD]; /* to line up on the caches */
+};
+
+static struct pcf pcf[PCF_FANOUT];
+#define PCF_INDEX() ((CPU->cpu_id) & (pcf_mask))
+
+kmutex_t pcgs_lock; /* serializes page_create_get_ */
+kmutex_t pcgs_cagelock; /* serializes NOSLEEP cage allocs */
+kmutex_t pcgs_wait_lock; /* used for delay in pcgs */
+static kcondvar_t pcgs_cv; /* cv for delay in pcgs */
+
+#define PAGE_LOCK_MAXIMUM \
+ ((1 << (sizeof (((page_t *)0)->p_lckcnt) * NBBY)) - 1)
+
+/*
+ * Control over the verbosity of page retirement. When set to zero, no messages
+ * will be printed. A value of one will trigger messages for retirement
+ * operations, and is intended for processors which don't yet support FMA
+ * (spitfire). Two will cause verbose messages to be printed when retirements
+ * complete, and is intended only for debugging purposes.
+ */
+int page_retire_messages = 0;
+
+#ifdef VM_STATS
+
+/*
+ * No locks, but so what, they are only statistics.
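The fanout described above exists purely to spread lock traffic: a CPU normally touches only the bucket selected by PCF_INDEX(), and an approximate freemem is recovered by summing the buckets (see set_freemem() and get_freemem() later in this file). Below is a stand-alone model of that pattern, using pthread mutexes in place of kmutex_t and a caller-supplied id in place of CPU->cpu_id; the names and the fanout value of 4 are illustrative.

/* Illustrative user-level model of the pcf[] fanout; not kernel code. */
#include <pthread.h>
#include <stdio.h>

#define FANOUT	4			/* kernel uses 4, 32 or 128 */

struct bucket {
	pthread_mutex_t	lock;
	unsigned int	count;		/* pages accounted in this bucket */
};

static struct bucket buckets[FANOUT];

/* Equivalent of PCF_INDEX(): hash the caller onto one bucket. */
#define BUCKET_INDEX(id)	((id) & (FANOUT - 1))

static void
bucket_free_pages(unsigned int id, unsigned int npages)
{
	struct bucket *b = &buckets[BUCKET_INDEX(id)];

	pthread_mutex_lock(&b->lock);	/* only this bucket is contended */
	b->count += npages;
	pthread_mutex_unlock(&b->lock);
}

/* Like get_freemem(): sum the buckets for an approximate total. */
static unsigned long
bucket_total(void)
{
	unsigned long t = 0;
	int i;

	for (i = 0; i < FANOUT; i++)
		t += buckets[i].count;
	return (t);
}

int
main(void)
{
	int i;

	for (i = 0; i < FANOUT; i++)
		pthread_mutex_init(&buckets[i].lock, NULL);
	bucket_free_pages(0, 10);
	bucket_free_pages(3, 5);
	printf("total = %lu\n", bucket_total());	/* prints 15 */
	return (0);
}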
+ */ + +static struct page_tcnt { + int pc_free_cache; /* free's into cache list */ + int pc_free_dontneed; /* free's with dontneed */ + int pc_free_pageout; /* free's from pageout */ + int pc_free_free; /* free's into free list */ + int pc_free_pages; /* free's into large page free list */ + int pc_destroy_pages; /* large page destroy's */ + int pc_get_cache; /* get's from cache list */ + int pc_get_free; /* get's from free list */ + int pc_reclaim; /* reclaim's */ + int pc_abortfree; /* abort's of free pages */ + int pc_find_hit; /* find's that find page */ + int pc_find_miss; /* find's that don't find page */ + int pc_destroy_free; /* # of free pages destroyed */ +#define PC_HASH_CNT (4*PAGE_HASHAVELEN) + int pc_find_hashlen[PC_HASH_CNT+1]; + int pc_addclaim_pages; + int pc_subclaim_pages; + int pc_free_replacement_page[2]; + int pc_try_demote_pages[6]; + int pc_demote_pages[2]; +} pagecnt; + +uint_t hashin_count; +uint_t hashin_not_held; +uint_t hashin_already; + +uint_t hashout_count; +uint_t hashout_not_held; + +uint_t page_create_count; +uint_t page_create_not_enough; +uint_t page_create_not_enough_again; +uint_t page_create_zero; +uint_t page_create_hashout; +uint_t page_create_page_lock_failed; +uint_t page_create_trylock_failed; +uint_t page_create_found_one; +uint_t page_create_hashin_failed; +uint_t page_create_dropped_phm; + +uint_t page_create_new; +uint_t page_create_exists; +uint_t page_create_putbacks; +uint_t page_create_overshoot; + +uint_t page_reclaim_zero; +uint_t page_reclaim_zero_locked; + +uint_t page_rename_exists; +uint_t page_rename_count; + +uint_t page_lookup_cnt[20]; +uint_t page_lookup_nowait_cnt[10]; +uint_t page_find_cnt; +uint_t page_exists_cnt; +uint_t page_exists_forreal_cnt; +uint_t page_lookup_dev_cnt; +uint_t get_cachelist_cnt; +uint_t page_create_cnt[10]; +uint_t alloc_pages[8]; +uint_t page_exphcontg[19]; +uint_t page_create_large_cnt[10]; + +/* + * Collects statistics. 
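The PAGE_HASH_SEARCH macro defined just below walks one hash chain looking for a matching (vnode, offset) identity and, when VM_STATS is enabled, feeds the pc_find_hit, pc_find_miss and pc_find_hashlen counters above. A simplified stand-alone version of the same walk is sketched here over a toy node type; the struct and counter names are invented for the example.

/* Illustrative chain walk modeled on PAGE_HASH_SEARCH; not kernel code. */
#include <stdio.h>

struct node {
	const void		*vp;	/* identity: "vnode" */
	unsigned long long	off;	/* identity: offset */
	struct node		*next;	/* hash chain link */
};

static int hits, misses, longest;

static struct node *
chain_search(struct node *head, const void *vp, unsigned long long off)
{
	struct node *n;
	int len = 0;

	for (n = head; n != NULL; n = n->next, len++) {
		if (n->vp == vp && n->off == off)
			break;
	}
	if (n != NULL)
		hits++;
	else
		misses++;
	if (len > longest)
		longest = len;
	return (n);
}

int
main(void)
{
	int dummy_vnode;
	struct node b = { &dummy_vnode, 8192, NULL };
	struct node a = { &dummy_vnode, 0, &b };

	printf("found: %s, hits %d, misses %d, longest chain walk %d\n",
	    chain_search(&a, &dummy_vnode, 8192) != NULL ? "yes" : "no",
	    hits, misses, longest);
	return (0);
}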
+ */ +#define PAGE_HASH_SEARCH(index, pp, vp, off) { \ + uint_t mylen = 0; \ + \ + for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash, mylen++) { \ + if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \ + break; \ + } \ + if ((pp) != NULL) \ + pagecnt.pc_find_hit++; \ + else \ + pagecnt.pc_find_miss++; \ + if (mylen > PC_HASH_CNT) \ + mylen = PC_HASH_CNT; \ + pagecnt.pc_find_hashlen[mylen]++; \ +} + +#else /* VM_STATS */ + +/* + * Don't collect statistics + */ +#define PAGE_HASH_SEARCH(index, pp, vp, off) { \ + for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \ + if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \ + break; \ + } \ +} + +#endif /* VM_STATS */ + + + +#ifdef DEBUG +#define MEMSEG_SEARCH_STATS +#endif + +#ifdef MEMSEG_SEARCH_STATS +struct memseg_stats { + uint_t nsearch; + uint_t nlastwon; + uint_t nhashwon; + uint_t nnotfound; +} memseg_stats; + +#define MEMSEG_STAT_INCR(v) \ + atomic_add_32(&memseg_stats.v, 1) +#else +#define MEMSEG_STAT_INCR(x) +#endif + +struct memseg *memsegs; /* list of memory segments */ + + +static void page_init_mem_config(void); +static int page_do_hashin(page_t *, vnode_t *, u_offset_t); +static void page_do_hashout(page_t *); + +static void page_demote_vp_pages(page_t *); + +/* + * vm subsystem related initialization + */ +void +vm_init(void) +{ + boolean_t callb_vm_cpr(void *, int); + + (void) callb_add(callb_vm_cpr, 0, CB_CL_CPR_VM, "vm"); + page_init_mem_config(); + + /* + * initialise the vnode for retired pages + */ + page_retired_init(); +} + +/* + * This function is called at startup and when memory is added or deleted. + */ +void +init_pages_pp_maximum() +{ + static pgcnt_t p_min; + static pgcnt_t pages_pp_maximum_startup; + static pgcnt_t avrmem_delta; + static int init_done; + static int user_set; /* true if set in /etc/system */ + + if (init_done == 0) { + + /* If the user specified a value, save it */ + if (pages_pp_maximum != 0) { + user_set = 1; + pages_pp_maximum_startup = pages_pp_maximum; + } + + /* + * Setting of pages_pp_maximum is based first time + * on the value of availrmem just after the start-up + * allocations. To preserve this relationship at run + * time, use a delta from availrmem_initial. + */ + ASSERT(availrmem_initial >= availrmem); + avrmem_delta = availrmem_initial - availrmem; + + /* The allowable floor of pages_pp_maximum */ + p_min = tune.t_minarmem + 100; + + /* Make sure we don't come through here again. */ + init_done = 1; + } + /* + * Determine pages_pp_maximum, the number of currently available + * pages (availrmem) that can't be `locked'. If not set by + * the user, we set it to 4% of the currently available memory + * plus 4MB. + * But we also insist that it be greater than tune.t_minarmem; + * otherwise a process could lock down a lot of memory, get swapped + * out, and never have enough to get swapped back in. 
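The computation that follows implements the rule just described: the divisor of 25 is exactly 1/25 = 4%, and btop(4 * 1024 * 1024) adds the 4MB worth of pages. A worked example with assumed numbers (8K pages, roughly 2GB available after startup):

/*
 * Worked example of the 4% + 4MB floor described above.  The page size and
 * available-memory figure are assumptions for illustration only.
 */
#include <stdio.h>

int
main(void)
{
	unsigned long pagesize = 8192;			/* assumed */
	unsigned long avail_pages = 262144;		/* ~2GB / 8K, assumed */
	unsigned long four_mb_pages = (4UL * 1024 * 1024) / pagesize;	/* 512 */
	unsigned long pp_max = avail_pages / 25 + four_mb_pages;

	/* 262144 / 25 = 10485, plus 512 = 10997 pages (~86MB) kept unlockable. */
	printf("pages_pp_maximum ~= %lu pages\n", pp_max);
	return (0);
}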
+ */ + if (user_set) + pages_pp_maximum = pages_pp_maximum_startup; + else + pages_pp_maximum = ((availrmem_initial - avrmem_delta) / 25) + + btop(4 * 1024 * 1024); + + if (pages_pp_maximum <= p_min) { + pages_pp_maximum = p_min; + } +} + +void +set_max_page_get(pgcnt_t target_total_pages) +{ + max_page_get = target_total_pages / 2; +} + +static pgcnt_t pending_delete; + +/*ARGSUSED*/ +static void +page_mem_config_post_add( + void *arg, + pgcnt_t delta_pages) +{ + set_max_page_get(total_pages - pending_delete); + init_pages_pp_maximum(); +} + +/*ARGSUSED*/ +static int +page_mem_config_pre_del( + void *arg, + pgcnt_t delta_pages) +{ + pgcnt_t nv; + + nv = atomic_add_long_nv(&pending_delete, (spgcnt_t)delta_pages); + set_max_page_get(total_pages - nv); + return (0); +} + +/*ARGSUSED*/ +static void +page_mem_config_post_del( + void *arg, + pgcnt_t delta_pages, + int cancelled) +{ + pgcnt_t nv; + + nv = atomic_add_long_nv(&pending_delete, -(spgcnt_t)delta_pages); + set_max_page_get(total_pages - nv); + if (!cancelled) + init_pages_pp_maximum(); +} + +static kphysm_setup_vector_t page_mem_config_vec = { + KPHYSM_SETUP_VECTOR_VERSION, + page_mem_config_post_add, + page_mem_config_pre_del, + page_mem_config_post_del, +}; + +static void +page_init_mem_config(void) +{ + int ret; + + ret = kphysm_setup_func_register(&page_mem_config_vec, (void *)NULL); + ASSERT(ret == 0); +} + +/* + * Evenly spread out the PCF counters for large free pages + */ +static void +page_free_large_ctr(pgcnt_t npages) +{ + static struct pcf *p = pcf; + pgcnt_t lump; + + freemem += npages; + + lump = roundup(npages, PCF_FANOUT) / PCF_FANOUT; + + while (npages > 0) { + + ASSERT(!p->pcf_block); + + if (lump < npages) { + p->pcf_count += (uint_t)lump; + npages -= lump; + } else { + p->pcf_count += (uint_t)npages; + npages = 0; + } + + ASSERT(!p->pcf_wait); + + if (++p > &pcf[PCF_FANOUT - 1]) + p = pcf; + } + + ASSERT(npages == 0); +} + +/* + * Add a physical chunk of memory to the system freee lists during startup. + * Platform specific startup() allocates the memory for the page structs. + * + * num - number of page structures + * base - page number (pfn) to be associated with the first page. + * + * Since we are doing this during startup (ie. single threaded), we will + * use shortcut routines to avoid any locking overhead while putting all + * these pages on the freelists. + * + * NOTE: Any changes performed to page_free(), must also be performed to + * add_physmem() since this is how we initialize all page_t's at + * boot time. + */ +void +add_physmem( + page_t *pp, + pgcnt_t num, + pfn_t pnum) +{ + page_t *root = NULL; + uint_t szc = page_num_pagesizes() - 1; + pgcnt_t large = page_get_pagecnt(szc); + pgcnt_t cnt = 0; + + TRACE_2(TR_FAC_VM, TR_PAGE_INIT, + "add_physmem:pp %p num %lu", pp, num); + + /* + * Arbitrarily limit the max page_get request + * to 1/2 of the page structs we have. + */ + total_pages += num; + set_max_page_get(total_pages); + + /* + * The physical space for the pages array + * representing ram pages has already been + * allocated. Here we initialize each lock + * in the page structure, and put each on + * the free list + */ + for (; num; pp = page_next_raw(pp), pnum++, num--) { + + /* + * this needs to fill in the page number + * and do any other arch specific initialization + */ + add_physmem_cb(pp, pnum); + + /* + * Initialize the page lock as unlocked, since nobody + * can see or access this page yet. 
+ */ + pp->p_selock = 0; + + /* + * Initialize IO lock + */ + page_iolock_init(pp); + + /* + * initialize other fields in the page_t + */ + PP_SETFREE(pp); + page_clr_all_props(pp); + PP_SETAGED(pp); + pp->p_offset = (u_offset_t)-1; + pp->p_next = pp; + pp->p_prev = pp; + + /* + * Simple case: System doesn't support large pages. + */ + if (szc == 0) { + pp->p_szc = 0; + page_free_at_startup(pp); + continue; + } + + /* + * Handle unaligned pages, we collect them up onto + * the root page until we have a full large page. + */ + if (!IS_P2ALIGNED(pnum, large)) { + + /* + * If not in a large page, + * just free as small page. + */ + if (root == NULL) { + pp->p_szc = 0; + page_free_at_startup(pp); + continue; + } + + /* + * Link a constituent page into the large page. + */ + pp->p_szc = szc; + page_list_concat(&root, &pp); + + /* + * When large page is fully formed, free it. + */ + if (++cnt == large) { + page_free_large_ctr(cnt); + page_list_add_pages(root, PG_LIST_ISINIT); + root = NULL; + cnt = 0; + } + continue; + } + + /* + * At this point we have a page number which + * is aligned. We assert that we aren't already + * in a different large page. + */ + ASSERT(IS_P2ALIGNED(pnum, large)); + ASSERT(root == NULL && cnt == 0); + + /* + * If insufficient number of pages left to form + * a large page, just free the small page. + */ + if (num < large) { + pp->p_szc = 0; + page_free_at_startup(pp); + continue; + } + + /* + * Otherwise start a new large page. + */ + pp->p_szc = szc; + cnt++; + root = pp; + } + ASSERT(root == NULL && cnt == 0); +} + +/* + * Find a page representing the specified [vp, offset]. + * If we find the page but it is intransit coming in, + * it will have an "exclusive" lock and we wait for + * the i/o to complete. A page found on the free list + * is always reclaimed and then locked. On success, the page + * is locked, its data is valid and it isn't on the free + * list, while a NULL is returned if the page doesn't exist. + */ +page_t * +page_lookup(vnode_t *vp, u_offset_t off, se_t se) +{ + return (page_lookup_create(vp, off, se, NULL, NULL, 0)); +} + +/* + * Find a page representing the specified [vp, offset]. + * We either return the one we found or, if passed in, + * create one with identity of [vp, offset] of the + * pre-allocated page. If we find exsisting page but it is + * intransit coming in, it will have an "exclusive" lock + * and we wait for the i/o to complete. A page found on + * the free list is always reclaimed and then locked. + * On success, the page is locked, its data is valid and + * it isn't on the free list, while a NULL is returned + * if the page doesn't exist and newpp is NULL; + */ +page_t * +page_lookup_create( + vnode_t *vp, + u_offset_t off, + se_t se, + page_t *newpp, + spgcnt_t *nrelocp, + int flags) +{ + page_t *pp; + kmutex_t *phm; + ulong_t index; + uint_t hash_locked; + uint_t es; + + ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); + VM_STAT_ADD(page_lookup_cnt[0]); + ASSERT(newpp ? PAGE_EXCL(newpp) : 1); + + /* + * Acquire the appropriate page hash lock since + * we have to search the hash list. Pages that + * hash to this list can't change identity while + * this lock is held. + */ + hash_locked = 0; + index = PAGE_HASH_FUNC(vp, off); + phm = NULL; +top: + PAGE_HASH_SEARCH(index, pp, vp, off); + if (pp != NULL) { + VM_STAT_ADD(page_lookup_cnt[1]); + es = (newpp != NULL) ? 1 : 0; + es |= flags; + if (!hash_locked) { + VM_STAT_ADD(page_lookup_cnt[2]); + if (!page_try_reclaim_lock(pp, se, es)) { + /* + * On a miss, acquire the phm. 
Then + * next time, page_lock() will be called, + * causing a wait if the page is busy. + * just looping with page_trylock() would + * get pretty boring. + */ + VM_STAT_ADD(page_lookup_cnt[3]); + phm = PAGE_HASH_MUTEX(index); + mutex_enter(phm); + hash_locked = 1; + goto top; + } + } else { + VM_STAT_ADD(page_lookup_cnt[4]); + if (!page_lock_es(pp, se, phm, P_RECLAIM, es)) { + VM_STAT_ADD(page_lookup_cnt[5]); + goto top; + } + } + + /* + * Since `pp' is locked it can not change identity now. + * Reconfirm we locked the correct page. + * + * Both the p_vnode and p_offset *must* be cast volatile + * to force a reload of their values: The PAGE_HASH_SEARCH + * macro will have stuffed p_vnode and p_offset into + * registers before calling page_trylock(); another thread, + * actually holding the hash lock, could have changed the + * page's identity in memory, but our registers would not + * be changed, fooling the reconfirmation. If the hash + * lock was held during the search, the casting would + * not be needed. + */ + VM_STAT_ADD(page_lookup_cnt[6]); + if (((volatile struct vnode *)(pp->p_vnode) != vp) || + ((volatile u_offset_t)(pp->p_offset) != off)) { + VM_STAT_ADD(page_lookup_cnt[7]); + if (hash_locked) { + panic("page_lookup_create: lost page %p", + (void *)pp); + /*NOTREACHED*/ + } + page_unlock(pp); + phm = PAGE_HASH_MUTEX(index); + mutex_enter(phm); + hash_locked = 1; + goto top; + } + + /* + * If page_trylock() was called, then pp may still be on + * the cachelist (can't be on the free list, it would not + * have been found in the search). If it is on the + * cachelist it must be pulled now. To pull the page from + * the cachelist, it must be exclusively locked. + * + * The other big difference between page_trylock() and + * page_lock(), is that page_lock() will pull the + * page from whatever free list (the cache list in this + * case) the page is on. If page_trylock() was used + * above, then we have to do the reclaim ourselves. + */ + if ((!hash_locked) && (PP_ISFREE(pp))) { + ASSERT(PP_ISAGED(pp) == 0); + VM_STAT_ADD(page_lookup_cnt[8]); + + /* + * page_relcaim will insure that we + * have this page exclusively + */ + + if (!page_reclaim(pp, NULL)) { + /* + * Page_reclaim dropped whatever lock + * we held. 
+ */ + VM_STAT_ADD(page_lookup_cnt[9]); + phm = PAGE_HASH_MUTEX(index); + mutex_enter(phm); + hash_locked = 1; + goto top; + } else if (se == SE_SHARED && newpp == NULL) { + VM_STAT_ADD(page_lookup_cnt[10]); + page_downgrade(pp); + } + } + + if (hash_locked) { + mutex_exit(phm); + } + + if (newpp != NULL && pp->p_szc < newpp->p_szc && + PAGE_EXCL(pp) && nrelocp != NULL) { + ASSERT(nrelocp != NULL); + (void) page_relocate(&pp, &newpp, 1, 1, nrelocp, + NULL); + if (*nrelocp > 0) { + VM_STAT_COND_ADD(*nrelocp == 1, + page_lookup_cnt[11]); + VM_STAT_COND_ADD(*nrelocp > 1, + page_lookup_cnt[12]); + pp = newpp; + se = SE_EXCL; + } else { + if (se == SE_SHARED) { + page_downgrade(pp); + } + VM_STAT_ADD(page_lookup_cnt[13]); + } + } else if (newpp != NULL && nrelocp != NULL) { + if (PAGE_EXCL(pp) && se == SE_SHARED) { + page_downgrade(pp); + } + VM_STAT_COND_ADD(pp->p_szc < newpp->p_szc, + page_lookup_cnt[14]); + VM_STAT_COND_ADD(pp->p_szc == newpp->p_szc, + page_lookup_cnt[15]); + VM_STAT_COND_ADD(pp->p_szc > newpp->p_szc, + page_lookup_cnt[16]); + } else if (newpp != NULL && PAGE_EXCL(pp)) { + se = SE_EXCL; + } + } else if (!hash_locked) { + VM_STAT_ADD(page_lookup_cnt[17]); + phm = PAGE_HASH_MUTEX(index); + mutex_enter(phm); + hash_locked = 1; + goto top; + } else if (newpp != NULL) { + /* + * If we have a preallocated page then + * insert it now and basically behave like + * page_create. + */ + VM_STAT_ADD(page_lookup_cnt[18]); + /* + * Since we hold the page hash mutex and + * just searched for this page, page_hashin + * had better not fail. If it does, that + * means some thread did not follow the + * page hash mutex rules. Panic now and + * get it over with. As usual, go down + * holding all the locks. + */ + ASSERT(MUTEX_HELD(phm)); + if (!page_hashin(newpp, vp, off, phm)) { + ASSERT(MUTEX_HELD(phm)); + panic("page_lookup_create: hashin failed %p %p %llx %p", + (void *)newpp, (void *)vp, off, (void *)phm); + /*NOTREACHED*/ + } + ASSERT(MUTEX_HELD(phm)); + mutex_exit(phm); + phm = NULL; + page_set_props(newpp, P_REF); + page_io_lock(newpp); + pp = newpp; + se = SE_EXCL; + } else { + VM_STAT_ADD(page_lookup_cnt[19]); + mutex_exit(phm); + } + + ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1); + + ASSERT(pp ? ((PP_ISFREE(pp) == 0) && (PP_ISAGED(pp) == 0)) : 1); + + return (pp); +} + +/* + * Search the hash list for the page representing the + * specified [vp, offset] and return it locked. Skip + * free pages and pages that cannot be locked as requested. + * Used while attempting to kluster pages. 
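Both page_lookup_create() above and page_lookup_nowait() below rely on the same optimistic pattern: find a candidate without the hash lock, lock the candidate, then re-verify its identity, falling back to a search under the hash mutex on a mismatch. The stand-alone sketch below keeps only that skeleton; the types and the single hash_lock are illustrative and ignore the reclaim and shared/exclusive-lock details handled by the real code.

/* Illustrative lookup-then-reverify pattern; not kernel code. */
#include <pthread.h>
#include <stdio.h>

struct obj {
	pthread_mutex_t		lock;
	const void		*vp;
	unsigned long long	off;
};

static pthread_mutex_t hash_lock = PTHREAD_MUTEX_INITIALIZER;

static struct obj *
lookup(struct obj *candidate, const void *vp, unsigned long long off)
{
	pthread_mutex_lock(&candidate->lock);
	/* Re-verify identity now that the object cannot change under us. */
	if (candidate->vp != vp || candidate->off != off) {
		pthread_mutex_unlock(&candidate->lock);
		/* Mismatch: redo the search with the hash lock held. */
		pthread_mutex_lock(&hash_lock);
		/* ... a real caller would search the chain again here ... */
		pthread_mutex_unlock(&hash_lock);
		return (NULL);
	}
	return (candidate);		/* returned locked */
}

int
main(void)
{
	int vnode;
	struct obj o = { PTHREAD_MUTEX_INITIALIZER, &vnode, 4096 };

	if (lookup(&o, &vnode, 4096) != NULL) {
		printf("identity confirmed under the object lock\n");
		pthread_mutex_unlock(&o.lock);
	}
	return (0);
}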
+ */ +page_t * +page_lookup_nowait(vnode_t *vp, u_offset_t off, se_t se) +{ + page_t *pp; + kmutex_t *phm; + ulong_t index; + uint_t locked; + + ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); + VM_STAT_ADD(page_lookup_nowait_cnt[0]); + + index = PAGE_HASH_FUNC(vp, off); + PAGE_HASH_SEARCH(index, pp, vp, off); + locked = 0; + if (pp == NULL) { +top: + VM_STAT_ADD(page_lookup_nowait_cnt[1]); + locked = 1; + phm = PAGE_HASH_MUTEX(index); + mutex_enter(phm); + PAGE_HASH_SEARCH(index, pp, vp, off); + } + + if (pp == NULL || PP_ISFREE(pp)) { + VM_STAT_ADD(page_lookup_nowait_cnt[2]); + pp = NULL; + } else { + if (!page_trylock(pp, se)) { + VM_STAT_ADD(page_lookup_nowait_cnt[3]); + pp = NULL; + } else { + VM_STAT_ADD(page_lookup_nowait_cnt[4]); + /* + * See the comment in page_lookup() + */ + if (((volatile struct vnode *)(pp->p_vnode) != vp) || + ((u_offset_t)(pp->p_offset) != off)) { + VM_STAT_ADD(page_lookup_nowait_cnt[5]); + if (locked) { + panic("page_lookup_nowait %p", + (void *)pp); + /*NOTREACHED*/ + } + page_unlock(pp); + goto top; + } + if (PP_ISFREE(pp)) { + VM_STAT_ADD(page_lookup_nowait_cnt[6]); + page_unlock(pp); + pp = NULL; + } + } + } + if (locked) { + VM_STAT_ADD(page_lookup_nowait_cnt[7]); + mutex_exit(phm); + } + + ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1); + + return (pp); +} + +/* + * Search the hash list for a page with the specified [vp, off] + * that is known to exist and is already locked. This routine + * is typically used by segment SOFTUNLOCK routines. + */ +page_t * +page_find(vnode_t *vp, u_offset_t off) +{ + page_t *pp; + kmutex_t *phm; + ulong_t index; + + ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); + VM_STAT_ADD(page_find_cnt); + + index = PAGE_HASH_FUNC(vp, off); + phm = PAGE_HASH_MUTEX(index); + + mutex_enter(phm); + PAGE_HASH_SEARCH(index, pp, vp, off); + mutex_exit(phm); + + ASSERT(pp != NULL); + ASSERT(PAGE_LOCKED(pp) || panicstr); + return (pp); +} + +/* + * Determine whether a page with the specified [vp, off] + * currently exists in the system. Obviously this should + * only be considered as a hint since nothing prevents the + * page from disappearing or appearing immediately after + * the return from this routine. Subsequently, we don't + * even bother to lock the list. + */ +page_t * +page_exists(vnode_t *vp, u_offset_t off) +{ + page_t *pp; + ulong_t index; + + ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); + VM_STAT_ADD(page_exists_cnt); + + index = PAGE_HASH_FUNC(vp, off); + PAGE_HASH_SEARCH(index, pp, vp, off); + + return (pp); +} + +/* + * Determine if physically contiguous pages exist for [vp, off] - [vp, off + + * page_size(szc)) range. if they exist and ppa is not NULL fill ppa array + * with these pages locked SHARED. If necessary reclaim pages from + * freelist. Return 1 if contiguous pages exist and 0 otherwise. + * + * If we fail to lock pages still return 1 if pages exist and contiguous. + * But in this case return value is just a hint. ppa array won't be filled. + * Caller should initialize ppa[0] as NULL to distinguish return value. + * + * Returns 0 if pages don't exist or not physically contiguous. + * + * This routine doesn't work for anonymous(swapfs) pages. 
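Several checks in page_exists_physcontig() below hinge on whether the starting page frame number is aligned to the large-page size; IS_P2ALIGNED(pfn, pages) is effectively the usual power-of-two test (x & (align - 1)) == 0. A tiny illustration, assuming 8K base pages and a 4MB large page (512 constituent pages):

/* Illustrative power-of-two alignment test; only valid for power-of-two sizes. */
#include <stdio.h>

#define ALIGNED(x, align)	(((x) & ((align) - 1)) == 0)

int
main(void)
{
	unsigned long pages = 512;	/* 4MB large page of 8K pages, assumed */

	/* pfn 1024 can start a large page; pfn 1025 cannot: prints "1 0". */
	printf("%d %d\n", ALIGNED(1024UL, pages), ALIGNED(1025UL, pages));
	return (0);
}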
+ */ +int +page_exists_physcontig(vnode_t *vp, u_offset_t off, uint_t szc, page_t *ppa[]) +{ + pgcnt_t pages; + pfn_t pfn; + page_t *rootpp; + pgcnt_t i; + pgcnt_t j; + u_offset_t save_off = off; + ulong_t index; + kmutex_t *phm; + page_t *pp; + uint_t pszc; + int loopcnt = 0; + + ASSERT(szc != 0); + ASSERT(vp != NULL); + ASSERT(!IS_SWAPFSVP(vp)); + ASSERT(vp != &kvp); + +again: + if (++loopcnt > 3) { + VM_STAT_ADD(page_exphcontg[0]); + return (0); + } + + index = PAGE_HASH_FUNC(vp, off); + phm = PAGE_HASH_MUTEX(index); + + mutex_enter(phm); + PAGE_HASH_SEARCH(index, pp, vp, off); + mutex_exit(phm); + + VM_STAT_ADD(page_exphcontg[1]); + + if (pp == NULL) { + VM_STAT_ADD(page_exphcontg[2]); + return (0); + } + + pages = page_get_pagecnt(szc); + rootpp = pp; + pfn = rootpp->p_pagenum; + + if ((pszc = pp->p_szc) >= szc && ppa != NULL) { + VM_STAT_ADD(page_exphcontg[3]); + if (!page_trylock(pp, SE_SHARED)) { + VM_STAT_ADD(page_exphcontg[4]); + return (1); + } + if (pp->p_szc != pszc || pp->p_vnode != vp || + pp->p_offset != off) { + VM_STAT_ADD(page_exphcontg[5]); + page_unlock(pp); + off = save_off; + goto again; + } + /* + * szc was non zero and vnode and offset matched after we + * locked the page it means it can't become free on us. + */ + ASSERT(!PP_ISFREE(pp)); + if (!IS_P2ALIGNED(pfn, pages)) { + page_unlock(pp); + return (0); + } + ppa[0] = pp; + pp++; + off += PAGESIZE; + pfn++; + for (i = 1; i < pages; i++, pp++, off += PAGESIZE, pfn++) { + if (!page_trylock(pp, SE_SHARED)) { + VM_STAT_ADD(page_exphcontg[6]); + pp--; + while (i-- > 0) { + page_unlock(pp); + pp--; + } + ppa[0] = NULL; + return (1); + } + if (pp->p_szc != pszc) { + VM_STAT_ADD(page_exphcontg[7]); + page_unlock(pp); + pp--; + while (i-- > 0) { + page_unlock(pp); + pp--; + } + ppa[0] = NULL; + off = save_off; + goto again; + } + /* + * szc the same as for previous already locked pages + * with right identity. Since this page had correct + * szc after we locked it can't get freed or destroyed + * and therefore must have the expected identity. + */ + ASSERT(!PP_ISFREE(pp)); + if (pp->p_vnode != vp || + pp->p_offset != off) { + panic("page_exists_physcontig: " + "large page identity doesn't match"); + } + ppa[i] = pp; + ASSERT(pp->p_pagenum == pfn); + } + VM_STAT_ADD(page_exphcontg[8]); + ppa[pages] = NULL; + return (1); + } else if (pszc >= szc) { + VM_STAT_ADD(page_exphcontg[9]); + if (!IS_P2ALIGNED(pfn, pages)) { + return (0); + } + return (1); + } + + if (!IS_P2ALIGNED(pfn, pages)) { + VM_STAT_ADD(page_exphcontg[10]); + return (0); + } + + if (page_numtomemseg_nolock(pfn) != + page_numtomemseg_nolock(pfn + pages - 1)) { + VM_STAT_ADD(page_exphcontg[11]); + return (0); + } + + /* + * We loop up 4 times across pages to promote page size. + * We're extra cautious to promote page size atomically with respect + * to everybody else. But we can probably optimize into 1 loop if + * this becomes an issue. 
+ */ + + for (i = 0; i < pages; i++, pp++, off += PAGESIZE, pfn++) { + ASSERT(pp->p_pagenum == pfn); + if (!page_trylock(pp, SE_EXCL)) { + VM_STAT_ADD(page_exphcontg[12]); + break; + } + if (pp->p_vnode != vp || + pp->p_offset != off) { + VM_STAT_ADD(page_exphcontg[13]); + page_unlock(pp); + break; + } + if (pp->p_szc >= szc) { + ASSERT(i == 0); + page_unlock(pp); + off = save_off; + goto again; + } + } + + if (i != pages) { + VM_STAT_ADD(page_exphcontg[14]); + --pp; + while (i-- > 0) { + page_unlock(pp); + --pp; + } + return (0); + } + + pp = rootpp; + for (i = 0; i < pages; i++, pp++) { + if (PP_ISFREE(pp)) { + VM_STAT_ADD(page_exphcontg[15]); + ASSERT(!PP_ISAGED(pp)); + ASSERT(pp->p_szc == 0); + if (!page_reclaim(pp, NULL)) { + break; + } + } else { + ASSERT(pp->p_szc < szc); + VM_STAT_ADD(page_exphcontg[16]); + (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); + } + } + if (i < pages) { + VM_STAT_ADD(page_exphcontg[17]); + /* + * page_reclaim failed because we were out of memory. + * drop the rest of the locks and return because this page + * must be already reallocated anyway. + */ + pp = rootpp; + for (j = 0; j < pages; j++, pp++) { + if (j != i) { + page_unlock(pp); + } + } + return (0); + } + + off = save_off; + pp = rootpp; + for (i = 0; i < pages; i++, pp++, off += PAGESIZE) { + ASSERT(PAGE_EXCL(pp)); + ASSERT(!PP_ISFREE(pp)); + ASSERT(!hat_page_is_mapped(pp)); + ASSERT(pp->p_vnode == vp); + ASSERT(pp->p_offset == off); + pp->p_szc = szc; + } + pp = rootpp; + for (i = 0; i < pages; i++, pp++) { + if (ppa == NULL) { + page_unlock(pp); + } else { + ppa[i] = pp; + page_downgrade(ppa[i]); + } + } + if (ppa != NULL) { + ppa[pages] = NULL; + } + VM_STAT_ADD(page_exphcontg[18]); + ASSERT(vp->v_pages != NULL); + return (1); +} + +/* + * Determine whether a page with the specified [vp, off] + * currently exists in the system and if so return its + * size code. Obviously this should only be considered as + * a hint since nothing prevents the page from disappearing + * or appearing immediately after the return from this routine. + */ +int +page_exists_forreal(vnode_t *vp, u_offset_t off, uint_t *szc) +{ + page_t *pp; + kmutex_t *phm; + ulong_t index; + int rc = 0; + + ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); + ASSERT(szc != NULL); + VM_STAT_ADD(page_exists_forreal_cnt); + + index = PAGE_HASH_FUNC(vp, off); + phm = PAGE_HASH_MUTEX(index); + + mutex_enter(phm); + PAGE_HASH_SEARCH(index, pp, vp, off); + if (pp != NULL) { + *szc = pp->p_szc; + rc = 1; + } + mutex_exit(phm); + return (rc); +} + +/* wakeup threads waiting for pages in page_create_get_something() */ +void +wakeup_pcgs(void) +{ + if (!CV_HAS_WAITERS(&pcgs_cv)) + return; + cv_broadcast(&pcgs_cv); +} + +/* + * 'freemem' is used all over the kernel as an indication of how many + * pages are free (either on the cache list or on the free page list) + * in the system. In very few places is a really accurate 'freemem' + * needed. To avoid contention of the lock protecting a the + * single freemem, it was spread out into NCPU buckets. Set_freemem + * sets freemem to the total of all NCPU buckets. It is called from + * clock() on each TICK. + */ +void +set_freemem() +{ + struct pcf *p; + ulong_t t; + uint_t i; + + t = 0; + p = pcf; + for (i = 0; i < PCF_FANOUT; i++) { + t += p->pcf_count; + p++; + } + freemem = t; + + /* + * Don't worry about grabbing mutex. It's not that + * critical if we miss a tick or two. This is + * where we wakeup possible delayers in + * page_create_get_something(). 
+ */ + wakeup_pcgs(); +} + +ulong_t +get_freemem() +{ + struct pcf *p; + ulong_t t; + uint_t i; + + t = 0; + p = pcf; + for (i = 0; i < PCF_FANOUT; i++) { + t += p->pcf_count; + p++; + } + /* + * We just calculated it, might as well set it. + */ + freemem = t; + return (t); +} + +/* + * Acquire all of the page cache & free (pcf) locks. + */ +void +pcf_acquire_all() +{ + struct pcf *p; + uint_t i; + + p = pcf; + for (i = 0; i < PCF_FANOUT; i++) { + p->pcf_touch = 1; + mutex_enter(&p->pcf_lock); + p++; + } +} + +/* + * Release all the pcf_locks. + */ +void +pcf_release_all() +{ + struct pcf *p; + uint_t i; + + p = pcf; + for (i = 0; i < PCF_FANOUT; i++) { + mutex_exit(&p->pcf_lock); + p++; + } +} + +/* + * Inform the VM system that we need some pages freed up. + * Calls must be symmetric, e.g.: + * + * page_needfree(100); + * wait a bit; + * page_needfree(-100); + */ +void +page_needfree(spgcnt_t npages) +{ + mutex_enter(&new_freemem_lock); + needfree += npages; + mutex_exit(&new_freemem_lock); +} + +/* + * Throttle for page_create(): try to prevent freemem from dropping + * below throttlefree. We can't provide a 100% guarantee because + * KM_NOSLEEP allocations, page_reclaim(), and various other things + * nibble away at the freelist. However, we can block all PG_WAIT + * allocations until memory becomes available. The motivation is + * that several things can fall apart when there's no free memory: + * + * (1) If pageout() needs memory to push a page, the system deadlocks. + * + * (2) By (broken) specification, timeout(9F) can neither fail nor + * block, so it has no choice but to panic the system if it + * cannot allocate a callout structure. + * + * (3) Like timeout(), ddi_set_callback() cannot fail and cannot block; + * it panics if it cannot allocate a callback structure. + * + * (4) Untold numbers of third-party drivers have not yet been hardened + * against KM_NOSLEEP and/or allocb() failures; they simply assume + * success and panic the system with a data fault on failure. + * (The long-term solution to this particular problem is to ship + * hostile fault-injecting DEBUG kernels with the DDK.) + * + * It is theoretically impossible to guarantee success of non-blocking + * allocations, but in practice, this throttle is very hard to break. + */ +static int +page_create_throttle(pgcnt_t npages, int flags) +{ + ulong_t fm; + uint_t i; + pgcnt_t tf; /* effective value of throttlefree */ + + /* + * Never deny pages when: + * - it's a thread that cannot block [NOMEMWAIT()] + * - the allocation cannot block and must not fail + * - the allocation cannot block and is pageout dispensated + */ + if (NOMEMWAIT() || + ((flags & (PG_WAIT | PG_PANIC)) == PG_PANIC) || + ((flags & (PG_WAIT | PG_PUSHPAGE)) == PG_PUSHPAGE)) + return (1); + + /* + * If the allocation can't block, we look favorably upon it + * unless we're below pageout_reserve. In that case we fail + * the allocation because we want to make sure there are a few + * pages available for pageout. + */ + if ((flags & PG_WAIT) == 0) + return (freemem >= npages + pageout_reserve); + + /* Calculate the effective throttlefree value */ + tf = throttlefree - + ((flags & PG_PUSHPAGE) ? 
pageout_reserve : 0); + + cv_signal(&proc_pageout->p_cv); + + while (freemem < npages + tf) { + pcf_acquire_all(); + mutex_enter(&new_freemem_lock); + fm = 0; + for (i = 0; i < PCF_FANOUT; i++) { + fm += pcf[i].pcf_count; + pcf[i].pcf_wait++; + mutex_exit(&pcf[i].pcf_lock); + } + freemem = fm; + needfree += npages; + freemem_wait++; + cv_wait(&freemem_cv, &new_freemem_lock); + freemem_wait--; + needfree -= npages; + mutex_exit(&new_freemem_lock); + } + return (1); +} + +/* + * page_create_wait() is called to either coalecse pages from the + * different pcf buckets or to wait because there simply are not + * enough pages to satisfy the caller's request. + * + * Sadly, this is called from platform/vm/vm_machdep.c + */ +int +page_create_wait(size_t npages, uint_t flags) +{ + pgcnt_t total; + uint_t i; + struct pcf *p; + + /* + * Wait until there are enough free pages to satisfy our + * entire request. + * We set needfree += npages before prodding pageout, to make sure + * it does real work when npages > lotsfree > freemem. + */ + VM_STAT_ADD(page_create_not_enough); + + ASSERT(!kcage_on ? !(flags & PG_NORELOC) : 1); +checkagain: + if ((flags & PG_NORELOC) && + kcage_freemem < kcage_throttlefree + npages) + (void) kcage_create_throttle(npages, flags); + + if (freemem < npages + throttlefree) + if (!page_create_throttle(npages, flags)) + return (0); + + /* + * Since page_create_va() looked at every + * bucket, assume we are going to have to wait. + * Get all of the pcf locks. + */ + total = 0; + p = pcf; + for (i = 0; i < PCF_FANOUT; i++) { + p->pcf_touch = 1; + mutex_enter(&p->pcf_lock); + total += p->pcf_count; + if (total >= npages) { + /* + * Wow! There are enough pages laying around + * to satisfy the request. Do the accounting, + * drop the locks we acquired, and go back. + * + * freemem is not protected by any lock. So, + * we cannot have any assertion containing + * freemem. + */ + freemem -= npages; + + while (p >= pcf) { + if (p->pcf_count <= npages) { + npages -= p->pcf_count; + p->pcf_count = 0; + } else { + p->pcf_count -= (uint_t)npages; + npages = 0; + } + mutex_exit(&p->pcf_lock); + p--; + } + ASSERT(npages == 0); + return (1); + } + p++; + } + + /* + * All of the pcf locks are held, there are not enough pages + * to satisfy the request (npages < total). + * Be sure to acquire the new_freemem_lock before dropping + * the pcf locks. This prevents dropping wakeups in page_free(). + * The order is always pcf_lock then new_freemem_lock. + * + * Since we hold all the pcf locks, it is a good time to set freemem. + * + * If the caller does not want to wait, return now. + * Else turn the pageout daemon loose to find something + * and wait till it does. + * + */ + freemem = total; + + if ((flags & PG_WAIT) == 0) { + pcf_release_all(); + + TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_NOMEM, + "page_create_nomem:npages %ld freemem %ld", npages, freemem); + return (0); + } + + ASSERT(proc_pageout != NULL); + cv_signal(&proc_pageout->p_cv); + + TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_START, + "page_create_sleep_start: freemem %ld needfree %ld", + freemem, needfree); + + /* + * We are going to wait. + * We currently hold all of the pcf_locks, + * get the new_freemem_lock (it protects freemem_wait), + * before dropping the pcf_locks. 
+ */ + mutex_enter(&new_freemem_lock); + + p = pcf; + for (i = 0; i < PCF_FANOUT; i++) { + p->pcf_wait++; + mutex_exit(&p->pcf_lock); + p++; + } + + needfree += npages; + freemem_wait++; + + cv_wait(&freemem_cv, &new_freemem_lock); + + freemem_wait--; + needfree -= npages; + + mutex_exit(&new_freemem_lock); + + TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_END, + "page_create_sleep_end: freemem %ld needfree %ld", + freemem, needfree); + + VM_STAT_ADD(page_create_not_enough_again); + goto checkagain; +} + +/* + * A routine to do the opposite of page_create_wait(). + */ +void +page_create_putback(spgcnt_t npages) +{ + struct pcf *p; + pgcnt_t lump; + uint_t *which; + + /* + * When a contiguous lump is broken up, we have to + * deal with lots of pages (min 64) so lets spread + * the wealth around. + */ + lump = roundup(npages, PCF_FANOUT) / PCF_FANOUT; + freemem += npages; + + for (p = pcf; (npages > 0) && (p < &pcf[PCF_FANOUT]); p++) { + which = &p->pcf_count; + + mutex_enter(&p->pcf_lock); + + if (p->pcf_block) { + which = &p->pcf_reserve; + } + + if (lump < npages) { + *which += (uint_t)lump; + npages -= lump; + } else { + *which += (uint_t)npages; + npages = 0; + } + + if (p->pcf_wait) { + mutex_enter(&new_freemem_lock); + /* + * Check to see if some other thread + * is actually waiting. Another bucket + * may have woken it up by now. If there + * are no waiters, then set our pcf_wait + * count to zero to avoid coming in here + * next time. + */ + if (freemem_wait) { + if (npages > 1) { + cv_broadcast(&freemem_cv); + } else { + cv_signal(&freemem_cv); + } + p->pcf_wait--; + } else { + p->pcf_wait = 0; + } + mutex_exit(&new_freemem_lock); + } + mutex_exit(&p->pcf_lock); + } + ASSERT(npages == 0); +} + +/* + * A helper routine for page_create_get_something. + * The indenting got to deep down there. + * Unblock the pcf counters. Any pages freed after + * pcf_block got set are moved to pcf_count and + * wakeups (cv_broadcast() or cv_signal()) are done as needed. + */ +static void +pcgs_unblock(void) +{ + int i; + struct pcf *p; + + /* Update freemem while we're here. */ + freemem = 0; + p = pcf; + for (i = 0; i < PCF_FANOUT; i++) { + mutex_enter(&p->pcf_lock); + ASSERT(p->pcf_count == 0); + p->pcf_count = p->pcf_reserve; + p->pcf_block = 0; + freemem += p->pcf_count; + if (p->pcf_wait) { + mutex_enter(&new_freemem_lock); + if (freemem_wait) { + if (p->pcf_reserve > 1) { + cv_broadcast(&freemem_cv); + p->pcf_wait = 0; + } else { + cv_signal(&freemem_cv); + p->pcf_wait--; + } + } else { + p->pcf_wait = 0; + } + mutex_exit(&new_freemem_lock); + } + p->pcf_reserve = 0; + mutex_exit(&p->pcf_lock); + p++; + } +} + +/* + * Called from page_create_va() when both the cache and free lists + * have been checked once. + * + * Either returns a page or panics since the accounting was done + * way before we got here. + * + * We don't come here often, so leave the accounting on permanently. 
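The admission policy of page_create_throttle() earlier in this file reduces to three cases: critical requests are never denied, non-blocking requests succeed only while pageout's reserve is intact, and blocking requests are held until freemem covers the request plus an effective throttlefree threshold (reduced by pageout_reserve for PG_PUSHPAGE callers). The restatement below mirrors only that decision structure; it omits the NOMEMWAIT() check and the pageout wakeup/sleep loop, and every flag value and constant in main() is made up for the example.

/* Stand-alone restatement of the page_create_throttle() policy; not kernel code. */
#include <stdio.h>

#define F_WAIT		0x1	/* caller may block   (PG_WAIT)     */
#define F_PANIC		0x2	/* must not fail      (PG_PANIC)    */
#define F_PUSHPAGE	0x4	/* pageout push       (PG_PUSHPAGE) */

static int
throttle(unsigned long freemem, unsigned long npages, int flags,
    unsigned long throttlefree, unsigned long pageout_reserve)
{
	unsigned long tf;

	/* Critical allocations are never throttled. */
	if ((flags & (F_WAIT | F_PANIC)) == F_PANIC ||
	    (flags & (F_WAIT | F_PUSHPAGE)) == F_PUSHPAGE)
		return (1);

	/* Non-blocking: succeed only while pageout's reserve is intact. */
	if ((flags & F_WAIT) == 0)
		return (freemem >= npages + pageout_reserve);

	/* Blocking: pushers may dip below throttlefree by the reserve. */
	tf = throttlefree - ((flags & F_PUSHPAGE) ? pageout_reserve : 0);
	return (freemem >= npages + tf);	/* else the kernel would sleep */
}

int
main(void)
{
	/* 1000 free pages, throttlefree 800, reserve 200 (all assumed). */
	printf("nowait, 900 pages: %d\n", throttle(1000, 900, 0, 800, 200));
	printf("wait,   100 pages: %d\n", throttle(1000, 100, F_WAIT, 800, 200));
	return (0);
}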
+ */ + +#define MAX_PCGS 100 + +#ifdef DEBUG +#define PCGS_TRIES 100 +#else /* DEBUG */ +#define PCGS_TRIES 10 +#endif /* DEBUG */ + +#ifdef VM_STATS +uint_t pcgs_counts[PCGS_TRIES]; +uint_t pcgs_too_many; +uint_t pcgs_entered; +uint_t pcgs_entered_noreloc; +uint_t pcgs_locked; +uint_t pcgs_cagelocked; +#endif /* VM_STATS */ + +static page_t * +page_create_get_something(vnode_t *vp, u_offset_t off, struct seg *seg, + caddr_t vaddr, uint_t flags) +{ + uint_t count; + page_t *pp; + uint_t locked, i; + struct pcf *p; + lgrp_t *lgrp; + int cagelocked = 0; + + VM_STAT_ADD(pcgs_entered); + + /* + * Tap any reserve freelists: if we fail now, we'll die + * since the page(s) we're looking for have already been + * accounted for. + */ + flags |= PG_PANIC; + + if ((flags & PG_NORELOC) != 0) { + VM_STAT_ADD(pcgs_entered_noreloc); + /* + * Requests for free pages from critical threads + * such as pageout still won't throttle here, but + * we must try again, to give the cageout thread + * another chance to catch up. Since we already + * accounted for the pages, we had better get them + * this time. + * + * N.B. All non-critical threads acquire the pcgs_cagelock + * to serialize access to the freelists. This implements a + * turnstile-type synchornization to avoid starvation of + * critical requests for PG_NORELOC memory by non-critical + * threads: all non-critical threads must acquire a 'ticket' + * before passing through, which entails making sure + * kcage_freemem won't fall below minfree prior to grabbing + * pages from the freelists. + */ + if (kcage_create_throttle(1, flags) == KCT_NONCRIT) { + mutex_enter(&pcgs_cagelock); + cagelocked = 1; + VM_STAT_ADD(pcgs_cagelocked); + } + } + + /* + * Time to get serious. + * We failed to get a `correctly colored' page from both the + * free and cache lists. + * We escalate in stage. + * + * First try both lists without worring about color. + * + * Then, grab all page accounting locks (ie. pcf[]) and + * steal any pages that they have and set the pcf_block flag to + * stop deletions from the lists. This will help because + * a page can get added to the free list while we are looking + * at the cache list, then another page could be added to the cache + * list allowing the page on the free list to be removed as we + * move from looking at the cache list to the free list. This + * could happen over and over. We would never find the page + * we have accounted for. + * + * Noreloc pages are a subset of the global (relocatable) page pool. + * They are not tracked separately in the pcf bins, so it is + * impossible to know when doing pcf accounting if the available + * page(s) are noreloc pages or not. When looking for a noreloc page + * it is quite easy to end up here even if the global (relocatable) + * page pool has plenty of free pages but the noreloc pool is empty. + * + * When the noreloc pool is empty (or low), additional noreloc pages + * are created by converting pages from the global page pool. This + * process will stall during pcf accounting if the pcf bins are + * already locked. Such is the case when a noreloc allocation is + * looping here in page_create_get_something waiting for more noreloc + * pages to appear. + * + * Short of adding a new field to the pcf bins to accurately track + * the number of free noreloc pages, we instead do not grab the + * pcgs_lock, do not set the pcf blocks and do not timeout when + * allocating a noreloc page. This allows noreloc allocations to + * loop without blocking global page pool allocations. 
+ * + * NOTE: the behaviour of page_create_get_something has not changed + * for the case of global page pool allocations. + */ + + flags &= ~PG_MATCH_COLOR; + locked = 0; +#ifndef __sparc + /* + * page_create_get_something may be called because 4g memory may be + * depleted. Set flags to allow for relocation of base page below + * 4g if necessary. + */ + if (physmax4g) + flags |= (PGI_PGCPSZC0 | PGI_PGCPHIPRI); +#endif + + lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE); + + for (count = 0; kcage_on || count < MAX_PCGS; count++) { + pp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE, + flags, lgrp); + if (pp == NULL) { + pp = page_get_cachelist(vp, off, seg, vaddr, + flags, lgrp); + } + if (pp == NULL) { + /* + * Serialize. Don't fight with other pcgs(). + */ + if (!locked && (!kcage_on || !(flags & PG_NORELOC))) { + mutex_enter(&pcgs_lock); + VM_STAT_ADD(pcgs_locked); + locked = 1; + p = pcf; + for (i = 0; i < PCF_FANOUT; i++) { + mutex_enter(&p->pcf_lock); + ASSERT(p->pcf_block == 0); + p->pcf_block = 1; + p->pcf_reserve = p->pcf_count; + p->pcf_count = 0; + mutex_exit(&p->pcf_lock); + p++; + } + freemem = 0; + } + + if (count) { + /* + * Since page_free() puts pages on + * a list then accounts for it, we + * just have to wait for page_free() + * to unlock any page it was working + * with. The page_lock()-page_reclaim() + * path falls in the same boat. + * + * We don't need to check on the + * PG_WAIT flag, we have already + * accounted for the page we are + * looking for in page_create_va(). + * + * We just wait a moment to let any + * locked pages on the lists free up, + * then continue around and try again. + * + * Will be awakened by set_freemem(). + */ + mutex_enter(&pcgs_wait_lock); + cv_wait(&pcgs_cv, &pcgs_wait_lock); + mutex_exit(&pcgs_wait_lock); + } + } else { +#ifdef VM_STATS + if (count >= PCGS_TRIES) { + VM_STAT_ADD(pcgs_too_many); + } else { + VM_STAT_ADD(pcgs_counts[count]); + } +#endif + if (locked) { + pcgs_unblock(); + mutex_exit(&pcgs_lock); + } + if (cagelocked) + mutex_exit(&pcgs_cagelock); + return (pp); + } + } + /* + * we go down holding the pcf locks. + */ + panic("no %spage found %d", + ((flags & PG_NORELOC) ? "non-reloc " : ""), count); + /*NOTREACHED*/ +} + +/* + * Create enough pages for "bytes" worth of data starting at + * "off" in "vp". + * + * Where flag must be one of: + * + * PG_EXCL: Exclusive create (fail if any page already + * exists in the page cache) which does not + * wait for memory to become available. + * + * PG_WAIT: Non-exclusive create which can wait for + * memory to become available. + * + * PG_PHYSCONTIG: Allocate physically contiguous pages. + * (Not Supported) + * + * A doubly linked list of pages is returned to the caller. Each page + * on the list has the "exclusive" (p_selock) lock and "iolock" (p_iolock) + * lock. + * + * Unable to change the parameters to page_create() in a minor release, + * we renamed page_create() to page_create_va(), changed all known calls + * from page_create() to page_create_va(), and created this wrapper. + * + * Upon a major release, we should break compatibility by deleting this + * wrapper, and replacing all the strings "page_create_va", with "page_create". + * + * NOTE: There is a copy of this interface as page_create_io() in + * i86/vm/vm_machdep.c. Any bugs fixed here should be applied + * there. 
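When the list searches above come up empty, page_create_get_something() freezes every bucket (pcf_block), stealing the current counts into pcf_reserve so that pages freed meanwhile accumulate there, and pcgs_unblock() later folds the reserve back into pcf_count and redoes the wakeups. A single-threaded toy model of that freeze-and-thaw accounting, with invented names:

/* Toy model of the pcf_block/pcf_reserve freeze-and-thaw accounting. */
#include <stdio.h>

#define FANOUT	4

struct bucket {
	unsigned int	count;		/* pages visible to normal allocation */
	unsigned int	reserve;	/* pages parked while the bucket is frozen */
	int		blocked;
};

static struct bucket b[FANOUT];

static void
freeze_all(void)		/* like setting pcf_block in each bin */
{
	int i;

	for (i = 0; i < FANOUT; i++) {
		b[i].blocked = 1;
		b[i].reserve = b[i].count;	/* steal what is there */
		b[i].count = 0;
	}
}

static void
free_page(int idx)		/* a free during the frozen window feeds the reserve */
{
	if (b[idx].blocked)
		b[idx].reserve++;
	else
		b[idx].count++;
}

static unsigned int
thaw_all(void)			/* like pcgs_unblock(): reserve -> count */
{
	unsigned int total = 0;
	int i;

	for (i = 0; i < FANOUT; i++) {
		b[i].count = b[i].reserve;
		b[i].reserve = 0;
		b[i].blocked = 0;
		total += b[i].count;
	}
	return (total);
}

int
main(void)
{
	b[0].count = 3;
	freeze_all();
	free_page(1);		/* freed while everything was frozen */
	printf("after thaw: %u pages\n", thaw_all());	/* 3 + 1 = 4 */
	return (0);
}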
+ */ +page_t * +page_create(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags) +{ + caddr_t random_vaddr; + struct seg kseg; + +#ifdef DEBUG + cmn_err(CE_WARN, "Using deprecated interface page_create: caller %p", + (void *)caller()); +#endif + + random_vaddr = (caddr_t)(((uintptr_t)vp >> 7) ^ + (uintptr_t)(off >> PAGESHIFT)); + kseg.s_as = &kas; + + return (page_create_va(vp, off, bytes, flags, &kseg, random_vaddr)); +} + +#ifdef DEBUG +uint32_t pg_alloc_pgs_mtbf = 0; +#endif + +/* + * Used for large page support. It will attempt to allocate + * a large page(s) off the freelist. + * + * Returns non zero on failure. + */ +int +page_alloc_pages(struct seg *seg, caddr_t addr, page_t **basepp, + page_t *ppa[], uint_t szc, int anypgsz) +{ + pgcnt_t npgs, curnpgs, totpgs; + size_t pgsz; + page_t *pplist = NULL, *pp; + int err = 0; + lgrp_t *lgrp; + + ASSERT(szc != 0 && szc <= (page_num_pagesizes() - 1)); + + VM_STAT_ADD(alloc_pages[0]); + +#ifdef DEBUG + if (pg_alloc_pgs_mtbf && !(gethrtime() % pg_alloc_pgs_mtbf)) { + return (ENOMEM); + } +#endif + + pgsz = page_get_pagesize(szc); + totpgs = curnpgs = npgs = pgsz >> PAGESHIFT; + + ASSERT(((uintptr_t)addr & (pgsz - 1)) == 0); + /* + * One must be NULL but not both. + * And one must be non NULL but not both. + */ + ASSERT(basepp != NULL || ppa != NULL); + ASSERT(basepp == NULL || ppa == NULL); + + (void) page_create_wait(npgs, PG_WAIT); + + while (npgs && szc) { + lgrp = lgrp_mem_choose(seg, addr, pgsz); + pp = page_get_freelist(NULL, 0, seg, addr, pgsz, 0, lgrp); + if (pp != NULL) { + VM_STAT_ADD(alloc_pages[1]); + page_list_concat(&pplist, &pp); + ASSERT(npgs >= curnpgs); + npgs -= curnpgs; + } else if (anypgsz) { + VM_STAT_ADD(alloc_pages[2]); + szc--; + pgsz = page_get_pagesize(szc); + curnpgs = pgsz >> PAGESHIFT; + } else { + VM_STAT_ADD(alloc_pages[3]); + ASSERT(npgs == totpgs); + page_create_putback(npgs); + return (ENOMEM); + } + } + if (szc == 0) { + VM_STAT_ADD(alloc_pages[4]); + ASSERT(npgs != 0); + page_create_putback(npgs); + err = ENOMEM; + } else if (basepp != NULL) { + ASSERT(npgs == 0); + ASSERT(ppa == NULL); + *basepp = pplist; + } + + npgs = totpgs - npgs; + pp = pplist; + + /* + * Clear the free and age bits. Also if we were passed in a ppa then + * fill it in with all the constituent pages from the large page. But + * if we failed to allocate all the pages just free what we got. + */ + while (npgs != 0) { + ASSERT(PP_ISFREE(pp)); + ASSERT(PP_ISAGED(pp)); + if (ppa != NULL || err != 0) { + if (err == 0) { + VM_STAT_ADD(alloc_pages[5]); + PP_CLRFREE(pp); + PP_CLRAGED(pp); + page_sub(&pplist, pp); + *ppa++ = pp; + npgs--; + } else { + VM_STAT_ADD(alloc_pages[6]); + ASSERT(pp->p_szc != 0); + curnpgs = page_get_pagecnt(pp->p_szc); + page_list_break(&pp, &pplist, curnpgs); + page_list_add_pages(pp, 0); + page_create_putback(curnpgs); + ASSERT(npgs >= curnpgs); + npgs -= curnpgs; + } + pp = pplist; + } else { + VM_STAT_ADD(alloc_pages[7]); + PP_CLRFREE(pp); + PP_CLRAGED(pp); + pp = pp->p_next; + npgs--; + } + } + return (err); +} + +/* + * Get a single large page off of the freelists, and set it up for use. + * Number of bytes requested must be a supported page size. + * + * Note that this call may fail even if there is sufficient + * memory available or PG_WAIT is set, so the caller must + * be willing to fallback on page_create_va(), block and retry, + * or fail the requester. 
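The allocation loop in page_alloc_pages() above falls back through successively smaller supported page sizes (when anypgsz allows it) until the remaining demand is met or it runs out at size-code zero. The skeleton below keeps only that control flow; the page sizes, the 512K availability cutoff in the fake allocator, and the 8K base-page shift are all invented for the example.

/* Skeleton of the page_alloc_pages() size-fallback loop; not kernel code. */
#include <stdio.h>

static const unsigned long pgsz_by_szc[] = { 8192, 65536, 524288, 4194304 };
#define MAX_SZC		3
#define BASE_PGSHIFT	13	/* 8K base pages, assumed */

/* Pretend the freelists only have 512K and smaller chunks available. */
static int
fake_get_freelist(unsigned int szc)
{
	return (pgsz_by_szc[szc] <= 524288);
}

int
main(void)
{
	unsigned int szc = MAX_SZC;
	unsigned long npgs = pgsz_by_szc[MAX_SZC] >> BASE_PGSHIFT;	/* 512 */
	int anypgsz = 1;

	while (npgs != 0 && szc != 0) {
		unsigned long chunk = pgsz_by_szc[szc] >> BASE_PGSHIFT;

		if (fake_get_freelist(szc)) {
			npgs -= chunk;
			printf("got a %lu-page chunk at szc %u, %lu left\n",
			    chunk, szc, npgs);
		} else if (anypgsz) {
			szc--;		/* fall back to the next smaller size */
		} else {
			printf("would putback and return ENOMEM\n");
			return (1);
		}
	}
	printf(npgs == 0 ? "request satisfied\n" : "ran down to szc 0: ENOMEM\n");
	return (0);
}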
+ */ +page_t * +page_create_va_large(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags, + struct seg *seg, caddr_t vaddr, void *arg) +{ + pgcnt_t npages, pcftotal; + page_t *pp; + page_t *rootpp; + lgrp_t *lgrp; + uint_t enough; + uint_t pcf_index; + uint_t i; + struct pcf *p; + struct pcf *q; + lgrp_id_t *lgrpid = (lgrp_id_t *)arg; + + ASSERT(vp != NULL); + + ASSERT((flags & ~(PG_EXCL | PG_WAIT | + PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == 0); + /* but no others */ + + ASSERT((flags & PG_EXCL) == PG_EXCL); + + npages = btop(bytes); + + if (!kcage_on || panicstr) { + /* + * Cage is OFF, or we are single threaded in + * panic, so make everything a RELOC request. + */ + flags &= ~PG_NORELOC; + } + + /* + * Make sure there's adequate physical memory available. + * Note: PG_WAIT is ignored here. + */ + if (freemem <= throttlefree + npages) { + VM_STAT_ADD(page_create_large_cnt[1]); + return (NULL); + } + + /* + * If cage is on, dampen draw from cage when available + * cage space is low. + */ + if ((flags & (PG_NORELOC | PG_WAIT)) == (PG_NORELOC | PG_WAIT) && + kcage_freemem < kcage_throttlefree + npages) { + + /* + * The cage is on, the caller wants PG_NORELOC + * pages and available cage memory is very low. + * Call kcage_create_throttle() to attempt to + * control demand on the cage. + */ + if (kcage_create_throttle(npages, flags) == KCT_FAILURE) { + VM_STAT_ADD(page_create_large_cnt[2]); + return (NULL); + } + } + + enough = 0; + pcf_index = PCF_INDEX(); + p = &pcf[pcf_index]; + p->pcf_touch = 1; + q = &pcf[PCF_FANOUT]; + for (pcftotal = 0, i = 0; i < PCF_FANOUT; i++) { + if (p->pcf_count > npages) { + /* + * a good one to try. + */ + mutex_enter(&p->pcf_lock); + if (p->pcf_count > npages) { + p->pcf_count -= (uint_t)npages; + /* + * freemem is not protected by any lock. + * Thus, we cannot have any assertion + * containing freemem here. + */ + freemem -= npages; + enough = 1; + mutex_exit(&p->pcf_lock); + break; + } + mutex_exit(&p->pcf_lock); + } + pcftotal += p->pcf_count; + p++; + if (p >= q) { + p = pcf; + } + p->pcf_touch = 1; + } + + if (!enough) { + /* If there isn't enough memory available, give up. */ + if (pcftotal < npages) { + VM_STAT_ADD(page_create_large_cnt[3]); + return (NULL); + } + + /* try to collect pages from several pcf bins */ + for (p = pcf, pcftotal = 0, i = 0; i < PCF_FANOUT; i++) { + p->pcf_touch = 1; + mutex_enter(&p->pcf_lock); + pcftotal += p->pcf_count; + if (pcftotal >= npages) { + /* + * Wow! There are enough pages laying around + * to satisfy the request. Do the accounting, + * drop the locks we acquired, and go back. + * + * freemem is not protected by any lock. So, + * we cannot have any assertion containing + * freemem. + */ + pgcnt_t tpages = npages; + freemem -= npages; + while (p >= pcf) { + if (p->pcf_count <= tpages) { + tpages -= p->pcf_count; + p->pcf_count = 0; + } else { + p->pcf_count -= (uint_t)tpages; + tpages = 0; + } + mutex_exit(&p->pcf_lock); + p--; + } + ASSERT(tpages == 0); + break; + } + p++; + } + if (i == PCF_FANOUT) { + /* failed to collect pages - release the locks */ + while (--p >= pcf) { + mutex_exit(&p->pcf_lock); + } + VM_STAT_ADD(page_create_large_cnt[4]); + return (NULL); + } + } + + /* + * This is where this function behaves fundamentally differently + * than page_create_va(); since we're intending to map the page + * with a single TTE, we have to get it as a physically contiguous + * hardware pagesize chunk. If we can't, we fail. 
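+ *
+ * (Illustrative aside added by the editor: for a 4M request, for
+ * example, the page_get_freelist() call below must hand back one
+ * naturally aligned, physically contiguous 4M chunk; the request is
+ * never assembled from individual PAGESIZE pages the way
+ * page_create_va() builds its list.)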
+ */ + if (lgrpid != NULL && *lgrpid >= 0 && *lgrpid <= lgrp_alloc_max && + LGRP_EXISTS(lgrp_table[*lgrpid])) + lgrp = lgrp_table[*lgrpid]; + else + lgrp = lgrp_mem_choose(seg, vaddr, bytes); + + if ((rootpp = page_get_freelist(&kvp, off, seg, vaddr, + bytes, flags & ~PG_MATCH_COLOR, lgrp)) == NULL) { + page_create_putback(npages); + VM_STAT_ADD(page_create_large_cnt[5]); + return (NULL); + } + + /* + * if we got the page with the wrong mtype give it back this is a + * workaround for CR 6249718. When CR 6249718 is fixed we never get + * inside "if" and the workaround becomes just a nop + */ + if (kcage_on && (flags & PG_NORELOC) && !PP_ISNORELOC(rootpp)) { + page_list_add_pages(rootpp, 0); + page_create_putback(npages); + VM_STAT_ADD(page_create_large_cnt[6]); + return (NULL); + } + + /* + * If satisfying this request has left us with too little + * memory, start the wheels turning to get some back. The + * first clause of the test prevents waking up the pageout + * daemon in situations where it would decide that there's + * nothing to do. + */ + if (nscan < desscan && freemem < minfree) { + TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, + "pageout_cv_signal:freemem %ld", freemem); + cv_signal(&proc_pageout->p_cv); + } + + pp = rootpp; + while (npages--) { + ASSERT(PAGE_EXCL(pp)); + ASSERT(pp->p_vnode == NULL); + ASSERT(!hat_page_is_mapped(pp)); + PP_CLRFREE(pp); + PP_CLRAGED(pp); + if (!page_hashin(pp, vp, off, NULL)) + panic("page_create_large: hashin failed: page %p", + (void *)pp); + page_io_lock(pp); + off += PAGESIZE; + pp = pp->p_next; + } + + VM_STAT_ADD(page_create_large_cnt[0]); + return (rootpp); +} + +page_t * +page_create_va(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags, + struct seg *seg, caddr_t vaddr) +{ + page_t *plist = NULL; + pgcnt_t npages; + pgcnt_t found_on_free = 0; + pgcnt_t pages_req; + page_t *npp = NULL; + uint_t enough; + uint_t i; + uint_t pcf_index; + struct pcf *p; + struct pcf *q; + lgrp_t *lgrp; + + TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START, + "page_create_start:vp %p off %llx bytes %lu flags %x", + vp, off, bytes, flags); + + ASSERT(bytes != 0 && vp != NULL); + + if ((flags & PG_EXCL) == 0 && (flags & PG_WAIT) == 0) { + panic("page_create: invalid flags"); + /*NOTREACHED*/ + } + ASSERT((flags & ~(PG_EXCL | PG_WAIT | + PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == 0); + /* but no others */ + + pages_req = npages = btopr(bytes); + /* + * Try to see whether request is too large to *ever* be + * satisfied, in order to prevent deadlock. We arbitrarily + * decide to limit maximum size requests to max_page_get. + */ + if (npages >= max_page_get) { + if ((flags & PG_WAIT) == 0) { + TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_TOOBIG, + "page_create_toobig:vp %p off %llx npages " + "%lu max_page_get %lu", + vp, off, npages, max_page_get); + return (NULL); + } else { + cmn_err(CE_WARN, + "Request for too much kernel memory " + "(%lu bytes), will hang forever", bytes); + for (;;) + delay(1000000000); + } + } + + if (!kcage_on || panicstr) { + /* + * Cage is OFF, or we are single threaded in + * panic, so make everything a RELOC request. + */ + flags &= ~PG_NORELOC; + } + + if (freemem <= throttlefree + npages) + if (!page_create_throttle(npages, flags)) + return (NULL); + + /* + * If cage is on, dampen draw from cage when available + * cage space is low. + */ + if ((flags & PG_NORELOC) && + kcage_freemem < kcage_throttlefree + npages) { + + /* + * The cage is on, the caller wants PG_NORELOC + * pages and available cage memory is very low. 
+ * Call kcage_create_throttle() to attempt to + * control demand on the cage. + */ + if (kcage_create_throttle(npages, flags) == KCT_FAILURE) + return (NULL); + } + + VM_STAT_ADD(page_create_cnt[0]); + + enough = 0; + pcf_index = PCF_INDEX(); + + p = &pcf[pcf_index]; + p->pcf_touch = 1; + q = &pcf[PCF_FANOUT]; + for (i = 0; i < PCF_FANOUT; i++) { + if (p->pcf_count > npages) { + /* + * a good one to try. + */ + mutex_enter(&p->pcf_lock); + if (p->pcf_count > npages) { + p->pcf_count -= (uint_t)npages; + /* + * freemem is not protected by any lock. + * Thus, we cannot have any assertion + * containing freemem here. + */ + freemem -= npages; + enough = 1; + mutex_exit(&p->pcf_lock); + break; + } + mutex_exit(&p->pcf_lock); + } + p++; + if (p >= q) { + p = pcf; + } + p->pcf_touch = 1; + } + + if (!enough) { + /* + * Have to look harder. If npages is greater than + * one, then we might have to coalecse the counters. + * + * Go wait. We come back having accounted + * for the memory. + */ + VM_STAT_ADD(page_create_cnt[1]); + if (!page_create_wait(npages, flags)) { + VM_STAT_ADD(page_create_cnt[2]); + return (NULL); + } + } + + TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS, + "page_create_success:vp %p off %llx", vp, off); + + /* + * If satisfying this request has left us with too little + * memory, start the wheels turning to get some back. The + * first clause of the test prevents waking up the pageout + * daemon in situations where it would decide that there's + * nothing to do. + */ + if (nscan < desscan && freemem < minfree) { + TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, + "pageout_cv_signal:freemem %ld", freemem); + cv_signal(&proc_pageout->p_cv); + } + + /* + * Loop around collecting the requested number of pages. + * Most of the time, we have to `create' a new page. With + * this in mind, pull the page off the free list before + * getting the hash lock. This will minimize the hash + * lock hold time, nesting, and the like. If it turns + * out we don't need the page, we put it back at the end. + */ + while (npages--) { + page_t *pp; + kmutex_t *phm = NULL; + ulong_t index; + + index = PAGE_HASH_FUNC(vp, off); +top: + ASSERT(phm == NULL); + ASSERT(index == PAGE_HASH_FUNC(vp, off)); + ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); + + if (npp == NULL) { + /* + * Try to get a page from the freelist (ie, + * a page with no [vp, off] tag). If that + * fails, use the cachelist. + * + * During the first attempt at both the free + * and cache lists we try for the correct color. + */ + /* + * XXXX-how do we deal with virtual indexed + * caches and and colors? + */ + VM_STAT_ADD(page_create_cnt[4]); + /* + * Get lgroup to allocate next page of shared memory + * from and use it to specify where to allocate + * the physical memory + */ + lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE); + npp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE, + flags | PG_MATCH_COLOR, lgrp); + if (npp == NULL) { + npp = page_get_cachelist(vp, off, seg, + vaddr, flags | PG_MATCH_COLOR, lgrp); + if (npp == NULL) { + npp = page_create_get_something(vp, + off, seg, vaddr, + flags & ~PG_MATCH_COLOR); + } + + if (PP_ISAGED(npp) == 0) { + /* + * Since this page came from the + * cachelist, we must destroy the + * old vnode association. + */ + page_hashout(npp, NULL); + } + } + } + + /* + * We own this page! 
+ */ + ASSERT(PAGE_EXCL(npp)); + ASSERT(npp->p_vnode == NULL); + ASSERT(!hat_page_is_mapped(npp)); + PP_CLRFREE(npp); + PP_CLRAGED(npp); + + /* + * Here we have a page in our hot little mits and are + * just waiting to stuff it on the appropriate lists. + * Get the mutex and check to see if it really does + * not exist. + */ + phm = PAGE_HASH_MUTEX(index); + mutex_enter(phm); + PAGE_HASH_SEARCH(index, pp, vp, off); + if (pp == NULL) { + VM_STAT_ADD(page_create_new); + pp = npp; + npp = NULL; + if (!page_hashin(pp, vp, off, phm)) { + /* + * Since we hold the page hash mutex and + * just searched for this page, page_hashin + * had better not fail. If it does, that + * means somethread did not follow the + * page hash mutex rules. Panic now and + * get it over with. As usual, go down + * holding all the locks. + */ + ASSERT(MUTEX_HELD(phm)); + panic("page_create: " + "hashin failed %p %p %llx %p", + (void *)pp, (void *)vp, off, (void *)phm); + /*NOTREACHED*/ + } + ASSERT(MUTEX_HELD(phm)); + mutex_exit(phm); + phm = NULL; + + /* + * Hat layer locking need not be done to set + * the following bits since the page is not hashed + * and was on the free list (i.e., had no mappings). + * + * Set the reference bit to protect + * against immediate pageout + * + * XXXmh modify freelist code to set reference + * bit so we don't have to do it here. + */ + page_set_props(pp, P_REF); + found_on_free++; + } else { + VM_STAT_ADD(page_create_exists); + if (flags & PG_EXCL) { + /* + * Found an existing page, and the caller + * wanted all new pages. Undo all of the work + * we have done. + */ + mutex_exit(phm); + phm = NULL; + while (plist != NULL) { + pp = plist; + page_sub(&plist, pp); + page_io_unlock(pp); + /* large pages should not end up here */ + ASSERT(pp->p_szc == 0); + /*LINTED: constant in conditional ctx*/ + VN_DISPOSE(pp, B_INVAL, 0, kcred); + } + VM_STAT_ADD(page_create_found_one); + goto fail; + } + ASSERT(flags & PG_WAIT); + if (!page_lock(pp, SE_EXCL, phm, P_NO_RECLAIM)) { + /* + * Start all over again if we blocked trying + * to lock the page. + */ + mutex_exit(phm); + VM_STAT_ADD(page_create_page_lock_failed); + phm = NULL; + goto top; + } + mutex_exit(phm); + phm = NULL; + + if (PP_ISFREE(pp)) { + ASSERT(PP_ISAGED(pp) == 0); + VM_STAT_ADD(pagecnt.pc_get_cache); + page_list_sub(pp, PG_CACHE_LIST); + PP_CLRFREE(pp); + found_on_free++; + } + } + + /* + * Got a page! It is locked. Acquire the i/o + * lock since we are going to use the p_next and + * p_prev fields to link the requested pages together. + */ + page_io_lock(pp); + page_add(&plist, pp); + plist = plist->p_next; + off += PAGESIZE; + vaddr += PAGESIZE; + } + + ASSERT((flags & PG_EXCL) ? (found_on_free == pages_req) : 1); +fail: + if (npp != NULL) { + /* + * Did not need this page after all. + * Put it back on the free list. 
+ */ + VM_STAT_ADD(page_create_putbacks); + PP_SETFREE(npp); + PP_SETAGED(npp); + npp->p_offset = (u_offset_t)-1; + page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL); + page_unlock(npp); + + } + + ASSERT(pages_req >= found_on_free); + + { + uint_t overshoot = (uint_t)(pages_req - found_on_free); + + if (overshoot) { + VM_STAT_ADD(page_create_overshoot); + p = &pcf[pcf_index]; + p->pcf_touch = 1; + mutex_enter(&p->pcf_lock); + if (p->pcf_block) { + p->pcf_reserve += overshoot; + } else { + p->pcf_count += overshoot; + if (p->pcf_wait) { + mutex_enter(&new_freemem_lock); + if (freemem_wait) { + cv_signal(&freemem_cv); + p->pcf_wait--; + } else { + p->pcf_wait = 0; + } + mutex_exit(&new_freemem_lock); + } + } + mutex_exit(&p->pcf_lock); + /* freemem is approximate, so this test OK */ + if (!p->pcf_block) + freemem += overshoot; + } + } + + return (plist); +} + +/* + * One or more constituent pages of this large page has been marked + * toxic. Simply demote the large page to PAGESIZE pages and let + * page_free() handle it. This routine should only be called by + * large page free routines (page_free_pages() and page_destroy_pages(). + * All pages are locked SE_EXCL and have already been marked free. + */ +static void +page_free_toxic_pages(page_t *rootpp) +{ + page_t *tpp; + pgcnt_t i, pgcnt = page_get_pagecnt(rootpp->p_szc); + uint_t szc = rootpp->p_szc; + + for (i = 0, tpp = rootpp; i < pgcnt; i++, tpp = tpp->p_next) { + ASSERT(tpp->p_szc == szc); + ASSERT((PAGE_EXCL(tpp) && + !page_iolock_assert(tpp)) || panicstr); + tpp->p_szc = 0; + } + + while (rootpp != NULL) { + tpp = rootpp; + page_sub(&rootpp, tpp); + ASSERT(PP_ISFREE(tpp)); + PP_CLRFREE(tpp); + page_free(tpp, 1); + } +} + +/* + * Put page on the "free" list. + * The free list is really two lists maintained by + * the PSM of whatever machine we happen to be on. + */ +void +page_free(page_t *pp, int dontneed) +{ + struct pcf *p; + uint_t pcf_index; + + ASSERT((PAGE_EXCL(pp) && + !page_iolock_assert(pp)) || panicstr); + + if (page_deteriorating(pp)) { + volatile int i = 0; + char *kaddr; + volatile int rb, wb; + uint64_t pa; + volatile int ue = 0; + on_trap_data_t otd; + + if (pp->p_vnode != NULL) { + /* + * Let page_destroy() do its bean counting and + * hash out the page; it will then call back + * into page_free() with pp->p_vnode == NULL. + */ + page_destroy(pp, 0); + return; + } + + if (page_isfailing(pp)) { + /* + * If we have already exceeded the limit for + * pages retired, we will treat this page as + * 'toxic' rather than failing. That will ensure + * that the page is at least cleaned, and if + * a UE is detected, the page will be retired + * anyway. + */ + if (pages_retired_limit_exceeded()) { + /* + * clear the flag and reset to toxic + */ + page_clrtoxic(pp); + page_settoxic(pp, PAGE_IS_TOXIC); + } else { + pa = ptob((uint64_t)page_pptonum(pp)); + if (page_retire_messages) { + cmn_err(CE_NOTE, "Page 0x%08x.%08x " + "removed from service", + (uint32_t)(pa >> 32), (uint32_t)pa); + } + goto page_failed; + } + } + + pagescrub(pp, 0, PAGESIZE); + + /* + * We want to determine whether the error that occurred on + * this page is transient or persistent, so we get a mapping + * to the page and try every possible bit pattern to compare + * what we write with what we read back. A smaller number + * of bit patterns might suffice, but there's no point in + * getting fancy. If this is the hot path on your system, + * you've got bigger problems. 
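+ *
+ * (Editor's summary of the loop below, for readability: for each test
+ * pattern wb from 0xff down to 0x00, fill the page with wb, push it to
+ * memory with sync_data_memory(), and read it back under on_trap()
+ * protection; a UE trap or a read-back mismatch marks the page
+ * PAGE_IS_FAILING, otherwise the earlier error is treated as transient
+ * and the page is cleared and returned to service.)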
+ */ + kaddr = ppmapin(pp, PROT_READ | PROT_WRITE, (caddr_t)-1); + for (wb = 0xff; wb >= 0; wb--) { + if (on_trap(&otd, OT_DATA_EC)) { + pa = ptob((uint64_t)page_pptonum(pp)) + i; + page_settoxic(pp, PAGE_IS_FAILING); + + if (page_retire_messages) { + cmn_err(CE_WARN, "Uncorrectable Error " + "occurred at PA 0x%08x.%08x while " + "attempting to clear previously " + "reported error; page removed from " + "service", (uint32_t)(pa >> 32), + (uint32_t)pa); + } + + ue++; + break; + } + + /* + * Write out the bit pattern, flush it to memory, and + * read it back while under on_trap() protection. + */ + for (i = 0; i < PAGESIZE; i++) + kaddr[i] = wb; + + sync_data_memory(kaddr, PAGESIZE); + + for (i = 0; i < PAGESIZE; i++) { + if ((rb = (uchar_t)kaddr[i]) != wb) { + page_settoxic(pp, PAGE_IS_FAILING); + goto out; + } + } + } +out: + no_trap(); + ppmapout(kaddr); + + if (wb >= 0 && !ue) { + pa = ptob((uint64_t)page_pptonum(pp)) + i; + if (page_retire_messages) { + cmn_err(CE_WARN, "Data Mismatch occurred at PA " + "0x%08x.%08x [ 0x%x != 0x%x ] while " + "attempting to clear previously reported " + "error; page removed from service", + (uint32_t)(pa >> 32), (uint32_t)pa, rb, wb); + } + } +page_failed: + /* + * DR operations change the association between a page_t + * and the physical page it represents. Check if the + * page is still bad. If it is, then retire it. + */ + if (page_isfaulty(pp) && page_isfailing(pp)) { + /* + * In the future, it might be useful to have a platform + * callback here to tell the hardware to fence off this + * page during the next reboot. + * + * We move the page to the retired_vnode here + */ + (void) page_hashin(pp, &retired_ppages, + (u_offset_t)ptob((uint64_t)page_pptonum(pp)), NULL); + mutex_enter(&freemem_lock); + availrmem--; + mutex_exit(&freemem_lock); + page_retired(pp); + page_downgrade(pp); + + /* + * If DR raced with the above page retirement code, + * we might have retired a good page. If so, unretire + * the page. + */ + if (!page_isfaulty(pp)) + page_unretire_pages(); + return; + } + + pa = ptob((uint64_t)page_pptonum(pp)); + + if (page_retire_messages) { + cmn_err(CE_NOTE, "Previously reported error on page " + "0x%08x.%08x cleared", (uint32_t)(pa >> 32), + (uint32_t)pa); + } + + page_clrtoxic(pp); + } + + if (PP_ISFREE(pp)) { + panic("page_free: page %p is free", (void *)pp); + } + + if (pp->p_szc != 0) { + if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) || + pp->p_vnode == &kvp) { + panic("page_free: anon or kernel " + "or no vnode large page %p", (void *)pp); + } + page_demote_vp_pages(pp); + ASSERT(pp->p_szc == 0); + } + + /* + * The page_struct_lock need not be acquired to examine these + * fields since the page has an "exclusive" lock. + */ + if (hat_page_is_mapped(pp) || pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { + panic("page_free pp=%p, pfn=%lx, lckcnt=%d, cowcnt=%d", + pp, page_pptonum(pp), pp->p_lckcnt, pp->p_cowcnt); + /*NOTREACHED*/ + } + + ASSERT(!hat_page_getshare(pp)); + + PP_SETFREE(pp); + ASSERT(pp->p_vnode == NULL || !IS_VMODSORT(pp->p_vnode) || + !hat_ismod(pp)); + page_clr_all_props(pp); + ASSERT(!hat_page_getshare(pp)); + + /* + * Now we add the page to the head of the free list. + * But if this page is associated with a paged vnode + * then we adjust the head forward so that the page is + * effectively at the end of the list. + */ + if (pp->p_vnode == NULL) { + /* + * Page has no identity, put it on the free list. 
+ */ + PP_SETAGED(pp); + pp->p_offset = (u_offset_t)-1; + page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); + VM_STAT_ADD(pagecnt.pc_free_free); + TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE, + "page_free_free:pp %p", pp); + } else { + PP_CLRAGED(pp); + + if (!dontneed || nopageage) { + /* move it to the tail of the list */ + page_list_add(pp, PG_CACHE_LIST | PG_LIST_TAIL); + + VM_STAT_ADD(pagecnt.pc_free_cache); + TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_TAIL, + "page_free_cache_tail:pp %p", pp); + } else { + page_list_add(pp, PG_CACHE_LIST | PG_LIST_HEAD); + + VM_STAT_ADD(pagecnt.pc_free_dontneed); + TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_HEAD, + "page_free_cache_head:pp %p", pp); + } + } + page_unlock(pp); + + /* + * Now do the `freemem' accounting. + */ + pcf_index = PCF_INDEX(); + p = &pcf[pcf_index]; + p->pcf_touch = 1; + + mutex_enter(&p->pcf_lock); + if (p->pcf_block) { + p->pcf_reserve += 1; + } else { + p->pcf_count += 1; + if (p->pcf_wait) { + mutex_enter(&new_freemem_lock); + /* + * Check to see if some other thread + * is actually waiting. Another bucket + * may have woken it up by now. If there + * are no waiters, then set our pcf_wait + * count to zero to avoid coming in here + * next time. Also, since only one page + * was put on the free list, just wake + * up one waiter. + */ + if (freemem_wait) { + cv_signal(&freemem_cv); + p->pcf_wait--; + } else { + p->pcf_wait = 0; + } + mutex_exit(&new_freemem_lock); + } + } + mutex_exit(&p->pcf_lock); + + /* freemem is approximate, so this test OK */ + if (!p->pcf_block) + freemem += 1; +} + +/* + * Put page on the "free" list during intial startup. + * This happens during initial single threaded execution. + */ +void +page_free_at_startup(page_t *pp) +{ + struct pcf *p; + uint_t pcf_index; + + page_list_add(pp, PG_FREE_LIST | PG_LIST_HEAD | PG_LIST_ISINIT); + VM_STAT_ADD(pagecnt.pc_free_free); + + /* + * Now do the `freemem' accounting. 
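+ *
+ * (Editor's note, illustrative: `freemem' is shadowed by the pcf[]
+ * fanout of small counters so that single page frees and reclaims do
+ * not all contend on one lock.  PCF_INDEX() picks a bucket, typically
+ * derived from the calling CPU, and the freed page is credited to that
+ * bucket's pcf_count; `freemem' itself is kept only approximately in
+ * step with the sum of the buckets, which is why the code nearby calls
+ * it approximate.)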
+ */ + pcf_index = PCF_INDEX(); + p = &pcf[pcf_index]; + p->pcf_touch = 1; + + ASSERT(p->pcf_block == 0); + ASSERT(p->pcf_wait == 0); + p->pcf_count += 1; + + /* freemem is approximate, so this is OK */ + freemem += 1; +} + +void +page_free_pages(page_t *pp) +{ + page_t *tpp, *rootpp = NULL; + pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc); + pgcnt_t i; + uint_t szc = pp->p_szc; + int toxic = 0; + + VM_STAT_ADD(pagecnt.pc_free_pages); + TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE, + "page_free_free:pp %p", pp); + + ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes()); + if ((page_pptonum(pp) & (pgcnt - 1)) != 0) { + panic("page_free_pages: not root page %p", (void *)pp); + /*NOTREACHED*/ + } + + for (i = 0, tpp = pp; i < pgcnt; i++, tpp = page_next(tpp)) { + ASSERT((PAGE_EXCL(tpp) && + !page_iolock_assert(tpp)) || panicstr); + if (PP_ISFREE(tpp)) { + panic("page_free_pages: page %p is free", (void *)tpp); + /*NOTREACHED*/ + } + if (hat_page_is_mapped(tpp) || tpp->p_lckcnt != 0 || + tpp->p_cowcnt != 0) { + panic("page_free_pages %p", (void *)tpp); + /*NOTREACHED*/ + } + + ASSERT(!hat_page_getshare(tpp)); + ASSERT(tpp->p_vnode == NULL); + ASSERT(tpp->p_szc == szc); + + if (page_deteriorating(tpp)) + toxic = 1; + + PP_SETFREE(tpp); + page_clr_all_props(tpp); + PP_SETAGED(tpp); + tpp->p_offset = (u_offset_t)-1; + ASSERT(tpp->p_next == tpp); + ASSERT(tpp->p_prev == tpp); + page_list_concat(&rootpp, &tpp); + } + ASSERT(rootpp == pp); + + if (toxic) { + page_free_toxic_pages(rootpp); + return; + } + page_list_add_pages(rootpp, 0); + page_create_putback(pgcnt); +} + +int free_pages = 1; + +/* + * This routine attempts to return pages to the cachelist via page_release(). + * It does not *have* to be successful in all cases, since the pageout scanner + * will catch any pages it misses. It does need to be fast and not introduce + * too much overhead. + * + * If a page isn't found on the unlocked sweep of the page_hash bucket, we + * don't lock and retry. This is ok, since the page scanner will eventually + * find any page we miss in free_vp_pages(). + */ +void +free_vp_pages(vnode_t *vp, u_offset_t off, size_t len) +{ + page_t *pp; + u_offset_t eoff; + extern int swap_in_range(vnode_t *, u_offset_t, size_t); + + eoff = off + len; + + if (free_pages == 0) + return; + if (swap_in_range(vp, off, len)) + return; + + for (; off < eoff; off += PAGESIZE) { + + /* + * find the page using a fast, but inexact search. It'll be OK + * if a few pages slip through the cracks here. + */ + pp = page_exists(vp, off); + + /* + * If we didn't find the page (it may not exist), the page + * is free, looks still in use (shared), or we can't lock it, + * just give up. + */ + if (pp == NULL || + PP_ISFREE(pp) || + page_share_cnt(pp) > 0 || + !page_trylock(pp, SE_EXCL)) + continue; + + /* + * Once we have locked pp, verify that it's still the + * correct page and not already free + */ + ASSERT(PAGE_LOCKED_SE(pp, SE_EXCL)); + if (pp->p_vnode != vp || pp->p_offset != off || PP_ISFREE(pp)) { + page_unlock(pp); + continue; + } + + /* + * try to release the page... + */ + (void) page_release(pp, 1); + } +} + +/* + * Reclaim the given page from the free list. + * Returns 1 on success or 0 on failure. + * + * The page is unlocked if it can't be reclaimed (when freemem == 0). + * If `lock' is non-null, it will be dropped and re-acquired if + * the routine must wait while freemem is 0. + * + * As it turns out, boot_getpages() does this. 
It picks a page, + * based on where OBP mapped in some address, gets its pfn, searches + * the memsegs, locks the page, then pulls it off the free list! + */ +int +page_reclaim(page_t *pp, kmutex_t *lock) +{ + struct pcf *p; + uint_t pcf_index; + struct cpu *cpup; + int enough; + uint_t i; + + ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1); + ASSERT(PAGE_EXCL(pp) && PP_ISFREE(pp)); + ASSERT(pp->p_szc == 0); + + /* + * If `freemem' is 0, we cannot reclaim this page from the + * freelist, so release every lock we might hold: the page, + * and the `lock' before blocking. + * + * The only way `freemem' can become 0 while there are pages + * marked free (have their p->p_free bit set) is when the + * system is low on memory and doing a page_create(). In + * order to guarantee that once page_create() starts acquiring + * pages it will be able to get all that it needs since `freemem' + * was decreased by the requested amount. So, we need to release + * this page, and let page_create() have it. + * + * Since `freemem' being zero is not supposed to happen, just + * use the usual hash stuff as a starting point. If that bucket + * is empty, then assume the worst, and start at the beginning + * of the pcf array. If we always start at the beginning + * when acquiring more than one pcf lock, there won't be any + * deadlock problems. + */ + + /* TODO: Do we need to test kcage_freemem if PG_NORELOC(pp)? */ + + if (freemem <= throttlefree && !page_create_throttle(1l, 0)) { + pcf_acquire_all(); + goto page_reclaim_nomem; + } + + enough = 0; + pcf_index = PCF_INDEX(); + p = &pcf[pcf_index]; + p->pcf_touch = 1; + mutex_enter(&p->pcf_lock); + if (p->pcf_count >= 1) { + enough = 1; + p->pcf_count--; + } + mutex_exit(&p->pcf_lock); + + if (!enough) { + VM_STAT_ADD(page_reclaim_zero); + /* + * Check again. Its possible that some other thread + * could have been right behind us, and added one + * to a list somewhere. Acquire each of the pcf locks + * until we find a page. + */ + p = pcf; + for (i = 0; i < PCF_FANOUT; i++) { + p->pcf_touch = 1; + mutex_enter(&p->pcf_lock); + if (p->pcf_count >= 1) { + p->pcf_count -= 1; + enough = 1; + break; + } + p++; + } + + if (!enough) { +page_reclaim_nomem: + /* + * We really can't have page `pp'. + * Time for the no-memory dance with + * page_free(). This is just like + * page_create_wait(). Plus the added + * attraction of releasing whatever mutex + * we held when we were called with in `lock'. + * Page_unlock() will wakeup any thread + * waiting around for this page. + */ + if (lock) { + VM_STAT_ADD(page_reclaim_zero_locked); + mutex_exit(lock); + } + page_unlock(pp); + + /* + * get this before we drop all the pcf locks. + */ + mutex_enter(&new_freemem_lock); + + p = pcf; + for (i = 0; i < PCF_FANOUT; i++) { + p->pcf_wait++; + mutex_exit(&p->pcf_lock); + p++; + } + + freemem_wait++; + cv_wait(&freemem_cv, &new_freemem_lock); + freemem_wait--; + + mutex_exit(&new_freemem_lock); + + if (lock) { + mutex_enter(lock); + } + return (0); + } + + /* + * There was a page to be found. + * The pcf accounting has been done, + * though none of the pcf_wait flags have been set, + * drop the locks and continue on. + */ + while (p >= pcf) { + mutex_exit(&p->pcf_lock); + p--; + } + } + + /* + * freemem is not protected by any lock. Thus, we cannot + * have any assertion containing freemem here. 
+ */ + freemem -= 1; + + VM_STAT_ADD(pagecnt.pc_reclaim); + if (PP_ISAGED(pp)) { + page_list_sub(pp, PG_FREE_LIST); + TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_FREE, + "page_reclaim_free:pp %p", pp); + } else { + page_list_sub(pp, PG_CACHE_LIST); + TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_CACHE, + "page_reclaim_cache:pp %p", pp); + } + + /* + * clear the p_free & p_age bits since this page is no longer + * on the free list. Notice that there was a brief time where + * a page is marked as free, but is not on the list. + * + * Set the reference bit to protect against immediate pageout. + */ + PP_CLRFREE(pp); + PP_CLRAGED(pp); + page_set_props(pp, P_REF); + + CPU_STATS_ENTER_K(); + cpup = CPU; /* get cpup now that CPU cannot change */ + CPU_STATS_ADDQ(cpup, vm, pgrec, 1); + CPU_STATS_ADDQ(cpup, vm, pgfrec, 1); + CPU_STATS_EXIT_K(); + + return (1); +} + + + +/* + * Destroy identity of the page and put it back on + * the page free list. Assumes that the caller has + * acquired the "exclusive" lock on the page. + */ +void +page_destroy(page_t *pp, int dontfree) +{ + ASSERT((PAGE_EXCL(pp) && + !page_iolock_assert(pp)) || panicstr); + + if (pp->p_szc != 0) { + if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) || + pp->p_vnode == &kvp) { + panic("page_destroy: anon or kernel or no vnode " + "large page %p", (void *)pp); + } + page_demote_vp_pages(pp); + ASSERT(pp->p_szc == 0); + } + + TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy:pp %p", pp); + + /* + * Unload translations, if any, then hash out the + * page to erase its identity. + */ + (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); + page_hashout(pp, NULL); + + if (!dontfree) { + /* + * Acquire the "freemem_lock" for availrmem. + * The page_struct_lock need not be acquired for lckcnt + * and cowcnt since the page has an "exclusive" lock. + */ + if ((pp->p_lckcnt != 0) || (pp->p_cowcnt != 0)) { + mutex_enter(&freemem_lock); + if (pp->p_lckcnt != 0) { + availrmem++; + pp->p_lckcnt = 0; + } + if (pp->p_cowcnt != 0) { + availrmem += pp->p_cowcnt; + pp->p_cowcnt = 0; + } + mutex_exit(&freemem_lock); + } + /* + * Put the page on the "free" list. 
+ */ + page_free(pp, 0); + } +} + +void +page_destroy_pages(page_t *pp) +{ + + page_t *tpp, *rootpp = NULL; + pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc); + pgcnt_t i, pglcks = 0; + uint_t szc = pp->p_szc; + int toxic = 0; + + ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes()); + + VM_STAT_ADD(pagecnt.pc_destroy_pages); + + TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy_pages:pp %p", pp); + + if ((page_pptonum(pp) & (pgcnt - 1)) != 0) { + panic("page_destroy_pages: not root page %p", (void *)pp); + /*NOTREACHED*/ + } + + for (i = 0, tpp = pp; i < pgcnt; i++, tpp = page_next(tpp)) { + ASSERT((PAGE_EXCL(tpp) && + !page_iolock_assert(tpp)) || panicstr); + (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD); + page_hashout(tpp, NULL); + ASSERT(tpp->p_offset == (u_offset_t)-1); + if (tpp->p_lckcnt != 0) { + pglcks++; + tpp->p_lckcnt = 0; + } else if (tpp->p_cowcnt != 0) { + pglcks += tpp->p_cowcnt; + tpp->p_cowcnt = 0; + } + ASSERT(!hat_page_getshare(tpp)); + ASSERT(tpp->p_vnode == NULL); + ASSERT(tpp->p_szc == szc); + + if (page_deteriorating(tpp)) + toxic = 1; + + PP_SETFREE(tpp); + page_clr_all_props(tpp); + PP_SETAGED(tpp); + ASSERT(tpp->p_next == tpp); + ASSERT(tpp->p_prev == tpp); + page_list_concat(&rootpp, &tpp); + } + + ASSERT(rootpp == pp); + if (pglcks != 0) { + mutex_enter(&freemem_lock); + availrmem += pglcks; + mutex_exit(&freemem_lock); + } + + if (toxic) { + page_free_toxic_pages(rootpp); + return; + } + page_list_add_pages(rootpp, 0); + page_create_putback(pgcnt); +} + +/* + * Similar to page_destroy(), but destroys pages which are + * locked and known to be on the page free list. Since + * the page is known to be free and locked, no one can access + * it. + * + * Also, the number of free pages does not change. + */ +void +page_destroy_free(page_t *pp) +{ + ASSERT(PAGE_EXCL(pp)); + ASSERT(PP_ISFREE(pp)); + ASSERT(pp->p_vnode); + ASSERT(hat_page_getattr(pp, P_MOD | P_REF | P_RO) == 0); + ASSERT(!hat_page_is_mapped(pp)); + ASSERT(PP_ISAGED(pp) == 0); + ASSERT(pp->p_szc == 0); + + VM_STAT_ADD(pagecnt.pc_destroy_free); + page_list_sub(pp, PG_CACHE_LIST); + + page_hashout(pp, NULL); + ASSERT(pp->p_vnode == NULL); + ASSERT(pp->p_offset == (u_offset_t)-1); + ASSERT(pp->p_hash == NULL); + + PP_SETAGED(pp); + page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); + page_unlock(pp); + + mutex_enter(&new_freemem_lock); + if (freemem_wait) { + cv_signal(&freemem_cv); + } + mutex_exit(&new_freemem_lock); +} + +/* + * Rename the page "opp" to have an identity specified + * by [vp, off]. If a page already exists with this name + * it is locked and destroyed. Note that the page's + * translations are not unloaded during the rename. + * + * This routine is used by the anon layer to "steal" the + * original page and is not unlike destroying a page and + * creating a new page using the same page frame. + * + * XXX -- Could deadlock if caller 1 tries to rename A to B while + * caller 2 tries to rename B to A. + */ +void +page_rename(page_t *opp, vnode_t *vp, u_offset_t off) +{ + page_t *pp; + int olckcnt = 0; + int ocowcnt = 0; + kmutex_t *phm; + ulong_t index; + + ASSERT(PAGE_EXCL(opp) && !page_iolock_assert(opp)); + ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); + ASSERT(PP_ISFREE(opp) == 0); + + VM_STAT_ADD(page_rename_count); + + TRACE_3(TR_FAC_VM, TR_PAGE_RENAME, + "page rename:pp %p vp %p off %llx", opp, vp, off); + + page_hashout(opp, NULL); + PP_CLRAGED(opp); + + /* + * Acquire the appropriate page hash lock, since + * we're going to rename the page. 
+ */ + index = PAGE_HASH_FUNC(vp, off); + phm = PAGE_HASH_MUTEX(index); + mutex_enter(phm); +top: + /* + * Look for an existing page with this name and destroy it if found. + * By holding the page hash lock all the way to the page_hashin() + * call, we are assured that no page can be created with this + * identity. In the case when the phm lock is dropped to undo any + * hat layer mappings, the existing page is held with an "exclusive" + * lock, again preventing another page from being created with + * this identity. + */ + PAGE_HASH_SEARCH(index, pp, vp, off); + if (pp != NULL) { + VM_STAT_ADD(page_rename_exists); + + /* + * As it turns out, this is one of only two places where + * page_lock() needs to hold the passed in lock in the + * successful case. In all of the others, the lock could + * be dropped as soon as the attempt is made to lock + * the page. It is tempting to add yet another arguement, + * PL_KEEP or PL_DROP, to let page_lock know what to do. + */ + if (!page_lock(pp, SE_EXCL, phm, P_RECLAIM)) { + /* + * Went to sleep because the page could not + * be locked. We were woken up when the page + * was unlocked, or when the page was destroyed. + * In either case, `phm' was dropped while we + * slept. Hence we should not just roar through + * this loop. + */ + goto top; + } + + if (hat_page_is_mapped(pp)) { + /* + * Unload translations. Since we hold the + * exclusive lock on this page, the page + * can not be changed while we drop phm. + * This is also not a lock protocol violation, + * but rather the proper way to do things. + */ + mutex_exit(phm); + (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); + mutex_enter(phm); + } + page_hashout(pp, phm); + } + /* + * Hash in the page with the new identity. + */ + if (!page_hashin(opp, vp, off, phm)) { + /* + * We were holding phm while we searched for [vp, off] + * and only dropped phm if we found and locked a page. + * If we can't create this page now, then some thing + * is really broken. + */ + panic("page_rename: Can't hash in page: %p", (void *)pp); + /*NOTREACHED*/ + } + + ASSERT(MUTEX_HELD(phm)); + mutex_exit(phm); + + /* + * Now that we have dropped phm, lets get around to finishing up + * with pp. + */ + if (pp != NULL) { + ASSERT(!hat_page_is_mapped(pp)); + /* for now large pages should not end up here */ + ASSERT(pp->p_szc == 0); + /* + * Save the locks for transfer to the new page and then + * clear them so page_free doesn't think they're important. + * The page_struct_lock need not be acquired for lckcnt and + * cowcnt since the page has an "exclusive" lock. + */ + olckcnt = pp->p_lckcnt; + ocowcnt = pp->p_cowcnt; + pp->p_lckcnt = pp->p_cowcnt = 0; + + /* + * Put the page on the "free" list after we drop + * the lock. The less work under the lock the better. + */ + /*LINTED: constant in conditional context*/ + VN_DISPOSE(pp, B_FREE, 0, kcred); + } + + /* + * Transfer the lock count from the old page (if any). + * The page_struct_lock need not be acquired for lckcnt and + * cowcnt since the page has an "exclusive" lock. + */ + opp->p_lckcnt += olckcnt; + opp->p_cowcnt += ocowcnt; +} + +/* + * low level routine to add page `pp' to the hash and vp chains for [vp, offset] + * + * Pages are normally inserted at the start of a vnode's v_pages list. + * If the vnode is VMODSORT and the page is modified, it goes at the end. + * This can happen when a modified page is relocated for DR. + * + * Returns 1 on success and 0 on failure. 
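+ *
+ * (Editor's note, illustrative: v_pages is a circular, doubly linked
+ * list threaded through p_vpnext/p_vpprev, so the "end" of the list is
+ * reached as vp->v_pages->p_vpprev; that is why the VMODSORT case below
+ * hands &vp->v_pages->p_vpprev->p_vpnext to page_vpadd() instead of
+ * &vp->v_pages.)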
+ */ +static int +page_do_hashin(page_t *pp, vnode_t *vp, u_offset_t offset) +{ + page_t **listp; + page_t *tp; + ulong_t index; + + ASSERT(PAGE_EXCL(pp)); + ASSERT(vp != NULL); + ASSERT(MUTEX_HELD(page_vnode_mutex(vp))); + + /* + * Be sure to set these up before the page is inserted on the hash + * list. As soon as the page is placed on the list some other + * thread might get confused and wonder how this page could + * possibly hash to this list. + */ + pp->p_vnode = vp; + pp->p_offset = offset; + + /* + * record if this page is on a swap vnode + */ + if ((vp->v_flag & VISSWAP) != 0) + PP_SETSWAP(pp); + + index = PAGE_HASH_FUNC(vp, offset); + ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(index))); + listp = &page_hash[index]; + + /* + * If this page is already hashed in, fail this attempt to add it. + */ + for (tp = *listp; tp != NULL; tp = tp->p_hash) { + if (tp->p_vnode == vp && tp->p_offset == offset) { + pp->p_vnode = NULL; + pp->p_offset = (u_offset_t)(-1); + return (0); + } + } + pp->p_hash = *listp; + *listp = pp; + + /* + * Add the page to the vnode's list of pages + */ + if (vp->v_pages != NULL && IS_VMODSORT(vp) && hat_ismod(pp)) + listp = &vp->v_pages->p_vpprev->p_vpnext; + else + listp = &vp->v_pages; + + page_vpadd(listp, pp); + + return (1); +} + +/* + * Add page `pp' to both the hash and vp chains for [vp, offset]. + * + * Returns 1 on success and 0 on failure. + * If hold is passed in, it is not dropped. + */ +int +page_hashin(page_t *pp, vnode_t *vp, u_offset_t offset, kmutex_t *hold) +{ + kmutex_t *phm = NULL; + kmutex_t *vphm; + int rc; + + ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); + + TRACE_3(TR_FAC_VM, TR_PAGE_HASHIN, + "page_hashin:pp %p vp %p offset %llx", + pp, vp, offset); + + VM_STAT_ADD(hashin_count); + + if (hold != NULL) + phm = hold; + else { + VM_STAT_ADD(hashin_not_held); + phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, offset)); + mutex_enter(phm); + } + + vphm = page_vnode_mutex(vp); + mutex_enter(vphm); + rc = page_do_hashin(pp, vp, offset); + mutex_exit(vphm); + if (hold == NULL) + mutex_exit(phm); + if (rc == 0) + VM_STAT_ADD(hashin_already); + return (rc); +} + +/* + * Remove page ``pp'' from the hash and vp chains and remove vp association. + * All mutexes must be held + */ +static void +page_do_hashout(page_t *pp) +{ + page_t **hpp; + page_t *hp; + vnode_t *vp = pp->p_vnode; + + ASSERT(vp != NULL); + ASSERT(MUTEX_HELD(page_vnode_mutex(vp))); + + /* + * First, take pp off of its hash chain. + */ + hpp = &page_hash[PAGE_HASH_FUNC(vp, pp->p_offset)]; + + for (;;) { + hp = *hpp; + if (hp == pp) + break; + if (hp == NULL) { + panic("page_do_hashout"); + /*NOTREACHED*/ + } + hpp = &hp->p_hash; + } + *hpp = pp->p_hash; + + /* + * Now remove it from its associated vnode. + */ + if (vp->v_pages) + page_vpsub(&vp->v_pages, pp); + + pp->p_hash = NULL; + page_clr_all_props(pp); + PP_CLRSWAP(pp); + pp->p_vnode = NULL; + pp->p_offset = (u_offset_t)-1; +} + +/* + * Remove page ``pp'' from the hash and vp chains and remove vp association. + * + * When `phm' is non-NULL it contains the address of the mutex protecting the + * hash list pp is on. It is not dropped. + */ +void +page_hashout(page_t *pp, kmutex_t *phm) +{ + vnode_t *vp; + ulong_t index; + kmutex_t *nphm; + kmutex_t *vphm; + kmutex_t *sep; + + ASSERT(phm != NULL ? 
MUTEX_HELD(phm) : 1); + ASSERT(pp->p_vnode != NULL); + ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr); + ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(pp->p_vnode))); + + vp = pp->p_vnode; + + TRACE_2(TR_FAC_VM, TR_PAGE_HASHOUT, + "page_hashout:pp %p vp %p", pp, vp); + + /* Kernel probe */ + TNF_PROBE_2(page_unmap, "vm pagefault", /* CSTYLED */, + tnf_opaque, vnode, vp, + tnf_offset, offset, pp->p_offset); + + /* + * + */ + VM_STAT_ADD(hashout_count); + index = PAGE_HASH_FUNC(vp, pp->p_offset); + if (phm == NULL) { + VM_STAT_ADD(hashout_not_held); + nphm = PAGE_HASH_MUTEX(index); + mutex_enter(nphm); + } + ASSERT(phm ? phm == PAGE_HASH_MUTEX(index) : 1); + + + /* + * grab page vnode mutex and remove it... + */ + vphm = page_vnode_mutex(vp); + mutex_enter(vphm); + + page_do_hashout(pp); + + mutex_exit(vphm); + if (phm == NULL) + mutex_exit(nphm); + + /* + * If the page was retired, update the pages_retired + * total and clear the page flag + */ + if (page_isretired(pp)) { + retired_page_removed(pp); + } + + /* + * Wake up processes waiting for this page. The page's + * identity has been changed, and is probably not the + * desired page any longer. + */ + sep = page_se_mutex(pp); + mutex_enter(sep); + if (CV_HAS_WAITERS(&pp->p_cv)) + cv_broadcast(&pp->p_cv); + mutex_exit(sep); +} + +/* + * Add the page to the front of a linked list of pages + * using the p_next & p_prev pointers for the list. + * The caller is responsible for protecting the list pointers. + */ +void +page_add(page_t **ppp, page_t *pp) +{ + ASSERT(PAGE_EXCL(pp) || (PAGE_SHARED(pp) && page_iolock_assert(pp))); + + page_add_common(ppp, pp); +} + + + +/* + * Common code for page_add() and mach_page_add() + */ +void +page_add_common(page_t **ppp, page_t *pp) +{ + if (*ppp == NULL) { + pp->p_next = pp->p_prev = pp; + } else { + pp->p_next = *ppp; + pp->p_prev = (*ppp)->p_prev; + (*ppp)->p_prev = pp; + pp->p_prev->p_next = pp; + } + *ppp = pp; +} + + +/* + * Remove this page from a linked list of pages + * using the p_next & p_prev pointers for the list. + * + * The caller is responsible for protecting the list pointers. + */ +void +page_sub(page_t **ppp, page_t *pp) +{ + ASSERT((PP_ISFREE(pp)) ? 1 : + (PAGE_EXCL(pp)) || (PAGE_SHARED(pp) && page_iolock_assert(pp))); + + if (*ppp == NULL || pp == NULL) { + panic("page_sub: bad arg(s): pp %p, *ppp %p", + (void *)pp, (void *)(*ppp)); + /*NOTREACHED*/ + } + + page_sub_common(ppp, pp); +} + + +/* + * Common code for page_sub() and mach_page_sub() + */ +void +page_sub_common(page_t **ppp, page_t *pp) +{ + if (*ppp == pp) + *ppp = pp->p_next; /* go to next page */ + + if (*ppp == pp) + *ppp = NULL; /* page list is gone */ + else { + pp->p_prev->p_next = pp->p_next; + pp->p_next->p_prev = pp->p_prev; + } + pp->p_prev = pp->p_next = pp; /* make pp a list of one */ +} + + +/* + * Break page list cppp into two lists with npages in the first list. + * The tail is returned in nppp. 
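+ *
+ * (Editor's worked example, not in the original comment: given the
+ * five page circular list A-B-C-D-E in *oppp and npages == 2, the
+ * routine leaves the two page list A-B in *oppp and returns C-D-E in
+ * *nppp; with npages equal to the full length, nothing moves and *nppp
+ * comes back NULL.)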
+ */ +void +page_list_break(page_t **oppp, page_t **nppp, pgcnt_t npages) +{ + page_t *s1pp = *oppp; + page_t *s2pp; + page_t *e1pp, *e2pp; + long n = 0; + + if (s1pp == NULL) { + *nppp = NULL; + return; + } + if (npages == 0) { + *nppp = s1pp; + *oppp = NULL; + return; + } + for (n = 0, s2pp = *oppp; n < npages; n++) { + s2pp = s2pp->p_next; + } + /* Fix head and tail of new lists */ + e1pp = s2pp->p_prev; + e2pp = s1pp->p_prev; + s1pp->p_prev = e1pp; + e1pp->p_next = s1pp; + s2pp->p_prev = e2pp; + e2pp->p_next = s2pp; + + /* second list empty */ + if (s2pp == s1pp) { + *oppp = s1pp; + *nppp = NULL; + } else { + *oppp = s1pp; + *nppp = s2pp; + } +} + +/* + * Concatenate page list nppp onto the end of list ppp. + */ +void +page_list_concat(page_t **ppp, page_t **nppp) +{ + page_t *s1pp, *s2pp, *e1pp, *e2pp; + + if (*nppp == NULL) { + return; + } + if (*ppp == NULL) { + *ppp = *nppp; + return; + } + s1pp = *ppp; + e1pp = s1pp->p_prev; + s2pp = *nppp; + e2pp = s2pp->p_prev; + s1pp->p_prev = e2pp; + e2pp->p_next = s1pp; + e1pp->p_next = s2pp; + s2pp->p_prev = e1pp; +} + +/* + * return the next page in the page list + */ +page_t * +page_list_next(page_t *pp) +{ + return (pp->p_next); +} + + +/* + * Add the page to the front of the linked list of pages + * using p_vpnext/p_vpprev pointers for the list. + * + * The caller is responsible for protecting the lists. + */ +void +page_vpadd(page_t **ppp, page_t *pp) +{ + if (*ppp == NULL) { + pp->p_vpnext = pp->p_vpprev = pp; + } else { + pp->p_vpnext = *ppp; + pp->p_vpprev = (*ppp)->p_vpprev; + (*ppp)->p_vpprev = pp; + pp->p_vpprev->p_vpnext = pp; + } + *ppp = pp; +} + +/* + * Remove this page from the linked list of pages + * using p_vpnext/p_vpprev pointers for the list. + * + * The caller is responsible for protecting the lists. + */ +void +page_vpsub(page_t **ppp, page_t *pp) +{ + if (*ppp == NULL || pp == NULL) { + panic("page_vpsub: bad arg(s): pp %p, *ppp %p", + (void *)pp, (void *)(*ppp)); + /*NOTREACHED*/ + } + + if (*ppp == pp) + *ppp = pp->p_vpnext; /* go to next page */ + + if (*ppp == pp) + *ppp = NULL; /* page list is gone */ + else { + pp->p_vpprev->p_vpnext = pp->p_vpnext; + pp->p_vpnext->p_vpprev = pp->p_vpprev; + } + pp->p_vpprev = pp->p_vpnext = pp; /* make pp a list of one */ +} + +/* + * Lock a physical page into memory "long term". Used to support "lock + * in memory" functions. Accepts the page to be locked, and a cow variable + * to indicate whether a the lock will travel to the new page during + * a potential copy-on-write. + */ +int +page_pp_lock( + page_t *pp, /* page to be locked */ + int cow, /* cow lock */ + int kernel) /* must succeed -- ignore checking */ +{ + int r = 0; /* result -- assume failure */ + + ASSERT(PAGE_LOCKED(pp)); + + page_struct_lock(pp); + /* + * Acquire the "freemem_lock" for availrmem. 
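+ *
+ * (Editor's summary, for readability: the cow case charges availrmem
+ * and bumps p_cowcnt only while availrmem stays above pages_pp_maximum;
+ * the plain case bumps p_lckcnt, charging availrmem only for the first
+ * non-kernel lock on the page.  Hitting PAGE_LOCK_MAXIMUM logs a
+ * warning, and later lock attempts on that page simply fail.)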
+ */ + if (cow) { + mutex_enter(&freemem_lock); + if ((availrmem > pages_pp_maximum) && + (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) { + availrmem--; + pages_locked++; + mutex_exit(&freemem_lock); + r = 1; + if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { + cmn_err(CE_WARN, + "COW lock limit reached on pfn 0x%lx", + page_pptonum(pp)); + } + } else + mutex_exit(&freemem_lock); + } else { + if (pp->p_lckcnt) { + if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) { + r = 1; + if (++pp->p_lckcnt == + (ushort_t)PAGE_LOCK_MAXIMUM) { + cmn_err(CE_WARN, "Page lock limit " + "reached on pfn 0x%lx", + page_pptonum(pp)); + } + } + } else { + if (kernel) { + /* availrmem accounting done by caller */ + ++pp->p_lckcnt; + r = 1; + } else { + mutex_enter(&freemem_lock); + if (availrmem > pages_pp_maximum) { + availrmem--; + pages_locked++; + ++pp->p_lckcnt; + r = 1; + } + mutex_exit(&freemem_lock); + } + } + } + page_struct_unlock(pp); + return (r); +} + +/* + * Decommit a lock on a physical page frame. Account for cow locks if + * appropriate. + */ +void +page_pp_unlock( + page_t *pp, /* page to be unlocked */ + int cow, /* expect cow lock */ + int kernel) /* this was a kernel lock */ +{ + ASSERT(PAGE_LOCKED(pp)); + + page_struct_lock(pp); + /* + * Acquire the "freemem_lock" for availrmem. + * If cowcnt or lcknt is already 0 do nothing; i.e., we + * could be called to unlock even if nothing is locked. This could + * happen if locked file pages were truncated (removing the lock) + * and the file was grown again and new pages faulted in; the new + * pages are unlocked but the segment still thinks they're locked. + */ + if (cow) { + if (pp->p_cowcnt) { + mutex_enter(&freemem_lock); + pp->p_cowcnt--; + availrmem++; + pages_locked--; + mutex_exit(&freemem_lock); + } + } else { + if (pp->p_lckcnt && --pp->p_lckcnt == 0) { + if (!kernel) { + mutex_enter(&freemem_lock); + availrmem++; + pages_locked--; + mutex_exit(&freemem_lock); + } + } + } + page_struct_unlock(pp); +} + +/* + * This routine reserves availrmem for npages; + * flags: KM_NOSLEEP or KM_SLEEP + * returns 1 on success or 0 on failure + */ +int +page_resv(pgcnt_t npages, uint_t flags) +{ + mutex_enter(&freemem_lock); + while (availrmem < tune.t_minarmem + npages) { + if (flags & KM_NOSLEEP) { + mutex_exit(&freemem_lock); + return (0); + } + mutex_exit(&freemem_lock); + page_needfree(npages); + kmem_reap(); + delay(hz >> 2); + page_needfree(-(spgcnt_t)npages); + mutex_enter(&freemem_lock); + } + availrmem -= npages; + mutex_exit(&freemem_lock); + return (1); +} + +/* + * This routine unreserves availrmem for npages; + */ +void +page_unresv(pgcnt_t npages) +{ + mutex_enter(&freemem_lock); + availrmem += npages; + mutex_exit(&freemem_lock); +} + +/* + * See Statement at the beginning of segvn_lockop() regarding + * the way we handle cowcnts and lckcnts. + * + * Transfer cowcnt on 'opp' to cowcnt on 'npp' if the vpage + * that breaks COW has PROT_WRITE. + * + * Note that, we may also break COW in case we are softlocking + * on read access during physio; + * in this softlock case, the vpage may not have PROT_WRITE. + * So, we need to transfer lckcnt on 'opp' to lckcnt on 'npp' + * if the vpage doesn't have PROT_WRITE. + * + * This routine is never called if we are stealing a page + * in anon_private. + * + * The caller subtracted from availrmem for read only mapping. + * if lckcnt is 1 increment availrmem. 
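+ *
+ * (Editor's worked example, illustrative: if a write fault breaks COW
+ * while opp->p_cowcnt == 2, the claim moves one count, leaving
+ * npp->p_cowcnt == 1 and opp->p_cowcnt == 1; in the softlock/read case
+ * with opp->p_lckcnt == 1, the single lock count moves to npp and one
+ * page is credited back to availrmem, since the caller already charged
+ * availrmem for the read only mapping.)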
+ */ +void +page_pp_useclaim( + page_t *opp, /* original page frame losing lock */ + page_t *npp, /* new page frame gaining lock */ + uint_t write_perm) /* set if vpage has PROT_WRITE */ +{ + int payback = 0; + + ASSERT(PAGE_LOCKED(opp)); + ASSERT(PAGE_LOCKED(npp)); + + page_struct_lock(opp); + + ASSERT(npp->p_cowcnt == 0); + ASSERT(npp->p_lckcnt == 0); + + /* Don't use claim if nothing is locked (see page_pp_unlock above) */ + if ((write_perm && opp->p_cowcnt != 0) || + (!write_perm && opp->p_lckcnt != 0)) { + + if (write_perm) { + npp->p_cowcnt++; + ASSERT(opp->p_cowcnt != 0); + opp->p_cowcnt--; + } else { + + ASSERT(opp->p_lckcnt != 0); + + /* + * We didn't need availrmem decremented if p_lckcnt on + * original page is 1. Here, we are unlocking + * read-only copy belonging to original page and + * are locking a copy belonging to new page. + */ + if (opp->p_lckcnt == 1) + payback = 1; + + npp->p_lckcnt++; + opp->p_lckcnt--; + } + } + if (payback) { + mutex_enter(&freemem_lock); + availrmem++; + pages_useclaim--; + mutex_exit(&freemem_lock); + } + page_struct_unlock(opp); +} + +/* + * Simple claim adjust functions -- used to support changes in + * claims due to changes in access permissions. Used by segvn_setprot(). + */ +int +page_addclaim(page_t *pp) +{ + int r = 0; /* result */ + + ASSERT(PAGE_LOCKED(pp)); + + page_struct_lock(pp); + ASSERT(pp->p_lckcnt != 0); + + if (pp->p_lckcnt == 1) { + if (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM) { + --pp->p_lckcnt; + r = 1; + if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { + cmn_err(CE_WARN, + "COW lock limit reached on pfn 0x%lx", + page_pptonum(pp)); + } + } + } else { + mutex_enter(&freemem_lock); + if ((availrmem > pages_pp_maximum) && + (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) { + --availrmem; + ++pages_claimed; + mutex_exit(&freemem_lock); + --pp->p_lckcnt; + r = 1; + if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { + cmn_err(CE_WARN, + "COW lock limit reached on pfn 0x%lx", + page_pptonum(pp)); + } + } else + mutex_exit(&freemem_lock); + } + page_struct_unlock(pp); + return (r); +} + +int +page_subclaim(page_t *pp) +{ + int r = 0; + + ASSERT(PAGE_LOCKED(pp)); + + page_struct_lock(pp); + ASSERT(pp->p_cowcnt != 0); + + if (pp->p_lckcnt) { + if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) { + r = 1; + /* + * for availrmem + */ + mutex_enter(&freemem_lock); + availrmem++; + pages_claimed--; + mutex_exit(&freemem_lock); + + pp->p_cowcnt--; + + if (++pp->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { + cmn_err(CE_WARN, + "Page lock limit reached on pfn 0x%lx", + page_pptonum(pp)); + } + } + } else { + r = 1; + pp->p_cowcnt--; + pp->p_lckcnt++; + } + page_struct_unlock(pp); + return (r); +} + +int +page_addclaim_pages(page_t **ppa) +{ + + pgcnt_t lckpgs = 0, pg_idx; + + VM_STAT_ADD(pagecnt.pc_addclaim_pages); + + mutex_enter(&page_llock); + for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) { + + ASSERT(PAGE_LOCKED(ppa[pg_idx])); + ASSERT(ppa[pg_idx]->p_lckcnt != 0); + if (ppa[pg_idx]->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { + mutex_exit(&page_llock); + return (0); + } + if (ppa[pg_idx]->p_lckcnt > 1) + lckpgs++; + } + + if (lckpgs != 0) { + mutex_enter(&freemem_lock); + if (availrmem >= pages_pp_maximum + lckpgs) { + availrmem -= lckpgs; + pages_claimed += lckpgs; + } else { + mutex_exit(&freemem_lock); + mutex_exit(&page_llock); + return (0); + } + mutex_exit(&freemem_lock); + } + + for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) { + ppa[pg_idx]->p_lckcnt--; + ppa[pg_idx]->p_cowcnt++; + } + mutex_exit(&page_llock); + return (1); +} 
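+
+/*
+ * Editor's note (an illustrative sketch, not part of the original
+ * source): page_addclaim()/page_subclaim() and their _pages() array
+ * counterparts are meant for callers such as segvn_setprot() that must
+ * convert an existing lock claim (p_lckcnt) into a copy-on-write claim
+ * (p_cowcnt), or back, when a mapping's protections change.  A
+ * hypothetical caller might look roughly like this, where the EAGAIN
+ * return is only an example of how failure could be reported:
+ *
+ *	if (page_addclaim(pp) == 0)
+ *		return (EAGAIN);	(claim limit or availrmem too low)
+ *	...
+ *	(void) page_subclaim(pp);	(undo, restoring the lock claim)
+ */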
+ +int +page_subclaim_pages(page_t **ppa) +{ + pgcnt_t ulckpgs = 0, pg_idx; + + VM_STAT_ADD(pagecnt.pc_subclaim_pages); + + mutex_enter(&page_llock); + for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) { + + ASSERT(PAGE_LOCKED(ppa[pg_idx])); + ASSERT(ppa[pg_idx]->p_cowcnt != 0); + if (ppa[pg_idx]->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { + mutex_exit(&page_llock); + return (0); + } + if (ppa[pg_idx]->p_lckcnt != 0) + ulckpgs++; + } + + if (ulckpgs != 0) { + mutex_enter(&freemem_lock); + availrmem += ulckpgs; + pages_claimed -= ulckpgs; + mutex_exit(&freemem_lock); + } + + for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) { + ppa[pg_idx]->p_cowcnt--; + ppa[pg_idx]->p_lckcnt++; + + } + mutex_exit(&page_llock); + return (1); +} + +page_t * +page_numtopp(pfn_t pfnum, se_t se) +{ + page_t *pp; + +retry: + pp = page_numtopp_nolock(pfnum); + if (pp == NULL) { + return ((page_t *)NULL); + } + + /* + * Acquire the appropriate lock on the page. + */ + while (!page_lock(pp, se, (kmutex_t *)NULL, P_RECLAIM)) { + if (page_pptonum(pp) != pfnum) + goto retry; + continue; + } + + if (page_pptonum(pp) != pfnum) { + page_unlock(pp); + goto retry; + } + + return (pp); +} + +page_t * +page_numtopp_noreclaim(pfn_t pfnum, se_t se) +{ + page_t *pp; + +retry: + pp = page_numtopp_nolock(pfnum); + if (pp == NULL) { + return ((page_t *)NULL); + } + + /* + * Acquire the appropriate lock on the page. + */ + while (!page_lock(pp, se, (kmutex_t *)NULL, P_NO_RECLAIM)) { + if (page_pptonum(pp) != pfnum) + goto retry; + continue; + } + + if (page_pptonum(pp) != pfnum) { + page_unlock(pp); + goto retry; + } + + return (pp); +} + +/* + * This routine is like page_numtopp, but will only return page structs + * for pages which are ok for loading into hardware using the page struct. + */ +page_t * +page_numtopp_nowait(pfn_t pfnum, se_t se) +{ + page_t *pp; + +retry: + pp = page_numtopp_nolock(pfnum); + if (pp == NULL) { + return ((page_t *)NULL); + } + + /* + * Try to acquire the appropriate lock on the page. + */ + if (PP_ISFREE(pp)) + pp = NULL; + else { + if (!page_trylock(pp, se)) + pp = NULL; + else { + if (page_pptonum(pp) != pfnum) { + page_unlock(pp); + goto retry; + } + if (PP_ISFREE(pp)) { + page_unlock(pp); + pp = NULL; + } + } + } + return (pp); +} + +/* + * Returns a count of dirty pages that are in the process + * of being written out. If 'cleanit' is set, try to push the page. + */ +pgcnt_t +page_busy(int cleanit) +{ + page_t *page0 = page_first(); + page_t *pp = page0; + pgcnt_t nppbusy = 0; + u_offset_t off; + + do { + vnode_t *vp = pp->p_vnode; + + /* + * A page is a candidate for syncing if it is: + * + * (a) On neither the freelist nor the cachelist + * (b) Hashed onto a vnode + * (c) Not a kernel page + * (d) Dirty + * (e) Not part of a swapfile + * (f) a page which belongs to a real vnode; eg has a non-null + * v_vfsp pointer. 
+ * (g) Backed by a filesystem which doesn't have a + * stubbed-out sync operation + */ + if (!PP_ISFREE(pp) && vp != NULL && vp != &kvp && + hat_ismod(pp) && !IS_SWAPVP(vp) && vp->v_vfsp != NULL && + vfs_can_sync(vp->v_vfsp)) { + nppbusy++; + vfs_syncprogress(); + + if (!cleanit) + continue; + if (!page_trylock(pp, SE_EXCL)) + continue; + + if (PP_ISFREE(pp) || vp == NULL || IS_SWAPVP(vp) || + pp->p_lckcnt != 0 || pp->p_cowcnt != 0 || + !(hat_pagesync(pp, + HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD)) { + page_unlock(pp); + continue; + } + off = pp->p_offset; + VN_HOLD(vp); + page_unlock(pp); + (void) VOP_PUTPAGE(vp, off, PAGESIZE, + B_ASYNC | B_FREE, kcred); + VN_RELE(vp); + } + } while ((pp = page_next(pp)) != page0); + + return (nppbusy); +} + +void page_invalidate_pages(void); + +/* + * callback handler to vm sub-system + * + * callers make sure no recursive entries to this func. + */ +/*ARGSUSED*/ +boolean_t +callb_vm_cpr(void *arg, int code) +{ + if (code == CB_CODE_CPR_CHKPT) + page_invalidate_pages(); + return (B_TRUE); +} + +/* + * Invalidate all pages of the system. + * It shouldn't be called until all user page activities are all stopped. + */ +void +page_invalidate_pages() +{ + page_t *pp; + page_t *page0; + pgcnt_t nbusypages; + int retry = 0; + const int MAXRETRIES = 4; +#if defined(__sparc) + extern struct vnode prom_ppages; +#endif /* __sparc */ + +top: + /* + * Flush dirty pages and destory the clean ones. + */ + nbusypages = 0; + + pp = page0 = page_first(); + do { + struct vnode *vp; + u_offset_t offset; + int mod; + + /* + * skip the page if it has no vnode or the page associated + * with the kernel vnode or prom allocated kernel mem. + */ +#if defined(__sparc) + if ((vp = pp->p_vnode) == NULL || vp == &kvp || + vp == &prom_ppages) +#else /* x86 doesn't have prom or prom_ppage */ + if ((vp = pp->p_vnode) == NULL || vp == &kvp) +#endif /* __sparc */ + continue; + + /* + * skip the page which is already free invalidated. + */ + if (PP_ISFREE(pp) && PP_ISAGED(pp)) + continue; + + /* + * skip pages that are already locked or can't be "exclusively" + * locked or are already free. After we lock the page, check + * the free and age bits again to be sure it's not destroied + * yet. + * To achieve max. parallelization, we use page_trylock instead + * of page_lock so that we don't get block on individual pages + * while we have thousands of other pages to process. + */ + if (!page_trylock(pp, SE_EXCL)) { + nbusypages++; + continue; + } else if (PP_ISFREE(pp)) { + if (!PP_ISAGED(pp)) { + page_destroy_free(pp); + } else { + page_unlock(pp); + } + continue; + } + /* + * Is this page involved in some I/O? shared? + * + * The page_struct_lock need not be acquired to + * examine these fields since the page has an + * "exclusive" lock. + */ + if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { + page_unlock(pp); + continue; + } + + if (vp->v_type == VCHR) { + panic("vp->v_type == VCHR"); + /*NOTREACHED*/ + } + + if (!page_try_demote_pages(pp)) { + page_unlock(pp); + continue; + } + + /* + * Check the modified bit. Leave the bits alone in hardware + * (they will be modified if we do the putpage). + */ + mod = (hat_pagesync(pp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) + & P_MOD); + if (mod) { + offset = pp->p_offset; + /* + * Hold the vnode before releasing the page lock + * to prevent it from being freed and re-used by + * some other thread. + */ + VN_HOLD(vp); + page_unlock(pp); + /* + * No error return is checked here. 
Callers such as + * cpr deals with the dirty pages at the dump time + * if this putpage fails. + */ + (void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_INVAL, + kcred); + VN_RELE(vp); + } else { + page_destroy(pp, 0); + } + } while ((pp = page_next(pp)) != page0); + if (nbusypages && retry++ < MAXRETRIES) { + delay(1); + goto top; + } +} + +/* + * Replace the page "old" with the page "new" on the page hash and vnode lists + * + * the replacemnt must be done in place, ie the equivalent sequence: + * + * vp = old->p_vnode; + * off = old->p_offset; + * page_do_hashout(old) + * page_do_hashin(new, vp, off) + * + * doesn't work, since + * 1) if old is the only page on the vnode, the v_pages list has a window + * where it looks empty. This will break file system assumptions. + * and + * 2) pvn_vplist_dirty() can't deal with pages moving on the v_pages list. + */ +static void +page_do_relocate_hash(page_t *new, page_t *old) +{ + page_t **hash_list; + vnode_t *vp = old->p_vnode; + kmutex_t *sep; + + ASSERT(PAGE_EXCL(old)); + ASSERT(PAGE_EXCL(new)); + ASSERT(vp != NULL); + ASSERT(MUTEX_HELD(page_vnode_mutex(vp))); + ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, old->p_offset)))); + + /* + * First find old page on the page hash list + */ + hash_list = &page_hash[PAGE_HASH_FUNC(vp, old->p_offset)]; + + for (;;) { + if (*hash_list == old) + break; + if (*hash_list == NULL) { + panic("page_do_hashout"); + /*NOTREACHED*/ + } + hash_list = &(*hash_list)->p_hash; + } + + /* + * update new and replace old with new on the page hash list + */ + new->p_vnode = old->p_vnode; + new->p_offset = old->p_offset; + new->p_hash = old->p_hash; + *hash_list = new; + + if ((new->p_vnode->v_flag & VISSWAP) != 0) + PP_SETSWAP(new); + + /* + * replace old with new on the vnode's page list + */ + if (old->p_vpnext == old) { + new->p_vpnext = new; + new->p_vpprev = new; + } else { + new->p_vpnext = old->p_vpnext; + new->p_vpprev = old->p_vpprev; + new->p_vpnext->p_vpprev = new; + new->p_vpprev->p_vpnext = new; + } + if (vp->v_pages == old) + vp->v_pages = new; + + /* + * clear out the old page + */ + old->p_hash = NULL; + old->p_vpnext = NULL; + old->p_vpprev = NULL; + old->p_vnode = NULL; + PP_CLRSWAP(old); + old->p_offset = (u_offset_t)-1; + page_clr_all_props(old); + + /* + * Wake up processes waiting for this page. The page's + * identity has been changed, and is probably not the + * desired page any longer. + */ + sep = page_se_mutex(old); + mutex_enter(sep); + if (CV_HAS_WAITERS(&old->p_cv)) + cv_broadcast(&old->p_cv); + mutex_exit(sep); +} + +/* + * This function moves the identity of page "pp_old" to page "pp_new". + * Both pages must be locked on entry. "pp_new" is free, has no identity, + * and need not be hashed out from anywhere. + */ +void +page_relocate_hash(page_t *pp_new, page_t *pp_old) +{ + vnode_t *vp = pp_old->p_vnode; + u_offset_t off = pp_old->p_offset; + kmutex_t *phm, *vphm; + + /* + * Rehash two pages + */ + ASSERT(PAGE_EXCL(pp_old)); + ASSERT(PAGE_EXCL(pp_new)); + ASSERT(vp != NULL); + ASSERT(pp_new->p_vnode == NULL); + + /* + * hashout then hashin while holding the mutexes + */ + phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, off)); + mutex_enter(phm); + vphm = page_vnode_mutex(vp); + mutex_enter(vphm); + + page_do_relocate_hash(pp_new, pp_old); + + mutex_exit(vphm); + mutex_exit(phm); + + /* + * The page_struct_lock need not be acquired for lckcnt and + * cowcnt since the page has an "exclusive" lock. 
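+ * Illustrative note (not additional code in this routine): once the
+ * counts below have been copied, the caller-visible effect is,
+ * roughly,
+ *
+ * pp_new->p_vnode == vp && pp_new->p_offset == off
+ * pp_old->p_vnode == NULL
+ *
+ * i.e. pp_new has taken over pp_old's identity completely.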
+ */ + ASSERT(pp_new->p_lckcnt == 0); + ASSERT(pp_new->p_cowcnt == 0); + pp_new->p_lckcnt = pp_old->p_lckcnt; + pp_new->p_cowcnt = pp_old->p_cowcnt; + pp_old->p_lckcnt = pp_old->p_cowcnt = 0; + + /* The following comment preserved from page_flip(). */ + /* XXX - Do we need to protect fsdata? */ + pp_new->p_fsdata = pp_old->p_fsdata; +} + +/* + * Helper routine used to lock all remaining members of a + * large page. The caller is responsible for passing in a locked + * pp. If pp is a large page, then it succeeds in locking all the + * remaining constituent pages or it returns with only the + * original page locked. + * + * Returns 1 on success, 0 on failure. + * + * If success is returned this routine gurantees p_szc for all constituent + * pages of a large page pp belongs to can't change. To achieve this we + * recheck szc of pp after locking all constituent pages and retry if szc + * changed (it could only decrease). Since hat_page_demote() needs an EXCL + * lock on one of constituent pages it can't be running after all constituent + * pages are locked. hat_page_demote() with a lock on a constituent page + * outside of this large page (i.e. pp belonged to a larger large page) is + * already done with all constituent pages of pp since the root's p_szc is + * changed last. Thefore no need to synchronize with hat_page_demote() that + * locked a constituent page outside of pp's current large page. + */ +#ifdef DEBUG +uint32_t gpg_trylock_mtbf = 0; +#endif + +int +group_page_trylock(page_t *pp, se_t se) +{ + page_t *tpp; + pgcnt_t npgs, i, j; + uint_t pszc = pp->p_szc; + +#ifdef DEBUG + if (gpg_trylock_mtbf && !(gethrtime() % gpg_trylock_mtbf)) { + return (0); + } +#endif + + if (pp != PP_GROUPLEADER(pp, pszc)) { + return (0); + } + +retry: + ASSERT(PAGE_LOCKED_SE(pp, se)); + ASSERT(!PP_ISFREE(pp)); + if (pszc == 0) { + return (1); + } + npgs = page_get_pagecnt(pszc); + tpp = pp + 1; + for (i = 1; i < npgs; i++, tpp++) { + if (!page_trylock(tpp, se)) { + tpp = pp + 1; + for (j = 1; j < i; j++, tpp++) { + page_unlock(tpp); + } + return (0); + } + } + if (pp->p_szc != pszc) { + ASSERT(pp->p_szc < pszc); + ASSERT(pp->p_vnode != NULL && pp->p_vnode != &kvp && + !IS_SWAPFSVP(pp->p_vnode)); + tpp = pp + 1; + for (i = 1; i < npgs; i++, tpp++) { + page_unlock(tpp); + } + pszc = pp->p_szc; + goto retry; + } + return (1); +} + +void +group_page_unlock(page_t *pp) +{ + page_t *tpp; + pgcnt_t npgs, i; + + ASSERT(PAGE_LOCKED(pp)); + ASSERT(!PP_ISFREE(pp)); + ASSERT(pp == PP_PAGEROOT(pp)); + npgs = page_get_pagecnt(pp->p_szc); + for (i = 1, tpp = pp + 1; i < npgs; i++, tpp++) { + page_unlock(tpp); + } +} + +/* + * returns + * 0 : on success and *nrelocp is number of relocated PAGESIZE pages + * ERANGE : this is not a base page + * EBUSY : failure to get locks on the page/pages + * ENOMEM : failure to obtain replacement pages + * EAGAIN : OBP has not yet completed its boot-time handoff to the kernel + * + * Return with all constituent members of target and replacement + * SE_EXCL locked. It is the callers responsibility to drop the + * locks. 
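+ *
+ * Rough usage sketch, for illustration only (the local names are
+ * made up; page_relocate() below is the usual wrapper and can also
+ * free the old page for the caller):
+ *
+ * spgcnt_t nreloc;
+ * page_t *repl = NULL;
+ * if (do_page_relocate(&targ, &repl, 1, &nreloc, NULL) == 0) {
+ * ... targ now lists the old constituent pages, repl the
+ * ... new ones; all are SE_EXCL locked and the caller must
+ * ... drop those locks (or free the old pages) itself.
+ * }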
+ */ +int +do_page_relocate( + page_t **target, + page_t **replacement, + int grouplock, + spgcnt_t *nrelocp, + lgrp_t *lgrp) +{ +#ifdef DEBUG + page_t *first_repl; +#endif /* DEBUG */ + page_t *repl; + page_t *targ; + page_t *pl = NULL; + uint_t ppattr; + pfn_t pfn, repl_pfn; + uint_t szc; + spgcnt_t npgs, i; + int repl_contig = 0; + uint_t flags = 0; + spgcnt_t dofree = 0; + + *nrelocp = 0; + +#if defined(__sparc) + /* + * We need to wait till OBP has completed + * its boot-time handoff of its resources to the kernel + * before we allow page relocation + */ + if (page_relocate_ready == 0) { + return (EAGAIN); + } +#endif + + /* + * If this is not a base page, + * just return with 0x0 pages relocated. + */ + targ = *target; + ASSERT(PAGE_EXCL(targ)); + ASSERT(!PP_ISFREE(targ)); + szc = targ->p_szc; + ASSERT(szc < mmu_page_sizes); + VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]); + pfn = targ->p_pagenum; + if (pfn != PFN_BASE(pfn, szc)) { + VM_STAT_ADD(vmm_vmstats.ppr_relocnoroot[szc]); + return (ERANGE); + } + + if ((repl = *replacement) != NULL && repl->p_szc >= szc) { + repl_pfn = repl->p_pagenum; + if (repl_pfn != PFN_BASE(repl_pfn, szc)) { + VM_STAT_ADD(vmm_vmstats.ppr_reloc_replnoroot[szc]); + return (ERANGE); + } + repl_contig = 1; + } + + /* + * We must lock all members of this large page or we cannot + * relocate any part of it. + */ + if (grouplock != 0 && !group_page_trylock(targ, SE_EXCL)) { + VM_STAT_ADD(vmm_vmstats.ppr_relocnolock[targ->p_szc]); + return (EBUSY); + } + + /* + * reread szc it could have been decreased before + * group_page_trylock() was done. + */ + szc = targ->p_szc; + ASSERT(szc < mmu_page_sizes); + VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]); + ASSERT(pfn == PFN_BASE(pfn, szc)); + + npgs = page_get_pagecnt(targ->p_szc); + + if (repl == NULL) { + dofree = npgs; /* Size of target page in MMU pages */ + if (!page_create_wait(dofree, 0)) { + if (grouplock != 0) { + group_page_unlock(targ); + } + VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]); + return (ENOMEM); + } + + /* + * seg kmem pages require that the target and replacement + * page be the same pagesize. + */ + flags = (targ->p_vnode == &kvp) ? PGR_SAMESZC : 0; + repl = page_get_replacement_page(targ, lgrp, flags); + if (repl == NULL) { + if (grouplock != 0) { + group_page_unlock(targ); + } + page_create_putback(dofree); + VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]); + return (ENOMEM); + } + } +#ifdef DEBUG + else { + ASSERT(PAGE_LOCKED(repl)); + } +#endif /* DEBUG */ + +#if defined(__sparc) + /* + * Let hat_page_relocate() complete the relocation if it's kernel page + */ + if (targ->p_vnode == &kvp) { + *replacement = repl; + if (hat_page_relocate(target, replacement, nrelocp) != 0) { + if (grouplock != 0) { + group_page_unlock(targ); + } + if (dofree) { + *replacement = NULL; + page_free_replacement_page(repl); + page_create_putback(dofree); + } + VM_STAT_ADD(vmm_vmstats.ppr_krelocfail[szc]); + return (EAGAIN); + } + VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]); + return (0); + } +#else +#if defined(lint) + dofree = dofree; +#endif +#endif + +#ifdef DEBUG + first_repl = repl; +#endif /* DEBUG */ + + for (i = 0; i < npgs; i++) { + ASSERT(PAGE_EXCL(targ)); + + (void) hat_pageunload(targ, HAT_FORCE_PGUNLOAD); + + ASSERT(hat_page_getshare(targ) == 0); + ASSERT(!PP_ISFREE(targ)); + ASSERT(targ->p_pagenum == (pfn + i)); + ASSERT(repl_contig == 0 || + repl->p_pagenum == (repl_pfn + i)); + + /* + * Copy the page contents and attributes then + * relocate the page in the page hash. 
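+ * In outline: ppcopy() moves the data, the P_MOD/P_REF/P_RO bits are
+ * carried across by hand, page_relocate_hash() transfers the
+ * <vnode, offset> identity, and the old constituent page is then
+ * collected onto the 'pl' list handed back to the caller.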
+ */ + ppcopy(targ, repl); + ppattr = hat_page_getattr(targ, (P_MOD | P_REF | P_RO)); + page_clr_all_props(repl); + page_set_props(repl, ppattr); + page_relocate_hash(repl, targ); + + ASSERT(hat_page_getshare(targ) == 0); + ASSERT(hat_page_getshare(repl) == 0); + /* + * Now clear the props on targ, after the + * page_relocate_hash(), they no longer + * have any meaning. + */ + page_clr_all_props(targ); + ASSERT(targ->p_next == targ); + ASSERT(targ->p_prev == targ); + page_list_concat(&pl, &targ); + + targ++; + if (repl_contig != 0) { + repl++; + } else { + repl = repl->p_next; + } + } + /* assert that we have come full circle with repl */ + ASSERT(repl_contig == 1 || first_repl == repl); + + *target = pl; + if (*replacement == NULL) { + ASSERT(first_repl == repl); + *replacement = repl; + } + VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]); + *nrelocp = npgs; + return (0); +} +/* + * On success returns 0 and *nrelocp the number of PAGESIZE pages relocated. + */ +int +page_relocate( + page_t **target, + page_t **replacement, + int grouplock, + int freetarget, + spgcnt_t *nrelocp, + lgrp_t *lgrp) +{ + spgcnt_t ret; + + /* do_page_relocate returns 0 on success or errno value */ + ret = do_page_relocate(target, replacement, grouplock, nrelocp, lgrp); + + if (ret != 0 || freetarget == 0) { + return (ret); + } + if (*nrelocp == 1) { + ASSERT(*target != NULL); + page_free(*target, 1); + } else { + page_t *tpp = *target; + uint_t szc = tpp->p_szc; + pgcnt_t npgs = page_get_pagecnt(szc); + ASSERT(npgs > 1); + ASSERT(szc != 0); + do { + ASSERT(PAGE_EXCL(tpp)); + ASSERT(!hat_page_is_mapped(tpp)); + ASSERT(tpp->p_szc == szc); + PP_SETFREE(tpp); + PP_SETAGED(tpp); + npgs--; + } while ((tpp = tpp->p_next) != *target); + ASSERT(npgs == 0); + page_list_add_pages(*target, 0); + npgs = page_get_pagecnt(szc); + page_create_putback(npgs); + } + return (ret); +} + +/* + * it is up to the caller to deal with pcf accounting. + */ +void +page_free_replacement_page(page_t *pplist) +{ + page_t *pp; + + while (pplist != NULL) { + /* + * pp_targ is a linked list. + */ + pp = pplist; + if (pp->p_szc == 0) { + page_sub(&pplist, pp); + page_clr_all_props(pp); + PP_SETFREE(pp); + PP_SETAGED(pp); + page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); + page_unlock(pp); + VM_STAT_ADD(pagecnt.pc_free_replacement_page[0]); + } else { + spgcnt_t curnpgs = page_get_pagecnt(pp->p_szc); + page_t *tpp; + page_list_break(&pp, &pplist, curnpgs); + tpp = pp; + do { + ASSERT(PAGE_EXCL(tpp)); + ASSERT(!hat_page_is_mapped(tpp)); + page_clr_all_props(pp); + PP_SETFREE(tpp); + PP_SETAGED(tpp); + } while ((tpp = tpp->p_next) != pp); + page_list_add_pages(pp, 0); + VM_STAT_ADD(pagecnt.pc_free_replacement_page[1]); + } + } +} + +/* + * Relocate target to non-relocatable replacement page. 
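+ *
+ * The loop below simply retries until the cage can supply a
+ * PG_NORELOC replacement, waking the cageout thread between
+ * attempts.  Sketch of a call site (names are illustrative only):
+ *
+ * page_t *repl;
+ * if (page_relocate_cage(&targ, &repl) == 0)
+ * ... repl is the new, non-relocatable page; the old
+ * ... page has already been freed by page_relocate().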
+ */ +int +page_relocate_cage(page_t **target, page_t **replacement) +{ + page_t *tpp, *rpp; + spgcnt_t pgcnt, npgs; + int result; + + tpp = *target; + + ASSERT(PAGE_EXCL(tpp)); + ASSERT(tpp->p_szc == 0); + + pgcnt = btop(page_get_pagesize(tpp->p_szc)); + + do { + (void) page_create_wait(pgcnt, PG_WAIT | PG_NORELOC); + rpp = page_get_replacement_page(tpp, NULL, PGR_NORELOC); + if (rpp == NULL) { + page_create_putback(pgcnt); + kcage_cageout_wakeup(); + } + } while (rpp == NULL); + + ASSERT(PP_ISNORELOC(rpp)); + + result = page_relocate(&tpp, &rpp, 0, 1, &npgs, NULL); + + if (result == 0) { + *replacement = rpp; + if (pgcnt != npgs) + panic("page_relocate_cage: partial relocation"); + } + + return (result); +} + +/* + * Release the page lock on a page, place on cachelist + * tail if no longer mapped. Caller can let us know if + * the page is known to be clean. + */ +int +page_release(page_t *pp, int checkmod) +{ + int status; + + ASSERT(PAGE_LOCKED(pp) && !PP_ISFREE(pp) && + (pp->p_vnode != NULL)); + + if (!hat_page_is_mapped(pp) && !IS_SWAPVP(pp->p_vnode) && + ((PAGE_SHARED(pp) && page_tryupgrade(pp)) || PAGE_EXCL(pp)) && + pp->p_lckcnt == 0 && pp->p_cowcnt == 0 && + !hat_page_is_mapped(pp)) { + + /* + * If page is modified, unlock it + * + * (p_nrm & P_MOD) bit has the latest stuff because: + * (1) We found that this page doesn't have any mappings + * _after_ holding SE_EXCL and + * (2) We didn't drop SE_EXCL lock after the check in (1) + */ + if (checkmod && hat_ismod(pp)) { + page_unlock(pp); + status = PGREL_MOD; + } else { + /*LINTED: constant in conditional context*/ + VN_DISPOSE(pp, B_FREE, 0, kcred); + status = PGREL_CLEAN; + } + } else { + page_unlock(pp); + status = PGREL_NOTREL; + } + return (status); +} + +int +page_try_demote_pages(page_t *pp) +{ + page_t *tpp, *rootpp = pp; + pfn_t pfn = page_pptonum(pp); + spgcnt_t i, npgs; + uint_t szc = pp->p_szc; + vnode_t *vp = pp->p_vnode; + + ASSERT(PAGE_EXCL(rootpp)); + + VM_STAT_ADD(pagecnt.pc_try_demote_pages[0]); + + if (rootpp->p_szc == 0) { + VM_STAT_ADD(pagecnt.pc_try_demote_pages[1]); + return (1); + } + + if (vp != NULL && !IS_SWAPFSVP(vp) && vp != &kvp) { + VM_STAT_ADD(pagecnt.pc_try_demote_pages[2]); + page_demote_vp_pages(rootpp); + ASSERT(pp->p_szc == 0); + return (1); + } + + /* + * Adjust rootpp if passed in is not the base + * constituent page. + */ + npgs = page_get_pagecnt(rootpp->p_szc); + ASSERT(npgs > 1); + if (!IS_P2ALIGNED(pfn, npgs)) { + pfn = P2ALIGN(pfn, npgs); + rootpp = page_numtopp_nolock(pfn); + VM_STAT_ADD(pagecnt.pc_try_demote_pages[3]); + ASSERT(rootpp->p_vnode != NULL); + ASSERT(rootpp->p_szc == szc); + } + + /* + * We can't demote kernel pages since we can't hat_unload() + * the mappings. + */ + if (rootpp->p_vnode == &kvp) + return (0); + + /* + * Attempt to lock all constituent pages except the page passed + * in since it's already locked. + */ + for (tpp = rootpp, i = 0; i < npgs; i++, tpp = page_next(tpp)) { + ASSERT(!PP_ISFREE(tpp)); + ASSERT(tpp->p_vnode != NULL); + + if (tpp != pp && !page_trylock(tpp, SE_EXCL)) + break; + ASSERT(tpp->p_szc == rootpp->p_szc); + ASSERT(page_pptonum(tpp) == page_pptonum(rootpp) + i); + (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD); + } + + /* + * If we failed to lock them all then unlock what we have locked + * so far and bail. 
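+ * (The walk below revisits the first 'i' constituent pages in the
+ * same order and drops every lock taken above, leaving only the
+ * caller's page 'pp' locked.)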
+ */ + if (i < npgs) { + tpp = rootpp; + while (i-- > 0) { + if (tpp != pp) + page_unlock(tpp); + tpp = page_next(tpp); + } + VM_STAT_ADD(pagecnt.pc_try_demote_pages[4]); + return (0); + } + + /* + * XXX probably p_szc clearing and page unlocking can be done within + * one loop but since this is rare code we can play very safe. + */ + for (tpp = rootpp, i = 0; i < npgs; i++, tpp = page_next(tpp)) { + ASSERT(PAGE_EXCL(tpp)); + tpp->p_szc = 0; + } + + /* + * Unlock all pages except the page passed in. + */ + for (tpp = rootpp, i = 0; i < npgs; i++, tpp = page_next(tpp)) { + ASSERT(!hat_page_is_mapped(tpp)); + if (tpp != pp) + page_unlock(tpp); + } + VM_STAT_ADD(pagecnt.pc_try_demote_pages[5]); + return (1); +} + +/* + * Called by page_free() and page_destroy() to demote the page size code + * (p_szc) to 0 (since we can't just put a single PAGESIZE page with non zero + * p_szc on free list, neither can we just clear p_szc of a single page_t + * within a large page since it will break other code that relies on p_szc + * being the same for all page_t's of a large page). Anonymous pages should + * never end up here because anon_map_getpages() cannot deal with p_szc + * changes after a single constituent page is locked. While anonymous or + * kernel large pages are demoted or freed the entire large page at a time + * with all constituent pages locked EXCL for the file system pages we + * have to be able to demote a large page (i.e. decrease all constituent pages + * p_szc) with only just an EXCL lock on one of constituent pages. The reason + * we can easily deal with anonymous page demotion the entire large page at a + * time is that those operation originate at address space level and concern + * the entire large page region with actual demotion only done when pages are + * not shared with any other processes (therefore we can always get EXCL lock + * on all anonymous constituent pages after clearing segment page + * cache). However file system pages can be truncated or invalidated at a + * PAGESIZE level from the file system side and end up in page_free() or + * page_destroy() (we also allow only part of the large page to be SOFTLOCKed + * and therfore pageout should be able to demote a large page by EXCL locking + * any constituent page that is not under SOFTLOCK). In those cases we cannot + * rely on being able to lock EXCL all constituent pages. + * + * To prevent szc changes on file system pages one has to lock all constituent + * pages at least SHARED (or call page_szc_lock()). The only subsystem that + * doesn't rely on locking all constituent pages (or using page_szc_lock()) to + * prevent szc changes is hat layer that uses its own page level mlist + * locks. hat assumes that szc doesn't change after mlist lock for a page is + * taken. Therefore we need to change szc under hat level locks if we only + * have an EXCL lock on a single constituent page and hat still references any + * of constituent pages. (Note we can't "ignore" hat layer by simply + * hat_pageunload() all constituent pages without having EXCL locks on all of + * constituent pages). We use hat_page_demote() call to safely demote szc of + * all constituent pages under hat locks when we only have an EXCL lock on one + * of constituent pages. + * + * This routine calls page_szc_lock() before calling hat_page_demote() to + * allow segvn in one special case not to lock all constituent pages SHARED + * before calling hat_memload_array() that relies on p_szc not changeing even + * before hat level mlist lock is taken. 
In that case segvn uses + * page_szc_lock() to prevent hat_page_demote() changeing p_szc values. + * + * Anonymous or kernel page demotion still has to lock all pages exclusively + * and do hat_pageunload() on all constituent pages before demoting the page + * therefore there's no need for anonymous or kernel page demotion to use + * hat_page_demote() mechanism. + * + * hat_page_demote() removes all large mappings that map pp and then decreases + * p_szc starting from the last constituent page of the large page. By working + * from the tail of a large page in pfn decreasing order allows one looking at + * the root page to know that hat_page_demote() is done for root's szc area. + * e.g. if a root page has szc 1 one knows it only has to lock all constituent + * pages within szc 1 area to prevent szc changes because hat_page_demote() + * that started on this page when it had szc > 1 is done for this szc 1 area. + * + * We are guranteed that all constituent pages of pp's large page belong to + * the same vnode with the consecutive offsets increasing in the direction of + * the pfn i.e. the identity of constituent pages can't change until their + * p_szc is decreased. Therefore it's safe for hat_page_demote() to remove + * large mappings to pp even though we don't lock any constituent page except + * pp (i.e. we won't unload e.g. kernel locked page). + */ +static void +page_demote_vp_pages(page_t *pp) +{ + kmutex_t *mtx; + + ASSERT(PAGE_EXCL(pp)); + ASSERT(!PP_ISFREE(pp)); + ASSERT(pp->p_vnode != NULL); + ASSERT(!IS_SWAPFSVP(pp->p_vnode)); + ASSERT(pp->p_vnode != &kvp); + + VM_STAT_ADD(pagecnt.pc_demote_pages[0]); + + mtx = page_szc_lock(pp); + if (mtx != NULL) { + hat_page_demote(pp); + mutex_exit(mtx); + } + ASSERT(pp->p_szc == 0); +} + +/* + * Page retire operation. + * + * page_retire() + * Attempt to retire (throw away) page pp. We cannot do this if + * the page is dirty; if the page is clean, we can try. We return 0 on + * success, -1 on failure. This routine should be invoked by the platform's + * memory error detection code. + * + * pages_retired_limit_exceeded() + * We set a limit on the number of pages which may be retired. This + * is set to a percentage of total physical memory. This limit is + * enforced here. + */ + +static pgcnt_t retired_pgcnt = 0; + +/* + * routines to update the count of retired pages + */ +static void +page_retired(page_t *pp) +{ + ASSERT(pp); + + page_settoxic(pp, PAGE_IS_RETIRED); + atomic_add_long(&retired_pgcnt, 1); +} + +static void +retired_page_removed(page_t *pp) +{ + ASSERT(pp); + ASSERT(page_isretired(pp)); + ASSERT(retired_pgcnt > 0); + + page_clrtoxic(pp); + atomic_add_long(&retired_pgcnt, -1); +} + + +static int +pages_retired_limit_exceeded() +{ + pgcnt_t retired_max; + + /* + * If the percentage is zero or is not set correctly, + * return TRUE so that pages are not retired. + */ + if (max_pages_retired_bps <= 0 || + max_pages_retired_bps >= 10000) + return (1); + + /* + * Calculate the maximum number of pages allowed to + * be retired as a percentage of total physical memory + * (Remember that we are using basis points, hence the 10000.) 
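+ * For example (illustrative numbers only): with 8K pages and
+ * 4 Gbytes of memory physmem is 524288 pages, so a setting of
+ * max_pages_retired_bps = 10 (i.e. 0.1%) yields
+ *
+ * retired_max = (524288 * 10) / 10000 = 524 pages.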
+ */ + retired_max = (physmem * max_pages_retired_bps) / 10000; + + /* + * return 'TRUE' if we have already retired more + * than the legal limit + */ + return (retired_pgcnt >= retired_max); +} + +#define PAGE_RETIRE_SELOCK 0 +#define PAGE_RETIRE_NORECLAIM 1 +#define PAGE_RETIRE_LOCKED 2 +#define PAGE_RETIRE_COW 3 +#define PAGE_RETIRE_DIRTY 4 +#define PAGE_RETIRE_LPAGE 5 +#define PAGE_RETIRE_SUCCESS 6 +#define PAGE_RETIRE_LIMIT 7 +#define PAGE_RETIRE_NCODES 8 + +typedef struct page_retire_op { + int pr_count; + short pr_unlock; + short pr_retval; + char *pr_message; +} page_retire_op_t; + +page_retire_op_t page_retire_ops[PAGE_RETIRE_NCODES] = { + { 0, 0, -1, "cannot lock page" }, + { 0, 0, -1, "cannot reclaim cached page" }, + { 0, 1, -1, "page is locked" }, + { 0, 1, -1, "copy-on-write page" }, + { 0, 1, -1, "page is dirty" }, + { 0, 1, -1, "cannot demote large page" }, + { 0, 0, 0, "page successfully retired" }, + { 0, 0, -1, "excess pages retired already" }, +}; + +static int +page_retire_done(page_t *pp, int code) +{ + page_retire_op_t *prop = &page_retire_ops[code]; + + prop->pr_count++; + + if (prop->pr_unlock) + page_unlock(pp); + + if (page_retire_messages > 1) { + printf("page_retire(%p) pfn 0x%lx %s: %s\n", + (void *)pp, page_pptonum(pp), + prop->pr_retval == -1 ? "failed" : "succeeded", + prop->pr_message); + } + + return (prop->pr_retval); +} + +int +page_retire(page_t *pp, uchar_t flag) +{ + uint64_t pa = ptob((uint64_t)page_pptonum(pp)); + + ASSERT(flag == PAGE_IS_FAILING || flag == PAGE_IS_TOXIC); + + /* + * DR operations change the association between a page_t + * and the physical page it represents. Check if the + * page is still bad. + */ + if (!page_isfaulty(pp)) { + page_clrtoxic(pp); + return (page_retire_done(pp, PAGE_RETIRE_SUCCESS)); + } + + /* + * We set the flag here so that even if we fail due + * to exceeding the limit for retired pages, the + * page will still be checked and either cleared + * or retired in page_free(). + */ + page_settoxic(pp, flag); + + if (flag == PAGE_IS_TOXIC) { + if (page_retire_messages) { + cmn_err(CE_NOTE, "Scheduling clearing of error on" + " page 0x%08x.%08x", + (uint32_t)(pa >> 32), (uint32_t)pa); + } + + } else { /* PAGE_IS_FAILING */ + if (pages_retired_limit_exceeded()) { + /* + * Return as we have already exceeded the + * maximum number of pages allowed to be + * retired + */ + return (page_retire_done(pp, PAGE_RETIRE_LIMIT)); + } + + if (page_retire_messages) { + cmn_err(CE_NOTE, "Scheduling removal of " + "page 0x%08x.%08x", + (uint32_t)(pa >> 32), (uint32_t)pa); + } + } + + if (PAGE_LOCKED(pp) || !page_trylock(pp, SE_EXCL)) + return (page_retire_done(pp, PAGE_RETIRE_SELOCK)); + + /* + * If this is a large page we first try and demote it + * to PAGESIZE pages and then dispose of the toxic page. + * On failure we will let the page free/destroy + * code handle it later since this is a mapped page. + * Note that free large pages can always be demoted. + * + */ + if (pp->p_szc != 0) { + if (PP_ISFREE(pp)) + (void) page_demote_free_pages(pp); + else + (void) page_try_demote_pages(pp); + + if (pp->p_szc != 0) + return (page_retire_done(pp, PAGE_RETIRE_LPAGE)); + } + + if (PP_ISFREE(pp)) { + if (!page_reclaim(pp, NULL)) + return (page_retire_done(pp, PAGE_RETIRE_NORECLAIM)); + /*LINTED: constant in conditional context*/ + VN_DISPOSE(pp, pp->p_vnode ? 
B_INVAL : B_FREE, 0, kcred) + return (page_retire_done(pp, PAGE_RETIRE_SUCCESS)); + } + + if (pp->p_lckcnt != 0) + return (page_retire_done(pp, PAGE_RETIRE_LOCKED)); + + if (pp->p_cowcnt != 0) + return (page_retire_done(pp, PAGE_RETIRE_COW)); + + /* + * Unload all translations to this page. No new translations + * can be created while we hold the exclusive lock on the page. + */ + (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); + + if (hat_ismod(pp)) + return (page_retire_done(pp, PAGE_RETIRE_DIRTY)); + + /*LINTED: constant in conditional context*/ + VN_DISPOSE(pp, B_INVAL, 0, kcred); + + return (page_retire_done(pp, PAGE_RETIRE_SUCCESS)); +} + +/* + * Mark any existing pages for migration in the given range + */ +void +page_mark_migrate(struct seg *seg, caddr_t addr, size_t len, + struct anon_map *amp, ulong_t anon_index, vnode_t *vp, + u_offset_t vnoff, int rflag) +{ + struct anon *ap; + vnode_t *curvp; + lgrp_t *from; + pgcnt_t i; + pgcnt_t nlocked; + u_offset_t off; + pfn_t pfn; + size_t pgsz; + size_t segpgsz; + pgcnt_t pages; + uint_t pszc; + page_t **ppa; + pgcnt_t ppa_nentries; + page_t *pp; + caddr_t va; + ulong_t an_idx; + anon_sync_obj_t cookie; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + /* + * Don't do anything if don't need to do lgroup optimizations + * on this system + */ + if (!lgrp_optimizations()) + return; + + /* + * Align address and length to (potentially large) page boundary + */ + segpgsz = page_get_pagesize(seg->s_szc); + addr = (caddr_t)P2ALIGN((uintptr_t)addr, segpgsz); + if (rflag) + len = P2ROUNDUP(len, segpgsz); + + /* + * Allocate page array to accomodate largest page size + */ + pgsz = page_get_pagesize(page_num_pagesizes() - 1); + ppa_nentries = btop(pgsz); + ppa = kmem_zalloc(ppa_nentries * sizeof (page_t *), KM_SLEEP); + + /* + * Do one (large) page at a time + */ + va = addr; + while (va < addr + len) { + /* + * Lookup (root) page for vnode and offset corresponding to + * this virtual address + * Try anonmap first since there may be copy-on-write + * pages, but initialize vnode pointer and offset using + * vnode arguments just in case there isn't an amp. 
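+ * In rough pseudo-code the lookup below is:
+ *
+ * if (amp != NULL and it has an anon slot for this address)
+ * <curvp, off> = swap_xlate(anon slot)
+ * else
+ * <curvp, off> = <vp, vnoff + va - seg->s_base>
+ * pp = page_lookup(curvp, off, SE_SHARED)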
+ */
+ curvp = vp;
+ off = vnoff + va - seg->s_base;
+ if (amp) {
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
+ an_idx = anon_index + seg_page(seg, va);
+ anon_array_enter(amp, an_idx, &cookie);
+ ap = anon_get_ptr(amp->ahp, an_idx);
+ if (ap)
+ swap_xlate(ap, &curvp, &off);
+ anon_array_exit(&cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ }
+
+ pp = NULL;
+ if (curvp)
+ pp = page_lookup(curvp, off, SE_SHARED);
+
+ /*
+ * If there isn't a page at this virtual address,
+ * skip to next page
+ */
+ if (pp == NULL) {
+ va += PAGESIZE;
+ continue;
+ }
+
+ /*
+ * Figure out which lgroup this page is in for kstats
+ */
+ pfn = page_pptonum(pp);
+ from = lgrp_pfn_to_lgrp(pfn);
+
+ /*
+ * Get page size, and round up and skip to next page boundary
+ * if unaligned address
+ */
+ pszc = pp->p_szc;
+ pgsz = page_get_pagesize(pszc);
+ pages = btop(pgsz);
+ if (!IS_P2ALIGNED(va, pgsz) ||
+ !IS_P2ALIGNED(pfn, pages) ||
+ pgsz > segpgsz) {
+ pgsz = MIN(pgsz, segpgsz);
+ page_unlock(pp);
+ i = btop(P2END((uintptr_t)va, pgsz) -
+ (uintptr_t)va);
+ va = (caddr_t)P2END((uintptr_t)va, pgsz);
+ lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS, i);
+ continue;
+ }
+
+ /*
+ * Upgrade to exclusive lock on page
+ */
+ if (!page_tryupgrade(pp)) {
+ page_unlock(pp);
+ va += pgsz;
+ lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS,
+ btop(pgsz));
+ continue;
+ }
+
+ /*
+ * Remember pages locked exclusively and how many
+ */
+ ppa[0] = pp;
+ nlocked = 1;
+
+ /*
+ * Lock constituent pages if this is large page
+ */
+ if (pages > 1) {
+ /*
+ * Lock all constituents except root page, since it
+ * should be locked already.
+ */
+ for (i = 1; i < pages; i++) {
+ pp = page_next(pp);
+ if (!page_trylock(pp, SE_EXCL)) {
+ break;
+ }
+ if (PP_ISFREE(pp) ||
+ pp->p_szc != pszc) {
+ /*
+ * hat_page_demote() raced in with us.
+ */
+ ASSERT(!IS_SWAPFSVP(curvp));
+ page_unlock(pp);
+ break;
+ }
+ ppa[nlocked] = pp;
+ nlocked++;
+ }
+ }
+
+ /*
+ * If all constituent pages couldn't be locked,
+ * unlock pages locked so far and skip to next page.
+ */
+ if (nlocked != pages) {
+ for (i = 0; i < nlocked; i++)
+ page_unlock(ppa[i]);
+ va += pgsz;
+ lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS,
+ btop(pgsz));
+ continue;
+ }
+
+ /*
+ * hat_page_demote() can no longer happen
+ * since last cons page had the right p_szc after
+ * all cons pages were locked. all cons pages
+ * should now have the same p_szc. 
+ */ + + /* + * All constituent pages locked successfully, so mark + * large page for migration and unload the mappings of + * constituent pages, so a fault will occur on any part of the + * large page + */ + PP_SETMIGRATE(ppa[0]); + for (i = 0; i < nlocked; i++) { + pp = ppa[i]; + (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); + ASSERT(hat_page_getshare(pp) == 0); + page_unlock(pp); + } + lgrp_stat_add(from->lgrp_id, LGRP_PMM_PGS, nlocked); + + va += pgsz; + } + kmem_free(ppa, ppa_nentries * sizeof (page_t *)); +} + +/* + * Migrate any pages that have been marked for migration in the given range + */ +void +page_migrate( + struct seg *seg, + caddr_t addr, + page_t **ppa, + pgcnt_t npages) +{ + lgrp_t *from; + lgrp_t *to; + page_t *newpp; + page_t *pp; + pfn_t pfn; + size_t pgsz; + spgcnt_t page_cnt; + spgcnt_t i; + uint_t pszc; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + while (npages > 0) { + pp = *ppa; + pszc = pp->p_szc; + pgsz = page_get_pagesize(pszc); + page_cnt = btop(pgsz); + + /* + * Check to see whether this page is marked for migration + * + * Assume that root page of large page is marked for + * migration and none of the other constituent pages + * are marked. This really simplifies clearing the + * migrate bit by not having to clear it from each + * constituent page. + * + * note we don't want to relocate an entire large page if + * someone is only using one subpage. + */ + if (npages < page_cnt) + break; + + /* + * Is it marked for migration? + */ + if (!PP_ISMIGRATE(pp)) + goto next; + + /* + * Determine lgroups that page is being migrated between + */ + pfn = page_pptonum(pp); + if (!IS_P2ALIGNED(pfn, page_cnt)) { + break; + } + from = lgrp_pfn_to_lgrp(pfn); + to = lgrp_mem_choose(seg, addr, pgsz); + + /* + * Check to see whether we are trying to migrate page to lgroup + * where it is allocated already + */ + if (to == from) { + PP_CLRMIGRATE(pp); + goto next; + } + + /* + * Need to get exclusive lock's to migrate + */ + for (i = 0; i < page_cnt; i++) { + ASSERT(PAGE_LOCKED(ppa[i])); + if (page_pptonum(ppa[i]) != pfn + i || + ppa[i]->p_szc != pszc) { + break; + } + if (!page_tryupgrade(ppa[i])) { + lgrp_stat_add(from->lgrp_id, + LGRP_PM_FAIL_LOCK_PGS, + page_cnt); + break; + } + } + if (i != page_cnt) { + while (--i != -1) { + page_downgrade(ppa[i]); + } + goto next; + } + + (void) page_create_wait(page_cnt, PG_WAIT); + newpp = page_get_replacement_page(pp, to, PGR_SAMESZC); + if (newpp == NULL) { + page_create_putback(page_cnt); + for (i = 0; i < page_cnt; i++) { + page_downgrade(ppa[i]); + } + lgrp_stat_add(to->lgrp_id, LGRP_PM_FAIL_ALLOC_PGS, + page_cnt); + goto next; + } + ASSERT(newpp->p_szc == pszc); + /* + * Clear migrate bit and relocate page + */ + PP_CLRMIGRATE(pp); + if (page_relocate(&pp, &newpp, 0, 1, &page_cnt, to)) { + panic("page_migrate: page_relocate failed"); + } + ASSERT(page_cnt * PAGESIZE == pgsz); + + /* + * Keep stats for number of pages migrated from and to + * each lgroup + */ + lgrp_stat_add(from->lgrp_id, LGRP_PM_SRC_PGS, page_cnt); + lgrp_stat_add(to->lgrp_id, LGRP_PM_DEST_PGS, page_cnt); + /* + * update the page_t array we were passed in and + * unlink constituent pages of a large page. 
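+ * (Each pass of the loop below takes the current head of the newpp
+ * list, stores it in ppa[i], unlinks it with page_sub() and then
+ * downgrades its lock from SE_EXCL to SE_SHARED.)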
+ */ + for (i = 0; i < page_cnt; ++i, ++pp) { + ASSERT(PAGE_EXCL(newpp)); + ASSERT(newpp->p_szc == pszc); + ppa[i] = newpp; + pp = newpp; + page_sub(&newpp, pp); + page_downgrade(pp); + } + ASSERT(newpp == NULL); +next: + addr += pgsz; + ppa += page_cnt; + npages -= page_cnt; + } +} + +/* + * initialize the vnode for retired pages + */ +static void +page_retired_init(void) +{ + vn_setops(&retired_ppages, &retired_vnodeops); +} + +/* ARGSUSED */ +static void +retired_dispose(vnode_t *vp, page_t *pp, int flag, int dn, cred_t *cr) +{ + panic("retired_dispose invoked"); +} + +/* ARGSUSED */ +static void +retired_inactive(vnode_t *vp, cred_t *cr) +{} + +void +page_unretire_pages(void) +{ + page_t *pp; + kmutex_t *vphm; + vnode_t *vp; + page_t *rpages[UNRETIRE_PAGES]; + pgcnt_t i, npages, rmem; + uint64_t pa; + + rmem = 0; + + for (;;) { + /* + * We do this in 2 steps: + * + * 1. We walk the retired pages list and collect a list of + * pages that have the toxic field cleared. + * + * 2. We iterate through the page list and unretire each one. + * + * We have to do it in two steps on account of the mutexes that + * we need to acquire. + */ + + vp = &retired_ppages; + vphm = page_vnode_mutex(vp); + mutex_enter(vphm); + + if ((pp = vp->v_pages) == NULL) { + mutex_exit(vphm); + break; + } + + i = 0; + do { + ASSERT(pp != NULL); + ASSERT(pp->p_vnode == vp); + + /* + * DR operations change the association between a page_t + * and the physical page it represents. Check if the + * page is still bad. If not, unretire it. + */ + if (!page_isfaulty(pp)) + rpages[i++] = pp; + + pp = pp->p_vpnext; + } while ((pp != vp->v_pages) && (i < UNRETIRE_PAGES)); + + mutex_exit(vphm); + + npages = i; + for (i = 0; i < npages; i++) { + pp = rpages[i]; + pa = ptob((uint64_t)page_pptonum(pp)); + + /* + * Need to upgrade the shared lock to an exclusive + * lock in order to hash out the page. + * + * The page could have been retired but the page lock + * may not have been downgraded yet. If so, skip this + * page. page_free() will call this function after the + * lock is downgraded. + */ + + if (!PAGE_SHARED(pp) || !page_tryupgrade(pp)) + continue; + + /* + * Both page_free() and DR call this function. They + * can potentially call this function at the same + * time and race with each other. + */ + if (!page_isretired(pp) || page_isfaulty(pp)) { + page_downgrade(pp); + continue; + } + + cmn_err(CE_NOTE, + "unretiring retired page 0x%08x.%08x", + (uint32_t)(pa >> 32), (uint32_t)pa); + + /* + * When a page is removed from the retired pages vnode, + * its toxic field is also cleared. So, we do not have + * to do that seperately here. + */ + page_hashout(pp, (kmutex_t *)NULL); + + /* + * This is a good page. So, free it. + */ + pp->p_vnode = NULL; + page_free(pp, 1); + rmem++; + } + + /* + * If the rpages array was filled up, then there could be more + * retired pages that are not faulty. We need to iterate + * again and unretire them. Otherwise, we are done. + */ + if (npages < UNRETIRE_PAGES) + break; + } + + mutex_enter(&freemem_lock); + availrmem += rmem; + mutex_exit(&freemem_lock); +} + +ulong_t mem_waiters = 0; +ulong_t max_count = 20; +#define MAX_DELAY 0x1ff + +/* + * Check if enough memory is available to proceed. + * Depending on system configuration and how much memory is + * reserved for swap we need to check against two variables. + * e.g. on systems with little physical swap availrmem can be + * more reliable indicator of how much memory is available. 
+ * On systems with large phys swap freemem can be better indicator. + * If freemem drops below threshold level don't return an error + * immediately but wake up pageout to free memory and block. + * This is done number of times. If pageout is not able to free + * memory within certain time return an error. + * The same applies for availrmem but kmem_reap is used to + * free memory. + */ +int +page_mem_avail(pgcnt_t npages) +{ + ulong_t count; + +#if defined(__i386) + if (freemem > desfree + npages && + availrmem > swapfs_reserve + npages && + btop(vmem_size(heap_arena, VMEM_FREE)) > tune.t_minarmem + + npages) + return (1); +#else + if (freemem > desfree + npages && + availrmem > swapfs_reserve + npages) + return (1); +#endif + + count = max_count; + atomic_add_long(&mem_waiters, 1); + + while (freemem < desfree + npages && --count) { + cv_signal(&proc_pageout->p_cv); + if (delay_sig(hz + (mem_waiters & MAX_DELAY))) { + atomic_add_long(&mem_waiters, -1); + return (0); + } + } + if (count == 0) { + atomic_add_long(&mem_waiters, -1); + return (0); + } + + count = max_count; + while (availrmem < swapfs_reserve + npages && --count) { + kmem_reap(); + if (delay_sig(hz + (mem_waiters & MAX_DELAY))) { + atomic_add_long(&mem_waiters, -1); + return (0); + } + } + atomic_add_long(&mem_waiters, -1); + if (count == 0) + return (0); + +#if defined(__i386) + if (btop(vmem_size(heap_arena, VMEM_FREE)) < + tune.t_minarmem + npages) + return (0); +#endif + return (1); +} + + +/* + * Search the memory segments to locate the desired page. Within a + * segment, pages increase linearly with one page structure per + * physical page frame (size PAGESIZE). The search begins + * with the segment that was accessed last, to take advantage of locality. + * If the hint misses, we start from the beginning of the sorted memseg list + */ + + +/* + * Some data structures for pfn to pp lookup. + */ +ulong_t mhash_per_slot; +struct memseg *memseg_hash[N_MEM_SLOTS]; + +page_t * +page_numtopp_nolock(pfn_t pfnum) +{ + static struct memseg *last_memseg_by_pfnum = NULL; + struct memseg *seg; + page_t *pp; + + /* + * XXX - Since page_numtopp_nolock is called in many places where + * the search fails more than it succeeds. It maybe worthwhile + * to put a check for pf_is_memory or a pfnum <= max_pfn (set at + * boot time). 
+ * + * if (!pf_is_memory(pfnum) || (pfnum > max_pfn)) + * return (NULL); + */ + + MEMSEG_STAT_INCR(nsearch); + + /* Try last winner first */ + if (((seg = last_memseg_by_pfnum) != NULL) && + (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) { + MEMSEG_STAT_INCR(nlastwon); + pp = seg->pages + (pfnum - seg->pages_base); + if (pp->p_pagenum == pfnum) + return ((page_t *)pp); + } + + /* Else Try hash */ + if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) && + (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) { + MEMSEG_STAT_INCR(nhashwon); + last_memseg_by_pfnum = seg; + pp = seg->pages + (pfnum - seg->pages_base); + if (pp->p_pagenum == pfnum) + return ((page_t *)pp); + } + + /* Else Brute force */ + for (seg = memsegs; seg != NULL; seg = seg->next) { + if (pfnum >= seg->pages_base && pfnum < seg->pages_end) { + last_memseg_by_pfnum = seg; + pp = seg->pages + (pfnum - seg->pages_base); + return ((page_t *)pp); + } + } + last_memseg_by_pfnum = NULL; + MEMSEG_STAT_INCR(nnotfound); + return ((page_t *)NULL); + +} + +struct memseg * +page_numtomemseg_nolock(pfn_t pfnum) +{ + struct memseg *seg; + page_t *pp; + + /* Try hash */ + if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) && + (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) { + pp = seg->pages + (pfnum - seg->pages_base); + if (pp->p_pagenum == pfnum) + return (seg); + } + + /* Else Brute force */ + for (seg = memsegs; seg != NULL; seg = seg->next) { + if (pfnum >= seg->pages_base && pfnum < seg->pages_end) { + return (seg); + } + } + return ((struct memseg *)NULL); +} + +/* + * Given a page and a count return the page struct that is + * n structs away from the current one in the global page + * list. + * + * This function wraps to the first page upon + * reaching the end of the memseg list. + */ +page_t * +page_nextn(page_t *pp, ulong_t n) +{ + static struct memseg *last_page_next_memseg = NULL; + struct memseg *seg; + page_t *ppn; + + if (((seg = last_page_next_memseg) == NULL) || + (seg->pages_base == seg->pages_end) || + !(pp >= seg->pages && pp < seg->epages)) { + + for (seg = memsegs; seg; seg = seg->next) { + if (pp >= seg->pages && pp < seg->epages) + break; + } + + if (seg == NULL) { + /* Memory delete got in, return something valid. */ + /* TODO: fix me. */ + seg = memsegs; + pp = seg->pages; + } + } + + /* check for wraparound - possible if n is large */ + while ((ppn = (pp + n)) >= seg->epages || ppn < pp) { + n -= seg->epages - pp; + seg = seg->next; + if (seg == NULL) + seg = memsegs; + pp = seg->pages; + } + last_page_next_memseg = seg; + return (ppn); +} + +/* + * Initialize for a loop using page_next_scan_large(). + */ +page_t * +page_next_scan_init(void **cookie) +{ + ASSERT(cookie != NULL); + *cookie = (void *)memsegs; + return ((page_t *)memsegs->pages); +} + +/* + * Return the next page in a scan of page_t's, assuming we want + * to skip over sub-pages within larger page sizes. + * + * The cookie is used to keep track of the current memseg. + */ +page_t * +page_next_scan_large( + page_t *pp, + ulong_t *n, + void **cookie) +{ + struct memseg *seg = (struct memseg *)*cookie; + page_t *new_pp; + ulong_t cnt; + pfn_t pfn; + + + /* + * get the count of page_t's to skip based on the page size + */ + ASSERT(pp != NULL); + if (pp->p_szc == 0) { + cnt = 1; + } else { + pfn = page_pptonum(pp); + cnt = page_get_pagecnt(pp->p_szc); + cnt -= pfn & (cnt - 1); + } + *n += cnt; + new_pp = pp + cnt; + + /* + * Catch if we went past the end of the current memory segment. 
If so, + * just move to the next segment with pages. + */ + if (new_pp >= seg->epages) { + do { + seg = seg->next; + if (seg == NULL) + seg = memsegs; + } while (seg->pages == seg->epages); + new_pp = seg->pages; + *cookie = (void *)seg; + } + + return (new_pp); +} + + +/* + * Returns next page in list. Note: this function wraps + * to the first page in the list upon reaching the end + * of the list. Callers should be aware of this fact. + */ + +/* We should change this be a #define */ + +page_t * +page_next(page_t *pp) +{ + return (page_nextn(pp, 1)); +} + +/* + * Special for routines processing an array of page_t. + */ +page_t * +page_nextn_raw(page_t *pp, ulong_t n) +{ + return (pp+n); +} + +page_t * +page_first() +{ + return ((page_t *)memsegs->pages); +} + + +/* + * This routine is called at boot with the initial memory configuration + * and when memory is added or removed. + */ +void +build_pfn_hash() +{ + pfn_t cur; + pgcnt_t index; + struct memseg *pseg; + int i; + + /* + * Clear memseg_hash array. + * Since memory add/delete is designed to operate concurrently + * with normal operation, the hash rebuild must be able to run + * concurrently with page_numtopp_nolock(). To support this + * functionality, assignments to memseg_hash array members must + * be done atomically. + * + * NOTE: bzero() does not currently guarantee this for kernel + * threads, and cannot be used here. + */ + for (i = 0; i < N_MEM_SLOTS; i++) + memseg_hash[i] = NULL; + + hat_kpm_mseghash_clear(N_MEM_SLOTS); + + /* + * Physmax is the last valid pfn. + */ + mhash_per_slot = (physmax + 1) >> MEM_HASH_SHIFT; + for (pseg = memsegs; pseg != NULL; pseg = pseg->next) { + index = MEMSEG_PFN_HASH(pseg->pages_base); + cur = pseg->pages_base; + do { + if (index >= N_MEM_SLOTS) + index = MEMSEG_PFN_HASH(cur); + + if (memseg_hash[index] == NULL || + memseg_hash[index]->pages_base > pseg->pages_base) { + memseg_hash[index] = pseg; + hat_kpm_mseghash_update(index, pseg); + } + cur += mhash_per_slot; + index++; + } while (cur < pseg->pages_end); + } +} + +/* + * Return the pagenum for the pp + */ +pfn_t +page_pptonum(page_t *pp) +{ + return (pp->p_pagenum); +} + +/* + * interface to the referenced and modified etc bits + * in the PSM part of the page struct + * when no locking is desired. + */ +void +page_set_props(page_t *pp, uint_t flags) +{ + ASSERT((flags & ~(P_MOD | P_REF | P_RO)) == 0); + pp->p_nrm |= (uchar_t)flags; +} + +void +page_clr_all_props(page_t *pp) +{ + pp->p_nrm = 0; +} + +/* + * The following functions is called from free_vp_pages() + * for an inexact estimate of a newly free'd page... + */ +ulong_t +page_share_cnt(page_t *pp) +{ + return (hat_page_getshare(pp)); +} + +/* + * The following functions are used in handling memory + * errors. 
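+ * p_toxic is a byte-sized flag field that is updated without the
+ * page lock held; page_settoxic() and page_clrtoxic_flag() below
+ * therefore retry a cas8() until a re-read of p_toxic shows that the
+ * requested bit change has stuck.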
+ */ + +int +page_istoxic(page_t *pp) +{ + return ((pp->p_toxic & PAGE_IS_TOXIC) == PAGE_IS_TOXIC); +} + +int +page_isfailing(page_t *pp) +{ + return ((pp->p_toxic & PAGE_IS_FAILING) == PAGE_IS_FAILING); +} + +int +page_isretired(page_t *pp) +{ + return ((pp->p_toxic & PAGE_IS_RETIRED) == PAGE_IS_RETIRED); +} + +int +page_deteriorating(page_t *pp) +{ + return ((pp->p_toxic & (PAGE_IS_TOXIC | PAGE_IS_FAILING)) != 0); +} + +void +page_settoxic(page_t *pp, uchar_t flag) +{ + uchar_t new_flag = 0; + while ((new_flag & flag) != flag) { + uchar_t old_flag = pp->p_toxic; + new_flag = old_flag | flag; + (void) cas8(&pp->p_toxic, old_flag, new_flag); + new_flag = ((volatile page_t *)pp)->p_toxic; + } +} + +void +page_clrtoxic(page_t *pp) +{ + /* + * We don't need to worry about atomicity on the + * p_toxic flag here as this is only called from + * page_free() while holding an exclusive lock on + * the page + */ + pp->p_toxic = PAGE_IS_OK; +} + +void +page_clrtoxic_flag(page_t *pp, uchar_t flag) +{ + uchar_t new_flag = ((volatile page_t *)pp)->p_toxic; + while ((new_flag & flag) == flag) { + uchar_t old_flag = new_flag; + new_flag = old_flag & ~flag; + (void) cas8(&pp->p_toxic, old_flag, new_flag); + new_flag = ((volatile page_t *)pp)->p_toxic; + } +} + +int +page_isfaulty(page_t *pp) +{ + return ((pp->p_toxic & PAGE_IS_FAULTY) == PAGE_IS_FAULTY); +} + +/* + * The following four functions are called from /proc code + * for the /proc/<pid>/xmap interface. + */ +int +page_isshared(page_t *pp) +{ + return (hat_page_getshare(pp) > 1); +} + +int +page_isfree(page_t *pp) +{ + return (PP_ISFREE(pp)); +} + +int +page_isref(page_t *pp) +{ + return (hat_page_getattr(pp, P_REF)); +} + +int +page_ismod(page_t *pp) +{ + return (hat_page_getattr(pp, P_MOD)); +} diff --git a/usr/src/uts/common/vm/vm_pagelist.c b/usr/src/uts/common/vm/vm_pagelist.c new file mode 100644 index 0000000000..3d1d773321 --- /dev/null +++ b/usr/src/uts/common/vm/vm_pagelist.c @@ -0,0 +1,3726 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * This file contains common functions to access and manage the page lists. + * Many of these routines originated from platform dependent modules + * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and modified to function in + * a platform independent manner. + * + * vm/vm_dep.h provides for platform specific support. 
+ */ + +#include <sys/types.h> +#include <sys/debug.h> +#include <sys/cmn_err.h> +#include <sys/systm.h> +#include <sys/atomic.h> +#include <sys/sysmacros.h> +#include <vm/as.h> +#include <vm/page.h> +#include <vm/seg_kmem.h> +#include <vm/seg_vn.h> +#include <sys/memnode.h> +#include <vm/vm_dep.h> +#include <sys/lgrp.h> +#include <sys/mem_config.h> +#include <sys/callb.h> +#include <sys/mem_cage.h> +#include <sys/sdt.h> + +extern uint_t vac_colors; + +/* + * number of page colors equivalent to reqested color in page_get routines. + * If set, keeps large pages intact longer and keeps MPO allocation + * from the local mnode in favor of acquiring the 'correct' page color from + * a demoted large page or from a remote mnode. + */ +int colorequiv; + +/* + * if set, specifies the percentage of large pages that are free from within + * a large page region before attempting to lock those pages for + * page_get_contig_pages processing. + * + * Should be turned on when kpr is available when page_trylock_contig_pages + * can be more selective. + */ + +int ptcpthreshold; + +/* + * Limit page get contig page search based on failure cnts in pgcpfailcnt[]. + * use slot 0 (base page size unused) to enable or disable limiting search. + * Enabled by default. + */ +int pgcpfailcnt[MMU_PAGE_SIZES]; +int pgcplimitsearch = 1; + +#ifdef VM_STATS +struct vmm_vmstats_str vmm_vmstats; + +#endif /* VM_STATS */ + +#if defined(__sparc) +#define LPGCREATE 0 +#else +/* enable page_get_contig_pages */ +#define LPGCREATE 1 +#endif + +int pg_contig_disable; +int pg_lpgcreate_nocage = LPGCREATE; + +/* + * page_freelist_fill pfn flag to signify no hi pfn requirement. + */ +#define PFNNULL 0 + +/* Flags involved in promotion and demotion routines */ +#define PC_FREE 0x1 /* put page on freelist */ +#define PC_ALLOC 0x2 /* return page for allocation */ + +/* + * Flag for page_demote to be used with PC_FREE to denote that we don't care + * what the color is as the color parameter to the function is ignored. + */ +#define PC_NO_COLOR (-1) + +/* + * page counters candidates info + * See page_ctrs_cands comment below for more details. + * fields are as follows: + * pcc_pages_free: # pages which freelist coalesce can create + * pcc_color_free_len: number of elements in pcc_color_free array + * pcc_color_free: pointer to page free counts per color + */ +typedef struct pcc_info { + pgcnt_t pcc_pages_free; + int pcc_color_free_len; + pgcnt_t *pcc_color_free; +} pcc_info_t; + +/* + * On big machines it can take a long time to check page_counters + * arrays. page_ctrs_cands is a summary array whose elements are a dynamically + * updated sum of all elements of the corresponding page_counters arrays. + * page_freelist_coalesce() searches page_counters only if an appropriate + * element of page_ctrs_cands array is greater than 0. + * + * An extra dimension is used for page_ctrs_cands to spread the elements + * over a few e$ cache lines to avoid serialization during the array + * updates. 
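+ *
+ * So, to ask "how many size 'r' pages could currently be built in
+ * mnode 'm'", a reader sums the per-lock-slot counts; this is what
+ * the PGCTRS_CANDS_GETVALUE() macro below amounts to:
+ *
+ * val = 0;
+ * for (i = 0; i < NPC_MUTEX; i++)
+ * val += page_ctrs_cands[i][r][m].pcc_pages_free;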
+ */ +#pragma align 64(page_ctrs_cands) + +static pcc_info_t *page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES]; + +/* + * Return in val the total number of free pages which can be created + * for the given mnode (m) and region size (r) + */ +#define PGCTRS_CANDS_GETVALUE(m, r, val) { \ + int i; \ + val = 0; \ + for (i = 0; i < NPC_MUTEX; i++) { \ + val += page_ctrs_cands[i][(r)][(m)].pcc_pages_free; \ + } \ +} + +/* + * Return in val the total number of free pages which can be created + * for the given mnode (m), region size (r), and color (c) + */ +#define PGCTRS_CANDS_GETVALUECOLOR(m, r, c, val) { \ + int i; \ + val = 0; \ + ASSERT((c) < page_ctrs_cands[0][(r)][(m)].pcc_color_free_len); \ + for (i = 0; i < NPC_MUTEX; i++) { \ + val += page_ctrs_cands[i][(r)][(m)].pcc_color_free[(c)]; \ + } \ +} + +/* + * We can only allow a single thread to update a counter within the physical + * range of the largest supported page size. That is the finest granularity + * possible since the counter values are dependent on each other + * as you move accross region sizes. PP_CTR_LOCK_INDX is used to determine the + * ctr_mutex lock index for a particular physical range. + */ +static kmutex_t *ctr_mutex[NPC_MUTEX]; + +#define PP_CTR_LOCK_INDX(pp) \ + (((pp)->p_pagenum >> \ + (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1)) + +/* + * Local functions prototypes. + */ + +void page_ctr_add(page_t *, int); +void page_ctr_add_internal(int, page_t *, int); +void page_ctr_sub(page_t *, int); +uint_t page_convert_color(uchar_t, uchar_t, uint_t); +void page_freelist_lock(int); +void page_freelist_unlock(int); +page_t *page_promote(int, pfn_t, uchar_t, int); +page_t *page_demote(int, pfn_t, uchar_t, uchar_t, int, int); +page_t *page_freelist_fill(uchar_t, int, int, int, pfn_t); +page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int); +static int page_trylock_cons(page_t *pp, se_t se); + +#define PNUM_SIZE(szc) \ + (hw_page_array[(szc)].hp_size >> hw_page_array[0].hp_shift) +#define PNUM_SHIFT(szc) \ + (hw_page_array[(szc)].hp_shift - hw_page_array[0].hp_shift) + +/* + * The page_counters array below is used to keep track of free contiguous + * physical memory. A hw_page_map_t will be allocated per mnode per szc. + * This contains an array of counters, the size of the array, a shift value + * used to convert a pagenum into a counter array index or vice versa, as + * well as a cache of the last successful index to be promoted to a larger + * page size. As an optimization, we keep track of the last successful index + * to be promoted per page color for the given size region, and this is + * allocated dynamically based upon the number of colors for a given + * region size. + * + * Conceptually, the page counters are represented as: + * + * page_counters[region_size][mnode] + * + * region_size: size code of a candidate larger page made up + * of contiguous free smaller pages. + * + * page_counters[region_size][mnode].hpm_counters[index]: + * represents how many (region_size - 1) pages either + * exist or can be created within the given index range. + * + * Let's look at a sparc example: + * If we want to create a free 512k page, we look at region_size 2 + * for the mnode we want. We calculate the index and look at a specific + * hpm_counters location. If we see 8 (FULL_REGION_CNT on sparc) at + * this location, it means that 8 64k pages either exist or can be created + * from 8K pages in order to make a single free 512k page at the given + * index. 
Note that when a region is full, it will contribute to the + * counts in the region above it. Thus we will not know what page + * size the free pages will be which can be promoted to this new free + * page unless we look at all regions below the current region. + */ + +/* + * Note: hpmctr_t is defined in platform vm_dep.h + * hw_page_map_t contains all the information needed for the page_counters + * logic. The fields are as follows: + * + * hpm_counters: dynamically allocated array to hold counter data + * hpm_entries: entries in hpm_counters + * hpm_shift: shift for pnum/array index conv + * hpm_base: PFN mapped to counter index 0 + * hpm_color_current_len: # of elements in hpm_color_current "array" below + * hpm_color_current: last index in counter array for this color at + * which we successfully created a large page + */ +typedef struct hw_page_map { + hpmctr_t *hpm_counters; + size_t hpm_entries; + int hpm_shift; + pfn_t hpm_base; + size_t hpm_color_current_len; + size_t *hpm_color_current; +} hw_page_map_t; + +/* + * Element zero is not used, but is allocated for convenience. + */ +static hw_page_map_t *page_counters[MMU_PAGE_SIZES]; + +/* + * The following macros are convenient ways to get access to the individual + * elements of the page_counters arrays. They can be used on both + * the left side and right side of equations. + */ +#define PAGE_COUNTERS(mnode, rg_szc, idx) \ + (page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)]) + +#define PAGE_COUNTERS_COUNTERS(mnode, rg_szc) \ + (page_counters[(rg_szc)][(mnode)].hpm_counters) + +#define PAGE_COUNTERS_SHIFT(mnode, rg_szc) \ + (page_counters[(rg_szc)][(mnode)].hpm_shift) + +#define PAGE_COUNTERS_ENTRIES(mnode, rg_szc) \ + (page_counters[(rg_szc)][(mnode)].hpm_entries) + +#define PAGE_COUNTERS_BASE(mnode, rg_szc) \ + (page_counters[(rg_szc)][(mnode)].hpm_base) + +#define PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, rg_szc) \ + (page_counters[(rg_szc)][(mnode)].hpm_color_current_len) + +#define PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc) \ + (page_counters[(rg_szc)][(mnode)].hpm_color_current) + +#define PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color) \ + (page_counters[(rg_szc)][(mnode)].hpm_color_current[(color)]) + +#define PNUM_TO_IDX(mnode, rg_szc, pnum) \ + (((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >> \ + PAGE_COUNTERS_SHIFT((mnode), (rg_szc))) + +#define IDX_TO_PNUM(mnode, rg_szc, index) \ + (PAGE_COUNTERS_BASE((mnode), (rg_szc)) + \ + ((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))) + +/* + * Protects the hpm_counters and hpm_color_current memory from changing while + * looking at page counters information. + * Grab the write lock to modify what these fields point at. + * Grab the read lock to prevent any pointers from changing. + * The write lock can not be held during memory allocation due to a possible + * recursion deadlock with trying to grab the read lock while the + * write lock is already held. + */ +krwlock_t page_ctrs_rwlock[MAX_MEM_NODES]; + +/* + * page size to page size code + */ +int +page_szc(size_t pagesize) +{ + int i = 0; + + while (hw_page_array[i].hp_size) { + if (pagesize == hw_page_array[i].hp_size) + return (i); + i++; + } + return (-1); +} + +/* + * page size to page size code for user supported page sizes + */ +int +page_user_szc(size_t pagesize) +{ + int szc = page_szc(pagesize); + if (szc != -1) + return (SZC_2_USERSZC(szc)); + return (-1); +} + +/* + * Return how many page sizes are available for the user to use. 
This is + * what the hardware supports and not based upon how the OS implements the + * support of different page sizes. + */ +uint_t +page_num_user_pagesizes(void) +{ + return (mmu_exported_page_sizes); +} + +uint_t +page_num_pagesizes(void) +{ + return (mmu_page_sizes); +} + +/* + * returns the count of the number of base pagesize pages associated with szc + */ +pgcnt_t +page_get_pagecnt(uint_t szc) +{ + if (szc >= mmu_page_sizes) + panic("page_get_pagecnt: out of range %d", szc); + return (hw_page_array[szc].hp_pgcnt); +} + +size_t +page_get_pagesize(uint_t szc) +{ + if (szc >= mmu_page_sizes) + panic("page_get_pagesize: out of range %d", szc); + return (hw_page_array[szc].hp_size); +} + +/* + * Return the size of a page based upon the index passed in. An index of + * zero refers to the smallest page size in the system, and as index increases + * it refers to the next larger supported page size in the system. + * Note that szc and userszc may not be the same due to unsupported szc's on + * some systems. + */ +size_t +page_get_user_pagesize(uint_t userszc) +{ + uint_t szc = USERSZC_2_SZC(userszc); + + if (szc >= mmu_page_sizes) + panic("page_get_user_pagesize: out of range %d", szc); + return (hw_page_array[szc].hp_size); +} + +uint_t +page_get_shift(uint_t szc) +{ + if (szc >= mmu_page_sizes) + panic("page_get_shift: out of range %d", szc); + return (hw_page_array[szc].hp_shift); +} + +uint_t +page_get_pagecolors(uint_t szc) +{ + ASSERT(page_colors != 0); + return (MAX(page_colors >> PAGE_BSZS_SHIFT(szc), 1)); +} + +/* + * Called by startup(). + * Size up the per page size free list counters based on physmax + * of each node and max_mem_nodes. + */ +size_t +page_ctrs_sz(void) +{ + int r; /* region size */ + int mnode; + uint_t ctrs_sz = 0; + int i; + pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; + + /* + * We need to determine how many page colors there are for each + * page size in order to allocate memory for any color specific + * arrays. + */ + colors_per_szc[0] = page_colors; + for (i = 1; i < mmu_page_sizes; i++) { + colors_per_szc[i] = + page_convert_color(0, i, page_colors - 1) + 1; + } + + for (mnode = 0; mnode < max_mem_nodes; mnode++) { + + pgcnt_t r_pgcnt; + pfn_t r_base; + pgcnt_t r_align; + + if (mem_node_config[mnode].exists == 0) + continue; + + /* + * determine size needed for page counter arrays with + * base aligned to large page size. + */ + for (r = 1; r < mmu_page_sizes; r++) { + /* add in space for hpm_counters */ + r_align = page_get_pagecnt(r); + r_base = mem_node_config[mnode].physbase; + r_base &= ~(r_align - 1); + r_pgcnt = howmany(mem_node_config[mnode].physmax - + r_base, r_align); + /* + * Round up to always allocate on pointer sized + * boundaries. + */ + ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)), + sizeof (hpmctr_t *)); + + /* add in space for hpm_color_current */ + ctrs_sz += (colors_per_szc[r] * + sizeof (size_t)); + } + } + + for (r = 1; r < mmu_page_sizes; r++) { + ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t)); + + /* add in space for page_ctrs_cands */ + ctrs_sz += NPC_MUTEX * max_mem_nodes * (sizeof (pcc_info_t)); + ctrs_sz += NPC_MUTEX * max_mem_nodes * colors_per_szc[r] * + sizeof (pgcnt_t); + } + + /* ctr_mutex */ + ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t)); + + /* size for page list counts */ + PLCNT_SZ(ctrs_sz); + + /* + * add some slop for roundups. page_ctrs_alloc will roundup the start + * address of the counters to ecache_alignsize boundary for every + * memory node. 
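+ *
+ * The size computed here is meant to be paired with page_ctrs_alloc();
+ * a startup caller does, roughly (the bootstrap allocation itself is
+ * only sketched here, not an actual interface):
+ *
+ *	ctrs_sz = page_ctrs_sz();
+ *	alloc_base = <carve ctrs_sz bytes out of boot memory>;
+ *	alloc_base = page_ctrs_alloc(alloc_base);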
+ */ + return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN); +} + +caddr_t +page_ctrs_alloc(caddr_t alloc_base) +{ + int mnode; + int r; /* region size */ + int i; + pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; + + /* + * We need to determine how many page colors there are for each + * page size in order to allocate memory for any color specific + * arrays. + */ + colors_per_szc[0] = page_colors; + for (i = 1; i < mmu_page_sizes; i++) { + colors_per_szc[i] = + page_convert_color(0, i, page_colors - 1) + 1; + } + + for (r = 1; r < mmu_page_sizes; r++) { + page_counters[r] = (hw_page_map_t *)alloc_base; + alloc_base += (max_mem_nodes * sizeof (hw_page_map_t)); + } + + /* page_ctrs_cands */ + for (r = 1; r < mmu_page_sizes; r++) { + for (i = 0; i < NPC_MUTEX; i++) { + page_ctrs_cands[i][r] = (pcc_info_t *)alloc_base; + alloc_base += max_mem_nodes * (sizeof (pcc_info_t)); + + } + } + + /* page_ctrs_cands pcc_color_free array */ + for (r = 1; r < mmu_page_sizes; r++) { + for (i = 0; i < NPC_MUTEX; i++) { + for (mnode = 0; mnode < max_mem_nodes; mnode++) { + page_ctrs_cands[i][r][mnode].pcc_color_free_len + = colors_per_szc[r]; + page_ctrs_cands[i][r][mnode].pcc_color_free = + (pgcnt_t *)alloc_base; + alloc_base += colors_per_szc[r] * + sizeof (pgcnt_t); + } + } + } + + /* ctr_mutex */ + for (i = 0; i < NPC_MUTEX; i++) { + ctr_mutex[i] = (kmutex_t *)alloc_base; + alloc_base += (max_mem_nodes * sizeof (kmutex_t)); + } + + /* initialize page list counts */ + PLCNT_INIT(alloc_base); + + for (mnode = 0; mnode < max_mem_nodes; mnode++) { + + pgcnt_t r_pgcnt; + pfn_t r_base; + pgcnt_t r_align; + int r_shift; + + if (mem_node_config[mnode].exists == 0) + continue; + + for (r = 1; r < mmu_page_sizes; r++) { + /* + * the page_counters base has to be aligned to the + * page count of page size code r otherwise the counts + * will cross large page boundaries. + */ + r_align = page_get_pagecnt(r); + r_base = mem_node_config[mnode].physbase; + /* base needs to be aligned - lower to aligned value */ + r_base &= ~(r_align - 1); + r_pgcnt = howmany(mem_node_config[mnode].physmax - + r_base, r_align); + r_shift = PAGE_BSZS_SHIFT(r); + + PAGE_COUNTERS_SHIFT(mnode, r) = r_shift; + PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt; + PAGE_COUNTERS_BASE(mnode, r) = r_base; + PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, r) = + colors_per_szc[r]; + PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r) = + (size_t *)alloc_base; + alloc_base += (sizeof (size_t) * colors_per_szc[r]); + for (i = 0; i < colors_per_szc[r]; i++) { + PAGE_COUNTERS_CURRENT_COLOR(mnode, r, i) = i; + } + PAGE_COUNTERS_COUNTERS(mnode, r) = + (hpmctr_t *)alloc_base; + /* + * Round up to make alloc_base always be aligned on + * a pointer boundary. + */ + alloc_base += P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt), + sizeof (hpmctr_t *)); + + /* + * Verify that PNUM_TO_IDX and IDX_TO_PNUM + * satisfy the identity requirement. + * We should be able to go from one to the other + * and get consistent values. + */ + ASSERT(PNUM_TO_IDX(mnode, r, + (IDX_TO_PNUM(mnode, r, 0))) == 0); + ASSERT(IDX_TO_PNUM(mnode, r, + (PNUM_TO_IDX(mnode, r, r_base))) == r_base); + } + /* + * Roundup the start address of the page_counters to + * cache aligned boundary for every memory node. + * page_ctrs_sz() has added some slop for these roundups. + */ + alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base, + L2CACHE_ALIGN); + } + + /* Initialize other page counter specific data structures. 
*/ + for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) { + rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL); + } + + return (alloc_base); +} + +/* + * Functions to adjust region counters for each size free list. + * Caller is responsible to acquire the ctr_mutex lock if necessary and + * thus can be called during startup without locks. + */ +/* ARGSUSED */ +void +page_ctr_add_internal(int mnode, page_t *pp, int flags) +{ + ssize_t r; /* region size */ + ssize_t idx; + pfn_t pfnum; + int lckidx; + + ASSERT(pp->p_szc < mmu_page_sizes); + + PLCNT_INCR(pp, mnode, pp->p_szc, flags); + + /* no counter update needed for largest page size */ + if (pp->p_szc >= mmu_page_sizes - 1) { + return; + } + + r = pp->p_szc + 1; + pfnum = pp->p_pagenum; + lckidx = PP_CTR_LOCK_INDX(pp); + + /* + * Increment the count of free pages for the current + * region. Continue looping up in region size incrementing + * count if the preceeding region is full. + */ + while (r < mmu_page_sizes) { + idx = PNUM_TO_IDX(mnode, r, pfnum); + + ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r)); + ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r)); + + if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) + break; + + page_ctrs_cands[lckidx][r][mnode].pcc_pages_free++; + page_ctrs_cands[lckidx][r][mnode]. + pcc_color_free[PP_2_BIN_SZC(pp, r)]++; + r++; + } +} + +void +page_ctr_add(page_t *pp, int flags) +{ + int lckidx = PP_CTR_LOCK_INDX(pp); + int mnode = PP_2_MEM_NODE(pp); + kmutex_t *lock = &ctr_mutex[lckidx][mnode]; + + mutex_enter(lock); + page_ctr_add_internal(mnode, pp, flags); + mutex_exit(lock); +} + +void +page_ctr_sub(page_t *pp, int flags) +{ + int lckidx; + int mnode = PP_2_MEM_NODE(pp); + kmutex_t *lock; + ssize_t r; /* region size */ + ssize_t idx; + pfn_t pfnum; + + ASSERT(pp->p_szc < mmu_page_sizes); + + PLCNT_DECR(pp, mnode, pp->p_szc, flags); + + /* no counter update needed for largest page size */ + if (pp->p_szc >= mmu_page_sizes - 1) { + return; + } + + r = pp->p_szc + 1; + pfnum = pp->p_pagenum; + lckidx = PP_CTR_LOCK_INDX(pp); + lock = &ctr_mutex[lckidx][mnode]; + + /* + * Decrement the count of free pages for the current + * region. Continue looping up in region size decrementing + * count if the preceeding region was full. + */ + mutex_enter(lock); + while (r < mmu_page_sizes) { + idx = PNUM_TO_IDX(mnode, r, pfnum); + + ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r)); + ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0); + + if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) { + break; + } + ASSERT(page_ctrs_cands[lckidx][r][mnode].pcc_pages_free != 0); + ASSERT(page_ctrs_cands[lckidx][r][mnode]. + pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0); + + page_ctrs_cands[lckidx][r][mnode].pcc_pages_free--; + page_ctrs_cands[lckidx][r][mnode]. + pcc_color_free[PP_2_BIN_SZC(pp, r)]--; + r++; + } + mutex_exit(lock); +} + +/* + * Adjust page counters following a memory attach, since typically the + * size of the array needs to change, and the PFN to counter index + * mapping needs to change. 
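+ *
+ * Returns 0 on success and ENOMEM if the replacement arrays cannot be
+ * allocated, in which case the existing counters are left untouched.
+ * A caller in the memory attach path would typically treat a nonzero
+ * return as grounds to back out the attach, e.g. (illustrative caller
+ * only):
+ *
+ *	if (page_ctrs_adjust(mnode) != 0)
+ *		fail the attach;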
+ */ +uint_t +page_ctrs_adjust(int mnode) +{ + pgcnt_t npgs; + int r; /* region size */ + int i; + size_t pcsz, old_csz; + hpmctr_t *new_ctr, *old_ctr; + pfn_t oldbase, newbase; + size_t old_npgs; + hpmctr_t *ctr_cache[MMU_PAGE_SIZES]; + size_t size_cache[MMU_PAGE_SIZES]; + size_t *color_cache[MMU_PAGE_SIZES]; + size_t *old_color_array; + pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; + + newbase = mem_node_config[mnode].physbase & ~PC_BASE_ALIGN_MASK; + npgs = roundup(mem_node_config[mnode].physmax, + PC_BASE_ALIGN) - newbase; + + /* + * We need to determine how many page colors there are for each + * page size in order to allocate memory for any color specific + * arrays. + */ + colors_per_szc[0] = page_colors; + for (r = 1; r < mmu_page_sizes; r++) { + colors_per_szc[r] = + page_convert_color(0, r, page_colors - 1) + 1; + } + + /* + * Preallocate all of the new hpm_counters arrays as we can't + * hold the page_ctrs_rwlock as a writer and allocate memory. + * If we can't allocate all of the arrays, undo our work so far + * and return failure. + */ + for (r = 1; r < mmu_page_sizes; r++) { + pcsz = npgs >> PAGE_BSZS_SHIFT(r); + + ctr_cache[r] = kmem_zalloc(pcsz * + sizeof (hpmctr_t), KM_NOSLEEP); + if (ctr_cache[r] == NULL) { + while (--r >= 1) { + kmem_free(ctr_cache[r], + size_cache[r] * sizeof (hpmctr_t)); + } + return (ENOMEM); + } + size_cache[r] = pcsz; + } + /* + * Preallocate all of the new color current arrays as we can't + * hold the page_ctrs_rwlock as a writer and allocate memory. + * If we can't allocate all of the arrays, undo our work so far + * and return failure. + */ + for (r = 1; r < mmu_page_sizes; r++) { + color_cache[r] = kmem_zalloc(sizeof (size_t) * + colors_per_szc[r], KM_NOSLEEP); + if (color_cache[r] == NULL) { + while (--r >= 1) { + kmem_free(color_cache[r], + colors_per_szc[r] * sizeof (size_t)); + } + for (r = 1; r < mmu_page_sizes; r++) { + kmem_free(ctr_cache[r], + size_cache[r] * sizeof (hpmctr_t)); + } + return (ENOMEM); + } + } + + /* + * Grab the write lock to prevent others from walking these arrays + * while we are modifying them. + */ + rw_enter(&page_ctrs_rwlock[mnode], RW_WRITER); + page_freelist_lock(mnode); + for (r = 1; r < mmu_page_sizes; r++) { + PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r); + old_ctr = PAGE_COUNTERS_COUNTERS(mnode, r); + old_csz = PAGE_COUNTERS_ENTRIES(mnode, r); + oldbase = PAGE_COUNTERS_BASE(mnode, r); + old_npgs = old_csz << PAGE_COUNTERS_SHIFT(mnode, r); + old_color_array = PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r); + + pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r); + new_ctr = ctr_cache[r]; + ctr_cache[r] = NULL; + if (old_ctr != NULL && + (oldbase + old_npgs > newbase) && + (newbase + npgs > oldbase)) { + /* + * Map the intersection of the old and new + * counters into the new array. 
+ */ + size_t offset; + if (newbase > oldbase) { + offset = (newbase - oldbase) >> + PAGE_COUNTERS_SHIFT(mnode, r); + bcopy(old_ctr + offset, new_ctr, + MIN(pcsz, (old_csz - offset)) * + sizeof (hpmctr_t)); + } else { + offset = (oldbase - newbase) >> + PAGE_COUNTERS_SHIFT(mnode, r); + bcopy(old_ctr, new_ctr + offset, + MIN(pcsz - offset, old_csz) * + sizeof (hpmctr_t)); + } + } + + PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr; + PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz; + PAGE_COUNTERS_BASE(mnode, r) = newbase; + PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, r) = colors_per_szc[r]; + PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r) = color_cache[r]; + color_cache[r] = NULL; + /* + * for now, just reset on these events as it's probably + * not worthwhile to try and optimize this. + */ + for (i = 0; i < colors_per_szc[r]; i++) { + PAGE_COUNTERS_CURRENT_COLOR(mnode, r, i) = i; + } + + /* cache info for freeing out of the critical path */ + if ((caddr_t)old_ctr >= kernelheap && + (caddr_t)old_ctr < ekernelheap) { + ctr_cache[r] = old_ctr; + size_cache[r] = old_csz; + } + if ((caddr_t)old_color_array >= kernelheap && + (caddr_t)old_color_array < ekernelheap) { + color_cache[r] = old_color_array; + } + /* + * Verify that PNUM_TO_IDX and IDX_TO_PNUM + * satisfy the identity requirement. + * We should be able to go from one to the other + * and get consistent values. + */ + ASSERT(PNUM_TO_IDX(mnode, r, + (IDX_TO_PNUM(mnode, r, 0))) == 0); + ASSERT(IDX_TO_PNUM(mnode, r, + (PNUM_TO_IDX(mnode, r, newbase))) == newbase); + } + page_freelist_unlock(mnode); + rw_exit(&page_ctrs_rwlock[mnode]); + + /* + * Now that we have dropped the write lock, it is safe to free all + * of the memory we have cached above. + */ + for (r = 1; r < mmu_page_sizes; r++) { + if (ctr_cache[r] != NULL) { + kmem_free(ctr_cache[r], + size_cache[r] * sizeof (hpmctr_t)); + } + if (color_cache[r] != NULL) { + kmem_free(color_cache[r], + colors_per_szc[r] * sizeof (size_t)); + } + } + return (0); +} + +/* + * color contains a valid color index or bin for cur_szc + */ +uint_t +page_convert_color(uchar_t cur_szc, uchar_t new_szc, uint_t color) +{ + uint_t shift; + + if (cur_szc > new_szc) { + shift = page_get_shift(cur_szc) - page_get_shift(new_szc); + return (color << shift); + } else if (cur_szc < new_szc) { + shift = page_get_shift(new_szc) - page_get_shift(cur_szc); + return (color >> shift); + } + return (color); +} + +#ifdef DEBUG + +/* + * confirm pp is a large page corresponding to szc + */ +void +chk_lpg(page_t *pp, uchar_t szc) +{ + spgcnt_t npgs = page_get_pagecnt(pp->p_szc); + uint_t noreloc; + + if (npgs == 1) { + ASSERT(pp->p_szc == 0); + ASSERT(pp->p_next == pp); + ASSERT(pp->p_prev == pp); + return; + } + + ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL); + ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL); + + ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs)); + ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1)); + ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1))); + ASSERT(pp->p_prev == (pp + (npgs - 1))); + + /* + * Check list of pages. 
+ */ + noreloc = PP_ISNORELOC(pp); + while (npgs--) { + if (npgs != 0) { + ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1); + ASSERT(pp->p_next == (pp + 1)); + } + ASSERT(pp->p_szc == szc); + ASSERT(PP_ISFREE(pp)); + ASSERT(PP_ISAGED(pp)); + ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL); + ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL); + ASSERT(pp->p_vnode == NULL); + ASSERT(PP_ISNORELOC(pp) == noreloc); + + pp = pp->p_next; + } +} +#endif /* DEBUG */ + +void +page_freelist_lock(int mnode) +{ + int i; + for (i = 0; i < NPC_MUTEX; i++) { + mutex_enter(FPC_MUTEX(mnode, i)); + mutex_enter(CPC_MUTEX(mnode, i)); + } +} + +void +page_freelist_unlock(int mnode) +{ + int i; + for (i = 0; i < NPC_MUTEX; i++) { + mutex_exit(FPC_MUTEX(mnode, i)); + mutex_exit(CPC_MUTEX(mnode, i)); + } +} + +/* + * add pp to the specified page list. Defaults to head of the page list + * unless PG_LIST_TAIL is specified. + */ +void +page_list_add(page_t *pp, int flags) +{ + page_t **ppp; + kmutex_t *pcm; + uint_t bin, mtype; + int mnode; + + ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT)); + ASSERT(PP_ISFREE(pp)); + ASSERT(!hat_page_is_mapped(pp)); + ASSERT(hat_page_getshare(pp) == 0); + + /* + * Large pages should be freed via page_list_add_pages(). + */ + ASSERT(pp->p_szc == 0); + + /* + * Don't need to lock the freelist first here + * because the page isn't on the freelist yet. + * This means p_szc can't change on us. + */ + + bin = PP_2_BIN(pp); + mnode = PP_2_MEM_NODE(pp); + mtype = PP_2_MTYPE(pp); + + if (flags & PG_LIST_ISINIT) { + /* + * PG_LIST_ISINIT is set during system startup (ie. single + * threaded), add a page to the free list and add to the + * the free region counters w/o any locking + */ + ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); + + /* inline version of page_add() */ + if (*ppp != NULL) { + pp->p_next = *ppp; + pp->p_prev = (*ppp)->p_prev; + (*ppp)->p_prev = pp; + pp->p_prev->p_next = pp; + } else + *ppp = pp; + + page_ctr_add_internal(mnode, pp, flags); + } else { + pcm = PC_BIN_MUTEX(mnode, bin, flags); + + if (flags & PG_FREE_LIST) { + ASSERT(PP_ISAGED(pp)); + ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); + + } else { + ASSERT(pp->p_vnode); + ASSERT((pp->p_offset & PAGEOFFSET) == 0); + ppp = &PAGE_CACHELISTS(mnode, bin, mtype); + } + mutex_enter(pcm); + page_add(ppp, pp); + + if (flags & PG_LIST_TAIL) + *ppp = (*ppp)->p_next; + /* + * Add counters before releasing pcm mutex to avoid a race with + * page_freelist_coalesce and page_freelist_fill. + */ + page_ctr_add(pp, flags); + mutex_exit(pcm); + } + + +#if defined(__sparc) + if (PP_ISNORELOC(pp)) { + kcage_freemem_add(1); + } +#endif + /* + * It is up to the caller to unlock the page! + */ + ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT)); +} + + +#ifdef __sparc +/* + * This routine is only used by kcage_init during system startup. + * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add + * without the overhead of taking locks and updating counters. + */ +void +page_list_noreloc_startup(page_t *pp) +{ + page_t **ppp; + uint_t bin; + int mnode; + int mtype; + int flags = PG_LIST_ISCAGE; + + /* + * If this is a large page on the freelist then + * break it up into smaller pages. + */ + if (pp->p_szc != 0) + page_boot_demote(pp); + + /* + * Get list page is currently on. 
+ */ + bin = PP_2_BIN(pp); + mnode = PP_2_MEM_NODE(pp); + mtype = PP_2_MTYPE(pp); + ASSERT(mtype == MTYPE_RELOC); + ASSERT(pp->p_szc == 0); + + if (PP_ISAGED(pp)) { + ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); + flags |= PG_FREE_LIST; + } else { + ppp = &PAGE_CACHELISTS(mnode, bin, mtype); + flags |= PG_CACHE_LIST; + } + + ASSERT(*ppp != NULL); + + /* + * Delete page from current list. + */ + if (*ppp == pp) + *ppp = pp->p_next; /* go to next page */ + if (*ppp == pp) { + *ppp = NULL; /* page list is gone */ + } else { + pp->p_prev->p_next = pp->p_next; + pp->p_next->p_prev = pp->p_prev; + } + + /* LINTED */ + PLCNT_DECR(pp, mnode, 0, flags); + + /* + * Set no reloc for cage initted pages. + */ + PP_SETNORELOC(pp); + + mtype = PP_2_MTYPE(pp); + ASSERT(mtype == MTYPE_NORELOC); + + /* + * Get new list for page. + */ + if (PP_ISAGED(pp)) { + ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); + } else { + ppp = &PAGE_CACHELISTS(mnode, bin, mtype); + } + + /* + * Insert page on new list. + */ + if (*ppp == NULL) { + *ppp = pp; + pp->p_next = pp->p_prev = pp; + } else { + pp->p_next = *ppp; + pp->p_prev = (*ppp)->p_prev; + (*ppp)->p_prev = pp; + pp->p_prev->p_next = pp; + } + + /* LINTED */ + PLCNT_INCR(pp, mnode, 0, flags); + + /* + * Update cage freemem counter + */ + atomic_add_long(&kcage_freemem, 1); +} +#else /* __sparc */ + +/* ARGSUSED */ +void +page_list_noreloc_startup(page_t *pp) +{ + panic("page_list_noreloc_startup: should be here only for sparc"); +} +#endif + +void +page_list_add_pages(page_t *pp, int flags) +{ + kmutex_t *pcm; + pgcnt_t pgcnt; + uint_t bin, mtype, i; + int mnode; + + /* default to freelist/head */ + ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0); + + CHK_LPG(pp, pp->p_szc); + VM_STAT_ADD(vmm_vmstats.pc_list_add_pages[pp->p_szc]); + + bin = PP_2_BIN(pp); + mnode = PP_2_MEM_NODE(pp); + mtype = PP_2_MTYPE(pp); + + if (flags & PG_LIST_ISINIT) { + ASSERT(pp->p_szc == mmu_page_sizes - 1); + page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); + ASSERT(!PP_ISNORELOC(pp)); + PLCNT_INCR(pp, mnode, pp->p_szc, flags); + } else { + + ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes); + + pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); + + mutex_enter(pcm); + page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); + page_ctr_add(pp, PG_FREE_LIST); + mutex_exit(pcm); + + pgcnt = page_get_pagecnt(pp->p_szc); +#if defined(__sparc) + if (PP_ISNORELOC(pp)) + kcage_freemem_add(pgcnt); +#endif + for (i = 0; i < pgcnt; i++, pp++) + page_unlock(pp); + } +} + +/* + * During boot, need to demote a large page to base + * pagesize pages for seg_kmem for use in boot_alloc() + */ +void +page_boot_demote(page_t *pp) +{ + ASSERT(pp->p_szc != 0); + ASSERT(PP_ISFREE(pp)); + ASSERT(PP_ISAGED(pp)); + + (void) page_demote(PP_2_MEM_NODE(pp), + PFN_BASE(pp->p_pagenum, pp->p_szc), pp->p_szc, 0, PC_NO_COLOR, + PC_FREE); + + ASSERT(PP_ISFREE(pp)); + ASSERT(PP_ISAGED(pp)); + ASSERT(pp->p_szc == 0); +} + +/* + * Take a particular page off of whatever freelist the page + * is claimed to be on. + * + * NOTE: Only used for PAGESIZE pages. + */ +void +page_list_sub(page_t *pp, int flags) +{ + int bin; + uint_t mtype; + int mnode; + kmutex_t *pcm; + page_t **ppp; + + ASSERT(PAGE_EXCL(pp)); + ASSERT(PP_ISFREE(pp)); + + /* + * The p_szc field can only be changed by page_promote() + * and page_demote(). Only free pages can be promoted and + * demoted and the free list MUST be locked during these + * operations. 
So to prevent a race in page_list_sub() + * between computing which bin of the freelist lock to + * grab and actually grabing the lock we check again that + * the bin we locked is still the correct one. Notice that + * the p_szc field could have actually changed on us but + * if the bin happens to still be the same we are safe. + */ +try_again: + bin = PP_2_BIN(pp); + mnode = PP_2_MEM_NODE(pp); + pcm = PC_BIN_MUTEX(mnode, bin, flags); + mutex_enter(pcm); + if (PP_2_BIN(pp) != bin) { + mutex_exit(pcm); + goto try_again; + } + mtype = PP_2_MTYPE(pp); + + if (flags & PG_FREE_LIST) { + ASSERT(PP_ISAGED(pp)); + ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype); + } else { + ASSERT(!PP_ISAGED(pp)); + ppp = &PAGE_CACHELISTS(mnode, bin, mtype); + } + + /* + * Common PAGESIZE case. + * + * Note that we locked the freelist. This prevents + * any page promotion/demotion operations. Therefore + * the p_szc will not change until we drop pcm mutex. + */ + if (pp->p_szc == 0) { + page_sub(ppp, pp); + /* + * Subtract counters before releasing pcm mutex + * to avoid race with page_freelist_coalesce. + */ + page_ctr_sub(pp, flags); + mutex_exit(pcm); + +#if defined(__sparc) + if (PP_ISNORELOC(pp)) { + kcage_freemem_sub(1); + } +#endif + return; + } + + /* + * Large pages on the cache list are not supported. + */ + if (flags & PG_CACHE_LIST) + panic("page_list_sub: large page on cachelist"); + + /* + * Slow but rare. + * + * Somebody wants this particular page which is part + * of a large page. In this case we just demote the page + * if it's on the freelist. + * + * We have to drop pcm before locking the entire freelist. + * Once we have re-locked the freelist check to make sure + * the page hasn't already been demoted or completely + * freed. + */ + mutex_exit(pcm); + page_freelist_lock(mnode); + if (pp->p_szc != 0) { + /* + * Large page is on freelist. + */ + (void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc), + pp->p_szc, 0, PC_NO_COLOR, PC_FREE); + } + ASSERT(PP_ISFREE(pp)); + ASSERT(PP_ISAGED(pp)); + ASSERT(pp->p_szc == 0); + + /* + * Subtract counters before releasing pcm mutex + * to avoid race with page_freelist_coalesce. + */ + bin = PP_2_BIN(pp); + mtype = PP_2_MTYPE(pp); + ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype); + + page_sub(ppp, pp); + page_ctr_sub(pp, flags); + page_freelist_unlock(mnode); + +#if defined(__sparc) + if (PP_ISNORELOC(pp)) { + kcage_freemem_sub(1); + } +#endif +} + +void +page_list_sub_pages(page_t *pp, uint_t szc) +{ + kmutex_t *pcm; + uint_t bin, mtype; + int mnode; + + ASSERT(PAGE_EXCL(pp)); + ASSERT(PP_ISFREE(pp)); + ASSERT(PP_ISAGED(pp)); + + /* + * See comment in page_list_sub(). + */ +try_again: + bin = PP_2_BIN(pp); + mnode = PP_2_MEM_NODE(pp); + pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); + mutex_enter(pcm); + if (PP_2_BIN(pp) != bin) { + mutex_exit(pcm); + goto try_again; + } + + VM_STAT_ADD(vmm_vmstats.pc_list_sub_pages1[pp->p_szc]); + + /* + * If we're called with a page larger than szc or it got + * promoted above szc before we locked the freelist then + * drop pcm and re-lock entire freelist. If page still larger + * than szc then demote it. 
+ */ + if (pp->p_szc > szc) { + VM_STAT_ADD(vmm_vmstats.pc_list_sub_pages2[pp->p_szc]); + mutex_exit(pcm); + pcm = NULL; + page_freelist_lock(mnode); + if (pp->p_szc > szc) { + VM_STAT_ADD(vmm_vmstats.pc_list_sub_pages3[pp->p_szc]); + (void) page_demote(mnode, + PFN_BASE(pp->p_pagenum, pp->p_szc), + pp->p_szc, szc, PC_NO_COLOR, PC_FREE); + } + bin = PP_2_BIN(pp); + } + ASSERT(PP_ISFREE(pp)); + ASSERT(PP_ISAGED(pp)); + ASSERT(pp->p_szc <= szc); + ASSERT(pp == PP_PAGEROOT(pp)); + + mtype = PP_2_MTYPE(pp); + if (pp->p_szc != 0) { + page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); + CHK_LPG(pp, pp->p_szc); + } else { + page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); + } + page_ctr_sub(pp, PG_FREE_LIST); + + if (pcm != NULL) { + mutex_exit(pcm); + } else { + page_freelist_unlock(mnode); + } + +#if defined(__sparc) + if (PP_ISNORELOC(pp)) { + pgcnt_t pgcnt; + + pgcnt = page_get_pagecnt(pp->p_szc); + kcage_freemem_sub(pgcnt); + } +#endif +} + +/* + * Add the page to the front of a linked list of pages + * using the p_next & p_prev pointers for the list. + * The caller is responsible for protecting the list pointers. + */ +void +mach_page_add(page_t **ppp, page_t *pp) +{ + if (*ppp == NULL) { + pp->p_next = pp->p_prev = pp; + } else { + pp->p_next = *ppp; + pp->p_prev = (*ppp)->p_prev; + (*ppp)->p_prev = pp; + pp->p_prev->p_next = pp; + } + *ppp = pp; +} + +/* + * Remove this page from a linked list of pages + * using the p_next & p_prev pointers for the list. + * + * The caller is responsible for protecting the list pointers. + */ +void +mach_page_sub(page_t **ppp, page_t *pp) +{ + ASSERT(PP_ISFREE(pp)); + + if (*ppp == NULL || pp == NULL) + panic("mach_page_sub"); + + if (*ppp == pp) + *ppp = pp->p_next; /* go to next page */ + + if (*ppp == pp) + *ppp = NULL; /* page list is gone */ + else { + pp->p_prev->p_next = pp->p_next; + pp->p_next->p_prev = pp->p_prev; + } + pp->p_prev = pp->p_next = pp; /* make pp a list of one */ +} + +/* + * Routine fsflush uses to gradually coalesce the free list into larger pages. + */ +void +page_promote_size(page_t *pp, uint_t cur_szc) +{ + pfn_t pfn; + int mnode; + int idx; + int new_szc = cur_szc + 1; + int full = FULL_REGION_CNT(new_szc); + + pfn = page_pptonum(pp); + mnode = PFN_2_MEM_NODE(pfn); + + page_freelist_lock(mnode); + + idx = PNUM_TO_IDX(mnode, new_szc, pfn); + if (PAGE_COUNTERS(mnode, new_szc, idx) == full) + (void) page_promote(mnode, pfn, new_szc, PC_FREE); + + page_freelist_unlock(mnode); +} + +static uint_t page_promote_err; +static uint_t page_promote_noreloc_err; + +/* + * Create a single larger page (of szc new_szc) from smaller contiguous pages + * for the given mnode starting at pfnum. Pages involved are on the freelist + * before the call and may be returned to the caller if requested, otherwise + * they will be placed back on the freelist. + * If flags is PC_ALLOC, then the large page will be returned to the user in + * a state which is consistent with a page being taken off the freelist. If + * we failed to lock the new large page, then we will return NULL to the + * caller and put the large page on the freelist instead. + * If flags is PC_FREE, then the large page will be placed on the freelist, + * and NULL will be returned. + * The caller is responsible for locking the freelist as well as any other + * accounting which needs to be done for a returned page. + * + * RFE: For performance pass in pp instead of pfnum so + * we can avoid excessive calls to page_numtopp_nolock(). 
+ * This would depend on an assumption that all contiguous + * pages are in the same memseg so we can just add/dec + * our pp. + * + * Lock ordering: + * + * There is a potential but rare deadlock situation + * for page promotion and demotion operations. The problem + * is there are two paths into the freelist manager and + * they have different lock orders: + * + * page_create() + * lock freelist + * page_lock(EXCL) + * unlock freelist + * return + * caller drops page_lock + * + * page_free() and page_reclaim() + * caller grabs page_lock(EXCL) + * + * lock freelist + * unlock freelist + * drop page_lock + * + * What prevents a thread in page_create() from deadlocking + * with a thread freeing or reclaiming the same page is the + * page_trylock() in page_get_freelist(). If the trylock fails + * it skips the page. + * + * The lock ordering for promotion and demotion is the same as + * for page_create(). Since the same deadlock could occur during + * page promotion and freeing or reclaiming of a page on the + * cache list we might have to fail the operation and undo what + * have done so far. Again this is rare. + */ +page_t * +page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags) +{ + page_t *pp, *pplist, *tpp, *start_pp; + pgcnt_t new_npgs, npgs; + uint_t bin; + pgcnt_t tmpnpgs, pages_left; + uint_t mtype; + uint_t noreloc; + uint_t i; + int which_list; + ulong_t index; + kmutex_t *phm; + + /* + * General algorithm: + * Find the starting page + * Walk each page struct removing it from the freelist, + * and linking it to all the other pages removed. + * Once all pages are off the freelist, + * walk the list, modifying p_szc to new_szc and what + * ever other info needs to be done to create a large free page. + * According to the flags, either return the page or put it + * on the freelist. + */ + + start_pp = page_numtopp_nolock(pfnum); + ASSERT(start_pp && (start_pp->p_pagenum == pfnum)); + new_npgs = page_get_pagecnt(new_szc); + ASSERT(IS_P2ALIGNED(pfnum, new_npgs)); + + /* + * Loop through smaller pages to confirm that all pages + * give the same result for PP_ISNORELOC(). + * We can check this reliably here as the protocol for setting + * P_NORELOC requires pages to be taken off the free list first. + */ + for (i = 0, pp = start_pp; i < new_npgs; i++, pp++) { + if (pp == start_pp) { + /* First page, set requirement. */ + noreloc = PP_ISNORELOC(pp); + } else if (noreloc != PP_ISNORELOC(pp)) { + page_promote_noreloc_err++; + page_promote_err++; + return (NULL); + } + } + + pages_left = new_npgs; + pplist = NULL; + pp = start_pp; + + /* Loop around coalescing the smaller pages into a big page. */ + while (pages_left) { + /* + * Remove from the freelist. + */ + ASSERT(PP_ISFREE(pp)); + bin = PP_2_BIN(pp); + ASSERT(mnode == PP_2_MEM_NODE(pp)); + mtype = PP_2_MTYPE(pp); + if (PP_ISAGED(pp)) { + + /* + * PG_FREE_LIST + */ + if (pp->p_szc) { + page_vpsub(&PAGE_FREELISTS(mnode, + pp->p_szc, bin, mtype), pp); + } else { + mach_page_sub(&PAGE_FREELISTS(mnode, 0, + bin, mtype), pp); + } + which_list = PG_FREE_LIST; + } else { + ASSERT(pp->p_szc == 0); + + /* + * PG_CACHE_LIST + * + * Since this page comes from the + * cachelist, we must destroy the + * vnode association. + */ + if (!page_trylock(pp, SE_EXCL)) { + goto fail_promote; + } + + /* + * We need to be careful not to deadlock + * with another thread in page_lookup(). + * The page_lookup() thread could be holding + * the same phm that we need if the two + * pages happen to hash to the same phm lock. 
+ * At this point we have locked the entire + * freelist and page_lookup() could be trying + * to grab a freelist lock. + */ + index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset); + phm = PAGE_HASH_MUTEX(index); + if (!mutex_tryenter(phm)) { + page_unlock(pp); + goto fail_promote; + } + + mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp); + page_hashout(pp, phm); + mutex_exit(phm); + PP_SETAGED(pp); + page_unlock(pp); + which_list = PG_CACHE_LIST; + } + page_ctr_sub(pp, which_list); + + /* + * Concatenate the smaller page(s) onto + * the large page list. + */ + tmpnpgs = npgs = page_get_pagecnt(pp->p_szc); + pages_left -= npgs; + tpp = pp; + while (npgs--) { + tpp->p_szc = new_szc; + tpp = tpp->p_next; + } + page_list_concat(&pplist, &pp); + pp += tmpnpgs; + } + CHK_LPG(pplist, new_szc); + + /* + * return the page to the user if requested + * in the properly locked state. + */ + if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) { + return (pplist); + } + + /* + * Otherwise place the new large page on the freelist + */ + bin = PP_2_BIN(pplist); + mnode = PP_2_MEM_NODE(pplist); + mtype = PP_2_MTYPE(pplist); + page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist); + + page_ctr_add(pplist, PG_FREE_LIST); + return (NULL); + +fail_promote: + /* + * A thread must have still been freeing or + * reclaiming the page on the cachelist. + * To prevent a deadlock undo what we have + * done sofar and return failure. This + * situation can only happen while promoting + * PAGESIZE pages. + */ + page_promote_err++; + while (pplist) { + pp = pplist; + mach_page_sub(&pplist, pp); + pp->p_szc = 0; + bin = PP_2_BIN(pp); + mtype = PP_2_MTYPE(pp); + mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp); + page_ctr_add(pp, PG_FREE_LIST); + } + return (NULL); + +} + +/* + * Break up a large page into smaller size pages. + * Pages involved are on the freelist before the call and may + * be returned to the caller if requested, otherwise they will + * be placed back on the freelist. + * The caller is responsible for locking the freelist as well as any other + * accounting which needs to be done for a returned page. + * If flags is not PC_ALLOC, the color argument is ignored, and thus + * technically, any value may be passed in but PC_NO_COLOR is the standard + * which should be followed for clarity's sake. + */ +page_t * +page_demote(int mnode, pfn_t pfnum, uchar_t cur_szc, uchar_t new_szc, + int color, int flags) +{ + page_t *pp, *pplist, *npplist; + pgcnt_t npgs, n; + uint_t bin; + uint_t mtype; + page_t *ret_pp = NULL; + + ASSERT(cur_szc != 0); + ASSERT(new_szc < cur_szc); + + pplist = page_numtopp_nolock(pfnum); + ASSERT(pplist != NULL); + + ASSERT(pplist->p_szc == cur_szc); + + bin = PP_2_BIN(pplist); + ASSERT(mnode == PP_2_MEM_NODE(pplist)); + mtype = PP_2_MTYPE(pplist); + page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist); + + CHK_LPG(pplist, cur_szc); + page_ctr_sub(pplist, PG_FREE_LIST); + + /* + * Number of PAGESIZE pages for smaller new_szc + * page. + */ + npgs = page_get_pagecnt(new_szc); + + while (pplist) { + pp = pplist; + + ASSERT(pp->p_szc == cur_szc); + + /* + * We either break it up into PAGESIZE pages or larger. 
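+		 * For example, with 8K base pages and a 512K cur_szc page
+		 * (sizes illustrative, as on some sparc configurations),
+		 * demoting to new_szc 0 frees 64 PAGESIZE pages one at a
+		 * time, while demoting to the 64K size uses
+		 * page_list_break() to peel off eight 8-page sublists.
+		 *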
+ */ + if (npgs == 1) { /* PAGESIZE case */ + mach_page_sub(&pplist, pp); + ASSERT(pp->p_szc == cur_szc); + ASSERT(new_szc == 0); + ASSERT(mnode == PP_2_MEM_NODE(pp)); + pp->p_szc = new_szc; + bin = PP_2_BIN(pp); + if ((bin == color) && (flags == PC_ALLOC) && + (ret_pp == NULL) && + page_trylock_cons(pp, SE_EXCL)) { + ret_pp = pp; + } else { + mtype = PP_2_MTYPE(pp); + mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, + mtype), pp); + page_ctr_add(pp, PG_FREE_LIST); + } + } else { + + /* + * Break down into smaller lists of pages. + */ + page_list_break(&pplist, &npplist, npgs); + + pp = pplist; + n = npgs; + while (n--) { + ASSERT(pp->p_szc == cur_szc); + pp->p_szc = new_szc; + pp = pp->p_next; + } + + CHK_LPG(pplist, new_szc); + + bin = PP_2_BIN(pplist); + ASSERT(mnode == PP_2_MEM_NODE(pp)); + if ((bin == color) && (flags == PC_ALLOC) && + (ret_pp == NULL) && + page_trylock_cons(pp, SE_EXCL)) { + ret_pp = pp; + } else { + mtype = PP_2_MTYPE(pp); + page_vpadd(&PAGE_FREELISTS(mnode, new_szc, + bin, mtype), pplist); + + page_ctr_add(pplist, PG_FREE_LIST); + } + pplist = npplist; + } + } + return (ret_pp); +} + +int mpss_coalesce_disable = 0; + +/* + * Coalesce free pages into a page of the given szc and color if possible. + * Return the pointer to the page created, otherwise, return NULL. + */ +static page_t * +page_freelist_coalesce(int mnode, uchar_t szc, int color) +{ + int r; /* region size */ + int idx, full, i; + pfn_t pfnum; + size_t len; + size_t buckets_to_check; + pgcnt_t cands; + page_t *ret_pp; + int color_stride; + + VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce); + + if (mpss_coalesce_disable) { + return (NULL); + } + + r = szc; + PGCTRS_CANDS_GETVALUECOLOR(mnode, r, color, cands); + if (cands == 0) { + VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip); + return (NULL); + } + full = FULL_REGION_CNT(r); + color_stride = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 : + page_colors; + + /* Prevent page_counters dynamic memory from being freed */ + rw_enter(&page_ctrs_rwlock[mnode], RW_READER); + len = PAGE_COUNTERS_ENTRIES(mnode, r); + buckets_to_check = len / color_stride; + idx = PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color); + ASSERT((idx % color_stride) == color); + idx += color_stride; + if (idx >= len) + idx = color; + for (i = 0; i < buckets_to_check; i++) { + if (PAGE_COUNTERS(mnode, r, idx) == full) { + pfnum = IDX_TO_PNUM(mnode, r, idx); + ASSERT(pfnum >= mem_node_config[mnode].physbase && + pfnum < mem_node_config[mnode].physmax); + /* + * RFE: For performance maybe we can do something less + * brutal than locking the entire freelist. So far + * this doesn't seem to be a performance problem? + */ + page_freelist_lock(mnode); + if (PAGE_COUNTERS(mnode, r, idx) != full) { + VM_STAT_ADD(vmm_vmstats.page_ctrs_changed); + goto skip_this_one; + } + ret_pp = page_promote(mnode, pfnum, r, PC_ALLOC); + if (ret_pp != NULL) { + PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color) = + idx; + page_freelist_unlock(mnode); + rw_exit(&page_ctrs_rwlock[mnode]); +#if defined(__sparc) + if (PP_ISNORELOC(ret_pp)) { + pgcnt_t npgs; + + npgs = page_get_pagecnt(ret_pp->p_szc); + kcage_freemem_sub(npgs); + } +#endif + return (ret_pp); + } +skip_this_one: + page_freelist_unlock(mnode); + /* + * No point looking for another page if we've + * already tried all of the ones that + * page_ctr_cands indicated. Stash off where we left + * off. 
+ * Note: this is not exact since we don't hold the + * page_freelist_locks before we initially get the + * value of cands for performance reasons, but should + * be a decent approximation. + */ + if (--cands == 0) { + PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color) = + idx; + break; + } + } + idx += color_stride; + if (idx >= len) + idx = color; + } + rw_exit(&page_ctrs_rwlock[mnode]); + VM_STAT_ADD(vmm_vmstats.page_ctrs_failed); + return (NULL); +} + +/* + * For the given mnode, promote as many small pages to large pages as possible. + */ +void +page_freelist_coalesce_all(int mnode) +{ + int r; /* region size */ + int idx, full; + pfn_t pfnum; + size_t len; + + VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all); + + if (mpss_coalesce_disable) { + return; + } + + /* + * Lock the entire freelist and coalesce what we can. + * + * Always promote to the largest page possible + * first to reduce the number of page promotions. + */ + rw_enter(&page_ctrs_rwlock[mnode], RW_READER); + page_freelist_lock(mnode); + for (r = mmu_page_sizes - 1; r > 0; r--) { + pgcnt_t cands; + + PGCTRS_CANDS_GETVALUE(mnode, r, cands); + if (cands == 0) { + VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip_all); + continue; + } + + full = FULL_REGION_CNT(r); + len = PAGE_COUNTERS_ENTRIES(mnode, r); + + for (idx = 0; idx < len; idx++) { + if (PAGE_COUNTERS(mnode, r, idx) == full) { + pfnum = IDX_TO_PNUM(mnode, r, idx); + ASSERT(pfnum >= + mem_node_config[mnode].physbase && + pfnum < + mem_node_config[mnode].physmax); + (void) page_promote(mnode, pfnum, r, PC_FREE); + } + } + } + page_freelist_unlock(mnode); + rw_exit(&page_ctrs_rwlock[mnode]); +} + +/* + * This is where all polices for moving pages around + * to different page size free lists is implemented. + * Returns 1 on success, 0 on failure. + * + * So far these are the priorities for this algorithm in descending + * order: + * + * 1) When servicing a request try to do so with a free page + * from next size up. Helps defer fragmentation as long + * as possible. + * + * 2) Page coalesce on demand. Only when a freelist + * larger than PAGESIZE is empty and step 1 + * will not work since all larger size lists are + * also empty. + * + * If pfnhi is non-zero, search for large page with pfn range less than pfnhi. + */ +page_t * +page_freelist_fill(uchar_t szc, int color, int mnode, int mtype, pfn_t pfnhi) +{ + uchar_t nszc = szc + 1; + int bin; + page_t *pp, *firstpp; + page_t *ret_pp = NULL; + + ASSERT(szc < mmu_page_sizes); + + /* + * First try to break up a larger page to fill + * current size freelist. + */ + while (nszc < mmu_page_sizes) { + /* + * If page found then demote it. + */ + bin = page_convert_color(szc, nszc, color); + if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) { + page_freelist_lock(mnode); + firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype); + + /* + * If pfnhi is not PFNNULL, look for large page below + * pfnhi. PFNNULL signifies no pfn requirement. + */ + if (pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) { + do { + pp = pp->p_vpnext; + if (pp == firstpp) { + pp = NULL; + break; + } + } while (pp->p_pagenum >= pfnhi); + } + if (pp) { + ASSERT(pp->p_szc == nszc); + ret_pp = page_demote(mnode, pp->p_pagenum, + pp->p_szc, szc, color, PC_ALLOC); + if (ret_pp) { + page_freelist_unlock(mnode); +#if defined(__sparc) + if (PP_ISNORELOC(ret_pp)) { + pgcnt_t npgs; + + npgs = page_get_pagecnt( + ret_pp->p_szc); + kcage_freemem_sub(npgs); + } +#endif + return (ret_pp); + } + } + page_freelist_unlock(mnode); + } + nszc++; + } + + /* + * Ok that didn't work. 
Time to coalesce. + */ + if (szc != 0) { + ret_pp = page_freelist_coalesce(mnode, szc, color); + } + + return (ret_pp); +} + +/* + * Helper routine used only by the freelist code to lock + * a page. If the page is a large page then it succeeds in + * locking all the constituent pages or none at all. + * Returns 1 on sucess, 0 on failure. + */ +static int +page_trylock_cons(page_t *pp, se_t se) +{ + page_t *tpp, *first_pp = pp; + + /* + * Fail if can't lock first or only page. + */ + if (!page_trylock(pp, se)) { + return (0); + } + + /* + * PAGESIZE: common case. + */ + if (pp->p_szc == 0) { + return (1); + } + + /* + * Large page case. + */ + tpp = pp->p_next; + while (tpp != pp) { + if (!page_trylock(tpp, se)) { + /* + * On failure unlock what we + * have locked so far. + */ + while (first_pp != tpp) { + page_unlock(first_pp); + first_pp = first_pp->p_next; + } + return (0); + } + tpp = tpp->p_next; + } + return (1); +} + +page_t * +page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc, + uint_t flags) +{ + kmutex_t *pcm; + int i, fill_tried, fill_marker; + page_t *pp, *first_pp; + uint_t bin_marker; + int colors, cpucolors; + uchar_t nszc; + uint_t nszc_color_shift; + int nwaybins = 0, nwaycnt; + + ASSERT(szc < mmu_page_sizes); + + VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]); + + /* LINTED */ + MTYPE_START(mnode, mtype, flags); + if (mtype < 0) { /* mnode foes not have memory in mtype range */ + VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]); + return (NULL); + } + + /* + * Set how many physical colors for this page size. + */ + colors = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 : + page_colors; + + nszc = MIN(szc + 1, mmu_page_sizes - 1); + nszc_color_shift = page_get_shift(nszc) - page_get_shift(szc); + + /* cpu_page_colors is non-zero if a page color may be in > 1 bin */ + cpucolors = cpu_page_colors; + + /* + * adjust cpucolors to possibly check additional 'equivalent' bins + * to try to minimize fragmentation of large pages by delaying calls + * to page_freelist_fill. + */ + if (colorequiv > 1) { + int equivcolors = colors / colorequiv; + + if (equivcolors && (cpucolors == 0 || equivcolors < cpucolors)) + cpucolors = equivcolors; + } + + ASSERT(colors <= page_colors); + ASSERT(colors); + ASSERT((colors & (colors - 1)) == 0); + + ASSERT(bin < colors); + + /* + * Only hold one freelist lock at a time, that way we + * can start anywhere and not have to worry about lock + * ordering. + */ +big_try_again: + fill_tried = 0; + nwaycnt = 0; + for (i = 0; i <= colors; i++) { +try_again: + ASSERT(bin < colors); + if (PAGE_FREELISTS(mnode, szc, bin, mtype)) { + pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); + mutex_enter(pcm); + pp = PAGE_FREELISTS(mnode, szc, bin, mtype); + if (pp != NULL) { + /* + * These were set before the page + * was put on the free list, + * they must still be set. + */ + ASSERT(PP_ISFREE(pp)); + ASSERT(PP_ISAGED(pp)); + ASSERT(pp->p_vnode == NULL); + ASSERT(pp->p_hash == NULL); + ASSERT(pp->p_offset == (u_offset_t)-1); + ASSERT(pp->p_szc == szc); + ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); + + /* + * Walk down the hash chain. + * 8k pages are linked on p_next + * and p_prev fields. Large pages + * are a contiguous group of + * constituent pages linked together + * on their p_next and p_prev fields. + * The large pages are linked together + * on the hash chain using p_vpnext + * p_vpprev of the base constituent + * page of each large page. 
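+				 * The retry loop below therefore steps
+				 * through the bin with p_next for PAGESIZE
+				 * pages and with p_vpnext for large pages,
+				 * and gives up on this bin once it wraps
+				 * back around to first_pp.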
+ */ + first_pp = pp; + while (!page_trylock_cons(pp, SE_EXCL)) { + if (szc == 0) { + pp = pp->p_next; + } else { + pp = pp->p_vpnext; + } + + ASSERT(PP_ISFREE(pp)); + ASSERT(PP_ISAGED(pp)); + ASSERT(pp->p_vnode == NULL); + ASSERT(pp->p_hash == NULL); + ASSERT(pp->p_offset == (u_offset_t)-1); + ASSERT(pp->p_szc == szc); + ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == + mnode); + + if (pp == first_pp) { + pp = NULL; + break; + } + } + + if (pp) { + ASSERT(mtype == PP_2_MTYPE(pp)); + ASSERT(pp->p_szc == szc); + if (szc == 0) { + page_sub(&PAGE_FREELISTS(mnode, + szc, bin, mtype), pp); + } else { + page_vpsub(&PAGE_FREELISTS( + mnode, szc, bin, mtype), + pp); + CHK_LPG(pp, szc); + } + page_ctr_sub(pp, PG_FREE_LIST); + + if ((PP_ISFREE(pp) == 0) || + (PP_ISAGED(pp) == 0)) + panic("free page is not. pp %p", + (void *)pp); + mutex_exit(pcm); + +#if defined(__sparc) + ASSERT(!kcage_on || PP_ISNORELOC(pp) || + (flags & PG_NORELOC) == 0); + + if (PP_ISNORELOC(pp)) { + pgcnt_t npgs; + + npgs = page_get_pagecnt(szc); + kcage_freemem_sub(npgs); + } +#endif + VM_STAT_ADD(vmm_vmstats. + pgmf_allocok[szc]); + return (pp); + } + } + mutex_exit(pcm); + } + + /* + * Wow! The initial bin is empty. + * If specific color is needed, check if page color may be + * in other bins. cpucolors is: + * 0 if the colors for this cpu is equal to page_colors. + * This means that pages with a particular color are in a + * single bin. + * -1 if colors of cpus (cheetah+) are heterogenous. Need to + * first determine the colors for the current cpu. + * >0 colors of all cpus are homogenous and < page_colors + */ + + if ((flags & PG_MATCH_COLOR) && (cpucolors != 0)) { + if (!nwaybins) { + /* + * cpucolors is negative if ecache setsizes + * are heterogenous. determine colors for this + * particular cpu. + */ + if (cpucolors < 0) { + cpucolors = CPUSETSIZE() / MMU_PAGESIZE; + ASSERT(cpucolors > 0); + nwaybins = colors / cpucolors; + } else { + nwaybins = colors / cpucolors; + ASSERT(szc > 0 || nwaybins > 1); + } + if (nwaybins < 2) + cpucolors = 0; + } + + if (cpucolors && (nwaycnt + 1 <= nwaybins)) { + nwaycnt++; + bin = (bin + (colors / nwaybins)) & + (colors - 1); + if (nwaycnt < nwaybins) { + goto try_again; + } + } + /* back to initial color if fall-thru */ + } + + /* + * color bins are all empty if color match. Try and satisfy + * the request by breaking up or coalescing pages from + * a different size freelist of the correct color that + * satisfies the ORIGINAL color requested. If that + * fails then try pages of the same size but different + * colors assuming we are not called with + * PG_MATCH_COLOR. + */ + if (!fill_tried) { + fill_tried = 1; + fill_marker = bin >> nszc_color_shift; + pp = page_freelist_fill(szc, bin, mnode, mtype, + PFNNULL); + if (pp != NULL) { + return (pp); + } + } + + if (flags & PG_MATCH_COLOR) + break; + + /* + * Select next color bin to try. + */ + if (szc == 0) { + /* + * PAGESIZE page case. + */ + if (i == 0) { + bin = (bin + BIN_STEP) & page_colors_mask; + bin_marker = bin; + } else { + bin = (bin + vac_colors) & page_colors_mask; + if (bin == bin_marker) { + bin = (bin + 1) & page_colors_mask; + bin_marker = bin; + } + } + } else { + /* + * Large page case. + */ + bin = (bin + 1) & (colors - 1); + } + /* + * If bin advanced to the next color bin of the + * next larger pagesize, there is a chance the fill + * could succeed. 
+ */ + if (fill_marker != (bin >> nszc_color_shift)) + fill_tried = 0; + } + +#if defined(__sparc) + if (!(flags & (PG_NORELOC | PGI_NOCAGE | PGI_RELOCONLY)) && + (kcage_freemem >= kcage_lotsfree)) { + /* + * The Cage is ON and with plenty of free mem, and + * we're willing to check for a NORELOC page if we + * couldn't find a RELOC page, so spin again. + */ + flags |= PG_NORELOC; + mtype = MTYPE_NORELOC; + goto big_try_again; + } +#else + if (flags & PGI_MT_RANGE) { + /* cycle through range of mtypes */ + MTYPE_NEXT(mnode, mtype, flags); + if (mtype >= 0) + goto big_try_again; + } +#endif + VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]); + + return (NULL); +} + + +/* + * Returns the count of free pages for 'pp' with size code 'szc'. + * Note: This function does not return an exact value as the page freelist + * locks are not held and thus the values in the page_counters may be + * changing as we walk through the data. + */ +static int +page_freecnt(int mnode, page_t *pp, uchar_t szc) +{ + pgcnt_t pgfree; + pgcnt_t cnt; + ssize_t r = szc; /* region size */ + ssize_t idx; + int i; + int full, range; + + /* Make sure pagenum passed in is aligned properly */ + ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0); + ASSERT(szc > 0); + + /* Prevent page_counters dynamic memory from being freed */ + rw_enter(&page_ctrs_rwlock[mnode], RW_READER); + idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum); + cnt = PAGE_COUNTERS(mnode, r, idx); + pgfree = cnt << PNUM_SHIFT(r - 1); + range = FULL_REGION_CNT(szc); + + /* Check for completely full region */ + if (cnt == range) { + rw_exit(&page_ctrs_rwlock[mnode]); + return (pgfree); + } + + while (--r > 0) { + idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum); + full = FULL_REGION_CNT(r); + for (i = 0; i < range; i++, idx++) { + cnt = PAGE_COUNTERS(mnode, r, idx); + /* + * If cnt here is full, that means we have already + * accounted for these pages earlier. + */ + if (cnt != full) { + pgfree += (cnt << PNUM_SHIFT(r - 1)); + } + } + range *= full; + } + rw_exit(&page_ctrs_rwlock[mnode]); + return (pgfree); +} + +/* + * Called from page_geti_contig_pages to exclusively lock constituent pages + * starting from 'spp' for page size code 'szc'. + * + * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc' + * region needs to be greater than or equal to the threshold. + */ +static int +page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags) +{ + pgcnt_t pgcnt = PNUM_SIZE(szc); + pgcnt_t pgfree, i; + page_t *pp; + + VM_STAT_ADD(vmm_vmstats.ptcp[szc]); + + + if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI)) + goto skipptcpcheck; + /* + * check if there are sufficient free pages available before attempting + * to trylock. Count is approximate as page counters can change. 
+ */ + pgfree = page_freecnt(mnode, spp, szc); + + /* attempt to trylock if there are sufficient already free pages */ + if (pgfree < pgcnt/ptcpthreshold) { + VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]); + return (0); + } + +skipptcpcheck: + + for (i = 0; i < pgcnt; i++) { + pp = &spp[i]; + if (!page_trylock(pp, SE_EXCL)) { + VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]); + while (--i != (pgcnt_t)-1) { + pp = &spp[i]; + ASSERT(PAGE_EXCL(pp)); + page_unlock(pp); + } + return (0); + } + ASSERT(spp[i].p_pagenum == spp->p_pagenum + i); + if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) && + !PP_ISFREE(pp)) { + VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]); + ASSERT(i == 0); + page_unlock(pp); + return (0); + } + if (PP_ISNORELOC(pp)) { + VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]); + while (i != (pgcnt_t)-1) { + pp = &spp[i]; + ASSERT(PAGE_EXCL(pp)); + page_unlock(pp); + i--; + } + return (0); + } + } + VM_STAT_ADD(vmm_vmstats.ptcpok[szc]); + return (1); +} + +/* + * Claim large page pointed to by 'pp'. 'pp' is the starting set + * of 'szc' constituent pages that had been locked exclusively previously. + * Will attempt to relocate constituent pages in use. + */ +static page_t * +page_claim_contig_pages(page_t *pp, uchar_t szc, int flags) +{ + spgcnt_t pgcnt, npgs, i; + page_t *targpp, *rpp, *hpp; + page_t *replpp = NULL; + page_t *pplist = NULL; + + ASSERT(pp != NULL); + + pgcnt = page_get_pagecnt(szc); + while (pgcnt) { + ASSERT(PAGE_EXCL(pp)); + ASSERT(!PP_ISNORELOC(pp)); + if (PP_ISFREE(pp)) { + /* + * If this is a PG_FREE_LIST page then its + * size code can change underneath us due to + * page promotion or demotion. As an optimzation + * use page_list_sub_pages() instead of + * page_list_sub(). + */ + if (PP_ISAGED(pp)) { + page_list_sub_pages(pp, szc); + if (pp->p_szc == szc) { + return (pp); + } + ASSERT(pp->p_szc < szc); + npgs = page_get_pagecnt(pp->p_szc); + hpp = pp; + for (i = 0; i < npgs; i++, pp++) { + pp->p_szc = szc; + } + page_list_concat(&pplist, &hpp); + pgcnt -= npgs; + continue; + } + ASSERT(!PP_ISAGED(pp)); + ASSERT(pp->p_szc == 0); + page_list_sub(pp, PG_CACHE_LIST); + page_hashout(pp, NULL); + PP_SETAGED(pp); + pp->p_szc = szc; + page_list_concat(&pplist, &pp); + pp++; + pgcnt--; + continue; + } + npgs = page_get_pagecnt(pp->p_szc); + + /* + * page_create_wait freemem accounting done by caller of + * page_get_freelist and not necessary to call it prior to + * calling page_get_replacement_page. + * + * page_get_replacement_page can call page_get_contig_pages + * to acquire a large page (szc > 0); the replacement must be + * smaller than the contig page size to avoid looping or + * szc == 0 and PGI_PGCPSZC0 is set. + */ + if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) { + replpp = page_get_replacement_page(pp, NULL, 0); + if (replpp) { + npgs = page_get_pagecnt(pp->p_szc); + ASSERT(npgs <= pgcnt); + targpp = pp; + } + } + + /* + * If replacement is NULL or do_page_relocate fails, fail + * coalescing of pages. + */ + if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0, + &npgs, NULL) != 0)) { + /* + * Unlock un-processed target list + */ + while (pgcnt--) { + ASSERT(PAGE_EXCL(pp)); + page_unlock(pp); + pp++; + } + /* + * Free the processed target list. 
+ */ + while (pplist) { + pp = pplist; + page_sub(&pplist, pp); + ASSERT(PAGE_EXCL(pp)); + ASSERT(pp->p_szc == szc); + ASSERT(PP_ISFREE(pp)); + ASSERT(PP_ISAGED(pp)); + pp->p_szc = 0; + page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); + page_unlock(pp); + } + + if (replpp != NULL) + page_free_replacement_page(replpp); + + return (NULL); + } + ASSERT(pp == targpp); + + /* LINTED */ + ASSERT(hpp = pp); /* That's right, it's an assignment */ + + pp += npgs; + pgcnt -= npgs; + + while (npgs--) { + ASSERT(PAGE_EXCL(targpp)); + ASSERT(!PP_ISFREE(targpp)); + ASSERT(!PP_ISNORELOC(targpp)); + PP_SETFREE(targpp); + ASSERT(PP_ISAGED(targpp)); + ASSERT(targpp->p_szc < szc || (szc == 0 && + (flags & PGI_PGCPSZC0))); + targpp->p_szc = szc; + targpp = targpp->p_next; + + rpp = replpp; + ASSERT(rpp != NULL); + page_sub(&replpp, rpp); + ASSERT(PAGE_EXCL(rpp)); + ASSERT(!PP_ISFREE(rpp)); + page_unlock(rpp); + } + ASSERT(targpp == hpp); + ASSERT(replpp == NULL); + page_list_concat(&pplist, &targpp); + } + CHK_LPG(pplist, szc); + return (pplist); +} + +/* + * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code + * of 0 means nothing left after trim. + */ + +int +trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi) +{ + pfn_t kcagepfn; + int decr; + int rc = 0; + + if (PP_ISNORELOC(mseg->pages)) { + if (PP_ISNORELOC(mseg->epages - 1) == 0) { + + /* lower part of this mseg inside kernel cage */ + decr = kcage_current_pfn(&kcagepfn); + + /* kernel cage may have transitioned past mseg */ + if (kcagepfn >= mseg->pages_base && + kcagepfn < mseg->pages_end) { + ASSERT(decr == 0); + *lo = kcagepfn; + *hi = MIN(pfnhi, + (mseg->pages_end - 1)); + rc = 1; + } + } + /* else entire mseg in the cage */ + } else { + if (PP_ISNORELOC(mseg->epages - 1)) { + + /* upper part of this mseg inside kernel cage */ + decr = kcage_current_pfn(&kcagepfn); + + /* kernel cage may have transitioned past mseg */ + if (kcagepfn >= mseg->pages_base && + kcagepfn < mseg->pages_end) { + ASSERT(decr); + *hi = kcagepfn; + *lo = MAX(pfnlo, mseg->pages_base); + rc = 1; + } + } else { + /* entire mseg outside of kernel cage */ + *lo = MAX(pfnlo, mseg->pages_base); + *hi = MIN(pfnhi, (mseg->pages_end - 1)); + rc = 1; + } + } + return (rc); +} + +/* + * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to "claim" a + * page with size code 'szc'. Claiming such a page requires acquiring + * exclusive locks on all constituent pages (page_trylock_contig_pages), + * relocating pages in use and concatenating these constituent pages into a + * large page. + * + * The page lists do not have such a large page and page_freelist_fill has + * already failed to demote larger pages and/or coalesce smaller free pages. + * + * 'flags' may specify PG_COLOR_MATCH which would limit the search of large + * pages with the same color as 'bin'. + * + * 'pfnflag' specifies the subset of the pfn range to search. + */ + + +static page_t * +page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags, + pfn_t pfnlo, pfn_t pfnhi, int pfnflag) +{ + struct memseg *mseg; + pgcnt_t szcpgcnt = page_get_pagecnt(szc); + pgcnt_t szcpgmask = szcpgcnt - 1; + pfn_t randpfn; + page_t *pp, *randpp, *endpp; + uint_t colors; + pfn_t hi, lo; + uint_t skip; + + ASSERT(szc != 0 || (flags & PGI_PGCPSZC0)); + + if ((pfnhi - pfnlo) + 1 < szcpgcnt) + return (NULL); + + ASSERT(szc < mmu_page_sizes); + + colors = (szc) ? 
page_convert_color(0, szc, page_colors - 1) + 1 : + page_colors; + + ASSERT(bin < colors); + + /* + * trim the pfn range to search based on pfnflag. pfnflag is set + * when there have been previous page_get_contig_page failures to + * limit the search. + * + * The high bit in pfnflag specifies the number of 'slots' in the + * pfn range and the remainder of pfnflag specifies which slot. + * For example, a value of 1010b would mean the second slot of + * the pfn range that has been divided into 8 slots. + */ + if (pfnflag > 1) { + int slots = 1 << (highbit(pfnflag) - 1); + int slotid = pfnflag & (slots - 1); + pgcnt_t szcpages; + int slotlen; + + pfnlo = P2ROUNDUP(pfnlo, szcpgcnt); + pfnhi = pfnhi & ~(szcpgcnt - 1); + + szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt; + slotlen = howmany(szcpages, slots); + pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt); + ASSERT(pfnlo < pfnhi); + if (pfnhi > pfnlo + (slotlen * szcpgcnt)) + pfnhi = pfnlo + (slotlen * szcpgcnt); + } + + memsegs_lock(0); + + /* + * loop through memsegs to look for contig page candidates + */ + + for (mseg = memsegs; mseg != NULL; mseg = mseg->next) { + if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) { + /* no overlap */ + continue; + } + + if (mseg->pages_end - mseg->pages_base < szcpgcnt) + /* mseg too small */ + continue; + + /* trim off kernel cage pages from pfn range */ + if (kcage_on) { + if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0) + continue; + } else { + lo = MAX(pfnlo, mseg->pages_base); + hi = MIN(pfnhi, (mseg->pages_end - 1)); + } + + /* round to szcpgcnt boundaries */ + lo = P2ROUNDUP(lo, szcpgcnt); + hi = hi & ~(szcpgcnt - 1); + + if (hi <= lo) + continue; + + /* + * set lo to point to the pfn for the desired bin. Large + * page sizes may only have a single page color + */ + if ((colors > 1) && (flags & PG_MATCH_COLOR)) { + uint_t lobin; + + /* + * factor in colorequiv to check additional + * 'equivalent' bins. + */ + if (colorequiv > 1 && colors > colorequiv) + colors = colors / colorequiv; + + /* determine bin that lo currently points to */ + lobin = (lo & ((szcpgcnt * colors) - 1)) / szcpgcnt; + + /* + * set lo to point at appropriate color and set skip + * to arrive at the next szc page of the same color. 
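+			 * The adjustment advances lo by the color distance
+			 * between the requested bin and lobin (modulo the
+			 * number of colors); stepping by colors * szcpgcnt
+			 * thereafter keeps every candidate on that color.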
+ */ + lo += ((bin - lobin) & (colors - 1)) * szcpgcnt; + + skip = colors * szcpgcnt; + } else { + /* check all pages starting from lo */ + skip = szcpgcnt; + } + if (hi <= lo) + /* mseg cannot satisfy color request */ + continue; + + /* randomly choose a point between lo and hi to begin search */ + + randpfn = (pfn_t)GETTICK(); + randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1); + randpp = mseg->pages + (randpfn - mseg->pages_base); + + ASSERT(randpp->p_pagenum == randpfn); + + pp = randpp; + endpp = mseg->pages + (hi - mseg->pages_base); + + ASSERT(randpp + szcpgcnt <= endpp); + + do { + ASSERT(!(pp->p_pagenum & szcpgmask)); + ASSERT((flags & PG_MATCH_COLOR) == 0 || + colorequiv > 1 || + PP_2_BIN(pp) == bin); + if (page_trylock_contig_pages(mnode, pp, szc, flags)) { + /* pages unlocked by page_claim on failure */ + if (page_claim_contig_pages(pp, szc, flags)) { + memsegs_unlock(0); + return (pp); + } + } + + pp += skip; + if (pp >= endpp) { + /* start from the beginning */ + pp = mseg->pages + (lo - mseg->pages_base); + ASSERT(pp->p_pagenum == lo); + ASSERT(pp + szcpgcnt <= endpp); + } + } while (pp != randpp); + } + memsegs_unlock(0); + return (NULL); +} + + +/* + * controlling routine that searches through physical memory in an attempt to + * claim a large page based on the input parameters. + * on the page free lists. + * + * calls page_geti_contig_pages with an initial pfn range from the mnode + * and mtype. page_geti_contig_pages will trim off the parts of the pfn range + * that overlaps with the kernel cage or does not match the requested page + * color if PG_MATCH_COLOR is set. Since this search is very expensive, + * page_geti_contig_pages may further limit the search range based on + * previous failure counts (pgcpfailcnt[]). + * + * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base + * pagesize page that satisfies mtype. + */ +page_t * +page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc, + uint_t flags) +{ + pfn_t pfnlo, pfnhi; /* contig pages pfn range */ + page_t *pp; + int pfnflag = 0; /* no limit on search if 0 */ + + VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]); + + /* LINTED */ + MTYPE_START(mnode, mtype, flags); + if (mtype < 0) { /* mnode does not have memory in mtype range */ + VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]); + return (NULL); + } + + ASSERT(szc > 0 || (flags & PGI_PGCPSZC0)); + + /* do not limit search and ignore color if hi pri */ + + if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0)) + pfnflag = pgcpfailcnt[szc]; + + /* remove color match to improve chances */ + + if (flags & PGI_PGCPHIPRI || pfnflag) + flags &= ~PG_MATCH_COLOR; + + do { + /* get pfn range based on mnode and mtype */ + MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi); + + ASSERT(pfnhi >= pfnlo); + + pp = page_geti_contig_pages(mnode, bin, szc, flags, + pfnlo, pfnhi, pfnflag); + + if (pp != NULL) { + pfnflag = pgcpfailcnt[szc]; + if (pfnflag) { + /* double the search size */ + pgcpfailcnt[szc] = pfnflag >> 1; + } + VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]); + return (pp); + } + /* LINTED */ + } while ((flags & PGI_MT_RANGE) && + (MTYPE_NEXT(mnode, mtype, flags) >= 0)); + + VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]); + return (NULL); +} + + +/* + * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair. + * + * Does its own locking and accounting. + * If PG_MATCH_COLOR is set, then NULL will be returned if there are no + * pages of the proper color even if there are pages of a different color. 
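+ *
+ * The local lgroup's freelists are searched first.  For base pagesize
+ * requests that fail locally, the caller is expected to fall back to
+ * page_get_cachelist before remote freelists are tried here.  Requests
+ * that still fail may be retried with page_get_contig_pages, which claims
+ * and, if necessary, relocates in-use constituent pages.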
+ * + * Finds a page, removes it, THEN locks it. + */ + +/*ARGSUSED*/ +page_t * +page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg, + caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp) +{ + struct as *as = seg->s_as; + page_t *pp = NULL; + ulong_t bin; + uchar_t szc; + int mnode; + int mtype; + page_t *(*page_get_func)(int, uint_t, int, uchar_t, uint_t); + lgrp_mnode_cookie_t lgrp_cookie; + + page_get_func = page_get_mnode_freelist; + + /* + * If we aren't passed a specific lgroup, or passed a freed lgrp + * assume we wish to allocate near to the current thread's home. + */ + if (!LGRP_EXISTS(lgrp)) + lgrp = lgrp_home_lgrp(); + + if (kcage_on) { + if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC && + kcage_freemem < kcage_throttlefree + btop(size) && + curthread != kcage_cageout_thread) { + /* + * Set a "reserve" of kcage_throttlefree pages for + * PG_PANIC and cageout thread allocations. + * + * Everybody else has to serialize in + * page_create_get_something() to get a cage page, so + * that we don't deadlock cageout! + */ + return (NULL); + } + } else { + flags &= ~PG_NORELOC; + flags |= PGI_NOCAGE; + } + + /* LINTED */ + MTYPE_INIT(mtype, vp, vaddr, flags); + + /* + * Convert size to page size code. + */ + if ((szc = page_szc(size)) == (uchar_t)-1) + panic("page_get_freelist: illegal page size request"); + ASSERT(szc < mmu_page_sizes); + + VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]); + + /* LINTED */ + AS_2_BIN(as, seg, vp, vaddr, bin); + + /* bin is for base pagesize color - convert if larger pagesize. */ + if (szc) + bin = page_convert_color(0, szc, bin); + + /* + * Try to get a local page first, but try remote if we can't + * get a page of the right color. + */ +pgretry: + LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL); + while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { + pp = page_get_func(mnode, bin, mtype, szc, flags); + if (pp != NULL) { + VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]); + DTRACE_PROBE4(page__get, + lgrp_t *, lgrp, + int, mnode, + ulong_t, bin, + uint_t, flags); + return (pp); + } + } + ASSERT(pp == NULL); + + /* + * for non-SZC0 PAGESIZE requests, check cachelist before checking + * remote free lists. Caller expected to call page_get_cachelist which + * will check local cache lists and remote free lists. + */ + if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) { + VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred); + return (NULL); + } + + ASSERT(szc > 0 || (flags & PGI_PGCPSZC0)); + + lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); + + /* + * Try to get a non-local freelist page. + */ + LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie); + while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { + pp = page_get_func(mnode, bin, mtype, szc, flags); + if (pp != NULL) { + DTRACE_PROBE4(page__get, + lgrp_t *, lgrp, + int, mnode, + ulong_t, bin, + uint_t, flags); + VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]); + return (pp); + } + } + + ASSERT(pp == NULL); + + /* + * when the cage is off chances are page_get_contig_pages() will fail + * to lock a large page chunk therefore when the cage is off it's not + * called by default. this can be changed via /etc/system. + * + * page_get_contig_pages() also called to acquire a base pagesize page + * for page_create_get_something(). 
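+	 * The retry below simply switches page_get_func to
+	 * page_get_contig_pages and jumps back to pgretry, so both the
+	 * local and remote lgroup passes are repeated with the contig
+	 * allocator.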
+ */ + if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) && + (kcage_on || pg_lpgcreate_nocage || szc == 0) && + (page_get_func != page_get_contig_pages)) { + + VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]); + page_get_func = page_get_contig_pages; + goto pgretry; + } + + if (pgcplimitsearch && page_get_func == page_get_contig_pages) + pgcpfailcnt[szc]++; + + VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]); + return (NULL); +} + +/* + * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair. + * + * Does its own locking. + * If PG_MATCH_COLOR is set, then NULL will be returned if there are no + * pages of the proper color even if there are pages of a different color. + * Otherwise, scan the bins for ones with pages. For each bin with pages, + * try to lock one of them. If no page can be locked, try the + * next bin. Return NULL if a page can not be found and locked. + * + * Finds a pages, trys to lock it, then removes it. + */ + +/*ARGSUSED*/ +page_t * +page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg, + caddr_t vaddr, uint_t flags, struct lgrp *lgrp) +{ + page_t *pp; + struct as *as = seg->s_as; + ulong_t bin; + /*LINTED*/ + int mnode; + int mtype; + lgrp_mnode_cookie_t lgrp_cookie; + + /* + * If we aren't passed a specific lgroup, or pasased a freed lgrp + * assume we wish to allocate near to the current thread's home. + */ + if (!LGRP_EXISTS(lgrp)) + lgrp = lgrp_home_lgrp(); + + if (!kcage_on) { + flags &= ~PG_NORELOC; + flags |= PGI_NOCAGE; + } + + if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC && + kcage_freemem <= kcage_throttlefree) { + /* + * Reserve kcage_throttlefree pages for critical kernel + * threads. + * + * Everybody else has to go to page_create_get_something() + * to get a cage page, so we don't deadlock cageout. + */ + return (NULL); + } + + /* LINTED */ + AS_2_BIN(as, seg, vp, vaddr, bin); + + ASSERT(bin <= page_colors_mask); + + /* LINTED */ + MTYPE_INIT(mtype, vp, vaddr, flags); + + VM_STAT_ADD(vmm_vmstats.pgc_alloc); + + /* + * Try local cachelists first + */ + LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL); + while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { + pp = page_get_mnode_cachelist(bin, flags, mnode, mtype); + if (pp != NULL) { + VM_STAT_ADD(vmm_vmstats.pgc_allocok); + DTRACE_PROBE4(page__get, + lgrp_t *, lgrp, + int, mnode, + ulong_t, bin, + uint_t, flags); + return (pp); + } + } + + lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); + + /* + * Try freelists/cachelists that are farther away + * This is our only chance to allocate remote pages for PAGESIZE + * requests. 
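+	 * Each remote memory node is tried for a freelist page first and
+	 * then for a cachelist page before moving on to the next node.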
+ */ + LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie); + while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { + pp = page_get_mnode_freelist(mnode, bin, mtype, + 0, flags); + if (pp != NULL) { + VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred); + DTRACE_PROBE4(page__get, + lgrp_t *, lgrp, + int, mnode, + ulong_t, bin, + uint_t, flags); + return (pp); + } + pp = page_get_mnode_cachelist(bin, flags, mnode, mtype); + if (pp != NULL) { + VM_STAT_ADD(vmm_vmstats.pgc_allocokrem); + DTRACE_PROBE4(page__get, + lgrp_t *, lgrp, + int, mnode, + ulong_t, bin, + uint_t, flags); + return (pp); + } + } + + VM_STAT_ADD(vmm_vmstats.pgc_allocfailed); + return (NULL); +} + +page_t * +page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype) +{ + kmutex_t *pcm; + int i; + page_t *pp; + page_t *first_pp; + uint_t bin_marker; + int nwaybins, nwaycnt; + int cpucolors; + + VM_STAT_ADD(vmm_vmstats.pgmc_alloc); + + /* LINTED */ + MTYPE_START(mnode, mtype, flags); + if (mtype < 0) { /* mnode does not have memory in mtype range */ + VM_STAT_ADD(vmm_vmstats.pgmc_allocempty); + return (NULL); + } + + nwaybins = 0; + cpucolors = cpu_page_colors; + /* + * adjust cpucolors to possibly check additional 'equivalent' bins + * to try to minimize fragmentation of large pages by delaying calls + * to page_freelist_fill. + */ + if (colorequiv > 1) { + int equivcolors = page_colors / colorequiv; + + if (equivcolors && (cpucolors == 0 || equivcolors < cpucolors)) + cpucolors = equivcolors; + } + + /* + * Only hold one cachelist lock at a time, that way we + * can start anywhere and not have to worry about lock + * ordering. + */ + +big_try_again: + nwaycnt = 0; + for (i = 0; i <= page_colors; i++) { + if (PAGE_CACHELISTS(mnode, bin, mtype)) { + pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST); + mutex_enter(pcm); + pp = PAGE_CACHELISTS(mnode, bin, mtype); + if (pp != NULL) { + first_pp = pp; + ASSERT(pp->p_vnode); + ASSERT(PP_ISAGED(pp) == 0); + ASSERT(pp->p_szc == 0); + ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); + while (!page_trylock(pp, SE_EXCL)) { + pp = pp->p_next; + ASSERT(pp->p_szc == 0); + if (pp == first_pp) { + /* + * We have searched the + * complete list! + * And all of them (might + * only be one) are locked. + * This can happen since + * these pages can also be + * found via the hash list. + * When found via the hash + * list, they are locked + * first, then removed. + * We give up to let the + * other thread run. + */ + pp = NULL; + break; + } + ASSERT(pp->p_vnode); + ASSERT(PP_ISFREE(pp)); + ASSERT(PP_ISAGED(pp) == 0); + ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == + mnode); + } + + if (pp) { + page_t **ppp; + /* + * Found and locked a page. + * Pull it off the list. + */ + ASSERT(mtype == PP_2_MTYPE(pp)); + ppp = &PAGE_CACHELISTS(mnode, bin, + mtype); + page_sub(ppp, pp); + /* + * Subtract counters before releasing + * pcm mutex to avoid a race with + * page_freelist_coalesce and + * page_freelist_fill. + */ + page_ctr_sub(pp, PG_CACHE_LIST); + mutex_exit(pcm); + ASSERT(pp->p_vnode); + ASSERT(PP_ISAGED(pp) == 0); +#if defined(__sparc) + ASSERT(!kcage_on || + (flags & PG_NORELOC) == 0 || + PP_ISNORELOC(pp)); + if (PP_ISNORELOC(pp)) { + kcage_freemem_sub(1); + } +#endif + VM_STAT_ADD(vmm_vmstats. + pgmc_allocok); + return (pp); + } + } + mutex_exit(pcm); + } + + /* + * Wow! The initial bin is empty or no page in the bin could + * be locked. + * + * If specific color is needed, check if page color may be in + * other bins. 
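+		 * nwaybins is the number of page color bins that the CPU
+		 * cache treats as equivalent (page_colors / cpucolors);
+		 * nwaycnt counts how many of them have been probed, and the
+		 * color-constrained search gives up once all of them have
+		 * been tried.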
+ */ + if ((flags & PG_MATCH_COLOR) && (cpucolors != 0)) { + if (!nwaybins) { + if (cpucolors < 0) { + cpucolors = CPUSETSIZE() / MMU_PAGESIZE; + ASSERT(cpucolors > 0); + nwaybins = page_colors / cpucolors; + if (nwaybins < 2) + cpucolors = 0; + } else { + nwaybins = page_colors / cpucolors; + ASSERT(nwaybins > 1); + } + } + + if (++nwaycnt >= nwaybins) { + break; + } + bin = (bin + (page_colors / nwaybins)) & + page_colors_mask; + continue; + } + + if (i == 0) { + bin = (bin + BIN_STEP) & page_colors_mask; + bin_marker = bin; + } else { + bin = (bin + vac_colors) & page_colors_mask; + if (bin == bin_marker) { + bin = (bin + 1) & page_colors_mask; + bin_marker = bin; + } + } + } + +#if defined(__sparc) + if (!(flags & (PG_NORELOC | PGI_NOCAGE | PGI_RELOCONLY)) && + (kcage_freemem >= kcage_lotsfree)) { + /* + * The Cage is ON and with plenty of free mem, and + * we're willing to check for a NORELOC page if we + * couldn't find a RELOC page, so spin again. + */ + flags |= PG_NORELOC; + mtype = MTYPE_NORELOC; + goto big_try_again; + } +#else + if (flags & PGI_MT_RANGE) { + MTYPE_NEXT(mnode, mtype, flags); + if (mtype >= 0) + goto big_try_again; + } +#endif + VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed); + return (NULL); +} + +#ifdef DEBUG +#define REPL_PAGE_STATS +#endif /* DEBUG */ + +#ifdef REPL_PAGE_STATS +struct repl_page_stats { + uint_t ngets; + uint_t ngets_noreloc; + uint_t npgr_noreloc; + uint_t nnopage_first; + uint_t nnopage; + uint_t nhashout; + uint_t nnofree; + uint_t nnext_pp; +} repl_page_stats; +#define REPL_STAT_INCR(v) atomic_add_32(&repl_page_stats.v, 1) +#else /* REPL_PAGE_STATS */ +#define REPL_STAT_INCR(v) +#endif /* REPL_PAGE_STATS */ + +int pgrppgcp; + +/* + * The freemem accounting must be done by the caller. + * First we try to get a replacement page of the same size as like_pp, + * if that is not possible, then we just get a set of discontiguous + * PAGESIZE pages. + */ +page_t * +page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp, + uint_t pgrflags) +{ + page_t *like_pp; + page_t *pp, *pplist; + page_t *pl = NULL; + ulong_t bin; + int mnode, page_mnode; + int szc; + spgcnt_t npgs, pg_cnt; + pfn_t pfnum; + int mtype; + int flags = 0; + lgrp_mnode_cookie_t lgrp_cookie; + + + REPL_STAT_INCR(ngets); + like_pp = orig_like_pp; + ASSERT(PAGE_EXCL(like_pp)); + + szc = like_pp->p_szc; + npgs = page_get_pagecnt(szc); + /* + * Now we reset like_pp to the base page_t. + * That way, we won't walk past the end of this 'szc' page. + */ + pfnum = PFN_BASE(like_pp->p_pagenum, szc); + like_pp = page_numtopp_nolock(pfnum); + ASSERT(like_pp->p_szc == szc); + + if (PP_ISNORELOC(like_pp)) { + ASSERT(kcage_on); + REPL_STAT_INCR(ngets_noreloc); + flags = PGI_RELOCONLY; + } else if (pgrflags & PGR_NORELOC) { + ASSERT(kcage_on); + REPL_STAT_INCR(npgr_noreloc); + flags = PG_NORELOC; + } + + /* + * Kernel pages must always be replaced with the same size + * pages, since we cannot properly handle demotion of kernel + * pages. + */ + if (like_pp->p_vnode == &kvp) + pgrflags |= PGR_SAMESZC; + + /* LINTED */ + MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode); + + while (npgs) { + pplist = NULL; + for (;;) { + pg_cnt = page_get_pagecnt(szc); + bin = PP_2_BIN(like_pp); + ASSERT(like_pp->p_szc == orig_like_pp->p_szc); + ASSERT(pg_cnt <= npgs); + + /* + * If an lgroup was specified, try to get the + * page from that lgroup. 
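+			 * Only that lgroup's memory nodes are searched:
+			 * its freelists first and, for base pagesize
+			 * requests, its cachelists as well.  If nothing
+			 * is found there we bail out rather than fall
+			 * through to the mnode-based search below.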
+ */ + if (LGRP_EXISTS(lgrp)) { + /* Try the lgroup's freelists first */ + LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, + LGRP_SRCH_LOCAL); + while ((pplist == NULL) && + (mnode = lgrp_memnode_choose(&lgrp_cookie)) + != -1) { + pplist = page_get_mnode_freelist( + mnode, bin, mtype, szc, + flags); + } + + /* + * Now try it's cachelists if this is a + * small page. Don't need to do it for + * larger ones since page_freelist_coalesce() + * already failed. + */ + if (pplist != NULL || szc != 0) + break; + + /* Now try it's cachelists */ + LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, + LGRP_SRCH_LOCAL); + + while ((pplist == NULL) && + (mnode = lgrp_memnode_choose(&lgrp_cookie)) + != -1) { + pplist = page_get_mnode_cachelist( + bin, flags, mnode, mtype); + } + if (pplist != NULL) { + page_hashout(pplist, NULL); + PP_SETAGED(pplist); + REPL_STAT_INCR(nhashout); + break; + } + /* Done looking in this lgroup. Bail out. */ + break; + } + + ASSERT(!LGRP_EXISTS(lgrp)); + /* + * No lgroup was specified, so just try to get the + * page as close to like_pp's mnode as possible. + * First try the local freelist... + */ + mnode = PP_2_MEM_NODE(like_pp); + pplist = page_get_mnode_freelist(mnode, bin, + mtype, szc, flags); + if (pplist != NULL) + break; + + REPL_STAT_INCR(nnofree); + + /* + * ...then the local cachelist. Don't need to do it for + * larger pages cause page_freelist_coalesce() already + * failed there anyway. + */ + if (szc == 0) { + pplist = page_get_mnode_cachelist(bin, flags, + mnode, mtype); + if (pplist != NULL) { + page_hashout(pplist, NULL); + PP_SETAGED(pplist); + REPL_STAT_INCR(nhashout); + break; + } + } + + /* Now try remote freelists */ + page_mnode = mnode; + lgrp = + lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode)); + LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, + LGRP_SRCH_HIER); + while (pplist == NULL && + (mnode = lgrp_memnode_choose(&lgrp_cookie)) + != -1) { + /* + * Skip local mnode. + */ + if ((mnode == page_mnode) || + (mem_node_config[mnode].exists == 0)) + continue; + + pplist = page_get_mnode_freelist(mnode, + bin, mtype, szc, flags); + } + + if (pplist != NULL) + break; + + + /* Now try remote cachelists */ + LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, + LGRP_SRCH_HIER); + while (pplist == NULL && szc == 0) { + mnode = lgrp_memnode_choose(&lgrp_cookie); + if (mnode == -1) + break; + /* + * Skip local mnode. + */ + if ((mnode == page_mnode) || + (mem_node_config[mnode].exists == 0)) + continue; + + pplist = page_get_mnode_cachelist(bin, + flags, mnode, mtype); + + if (pplist != NULL) { + page_hashout(pplist, NULL); + PP_SETAGED(pplist); + REPL_STAT_INCR(nhashout); + break; + } + } + + /* + * Break out of while loop under the following cases: + * - If we successfully got a page. + * - If pgrflags specified only returning a specific + * page size and we could not find that page size. + * - If we could not satisfy the request with PAGESIZE + * or larger pages. + */ + if (pplist != NULL || szc == 0) + break; + + if ((pgrflags & PGR_SAMESZC) || pgrppgcp) { + /* try to find contig page */ + + LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, + LGRP_SRCH_HIER); + + while ((pplist == NULL) && + (mnode = + lgrp_memnode_choose(&lgrp_cookie)) + != -1) { + pplist = page_get_contig_pages( + mnode, bin, mtype, szc, + flags | PGI_PGCPHIPRI); + } + break; + } + + /* + * The correct thing to do here is try the next + * page size down using szc--. 
Due to a bug + * with the processing of HAT_RELOAD_SHARE + * where the sfmmu_ttecnt arrays of all + * hats sharing an ISM segment don't get updated, + * using intermediate size pages for relocation + * can lead to continuous page faults. + */ + szc = 0; + } + + if (pplist != NULL) { + DTRACE_PROBE4(page__get, + lgrp_t *, lgrp, + int, mnode, + ulong_t, bin, + uint_t, flags); + + while (pplist != NULL && pg_cnt--) { + ASSERT(pplist != NULL); + pp = pplist; + page_sub(&pplist, pp); + PP_CLRFREE(pp); + PP_CLRAGED(pp); + page_list_concat(&pl, &pp); + npgs--; + like_pp = like_pp + 1; + REPL_STAT_INCR(nnext_pp); + } + ASSERT(pg_cnt == 0); + } else { + break; + } + } + + if (npgs) { + /* + * We were unable to allocate the necessary number + * of pages. + * We need to free up any pl. + */ + REPL_STAT_INCR(nnopage); + page_free_replacement_page(pl); + return (NULL); + } else { + return (pl); + } +} + +/* + * demote a free large page to it's constituent pages + */ +void +page_demote_free_pages(page_t *pp) +{ + + int mnode; + + ASSERT(pp != NULL); + ASSERT(PAGE_LOCKED(pp)); + ASSERT(PP_ISFREE(pp)); + ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes); + + mnode = PP_2_MEM_NODE(pp); + page_freelist_lock(mnode); + if (pp->p_szc != 0) { + (void) page_demote(mnode, PFN_BASE(pp->p_pagenum, + pp->p_szc), pp->p_szc, 0, PC_NO_COLOR, PC_FREE); + } + page_freelist_unlock(mnode); + ASSERT(pp->p_szc == 0); +} diff --git a/usr/src/uts/common/vm/vm_pvn.c b/usr/src/uts/common/vm/vm_pvn.c new file mode 100644 index 0000000000..fcafb5f803 --- /dev/null +++ b/usr/src/uts/common/vm/vm_pvn.c @@ -0,0 +1,1147 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * VM - paged vnode. + * + * This file supplies vm support for the vnode operations that deal with pages. 
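+ * It provides klustering of pages for read and write (pvn_read_kluster,
+ * pvn_write_kluster), i/o completion handling (pvn_read_done,
+ * pvn_write_done), dirty page identification (pvn_getdirty), walking of a
+ * vnode's page list (pvn_vplist_dirty) and helpers for file system getpage
+ * routines (pvn_getpages, pvn_plist_init).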
+ */ +#include <sys/types.h> +#include <sys/t_lock.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/time.h> +#include <sys/buf.h> +#include <sys/vnode.h> +#include <sys/uio.h> +#include <sys/vmmeter.h> +#include <sys/vmsystm.h> +#include <sys/mman.h> +#include <sys/vfs.h> +#include <sys/cred.h> +#include <sys/user.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/cpuvar.h> +#include <sys/vtrace.h> +#include <sys/tnf_probe.h> + +#include <vm/hat.h> +#include <vm/as.h> +#include <vm/seg.h> +#include <vm/rm.h> +#include <vm/pvn.h> +#include <vm/page.h> +#include <vm/seg_map.h> +#include <vm/seg_kmem.h> +#include <sys/fs/swapnode.h> + +int pvn_nofodklust = 0; +int pvn_write_noklust = 0; + +uint_t pvn_vmodsort_supported = 0; /* set if HAT supports VMODSORT */ +uint_t pvn_vmodsort_disable = 0; /* set in /etc/system to disable HAT */ + /* support for vmodsort for testing */ + +static struct kmem_cache *marker_cache = NULL; + +/* + * Find the largest contiguous block which contains `addr' for file offset + * `offset' in it while living within the file system block sizes (`vp_off' + * and `vp_len') and the address space limits for which no pages currently + * exist and which map to consecutive file offsets. + */ +page_t * +pvn_read_kluster( + struct vnode *vp, + u_offset_t off, + struct seg *seg, + caddr_t addr, + u_offset_t *offp, /* return values */ + size_t *lenp, /* return values */ + u_offset_t vp_off, + size_t vp_len, + int isra) +{ + ssize_t deltaf, deltab; + page_t *pp; + page_t *plist = NULL; + spgcnt_t pagesavail; + u_offset_t vp_end; + + ASSERT(off >= vp_off && off < vp_off + vp_len); + + /* + * We only want to do klustering/read ahead if there + * is more than minfree pages currently available. + */ + pagesavail = freemem - minfree; + + if (pagesavail <= 0) + if (isra) + return ((page_t *)NULL); /* ra case - give up */ + else + pagesavail = 1; /* must return a page */ + + /* We calculate in pages instead of bytes due to 32-bit overflows */ + if (pagesavail < (spgcnt_t)btopr(vp_len)) { + /* + * Don't have enough free memory for the + * max request, try sizing down vp request. + */ + deltab = (ssize_t)(off - vp_off); + vp_len -= deltab; + vp_off += deltab; + if (pagesavail < btopr(vp_len)) { + /* + * Still not enough memory, just settle for + * pagesavail which is at least 1. + */ + vp_len = ptob(pagesavail); + } + } + + vp_end = vp_off + vp_len; + ASSERT(off >= vp_off && off < vp_end); + + if (isra && SEGOP_KLUSTER(seg, addr, 0)) + return ((page_t *)NULL); /* segment driver says no */ + + if ((plist = page_create_va(vp, off, + PAGESIZE, PG_EXCL | PG_WAIT, seg, addr)) == NULL) + return ((page_t *)NULL); + + if (vp_len <= PAGESIZE || pvn_nofodklust) { + *offp = off; + *lenp = MIN(vp_len, PAGESIZE); + } else { + /* + * Scan back from front by incrementing "deltab" and + * comparing "off" with "vp_off + deltab" to avoid + * "signed" versus "unsigned" conversion problems. + */ + for (deltab = PAGESIZE; off >= vp_off + deltab; + deltab += PAGESIZE) { + /* + * Call back to the segment driver to verify that + * the klustering/read ahead operation makes sense. + */ + if (SEGOP_KLUSTER(seg, addr, -deltab)) + break; /* page not eligible */ + if ((pp = page_create_va(vp, off - deltab, + PAGESIZE, PG_EXCL, seg, addr - deltab)) + == NULL) + break; /* already have the page */ + /* + * Add page to front of page list. 
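+			 * Since we are scanning backwards, each page created
+			 * here has a lower offset than those already on the
+			 * list, so adding it at the front keeps the list
+			 * ordered by ascending file offset.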
+ */ + page_add(&plist, pp); + } + deltab -= PAGESIZE; + + /* scan forward from front */ + for (deltaf = PAGESIZE; off + deltaf < vp_end; + deltaf += PAGESIZE) { + /* + * Call back to the segment driver to verify that + * the klustering/read ahead operation makes sense. + */ + if (SEGOP_KLUSTER(seg, addr, deltaf)) + break; /* page not file extension */ + if ((pp = page_create_va(vp, off + deltaf, + PAGESIZE, PG_EXCL, seg, addr + deltaf)) + == NULL) + break; /* already have page */ + + /* + * Add page to end of page list. + */ + page_add(&plist, pp); + plist = plist->p_next; + } + *offp = off = off - deltab; + *lenp = deltab + deltaf; + ASSERT(off >= vp_off); + + /* + * If we ended up getting more than was actually + * requested, retract the returned length to only + * reflect what was requested. This might happen + * if we were allowed to kluster pages across a + * span of (say) 5 frags, and frag size is less + * than PAGESIZE. We need a whole number of + * pages to contain those frags, but the returned + * size should only allow the returned range to + * extend as far as the end of the frags. + */ + if ((vp_off + vp_len) < (off + *lenp)) { + ASSERT(vp_end > off); + *lenp = vp_end - off; + } + } + TRACE_3(TR_FAC_VM, TR_PVN_READ_KLUSTER, + "pvn_read_kluster:seg %p addr %x isra %x", + seg, addr, isra); + return (plist); +} + +/* + * Handle pages for this vnode on either side of the page "pp" + * which has been locked by the caller. This routine will also + * do klustering in the range [vp_off, vp_off + vp_len] up + * until a page which is not found. The offset and length + * of pages included is returned in "*offp" and "*lenp". + * + * Returns a list of dirty locked pages all ready to be + * written back. + */ +page_t * +pvn_write_kluster( + struct vnode *vp, + page_t *pp, + u_offset_t *offp, /* return values */ + size_t *lenp, /* return values */ + u_offset_t vp_off, + size_t vp_len, + int flags) +{ + u_offset_t off; + page_t *dirty; + size_t deltab, deltaf; + se_t se; + u_offset_t vp_end; + + off = pp->p_offset; + + /* + * Kustering should not be done if we are invalidating + * pages since we could destroy pages that belong to + * some other process if this is a swap vnode. + */ + if (pvn_write_noklust || ((flags & B_INVAL) && IS_SWAPVP(vp))) { + *offp = off; + *lenp = PAGESIZE; + return (pp); + } + + if (flags & (B_FREE | B_INVAL)) + se = SE_EXCL; + else + se = SE_SHARED; + + dirty = pp; + /* + * Scan backwards looking for pages to kluster by incrementing + * "deltab" and comparing "off" with "vp_off + deltab" to + * avoid "signed" versus "unsigned" conversion problems. + */ + for (deltab = PAGESIZE; off >= vp_off + deltab; deltab += PAGESIZE) { + pp = page_lookup_nowait(vp, off - deltab, se); + if (pp == NULL) + break; /* page not found */ + if (pvn_getdirty(pp, flags | B_DELWRI) == 0) + break; + page_add(&dirty, pp); + } + deltab -= PAGESIZE; + + vp_end = vp_off + vp_len; + /* now scan forwards looking for pages to kluster */ + for (deltaf = PAGESIZE; off + deltaf < vp_end; deltaf += PAGESIZE) { + pp = page_lookup_nowait(vp, off + deltaf, se); + if (pp == NULL) + break; /* page not found */ + if (pvn_getdirty(pp, flags | B_DELWRI) == 0) + break; + page_add(&dirty, pp); + dirty = dirty->p_next; + } + + *offp = off - deltab; + *lenp = deltab + deltaf; + return (dirty); +} + +/* + * Generic entry point used to release the "shared/exclusive" lock + * and the "p_iolock" on pages after i/o is complete. 
+ */ +void +pvn_io_done(page_t *plist) +{ + page_t *pp; + + while (plist != NULL) { + pp = plist; + page_sub(&plist, pp); + page_io_unlock(pp); + page_unlock(pp); + } +} + +/* + * Entry point to be used by file system getpage subr's and + * other such routines which either want to unlock pages (B_ASYNC + * request) or destroy a list of pages if an error occurred. + */ +void +pvn_read_done(page_t *plist, int flags) +{ + page_t *pp; + + while (plist != NULL) { + pp = plist; + page_sub(&plist, pp); + page_io_unlock(pp); + if (flags & B_ERROR) { + /*LINTED: constant in conditional context*/ + VN_DISPOSE(pp, B_INVAL, 0, kcred); + } else { + (void) page_release(pp, 0); + } + } +} + +/* + * Automagic pageout. + * When memory gets tight, start freeing pages popping out of the + * write queue. + */ +int write_free = 1; +pgcnt_t pages_before_pager = 200; /* LMXXX */ + +/* + * Routine to be called when page-out's complete. + * The caller, typically VOP_PUTPAGE, has to explicity call this routine + * after waiting for i/o to complete (biowait) to free the list of + * pages associated with the buffer. These pages must be locked + * before i/o is initiated. + * + * If a write error occurs, the pages are marked as modified + * so the write will be re-tried later. + */ + +void +pvn_write_done(page_t *plist, int flags) +{ + int dfree = 0; + int pgrec = 0; + int pgout = 0; + int pgpgout = 0; + int anonpgout = 0; + int anonfree = 0; + int fspgout = 0; + int fsfree = 0; + int execpgout = 0; + int execfree = 0; + page_t *pp; + struct cpu *cpup; + struct vnode *vp = NULL; /* for probe */ + uint_t ppattr; + + ASSERT((flags & B_READ) == 0); + + /* + * If we are about to start paging anyway, start freeing pages. + */ + if (write_free && freemem < lotsfree + pages_before_pager && + (flags & B_ERROR) == 0) { + flags |= B_FREE; + } + + /* + * Handle each page involved in the i/o operation. + */ + while (plist != NULL) { + pp = plist; + ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp)); + page_sub(&plist, pp); + + /* Kernel probe support */ + if (vp == NULL) + vp = pp->p_vnode; + + if (flags & B_ERROR) { + /* + * Write operation failed. We don't want + * to destroy (or free) the page unless B_FORCE + * is set. We set the mod bit again and release + * all locks on the page so that it will get written + * back again later when things are hopefully + * better again. + * If B_INVAL and B_FORCE is set we really have + * to destroy the page. + */ + if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) { + page_io_unlock(pp); + /*LINTED: constant in conditional context*/ + VN_DISPOSE(pp, B_INVAL, 0, kcred); + } else { + hat_setmod(pp); + page_io_unlock(pp); + page_unlock(pp); + } + } else if (flags & B_INVAL) { + /* + * XXX - Failed writes with B_INVAL set are + * not handled appropriately. + */ + page_io_unlock(pp); + /*LINTED: constant in conditional context*/ + VN_DISPOSE(pp, B_INVAL, 0, kcred); + } else if (flags & B_FREE ||!hat_page_is_mapped(pp)) { + /* + * Update statistics for pages being paged out + */ + if (pp->p_vnode) { + if (IS_SWAPFSVP(pp->p_vnode)) { + anonpgout++; + } else { + if (pp->p_vnode->v_flag & VVMEXEC) { + execpgout++; + } else { + fspgout++; + } + } + } + page_io_unlock(pp); + pgout = 1; + pgpgout++; + TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT, + "page_ws_out:pp %p", pp); + + /* + * The page_struct_lock need not be acquired to + * examine "p_lckcnt" and "p_cowcnt" since we'll + * have an "exclusive" lock if the upgrade succeeds. 
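+			 * If the upgrade fails, or the page is locked or
+			 * COW-locked, the else clause below simply drops
+			 * the lock and leaves the page in place rather
+			 * than freeing it.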
+ */ + if (page_tryupgrade(pp) && + pp->p_lckcnt == 0 && pp->p_cowcnt == 0) { + /* + * Check if someone has reclaimed the + * page. If ref and mod are not set, no + * one is using it so we can free it. + * The rest of the system is careful + * to use the NOSYNC flag to unload + * translations set up for i/o w/o + * affecting ref and mod bits. + * + * Obtain a copy of the real hardware + * mod bit using hat_pagesync(pp, HAT_DONTZERO) + * to avoid having to flush the cache. + */ + ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO | + HAT_SYNC_STOPON_MOD); + ck_refmod: + if (!(ppattr & (P_REF | P_MOD))) { + if (hat_page_is_mapped(pp)) { + /* + * Doesn't look like the page + * was modified so now we + * really have to unload the + * translations. Meanwhile + * another CPU could've + * modified it so we have to + * check again. We don't loop + * forever here because now + * the translations are gone + * and no one can get a new one + * since we have the "exclusive" + * lock on the page. + */ + (void) hat_pageunload(pp, + HAT_FORCE_PGUNLOAD); + ppattr = hat_page_getattr(pp, + P_REF | P_MOD); + goto ck_refmod; + } + /* + * Update statistics for pages being + * freed + */ + if (pp->p_vnode) { + if (IS_SWAPFSVP(pp->p_vnode)) { + anonfree++; + } else { + if (pp->p_vnode->v_flag + & VVMEXEC) { + execfree++; + } else { + fsfree++; + } + } + } + /*LINTED: constant in conditional ctx*/ + VN_DISPOSE(pp, B_FREE, + (flags & B_DONTNEED), kcred); + dfree++; + } else { + page_unlock(pp); + pgrec++; + TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE, + "page_ws_free:pp %p", pp); + } + } else { + /* + * Page is either `locked' in memory + * or was reclaimed and now has a + * "shared" lock, so release it. + */ + page_unlock(pp); + } + } else { + /* + * Neither B_FREE nor B_INVAL nor B_ERROR. + * Just release locks. + */ + page_io_unlock(pp); + page_unlock(pp); + } + } + + CPU_STATS_ENTER_K(); + cpup = CPU; /* get cpup now that CPU cannot change */ + CPU_STATS_ADDQ(cpup, vm, dfree, dfree); + CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec); + CPU_STATS_ADDQ(cpup, vm, pgout, pgout); + CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout); + CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout); + CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree); + CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout); + CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree); + CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout); + CPU_STATS_ADDQ(cpup, vm, execfree, execfree); + CPU_STATS_EXIT_K(); + + /* Kernel probe */ + TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */, + tnf_opaque, vnode, vp, + tnf_ulong, pages_pageout, pgpgout, + tnf_ulong, pages_freed, dfree, + tnf_ulong, pages_reclaimed, pgrec); +} + +/* + * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI, + * B_TRUNC, B_FORCE}. B_DELWRI indicates that this page is part of a kluster + * operation and is only to be considered if it doesn't involve any + * waiting here. B_TRUNC indicates that the file is being truncated + * and so no i/o needs to be done. B_FORCE indicates that the page + * must be destroyed so don't try wrting it out. + * + * The caller must ensure that the page is locked. Returns 1, if + * the page should be written back (the "iolock" is held in this + * case), or 0 if the page has been dealt with or has been + * unlocked. + */ +int +pvn_getdirty(page_t *pp, int flags) +{ + ASSERT((flags & (B_INVAL | B_FREE)) ? + PAGE_EXCL(pp) : PAGE_SHARED(pp)); + ASSERT(PP_ISFREE(pp) == 0); + + /* + * If trying to invalidate or free a logically `locked' page, + * forget it. 
Don't need page_struct_lock to check p_lckcnt and + * p_cowcnt as the page is exclusively locked. + */ + if ((flags & (B_INVAL | B_FREE)) && !(flags & (B_TRUNC|B_FORCE)) && + (pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) { + page_unlock(pp); + return (0); + } + + /* + * Now acquire the i/o lock so we can add it to the dirty + * list (if necessary). We avoid blocking on the i/o lock + * in the following cases: + * + * If B_DELWRI is set, which implies that this request is + * due to a klustering operartion. + * + * If this is an async (B_ASYNC) operation and we are not doing + * invalidation (B_INVAL) [The current i/o or fsflush will ensure + * that the the page is written out]. + */ + if ((flags & B_DELWRI) || ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC)) { + if (!page_io_trylock(pp)) { + page_unlock(pp); + return (0); + } + } else { + page_io_lock(pp); + } + + /* + * If we want to free or invalidate the page then + * we need to unload it so that anyone who wants + * it will have to take a minor fault to get it. + * Otherwise, we're just writing the page back so we + * need to sync up the hardwre and software mod bit to + * detect any future modifications. We clear the + * software mod bit when we put the page on the dirty + * list. + */ + if (flags & (B_INVAL | B_FREE)) { + (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); + } else { + (void) hat_pagesync(pp, HAT_SYNC_ZERORM); + } + + if (!hat_ismod(pp) || (flags & B_TRUNC)) { + /* + * Don't need to add it to the + * list after all. + */ + page_io_unlock(pp); + if (flags & B_INVAL) { + /*LINTED: constant in conditional context*/ + VN_DISPOSE(pp, B_INVAL, 0, kcred); + } else if (flags & B_FREE) { + /*LINTED: constant in conditional context*/ + VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred); + } else { + /* + * This is advisory path for the callers + * of VOP_PUTPAGE() who prefer freeing the + * page _only_ if no one else is accessing it. + * E.g. segmap_release() + * + * The above hat_ismod() check is useless because: + * (1) we may not be holding SE_EXCL lock; + * (2) we've not unloaded _all_ translations + * + * Let page_release() do the heavy-lifting. + */ + (void) page_release(pp, 1); + } + return (0); + } + + /* + * Page is dirty, get it ready for the write back + * and add page to the dirty list. + */ + hat_clrrefmod(pp); + + /* + * If we're going to free the page when we're done + * then we can let others try to use it starting now. + * We'll detect the fact that they used it when the + * i/o is done and avoid freeing the page. + */ + if (flags & B_FREE) + page_downgrade(pp); + + + TRACE_1(TR_FAC_VM, TR_PVN_GETDIRTY, "pvn_getdirty:pp %p", pp); + + return (1); +} + + +/*ARGSUSED*/ +static int +marker_constructor(void *buf, void *cdrarg, int kmflags) +{ + page_t *mark = buf; + bzero(mark, sizeof (page_t)); + return (0); +} + +void +pvn_init() +{ + if (pvn_vmodsort_disable == 0) + pvn_vmodsort_supported = hat_supported(HAT_VMODSORT, NULL); + marker_cache = kmem_cache_create("marker_cache", + sizeof (page_t), 0, marker_constructor, + NULL, NULL, NULL, NULL, 0); +} + + +/* + * Process a vnode's page list for all pages whose offset is >= off. + * Pages are to either be free'd, invalidated, or written back to disk. + * + * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE + * is specified, otherwise they are "shared" locked. + * + * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC} + * + * Special marker page_t's are inserted in the list in order + * to keep track of where we are in the list when locks are dropped. 
+ * + * Note the list is circular and insertions can happen only at the + * head and tail of the list. The algorithm ensures visiting all pages + * on the list in the following way: + * + * Drop two marker pages at the end of the list. + * + * Move one marker page backwards towards the start of the list until + * it is at the list head, processing the pages passed along the way. + * + * Due to race conditions when the vphm mutex is dropped, additional pages + * can be added to either end of the list, so we'll continue to move + * the marker and process pages until it is up against the end marker. + * + * There is one special exit condition. If we are processing a VMODSORT + * vnode and only writing back modified pages, we can stop as soon as + * we run into an unmodified page. This makes fsync(3) operations fast. + */ +int +pvn_vplist_dirty( + vnode_t *vp, + u_offset_t off, + int (*putapage)(vnode_t *, page_t *, u_offset_t *, + size_t *, int, cred_t *), + int flags, + cred_t *cred) +{ + page_t *pp; + page_t *mark; /* marker page that moves toward head */ + page_t *end; /* marker page at end of list */ + int err = 0; + int error; + kmutex_t *vphm; + se_t se; + page_t **where_to_move; + + ASSERT(vp->v_type != VCHR); + + if (vp->v_pages == NULL) + return (0); + + + /* + * Serialize vplist_dirty operations on this vnode by setting VVMLOCK. + * + * Don't block on VVMLOCK if B_ASYNC is set. This prevents sync() + * from getting blocked while flushing pages to a dead NFS server. + */ + mutex_enter(&vp->v_lock); + if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) { + mutex_exit(&vp->v_lock); + return (EAGAIN); + } + + while (vp->v_flag & VVMLOCK) + cv_wait(&vp->v_cv, &vp->v_lock); + + if (vp->v_pages == NULL) { + mutex_exit(&vp->v_lock); + return (0); + } + + vp->v_flag |= VVMLOCK; + mutex_exit(&vp->v_lock); + + + /* + * Set up the marker pages used to walk the list + */ + end = kmem_cache_alloc(marker_cache, KM_SLEEP); + end->p_vnode = vp; + end->p_offset = (u_offset_t)-2; + mark = kmem_cache_alloc(marker_cache, KM_SLEEP); + mark->p_vnode = vp; + mark->p_offset = (u_offset_t)-1; + + /* + * Grab the lock protecting the vnode's page list + * note that this lock is dropped at times in the loop. + */ + vphm = page_vnode_mutex(vp); + mutex_enter(vphm); + if (vp->v_pages == NULL) + goto leave; + + /* + * insert the markers and loop through the list of pages + */ + page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark); + page_vpadd(&mark->p_vpnext, end); + for (;;) { + + /* + * If only doing an async write back, then we can + * stop as soon as we get to start of the list. + */ + if (flags == B_ASYNC && vp->v_pages == mark) + break; + + /* + * otherwise stop when we've gone through all the pages + */ + if (mark->p_vpprev == end) + break; + + pp = mark->p_vpprev; + if (vp->v_pages == pp) + where_to_move = &vp->v_pages; + else + where_to_move = &pp->p_vpprev->p_vpnext; + + ASSERT(pp->p_vnode == vp); + + /* + * Skip this page if the offset is out of the desired range. + * Just move the marker and continue. + */ + if (pp->p_offset < off) { + page_vpsub(&vp->v_pages, mark); + page_vpadd(where_to_move, mark); + continue; + } + + /* + * If just flushing dirty pages to disk and this vnode + * is using a sorted list of pages, we can stop processing + * as soon as we find an unmodified page. Since all the + * modified pages are visited first. 
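+		 * On DEBUG kernels the block below re-walks the rest of
+		 * the list and asserts that every remaining page really
+		 * is clean before we stop.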
+ */ + if (IS_VMODSORT(vp) && + !(flags & (B_INVAL | B_FREE | B_TRUNC)) && + !hat_ismod(pp)) { +#ifdef DEBUG + /* + * For debug kernels examine what should be all the + * remaining clean pages, asserting that they are + * not modified. + */ + page_t *chk = pp; + int attr; + + page_vpsub(&vp->v_pages, mark); + page_vpadd(where_to_move, mark); + do { + chk = chk->p_vpprev; + ASSERT(chk != end); + if (chk == mark) + continue; + attr = hat_page_getattr(chk, P_MOD | P_REF); + if ((attr & P_MOD) == 0) + continue; + panic("v_pages list not all clean: " + "page_t*=%p vnode=%p off=%lx " + "attr=0x%x last clean page_t*=%p\n", + (void *)chk, (void *)chk->p_vnode, + (long)chk->p_offset, attr, (void *)pp); + } while (chk != vp->v_pages); +#endif + break; + } + + /* + * If we are supposed to invalidate or free this + * page, then we need an exclusive lock. + */ + se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED; + + /* + * We must acquire the page lock for all synchronous + * operations (invalidate, free and write). + */ + if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) { + /* + * If the page_lock() drops the mutex + * we must retry the loop. + */ + if (!page_lock(pp, se, vphm, P_NO_RECLAIM)) + continue; + + /* + * It's ok to move the marker page now. + */ + page_vpsub(&vp->v_pages, mark); + page_vpadd(where_to_move, mark); + } else { + + /* + * update the marker page for all remaining cases + */ + page_vpsub(&vp->v_pages, mark); + page_vpadd(where_to_move, mark); + + /* + * For write backs, If we can't lock the page, it's + * invalid or in the process of being destroyed. Skip + * it, assuming someone else is writing it. + */ + if (!page_trylock(pp, se)) + continue; + } + + ASSERT(pp->p_vnode == vp); + + /* + * Successfully locked the page, now figure out what to + * do with it. Free pages are easily dealt with, invalidate + * if desired or just go on to the next page. + */ + if (PP_ISFREE(pp)) { + if ((flags & B_INVAL) == 0) { + page_unlock(pp); + continue; + } + + /* + * Invalidate (destroy) the page. + */ + mutex_exit(vphm); + page_destroy_free(pp); + mutex_enter(vphm); + continue; + } + + /* + * pvn_getdirty() figures out what do do with a dirty page. + * If the page is dirty, the putapage() routine will write it + * and will kluster any other adjacent dirty pages it can. + * + * pvn_getdirty() and `(*putapage)' unlock the page. + */ + mutex_exit(vphm); + if (pvn_getdirty(pp, flags)) { + error = (*putapage)(vp, pp, NULL, NULL, flags, cred); + if (!err) + err = error; + } + mutex_enter(vphm); + } + page_vpsub(&vp->v_pages, mark); + page_vpsub(&vp->v_pages, end); + +leave: + /* + * Release v_pages mutex, also VVMLOCK and wakeup blocked thrds + */ + mutex_exit(vphm); + kmem_cache_free(marker_cache, mark); + kmem_cache_free(marker_cache, end); + mutex_enter(&vp->v_lock); + vp->v_flag &= ~VVMLOCK; + cv_broadcast(&vp->v_cv); + mutex_exit(&vp->v_lock); + return (err); +} + +/* + * Zero out zbytes worth of data. Caller should be aware that this + * routine may enter back into the fs layer (xxx_getpage). Locks + * that the xxx_getpage routine may need should not be held while + * calling this. + */ +void +pvn_vpzero(struct vnode *vp, u_offset_t vplen, size_t zbytes) +{ + caddr_t addr; + + ASSERT(vp->v_type != VCHR); + + if (vp->v_pages == NULL) + return; + + /* + * zbytes may be zero but there still may be some portion of + * a page which needs clearing (since zbytes is a function + * of filesystem block size, not pagesize.) 
+ */ + if (zbytes == 0 && (PAGESIZE - (vplen & PAGEOFFSET)) == 0) + return; + + /* + * We get the last page and handle the partial + * zeroing via kernel mappings. This will make the page + * dirty so that we know that when this page is written + * back, the zeroed information will go out with it. If + * the page is not currently in memory, then the kzero + * operation will cause it to be brought it. We use kzero + * instead of bzero so that if the page cannot be read in + * for any reason, the system will not panic. We need + * to zero out a minimum of the fs given zbytes, but we + * might also have to do more to get the entire last page. + */ + + if ((zbytes + (vplen & MAXBOFFSET)) > MAXBSIZE) + panic("pvn_vptrunc zbytes"); + addr = segmap_getmapflt(segkmap, vp, vplen, + MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)), 1, S_WRITE); + (void) kzero(addr + (vplen & MAXBOFFSET), + MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET))); + (void) segmap_release(segkmap, addr, SM_WRITE | SM_ASYNC); +} + +/* + * Handles common work of the VOP_GETPAGE routines when more than + * one page must be returned by calling a file system specific operation + * to do most of the work. Must be called with the vp already locked + * by the VOP_GETPAGE routine. + */ +int +pvn_getpages( + int (*getpage)(vnode_t *, u_offset_t, size_t, uint_t *, page_t *[], + size_t, struct seg *, caddr_t, enum seg_rw, cred_t *), + struct vnode *vp, + u_offset_t off, + size_t len, + uint_t *protp, + page_t *pl[], + size_t plsz, + struct seg *seg, + caddr_t addr, + enum seg_rw rw, + struct cred *cred) +{ + page_t **ppp; + u_offset_t o, eoff; + size_t sz, xlen; + int err; + + ASSERT(plsz >= len); /* insure that we have enough space */ + + /* + * Loop one page at a time and let getapage function fill + * in the next page in array. We only allow one page to be + * returned at a time (except for the last page) so that we + * don't have any problems with duplicates and other such + * painful problems. This is a very simple minded algorithm, + * but it does the job correctly. We hope that the cost of a + * getapage call for a resident page that we might have been + * able to get from an earlier call doesn't cost too much. + */ + ppp = pl; + sz = PAGESIZE; + eoff = off + len; + xlen = len; + for (o = off; o < eoff; o += PAGESIZE, addr += PAGESIZE, + xlen -= PAGESIZE) { + if (o + PAGESIZE >= eoff) { + /* + * Last time through - allow the all of + * what's left of the pl[] array to be used. + */ + sz = plsz - (o - off); + } + err = (*getpage)(vp, o, xlen, protp, ppp, sz, seg, addr, + rw, cred); + if (err) { + /* + * Release any pages we already got. + */ + if (o > off && pl != NULL) { + for (ppp = pl; *ppp != NULL; *ppp++ = NULL) + (void) page_release(*ppp, 1); + } + break; + } + if (pl != NULL) + ppp++; + } + return (err); +} + +/* + * Initialize the page list array. + */ +void +pvn_plist_init(page_t *pp, page_t *pl[], size_t plsz, + u_offset_t off, size_t io_len, enum seg_rw rw) +{ + ssize_t sz; + page_t *ppcur, **ppp; + + if (plsz >= io_len) { + /* + * Everything fits, set up to load + * all the pages. + */ + sz = io_len; + } else { + /* + * Set up to load plsz worth + * starting at the needed page. + */ + while (pp->p_offset != off) { + /* XXX - Do we need this assert? */ + ASSERT(pp->p_next->p_offset != + pp->p_offset); + /* + * Remove page from the i/o list, + * release the i/o and the page lock. 
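+			 * Only plsz worth of pages starting at 'off' is
+			 * wanted in pl[], so the klustered pages that
+			 * precede 'off' are released here.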
+ */ + ppcur = pp; + page_sub(&pp, ppcur); + page_io_unlock(ppcur); + (void) page_release(ppcur, 1); + } + sz = plsz; + } + + /* + * Initialize the page list array. + */ + ppp = pl; + do { + ppcur = pp; + *ppp++ = ppcur; + page_sub(&pp, ppcur); + page_io_unlock(ppcur); + if (rw != S_CREATE) + page_downgrade(ppcur); + sz -= PAGESIZE; + } while (sz > 0 && pp != NULL); + *ppp = NULL; /* terminate list */ + + /* + * Now free the remaining pages that weren't + * loaded in the page list. + */ + while (pp != NULL) { + ppcur = pp; + page_sub(&pp, ppcur); + page_io_unlock(ppcur); + (void) page_release(ppcur, 1); + } +} diff --git a/usr/src/uts/common/vm/vm_rm.c b/usr/src/uts/common/vm/vm_rm.c new file mode 100644 index 0000000000..36cd5f0375 --- /dev/null +++ b/usr/src/uts/common/vm/vm_rm.c @@ -0,0 +1,189 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/t_lock.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/mman.h> +#include <sys/sysmacros.h> +#include <sys/errno.h> +#include <sys/signal.h> +#include <sys/user.h> +#include <sys/proc.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> + +#include <vm/hat.h> +#include <vm/as.h> +#include <vm/seg_vn.h> +#include <vm/rm.h> +#include <vm/seg.h> +#include <vm/page.h> + +/* + * Yield the size of an address space. + * + * The size can only be used as a hint since we cannot guarantee it + * will stay the same size unless the as->a_lock is held by the caller. + */ +size_t +rm_assize(struct as *as) +{ + size_t size = 0; + struct seg *seg; + struct segvn_data *svd; + extern struct seg_ops segdev_ops; /* needs a header file */ + + ASSERT(as != NULL && AS_READ_HELD(as, &as->a_lock)); + + if (as == &kas) + return (0); + + for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { + if (seg->s_ops == &segdev_ops && + ((SEGOP_GETTYPE(seg, seg->s_base) & + (MAP_SHARED | MAP_PRIVATE)) == 0)) { + /* + * Don't include mappings of /dev/null. These just + * reserve address space ranges and have no memory. + * We cheat by knowing that these segments come + * from segdev and have no mapping type. 
+ */ + /* EMPTY */; + } else if (seg->s_ops == &segvn_ops && + (svd = (struct segvn_data *)seg->s_data) != NULL && + (svd->vp == NULL || svd->vp->v_type != VREG) && + (svd->flags & MAP_NORESERVE)) { + /* + * Don't include MAP_NORESERVE pages in the + * address range unless their mappings have + * actually materialized. We cheat by knowing + * that segvn is the only segment driver that + * supports MAP_NORESERVE and that the actual + * number of bytes reserved is in the segment's + * private data structure. + */ + size += svd->swresv; + } else { + caddr_t addr = seg->s_base; + size_t segsize = seg->s_size; + vnode_t *vp; + vattr_t vattr; + + /* + * If the segment is mapped beyond the end of the + * underlying mapped file, if any, then limit the + * segment's size contribution to the file size. + */ + vattr.va_mask = AT_SIZE; + if (seg->s_ops == &segvn_ops && + SEGOP_GETVP(seg, addr, &vp) == 0 && + vp != NULL && vp->v_type == VREG && + VOP_GETATTR(vp, &vattr, ATTR_HINT, CRED()) == 0) { + u_offset_t filesize = vattr.va_size; + u_offset_t offset = SEGOP_GETOFFSET(seg, addr); + + if (filesize < offset) + filesize = 0; + else + filesize -= offset; + filesize = P2ROUNDUP_TYPED(filesize, PAGESIZE, + u_offset_t); + if ((u_offset_t)segsize > filesize) + segsize = filesize; + } + size += segsize; + } + } + + return (size); +} + +/* + * Yield the memory claim requirement for an address space. + * + * This is currently implemented as the number of active hardware + * translations that have page structures. Therefore, it can + * underestimate the traditional resident set size, eg, if the + * physical page is present and the hardware translation is missing; + * and it can overestimate the rss, eg, if there are active + * translations to a frame buffer with page structs. + * Also, it does not take sharing and XHATs into account. + */ +size_t +rm_asrss(as) + register struct as *as; +{ + if (as != (struct as *)NULL && as != &kas) + return ((size_t)btop(hat_get_mapped_size(as->a_hat))); + else + return (0); +} + +/* + * Return a 16-bit binary fraction representing the percent of total memory + * used by this address space. Binary point is to right of high-order bit. + * Defined as the ratio of a_rss for the process to total physical memory. + * This assumes 2s-complement arithmetic and that shorts and longs are + * 16 bits and 32 bits, respectively. + */ +ushort_t +rm_pctmemory(struct as *as) +{ + /* This can't overflow */ + ulong_t num = (ulong_t)rm_asrss(as) << (PAGESHIFT-1); + int shift = 16 - PAGESHIFT; + ulong_t total = total_pages; + + if (shift < 0) { + num >>= (-shift); + shift = 0; + } + while (shift > 0 && (num & 0x80000000) == 0) { + shift--; + num <<= 1; + } + if (shift > 0) + total >>= shift; + + return (num / total); +} diff --git a/usr/src/uts/common/vm/vm_seg.c b/usr/src/uts/common/vm/vm_seg.c new file mode 100644 index 0000000000..50cc21cdf7 --- /dev/null +++ b/usr/src/uts/common/vm/vm_seg.c @@ -0,0 +1,952 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * VM - segment management. + */ + +#include <sys/types.h> +#include <sys/inttypes.h> +#include <sys/t_lock.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kmem.h> +#include <sys/vmsystm.h> +#include <sys/debug.h> +#include <sys/cmn_err.h> +#include <sys/callb.h> +#include <sys/mem_config.h> + +#include <vm/hat.h> +#include <vm/as.h> +#include <vm/seg.h> +#include <vm/seg_kmem.h> + +/* + * kstats for segment advise + */ +segadvstat_t segadvstat = { + { "MADV_FREE_hit", KSTAT_DATA_ULONG }, + { "MADV_FREE_miss", KSTAT_DATA_ULONG }, +}; + +kstat_named_t *segadvstat_ptr = (kstat_named_t *)&segadvstat; +uint_t segadvstat_ndata = sizeof (segadvstat) / sizeof (kstat_named_t); + +/* #define PDEBUG */ +#if defined(PDEBUG) || defined(lint) || defined(__lint) +int pdebug = 0; +#else +#define pdebug 0 +#endif /* PDEBUG */ + +#define PPRINTF if (pdebug) printf +#define PPRINT(x) PPRINTF(x) +#define PPRINT1(x, a) PPRINTF(x, a) +#define PPRINT2(x, a, b) PPRINTF(x, a, b) +#define PPRINT3(x, a, b, c) PPRINTF(x, a, b, c) +#define PPRINT4(x, a, b, c, d) PPRINTF(x, a, b, c, d) +#define PPRINT5(x, a, b, c, d, e) PPRINTF(x, a, b, c, d, e) + +#define P_HASHMASK (p_hashsize - 1) +#define P_BASESHIFT 6 + +/* + * entry in the segment page cache + */ +struct seg_pcache { + struct seg_pcache *p_hnext; /* list for hashed blocks */ + struct seg_pcache *p_hprev; + int p_active; /* active count */ + int p_ref; /* ref bit */ + size_t p_len; /* segment length */ + caddr_t p_addr; /* base address */ + struct seg *p_seg; /* segment */ + struct page **p_pp; /* pp shadow list */ + enum seg_rw p_rw; /* rw */ + uint_t p_flags; /* bit flags */ + int (*p_callback)(struct seg *, caddr_t, size_t, + struct page **, enum seg_rw); +}; + +struct seg_phash { + struct seg_pcache *p_hnext; /* list for hashed blocks */ + struct seg_pcache *p_hprev; + int p_qlen; /* Q length */ + kmutex_t p_hmutex; /* protects hash bucket */ +}; + +static int seg_preap_time = 20; /* reclaim every 20 secs */ +static int seg_pmaxqlen = 5; /* max Q length in hash list */ +static int seg_ppcount = 5; /* max # of purges per reclaim interval */ +static int seg_plazy = 1; /* if 1, pages are cached after pageunlock */ +static pgcnt_t seg_pwindow; /* max # of pages that can be cached */ +static pgcnt_t seg_plocked; /* # of pages which are cached by pagelock */ +static pgcnt_t seg_plocked_window; /* # pages from window */ +int seg_preapahead; + +static uint_t seg_pdisable = 0; /* if not 0, caching temporarily disabled */ + +static int seg_pupdate_active = 1; /* background reclaim thread */ +static clock_t seg_preap_interval; /* reap 
interval in ticks */ + +static kmutex_t seg_pcache; /* protects the whole pagelock cache */ +static kmutex_t seg_pmem; /* protects window counter */ +static ksema_t seg_psaync_sem; /* sema for reclaim thread */ +static struct seg_phash *p_hashtab; +static int p_hashsize = 0; + +#define p_hash(seg) \ + (P_HASHMASK & \ + ((uintptr_t)(seg) >> P_BASESHIFT)) + +#define p_match(pcp, seg, addr, len, rw) \ + (((pcp)->p_seg == (seg) && \ + (pcp)->p_addr == (addr) && \ + (pcp)->p_rw == (rw) && \ + (pcp)->p_len == (len)) ? 1 : 0) + +#define p_match_pp(pcp, seg, addr, len, pp, rw) \ + (((pcp)->p_seg == (seg) && \ + (pcp)->p_addr == (addr) && \ + (pcp)->p_pp == (pp) && \ + (pcp)->p_rw == (rw) && \ + (pcp)->p_len == (len)) ? 1 : 0) + + +/* + * lookup an address range in pagelock cache. Return shadow list + * and bump up active count. + */ +struct page ** +seg_plookup(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw) +{ + struct seg_pcache *pcp; + struct seg_phash *hp; + + /* + * Skip pagelock cache, while DR is in progress or + * seg_pcache is off. + */ + if (seg_pdisable || seg_plazy == 0) { + return (NULL); + } + + hp = &p_hashtab[p_hash(seg)]; + mutex_enter(&hp->p_hmutex); + for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp; + pcp = pcp->p_hnext) { + if (p_match(pcp, seg, addr, len, rw)) { + pcp->p_active++; + mutex_exit(&hp->p_hmutex); + + PPRINT5("seg_plookup hit: seg %p, addr %p, " + "len %lx, count %d, pplist %p \n", + (void *)seg, (void *)addr, len, pcp->p_active, + (void *)pcp->p_pp); + + return (pcp->p_pp); + } + } + mutex_exit(&hp->p_hmutex); + + PPRINT("seg_plookup miss:\n"); + + return (NULL); +} + +/* + * mark address range inactive. If the cache is off or the address + * range is not in the cache we call the segment driver to reclaim + * the pages. Otherwise just decrement active count and set ref bit. + */ +void +seg_pinactive(struct seg *seg, caddr_t addr, size_t len, struct page **pp, + enum seg_rw rw, int (*callback)(struct seg *, caddr_t, size_t, + struct page **, enum seg_rw)) +{ + struct seg_pcache *pcp; + struct seg_phash *hp; + + if (seg_plazy == 0) { + (void) (*callback)(seg, addr, len, pp, rw); + return; + } + hp = &p_hashtab[p_hash(seg)]; + mutex_enter(&hp->p_hmutex); + for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp; + pcp = pcp->p_hnext) { + if (p_match_pp(pcp, seg, addr, len, pp, rw)) { + pcp->p_active--; + ASSERT(pcp->p_active >= 0); + if (pcp->p_active == 0 && seg_pdisable) { + int npages; + + ASSERT(callback == pcp->p_callback); + /* free the entry */ + hp->p_qlen--; + pcp->p_hprev->p_hnext = pcp->p_hnext; + pcp->p_hnext->p_hprev = pcp->p_hprev; + mutex_exit(&hp->p_hmutex); + npages = pcp->p_len >> PAGESHIFT; + mutex_enter(&seg_pmem); + seg_plocked -= npages; + if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) { + seg_plocked_window -= npages; + } + mutex_exit(&seg_pmem); + kmem_free(pcp, sizeof (struct seg_pcache)); + goto out; + } + pcp->p_ref = 1; + mutex_exit(&hp->p_hmutex); + return; + } + } + mutex_exit(&hp->p_hmutex); +out: + (void) (*callback)(seg, addr, len, pp, rw); +} + +/* + * The seg_pinsert_check() is used by segment drivers to predict whether + * a call to seg_pinsert will fail and thereby avoid wasteful pre-processing. 
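+ *
+ * Illustrative sketch only (no particular driver is quoted): a pagelock
+ * routine would typically screen with this before building the shadow
+ * page list it intends to hand to seg_pinsert(); the fallback label is
+ * hypothetical:
+ *
+ *	if (seg_pinsert_check(seg, len, 0) != SEGP_SUCCESS)
+ *		goto uncached_pagelock;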
+ */ + +int +seg_pinsert_check(struct seg *seg, size_t len, uint_t flags) +{ + struct seg_phash *hp; + + if (seg_plazy == 0) { + return (SEGP_FAIL); + } + if (seg_pdisable != 0) { + return (SEGP_FAIL); + } + ASSERT((len & PAGEOFFSET) == 0); + hp = &p_hashtab[p_hash(seg)]; + if (hp->p_qlen > seg_pmaxqlen && (flags & SEGP_FORCE_WIRED) == 0) { + return (SEGP_FAIL); + } + /* + * If the SEGP_FORCE_WIRED flag is set, + * we skip the check for seg_pwindow. + */ + if ((flags & SEGP_FORCE_WIRED) == 0) { + pgcnt_t npages; + + npages = len >> PAGESHIFT; + if ((seg_plocked_window + npages) > seg_pwindow) { + return (SEGP_FAIL); + } + } + return (SEGP_SUCCESS); +} + + +/* + * insert address range with shadow list into pagelock cache. If + * the cache is off or caching is temporarily disabled or the allowed + * 'window' is exceeded - return SEGP_FAIL. Otherwise return + * SEGP_SUCCESS. + */ +int +seg_pinsert(struct seg *seg, caddr_t addr, size_t len, struct page **pp, + enum seg_rw rw, uint_t flags, int (*callback)(struct seg *, caddr_t, + size_t, struct page **, enum seg_rw)) +{ + struct seg_pcache *pcp; + struct seg_phash *hp; + pgcnt_t npages; + + if (seg_plazy == 0) { + return (SEGP_FAIL); + } + if (seg_pdisable != 0) { + return (SEGP_FAIL); + } + ASSERT((len & PAGEOFFSET) == 0); + hp = &p_hashtab[p_hash(seg)]; + if (hp->p_qlen > seg_pmaxqlen && (flags & SEGP_FORCE_WIRED) == 0) { + return (SEGP_FAIL); + } + npages = len >> PAGESHIFT; + mutex_enter(&seg_pmem); + /* + * If the SEGP_FORCE_WIRED flag is set, + * we skip the check for seg_pwindow. + */ + if ((flags & SEGP_FORCE_WIRED) == 0) { + seg_plocked_window += npages; + if (seg_plocked_window > seg_pwindow) { + seg_plocked_window -= npages; + mutex_exit(&seg_pmem); + return (SEGP_FAIL); + } + } + seg_plocked += npages; + mutex_exit(&seg_pmem); + + pcp = kmem_alloc(sizeof (struct seg_pcache), KM_SLEEP); + pcp->p_seg = seg; + pcp->p_addr = addr; + pcp->p_len = len; + pcp->p_pp = pp; + pcp->p_rw = rw; + pcp->p_callback = callback; + pcp->p_active = 1; + pcp->p_flags = flags; + + PPRINT4("seg_pinsert: seg %p, addr %p, len %lx, pplist %p\n", + (void *)seg, (void *)addr, len, (void *)pp); + + hp = &p_hashtab[p_hash(seg)]; + mutex_enter(&hp->p_hmutex); + hp->p_qlen++; + pcp->p_hnext = hp->p_hnext; + pcp->p_hprev = (struct seg_pcache *)hp; + hp->p_hnext->p_hprev = pcp; + hp->p_hnext = pcp; + mutex_exit(&hp->p_hmutex); + return (SEGP_SUCCESS); +} + +/* + * purge all entries from the pagelock cache if not active + * and not recently used. Drop all locks and call through + * the address space into the segment driver to reclaim + * the pages. This makes sure we get the address space + * and segment driver locking right. + */ +static void +seg_ppurge_all(int force) +{ + struct seg_pcache *delcallb_list = NULL; + struct seg_pcache *pcp; + struct seg_phash *hp; + int purge_count = 0; + pgcnt_t npages = 0; + pgcnt_t npages_window = 0; + + /* + * if the cache if off or empty, return + */ + if (seg_plazy == 0 || seg_plocked == 0) { + return; + } + for (hp = p_hashtab; hp < &p_hashtab[p_hashsize]; hp++) { + mutex_enter(&hp->p_hmutex); + pcp = hp->p_hnext; + + /* + * While 'force' is set, seg_pasync_thread is not + * throttled. This is to speedup flushing of seg_pcache + * in preparation for DR. + * + * In normal case, when 'force' is not set, we throttle + * seg_pasync_thread so that we don't spend all the time + * time in purging the cache. 
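+ *
+ * With the defaults above (seg_ppcount = 5, seg_preap_time = 20
+ * seconds), an unforced pass therefore reclaims only a handful of
+ * inactive entries per wakeup; wakeups come from the seg_pupdate()
+ * timeout or from seg_preap() when pageout is short of memory.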
+ */ + while ((pcp != (struct seg_pcache *)hp) && + (force || (purge_count <= seg_ppcount))) { + + /* + * purge entries which are not active and + * have not been used recently and + * have the SEGP_ASYNC_FLUSH flag. + * + * In the 'force' case, we ignore the + * SEGP_ASYNC_FLUSH flag. + */ + if (!(pcp->p_flags & SEGP_ASYNC_FLUSH)) + pcp->p_ref = 1; + if (force) + pcp->p_ref = 0; + if (!pcp->p_ref && !pcp->p_active) { + struct as *as = pcp->p_seg->s_as; + + /* + * try to get the readers lock on the address + * space before taking out the cache element. + * This ensures as_pagereclaim() can actually + * call through the address space and free + * the pages. If we don't get the lock, just + * skip this entry. The pages will be reclaimed + * by the segment driver at unmap time. + */ + if (AS_LOCK_TRYENTER(as, &as->a_lock, + RW_READER)) { + hp->p_qlen--; + pcp->p_hprev->p_hnext = pcp->p_hnext; + pcp->p_hnext->p_hprev = pcp->p_hprev; + pcp->p_hprev = delcallb_list; + delcallb_list = pcp; + purge_count++; + } + } else { + pcp->p_ref = 0; + } + pcp = pcp->p_hnext; + } + mutex_exit(&hp->p_hmutex); + if (!force && purge_count > seg_ppcount) + break; + } + + /* + * run the delayed callback list. We don't want to hold the + * cache lock during a call through the address space. + */ + while (delcallb_list != NULL) { + struct as *as; + + pcp = delcallb_list; + delcallb_list = pcp->p_hprev; + as = pcp->p_seg->s_as; + + PPRINT4("seg_ppurge_all: purge seg %p, addr %p, len %lx, " + "pplist %p\n", (void *)pcp->p_seg, (void *)pcp->p_addr, + pcp->p_len, (void *)pcp->p_pp); + + as_pagereclaim(as, pcp->p_pp, pcp->p_addr, + pcp->p_len, pcp->p_rw); + AS_LOCK_EXIT(as, &as->a_lock); + npages += pcp->p_len >> PAGESHIFT; + if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) { + npages_window += pcp->p_len >> PAGESHIFT; + } + kmem_free(pcp, sizeof (struct seg_pcache)); + } + mutex_enter(&seg_pmem); + seg_plocked -= npages; + seg_plocked_window -= npages_window; + mutex_exit(&seg_pmem); +} + +/* + * Remove cached pages for segment(s) entries from hashtable. + * The segments are identified by a given clients callback + * function. + * This is useful for multiple seg's cached on behalf of + * dummy segment (ISM/DISM) with common callback function. + * The clients callback function may return status indicating + * that the last seg's entry has been purged. In such a case + * the seg_ppurge_seg() stops searching hashtable and exits. + * Otherwise all hashtable entries are scanned. 
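+ *
+ * Illustrative call only (the callback name is a placeholder): an
+ * ISM/DISM segment driver passes the same reclaim function it
+ * registered with seg_pinsert() for its entries:
+ *
+ *	seg_ppurge_seg(my_spt_reclaim);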
+ */ +void +seg_ppurge_seg(int (*callback)(struct seg *, caddr_t, size_t, + struct page **, enum seg_rw)) +{ + struct seg_pcache *pcp, *npcp; + struct seg_phash *hp; + pgcnt_t npages = 0; + pgcnt_t npages_window = 0; + int done = 0; + + /* + * if the cache if off or empty, return + */ + if (seg_plazy == 0 || seg_plocked == 0) { + return; + } + mutex_enter(&seg_pcache); + seg_pdisable++; + mutex_exit(&seg_pcache); + + for (hp = p_hashtab; hp < &p_hashtab[p_hashsize]; hp++) { + + mutex_enter(&hp->p_hmutex); + pcp = hp->p_hnext; + while (pcp != (struct seg_pcache *)hp) { + + /* + * purge entries which are not active + */ + npcp = pcp->p_hnext; + if (!pcp->p_active && pcp->p_callback == callback) { + hp->p_qlen--; + pcp->p_hprev->p_hnext = pcp->p_hnext; + pcp->p_hnext->p_hprev = pcp->p_hprev; + + if ((*pcp->p_callback)(pcp->p_seg, pcp->p_addr, + pcp->p_len, pcp->p_pp, pcp->p_rw)) { + done = 1; + } + + npages += pcp->p_len >> PAGESHIFT; + if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) { + npages_window += + pcp->p_len >> PAGESHIFT; + } + kmem_free(pcp, sizeof (struct seg_pcache)); + } + pcp = npcp; + if (done) + break; + } + mutex_exit(&hp->p_hmutex); + if (done) + break; + } + + mutex_enter(&seg_pcache); + seg_pdisable--; + mutex_exit(&seg_pcache); + + mutex_enter(&seg_pmem); + seg_plocked -= npages; + seg_plocked_window -= npages_window; + mutex_exit(&seg_pmem); +} + +/* + * purge all entries for a given segment. Since we + * callback into the segment driver directly for page + * reclaim the caller needs to hold the right locks. + */ +void +seg_ppurge(struct seg *seg) +{ + struct seg_pcache *delcallb_list = NULL; + struct seg_pcache *pcp; + struct seg_phash *hp; + pgcnt_t npages = 0; + pgcnt_t npages_window = 0; + + if (seg_plazy == 0) { + return; + } + hp = &p_hashtab[p_hash(seg)]; + mutex_enter(&hp->p_hmutex); + pcp = hp->p_hnext; + while (pcp != (struct seg_pcache *)hp) { + if (pcp->p_seg == seg) { + if (pcp->p_active) { + break; + } + hp->p_qlen--; + pcp->p_hprev->p_hnext = pcp->p_hnext; + pcp->p_hnext->p_hprev = pcp->p_hprev; + pcp->p_hprev = delcallb_list; + delcallb_list = pcp; + } + pcp = pcp->p_hnext; + } + mutex_exit(&hp->p_hmutex); + while (delcallb_list != NULL) { + pcp = delcallb_list; + delcallb_list = pcp->p_hprev; + + PPRINT4("seg_ppurge: purge seg %p, addr %p, len %lx, " + "pplist %p\n", (void *)seg, (void *)pcp->p_addr, + pcp->p_len, (void *)pcp->p_pp); + + ASSERT(seg == pcp->p_seg); + (void) (*pcp->p_callback)(seg, pcp->p_addr, + pcp->p_len, pcp->p_pp, pcp->p_rw); + npages += pcp->p_len >> PAGESHIFT; + if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) { + npages_window += pcp->p_len >> PAGESHIFT; + } + kmem_free(pcp, sizeof (struct seg_pcache)); + } + mutex_enter(&seg_pmem); + seg_plocked -= npages; + seg_plocked_window -= npages_window; + mutex_exit(&seg_pmem); +} + +static void seg_pinit_mem_config(void); + +/* + * setup the pagelock cache + */ +static void +seg_pinit(void) +{ + struct seg_phash *hp; + int i; + uint_t physmegs; + + sema_init(&seg_psaync_sem, 0, NULL, SEMA_DEFAULT, NULL); + + mutex_enter(&seg_pcache); + if (p_hashtab == NULL) { + physmegs = physmem >> (20 - PAGESHIFT); + + /* If p_hashsize was not set in /etc/system ... */ + if (p_hashsize == 0) { + /* + * Choose p_hashsize based on physmem. 
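+ *
+ * For example, a machine with 512 MB of physical memory (physmegs ==
+ * 512) falls into the second range below and gets 1024 hash buckets.
+ * P_HASHMASK assumes p_hashsize is a power of two, so any /etc/system
+ * override should preserve that.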
+ */ + if (physmegs < 64) { + p_hashsize = 64; + } else if (physmegs < 1024) { + p_hashsize = 1024; + } else if (physmegs < 10 * 1024) { + p_hashsize = 8192; + } else if (physmegs < 20 * 1024) { + p_hashsize = 2 * 8192; + seg_pmaxqlen = 16; + } else { + p_hashsize = 128 * 1024; + seg_pmaxqlen = 128; + } + } + + p_hashtab = kmem_zalloc( + p_hashsize * sizeof (struct seg_phash), KM_SLEEP); + for (i = 0; i < p_hashsize; i++) { + hp = (struct seg_phash *)&p_hashtab[i]; + hp->p_hnext = (struct seg_pcache *)hp; + hp->p_hprev = (struct seg_pcache *)hp; + mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL); + } + if (seg_pwindow == 0) { + if (physmegs < 24) { + /* don't use cache */ + seg_plazy = 0; + } else if (physmegs < 64) { + seg_pwindow = physmem >> 5; /* 3% of memory */ + } else if (physmegs < 10 * 1024) { + seg_pwindow = physmem >> 3; /* 12% of memory */ + } else { + seg_pwindow = physmem >> 1; + } + } + } + mutex_exit(&seg_pcache); + + seg_pinit_mem_config(); +} + +/* + * called by pageout if memory is low + */ +void +seg_preap(void) +{ + /* + * if the cache if off or empty, return + */ + if (seg_plocked == 0 || seg_plazy == 0) { + return; + } + sema_v(&seg_psaync_sem); +} + +static void seg_pupdate(void *); + +/* + * run as a backgroud thread and reclaim pagelock + * pages which have not been used recently + */ +void +seg_pasync_thread(void) +{ + callb_cpr_t cpr_info; + kmutex_t pasync_lock; /* just for CPR stuff */ + + mutex_init(&pasync_lock, NULL, MUTEX_DEFAULT, NULL); + + CALLB_CPR_INIT(&cpr_info, &pasync_lock, + callb_generic_cpr, "seg_pasync"); + + if (seg_preap_interval == 0) { + seg_preap_interval = seg_preap_time * hz; + } else { + seg_preap_interval *= hz; + } + if (seg_plazy && seg_pupdate_active) { + (void) timeout(seg_pupdate, NULL, seg_preap_interval); + } + + for (;;) { + mutex_enter(&pasync_lock); + CALLB_CPR_SAFE_BEGIN(&cpr_info); + mutex_exit(&pasync_lock); + sema_p(&seg_psaync_sem); + mutex_enter(&pasync_lock); + CALLB_CPR_SAFE_END(&cpr_info, &pasync_lock); + mutex_exit(&pasync_lock); + + seg_ppurge_all(0); + } +} + +static void +seg_pupdate(void *dummy) +{ + sema_v(&seg_psaync_sem); + + if (seg_plazy && seg_pupdate_active) { + (void) timeout(seg_pupdate, dummy, seg_preap_interval); + } +} + +static struct kmem_cache *seg_cache; + +/* + * Initialize segment management data structures. + */ +void +seg_init(void) +{ + kstat_t *ksp; + + seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg), + 0, NULL, NULL, NULL, NULL, NULL, 0); + + ksp = kstat_create("unix", 0, "segadvstat", "vm", KSTAT_TYPE_NAMED, + segadvstat_ndata, KSTAT_FLAG_VIRTUAL); + if (ksp) { + ksp->ks_data = (void *)segadvstat_ptr; + kstat_install(ksp); + } + + seg_pinit(); +} + +/* + * Allocate a segment to cover [base, base+size] + * and attach it to the specified address space. 
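+ *
+ * Illustrative caller pattern only (driver names are placeholders);
+ * the returned segment is unusable until the caller supplies its ops
+ * vector and private data:
+ *
+ *	if ((seg = seg_alloc(as, addr, len)) == NULL)
+ *		return (ENOMEM);
+ *	seg->s_ops = &mydrv_segops;
+ *	seg->s_data = mydrv_data;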
+ */ +struct seg * +seg_alloc(struct as *as, caddr_t base, size_t size) +{ + struct seg *new; + caddr_t segbase; + size_t segsize; + + segbase = (caddr_t)((uintptr_t)base & (uintptr_t)PAGEMASK); + segsize = (((uintptr_t)(base + size) + PAGEOFFSET) & PAGEMASK) - + (uintptr_t)segbase; + + if (!valid_va_range(&segbase, &segsize, segsize, AH_LO)) + return ((struct seg *)NULL); /* bad virtual addr range */ + + if (as != &kas && + valid_usr_range(segbase, segsize, 0, as, + as->a_userlimit) != RANGE_OKAY) + return ((struct seg *)NULL); /* bad virtual addr range */ + + new = kmem_cache_alloc(seg_cache, KM_SLEEP); + new->s_ops = NULL; + new->s_data = NULL; + new->s_szc = 0; + new->s_flags = 0; + if (seg_attach(as, segbase, segsize, new) < 0) { + kmem_cache_free(seg_cache, new); + return ((struct seg *)NULL); + } + /* caller must fill in ops, data */ + return (new); +} + +/* + * Attach a segment to the address space. Used by seg_alloc() + * and for kernel startup to attach to static segments. + */ +int +seg_attach(struct as *as, caddr_t base, size_t size, struct seg *seg) +{ + seg->s_as = as; + seg->s_base = base; + seg->s_size = size; + + /* + * as_addseg() will add the segment at the appropraite point + * in the list. It will return -1 if there is overlap with + * an already existing segment. + */ + return (as_addseg(as, seg)); +} + +/* + * Unmap a segment and free it from its associated address space. + * This should be called by anybody who's finished with a whole segment's + * mapping. Just calls SEGOP_UNMAP() on the whole mapping . It is the + * responsibility of the segment driver to unlink the the segment + * from the address space, and to free public and private data structures + * associated with the segment. (This is typically done by a call to + * seg_free()). + */ +void +seg_unmap(struct seg *seg) +{ +#ifdef DEBUG + int ret; +#endif /* DEBUG */ + + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); + + /* Shouldn't have called seg_unmap if mapping isn't yet established */ + ASSERT(seg->s_data != NULL); + + /* Unmap the whole mapping */ +#ifdef DEBUG + ret = SEGOP_UNMAP(seg, seg->s_base, seg->s_size); + ASSERT(ret == 0); +#else + SEGOP_UNMAP(seg, seg->s_base, seg->s_size); +#endif /* DEBUG */ +} + +/* + * Free the segment from its associated as. This should only be called + * if a mapping to the segment has not yet been established (e.g., if + * an error occurs in the middle of doing an as_map when the segment + * has already been partially set up) or if it has already been deleted + * (e.g., from a segment driver unmap routine if the unmap applies to the + * entire segment). If the mapping is currently set up then seg_unmap() should + * be called instead. + */ +void +seg_free(struct seg *seg) +{ + register struct as *as = seg->s_as; + struct seg *tseg = as_removeseg(as, seg); + + ASSERT(tseg == seg); + + /* + * If the segment private data field is NULL, + * then segment driver is not attached yet. + */ + if (seg->s_data != NULL) + SEGOP_FREE(seg); + + kmem_cache_free(seg_cache, seg); +} + +/*ARGSUSED*/ +static void +seg_p_mem_config_post_add( + void *arg, + pgcnt_t delta_pages) +{ + /* Nothing to do. */ +} + +/* + * Attempt to purge seg_pcache. May need to return before this has + * completed to allow other pre_del callbacks to unlock pages. This is + * ok because: + * 1) The seg_pdisable flag has been set so at least we won't + * cache anymore locks and the locks we couldn't purge + * will not be held if they do get released by a subsequent + * pre-delete callback. 
+ * + * 2) The rest of the memory delete thread processing does not + * depend on the changes made in this pre-delete callback. No + * panics will result, the worst that will happen is that the + * DR code will timeout and cancel the delete. + */ +/*ARGSUSED*/ +static int +seg_p_mem_config_pre_del( + void *arg, + pgcnt_t delta_pages) +{ + pgcnt_t old_plocked; + int stall_count = 0; + + mutex_enter(&seg_pcache); + seg_pdisable++; + ASSERT(seg_pdisable != 0); + mutex_exit(&seg_pcache); + + /* + * Attempt to empty the cache. Terminate if seg_plocked does not + * diminish with SEGP_STALL_THRESHOLD consecutive attempts. + */ + while (seg_plocked != 0) { + old_plocked = seg_plocked; + seg_ppurge_all(1); + if (seg_plocked == old_plocked) { + if (stall_count++ > SEGP_STALL_THRESHOLD) { + cmn_err(CE_NOTE, "!Pre-delete couldn't purge" + " pagelock cache - continuing"); + break; + } + } else + stall_count = 0; + if (seg_plocked != 0) + delay(hz/SEGP_PREDEL_DELAY_FACTOR); + } + return (0); +} + +/*ARGSUSED*/ +static void +seg_p_mem_config_post_del( + void *arg, + pgcnt_t delta_pages, + int cancelled) +{ + mutex_enter(&seg_pcache); + ASSERT(seg_pdisable != 0); + seg_pdisable--; + mutex_exit(&seg_pcache); +} + +static kphysm_setup_vector_t seg_p_mem_config_vec = { + KPHYSM_SETUP_VECTOR_VERSION, + seg_p_mem_config_post_add, + seg_p_mem_config_pre_del, + seg_p_mem_config_post_del, +}; + +static void +seg_pinit_mem_config(void) +{ + int ret; + + ret = kphysm_setup_func_register(&seg_p_mem_config_vec, (void *)NULL); + /* + * Want to catch this in the debug kernel. At run time, if the + * callbacks don't get run all will be OK as the disable just makes + * it more likely that the pages can be collected. + */ + ASSERT(ret == 0); +} diff --git a/usr/src/uts/common/vm/vm_swap.c b/usr/src/uts/common/vm/vm_swap.c new file mode 100644 index 0000000000..d7028b6f29 --- /dev/null +++ b/usr/src/uts/common/vm/vm_swap.c @@ -0,0 +1,1590 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Each physical swap area has an associated bitmap representing + * its physical storage. The bitmap records which swap slots are + * currently allocated or freed. 
Allocation is done by searching + * through the bitmap for the first free slot. Thus, there's + * no linear relation between offset within the swap device and the + * address (within its segment(s)) of the page that the slot backs; + * instead, it's an arbitrary one-to-one mapping. + * + * Associated with each swap area is a swapinfo structure. These + * structures are linked into a linear list that determines the + * ordering of swap areas in the logical swap device. Each contains a + * pointer to the corresponding bitmap, the area's size, and its + * associated vnode. + */ + +#include <sys/types.h> +#include <sys/inttypes.h> +#include <sys/param.h> +#include <sys/t_lock.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/kmem.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/pathname.h> +#include <sys/cmn_err.h> +#include <sys/vtrace.h> +#include <sys/swap.h> +#include <sys/dumphdr.h> +#include <sys/debug.h> +#include <sys/fs/snode.h> +#include <sys/fs/swapnode.h> +#include <sys/policy.h> +#include <sys/zone.h> + +#include <vm/as.h> +#include <vm/seg.h> +#include <vm/page.h> +#include <vm/seg_vn.h> +#include <vm/hat.h> +#include <vm/anon.h> +#include <vm/seg_map.h> + +/* + * To balance the load among multiple swap areas, we don't allow + * more than swap_maxcontig allocations to be satisfied from a + * single swap area before moving on to the next swap area. This + * effectively "interleaves" allocations among the many swap areas. + */ +int swap_maxcontig; /* set by anon_init() to 1 Mb */ + +#define MINIROOTSIZE 12000 /* ~6 Meg XXX */ + +/* + * XXX - this lock is a kludge. It serializes some aspects of swapadd() and + * swapdel() (namely VOP_OPEN, VOP_CLOSE, VN_RELE). It protects against + * somebody swapadd'ing and getting swap slots from a vnode, while someone + * else is in the process of closing or rele'ing it. + */ +static kmutex_t swap_lock; + +kmutex_t swapinfo_lock; + +/* + * protected by the swapinfo_lock + */ +struct swapinfo *swapinfo; + +static struct swapinfo *silast; +static int nswapfiles; + +static u_offset_t swap_getoff(struct swapinfo *); +static int swapadd(struct vnode *, ulong_t, ulong_t, char *); +static int swapdel(struct vnode *, ulong_t); +static int swapslot_free(struct vnode *, u_offset_t, struct swapinfo *); + +/* + * swap device bitmap allocation macros + */ +#define MAPSHIFT 5 +#define NBBW (NBPW * NBBY) /* number of bits per word */ +#define TESTBIT(map, i) (((map)[(i) >> MAPSHIFT] & (1 << (i) % NBBW))) +#define SETBIT(map, i) (((map)[(i) >> MAPSHIFT] |= (1 << (i) % NBBW))) +#define CLEARBIT(map, i) (((map)[(i) >> MAPSHIFT] &= ~(1 << (i) % NBBW))) + +int swap_debug = 0; /* set for debug printf's */ +int swap_verify = 0; /* set to verify slots when freeing and allocating */ + +uint_t swapalloc_maxcontig; + +/* + * Allocate a range of up to *lenp contiguous slots (page) from a physical + * swap device. Flags are one of: + * SA_NOT Must have a slot from a physical swap device other than the + * the one containing input (*vpp, *offp). + * Less slots than requested may be returned. *lenp allocated slots are + * returned starting at *offp on *vpp. + * Returns 1 for a successful allocation, 0 for couldn't allocate any slots. + */ +int +swap_phys_alloc( + struct vnode **vpp, + u_offset_t *offp, + size_t *lenp, + uint_t flags) +{ + struct swapinfo *sip; + offset_t soff, noff; + size_t len; + + mutex_enter(&swapinfo_lock); + sip = silast; + + /* Find a desirable physical device and allocate from it. 
*/ + do { + if (sip == NULL) + break; + if (!(sip->si_flags & ST_INDEL) && + (spgcnt_t)sip->si_nfpgs > 0) { + /* Caller wants other than specified swap device */ + if (flags & SA_NOT) { + if (*vpp != sip->si_vp || + *offp < sip->si_soff || + *offp >= sip->si_eoff) + goto found; + /* Caller is loose, will take anything */ + } else + goto found; + } else if (sip->si_nfpgs == 0) + sip->si_allocs = 0; + if ((sip = sip->si_next) == NULL) + sip = swapinfo; + } while (sip != silast); + mutex_exit(&swapinfo_lock); + return (0); +found: + soff = swap_getoff(sip); + sip->si_nfpgs--; + if (soff == -1) + panic("swap_alloc: swap_getoff failed!"); + + for (len = PAGESIZE; len < *lenp; len += PAGESIZE) { + if (sip->si_nfpgs == 0) + break; + if (swapalloc_maxcontig && len >= swapalloc_maxcontig) + break; + noff = swap_getoff(sip); + if (noff == -1) { + break; + } else if (noff != soff + len) { + CLEARBIT(sip->si_swapslots, btop(noff - sip->si_soff)); + break; + } + sip->si_nfpgs--; + } + *vpp = sip->si_vp; + *offp = soff; + *lenp = len; + ASSERT((spgcnt_t)sip->si_nfpgs >= 0); + sip->si_allocs += btop(len); + if (sip->si_allocs >= swap_maxcontig) { + sip->si_allocs = 0; + if ((silast = sip->si_next) == NULL) + silast = swapinfo; + } + TRACE_2(TR_FAC_VM, TR_SWAP_ALLOC, + "swap_alloc:sip %p offset %lx", sip, soff); + mutex_exit(&swapinfo_lock); + return (1); +} + +int swap_backsearch = 0; + +/* + * Get a free offset on swap device sip. + * Return >=0 offset if succeeded, -1 for failure. + */ +static u_offset_t +swap_getoff(struct swapinfo *sip) +{ + uint_t *sp, *ep; + size_t aoff, boff, poff, slotnumber; + + ASSERT(MUTEX_HELD(&swapinfo_lock)); + + sip->si_alloccnt++; + for (sp = &sip->si_swapslots[sip->si_hint >> MAPSHIFT], + ep = &sip->si_swapslots[sip->si_mapsize / NBPW]; sp < ep; sp++) { + if (*sp != (uint_t)0xffffffff) + goto foundentry; + else + sip->si_checkcnt++; + } + SWAP_PRINT(SW_ALLOC, + "swap_getoff: couldn't find slot from hint %ld to end\n", + sip->si_hint, 0, 0, 0, 0); + /* + * Go backwards? Check for faster method XXX + */ + if (swap_backsearch) { + for (sp = &sip->si_swapslots[sip->si_hint >> MAPSHIFT], + ep = sip->si_swapslots; sp > ep; sp--) { + if (*sp != (uint_t)0xffffffff) + goto foundentry; + else + sip->si_checkcnt++; + } + } else { + for (sp = sip->si_swapslots, + ep = &sip->si_swapslots[sip->si_hint >> MAPSHIFT]; + sp < ep; sp++) { + if (*sp != (uint_t)0xffffffff) + goto foundentry; + else + sip->si_checkcnt++; + } + } + if (*sp == 0xffffffff) { + cmn_err(CE_WARN, "No free swap slots!"); + return ((u_offset_t)-1); + } + +foundentry: + /* + * aoff is the page number offset (in bytes) of the si_swapslots + * array element containing a free page + * + * boff is the page number offset of the free page + * (i.e. cleared bit) in si_swapslots[aoff]. + */ + aoff = ((char *)sp - (char *)sip->si_swapslots) * NBBY; + + for (boff = (sip->si_hint % NBBW); boff < NBBW; boff++) { + if (!TESTBIT(sip->si_swapslots, aoff + boff)) + goto foundslot; + else + sip->si_checkcnt++; + } + for (boff = 0; boff < (sip->si_hint % NBBW); boff++) { + if (!TESTBIT(sip->si_swapslots, aoff + boff)) + goto foundslot; + else + sip->si_checkcnt++; + } + panic("swap_getoff: didn't find slot in word hint %ld", sip->si_hint); + +foundslot: + /* + * Return the offset of the free page in swap device. + * Convert page number of byte offset and add starting + * offset of swap device. 
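+ *
+ * For example, if the free slot found is bit 3 of the map on a device
+ * whose si_soff is 0x20000, the value returned is ptob(3) + 0x20000.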
+ */ + slotnumber = aoff + boff; + SWAP_PRINT(SW_ALLOC, "swap_getoff: allocating slot %ld\n", + slotnumber, 0, 0, 0, 0); + poff = ptob(slotnumber); + if (poff + sip->si_soff >= sip->si_eoff) + printf("ptob(aoff(%ld) + boff(%ld))(%ld) >= eoff(%ld)\n", + aoff, boff, ptob(slotnumber), (long)sip->si_eoff); + ASSERT(poff < sip->si_eoff); + /* + * We could verify here that the slot isn't already allocated + * by looking through all the anon slots. + */ + SETBIT(sip->si_swapslots, slotnumber); + sip->si_hint = slotnumber + 1; /* hint = next slot */ + return (poff + sip->si_soff); +} + +/* + * Free a swap page. + */ +void +swap_phys_free(struct vnode *vp, u_offset_t off, size_t len) +{ + struct swapinfo *sip; + ssize_t pagenumber, npage; + + mutex_enter(&swapinfo_lock); + sip = swapinfo; + + do { + if (sip->si_vp == vp && + sip->si_soff <= off && off < sip->si_eoff) { + for (pagenumber = btop(off - sip->si_soff), + npage = btop(len) + pagenumber; + pagenumber < npage; pagenumber++) { + SWAP_PRINT(SW_ALLOC, + "swap_phys_free: freeing slot %ld on " + "sip %p\n", + pagenumber, sip, 0, 0, 0); + if (!TESTBIT(sip->si_swapslots, pagenumber)) { + panic( + "swap_phys_free: freeing free slot " + "%p,%lx\n", (void *)vp, + ptob(pagenumber) + sip->si_soff); + } + CLEARBIT(sip->si_swapslots, pagenumber); + sip->si_nfpgs++; + } + ASSERT(sip->si_nfpgs <= sip->si_npgs); + mutex_exit(&swapinfo_lock); + return; + } + } while ((sip = sip->si_next) != NULL); + panic("swap_phys_free"); + /*NOTREACHED*/ +} + +/* + * Return the anon struct corresponding for the given + * <vnode, off> if it is part of the virtual swap device. + * Return the anon struct if found, otherwise NULL. + */ +struct anon * +swap_anon(struct vnode *vp, u_offset_t off) +{ + struct anon *ap; + + ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(vp, off)])); + + for (ap = anon_hash[ANON_HASH(vp, off)]; ap != NULL; ap = ap->an_hash) { + if (ap->an_vp == vp && ap->an_off == off) + return (ap); + } + return (NULL); +} + + +/* + * Determine if the vp offset range overlap a swap device. + */ +int +swap_in_range(struct vnode *vp, u_offset_t offset, size_t len) +{ + struct swapinfo *sip; + u_offset_t eoff; + + eoff = offset + len; + ASSERT(eoff > offset); + + mutex_enter(&swapinfo_lock); + sip = swapinfo; + if (vp && sip) { + do { + if (vp != sip->si_vp || eoff <= sip->si_soff || + offset >= sip->si_eoff) + continue; + mutex_exit(&swapinfo_lock); + return (1); + } while ((sip = sip->si_next) != NULL); + } + mutex_exit(&swapinfo_lock); + return (0); +} + +/* + * See if name is one of our swap files + * even though lookupname failed. + * This can be used by swapdel to delete + * swap resources on remote machines + * where the link has gone down. + */ +static struct vnode * +swapdel_byname( + char *name, /* pathname to delete */ + ulong_t lowblk) /* Low block number of area to delete */ +{ + struct swapinfo **sipp, *osip; + u_offset_t soff; + + /* + * Find the swap file entry for the file to + * be deleted. Skip any entries that are in + * transition. + */ + + soff = ptob(btopr(lowblk << SCTRSHFT)); /* must be page aligned */ + + mutex_enter(&swapinfo_lock); + for (sipp = &swapinfo; (osip = *sipp) != NULL; sipp = &osip->si_next) { + if ((strcmp(osip->si_pname, name) == 0) && + (osip->si_soff == soff) && (osip->si_flags == 0)) { + struct vnode *vp = osip->si_vp; + + VN_HOLD(vp); + mutex_exit(&swapinfo_lock); + return (vp); + } + } + mutex_exit(&swapinfo_lock); + return (NULL); +} + + +/* + * New system call to manipulate swap files. 
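+ *
+ * Illustrative user-level usage only (see swapctl(2)); the kernel entry
+ * point below takes the same command and argument plus rv, which
+ * carries the call's return value back to the syscall layer:
+ *
+ *	int nswapdevs = swapctl(SC_GETNSWP, NULL);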
+ */ +int +swapctl(int sc_cmd, void *sc_arg, int *rv) +{ + struct swapinfo *sip, *csip, *tsip; + int error = 0; + struct swapent st, *ust; + struct swapres sr; + struct vnode *vp; + int cnt = 0; + int tmp_nswapfiles; + int nswap; + int length, nlen; + int gplen = 0, plen; + char *swapname; + char *pname; + char *tpname; + struct anoninfo ai; + spgcnt_t avail; + int global = INGLOBALZONE(curproc); + + /* + * When running in a zone we want to hide the details of the swap + * devices: we report there only being one swap device named "swap" + * having a size equal to the sum of the sizes of all real swap devices + * on the system. + */ + switch (sc_cmd) { + case SC_GETNSWP: + if (global) + *rv = nswapfiles; + else + *rv = 1; + return (0); + + case SC_AINFO: + /* + * Return anoninfo information with these changes: + * ani_max = maximum amount of swap space + * (including potentially available physical memory) + * ani_free = amount of unallocated anonymous memory + * (some of which might be reserved and including + * potentially available physical memory) + * ani_resv = amount of claimed (reserved) anonymous memory + */ + avail = MAX((spgcnt_t)(availrmem - swapfs_minfree), 0); + ai.ani_max = (k_anoninfo.ani_max + + k_anoninfo.ani_mem_resv) +avail; + + ai.ani_free = k_anoninfo.ani_free + avail; + + ai.ani_resv = k_anoninfo.ani_phys_resv + + k_anoninfo.ani_mem_resv; + + if (copyout(&ai, sc_arg, sizeof (struct anoninfo)) != 0) + return (EFAULT); + return (0); + + case SC_LIST: + if (copyin(sc_arg, &length, sizeof (int)) != 0) + return (EFAULT); + if (!global) { + struct swapent st; + char *swappath = "swap"; + + if (length < 1) + return (ENOMEM); + ust = (swapent_t *)((swaptbl_t *)sc_arg)->swt_ent; + if (copyin(ust, &st, sizeof (swapent_t)) != 0) + return (EFAULT); + st.ste_start = PAGESIZE >> SCTRSHFT; + st.ste_length = (off_t)0; + st.ste_pages = 0; + st.ste_free = 0; + st.ste_flags = 0; + mutex_enter(&swapinfo_lock); + for (sip = swapinfo, nswap = 0; + sip != NULL && nswap < nswapfiles; + sip = sip->si_next, nswap++) { + st.ste_length += + (sip->si_eoff - sip->si_soff) >> SCTRSHFT; + st.ste_pages += sip->si_npgs; + st.ste_free += sip->si_nfpgs; + } + mutex_exit(&swapinfo_lock); + if (copyout(&st, ust, sizeof (swapent_t)) != 0 || + copyout(swappath, st.ste_path, + strlen(swappath) + 1) != 0) { + return (EFAULT); + } + *rv = 1; + return (0); + } +beginning: + tmp_nswapfiles = nswapfiles; + /* Return an error if not enough space for the whole table. */ + if (length < tmp_nswapfiles) + return (ENOMEM); + /* + * Get memory to hold the swap entries and their names. We'll + * copy the real entries into these and then copy these out. + * Allocating the pathname memory is only a guess so we may + * find that we need more and have to do it again. + * All this is because we have to hold the anon lock while + * traversing the swapinfo list, and we can't be doing copyouts + * and/or kmem_alloc()s during this. 
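+ *
+ * (The pathname space is guessed at 100 bytes per entry; if the guess
+ * proves too small, the code below frees pname and retries with gplen
+ * grown by another 100, and if nswapfiles itself changed it restarts
+ * the whole snapshot.)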
+ */ + csip = kmem_zalloc(tmp_nswapfiles * sizeof (struct swapinfo), + KM_SLEEP); +retry: + nlen = tmp_nswapfiles * (gplen += 100); + pname = kmem_zalloc(nlen, KM_SLEEP); + + mutex_enter(&swapinfo_lock); + + if (tmp_nswapfiles != nswapfiles) { + mutex_exit(&swapinfo_lock); + kmem_free(pname, nlen); + kmem_free(csip, + tmp_nswapfiles * sizeof (struct swapinfo)); + gplen = 0; + goto beginning; + } + for (sip = swapinfo, tsip = csip, tpname = pname, nswap = 0; + sip && nswap < tmp_nswapfiles; + sip = sip->si_next, tsip++, tpname += plen, nswap++) { + plen = sip->si_pnamelen; + if (tpname + plen - pname > nlen) { + mutex_exit(&swapinfo_lock); + kmem_free(pname, nlen); + goto retry; + } + *tsip = *sip; + tsip->si_pname = tpname; + (void) strcpy(tsip->si_pname, sip->si_pname); + } + mutex_exit(&swapinfo_lock); + + if (sip) { + error = ENOMEM; + goto lout; + } + ust = (swapent_t *)((swaptbl_t *)sc_arg)->swt_ent; + for (tsip = csip, cnt = 0; cnt < nswap; tsip++, ust++, cnt++) { + if (copyin(ust, &st, sizeof (swapent_t)) != 0) { + error = EFAULT; + goto lout; + } + st.ste_flags = tsip->si_flags; + st.ste_length = + (tsip->si_eoff - tsip->si_soff) >> SCTRSHFT; + st.ste_start = tsip->si_soff >> SCTRSHFT; + st.ste_pages = tsip->si_npgs; + st.ste_free = tsip->si_nfpgs; + if (copyout(&st, ust, sizeof (swapent_t)) != 0) { + error = EFAULT; + goto lout; + } + if (!tsip->si_pnamelen) + continue; + if (copyout(tsip->si_pname, st.ste_path, + tsip->si_pnamelen) != 0) { + error = EFAULT; + goto lout; + } + } + *rv = nswap; +lout: + kmem_free(csip, tmp_nswapfiles * sizeof (struct swapinfo)); + kmem_free(pname, nlen); + return (error); + + case SC_ADD: + case SC_REMOVE: + break; + default: + return (EINVAL); + } + if ((error = secpolicy_swapctl(CRED())) != 0) + return (error); + + if (copyin(sc_arg, &sr, sizeof (swapres_t))) + return (EFAULT); + + /* Allocate the space to read in pathname */ + if ((swapname = kmem_alloc(MAXPATHLEN, KM_NOSLEEP)) == NULL) + return (ENOMEM); + + error = copyinstr(sr.sr_name, swapname, MAXPATHLEN, 0); + if (error) + goto out; + + error = lookupname(swapname, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); + if (error) { + if (sc_cmd == SC_ADD) + goto out; + /* see if we match by name */ + vp = swapdel_byname(swapname, (size_t)sr.sr_start); + if (vp == NULL) + goto out; + } + + if (vp->v_flag & (VNOMAP | VNOSWAP)) { + VN_RELE(vp); + error = ENOSYS; + goto out; + } + switch (vp->v_type) { + case VBLK: + break; + + case VREG: + if (vp->v_vfsp && vn_is_readonly(vp)) + error = EROFS; + else + error = VOP_ACCESS(vp, VREAD|VWRITE, 0, CRED()); + break; + + case VDIR: + error = EISDIR; + break; + default: + error = ENOSYS; + break; + } + if (error == 0) { + if (sc_cmd == SC_REMOVE) + error = swapdel(vp, sr.sr_start); + else + error = swapadd(vp, sr.sr_start, + sr.sr_length, swapname); + } + VN_RELE(vp); +out: + kmem_free(swapname, MAXPATHLEN); + return (error); +} + +#if defined(_LP64) && defined(_SYSCALL32) + +int +swapctl32(int sc_cmd, void *sc_arg, int *rv) +{ + struct swapinfo *sip, *csip, *tsip; + int error = 0; + struct swapent32 st, *ust; + struct swapres32 sr; + struct vnode *vp; + int cnt = 0; + int tmp_nswapfiles; + int nswap; + int length, nlen; + int gplen = 0, plen; + char *swapname; + char *pname; + char *tpname; + struct anoninfo32 ai; + size_t s; + spgcnt_t avail; + + switch (sc_cmd) { + case SC_GETNSWP: + *rv = nswapfiles; + return (0); + + case SC_AINFO: + /* + * Return anoninfo information with these changes: + * ani_max = maximum amount of swap space + * (including potentially 
available physical memory) + * ani_free = amount of unallocated anonymous memory + * (some of which might be reserved and including + * potentially available physical memory) + * ani_resv = amount of claimed (reserved) anonymous memory + */ + avail = MAX((spgcnt_t)(availrmem - swapfs_minfree), 0); + s = (k_anoninfo.ani_max + k_anoninfo.ani_mem_resv) + avail; + if (s > UINT32_MAX) + return (EOVERFLOW); + ai.ani_max = s; + + s = k_anoninfo.ani_free + avail; + if (s > UINT32_MAX) + return (EOVERFLOW); + ai.ani_free = s; + + s = k_anoninfo.ani_phys_resv + k_anoninfo.ani_mem_resv; + if (s > UINT32_MAX) + return (EOVERFLOW); + ai.ani_resv = s; + + if (copyout(&ai, sc_arg, sizeof (ai)) != 0) + return (EFAULT); + return (0); + + case SC_LIST: + if (copyin(sc_arg, &length, sizeof (int32_t)) != 0) + return (EFAULT); +beginning: + tmp_nswapfiles = nswapfiles; + /* Return an error if not enough space for the whole table. */ + if (length < tmp_nswapfiles) + return (ENOMEM); + /* + * Get memory to hold the swap entries and their names. We'll + * copy the real entries into these and then copy these out. + * Allocating the pathname memory is only a guess so we may + * find that we need more and have to do it again. + * All this is because we have to hold the anon lock while + * traversing the swapinfo list, and we can't be doing copyouts + * and/or kmem_alloc()s during this. + */ + csip = kmem_zalloc(tmp_nswapfiles * sizeof (*csip), KM_SLEEP); +retry: + nlen = tmp_nswapfiles * (gplen += 100); + pname = kmem_zalloc(nlen, KM_SLEEP); + + mutex_enter(&swapinfo_lock); + + if (tmp_nswapfiles != nswapfiles) { + mutex_exit(&swapinfo_lock); + kmem_free(pname, nlen); + kmem_free(csip, tmp_nswapfiles * sizeof (*csip)); + gplen = 0; + goto beginning; + } + for (sip = swapinfo, tsip = csip, tpname = pname, nswap = 0; + (sip != NULL) && (nswap < tmp_nswapfiles); + sip = sip->si_next, tsip++, tpname += plen, nswap++) { + plen = sip->si_pnamelen; + if (tpname + plen - pname > nlen) { + mutex_exit(&swapinfo_lock); + kmem_free(pname, nlen); + goto retry; + } + *tsip = *sip; + tsip->si_pname = tpname; + (void) strcpy(tsip->si_pname, sip->si_pname); + } + mutex_exit(&swapinfo_lock); + + if (sip != NULL) { + error = ENOMEM; + goto lout; + } + ust = (swapent32_t *)((swaptbl32_t *)sc_arg)->swt_ent; + for (tsip = csip, cnt = 0; cnt < nswap; tsip++, ust++, cnt++) { + if (copyin(ust, &st, sizeof (*ust)) != 0) { + error = EFAULT; + goto lout; + } + st.ste_flags = tsip->si_flags; + st.ste_length = + (tsip->si_eoff - tsip->si_soff) >> SCTRSHFT; + st.ste_start = tsip->si_soff >> SCTRSHFT; + st.ste_pages = tsip->si_npgs; + st.ste_free = tsip->si_nfpgs; + if (copyout(&st, ust, sizeof (st)) != 0) { + error = EFAULT; + goto lout; + } + if (!tsip->si_pnamelen) + continue; + if (copyout(tsip->si_pname, + (caddr_t)(uintptr_t)st.ste_path, + tsip->si_pnamelen) != 0) { + error = EFAULT; + goto lout; + } + } + *rv = nswap; +lout: + kmem_free(csip, tmp_nswapfiles * sizeof (*csip)); + kmem_free(pname, nlen); + return (error); + + case SC_ADD: + case SC_REMOVE: + break; + default: + return (EINVAL); + } + if ((error = secpolicy_swapctl(CRED())) != 0) + return (error); + + if (copyin(sc_arg, &sr, sizeof (sr))) + return (EFAULT); + + /* Allocate the space to read in pathname */ + if ((swapname = kmem_alloc(MAXPATHLEN, KM_NOSLEEP)) == NULL) + return (ENOMEM); + + error = copyinstr((caddr_t)(uintptr_t)sr.sr_name, + swapname, MAXPATHLEN, NULL); + if (error) + goto out; + + error = lookupname(swapname, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); + if (error) { 
+ if (sc_cmd == SC_ADD) + goto out; + /* see if we match by name */ + vp = swapdel_byname(swapname, (uint_t)sr.sr_start); + if (vp == NULL) + goto out; + } + + if (vp->v_flag & (VNOMAP | VNOSWAP)) { + VN_RELE(vp); + error = ENOSYS; + goto out; + } + switch (vp->v_type) { + case VBLK: + break; + + case VREG: + if (vp->v_vfsp && vn_is_readonly(vp)) + error = EROFS; + else + error = VOP_ACCESS(vp, VREAD|VWRITE, 0, CRED()); + break; + + case VDIR: + error = EISDIR; + break; + default: + error = ENOSYS; + break; + } + if (error == 0) { + if (sc_cmd == SC_REMOVE) + error = swapdel(vp, sr.sr_start); + else + error = swapadd(vp, sr.sr_start, sr.sr_length, + swapname); + } + VN_RELE(vp); +out: + kmem_free(swapname, MAXPATHLEN); + return (error); +} + +#endif /* _LP64 && _SYSCALL32 */ + +/* + * Add a new swap file. + */ +int +swapadd(struct vnode *vp, ulong_t lowblk, ulong_t nblks, char *swapname) +{ + struct swapinfo **sipp, *nsip = NULL, *esip = NULL; + struct vnode *cvp; + struct vattr vattr; + pgcnt_t pages; + u_offset_t soff, eoff; + int error; + ssize_t i, start, end; + ushort_t wasswap; + ulong_t startblk; + size_t returned_mem; + + SWAP_PRINT(SW_CTL, "swapadd: vp %p lowblk %ld nblks %ld swapname %s\n", + vp, lowblk, nblks, swapname, 0); + /* + * Get the real vnode. (If vp is not a specnode it just returns vp, so + * it does the right thing, but having this code know about specnodes + * violates the spirit of having it be indepedent of vnode type.) + */ + cvp = common_specvp(vp); + + /* + * Or in VISSWAP so file system has chance to deny swap-ons during open. + */ + mutex_enter(&cvp->v_lock); + wasswap = cvp->v_flag & VISSWAP; + cvp->v_flag |= VISSWAP; + mutex_exit(&cvp->v_lock); + + mutex_enter(&swap_lock); + if (error = VOP_OPEN(&cvp, FREAD|FWRITE, CRED())) { + mutex_exit(&swap_lock); + /* restore state of v_flag */ + if (!wasswap) { + mutex_enter(&cvp->v_lock); + cvp->v_flag &= ~VISSWAP; + mutex_exit(&cvp->v_lock); + } + return (error); + } + mutex_exit(&swap_lock); + + /* + * Get partition size. Return error if empty partition, + * or if request does not fit within the partition. + * If this is the first swap device, we can reduce + * the size of the swap area to match what is + * available. This can happen if the system was built + * on a machine with a different size swap partition. + */ + vattr.va_mask = AT_SIZE; + if (error = VOP_GETATTR(cvp, &vattr, ATTR_COMM, CRED())) + goto out; + + /* + * Specfs returns a va_size of MAXOFFSET_T (UNKNOWN_SIZE) when the + * size of the device can't be determined. + */ + if ((vattr.va_size == 0) || (vattr.va_size == MAXOFFSET_T)) { + error = EINVAL; + goto out; + } + +#ifdef _ILP32 + /* + * No support for large swap in 32-bit OS, if the size of the swap is + * bigger than MAXOFF32_T then the size used by swapfs must be limited. + * This limitation is imposed by the swap subsystem itself, a D_64BIT + * driver as the target of swap operation should be able to field + * the IO. 
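+ *
+ * For example, a 4 GB swap slice added to a 32-bit kernel is limited
+ * here to MAXOFF32_T (2 GB - 1) bytes; the cmn_err() below notes the
+ * truncation.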
+ */ + if (vattr.va_size > MAXOFF32_T) { + cmn_err(CE_NOTE, + "!swap device %s truncated from 0x%llx to 0x%x bytes", + swapname, vattr.va_size, MAXOFF32_T); + vattr.va_size = MAXOFF32_T; + } +#endif /* _ILP32 */ + + /* Fail if file not writeable (try to set size to current size) */ + vattr.va_mask = AT_SIZE; + if (error = VOP_SETATTR(cvp, &vattr, 0, CRED(), NULL)) + goto out; + + /* Fail if fs does not support VOP_PAGEIO */ + error = VOP_PAGEIO(cvp, (page_t *)NULL, (u_offset_t)0, 0, 0, CRED()); + + if (error == ENOSYS) + goto out; + else + error = 0; + /* + * If swapping on the root filesystem don't put swap blocks that + * correspond to the miniroot filesystem on the swap free list. + */ + if (cvp == rootdir) + startblk = roundup(MINIROOTSIZE<<SCTRSHFT, klustsize)>>SCTRSHFT; + else /* Skip 1st page (disk label) */ + startblk = (ulong_t)(lowblk ? lowblk : 1); + + soff = startblk << SCTRSHFT; + if (soff >= vattr.va_size) { + error = EINVAL; + goto out; + } + + /* + * If user specified 0 blks, use the size of the device + */ + eoff = nblks ? soff + (nblks - (startblk - lowblk) << SCTRSHFT) : + vattr.va_size; + + SWAP_PRINT(SW_CTL, "swapadd: va_size %ld soff %ld eoff %ld\n", + vattr.va_size, soff, eoff, 0, 0); + + if (eoff > vattr.va_size) { + error = EINVAL; + goto out; + } + + /* + * The starting and ending offsets must be page aligned. + * Round soff up to next page boundary, round eoff + * down to previous page boundary. + */ + soff = ptob(btopr(soff)); + eoff = ptob(btop(eoff)); + if (soff >= eoff) { + SWAP_PRINT(SW_CTL, "swapadd: soff %ld >= eoff %ld\n", + soff, eoff, 0, 0, 0); + error = EINVAL; + goto out; + } + + pages = btop(eoff - soff); + + /* Allocate and partially set up the new swapinfo */ + nsip = kmem_zalloc(sizeof (struct swapinfo), KM_SLEEP); + nsip->si_vp = cvp; + + nsip->si_soff = soff; + nsip->si_eoff = eoff; + nsip->si_hint = 0; + nsip->si_checkcnt = nsip->si_alloccnt = 0; + + nsip->si_pnamelen = (int)strlen(swapname) + 1; + nsip->si_pname = (char *)kmem_zalloc(nsip->si_pnamelen, KM_SLEEP); + bcopy(swapname, nsip->si_pname, nsip->si_pnamelen - 1); + SWAP_PRINT(SW_CTL, "swapadd: allocating swapinfo for %s, %ld pages\n", + swapname, pages, 0, 0, 0); + /* + * Size of swapslots map in bytes + */ + nsip->si_mapsize = P2ROUNDUP(pages, NBBW) / NBBY; + nsip->si_swapslots = kmem_zalloc(nsip->si_mapsize, KM_SLEEP); + + /* + * Permanently set the bits that can't ever be allocated, + * i.e. those from the ending offset to the round up slot for the + * swapslots bit map. + */ + start = pages; + end = P2ROUNDUP(pages, NBBW); + for (i = start; i < end; i++) { + SWAP_PRINT(SW_CTL, "swapadd: set bit for page %ld\n", i, + 0, 0, 0, 0); + SETBIT(nsip->si_swapslots, i); + } + nsip->si_npgs = nsip->si_nfpgs = pages; + /* + * Now check to see if we can add it. We wait til now to check because + * we need the swapinfo_lock and we don't want sleep with it (e.g., + * during kmem_alloc()) while we're setting up the swapinfo. + */ + mutex_enter(&swapinfo_lock); + for (sipp = &swapinfo; (esip = *sipp) != NULL; sipp = &esip->si_next) { + if (esip->si_vp == cvp) { + if (esip->si_soff == soff && esip->si_npgs == pages && + (esip->si_flags & ST_DOINGDEL)) { + /* + * We are adding a device that we are in the + * middle of deleting. Just clear the + * ST_DOINGDEL flag to signal this and + * the deletion routine will eventually notice + * it and add it back. 
+ */ + esip->si_flags &= ~ST_DOINGDEL; + mutex_exit(&swapinfo_lock); + goto out; + } + /* disallow overlapping swap files */ + if ((soff < esip->si_eoff) && (eoff > esip->si_soff)) { + error = EEXIST; + mutex_exit(&swapinfo_lock); + goto out; + } + } + } + + nswapfiles++; + + /* + * add new swap device to list and shift allocations to it + * before updating the anoninfo counters + */ + *sipp = nsip; + silast = nsip; + + /* + * Update the total amount of reservable swap space + * accounting properly for swap space from physical memory + */ + /* New swap device soaks up currently reserved memory swap */ + mutex_enter(&anoninfo_lock); + + ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); + ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); + + k_anoninfo.ani_max += pages; + ANI_ADD(pages); + if (k_anoninfo.ani_mem_resv > k_anoninfo.ani_locked_swap) { + returned_mem = MIN(k_anoninfo.ani_mem_resv - + k_anoninfo.ani_locked_swap, + k_anoninfo.ani_max - k_anoninfo.ani_phys_resv); + + ANI_ADD(-returned_mem); + k_anoninfo.ani_free -= returned_mem; + k_anoninfo.ani_mem_resv -= returned_mem; + k_anoninfo.ani_phys_resv += returned_mem; + + mutex_enter(&freemem_lock); + availrmem += returned_mem; + mutex_exit(&freemem_lock); + } + /* + * At boot time, to permit booting small memory machines using + * only physical memory as swap space, we allowed a dangerously + * large amount of memory to be used as swap space; now that + * more physical backing store is available bump down the amount + * we can get from memory to a safer size. + */ + if (swapfs_minfree < swapfs_desfree) { + mutex_enter(&freemem_lock); + if (availrmem > swapfs_desfree || !k_anoninfo.ani_mem_resv) + swapfs_minfree = swapfs_desfree; + mutex_exit(&freemem_lock); + } + + SWAP_PRINT(SW_CTL, "swapadd: ani_max %ld ani_free %ld\n", + k_anoninfo.ani_free, k_anoninfo.ani_free, 0, 0, 0); + + mutex_exit(&anoninfo_lock); + + mutex_exit(&swapinfo_lock); + + /* Initialize the dump device */ + mutex_enter(&dump_lock); + if (dumpvp == NULL) + (void) dumpinit(vp, swapname, 0); + mutex_exit(&dump_lock); + + VN_HOLD(cvp); +out: + if (error || esip) { + SWAP_PRINT(SW_CTL, "swapadd: error (%d)\n", error, 0, 0, 0, 0); + + if (!wasswap) { + mutex_enter(&cvp->v_lock); + cvp->v_flag &= ~VISSWAP; + mutex_exit(&cvp->v_lock); + } + if (nsip) { + kmem_free(nsip->si_swapslots, (size_t)nsip->si_mapsize); + kmem_free(nsip->si_pname, nsip->si_pnamelen); + kmem_free(nsip, sizeof (*nsip)); + } + mutex_enter(&swap_lock); + (void) VOP_CLOSE(cvp, FREAD|FWRITE, 1, (offset_t)0, CRED()); + mutex_exit(&swap_lock); + } + return (error); +} + +/* + * Delete a swap file. + */ +static int +swapdel( + struct vnode *vp, + ulong_t lowblk) /* Low block number of area to delete. */ +{ + struct swapinfo **sipp, *osip = NULL; + struct vnode *cvp; + u_offset_t soff; + int error = 0; + u_offset_t toff = 0; + struct vnode *tvp = NULL; + spgcnt_t pages; + struct anon **app, *ap; + kmutex_t *ahm; + pgcnt_t adjust_swap = 0; + + /* Find the swap file entry for the file to be deleted */ + cvp = common_specvp(vp); + + + lowblk = lowblk ? lowblk : 1; /* Skip first page (disk label) */ + soff = ptob(btopr(lowblk << SCTRSHFT)); /* must be page aligned */ + + mutex_enter(&swapinfo_lock); + for (sipp = &swapinfo; (osip = *sipp) != NULL; sipp = &osip->si_next) { + if ((osip->si_vp == cvp) && + (osip->si_soff == soff) && (osip->si_flags == 0)) + break; + } + + /* If the file was not found, error. 
*/ + if (osip == NULL) { + error = EINVAL; + mutex_exit(&swapinfo_lock); + goto out; + } + + pages = osip->si_npgs; + + /* + * Do not delete if we will be low on swap pages. + */ + mutex_enter(&anoninfo_lock); + + ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); + ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); + + mutex_enter(&freemem_lock); + if (((k_anoninfo.ani_max - k_anoninfo.ani_phys_resv) + + MAX((spgcnt_t)(availrmem - swapfs_minfree), 0)) < pages) { + mutex_exit(&freemem_lock); + mutex_exit(&anoninfo_lock); + error = ENOMEM; + cmn_err(CE_WARN, "swapdel - too few free pages"); + mutex_exit(&swapinfo_lock); + goto out; + } + mutex_exit(&freemem_lock); + + k_anoninfo.ani_max -= pages; + + /* If needed, reserve memory swap to replace old device */ + if (k_anoninfo.ani_phys_resv > k_anoninfo.ani_max) { + adjust_swap = k_anoninfo.ani_phys_resv - k_anoninfo.ani_max; + k_anoninfo.ani_phys_resv -= adjust_swap; + k_anoninfo.ani_mem_resv += adjust_swap; + mutex_enter(&freemem_lock); + availrmem -= adjust_swap; + mutex_exit(&freemem_lock); + ANI_ADD(adjust_swap); + } + ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); + ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); + mutex_exit(&anoninfo_lock); + + ANI_ADD(-pages); + + /* + * Set the delete flag. This prevents anyone from allocating more + * pages from this file. Also set ST_DOINGDEL. Someone who wants to + * add the file back while we're deleting it will signify by clearing + * this flag. + */ + osip->si_flags |= ST_INDEL|ST_DOINGDEL; + mutex_exit(&swapinfo_lock); + + /* + * Free all the allocated physical slots for this file. We do this + * by walking through the entire anon hash array, because we need + * to update all the anon slots that have physical swap slots on + * this file, and this is the only way to find them all. We go back + * to the beginning of a bucket after each slot is freed because the + * anonhash_lock is not held during the free and thus the hash table + * may change under us. + */ + for (app = anon_hash; app < &anon_hash[ANON_HASH_SIZE]; app++) { + ahm = &anonhash_lock[(app-anon_hash) & (AH_LOCK_SIZE - 1)]; + mutex_enter(ahm); +top: + for (ap = *app; ap != NULL; ap = ap->an_hash) { + if (ap->an_pvp == cvp && + ap->an_poff >= osip->si_soff && + ap->an_poff < osip->si_eoff) { + ASSERT(TESTBIT(osip->si_swapslots, + btop((size_t)(ap->an_poff - + osip->si_soff)))); + tvp = ap->an_vp; + toff = ap->an_off; + VN_HOLD(tvp); + mutex_exit(ahm); + + error = swapslot_free(tvp, toff, osip); + + VN_RELE(tvp); + mutex_enter(ahm); + if (!error && (osip->si_flags & ST_DOINGDEL)) { + goto top; + } else { + if (error) { + cmn_err(CE_WARN, + "swapslot_free failed %d", + error); + } + + /* + * Add device back before making it + * visible. + */ + mutex_enter(&swapinfo_lock); + osip->si_flags &= + ~(ST_INDEL | ST_DOINGDEL); + mutex_exit(&swapinfo_lock); + + /* + * Update the anon space available + */ + mutex_enter(&anoninfo_lock); + + k_anoninfo.ani_phys_resv += adjust_swap; + k_anoninfo.ani_mem_resv -= adjust_swap; + k_anoninfo.ani_max += pages; + + mutex_enter(&freemem_lock); + availrmem += adjust_swap; + mutex_exit(&freemem_lock); + + mutex_exit(&anoninfo_lock); + + ANI_ADD(pages); + + mutex_exit(ahm); + goto out; + } + } + } + mutex_exit(ahm); + } + + /* All done, they'd better all be free! 
*/ + mutex_enter(&swapinfo_lock); + ASSERT(osip->si_nfpgs == osip->si_npgs); + + /* Now remove it from the swapinfo list */ + for (sipp = &swapinfo; *sipp != NULL; sipp = &(*sipp)->si_next) { + if (*sipp == osip) + break; + } + ASSERT(*sipp); + *sipp = osip->si_next; + if (silast == osip) + if ((silast = osip->si_next) == NULL) + silast = swapinfo; + nswapfiles--; + mutex_exit(&swapinfo_lock); + + kmem_free(osip->si_swapslots, osip->si_mapsize); + kmem_free(osip->si_pname, osip->si_pnamelen); + kmem_free(osip, sizeof (*osip)); + + mutex_enter(&dump_lock); + if (cvp == dumpvp) + dumpfini(); + mutex_exit(&dump_lock); + + /* Release the vnode */ + + mutex_enter(&swap_lock); + (void) VOP_CLOSE(cvp, FREAD|FWRITE, 1, (offset_t)0, CRED()); + mutex_enter(&cvp->v_lock); + cvp->v_flag &= ~VISSWAP; + mutex_exit(&cvp->v_lock); + VN_RELE(cvp); + mutex_exit(&swap_lock); +out: + return (error); +} + +/* + * Free up a physical swap slot on swapinfo sip, currently in use by the + * anonymous page whose name is (vp, off). + */ +static int +swapslot_free( + struct vnode *vp, + u_offset_t off, + struct swapinfo *sip) +{ + struct page *pl[2], *pp; + struct anon *ap = NULL; + int error = 0; + kmutex_t *ahm; + + /* + * Get the page for the old swap slot and i/o lock it. + * Users of the physical slot will synchronize on the i/o lock. + */ + if (error = VOP_GETPAGE(vp, (offset_t)off, ptob(1), NULL, + pl, ptob(1), segkmap, NULL, S_READ, CRED())) { + /* + * Anon slot went away (EIDRM) or vp was truncated (EFAULT) + * while we got the page. Thus the physical slot must be + * free, so we have succeeded. + */ + if (error == EIDRM || error == EFAULT) + error = 0; + return (error); + } + pp = pl[0]; + page_io_lock(pp); + + ahm = &anonhash_lock[AH_LOCK(vp, off)]; + mutex_enter(ahm); + /* + * Get the anon slot; anon struct cannot vanish while we hold + * SE_SHARED lock on the physical page since anon_decref() blocks + * in page_lookup() before it can proceed further to remove + * anon struct from anon_hash table. + */ + if ((ap = swap_anon(vp, off)) == NULL) { + panic("swapslot_free(%p, %llx, %p), page: %p, null anon", + vp, off, sip, pp); + } + /* + * Free the physical slot. It may have been freed up and replaced with + * another one while we were getting the page so we have to re-verify + * that this is really one we want. If we do free the slot we have + * to mark the page modified, as its backing store is now gone. + */ + if (ap->an_pvp == sip->si_vp && ap->an_poff >= sip->si_soff && + ap->an_poff < sip->si_eoff) { + swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE); + ap->an_pvp = NULL; + ap->an_poff = NULL; + mutex_exit(ahm); + hat_setmod(pp); + } else { + mutex_exit(ahm); + } +out: + /* Release the page locks */ + page_unlock(pp); + page_io_unlock(pp); + return (error); +} + +/* + * Get contig physical backing store for vp, in the range + * [*offp, *offp + *lenp), May back a subrange of this, but must + * always include the requested offset or fail. Returns the offsets + * backed as [*offp, *offp + *lenp) and the physical offsets used to + * back them from *pvpp in the range [*pstartp, *pstartp + *lenp). 
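
The swapinfo removal in swapdel() above also shows the pointer-to-pointer unlink idiom: sipp walks the chain of si_next pointers rather than the nodes themselves, so deleting the head entry needs no special case. A generic restatement of the idiom (node type and key are illustrative, not from the kernel):

    #include <stddef.h>

    struct node {
        struct node *next;
        int key;
    };

    /* Remove the first node carrying key; the head needs no special case. */
    static void
    unlink_node(struct node **headp, int key)
    {
        struct node **npp;

        for (npp = headp; *npp != NULL; npp = &(*npp)->next) {
            if ((*npp)->key == key) {
                *npp = (*npp)->next;    /* splice the node out */
                break;
            }
        }
    }
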
+ * Returns 0 for success + * SE_NOANON -- no anon slot for requested paged + * SE_NOSWAP -- no physical swap space available + */ +int +swap_newphysname( + struct vnode *vp, + u_offset_t offset, + u_offset_t *offp, + size_t *lenp, + struct vnode **pvpp, + u_offset_t *poffp) +{ + struct anon *ap = NULL; /* anon slot for vp, off */ + int error = 0; + struct vnode *pvp; + u_offset_t poff, pstart, prem; + size_t plen; + u_offset_t off, start; + kmutex_t *ahm; + + ASSERT(*offp <= offset && offset < *offp + *lenp); + + /* Get new physical swap slots. */ + plen = *lenp; + if (!swap_phys_alloc(&pvp, &pstart, &plen, 0)) { + /* + * No swap available so return error unless requested + * offset is already backed in which case return that. + */ + ahm = &anonhash_lock[AH_LOCK(vp, offset)]; + mutex_enter(ahm); + if ((ap = swap_anon(vp, offset)) == NULL) { + error = SE_NOANON; + mutex_exit(ahm); + return (error); + } + error = (ap->an_pvp ? 0 : SE_NOSWAP); + *offp = offset; + *lenp = PAGESIZE; + *pvpp = ap->an_pvp; + *poffp = ap->an_poff; + mutex_exit(ahm); + return (error); + } + + /* + * We got plen (<= *lenp) contig slots. Use these to back a + * subrange of [*offp, *offp + *lenp) which includes offset. + * For now we just put offset at the end of the kluster. + * Clearly there are other possible choices - which is best? + */ + start = MAX(*offp, + (offset + PAGESIZE > plen) ? (offset + PAGESIZE - plen) : 0); + ASSERT(start + plen <= *offp + *lenp); + + for (off = start, poff = pstart; poff < pstart + plen; + off += PAGESIZE, poff += PAGESIZE) { + ahm = &anonhash_lock[AH_LOCK(vp, off)]; + mutex_enter(ahm); + if ((ap = swap_anon(vp, off)) != NULL) { + /* Free old slot if any, and assign new one */ + if (ap->an_pvp) + swap_phys_free(ap->an_pvp, ap->an_poff, + PAGESIZE); + ap->an_pvp = pvp; + ap->an_poff = poff; + } else { /* No anon slot for a klustered page, quit. */ + prem = (pstart + plen) - poff; + /* Already did requested page, do partial kluster */ + if (off > offset) { + plen = poff - pstart; + error = 0; + /* Fail on requested page, error */ + } else if (off == offset) { + error = SE_NOANON; + /* Fail on prior page, fail on requested page, error */ + } else if ((ap = swap_anon(vp, offset)) == NULL) { + error = SE_NOANON; + /* Fail on prior page, got requested page, do only it */ + } else { + /* Free old slot if any, and assign new one */ + if (ap->an_pvp) + swap_phys_free(ap->an_pvp, ap->an_poff, + PAGESIZE); + ap->an_pvp = pvp; + ap->an_poff = poff; + /* One page kluster */ + start = offset; + plen = PAGESIZE; + pstart = poff; + poff += PAGESIZE; + prem -= PAGESIZE; + } + /* Free unassigned slots */ + swap_phys_free(pvp, poff, prem); + mutex_exit(ahm); + break; + } + mutex_exit(ahm); + } + ASSERT(*offp <= start && start + plen <= *offp + *lenp); + ASSERT(start <= offset && offset < start + plen); + *offp = start; + *lenp = plen; + *pvpp = pvp; + *poffp = pstart; + return (error); +} + + +/* + * Get the physical swap backing store location for a given anonymous page + * named (vp, off). The backing store name is returned in (*pvpp, *poffp). 
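
The kluster placement in swap_newphysname() above is compact enough to warrant a worked example: given the caller's window [*offp, *offp + *lenp) and plen bytes of contiguous slots, the requested offset is placed in the last page of the backed subrange. A standalone restatement follows; the 4K page size and the MAX macro are assumptions of this sketch, not quotes from the kernel.

    #include <assert.h>
    #include <stdint.h>

    #define PAGESIZE    4096ULL
    #define MAX(a, b)   ((a) > (b) ? (a) : (b))

    /* Start of the backed subrange, chosen so offset lands in its last page. */
    static uint64_t
    kluster_start(uint64_t window_start, uint64_t offset, uint64_t plen)
    {
        return (MAX(window_start,
            (offset + PAGESIZE > plen) ? (offset + PAGESIZE - plen) : 0));
    }

    int
    main(void)
    {
        /* Window starts at 0, request at page 8, 4 pages of slots. */
        uint64_t start = kluster_start(0, 8 * PAGESIZE, 4 * PAGESIZE);

        /* Backed range is pages 5..8, so the requested page is last. */
        assert(start == 5 * PAGESIZE);
        return (0);
    }

The original comment itself notes that this placement is an arbitrary choice among several possibilities.
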
+ * Returns 0 success + * EIDRM -- no anon slot (page is not allocated) + */ +int +swap_getphysname( + struct vnode *vp, + u_offset_t off, + struct vnode **pvpp, + u_offset_t *poffp) +{ + struct anon *ap; + int error = 0; + kmutex_t *ahm; + + ahm = &anonhash_lock[AH_LOCK(vp, off)]; + mutex_enter(ahm); + + /* Get anon slot for vp, off */ + ap = swap_anon(vp, off); + if (ap == NULL) { + error = EIDRM; + goto out; + } + *pvpp = ap->an_pvp; + *poffp = ap->an_poff; +out: + mutex_exit(ahm); + return (error); +} diff --git a/usr/src/uts/common/vm/vpage.h b/usr/src/uts/common/vm/vpage.h new file mode 100644 index 0000000000..68dfb1adb0 --- /dev/null +++ b/usr/src/uts/common/vm/vpage.h @@ -0,0 +1,86 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 1998 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#ifndef _VM_VPAGE_H +#define _VM_VPAGE_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * VM - Information per virtual page. + */ +struct vpage { + uchar_t nvp_prot; /* see <sys/mman.h> prot flags */ + uchar_t nvp_advice; /* pplock & <sys/mman.h> madvise flags */ +}; + +/* + * This was changed from a bitfield to flags/macros in order + * to conserve space (uchar_t bitfields are not ANSI). This could + * have been condensed to a uchar_t, but at the expense of complexity. + * We've stolen a bit from the top of nvp_advice to store pplock in. + * + * WARNING: VPP_SETADVICE(vpp, x) evaluates vpp twice, and VPP_PLOCK(vpp) + * returns a positive integer when the lock is held, not necessarily (1). 
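
The macros defined just below pack three pieces of per-page state into two bytes: nvp_prot holds the protections, the low three bits of nvp_advice hold the madvise hint, and the top bit of nvp_advice holds pplock. That is why the lock test is documented as returning "a positive integer" rather than 1; it yields the raw 0x80 bit. A quick standalone check of that layout, reusing the same mask values (the test program itself is only an illustration):

    #include <assert.h>

    #define VP_ADVICE_MASK  (0x07)
    #define VP_PPLOCK_MASK  (0x80)

    int
    main(void)
    {
        unsigned char advice = 0;

        /* set an advice value and the pplock bit independently */
        advice = (unsigned char)((advice & ~VP_ADVICE_MASK) |
            (3 & VP_ADVICE_MASK));
        advice |= VP_PPLOCK_MASK;

        assert((advice & VP_ADVICE_MASK) == 3);
        assert((advice & VP_PPLOCK_MASK) == 0x80);  /* truthy, but not 1 */
        return (0);
    }
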
+ */ +#define VP_ADVICE_MASK (0x07) +#define VP_PPLOCK_MASK (0x80) /* physical page locked by me */ +#define VP_PPLOCK_SHIFT (0x07) /* offset of lock hiding inside nvp_advice */ + +#define VPP_PROT(vpp) ((vpp)->nvp_prot) +#define VPP_ADVICE(vpp) ((vpp)->nvp_advice & VP_ADVICE_MASK) +#define VPP_ISPPLOCK(vpp) \ + ((uchar_t)((vpp)->nvp_advice & VP_PPLOCK_MASK)) + +#define VPP_SETPROT(vpp, x) ((vpp)->nvp_prot = (x)) +#define VPP_SETADVICE(vpp, x) \ + ((vpp)->nvp_advice = ((vpp)->nvp_advice & ~VP_ADVICE_MASK) | \ + ((x) & VP_ADVICE_MASK)) +#define VPP_SETPPLOCK(vpp) ((vpp)->nvp_advice |= VP_PPLOCK_MASK) +#define VPP_CLRPPLOCK(vpp) ((vpp)->nvp_advice &= ~VP_PPLOCK_MASK) + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_VPAGE_H */ diff --git a/usr/src/uts/common/vm/xhat.c b/usr/src/uts/common/vm/xhat.c new file mode 100644 index 0000000000..255ca1eb67 --- /dev/null +++ b/usr/src/uts/common/vm/xhat.c @@ -0,0 +1,555 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + + +#include <sys/types.h> +#include <sys/cmn_err.h> +#include <sys/mman.h> +#include <sys/systm.h> +#include <vm/xhat.h> +#include <vm/page.h> +#include <vm/as.h> + +int xhat_debug = 0; + +krwlock_t xhat_provider_rwlock; +xhat_provider_t *xhat_provider = NULL; + +void +xhat_init() +{ + rw_init(&xhat_provider_rwlock, NULL, RW_DEFAULT, NULL); +} + + + +int +xhat_provider_register(xhat_provider_t *provider) +{ + /* strlen("_cache") = 7 */ + char cache_name[XHAT_CACHE_NAMELEN + 7]; + + + if (provider->xhat_provider_version != XHAT_PROVIDER_VERSION) { + cmn_err(CE_WARN, "XHAT provider version mismatch"); + return (-1); + } + + if ((XHAT_POPS(provider)->xhat_alloc == NULL) || + (XHAT_POPS(provider)->xhat_free == NULL)) { + cmn_err(CE_WARN, "Malformed XHAT provider"); + return (-1); + } + + /* Allocate kmem_cache which will manage xhat blocks */ + provider->xblkcache->free_blks = NULL; + (void) strncpy(cache_name, provider->xhat_provider_name, + XHAT_CACHE_NAMELEN); + (void) strcat(cache_name, "_cache"); + provider->xblkcache->cache = kmem_cache_create(cache_name, + provider->xhat_provider_blk_size, 0, NULL, NULL, + provider->xblkcache->reclaim, + (void *)provider, NULL, 0); + if (provider->xblkcache->cache == NULL) { + cmn_err(CE_WARN, "Failed to allocate cache for %s", + provider->xhat_provider_name); + return (-1); + } + + mutex_init(&provider->xblkcache->lock, NULL, MUTEX_DEFAULT, NULL); + + + /* Insert provider in the global list */ + rw_enter(&xhat_provider_rwlock, RW_WRITER); + provider->next = xhat_provider; + provider->prev = NULL; + if (xhat_provider) + xhat_provider->prev = provider; + xhat_provider = provider; + xhat_provider->xhat_provider_refcnt = 0; + rw_exit(&xhat_provider_rwlock); + return (0); +} + + + +int +xhat_provider_unregister(xhat_provider_t *provider) +{ + if (provider->xhat_provider_version != XHAT_PROVIDER_VERSION) + return (-1); + + rw_enter(&xhat_provider_rwlock, RW_WRITER); + + if (provider->xhat_provider_refcnt) { + rw_exit(&xhat_provider_rwlock); + return (-1); + } + + if (provider->next) + provider->next->prev = provider->prev; + if (provider->prev) + provider->prev->next = provider->next; + else + xhat_provider = provider->next; + provider->prev = NULL; + provider->next = NULL; + rw_exit(&xhat_provider_rwlock); + + /* Free all xblks that are sitting on free_blks list */ + provider->xblkcache->reclaim(provider); + + kmem_cache_destroy(provider->xblkcache->cache); + + return (0); +} + + + +/* Attaches an XHAT to the address space */ +int +xhat_attach_xhat(xhat_provider_t *provider, struct as *as, + struct xhat **xhatp, void *arg) +{ + struct xhat *xh; + + + + xh = XHAT_POPS(provider)->xhat_alloc(arg); + if (xh == NULL) { + *xhatp = NULL; + return (XH_PRVDR); + } + + mutex_init(&xh->xhat_lock, NULL, MUTEX_DEFAULT, NULL); + xh->xhat_provider = provider; + + rw_enter(&xhat_provider_rwlock, RW_WRITER); + provider->xhat_provider_refcnt++; + rw_exit(&xhat_provider_rwlock); + + mutex_enter(&as->a_contents); + + /* Is address space busy (being freed, dup'd or swapped)? 
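
Stepping back to the provider list management above: the global list is only modified with xhat_provider_rwlock held as writer, and xhat_provider_unregister() refuses to proceed while the provider's reference count (bumped by every successful attach) is non-zero. Below is a user-level model of that guard, with a pthread rwlock standing in for the kernel lock and hypothetical structure names:

    #include <pthread.h>
    #include <stdbool.h>

    struct provider {
        struct provider *next, *prev;
        int refcnt;
    };

    static pthread_rwlock_t plock = PTHREAD_RWLOCK_INITIALIZER;
    static struct provider *providers;

    static void
    provider_register(struct provider *p)
    {
        pthread_rwlock_wrlock(&plock);
        p->prev = NULL;
        p->next = providers;
        if (providers != NULL)
            providers->prev = p;
        providers = p;
        p->refcnt = 0;
        pthread_rwlock_unlock(&plock);
    }

    /* Fails while any attached consumer still references the provider. */
    static bool
    provider_unregister(struct provider *p)
    {
        pthread_rwlock_wrlock(&plock);
        if (p->refcnt != 0) {
            pthread_rwlock_unlock(&plock);
            return (false);
        }
        if (p->next != NULL)
            p->next->prev = p->prev;
        if (p->prev != NULL)
            p->prev->next = p->next;
        else
            providers = p->next;
        p->next = p->prev = NULL;
        pthread_rwlock_unlock(&plock);
        return (true);
    }
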
*/ + if (AS_ISBUSY(as)) { + mutex_exit(&as->a_contents); + XHAT_POPS(provider)->xhat_free(xh); + + rw_enter(&xhat_provider_rwlock, RW_WRITER); + provider->xhat_provider_refcnt--; + rw_exit(&xhat_provider_rwlock); + + *xhatp = NULL; + return (XH_ASBUSY); + } + + xh->xhat_as = as; + xh->xhat_refcnt = 0; + xh->holder = NULL; + xh->arg = arg; + xh->next = (struct xhat *)as->a_xhat; + if (xh->next) + xh->next->prev = xh; + as->a_xhat = xh; + mutex_exit(&as->a_contents); + *xhatp = xh; + return (0); +} + + +int +xhat_detach_xhat(xhat_provider_t *provider, struct as *as) +{ + struct xhat *xh; + + + mutex_enter(&as->a_contents); + + for (xh = (struct xhat *)as->a_xhat; xh != NULL; xh = xh->next) + if (xh->xhat_provider == provider) { + + + if (xh->holder != NULL) { + /* + * The address space is being freed, + * dup'd or swapped out. + * If we are the thread which doing one + * of those operations, we can go ahead + * and free up the XHAT. + * Otherwise, return. + */ + if (xh->holder != curthread) { + mutex_exit(&as->a_contents); + return (XH_ASBUSY); + } else + xhat_hat_rele(xh); + } + + if (xh->xhat_refcnt > 0) { + /* + * There are still "users" of the XHAT. + * This may be either because the caller + * forgot to free something up (which is a bug) + * or because xhat_op_all() is in progress. + * Since we are not allowing any of + * xhat_op_all's ops to call xhat_detach_xhat(), + * This can only be some other thread. It + * may want to wait a bit and retry. + */ + + + /* Restore the hold on the XHAT */ + if (xh->holder == curthread) + xhat_hat_hold(xh); + + mutex_exit(&as->a_contents); + return (XH_XHHELD); + } + + rw_enter(&xhat_provider_rwlock, RW_WRITER); + provider->xhat_provider_refcnt--; + rw_exit(&xhat_provider_rwlock); + + if (xh->next) + xh->next->prev = xh->prev; + if (xh->prev) + xh->prev->next = xh->next; + else + as->a_xhat = (void *) xh->next; + mutex_exit(&as->a_contents); + + XHAT_POPS(provider)->xhat_free(xh); + + return (0); + } + mutex_exit(&as->a_contents); + return (XH_NOTATTCHD); +} + +void +xhat_hat_hold(struct xhat *xhat) +{ + mutex_enter(&xhat->xhat_lock); + xhat->xhat_refcnt++; + mutex_exit(&xhat->xhat_lock); +} + +void +xhat_hat_rele(struct xhat *xhat) +{ + mutex_enter(&xhat->xhat_lock); + xhat->xhat_refcnt--; + ASSERT(xhat->xhat_refcnt >= 0); + mutex_exit(&xhat->xhat_lock); +} + + +int +xhat_hat_holders(struct xhat *xhat) +{ + return (xhat->xhat_refcnt); +} + + +/* + * Assumes that address space is already locked + * and that AS_FREE is set for as->a_flags. + */ +void +xhat_free_start_all(struct as *as) +{ + struct xhat *xh, *xh_nxt; + + + ASSERT(AS_ISBUSY(as)); + + mutex_enter(&as->a_contents); + xh = (struct xhat *)as->a_xhat; + + /* + * Simply calling xhat_hat_hold() won't work because we will + * not be able to succeed in xhat_detach_xhat(), which may + * get called from here. We need to know _who_ the holder is. + */ + if (xh != NULL) { + xhat_hat_hold(xh); + ASSERT(xh->holder == NULL); + xh->holder = curthread; + } + + while (xh != NULL) { + + xh_nxt = xh->next; + if (xh_nxt != NULL) { + ASSERT(xh_nxt->holder == NULL); + xhat_hat_hold(xh_nxt); + xh_nxt->holder = curthread; + } + + mutex_exit(&as->a_contents); + + XHAT_FREE_START(xh); + + mutex_enter(&as->a_contents); + + xh = xh_nxt; + } + + mutex_exit(&as->a_contents); +} + + + +/* + * Assumes that address space is already locked. + * Since xhat_free_start_all() must have been called + * earlier, for all XHATs holder is set to curthread. 
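
The holder field above is the key to teardown: xhat_free_start_all() stamps each XHAT with the thread doing the teardown, so a later xhat_detach_xhat() can tell "the address space is being torn down by me" (allowed to proceed, after dropping the hold placed on its behalf) from "being torn down by someone else" (XH_ASBUSY). A small model of that decision, with pthread_self() standing in for curthread and hypothetical names:

    #include <pthread.h>

    enum { XH_OK = 0, XH_ASBUSY, XH_XHHELD };

    struct xhat_model {
        int has_holder;         /* teardown in progress */
        pthread_t holder;       /* thread doing the teardown */
        int refcnt;             /* holds, including the teardown hold */
    };

    static int
    detach_check(const struct xhat_model *xh)
    {
        if (xh->has_holder && !pthread_equal(xh->holder, pthread_self()))
            return (XH_ASBUSY);     /* another thread owns the teardown */
        /* The teardown thread's own hold does not count against it. */
        if (xh->refcnt > (xh->has_holder ? 1 : 0))
            return (XH_XHHELD);     /* other callers still hold the XHAT */
        return (XH_OK);
    }
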
+ * Also, since AS_BUSY is set for as->a_flags, no new + * XHATs could have been added. + */ +void +xhat_free_end_all(struct as *as) +{ + + struct xhat *xh, *xh_nxt; + + ASSERT(AS_ISBUSY(as)); + + mutex_enter(&as->a_contents); + xh = (struct xhat *)as->a_xhat; + + + while (xh != NULL) { + + ASSERT(xh->holder == curthread); + + xh_nxt = xh->next; + + mutex_exit(&as->a_contents); + + XHAT_FREE_END(xh); + + mutex_enter(&as->a_contents); + + xh = xh_nxt; + } + + mutex_exit(&as->a_contents); +} + + +/* Assumes that address space is already locked */ + +/* ARGSUSED */ +int +xhat_dup_all(struct as *as, struct as *newas, caddr_t addr, size_t len, + uint_t flag) +{ + /* This is not supported. Should we return some sort of error? */ + + ASSERT(AS_ISBUSY(as)); + + return (0); +} + + +/* Assumes that address space is already locked */ +void +xhat_swapout_all(struct as *as) +{ + struct xhat *xh, *xh_nxt; + + + ASSERT(AS_ISBUSY(as)); + + mutex_enter(&as->a_contents); + xh = (struct xhat *)as->a_xhat; + + if (xh != NULL) { + xhat_hat_hold(xh); + ASSERT(xh->holder == NULL); + xh->holder = curthread; + } + + + while (xh != NULL) { + + xh_nxt = xh->next; + if (xh_nxt != NULL) { + ASSERT(xh_nxt->holder == NULL); + xhat_hat_hold(xh_nxt); + xh_nxt->holder = curthread; + } + + mutex_exit(&as->a_contents); + + XHAT_SWAPOUT(xh); + + mutex_enter(&as->a_contents); + + /* + * If the xh is still there (i.e. swapout did not + * destroy it), clear the holder field. + * xh_nxt->prev couldn't have been changed in xhat_attach_xhat() + * because AS_BUSY is set. xhat_detach_xhat() also couldn't + * have modified it because (holder != NULL). + * If there is only one XHAT, just see if a_xhat still + * points to us. + */ + if (((xh_nxt != NULL) && (xh_nxt->prev == xh)) || + ((as->a_xhat != NULL) && (as->a_xhat == xh))) { + xhat_hat_rele(xh); + xh->holder = NULL; + } + + xh = xh_nxt; + } + + mutex_exit(&as->a_contents); +} + + + + +/* + * In the following routines, the appropriate xhat_op + * should never attempt to call xhat_detach_xhat(): it will + * never succeed since the XHAT is held. + */ + + +#define XHAT_UNLOAD_CALLBACK_OP (0) +#define XHAT_SETATTR_OP (1) +#define XHAT_CLRATTR_OP (2) +#define XHAT_CHGATTR_OP (3) +#define XHAT_CHGPROT_OP (4) +#define XHAT_UNSHARE_OP (5) + + +static void +xhat_op_all(int op, struct as *as, caddr_t addr, + size_t len, uint_t flags, void *ptr) +{ + struct xhat *xh, *xh_nxt; + + mutex_enter(&as->a_contents); + xh = (struct xhat *)as->a_xhat; + + while (xh != NULL) { + + xhat_hat_hold(xh); + + xh_nxt = xh->next; + if (xh_nxt != NULL) + xhat_hat_hold(xh_nxt); + + mutex_exit(&as->a_contents); + + switch (op) { + case XHAT_UNLOAD_CALLBACK_OP: + XHAT_UNLOAD_CALLBACK(xh, addr, + len, flags, (hat_callback_t *)ptr); + break; + case XHAT_SETATTR_OP: + XHAT_SETATTR(xh, addr, len, flags); + break; + case XHAT_CLRATTR_OP: + XHAT_CLRATTR(xh, addr, len, flags); + break; + case XHAT_CHGATTR_OP: + XHAT_CHGATTR(xh, addr, len, flags); + break; + case XHAT_CHGPROT_OP: + XHAT_CHGPROT(xh, addr, len, flags); + break; + case XHAT_UNSHARE_OP: + XHAT_UNSHARE(xh, addr, len); + break; + default: + panic("Unknown op %d in xhat_op_all", op); + } + + mutex_enter(&as->a_contents); + + /* + * Both pointers are still valid because both + * XHATs are held. 
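
That is the hand-over-hand hold at the heart of xhat_op_all(): both the current XHAT and its successor are reference-held before a_contents is dropped, so the callback can block and the walk can still resume safely at the successor. A generic user-level sketch of the traversal (the refcounted node type and helper names are hypothetical):

    #include <pthread.h>

    struct rnode {
        struct rnode *next;
        int refcnt;             /* protected by list_lock */
    };

    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

    static void hold(struct rnode *n) { n->refcnt++; }
    static void rele(struct rnode *n) { n->refcnt--; }

    /* Callback that must run without list_lock held. */
    static void
    visit(struct rnode *n)
    {
        (void) n;
    }

    static void
    walk(struct rnode *head)
    {
        struct rnode *n, *nxt;

        pthread_mutex_lock(&list_lock);
        for (n = head; n != NULL; n = nxt) {
            hold(n);
            nxt = n->next;
            if (nxt != NULL)
                hold(nxt);      /* keep the resume point alive too */
            pthread_mutex_unlock(&list_lock);

            visit(n);           /* may block; the list may change */

            pthread_mutex_lock(&list_lock);
            rele(n);
            if (nxt != NULL)
                rele(nxt);
        }
        pthread_mutex_unlock(&list_lock);
    }
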
+ */ + xhat_hat_rele(xh); + if (xh_nxt != NULL) + xhat_hat_rele(xh_nxt); + xh = xh_nxt; + } + + mutex_exit(&as->a_contents); +} + + + +void +xhat_unload_callback_all(struct as *as, caddr_t addr, size_t len, uint_t flags, + hat_callback_t *callback) +{ + xhat_op_all(XHAT_UNLOAD_CALLBACK_OP, as, addr, len, flags, callback); +} + + +void +xhat_setattr_all(struct as *as, caddr_t addr, size_t len, uint_t attr) +{ + xhat_op_all(XHAT_SETATTR_OP, as, addr, len, attr, NULL); +} + + + +void +xhat_clrattr_all(struct as *as, caddr_t addr, size_t len, uint_t attr) +{ + xhat_op_all(XHAT_CLRATTR_OP, as, addr, len, attr, NULL); +} + + +void +xhat_chgattr_all(struct as *as, caddr_t addr, size_t len, uint_t attr) +{ + xhat_op_all(XHAT_CHGATTR_OP, as, addr, len, attr, NULL); +} + + +void +xhat_chgprot_all(struct as *as, caddr_t addr, size_t len, uint_t prot) +{ + xhat_op_all(XHAT_CHGPROT_OP, as, addr, len, prot, NULL); +} + + +void +xhat_unshare_all(struct as *as, caddr_t addr, size_t len) +{ + xhat_op_all(XHAT_UNSHARE_OP, as, addr, len, 0, NULL); +} diff --git a/usr/src/uts/common/vm/xhat.h b/usr/src/uts/common/vm/xhat.h new file mode 100644 index 0000000000..808262f2c9 --- /dev/null +++ b/usr/src/uts/common/vm/xhat.h @@ -0,0 +1,208 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _VM_XHAT_H +#define _VM_XHAT_H + +#pragma ident "%Z%%M% %I% %E% SMI" + + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef _ASM + +#include <sys/types.h> +#include <vm/page.h> +#include <sys/kmem.h> + +struct xhat; +struct xhat_hme_blk; + +struct xhat_ops { + struct xhat *(*xhat_alloc)(void *); + void (*xhat_free)(struct xhat *); + void (*xhat_free_start)(struct xhat *); + void (*xhat_free_end)(struct xhat *); + int (*xhat_dup)(struct xhat *, struct xhat *, caddr_t, + size_t, uint_t); + void (*xhat_swapin)(struct xhat *); + void (*xhat_swapout)(struct xhat *); + void (*xhat_memload)(struct xhat *, caddr_t, struct page *, + uint_t, uint_t); + void (*xhat_memload_array)(struct xhat *, caddr_t, size_t, + struct page **, uint_t, uint_t); + void (*xhat_devload)(struct xhat *, caddr_t, size_t, pfn_t, + uint_t, int); + void (*xhat_unload)(struct xhat *, caddr_t, size_t, uint_t); + void (*xhat_unload_callback)(struct xhat *, caddr_t, size_t, + uint_t, hat_callback_t *); + void (*xhat_setattr)(struct xhat *, caddr_t, size_t, uint_t); + void (*xhat_clrattr)(struct xhat *, caddr_t, size_t, uint_t); + void (*xhat_chgattr)(struct xhat *, caddr_t, size_t, uint_t); + void (*xhat_unshare)(struct xhat *, caddr_t, size_t); + void (*xhat_chgprot)(struct xhat *, caddr_t, size_t, uint_t); + int (*xhat_pageunload)(struct xhat *, struct page *, uint_t, + void *); +}; + + +#define XHAT_POPS(_p) (_p)->xhat_provider_ops +#define XHAT_PROPS(_h) XHAT_POPS(((struct xhat *)(_h))->xhat_provider) +#define XHAT_HOPS(hat, func, args) \ + { \ + if (XHAT_PROPS(hat)-> /* */ func) \ + XHAT_PROPS(hat)-> /* */ func /* */ args; \ + } + +#define XHAT_FREE_START(a) \ + XHAT_HOPS(a, xhat_free_start, ((struct xhat *)(a))) +#define XHAT_FREE_END(a) \ + XHAT_HOPS(a, xhat_free_end, ((struct xhat *)(a))) +#define XHAT_DUP(a, b, c, d, e) \ + ((XHAT_PROPS(a)->xhat_dup == NULL) ? (0) : \ + XHAT_PROPS(a)->xhat_dup((struct xhat *)(a), \ + (struct xhat *)(b), c, d, e)) +#define XHAT_SWAPIN(a) \ + XHAT_HOPS(a, xhat_swapin, ((struct xhat *)(a))) +#define XHAT_SWAPOUT(a) \ + XHAT_HOPS(a, xhat_swapout, ((struct xhat *)(a))) +#define XHAT_MEMLOAD(a, b, c, d, e) \ + XHAT_HOPS(a, xhat_memload, ((struct xhat *)(a), b, c, d, e)) +#define XHAT_MEMLOAD_ARRAY(a, b, c, d, e, f) \ + XHAT_HOPS(a, xhat_memload_array, ((struct xhat *)(a), b, c, d, e, f)) +#define XHAT_DEVLOAD(a, b, c, d, e, f) \ + XHAT_HOPS(a, xhat_devload, ((struct xhat *)(a), b, c, d, e, f)) +#define XHAT_UNLOAD(a, b, c, d) \ + XHAT_HOPS(a, xhat_unload, ((struct xhat *)(a), b, c, d)) +#define XHAT_UNLOAD_CALLBACK(a, b, c, d, e) \ + XHAT_HOPS(a, xhat_unload_callback, ((struct xhat *)(a), b, c, d, e)) +#define XHAT_SETATTR(a, b, c, d) \ + XHAT_HOPS(a, xhat_setattr, ((struct xhat *)(a), b, c, d)) +#define XHAT_CLRATTR(a, b, c, d) \ + XHAT_HOPS(a, xhat_clrattr, ((struct xhat *)(a), b, c, d)) +#define XHAT_CHGATTR(a, b, c, d) \ + XHAT_HOPS(a, xhat_chgattr, ((struct xhat *)(a), b, c, d)) +#define XHAT_UNSHARE(a, b, c) \ + XHAT_HOPS(a, xhat_unshare, ((struct xhat *)(a), b, c)) +#define XHAT_CHGPROT(a, b, c, d) \ + XHAT_HOPS(a, xhat_chgprot, ((struct xhat *)(a), b, c, d)) +#define XHAT_PAGEUNLOAD(a, b, c, d) \ + ((XHAT_PROPS(a)->xhat_pageunload == NULL) ? (0) : \ + XHAT_PROPS(a)->xhat_pageunload((struct xhat *)(a), b, c, d)) + + + +#define XHAT_PROVIDER_VERSION 1 + +/* + * Provider name will be appended with "_cache" + * when initializing kmem cache. 
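
The XHAT_HOPS()-style wrappers above implement optional-operation dispatch: apart from xhat_alloc and xhat_free (checked at registration time), a provider may leave any entry in xhat_ops NULL and the corresponding wrapper quietly does nothing. A condensed model of that convention (the table and macro below are illustrative, not the kernel's):

    #include <stddef.h>

    struct ops {
        void (*op_setattr)(void *hdl, unsigned attr);   /* optional */
        void *(*op_alloc)(void *arg);                   /* mandatory */
    };

    /* Invoke an optional op only if the provider supplied one. */
    #define CALL_OPTIONAL(opsp, fn, ...)            \
        do {                                        \
            if ((opsp)->fn != NULL)                 \
                (opsp)->fn(__VA_ARGS__);            \
        } while (0)

    static void
    set_attr(const struct ops *o, void *hdl, unsigned attr)
    {
        CALL_OPTIONAL(o, op_setattr, hdl, attr);
    }
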
+ * The resulting sring must be less than + * KMEM_CACHE_NAMELEN + */ +#define XHAT_CACHE_NAMELEN 24 + +typedef struct xblk_cache { + kmutex_t lock; + kmem_cache_t *cache; + void *free_blks; + void (*reclaim)(void *); +} xblk_cache_t; + +typedef struct xhat_provider { + int xhat_provider_version; + int xhat_provider_refcnt; + struct xhat_provider *next; + struct xhat_provider *prev; + char xhat_provider_name[XHAT_CACHE_NAMELEN]; + xblk_cache_t *xblkcache; + struct xhat_ops *xhat_provider_ops; + int xhat_provider_blk_size; +} xhat_provider_t; + +/* + * The xhat structure is protected by xhat_lock. + * A particular xhat implementation is a extension of the + * xhat structure and may contain its own lock(s) to + * protect those additional fields. + * The xhat structure is never allocated directly. + * Instead its allocation is provided by the hat implementation. + * The xhat provider ops xhat_alloc/xhat_free are used to + * alloc/free a implementation dependant xhat structure. + */ +struct xhat { + xhat_provider_t *xhat_provider; + struct as *xhat_as; + void *arg; + struct xhat *prev; + struct xhat *next; + kmutex_t xhat_lock; + int xhat_refcnt; + kthread_t *holder; +}; + + +/* Error codes */ +#define XH_PRVDR (1) /* Provider-specific error */ +#define XH_ASBUSY (2) /* Address space is busy */ +#define XH_XHHELD (3) /* XHAT is being held */ +#define XH_NOTATTCHD (4) /* Provider is not attached to as */ + + +int xhat_provider_register(xhat_provider_t *); +int xhat_provider_unregister(xhat_provider_t *); +void xhat_init(void); +int xhat_attach_xhat(xhat_provider_t *, struct as *, struct xhat **, + void *); +int xhat_detach_xhat(xhat_provider_t *, struct as *); +pfn_t xhat_insert_xhatblk(page_t *, struct xhat *, void **); +int xhat_delete_xhatblk(void *, int); +void xhat_hat_hold(struct xhat *); +void xhat_hat_rele(struct xhat *); +int xhat_hat_holders(struct xhat *); + +void xhat_free_start_all(struct as *); +void xhat_free_end_all(struct as *); +int xhat_dup_all(struct as *, struct as *, caddr_t, size_t, uint_t); +void xhat_swapout_all(struct as *); +void xhat_unload_callback_all(struct as *, caddr_t, size_t, uint_t, + hat_callback_t *); +void xhat_setattr_all(struct as *, caddr_t, size_t, uint_t); +void xhat_clrattr_all(struct as *, caddr_t, size_t, uint_t); +void xhat_chgattr_all(struct as *, caddr_t, size_t, uint_t); +void xhat_chgprot_all(struct as *, caddr_t, size_t, uint_t); +void xhat_unshare_all(struct as *, caddr_t, size_t); + + +#endif /* _ASM */ + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_XHAT_H */ |
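
Taken together, the header above defines everything a provider must supply. The sketch below shows roughly how the smallest possible provider could be wired up against these declarations; it is illustrative only. The embedding structure my_xhat, the no-op reclaim callback, and the my_provider_init() hook are assumptions rather than code from this commit, and a real provider would also populate the remaining xhat_ops entries it cares about.

    #include <sys/types.h>
    #include <sys/kmem.h>
    #include <vm/xhat.h>

    /* Provider-private XHAT; struct xhat is embedded first so casts work. */
    struct my_xhat {
        struct xhat mx_xhat;
        void *mx_private;
    };

    static struct xhat *
    my_alloc(void *arg)
    {
        return (kmem_zalloc(sizeof (struct my_xhat), KM_SLEEP));
    }

    static void
    my_free(struct xhat *xh)
    {
        kmem_free(xh, sizeof (struct my_xhat));
    }

    static void
    my_reclaim(void *arg)
    {
        /* nothing cached outside the kmem cache in this sketch */
    }

    static struct xhat_ops my_ops = {
        .xhat_alloc = my_alloc,         /* mandatory */
        .xhat_free = my_free,           /* mandatory */
    };

    static xblk_cache_t my_xblkcache = {
        .reclaim = my_reclaim,
    };

    static xhat_provider_t my_provider = {
        .xhat_provider_version = XHAT_PROVIDER_VERSION,
        .xhat_provider_name = "my_xhat",
        .xblkcache = &my_xblkcache,
        .xhat_provider_ops = &my_ops,
        .xhat_provider_blk_size = sizeof (struct my_xhat),
    };

    int
    my_provider_init(void)
    {
        /* 0 on success, -1 on version mismatch or malformed ops */
        return (xhat_provider_register(&my_provider));
    }

Once registered, a consumer would attach an instance to an address space with xhat_attach_xhat(&my_provider, as, &xh, arg), where xh is a struct xhat pointer filled in on success, and eventually tear it down with xhat_detach_xhat(&my_provider, as), honoring the XH_ASBUSY and XH_XHHELD return codes declared above.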