| author | stevel@tonic-gate <none@none> | 2005-06-14 00:00:00 -0700 |
|---|---|---|
| committer | stevel@tonic-gate <none@none> | 2005-06-14 00:00:00 -0700 |
| commit | 7c478bd95313f5f23a4c958a745db2134aa03244 (patch) | |
| tree | c871e58545497667cbb4b0a4f2daf204743e1fe7 | /usr/src/uts/common/vm |
| download | illumos-joyent-7c478bd95313f5f23a4c958a745db2134aa03244.tar.gz | |
OpenSolaris Launch
Diffstat (limited to 'usr/src/uts/common/vm')
39 files changed, 47175 insertions, 0 deletions
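Among the headers added below, vm/anon.h defines the global (vnode, offset) → anon-slot hash used to find the anon structure backing an anonymous page. The following is a minimal user-level sketch of how its ANON_HASH() and AH_LOCK() macros pick a hash bucket and a hash-lock index; PAGESHIFT, ANON_HASH_SIZE, and the sample (vp, off) values are stand-ins for illustration, since the kernel sizes the real table at boot and hashes actual vnode pointers.

```c
/*
 * Minimal sketch of the ANON_HASH()/AH_LOCK() computation from the
 * new vm/anon.h.  PAGESHIFT, ANON_HASH_SIZE, and the sample (vp, off)
 * values are stand-ins chosen for this example only.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGESHIFT	12			/* assume 4K pages */
#define ANON_HASH_SIZE	8192			/* stand-in; a power of two */
#define AH_LOCK_SIZE	64

#define ANON_HASH(VP, OFF) \
	((((uintptr_t)(VP) >> 7) ^ ((OFF) >> PAGESHIFT)) & (ANON_HASH_SIZE - 1))
#define AH_LOCK(vp, off)	(ANON_HASH((vp), (off)) & (AH_LOCK_SIZE - 1))

int
main(void)
{
	void *vp = (void *)(uintptr_t)0x12345600u;	/* pretend vnode address */
	uint64_t off = 37u << PAGESHIFT;		/* pretend anon offset */

	printf("hash bucket = %zu\n", (size_t)ANON_HASH(vp, off));
	printf("hash lock   = %zu\n", (size_t)AH_LOCK(vp, off));
	return (0);
}
```

The vnode pointer is shifted down before the XOR, presumably because heap-allocated vnodes make the low address bits poor hash input; the offset is folded in at page granularity so consecutive pages of the same vnode spread across buckets.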
diff --git a/usr/src/uts/common/vm/Makefile b/usr/src/uts/common/vm/Makefile new file mode 100644 index 0000000000..fcd6582985 --- /dev/null +++ b/usr/src/uts/common/vm/Makefile @@ -0,0 +1,55 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2003 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +#ident "%Z%%M% %I% %E% SMI" +# + +# include global definitions +include ../../../Makefile.master + +HDRS= anon.h as.h faultcode.h hat.h kpm.h page.h pvn.h rm.h seg.h vpage.h \ + seg_dev.h seg_enum.h seg_kmem.h seg_kp.h seg_kpm.h seg_map.h \ + seg_vn.h seg_spt.h + +ROOTDIRS= $(ROOT)/usr/include/vm + +ROOTHDRS= $(HDRS:%=$(ROOTDIRS)/%) + +CHECKHDRS= $(HDRS:%.h=%.check) + +# install rule +$(ROOTDIRS)/%: % + $(INS.file) + +.KEEP_STATE: + +.PARALLEL: $(CHECKHDRS) + +install_h: $(ROOTDIRS) $(ROOTHDRS) + +$(ROOTDIRS): + $(INS.dir) + +check: $(CHECKHDRS) diff --git a/usr/src/uts/common/vm/anon.h b/usr/src/uts/common/vm/anon.h new file mode 100644 index 0000000000..466b939a75 --- /dev/null +++ b/usr/src/uts/common/vm/anon.h @@ -0,0 +1,461 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#ifndef _VM_ANON_H +#define _VM_ANON_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/cred.h> +#include <vm/seg.h> +#include <vm/vpage.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * VM - Anonymous pages. 
+ */ + +typedef unsigned long anoff_t; /* anon offsets */ + +/* + * Each anonymous page, either in memory or in swap, has an anon structure. + * The structure (slot) provides a level of indirection between anonymous pages + * and their backing store. + * + * (an_vp, an_off) names the vnode of the anonymous page for this slot. + * + * (an_pvp, an_poff) names the location of the physical backing store + * for the page this slot represents. If the name is null there is no + * associated physical store. The physical backing store location can + * change while the slot is in use. + * + * an_hash is a hash list of anon slots. The list is hashed by + * (an_vp, an_off) of the associated anonymous page and provides a + * method of going from the name of an anonymous page to its + * associated anon slot. + * + * an_refcnt holds a reference count which is the number of separate + * copies that will need to be created in case of copy-on-write. + * A refcnt > 0 protects the existence of the slot. The refcnt is + * initialized to 1 when the anon slot is created in anon_alloc(). + * If a client obtains an anon slot and allows multiple threads to + * share it, then it is the client's responsibility to insure that + * it does not allow one thread to try to reference the slot at the + * same time as another is trying to decrement the last count and + * destroy the anon slot. E.g., the seg_vn segment type protects + * against this with higher level locks. + */ + +struct anon { + struct vnode *an_vp; /* vnode of anon page */ + struct vnode *an_pvp; /* vnode of physical backing store */ + anoff_t an_off; /* offset of anon page */ + anoff_t an_poff; /* offset in vnode */ + struct anon *an_hash; /* hash table of anon slots */ + int an_refcnt; /* # of people sharing slot */ +}; + +#ifdef _KERNEL +/* + * The swapinfo_lock protects: + * swapinfo list + * individual swapinfo structures + * + * The anoninfo_lock protects: + * anoninfo counters + * + * The anonhash_lock protects: + * anon hash lists + * anon slot fields + * + * Fields in the anon slot which are read-only for the life of the slot + * (an_vp, an_off) do not require the anonhash_lock be held to access them. + * If you access a field without the anonhash_lock held you must be holding + * the slot with an_refcnt to make sure it isn't destroyed. + * To write (an_pvp, an_poff) in a given slot you must also hold the + * p_iolock of the anonymous page for slot. + */ +extern kmutex_t anoninfo_lock; +extern kmutex_t swapinfo_lock; +extern kmutex_t anonhash_lock[]; +extern pad_mutex_t anon_array_lock[]; +extern kcondvar_t anon_array_cv[]; + +/* + * Global hash table to provide a function from (vp, off) -> ap + */ +extern size_t anon_hash_size; +extern struct anon **anon_hash; +#define ANON_HASH_SIZE anon_hash_size +#define ANON_HASHAVELEN 4 +#define ANON_HASH(VP, OFF) \ +((((uintptr_t)(VP) >> 7) ^ ((OFF) >> PAGESHIFT)) & (ANON_HASH_SIZE - 1)) + +#define AH_LOCK_SIZE 64 +#define AH_LOCK(vp, off) (ANON_HASH((vp), (off)) & (AH_LOCK_SIZE -1)) + +#endif /* _KERNEL */ + +/* + * Declaration for the Global counters to accurately + * track the kernel foot print in memory. + */ +extern pgcnt_t segvn_pages_locked; +extern pgcnt_t pages_locked; +extern pgcnt_t pages_claimed; +extern pgcnt_t pages_useclaim; +extern pgcnt_t obp_pages; + +/* + * Anonymous backing store accounting structure for swapctl. 
+ * + * ani_max = maximum amount of swap space + * (including potentially available physical memory) + * ani_free = amount of unallocated anonymous memory + * (some of which might be reserved and including + * potentially available physical memory) + * ani_resv = amount of claimed (reserved) anonymous memory + * + * The swap data can be aquired more efficiently through the + * kstats interface. + * Total slots currently available for reservation = + * MAX(ani_max - ani_resv, 0) + (availrmem - swapfs_minfree) + */ +struct anoninfo { + pgcnt_t ani_max; + pgcnt_t ani_free; + pgcnt_t ani_resv; +}; + +#ifdef _SYSCALL32 +struct anoninfo32 { + size32_t ani_max; + size32_t ani_free; + size32_t ani_resv; +}; +#endif /* _SYSCALL32 */ + +/* + * Define the NCPU pool of the ani_free counters. Update the counter + * of the cpu on which the thread is running and in every clock intr + * sync anoninfo.ani_free with the current total off all the NCPU entries. + */ + +typedef struct ani_free { + kmutex_t ani_lock; + pgcnt_t ani_count; + uchar_t pad[64 - sizeof (kmutex_t) - sizeof (pgcnt_t)]; + /* XXX 64 = cacheline size */ +} ani_free_t; + +#define ANI_MAX_POOL 128 +extern ani_free_t ani_free_pool[]; + +#define ANI_ADD(inc) { \ + ani_free_t *anifp; \ + int index; \ + index = (CPU->cpu_id & (ANI_MAX_POOL - 1)); \ + anifp = &ani_free_pool[index]; \ + mutex_enter(&anifp->ani_lock); \ + anifp->ani_count += inc; \ + mutex_exit(&anifp->ani_lock); \ +} + +/* + * Anon array pointers are allocated in chunks. Each chunk + * has PAGESIZE/sizeof(u_long *) of anon pointers. + * There are two levels of arrays for anon array pointers larger + * than a chunk. The first level points to anon array chunks. + * The second level consists of chunks of anon pointers. + * + * If anon array is smaller than a chunk then the whole anon array + * is created (memory is allocated for whole anon array). + * If anon array is larger than a chunk only first level array is + * allocated. Then other arrays (chunks) are allocated only when + * they are initialized with anon pointers. + */ +struct anon_hdr { + kmutex_t serial_lock; /* serialize array chunk allocation */ + pgcnt_t size; /* number of pointers to (anon) pages */ + void **array_chunk; /* pointers to anon pointers or chunks of */ + /* anon pointers */ + int flags; /* ANON_ALLOC_FORCE force preallocation of */ + /* whole anon array */ +}; + +#ifdef _LP64 +#define ANON_PTRSHIFT 3 +#define ANON_PTRMASK ~7 +#else +#define ANON_PTRSHIFT 2 +#define ANON_PTRMASK ~3 +#endif + +#define ANON_CHUNK_SIZE (PAGESIZE >> ANON_PTRSHIFT) +#define ANON_CHUNK_SHIFT (PAGESHIFT - ANON_PTRSHIFT) +#define ANON_CHUNK_OFF (ANON_CHUNK_SIZE - 1) + +/* + * Anon flags. + */ +#define ANON_SLEEP 0x0 /* ok to block */ +#define ANON_NOSLEEP 0x1 /* non-blocking call */ +#define ANON_ALLOC_FORCE 0x2 /* force single level anon array */ +#define ANON_GROWDOWN 0x4 /* anon array should grow downward */ + +/* + * The anon_map structure is used by various clients of the anon layer to + * manage anonymous memory. When anonymous memory is shared, + * then the different clients sharing it will point to the + * same anon_map structure. Also, if a segment is unmapped + * in the middle where an anon_map structure exists, the + * newly created segment will also share the anon_map structure, + * although the two segments will use different ranges of the + * anon array. 
When mappings are private (or shared with + * a reference count of 1), an unmap operation will free up + * a range of anon slots in the array given by the anon_map + * structure. Because of fragmentation due to this unmapping, + * we have to store the size of the anon array in the anon_map + * structure so that we can free everything when the referernce + * count goes to zero. + * + * A new rangelock scheme is introduced to make the anon layer scale. + * A reader/writer lock per anon_amp and an array of system-wide hash + * locks, anon_array_lock[] are introduced to replace serial_lock and + * anonmap lock. The writer lock is held when we want to singlethreaD + * the reference to the anon array pointers or when references to + * anon_map's members, whereas reader lock and anon_array_lock are + * held to allows multiple threads to reference different part of + * anon array. A global set of condition variables, anon_array_cv, + * are used with anon_array_lock[] to make the hold time of the locks + * short. + * + * szc is used to calculate the index of hash locks and cv's. We + * could've just used seg->s_szc if not for the possible sharing of + * anon_amp between SYSV shared memory and ISM, so now we introduce + * szc in the anon_map structure. For MAP_SHARED, the amp->szc is either + * 0 (base page size) or page_num_pagesizes() - 1, while MAP_PRIVATE + * the amp->szc could be anything in [0, page_num_pagesizes() - 1]. + */ +struct anon_map { + krwlock_t a_rwlock; /* protect anon_map and anon array */ + size_t size; /* size in bytes mapped by the anon array */ + struct anon_hdr *ahp; /* anon array header pointer, containing */ + /* anon pointer array(s) */ + size_t swresv; /* swap space reserved for this anon_map */ + uint_t refcnt; /* reference count on this structure */ + ushort_t a_szc; /* max szc among shared processes */ + void *locality; /* lgroup locality info */ +}; + +#ifdef _KERNEL + +#define ANON_BUSY 0x1 +#define ANON_ISBUSY(slot) (*(slot) & ANON_BUSY) +#define ANON_SETBUSY(slot) (*(slot) |= ANON_BUSY) +#define ANON_CLRBUSY(slot) (*(slot) &= ~ANON_BUSY) + +#define ANON_MAP_SHIFT 6 /* log2(sizeof (struct anon_map)) */ +#define ANON_ARRAY_SHIFT 7 /* log2(ANON_LOCKSIZE) */ +#define ANON_LOCKSIZE 128 + +#define ANON_LOCK_ENTER(lock, type) rw_enter((lock), (type)) +#define ANON_LOCK_EXIT(lock) rw_exit((lock)) + +#define ANON_ARRAY_HASH(amp, idx)\ + ((((idx) + ((idx) >> ANON_ARRAY_SHIFT) +\ + ((idx) >> (ANON_ARRAY_SHIFT << 1)) +\ + ((idx) >> (ANON_ARRAY_SHIFT + (ANON_ARRAY_SHIFT << 1)))) ^\ + ((uintptr_t)(amp) >> ANON_MAP_SHIFT)) & (ANON_LOCKSIZE - 1)) + +typedef struct anon_sync_obj { + kmutex_t *sync_mutex; + kcondvar_t *sync_cv; + ulong_t *sync_data; +} anon_sync_obj_t; + +/* + * Anonymous backing store accounting structure for kernel. 
+ * ani_max = total reservable slots on physical (disk-backed) swap + * ani_phys_resv = total phys slots reserved for use by clients + * ani_mem_resv = total mem slots reserved for use by clients + * ani_free = # unallocated physical slots + # of reserved unallocated + * memory slots + */ + +/* + * Initial total swap slots available for reservation + */ +#define TOTAL_AVAILABLE_SWAP \ + (k_anoninfo.ani_max + MAX((spgcnt_t)(availrmem - swapfs_minfree), 0)) + +/* + * Swap slots currently available for reservation + */ +#define CURRENT_TOTAL_AVAILABLE_SWAP \ + ((k_anoninfo.ani_max - k_anoninfo.ani_phys_resv) + \ + MAX((spgcnt_t)(availrmem - swapfs_minfree), 0)) + +struct k_anoninfo { + pgcnt_t ani_max; /* total reservable slots on phys */ + /* (disk) swap */ + pgcnt_t ani_free; /* # of unallocated phys and mem slots */ + pgcnt_t ani_phys_resv; /* # of reserved phys (disk) slots */ + pgcnt_t ani_mem_resv; /* # of reserved mem slots */ + pgcnt_t ani_locked_swap; /* # of swap slots locked in reserved */ + /* mem swap */ +}; + +extern struct k_anoninfo k_anoninfo; + +extern void anon_init(void); +extern struct anon *anon_alloc(struct vnode *, anoff_t); +extern void anon_dup(struct anon_hdr *, ulong_t, + struct anon_hdr *, ulong_t, size_t); +extern void anon_dup_fill_holes(struct anon_hdr *, ulong_t, + struct anon_hdr *, ulong_t, size_t, uint_t, int); +extern int anon_fill_cow_holes(struct seg *, caddr_t, struct anon_hdr *, + ulong_t, struct vnode *, u_offset_t, size_t, uint_t, + uint_t, struct vpage [], struct cred *); +extern void anon_free(struct anon_hdr *, ulong_t, size_t); +extern void anon_free_pages(struct anon_hdr *, ulong_t, size_t, uint_t); +extern void anon_disclaim(struct anon_map *, ulong_t, size_t, int); +extern int anon_getpage(struct anon **, uint_t *, struct page **, + size_t, struct seg *, caddr_t, enum seg_rw, struct cred *); +extern int swap_getconpage(struct vnode *, u_offset_t, size_t, + uint_t *, page_t *[], size_t, page_t *, + spgcnt_t *, struct seg *, caddr_t, + enum seg_rw, struct cred *); +extern int anon_map_getpages(struct anon_map *, ulong_t, + uint_t, struct seg *, caddr_t, uint_t, + uint_t *, page_t *[], uint_t *, + struct vpage [], enum seg_rw, int, int, struct cred *); +extern int anon_map_privatepages(struct anon_map *, ulong_t, + uint_t, struct seg *, caddr_t, uint_t, + page_t *[], struct vpage [], int, struct cred *); +extern struct page *anon_private(struct anon **, struct seg *, + caddr_t, uint_t, struct page *, + int, struct cred *); +extern struct page *anon_zero(struct seg *, caddr_t, + struct anon **, struct cred *); +extern int anon_map_createpages(struct anon_map *, ulong_t, + size_t, struct page **, + struct seg *, caddr_t, + enum seg_rw, struct cred *); +extern int anon_map_demotepages(struct anon_map *, ulong_t, + struct seg *, caddr_t, uint_t, + struct vpage [], struct cred *); +extern int anon_resvmem(size_t, uint_t); +extern void anon_unresv(size_t); +extern struct anon_map *anonmap_alloc(size_t, size_t); +extern void anonmap_free(struct anon_map *); +extern void anon_decref(struct anon *); +extern int non_anon(struct anon_hdr *, ulong_t, u_offset_t *, size_t *); +extern pgcnt_t anon_pages(struct anon_hdr *, ulong_t, pgcnt_t); +extern int anon_swap_adjust(pgcnt_t); +extern void anon_swap_restore(pgcnt_t); +extern struct anon_hdr *anon_create(pgcnt_t, int); +extern void anon_release(struct anon_hdr *, pgcnt_t); +extern struct anon *anon_get_ptr(struct anon_hdr *, ulong_t); +extern ulong_t *anon_get_slot(struct anon_hdr *, ulong_t); +extern struct 
anon *anon_get_next_ptr(struct anon_hdr *, ulong_t *); +extern int anon_set_ptr(struct anon_hdr *, ulong_t, struct anon *, int); +extern int anon_copy_ptr(struct anon_hdr *, ulong_t, + struct anon_hdr *, ulong_t, pgcnt_t, int); +extern pgcnt_t anon_grow(struct anon_hdr *, ulong_t *, pgcnt_t, pgcnt_t, int); +extern void anon_array_enter(struct anon_map *, ulong_t, + anon_sync_obj_t *); +extern void anon_array_exit(anon_sync_obj_t *); + +/* + * anon_resv checks to see if there is enough swap space to fulfill a + * request and if so, reserves the appropriate anonymous memory resources. + * anon_checkspace just checks to see if there is space to fulfill the request, + * without taking any resources. Both return 1 if successful and 0 if not. + */ +#define anon_resv(size) anon_resvmem((size), 1) +#define anon_checkspace(size) anon_resvmem((size), 0) + +/* + * Flags to anon_private + */ +#define STEAL_PAGE 0x1 /* page can be stolen */ +#define LOCK_PAGE 0x2 /* page must be ``logically'' locked */ + +/* + * Flags to anon_disclaim + */ +#define ANON_PGLOOKUP_BLK 0x1 /* block on locked pages */ + +/* + * SEGKP ANON pages that are locked are assumed to be LWP stack pages + * and thus count towards the user pages locked count. + * This value is protected by the same lock as availrmem. + */ +extern pgcnt_t anon_segkp_pages_locked; + +extern int anon_debug; + +#ifdef ANON_DEBUG + +#define A_ANON 0x01 +#define A_RESV 0x02 +#define A_MRESV 0x04 + +/* vararg-like debugging macro. */ +#define ANON_PRINT(f, printf_args) \ + if (anon_debug & f) \ + printf printf_args + +#else /* ANON_DEBUG */ + +#define ANON_PRINT(f, printf_args) + +#endif /* ANON_DEBUG */ + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_ANON_H */ diff --git a/usr/src/uts/common/vm/as.h b/usr/src/uts/common/vm/as.h new file mode 100644 index 0000000000..c7afefc23c --- /dev/null +++ b/usr/src/uts/common/vm/as.h @@ -0,0 +1,290 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. 
+ */ + +#ifndef _VM_AS_H +#define _VM_AS_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/watchpoint.h> +#include <vm/seg.h> +#include <vm/faultcode.h> +#include <vm/hat.h> +#include <sys/avl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * VM - Address spaces. + */ + +/* + * Each address space consists of a sorted list of segments + * and machine dependent address translation information. + * + * All the hard work is in the segment drivers and the + * hardware address translation code. + * + * The segment list is represented as an AVL tree. + * + * The address space lock (a_lock) is a long term lock which serializes + * access to certain operations (as_map, as_unmap) and protects the + * underlying generic segment data (seg.h) along with some fields in the + * address space structure as shown below: + * + * address space structure segment structure + * + * a_segtree s_base + * a_size s_size + * a_lastgap s_link + * a_seglast s_ops + * s_as + * s_data + * + * The address space contents lock (a_contents) is a short term + * lock that protects most of the data in the address space structure. + * This lock is always acquired after the "a_lock" in all situations + * except while dealing with AS_CLAIMGAP to avoid deadlocks. + * + * The following fields are protected by this lock: + * + * a_flags (AS_PAGLCK, AS_CLAIMGAP, etc.) + * a_unmapwait + * a_seglast + * + * The address space lock (a_lock) is always held prior to any segment + * operation. Some segment drivers use the address space lock to protect + * some or all of their segment private data, provided the version of + * "a_lock" (read vs. write) is consistent with the use of the data. + * + * The following fields are protected by the hat layer lock: + * + * a_vbits + * a_hat + * a_hrm + */ + +struct as { + kmutex_t a_contents; /* protect certain fields in the structure */ + uchar_t a_flags; /* as attributes */ + uchar_t a_vbits; /* used for collecting statistics */ + kcondvar_t a_cv; /* used by as_rangelock */ + struct hat *a_hat; /* hat structure */ + struct hrmstat *a_hrm; /* ref and mod bits */ + caddr_t a_userlimit; /* highest allowable address in this as */ + struct seg *a_seglast; /* last segment hit on the addr space */ + krwlock_t a_lock; /* protects segment related fields */ + size_t a_size; /* size of address space */ + struct seg *a_lastgap; /* last seg found by as_gap() w/ AS_HI (mmap) */ + struct seg *a_lastgaphl; /* last seg saved in as_gap() either for */ + /* AS_HI or AS_LO used in as_addseg() */ + avl_tree_t a_segtree; /* segments in this address space. 
(AVL tree) */ + avl_tree_t a_wpage; /* watched pages (procfs) */ + uchar_t a_updatedir; /* mappings changed, rebuild a_objectdir */ + timespec_t a_updatetime; /* time when mappings last changed */ + vnode_t **a_objectdir; /* object directory (procfs) */ + size_t a_sizedir; /* size of object directory */ + struct as_callback *a_callbacks; /* callback list */ + void *a_xhat; /* list of xhat providers */ +}; + +#define AS_PAGLCK 0x80 +#define AS_CLAIMGAP 0x40 +#define AS_UNMAPWAIT 0x20 +#define AS_NEEDSPURGE 0x10 /* mostly for seg_nf, see as_purge() */ +#define AS_BUSY 0x01 /* needed by XHAT framework */ + +#define AS_ISPGLCK(as) ((as)->a_flags & AS_PAGLCK) +#define AS_ISCLAIMGAP(as) ((as)->a_flags & AS_CLAIMGAP) +#define AS_ISUNMAPWAIT(as) ((as)->a_flags & AS_UNMAPWAIT) +#define AS_ISBUSY(as) ((as)->a_flags & AS_BUSY) + + +#define AS_SETPGLCK(as) ((as)->a_flags |= AS_PAGLCK) +#define AS_SETCLAIMGAP(as) ((as)->a_flags |= AS_CLAIMGAP) +#define AS_SETUNMAPWAIT(as) ((as)->a_flags |= AS_UNMAPWAIT) +#define AS_SETBUSY(as) ((as)->a_flags |= AS_BUSY) + +#define AS_CLRPGLCK(as) ((as)->a_flags &= ~AS_PAGLCK) +#define AS_CLRCLAIMGAP(as) ((as)->a_flags &= ~AS_CLAIMGAP) +#define AS_CLRUNMAPWAIT(as) ((as)->a_flags &= ~AS_UNMAPWAIT) +#define AS_CLRBUSY(as) ((as)->a_flags &= ~AS_BUSY) + +#define AS_TYPE_64BIT(as) \ + (((as)->a_userlimit > (caddr_t)UINT32_MAX) ? 1 : 0) + +/* + * The as_callback is the basic structure which supports the ability to + * inform clients of specific events pertaining to address space management. + * A user calls as_add_callback to register an address space callback + * for a range of pages, specifying the events that need to occur. + * When as_do_callbacks is called and finds a 'matching' entry, the + * callback is called once, and the callback function MUST call + * as_delete_callback when all callback activities are complete. + * The thread calling as_do_callbacks blocks until the as_delete_callback + * is called. This allows for asynchorous events to subside before the + * as_do_callbacks thread continues. + * + * An example of the need for this is a driver which has done long-term + * locking of memory. Address space management operations (events) such + * as as_free, as_umap, and as_setprot will block indefinitely until the + * pertinent memory is unlocked. The callback mechanism provides the + * way to inform the driver of the event so that the driver may do the + * necessary unlocking. + * + * The contents of this structure is protected by a_contents lock + */ +typedef void (*callback_func_t)(struct as *, void *, uint_t); +struct as_callback { + struct as_callback *ascb_next; /* list link */ + uint_t ascb_events; /* event types */ + callback_func_t ascb_func; /* callback function */ + void *ascb_arg; /* callback argument */ + caddr_t ascb_saddr; /* start address */ + size_t ascb_len; /* address range */ +}; +/* + * Callback events + */ +#define AS_FREE_EVENT 0x1 +#define AS_SETPROT_EVENT 0x2 +#define AS_UNMAP_EVENT 0x4 +#define AS_CALLBACK_CALLED ((uint_t)(1U << (8 * sizeof (uint_t) - 1U))) +#define AS_UNMAPWAIT_EVENT \ + (AS_FREE_EVENT | AS_SETPROT_EVENT | AS_UNMAP_EVENT) +#define AS_ALL_EVENT \ + (AS_FREE_EVENT | AS_SETPROT_EVENT | AS_UNMAP_EVENT) + + +/* Return code values for as_callback_delete */ +enum as_cbdelete_rc { + AS_CALLBACK_DELETED, + AS_CALLBACK_NOTFOUND, + AS_CALLBACK_DELETE_DEFERRED +}; + +#ifdef _KERNEL + +/* + * Flags for as_gap. 
+ */ +#define AH_DIR 0x1 /* direction flag mask */ +#define AH_LO 0x0 /* find lowest hole */ +#define AH_HI 0x1 /* find highest hole */ +#define AH_CONTAIN 0x2 /* hole must contain `addr' */ + +extern struct as kas; /* kernel's address space */ + +/* + * Macros for address space locking. + */ +#define AS_LOCK_ENTER(as, lock, type) rw_enter((lock), (type)) +#define AS_LOCK_EXIT(as, lock) rw_exit((lock)) +#define AS_LOCK_DESTROY(as, lock) rw_destroy((lock)) +#define AS_LOCK_TRYENTER(as, lock, type) rw_tryenter((lock), (type)) + +/* + * Macros to test lock states. + */ +#define AS_LOCK_HELD(as, lock) RW_LOCK_HELD((lock)) +#define AS_READ_HELD(as, lock) RW_READ_HELD((lock)) +#define AS_WRITE_HELD(as, lock) RW_WRITE_HELD((lock)) + +/* + * macros to walk thru segment lists + */ +#define AS_SEGFIRST(as) avl_first(&(as)->a_segtree) +#define AS_SEGNEXT(as, seg) AVL_NEXT(&(as)->a_segtree, (seg)) +#define AS_SEGPREV(as, seg) AVL_PREV(&(as)->a_segtree, (seg)) + +void as_init(void); +void as_avlinit(struct as *); +struct seg *as_segat(struct as *as, caddr_t addr); +void as_rangelock(struct as *as); +void as_rangeunlock(struct as *as); +struct as *as_alloc(void); +void as_free(struct as *as); +int as_dup(struct as *as, struct as **outas); +struct seg *as_findseg(struct as *as, caddr_t addr, int tail); +int as_addseg(struct as *as, struct seg *newseg); +struct seg *as_removeseg(struct as *as, struct seg *seg); +faultcode_t as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size, + enum fault_type type, enum seg_rw rw); +faultcode_t as_faulta(struct as *as, caddr_t addr, size_t size); +int as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot); +int as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot); +int as_unmap(struct as *as, caddr_t addr, size_t size); +int as_map(struct as *as, caddr_t addr, size_t size, int ((*crfp)()), + void *argsp); +void as_purge(struct as *as); +int as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, + uint_t flags, caddr_t addr); +int as_memory(struct as *as, caddr_t *basep, size_t *lenp); +size_t as_swapout(struct as *as); +int as_incore(struct as *as, caddr_t addr, size_t size, char *vec, + size_t *sizep); +int as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr, + uintptr_t arg, ulong_t *lock_map, size_t pos); +int as_exec(struct as *oas, caddr_t ostka, size_t stksz, + struct as *nas, caddr_t nstka, uint_t hatflag); +int as_pagelock(struct as *as, struct page ***ppp, caddr_t addr, + size_t size, enum seg_rw rw); +void as_pageunlock(struct as *as, struct page **pp, caddr_t addr, + size_t size, enum seg_rw rw); +void as_pagereclaim(struct as *as, struct page **pp, caddr_t addr, + size_t size, enum seg_rw rw); +int as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc, + boolean_t wait); +void as_setwatch(struct as *as); +void as_clearwatch(struct as *as); +int as_getmemid(struct as *, caddr_t, memid_t *); + +int as_add_callback(struct as *, void (*)(), void *, uint_t, + caddr_t, size_t, int); +uint_t as_delete_callback(struct as *, void *); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_AS_H */ diff --git a/usr/src/uts/common/vm/faultcode.h b/usr/src/uts/common/vm/faultcode.h new file mode 100644 index 0000000000..82f886e00f --- /dev/null +++ b/usr/src/uts/common/vm/faultcode.h @@ -0,0 +1,76 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the 
"License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 1992 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#ifndef _VM_FAULTCODE_H +#define _VM_FAULTCODE_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * This file describes the data type returned by vm routines + * which handle faults. + * + * If FC_CODE(fc) == FC_OBJERR, then FC_ERRNO(fc) contains the errno value + * returned by the underlying object mapped at the fault address. + */ +#define FC_HWERR 0x1 /* misc hardware error (e.g. bus timeout) */ +#define FC_ALIGN 0x2 /* hardware alignment error */ +#define FC_OBJERR 0x3 /* underlying object returned errno value */ +#define FC_PROT 0x4 /* access exceeded current protections */ +#define FC_NOMAP 0x5 /* no mapping at the fault address */ +#define FC_NOSUPPORT 0x6 /* operation not supported by driver */ + +#define FC_MAKE_ERR(e) (((e) << 8) | FC_OBJERR) + +#define FC_CODE(fc) ((fc) & 0xff) +#define FC_ERRNO(fc) ((unsigned)(fc) >> 8) + +#ifndef _ASM +typedef int faultcode_t; /* type returned by vm fault routines */ +#endif /* _ASM */ + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_FAULTCODE_H */ diff --git a/usr/src/uts/common/vm/hat.c b/usr/src/uts/common/vm/hat.c new file mode 100644 index 0000000000..24d6e50b1a --- /dev/null +++ b/usr/src/uts/common/vm/hat.c @@ -0,0 +1,149 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/kmem.h> +#include <sys/sysmacros.h> +#include <sys/cmn_err.h> +#include <sys/systm.h> +#include <sys/modctl.h> +#include <sys/kobj.h> +#include <vm/hat.h> + +/* + * PSARC 2004/405 made hat_getkpfnum(9F) obsolete. As part of the + * obsolecense, the original documented behavior will begin to be + * enforced in the future; namely, hat_getkpfnum(9F) may _only_ + * be called with device-mapped memory virtual addresses. Since + * changing hat_getkpfnum(9F) to return PFN_INVALID on kernel memory + * would break a lot of modules without any warning, we've implemented + * the following mechanism as a stop-gap. In a future release, this + * can all be ripped out and hat_getkpfnum(9F) changed to return + * PFN_INVALID if it isn't called with a device-mapped memory address. + * + * We keep track of each module that has used hat_getkpfnum(9F) + * incorrectly. This allows us to avoid flooding the console/logs + * with too many warnings about a bad module that has already been + * flagged. + * + * On amd64 hat_getkpfnum() is never supported. + */ + +#if !defined(__amd64) + +#define HAT_STACK_MAXDEPTH 15 + +struct badcall_node { + char *bc_modname; + int bc_stackdepth; + pc_t bc_callstack[HAT_STACK_MAXDEPTH]; + struct badcall_node *bc_linkage; +}; + +static struct badcall_node *bad_getkpfnum_callers; + +/* + * Common VM HAT routines. + */ + +static void +printwarn(struct badcall_node *bc) +{ + int sf; + char *ksym; + ulong_t off; + + cmn_err(CE_WARN, "Module %s is using the obsolete hat_getkpfnum(9F)", + bc->bc_modname); + cmn_err(CE_CONT, "interface in a way that will not be supported in\n"); + cmn_err(CE_CONT, "a future release of Solaris. Please contact the\n"); + cmn_err(CE_CONT, "vendor that supplied the module for assistance,\n"); + cmn_err(CE_CONT, "or consult the Writing Device Drivers guide,\n"); + cmn_err(CE_CONT, "available from http://www.sun.com for migration\n"); + cmn_err(CE_CONT, "advice.\n"); + cmn_err(CE_CONT, "---\n"); + cmn_err(CE_CONT, "Callstack of bad caller:\n"); + + for (sf = 0; sf < bc->bc_stackdepth; sf++) { + ksym = kobj_getsymname(bc->bc_callstack[sf], &off); + cmn_err(CE_CONT, "\t%s+%lx\n", ksym? ksym : "?", off); + } +} + + +void +hat_getkpfnum_badcall(void *caller) +{ + struct badcall_node bcs; + char *modname = mod_containing_pc((caddr_t)caller); + struct badcall_node *bc; + +#ifdef __sparc + /* + * This is a hack until the ifb and jfb framebuffer drivers + * are fixed. Right now they use hat_getkpfnum() in a way that + * is really safe but will be incorrectly flagged as being + * buggy. + */ + if (strcmp(modname, "ifb") == 0 || strcmp(modname, "jfb") == 0) + return; +#elif defined(__i386) + /* + * This is a hack until these ethernet drivers can be fixed + * or EOL'd. hat_getkpfnum() will continue to work correctly + * until this list can be removed. + */ + if (strcmp(modname, "dnet") == 0 || strcmp(modname, "pcn") == 0 || + strcmp(modname, "adp") == 0 || strcmp(modname, "chs") == 0) + return; +#endif /* __sparc / __i386 */ + + for (bc = bad_getkpfnum_callers; bc != NULL; bc = bc->bc_linkage) + if (strcmp(bc->bc_modname, modname) == 0) + return; + + /* + * We haven't seen this caller before, so create a log of + * the callstack and module name, and emit a warning to the + * user. 
+ */ + bc = kmem_zalloc(sizeof (struct badcall_node), KM_NOSLEEP); + if (bc != NULL) { + bc->bc_linkage = bad_getkpfnum_callers; + bc->bc_modname = modname; + bad_getkpfnum_callers = bc; + } else { + bc = &bcs; + bc->bc_modname = modname; + } + + bc->bc_stackdepth = getpcstack(bc->bc_callstack, HAT_STACK_MAXDEPTH); + + printwarn(bc); +} +#endif /* __amd64 */ diff --git a/usr/src/uts/common/vm/hat.h b/usr/src/uts/common/vm/hat.h new file mode 100644 index 0000000000..b873f4e06e --- /dev/null +++ b/usr/src/uts/common/vm/hat.h @@ -0,0 +1,598 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#ifndef _VM_HAT_H +#define _VM_HAT_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/t_lock.h> +#include <vm/faultcode.h> +#include <sys/kstat.h> +#include <sys/siginfo.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * VM - Hardware Address Translation management. + * + * This file describes the machine independent interfaces to + * the hardware address translation management routines. Other + * machine specific interfaces and structures are defined + * in <vm/hat_xxx.h>. The hat layer manages the address + * translation hardware as a cache driven by calls from the + * higher levels of the VM system. + */ + +struct hat; +struct kpme; +struct memseg; + +#include <vm/page.h> + +/* + * a callback used with hat_unload_callback() + * start and end mark are set to a range of unloaded addresses + * and the function is invoked with a pointer to this data structure + */ +typedef struct hat_callback { + caddr_t hcb_start_addr; + caddr_t hcb_end_addr; + void (*hcb_function)(struct hat_callback *); + void *hcb_data; +} hat_callback_t; + +#ifdef _KERNEL + +/* + * One time hat initialization + */ +void hat_init(void); + +/* + * Notify hat of a system dump + */ +void hat_dump(void); + +/* + * Operations on an address space: + * + * struct hat *hat_alloc(as) + * allocated a hat structure for as. + * + * void hat_free_start(hat) + * informs hat layer process has finished executing but as has not + * been cleaned up yet. + * + * void hat_free_end(hat) + * informs hat layer as is being destroyed. hat layer cannot use as + * pointer after this call. 
+ * + * void hat_swapin(hat) + * allocate any hat resources required for process being swapped in. + * + * void hat_swapout(hat) + * deallocate hat resources for process being swapped out. + * + * size_t hat_get_mapped_size(hat) + * returns number of bytes that have valid mappings in hat. + * + * void hat_stats_enable(hat) + * void hat_stats_disable(hat) + * enables/disables collection of stats for hat. + * + * int hat_dup(parenthat, childhat, addr, len, flags) + * Duplicate address translations of the parent to the child. Supports + * the entire address range or a range depending on flag, + * zero returned on success, non-zero on error + * + * void hat_thread_exit(thread) + * Notifies the HAT that a thread is exiting, called after it has been + * reassigned to the kernel AS. + */ + +struct hat *hat_alloc(struct as *); +void hat_free_start(struct hat *); +void hat_free_end(struct hat *); +int hat_dup(struct hat *, struct hat *, caddr_t, size_t, uint_t); +void hat_swapin(struct hat *); +void hat_swapout(struct hat *); +size_t hat_get_mapped_size(struct hat *); +int hat_stats_enable(struct hat *); +void hat_stats_disable(struct hat *); +void hat_thread_exit(kthread_t *); + +/* + * Operations on a named address within a segment: + * + * void hat_memload(hat, addr, pp, attr, flags) + * load/lock the given page struct + * + * void hat_memload_array(hat, addr, len, ppa, attr, flags) + * load/lock the given array of page structs + * + * void hat_devload(hat, addr, len, pf, attr, flags) + * load/lock the given page frame number + * + * void hat_unlock(hat, addr, len) + * unlock a given range of addresses + * + * void hat_unload(hat, addr, len, flags) + * void hat_unload_callback(hat, addr, len, flags, callback) + * unload a given range of addresses (has optional callback) + * + * void hat_sync(hat, addr, len, flags) + * synchronize mapping with software data structures + * + * void hat_map(hat, addr, len, flags) + * + * void hat_setattr(hat, addr, len, attr) + * void hat_clrattr(hat, addr, len, attr) + * void hat_chgattr(hat, addr, len, attr) + * modify attributes for a range of addresses. skips any invalid mappings + * + * uint_t hat_getattr(hat, addr, *attr) + * returns attr for <hat,addr> in *attr. returns 0 if there was a + * mapping and *attr is valid, nonzero if there was no mapping and + * *attr is not valid. + * + * size_t hat_getpagesize(hat, addr) + * returns pagesize in bytes for <hat, addr>. returns -1 if there is + * no mapping. This is an advisory call. + * + * pfn_t hat_getpfnum(hat, addr) + * returns pfn for <hat, addr> or PFN_INVALID if mapping is invalid. + * + * pfn_t hat_getkpfnum(addr) + * returns pfn for non-memory mapped addr in kernel address space + * or PFN_INVALID if mapping is invalid or is kernel memory. + * + * int hat_probe(hat, addr) + * return 0 if no valid mapping is present. Faster version + * of hat_getattr in certain architectures. + * + * int hat_share(dhat, daddr, shat, saddr, len, szc) + * + * void hat_unshare(hat, addr, len, szc) + * + * void hat_chgprot(hat, addr, len, vprot) + * This is a deprecated call. New segment drivers should store + * all attributes and use hat_*attr calls. + * Change the protections in the virtual address range + * given to the specified virtual protection. If vprot is ~PROT_WRITE, + * then remove write permission, leaving the other permissions + * unchanged. If vprot is ~PROT_USER, remove user permissions. 
+ */ + +void hat_memload(struct hat *, caddr_t, struct page *, uint_t, uint_t); +void hat_memload_array(struct hat *, caddr_t, size_t, struct page **, + uint_t, uint_t); + +void hat_devload(struct hat *, caddr_t, size_t, pfn_t, uint_t, int); +void hat_unlock(struct hat *, caddr_t, size_t); +void hat_unload(struct hat *, caddr_t, size_t, uint_t); +void hat_unload_callback(struct hat *, caddr_t, size_t, uint_t, + hat_callback_t *); +void hat_sync(struct hat *, caddr_t, size_t, uint_t); +void hat_map(struct hat *, caddr_t, size_t, uint_t); +void hat_setattr(struct hat *, caddr_t, size_t, uint_t); +void hat_clrattr(struct hat *, caddr_t, size_t, uint_t); +void hat_chgattr(struct hat *, caddr_t, size_t, uint_t); +uint_t hat_getattr(struct hat *, caddr_t, uint_t *); +ssize_t hat_getpagesize(struct hat *, caddr_t); +pfn_t hat_getpfnum(struct hat *, caddr_t); +int hat_probe(struct hat *, caddr_t); +int hat_share(struct hat *, caddr_t, struct hat *, caddr_t, size_t, uint_t); +void hat_unshare(struct hat *, caddr_t, size_t, uint_t); +void hat_chgprot(struct hat *, caddr_t, size_t, uint_t); +void hat_reserve(struct as *, caddr_t, size_t); +pfn_t va_to_pfn(void *); +uint64_t va_to_pa(void *); + +/* + * hat_getkpfnum() is never supported on amd64 and will be + * removed from other platforms in future release + */ +#if !defined(__amd64) +pfn_t hat_getkpfnum(caddr_t); +#endif + + +/* + * Kernel Physical Mapping (segkpm) hat interface routines. + */ +caddr_t hat_kpm_mapin(struct page *, struct kpme *); +void hat_kpm_mapout(struct page *, struct kpme *, caddr_t); +caddr_t hat_kpm_page2va(struct page *, int); +struct page *hat_kpm_vaddr2page(caddr_t); +int hat_kpm_fault(struct hat *, caddr_t); +void hat_kpm_mseghash_clear(int); +void hat_kpm_mseghash_update(pgcnt_t, struct memseg *); +void hat_kpm_addmem_mseg_update(struct memseg *, pgcnt_t, offset_t); +void hat_kpm_addmem_mseg_insert(struct memseg *); +void hat_kpm_addmem_memsegs_update(struct memseg *); +caddr_t hat_kpm_mseg_reuse(struct memseg *); +void hat_kpm_delmem_mseg_update(struct memseg *, struct memseg **); +void hat_kpm_split_mseg_update(struct memseg *, struct memseg **, + struct memseg *, struct memseg *, struct memseg *); +void hat_kpm_walk(void (*)(void *, void *, size_t), void *); + +/* + * Operations on all translations for a given page(s) + * + * void hat_page_setattr(pp, flag) + * void hat_page_clrattr(pp, flag) + * used to set/clr red/mod bits. + * + * uint hat_page_getattr(pp, flag) + * If flag is specified, returns 0 if attribute is disabled + * and non zero if enabled. If flag specifes multiple attributs + * then returns 0 if ALL atriibutes are disabled. This is an advisory + * call. + * + * int hat_pageunload(pp, forceflag) + * unload all translations attached to pp. + * + * uint_t hat_pagesync(pp, flags) + * get hw stats from hardware into page struct and reset hw stats + * returns attributes of page + * + * ulong_t hat_page_getshare(pp) + * returns approx number of mappings to this pp. A return of 0 implies + * there are no mappings to the page. + * + * faultcode_t hat_softlock(hat, addr, lenp, ppp, flags); + * called to softlock pages for zero copy tcp + * + * void hat_page_demote(pp); + * unload all large mappings to pp and decrease p_szc of all + * constituent pages according to the remaining mappings. 
+ */ + +void hat_page_setattr(struct page *, uint_t); +void hat_page_clrattr(struct page *, uint_t); +uint_t hat_page_getattr(struct page *, uint_t); +int hat_pageunload(struct page *, uint_t); +uint_t hat_pagesync(struct page *, uint_t); +ulong_t hat_page_getshare(struct page *); +faultcode_t hat_softlock(struct hat *, caddr_t, size_t *, + struct page **, uint_t); +void hat_page_demote(struct page *); + +/* + * Rountine to expose supported HAT features to PIM. + */ +enum hat_features { + HAT_SHARED_PT, /* Shared page tables */ + HAT_DYNAMIC_ISM_UNMAP, /* hat_pageunload() handles ISM pages */ + HAT_VMODSORT /* support for VMODSORT flag of vnode */ +}; + +int hat_supported(enum hat_features, void *); + +/* + * Services provided to the hat: + * + * void as_signal_proc(as, siginfo) + * deliver signal to all processes that have this as. + * + * int hat_setstat(as, addr, len, rmbits) + * informs hatstat layer that ref/mod bits need to be updated for + * address range. Returns 0 on success, 1 for failure. + */ +void as_signal_proc(struct as *, k_siginfo_t *siginfo); +void hat_setstat(struct as *, caddr_t, size_t, uint_t); + +/* + * Flags to pass to hat routines. + * + * Certain flags only apply to some interfaces: + * + * HAT_LOAD Default flags to load a translation to the page. + * HAT_LOAD_LOCK Lock down mapping resources; hat_map(), hat_memload(), + * and hat_devload(). + * HAT_LOAD_ADV Advisory load - Load translation if and only if + * sufficient MMU resources exist (i.e., do not steal). + * HAT_LOAD_SHARE A flag to hat_memload() to indicate h/w page tables + * that map some user pages (not kas) is shared by more + * than one process (eg. ISM). + * HAT_LOAD_CONTIG Pages are contigous + * HAT_LOAD_NOCONSIST Do not add mapping to mapping list. + * HAT_LOAD_REMAP Reload a valid pte with a different page frame. + * HAT_RELOAD_SHARE Reload a shared page table entry. Some platforms + * may require different actions than on the first + * load of a shared mapping. + * HAT_NO_KALLOC Do not kmem_alloc while creating the mapping; at this + * point, it's setting up mapping to allocate internal + * hat layer data structures. This flag forces hat layer + * to tap its reserves in order to prevent infinite + * recursion. + * HAT_LOAD_AUTOLPG Get MMU specific disable_auto_large_pages + */ + +/* + * Flags for hat_memload/hat_devload + */ +#define HAT_FLAGS_RESV 0xFF000000 /* resv for hat impl */ +#define HAT_LOAD 0x00 +#define HAT_LOAD_LOCK 0x01 +#define HAT_LOAD_ADV 0x04 +#define HAT_LOAD_CONTIG 0x10 +#define HAT_LOAD_NOCONSIST 0x20 +#define HAT_LOAD_SHARE 0x40 +#define HAT_LOAD_REMAP 0x80 +#define HAT_RELOAD_SHARE 0x100 +#define HAT_NO_KALLOC 0x200 +#define HAT_LOAD_TEXT 0x400 +#define HAT_LOAD_AUTOLPG 0x800 + +/* + * Attributes for hat_memload/hat_devload/hat_*attr + * are a superset of prot flags defined in mman.h. + */ +#define HAT_PLAT_ATTR_MASK 0xF00000 +#define HAT_PROT_MASK 0x0F + +#define HAT_NOFAULT 0x10 +#define HAT_NOSYNC 0x20 + +/* + * Advisory ordering attributes. Apply only to device mappings. + * + * HAT_STRICTORDER: the CPU must issue the references in order, as the + * programmer specified. This is the default. + * HAT_UNORDERED_OK: the CPU may reorder the references (this is all kinds + * of reordering; store or load with store or load). 
+ * HAT_MERGING_OK: merging and batching: the CPU may merge individual stores + * to consecutive locations (for example, turn two consecutive byte + * stores into one halfword store), and it may batch individual loads + * (for example, turn two consecutive byte loads into one halfword load). + * This also implies re-ordering. + * HAT_LOADCACHING_OK: the CPU may cache the data it fetches and reuse it + * until another store occurs. The default is to fetch new data + * on every load. This also implies merging. + * HAT_STORECACHING_OK: the CPU may keep the data in the cache and push it to + * the device (perhaps with other data) at a later time. The default is + * to push the data right away. This also implies load caching. + */ +#define HAT_STRICTORDER 0x0000 +#define HAT_UNORDERED_OK 0x0100 +#define HAT_MERGING_OK 0x0200 +#define HAT_LOADCACHING_OK 0x0300 +#define HAT_STORECACHING_OK 0x0400 +#define HAT_ORDER_MASK 0x0700 + +/* endian attributes */ +#define HAT_NEVERSWAP 0x0000 +#define HAT_STRUCTURE_BE 0x1000 +#define HAT_STRUCTURE_LE 0x2000 +#define HAT_ENDIAN_MASK 0x3000 + +/* flags for hat_softlock */ +#define HAT_COW 0x0001 + +/* + * Flags for hat_unload + */ +#define HAT_UNLOAD 0x00 +#define HAT_UNLOAD_NOSYNC 0x02 +#define HAT_UNLOAD_UNLOCK 0x04 +#define HAT_UNLOAD_OTHER 0x08 +#define HAT_UNLOAD_UNMAP 0x10 + +/* + * Flags for hat_pagesync, hat_getstat, hat_sync + */ +#define HAT_SYNC_DONTZERO 0x00 +#define HAT_SYNC_ZERORM 0x01 +/* Additional flags for hat_pagesync */ +#define HAT_SYNC_STOPON_REF 0x02 +#define HAT_SYNC_STOPON_MOD 0x04 +#define HAT_SYNC_STOPON_RM (HAT_SYNC_STOPON_REF | HAT_SYNC_STOPON_MOD) +#define HAT_SYNC_STOPON_SHARED 0x08 + +/* + * Flags for hat_dup + * + * HAT_DUP_ALL dup entire address space + * HAT_DUP_COW dup plus hat_clrattr(..PROT_WRITE) on newas + */ +#define HAT_DUP_ALL 1 +#define HAT_DUP_COW 2 + + +/* + * Flags for hat_map + */ +#define HAT_MAP 0x00 + +/* + * Flag for hat_pageunload + */ +#define HAT_ADV_PGUNLOAD 0x00 +#define HAT_FORCE_PGUNLOAD 0x01 + +/* + * Attributes for hat_page_*attr, hat_setstats and + * returned by hat_pagesync. + */ +#define P_MOD 0x1 /* the modified bit */ +#define P_REF 0x2 /* the referenced bit */ +#define P_RO 0x4 /* Read only page */ + +#define hat_ismod(pp) (hat_page_getattr(pp, P_MOD)) +#define hat_isref(pp) (hat_page_getattr(pp, P_REF)) +#define hat_isro(pp) (hat_page_getattr(pp, P_RO)) + +#define hat_setmod(pp) (hat_page_setattr(pp, P_MOD)) +#define hat_setref(pp) (hat_page_setattr(pp, P_REF)) +#define hat_setrefmod(pp) (hat_page_setattr(pp, P_REF|P_MOD)) + +#define hat_clrmod(pp) (hat_page_clrattr(pp, P_MOD)) +#define hat_clrref(pp) (hat_page_clrattr(pp, P_REF)) +#define hat_clrrefmod(pp) (hat_page_clrattr(pp, P_REF|P_MOD)) + +#define hat_page_is_mapped(pp) (hat_page_getshare(pp)) + +/* + * hat_setup is being used in sparc/os/sundep.c + */ +void hat_setup(struct hat *, int); + +/* + * Flags for hat_setup + */ +#define HAT_DONTALLOC 0 +#define HAT_ALLOC 1 +#define HAT_INIT 2 + +/* + * Other routines, for statistics + */ +int hat_startstat(struct as *); +void hat_getstat(struct as *, caddr_t, size_t, uint_t, char *, int); +void hat_freestat(struct as *, int); +void hat_resvstat(size_t, struct as *, caddr_t); + +/* + * Transitionary routine while we still allow hat_getkpfnum(caddr_t) + * to return a pfn for kernel memory, but want to warn the user that + * it isn't supported. + */ +void hat_getkpfnum_badcall(void *caller); + +/* + * Relocation callback routines. Currently only sfmmu HAT supports + * these. 
+ */ +extern int hat_add_callback(id_t, caddr_t, uint_t, uint_t, void *, + pfn_t *); +extern id_t hat_register_callback( + int (*prehandler)(caddr_t, uint_t, uint_t, void *), + int (*posthandler)(caddr_t, uint_t, uint_t, void *, pfn_t), + int (*errhandler)(caddr_t, uint_t, uint_t, void *), int); +extern void hat_delete_callback(caddr_t, uint_t, void *, uint_t); + +/* + * hat_add_callback()/hat_delete_callback() flags. + */ +#define HAC_NOSLEEP 0x0 +#define HAC_SLEEP 0x1 +#define HAC_PAGELOCK 0x2 + +/* + * Suspend/unsuspend handler callback arguments. + */ +#define HAT_SUSPEND 0x0010 +#define HAT_UNSUSPEND 0x0010 +#define HAT_PRESUSPEND 0x0020 +#define HAT_POSTUNSUSPEND 0x0020 + +/* + * Error handler callback arguments. See the block comments + * before the implementation of hat_add_callback() for an + * explanation of what these mean. + */ +#define HAT_CB_ERR_LEAKED 0x1 + +#endif /* _KERNEL */ + +/* + * The size of the bit array for ref and mod bit storage must be a power of 2. + * 2 bits are collected for each page. Below the power used is 4, + * which is 16 8-bit characters = 128 bits, ref and mod bit information + * for 64 pages. + */ +#define HRM_SHIFT 4 +#define HRM_BYTES (1 << HRM_SHIFT) +#define HRM_PAGES ((HRM_BYTES * NBBY) / 2) +#define HRM_PGPERBYTE (NBBY/2) +#define HRM_PGBYTEMASK (HRM_PGPERBYTE-1) + +#define HRM_PGOFFMASK ((HRM_PGPERBYTE-1) << MMU_PAGESHIFT) +#define HRM_BASEOFFSET (((MMU_PAGESIZE * HRM_PAGES) - 1)) +#define HRM_BASEMASK (~(HRM_BASEOFFSET)) + +#define HRM_BASESHIFT (MMU_PAGESHIFT + (HRM_SHIFT + 2)) +#define HRM_PAGEMASK (MMU_PAGEMASK ^ HRM_BASEMASK) + +#define HRM_HASHSIZE 0x200 +#define HRM_HASHMASK (HRM_HASHSIZE - 1) + +#define HRM_BLIST_INCR 0x200 + +/* + * The structure for maintaining referenced and modified information + */ +struct hrmstat { + struct as *hrm_as; /* stat block belongs to this as */ + uintptr_t hrm_base; /* base of block */ + ushort_t hrm_id; /* opaque identifier, one of a_vbits */ + struct hrmstat *hrm_anext; /* as statistics block list */ + struct hrmstat *hrm_hnext; /* list for hashed blocks */ + uchar_t hrm_bits[HRM_BYTES]; /* the ref and mod bits */ +}; + +/* + * For global monitoring of the reference and modified bits + * of all address spaces we reserve one id bit. + */ +#define HRM_SWSMONID 1 + + +#ifdef _KERNEL + +/* + * Hat locking functions + * XXX - these two functions are currently being used by hatstats + * they can be removed by using a per-as mutex for hatstats. + */ +void hat_enter(struct hat *); +void hat_exit(struct hat *); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_HAT_H */ diff --git a/usr/src/uts/common/vm/hat_refmod.c b/usr/src/uts/common/vm/hat_refmod.c new file mode 100644 index 0000000000..1a812bd94f --- /dev/null +++ b/usr/src/uts/common/vm/hat_refmod.c @@ -0,0 +1,544 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * The following routines implement the hat layer's + * recording of the referenced and modified bits. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/debug.h> +#include <sys/kmem.h> + +/* + * Note, usage of cmn_err requires you not hold any hat layer locks. + */ +#include <sys/cmn_err.h> + +#include <vm/as.h> +#include <vm/hat.h> + +kmutex_t hat_statlock; /* protects all hat statistics data */ +struct hrmstat *hrm_memlist; /* tracks memory alloced for hrm_blist blocks */ +struct hrmstat **hrm_hashtab; /* hash table for finding blocks quickly */ +struct hrmstat *hrm_blist; +int hrm_blist_incr = HRM_BLIST_INCR; +int hrm_blist_lowater = HRM_BLIST_INCR/2; +int hrm_blist_num = 0; +int hrm_blist_total = 0; +int hrm_mlockinited = 0; +int hrm_allocfailmsg = 0; /* print a message when allocations fail */ +int hrm_allocfail = 0; + +static struct hrmstat *hrm_balloc(void); +static int hrm_init(void); +static void hrm_link(struct hrmstat *); +static void hrm_setbits(struct hrmstat *, caddr_t, uint_t); +static void hrm_hashout(struct hrmstat *); +static void hrm_getblk(int); + +#define hrm_hash(as, addr) \ + (HRM_HASHMASK & \ + (((uintptr_t)(addr) >> HRM_BASESHIFT) ^ ((uintptr_t)(as) >> 2))) + +#define hrm_match(hrm, as, addr) \ + (((hrm)->hrm_as == (as) && \ + ((hrm)->hrm_base == ((uintptr_t)(addr) & HRM_BASEMASK))) ? 1 : 0) + +/* + * reserve enough statistic blocks for + * chunk of bytes (pages) in a given as. + */ +/* ARGSUSED */ +void +hat_resvstat(size_t chunk, struct as *as, caddr_t addr) +{ + int nhrm = btop(chunk)/HRM_PAGES; + + if (nhrm < HRM_BLIST_INCR) + nhrm = 0; /* preallocate at least HRM_BLIST_INCR */ + hrm_getblk(nhrm); +} + +/* + * Start the statistics gathering for an address space. + * Return -1 if we can't do it, otherwise return an opaque + * identifier to be used when querying for the gathered statistics. + * The identifier is an unused bit in a_vbits. + * Bit 0 is reserved for swsmon. + */ +int +hat_startstat(struct as *as) +{ + uint_t nbits; /* number of bits */ + uint_t bn; /* bit number */ + uint_t id; /* new vbit, identifier */ + uint_t vbits; /* used vbits of address space */ + size_t chunk; /* mapped size for stats */ + /* + * Initialize global data, if needed. + */ + if (hrm_init() == -1) + return (-1); + + /* + * If the refmod saving memory allocator runs out, print + * a warning message about how to fix it, see comment at + * the beginning of hat_setstat. + */ + if (hrm_allocfailmsg) { + cmn_err(CE_WARN, + "hrm_balloc failures occured, increase hrm_blist_incr"); + hrm_allocfailmsg = 0; + } + + /* + * Verify that a buffer of statistics blocks exists + * and allocate more, if needed. + */ + + chunk = hat_get_mapped_size(as->a_hat); + chunk = (btop(chunk)/HRM_PAGES); + if (chunk < HRM_BLIST_INCR) + chunk = 0; + + hrm_getblk((int)chunk); + + /* + * Find a unused id in the given address space. 
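The search that follows walks a_vbits looking for an identifier bit that is still clear, skipping bit 0, which is reserved for swsmon. A user-space sketch of that scan (plain integers stand in for the as fields):

#include <stdio.h>

int
main(void)
{
	unsigned int vbits = 0x7;     /* bits 0x1, 0x2 and 0x4 already in use */
	unsigned int nbits = sizeof (vbits) * 8;
	unsigned int bn, id;

	/* start at bit 1 / id 2, exactly as the loop below does */
	for (bn = 1, id = 2; bn < (nbits - 1); bn++, id <<= 1)
		if ((id & vbits) == 0)
			break;

	if (bn >= (nbits - 1))
		printf("no free id\n");
	else
		printf("new id %#x (bit %u)\n", id, bn);   /* id 0x8, bit 3 */
	return (0);
}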
+ */ + hat_enter(as->a_hat); + vbits = as->a_vbits; + nbits = sizeof (as->a_vbits) * NBBY; + for (bn = 1, id = 2; bn < (nbits - 1); bn++, id <<= 1) + if ((id & vbits) == 0) + break; + if (bn >= (nbits - 1)) { + hat_exit(as->a_hat); + return (-1); + } + as->a_vbits |= id; + hat_exit(as->a_hat); + (void) hat_stats_enable(as->a_hat); + return (id); +} + +/* + * Record referenced and modified information for an address space. + * Rmbits is a word containing the referenced bit in bit position 1 + * and the modified bit in bit position 0. + * + * For current informational uses, one can rerun any program using + * this facility after modifying the hrm_blist_incr to be a larger + * amount so that a larger buffer of blocks will be maintained. + */ +void +hat_setstat(struct as *as, caddr_t addr, size_t len, uint_t rmbits) +{ + struct hrmstat *hrm; + uint_t vbits, newbits, nb; + int h; + + ASSERT(len == PAGESIZE); + ASSERT((rmbits & ~(P_MOD|P_REF)) == 0); + + if (rmbits == 0) + return; + + /* + * Initialize global data, if needed. + */ + if (hrm_init() == -1) + return; + + mutex_enter(&hat_statlock); + + /* + * Search the hash list for the as and addr we are looking for + * and set the ref and mod bits in every block that matches. + */ + vbits = 0; + h = hrm_hash(as, addr); + for (hrm = hrm_hashtab[h]; hrm; hrm = hrm->hrm_hnext) { + if (hrm_match(hrm, as, addr)) { + hrm_setbits(hrm, addr, rmbits); + vbits |= hrm->hrm_id; + } + } + + /* + * If we didn't find a block for all of the enabled + * vpages bits, then allocate and initialize a block + * for each bit that was not found. + */ + if (vbits != as->a_vbits) { + newbits = vbits ^ as->a_vbits; + while (newbits) { + if (ffs(newbits)) + nb = 1 << (ffs(newbits)-1); + hrm = (struct hrmstat *)hrm_balloc(); + if (hrm == NULL) { + hrm_allocfailmsg = 1; + hrm_allocfail++; + mutex_exit(&hat_statlock); + return; + } + hrm->hrm_as = as; + hrm->hrm_base = (uintptr_t)addr & HRM_BASEMASK; + hrm->hrm_id = nb; + hrm_link(hrm); + hrm_setbits(hrm, addr, rmbits); + newbits &= ~nb; + } + } + mutex_exit(&hat_statlock); +} + +/* + * Free the resources used to maintain the referenced and modified + * statistics for the virtual page view of an address space + * identified by id. + */ +void +hat_freestat(struct as *as, int id) +{ + struct hrmstat *hrm, *prev_ahrm; + + hat_stats_disable(as->a_hat); /* tell the hat layer to stop */ + hat_enter(as->a_hat); + if (id == 0) + as->a_vbits = 0; + else + as->a_vbits &= ~id; + + if ((hrm = as->a_hrm) == NULL) { + hat_exit(as->a_hat); + return; + } + hat_exit(as->a_hat); + + mutex_enter(&hat_statlock); + if (hrm_hashtab == NULL) { + /* can't happen? */ + mutex_exit(&hat_statlock); + return; + } + for (prev_ahrm = NULL; hrm; hrm = hrm->hrm_anext) { + if ((id == hrm->hrm_id) || (id == NULL)) { + + hrm_hashout(hrm); + hrm->hrm_hnext = hrm_blist; + hrm_blist = hrm; + hrm_blist_num++; + + if (prev_ahrm == NULL) + as->a_hrm = hrm->hrm_anext; + else + prev_ahrm->hrm_anext = hrm->hrm_anext; + + } else + prev_ahrm = hrm; + } + + /* + * If all statistics blocks are free, + * return the memory to the system. 
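hat_setstat() above peels the unhandled identifier bits off newbits one at a time with ffs(3C), allocating a statistics block for each. The bit manipulation in isolation:

#include <stdio.h>
#include <strings.h>    /* ffs() */

int
main(void)
{
	unsigned int newbits = 0x14;     /* ids 0x4 and 0x10 still unhandled */

	while (newbits) {
		unsigned int nb = 1U << (ffs(newbits) - 1);

		printf("allocate a block for id %#x\n", nb);
		newbits &= ~nb;          /* mark this id as handled */
	}
	return (0);
}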
+ */ + if (hrm_blist_num == hrm_blist_total) { + /* zero the block list since we are giving back its memory */ + hrm_blist = NULL; + hrm_blist_num = 0; + hrm_blist_total = 0; + while (hrm_memlist) { + hrm = hrm_memlist; + hrm_memlist = hrm->hrm_hnext; + kmem_free(hrm, hrm->hrm_base); + } + ASSERT(hrm_memlist == NULL); + kmem_free(hrm_hashtab, HRM_HASHSIZE * sizeof (char *)); + hrm_hashtab = NULL; + } + mutex_exit(&hat_statlock); +} + +/* + * Initialize any global state for the statistics handling. + * Hrm_lock protects the globally allocted memory: + * hrm_memlist and hrm_hashtab. + */ +static int +hrm_init(void) +{ + /* + * Alloacte the hashtable if it doesn't exist yet. + */ + mutex_enter(&hat_statlock); + if (hrm_hashtab == NULL) + hrm_hashtab = + kmem_zalloc(HRM_HASHSIZE * sizeof (char *), KM_SLEEP); + mutex_exit(&hat_statlock); + return (0); +} + +/* + * Grab memory for statistics gathering of the hat layer. + */ +static void +hrm_getblk(int chunk) +{ + struct hrmstat *hrm, *l; + int i; + int hrm_incr; + + mutex_enter(&hat_statlock); + if ((hrm_blist == NULL) || + (hrm_blist_num <= hrm_blist_lowater) || + chunk) { + + mutex_exit(&hat_statlock); + + hrm_incr = chunk? chunk : hrm_blist_incr; + hrm = kmem_zalloc(sizeof (struct hrmstat) * hrm_incr, KM_SLEEP); + hrm->hrm_base = sizeof (struct hrmstat) * hrm_incr; + + /* + * thread the allocated blocks onto a freelist + * using the first block to hold information for + * freeing them all later + */ + mutex_enter(&hat_statlock); + hrm->hrm_hnext = hrm_memlist; + hrm_memlist = hrm; + + hrm_blist_total += (hrm_incr - 1); + for (i = 1; i < hrm_incr; i++) { + l = &hrm[i]; + l->hrm_hnext = hrm_blist; + hrm_blist = l; + hrm_blist_num++; + } + } + mutex_exit(&hat_statlock); +} + +static void +hrm_hashin(struct hrmstat *hrm) +{ + int h; + + ASSERT(MUTEX_HELD(&hat_statlock)); + h = hrm_hash(hrm->hrm_as, hrm->hrm_base); + + hrm->hrm_hnext = hrm_hashtab[h]; + hrm_hashtab[h] = hrm; +} + +static void +hrm_hashout(struct hrmstat *hrm) +{ + struct hrmstat *list, **prev_hrm; + int h; + + ASSERT(MUTEX_HELD(&hat_statlock)); + h = hrm_hash(hrm->hrm_as, hrm->hrm_base); + list = hrm_hashtab[h]; + prev_hrm = &hrm_hashtab[h]; + + while (list) { + if (list == hrm) { + *prev_hrm = list->hrm_hnext; + return; + } + prev_hrm = &list->hrm_hnext; + list = list->hrm_hnext; + } +} + + +/* + * Link a statistic block into an address space and also put it + * on the hash list for future references. + */ +static void +hrm_link(struct hrmstat *hrm) +{ + struct as *as = hrm->hrm_as; + + ASSERT(MUTEX_HELD(&hat_statlock)); + hrm->hrm_anext = as->a_hrm; + as->a_hrm = hrm; + hrm_hashin(hrm); +} + +/* + * Allocate a block for statistics keeping. + * Returns NULL if blocks are unavailable. + */ +static struct hrmstat * +hrm_balloc(void) +{ + struct hrmstat *hrm; + + ASSERT(MUTEX_HELD(&hat_statlock)); + + hrm = hrm_blist; + if (hrm != NULL) { + hrm_blist = hrm->hrm_hnext; + hrm_blist_num--; + hrm->hrm_hnext = NULL; + } + return (hrm); +} + +/* + * Set the ref and mod bits for addr within statistics block hrm. + */ +static void +hrm_setbits(struct hrmstat *hrm, caddr_t addr, uint_t bits) +{ + uint_t po, bo, spb; + uint_t nbits; + + po = ((uintptr_t)addr & HRM_BASEOFFSET) >> MMU_PAGESHIFT; /* pg off */ + bo = po / (NBBY / 2); /* which byte in bit array */ + spb = (3 - (po & 3)) * 2; /* shift position within byte */ + nbits = bits << spb; /* bit mask */ + hrm->hrm_bits[bo] |= nbits; +} + +/* + * Return collected statistics about an address space. 
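hrm_setbits() above packs the P_REF/P_MOD pair for a page into hrm_bits[] at four pages per byte, with the lowest-numbered page in the most significant pair; hat_getstat() below fans such a byte back out into per-page entries (ORing into the caller's array). A standalone sketch of both directions, assuming MMU_PAGESHIFT is 12 (it is platform dependent):

#include <stdio.h>
#include <stdint.h>

#define NBBY           8
#define MMU_PAGESHIFT  12                          /* demo assumption */
#define HRM_BASEOFFSET (((1UL << MMU_PAGESHIFT) * 64) - 1)

#define P_MOD 0x1
#define P_REF 0x2

int
main(void)
{
	unsigned char hrm_bits[16] = { 0 };
	uintptr_t addr = 0x12345000UL;

	/* pack: which byte and which bit pair within it (hrm_setbits) */
	unsigned int po  = (addr & HRM_BASEOFFSET) >> MMU_PAGESHIFT;
	unsigned int bo  = po / (NBBY / 2);            /* byte index */
	unsigned int spb = (3 - (po & 3)) * 2;         /* shift within byte */

	hrm_bits[bo] |= (P_REF | P_MOD) << spb;
	printf("page %u -> byte %u, shift %u, byte now %#x\n",
	    po, bo, spb, hrm_bits[bo]);

	/* unpack: fan one byte out into four entries (hat_getstat) */
	unsigned char dp[4];
	int bits = hrm_bits[bo];

	dp[0] = (bits >> 6) & 3;
	dp[1] = (bits >> 4) & 3;
	dp[2] = (bits >> 2) & 3;
	dp[3] = (bits >> 0) & 3;
	printf("unpacked: %d %d %d %d\n", dp[0], dp[1], dp[2], dp[3]);
	return (0);
}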
+ * If clearflag is set, atomically read and zero the bits. + * + * Fill in the data array supplied with the referenced and + * modified bits collected for address range [addr ... addr + len] + * in address space, as, uniquely identified by id. + * The destination is a byte array. We fill in three bits per byte: + * referenced, modified, and hwmapped bits. + * Kernel only interface, can't fault on destination data array. + * + */ +void +hat_getstat(struct as *as, caddr_t addr, size_t len, uint_t id, + caddr_t datap, int clearflag) +{ + size_t np; /* number of pages */ + caddr_t a; + char *dp; + + np = btop(len); + bzero(datap, np); + + hat_sync(as->a_hat, addr, len, clearflag); + + /* allocate more statistics blocks if needed */ + hrm_getblk(0); + + mutex_enter(&hat_statlock); + if (hrm_hashtab == NULL) { + /* can happen when victim process exits */ + mutex_exit(&hat_statlock); + return; + } + dp = datap; + a = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); + while (a < addr + len) { + struct hrmstat *hrm; + size_t n; /* number of pages, temp */ + int h; /* hash index */ + uint_t po; + + h = hrm_hash(as, a); + n = (HRM_PAGES - + (((uintptr_t)a & HRM_PAGEMASK) >> MMU_PAGESHIFT)); + if (n > np) + n = np; + po = ((uintptr_t)a & HRM_BASEOFFSET) >> MMU_PAGESHIFT; + + for (hrm = hrm_hashtab[h]; hrm; hrm = hrm->hrm_hnext) { + if (hrm->hrm_as == as && + hrm->hrm_base == ((uintptr_t)a & HRM_BASEMASK) && + id == hrm->hrm_id) { + int i, nr; + uint_t bo, spb; + + /* + * Extract leading unaligned bits. + */ + i = 0; + while (i < n && (po & 3)) { + bo = po / (NBBY / 2); + spb = (3 - (po & 3)) * 2; + *dp++ |= (hrm->hrm_bits[bo] >> spb) & 3; + if (clearflag) + hrm->hrm_bits[bo] &= ~(3<<spb); + po++; + i++; + } + /* + * Extract aligned bits. + */ + nr = n/4*4; + bo = po / (NBBY / 2); + while (i < nr) { + int bits = hrm->hrm_bits[bo]; + *dp++ |= (bits >> 6) & 3; + *dp++ |= (bits >> 4) & 3; + *dp++ |= (bits >> 2) & 3; + *dp++ |= (bits >> 0) & 3; + if (clearflag) + hrm->hrm_bits[bo] = 0; + bo++; + po += 4; + i += 4; + } + /* + * Extract trailing unaligned bits. + */ + while (i < n) { + bo = po / (NBBY / 2); + spb = (3 - (po & 3)) * 2; + *dp++ |= (hrm->hrm_bits[bo] >> spb) & 3; + if (clearflag) + hrm->hrm_bits[bo] &= ~(3<<spb); + po++; + i++; + } + + break; + } + } + if (hrm == NULL) + dp += n; + np -= n; + a += n * MMU_PAGESIZE; + } + mutex_exit(&hat_statlock); +} diff --git a/usr/src/uts/common/vm/kpm.h b/usr/src/uts/common/vm/kpm.h new file mode 100644 index 0000000000..edc213b8f8 --- /dev/null +++ b/usr/src/uts/common/vm/kpm.h @@ -0,0 +1,57 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _VM_KPM_H +#define _VM_KPM_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _LP64 +#define SEGKPM_SUPPORT +#endif + +#ifndef _ASM + +/* + * Machine independent per instance kpm mapping structure + */ +struct kpme { + struct kpme *kpe_next; + struct kpme *kpe_prev; + struct page *kpe_page; /* back pointer to (start) page */ +}; + +#endif /* _ASM */ + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_KPM_H */ diff --git a/usr/src/uts/common/vm/page.h b/usr/src/uts/common/vm/page.h new file mode 100644 index 0000000000..9cd32e0ae3 --- /dev/null +++ b/usr/src/uts/common/vm/page.h @@ -0,0 +1,1006 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#ifndef _VM_PAGE_H +#define _VM_PAGE_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <vm/seg.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(_KERNEL) || defined(_KMEMUSER) + +/* + * Shared/Exclusive lock. + */ + +/* + * Types of page locking supported by page_lock & friends. + */ +typedef enum { + SE_SHARED, + SE_EXCL /* exclusive lock (value == -1) */ +} se_t; + +/* + * For requesting that page_lock reclaim the page from the free list. + */ +typedef enum { + P_RECLAIM, /* reclaim page from free list */ + P_NO_RECLAIM /* DON`T reclaim the page */ +} reclaim_t; + +/* + * Callers of page_try_reclaim_lock and page_lock_es can use this flag + * to get SE_EXCL access before reader/writers are given access. + */ +#define SE_EXCL_WANTED 0x02 + +#endif /* _KERNEL | _KMEMUSER */ + +typedef int selock_t; + +/* + * Define VM_STATS to turn on all sorts of statistic gathering about + * the VM layer. By default, it is only turned on when DEBUG is + * also defined. + */ +#ifdef DEBUG +#define VM_STATS +#endif /* DEBUG */ + +#ifdef VM_STATS +#define VM_STAT_ADD(stat) (stat)++ +#define VM_STAT_COND_ADD(cond, stat) ((void) (!(cond) || (stat)++)) +#else +#define VM_STAT_ADD(stat) +#define VM_STAT_COND_ADD(cond, stat) +#endif /* VM_STATS */ + +#ifdef _KERNEL + +/* + * Macros to acquire and release the page logical lock. 
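The VM_STAT_COND_ADD() macro above relies on short-circuit evaluation: the counter is bumped only when the condition is true, and both macros compile away entirely when VM_STATS is not defined. A user-space illustration (the counter names are made up for the demo):

#include <stdio.h>

#define VM_STAT_ADD(stat)            (stat)++
#define VM_STAT_COND_ADD(cond, stat) ((void)(!(cond) || (stat)++))

int
main(void)
{
	int hits = 0, misses = 0;
	int found = 1;

	VM_STAT_ADD(hits);                    /* unconditional */
	VM_STAT_COND_ADD(!found, misses);     /* skipped: found is true */
	VM_STAT_COND_ADD(found == 0, misses); /* also skipped */

	printf("hits=%d misses=%d\n", hits, misses);   /* hits=1 misses=0 */
	return (0);
}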
+ */ +#define page_struct_lock(pp) mutex_enter(&page_llock) +#define page_struct_unlock(pp) mutex_exit(&page_llock) + +#endif /* _KERNEL */ + +#include <sys/t_lock.h> + +struct as; + +/* + * Each physical page has a page structure, which is used to maintain + * these pages as a cache. A page can be found via a hashed lookup + * based on the [vp, offset]. If a page has an [vp, offset] identity, + * then it is entered on a doubly linked circular list off the + * vnode using the vpnext/vpprev pointers. If the p_free bit + * is on, then the page is also on a doubly linked circular free + * list using next/prev pointers. If the "p_selock" and "p_iolock" + * are held, then the page is currently being read in (exclusive p_selock) + * or written back (shared p_selock). In this case, the next/prev pointers + * are used to link the pages together for a consecutive i/o request. If + * the page is being brought in from its backing store, then other processes + * will wait for the i/o to complete before attaching to the page since it + * will have an "exclusive" lock. + * + * Each page structure has the locks described below along with + * the fields they protect: + * + * p_selock This is a per-page shared/exclusive lock that is + * used to implement the logical shared/exclusive + * lock for each page. The "shared" lock is normally + * used in most cases while the "exclusive" lock is + * required to destroy or retain exclusive access to + * a page (e.g., while reading in pages). The appropriate + * lock is always held whenever there is any reference + * to a page structure (e.g., during i/o). + * (Note that with the addition of the "writer-lock-wanted" + * semantics (via SE_EWANTED), threads must not acquire + * multiple reader locks or else a deadly embrace will + * occur in the following situation: thread 1 obtains a + * reader lock; next thread 2 fails to get a writer lock + * but specified SE_EWANTED so it will wait by either + * blocking (when using page_lock_es) or spinning while + * retrying (when using page_try_reclaim_lock) until the + * reader lock is released; then thread 1 attempts to + * get another reader lock but is denied due to + * SE_EWANTED being set, and now both threads are in a + * deadly embrace.) + * + * p_hash + * p_vnode + * p_offset + * + * p_free + * p_age + * + * p_iolock This is a binary semaphore lock that provides + * exclusive access to the i/o list links in each + * page structure. It is always held while the page + * is on an i/o list (i.e., involved in i/o). That is, + * even though a page may be only `shared' locked + * while it is doing a write, the following fields may + * change anyway. Normally, the page must be + * `exclusively' locked to change anything in it. + * + * p_next + * p_prev + * + * The following fields are protected by the global page_llock: + * + * p_lckcnt + * p_cowcnt + * + * The following lists are protected by the global page_freelock: + * + * page_cachelist + * page_freelist + * + * The following, for our purposes, are protected by + * the global freemem_lock: + * + * freemem + * freemem_wait + * freemem_cv + * + * The following fields are protected by hat layer lock(s). When a page + * structure is not mapped and is not associated with a vnode (after a call + * to page_hashout() for example) the p_nrm field may be modified with out + * holding the hat layer lock: + * + * p_nrm + * p_mapping + * p_share + * + * The following field is file system dependent. 
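Returning to the p_selock discussion above: the reason a thread must never hold two reader locks on the same page is that a waiting writer's SE_EWANTED bit causes further shared requests to be refused. A minimal model of that grant rule (the constant matches the SE_EWANTED definition later in this header; shared_grant() is a name invented for the sketch, mirroring the shared-lock branch of page_lock_es()):

#include <stdio.h>

#define SE_EWANTED 0x40000000

static int
shared_grant(int selock)
{
	/* grant only if not write-locked and no writer is waiting */
	return (selock >= 0 && !(selock & SE_EWANTED));
}

int
main(void)
{
	int selock = 1;                 /* thread 1 already holds one reader */

	selock |= SE_EWANTED;           /* thread 2 wants SE_EXCL, must wait */

	/* thread 1 now asks for a second reader lock: denied -> deadlock */
	printf("second reader granted: %s\n",
	    shared_grant(selock) ? "yes" : "no");
	return (0);
}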
How it is used and + * the locking strategies applied are up to the individual file system + * implementation. + * + * p_fsdata + * + * The page structure is used to represent and control the system's + * physical pages. There is one instance of the structure for each + * page that is not permenately allocated. For example, the pages that + * hold the page structures are permanently held by the kernel + * and hence do not need page structures to track them. The array + * of page structures is allocated early on in the kernel's life and + * is based on the amount of available physical memory. + * + * Each page structure may simultaneously appear on several linked lists. + * The lists are: hash list, free or in i/o list, and a vnode's page list. + * Each type of list is protected by a different group of mutexes as described + * below: + * + * The hash list is used to quickly find a page when the page's vnode and + * offset within the vnode are known. Each page that is hashed is + * connected via the `p_hash' field. The anchor for each hash is in the + * array `page_hash'. An array of mutexes, `ph_mutex', protects the + * lists anchored by page_hash[]. To either search or modify a given hash + * list, the appropriate mutex in the ph_mutex array must be held. + * + * The free list contains pages that are `free to be given away'. For + * efficiency reasons, pages on this list are placed in two catagories: + * pages that are still associated with a vnode, and pages that are not + * associated with a vnode. Free pages always have their `p_free' bit set, + * free pages that are still associated with a vnode also have their + * `p_age' bit set. Pages on the free list are connected via their + * `p_next' and `p_prev' fields. When a page is involved in some sort + * of i/o, it is not free and these fields may be used to link associated + * pages together. At the moment, the free list is protected by a + * single mutex `page_freelock'. The list of free pages still associated + * with a vnode is anchored by `page_cachelist' while other free pages + * are anchored in architecture dependent ways (to handle page coloring etc.). + * + * Pages associated with a given vnode appear on a list anchored in the + * vnode by the `v_pages' field. They are linked together with + * `p_vpnext' and `p_vpprev'. The field `p_offset' contains a page's + * offset within the vnode. The pages on this list are not kept in + * offset order. These lists, in a manner similar to the hash lists, + * are protected by an array of mutexes called `vph_hash'. Before + * searching or modifying this chain the appropriate mutex in the + * vph_hash[] array must be held. + * + * Again, each of the lists that a page can appear on is protected by a + * mutex. Before reading or writing any of the fields comprising the + * list, the appropriate lock must be held. These list locks should only + * be held for very short intervals. + * + * In addition to the list locks, each page structure contains a + * shared/exclusive lock that protects various fields within it. + * To modify one of these fields, the `p_selock' must be exclusively held. + * To read a field with a degree of certainty, the lock must be at least + * held shared. + * + * Removing a page structure from one of the lists requires holding + * the appropriate list lock and the page's p_selock. A page may be + * prevented from changing identity, being freed, or otherwise modified + * by acquiring p_selock shared. + * + * To avoid deadlocks, a strict locking protocol must be followed. 
Basically + * there are two cases: In the first case, the page structure in question + * is known ahead of time (e.g., when the page is to be added or removed + * from a list). In the second case, the page structure is not known and + * must be found by searching one of the lists. + * + * When adding or removing a known page to one of the lists, first the + * page must be exclusively locked (since at least one of its fields + * will be modified), second the lock protecting the list must be acquired, + * third the page inserted or deleted, and finally the list lock dropped. + * + * The more interesting case occures when the particular page structure + * is not known ahead of time. For example, when a call is made to + * page_lookup(), it is not known if a page with the desired (vnode and + * offset pair) identity exists. So the appropriate mutex in ph_mutex is + * acquired, the hash list searched, and if the desired page is found + * an attempt is made to lock it. The attempt to acquire p_selock must + * not block while the hash list lock is held. A deadlock could occure + * if some other process was trying to remove the page from the list. + * The removing process (following the above protocol) would have exclusively + * locked the page, and be spinning waiting to acquire the lock protecting + * the hash list. Since the searching process holds the hash list lock + * and is waiting to acquire the page lock, a deadlock occurs. + * + * The proper scheme to follow is: first, lock the appropriate list, + * search the list, and if the desired page is found either use + * page_trylock() (which will not block) or pass the address of the + * list lock to page_lock(). If page_lock() can not acquire the page's + * lock, it will drop the list lock before going to sleep. page_lock() + * returns a value to indicate if the list lock was dropped allowing the + * calling program to react appropriately (i.e., retry the operation). + * + * If the list lock was dropped before the attempt at locking the page + * was made, checks would have to be made to ensure that the page had + * not changed identity before its lock was obtained. This is because + * the interval between dropping the list lock and acquiring the page + * lock is indeterminate. + * + * In addition, when both a hash list lock (ph_mutex[]) and a vnode list + * lock (vph_mutex[]) are needed, the hash list lock must be acquired first. + * The routine page_hashin() is a good example of this sequence. + * This sequence is ASSERTed by checking that the vph_mutex[] is not held + * just before each acquisition of one of the mutexs in ph_mutex[]. + * + * So, as a quick summary: + * + * pse_mutex[]'s protect the p_selock and p_cv fields. + * + * p_selock protects the p_free, p_age, p_vnode, p_offset and p_hash, + * + * ph_mutex[]'s protect the page_hash[] array and its chains. + * + * vph_mutex[]'s protect the v_pages field and the vp page chains. + * + * First lock the page, then the hash chain, then the vnode chain. When + * this is not possible `trylocks' must be used. Sleeping while holding + * any of these mutexes (p_selock is not a mutex) is not allowed. 
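The "lock the list, then only try-lock the page" rule above can be modelled with ordinary mutexes. The sketch below uses pthreads as stand-ins for ph_mutex[] and p_selock; it illustrates the protocol, not the kernel implementation:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t item_lock = PTHREAD_MUTEX_INITIALIZER;

static void
lookup(void)
{
	pthread_mutex_lock(&list_lock);
	/* ... search the hash chain here ... */
	if (pthread_mutex_trylock(&item_lock) != 0) {
		/*
		 * Like page_lock() being handed the hash mutex: drop the
		 * list lock first, then it is safe to block on the item.
		 */
		pthread_mutex_unlock(&list_lock);
		pthread_mutex_lock(&item_lock);
		/* the item may have changed identity; caller must re-check */
	} else {
		pthread_mutex_unlock(&list_lock);
	}
	/* ... use the item ... */
	pthread_mutex_unlock(&item_lock);
}

int
main(void)
{
	lookup();
	printf("lookup protocol exercised\n");
	return (0);
}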
+ * + * + * field reading writing ordering + * ====================================================================== + * p_vnode p_selock(E,S) p_selock(E) + * p_offset + * p_free + * p_age + * ===================================================================== + * p_hash p_selock(E,S) p_selock(E) && p_selock, ph_mutex + * ph_mutex[] + * ===================================================================== + * p_vpnext p_selock(E,S) p_selock(E) && p_selock, vph_mutex + * p_vpprev vph_mutex[] + * ===================================================================== + * When the p_free bit is set: + * + * p_next p_selock(E,S) p_selock(E) && p_selock, + * p_prev page_freelock page_freelock + * + * When the p_free bit is not set: + * + * p_next p_selock(E,S) p_selock(E) && p_selock, p_iolock + * p_prev p_iolock + * ===================================================================== + * p_selock pse_mutex[] pse_mutex[] can`t acquire any + * p_cv other mutexes or + * sleep while holding + * this lock. + * ===================================================================== + * p_lckcnt p_selock(E,S) p_selock(E) && + * p_cowcnt page_llock + * ===================================================================== + * p_nrm hat layer lock hat layer lock + * p_mapping + * p_pagenum + * ===================================================================== + * + * where: + * E----> exclusive version of p_selock. + * S----> shared version of p_selock. + * + * + * Global data structures and variable: + * + * field reading writing ordering + * ===================================================================== + * page_hash[] ph_mutex[] ph_mutex[] can hold this lock + * before acquiring + * a vph_mutex or + * pse_mutex. + * ===================================================================== + * vp->v_pages vph_mutex[] vph_mutex[] can only acquire + * a pse_mutex while + * holding this lock. + * ===================================================================== + * page_cachelist page_freelock page_freelock can't acquire any + * page_freelist page_freelock page_freelock + * ===================================================================== + * freemem freemem_lock freemem_lock can't acquire any + * freemem_wait other mutexes while + * freemem_cv holding this mutex. + * ===================================================================== + * + * Page relocation, PG_NORELOC and P_NORELOC. + * + * Pages may be relocated using the page_relocate() interface. Relocation + * involves moving the contents and identity of a page to another, free page. + * To relocate a page, the SE_EXCL lock must be obtained. The way to prevent + * a page from being relocated is to hold the SE_SHARED lock (the SE_EXCL + * lock must not be held indefinitely). If the page is going to be held + * SE_SHARED indefinitely, then the PG_NORELOC hint should be passed + * to page_create_va so that pages that are prevented from being relocated + * can be managed differently by the platform specific layer. + * + * Pages locked in memory using page_pp_lock (p_lckcnt/p_cowcnt != 0) + * are guaranteed to be held in memory, but can still be relocated + * providing the SE_EXCL lock can be obtained. + * + * The P_NORELOC bit in the page_t.p_state field is provided for use by + * the platform specific code in managing pages when the PG_NORELOC + * hint is used. + * + * Memory delete and page locking. + * + * The set of all usable pages is managed using the global page list as + * implemented by the memseg structure defined below. 
When memory is added + * or deleted this list changes. Additions to this list guarantee that the + * list is never corrupt. In order to avoid the necessity of an additional + * lock to protect against failed accesses to the memseg being deleted and, + * more importantly, the page_ts, the memseg structure is never freed and the + * page_t virtual address space is remapped to a page (or pages) of + * zeros. If a page_t is manipulated while it is p_selock'd, or if it is + * locked indirectly via a hash or freelist lock, it is not possible for + * memory delete to collect the page and so that part of the page list is + * prevented from being deleted. If the page is referenced outside of one + * of these locks, it is possible for the page_t being referenced to be + * deleted. Examples of this are page_t pointers returned by + * page_numtopp_nolock, page_first and page_next. Providing the page_t + * is re-checked after taking the p_selock (for p_vnode != NULL), the + * remapping to the zero pages will be detected. + * + * + * Page size (p_szc field) and page locking. + * + * p_szc field of free pages is changed by free list manager under freelist + * locks and is of no concern to the rest of VM subsystem. + * + * p_szc changes of allocated anonymous (swapfs) can only be done only after + * exclusively locking all constituent pages and calling hat_pageunload() on + * each of them. To prevent p_szc changes of non free anonymous (swapfs) large + * pages it's enough to either lock SHARED any of constituent pages or prevent + * hat_pageunload() by holding hat level lock that protects mapping lists (this + * method is for hat code only) + * + * To increase (promote) p_szc of allocated non anonymous file system pages + * one has to first lock exclusively all involved constituent pages and call + * hat_pageunload() on each of them. To prevent p_szc promote it's enough to + * either lock SHARED any of constituent pages that will be needed to make a + * large page or prevent hat_pageunload() by holding hat level lock that + * protects mapping lists (this method is for hat code only). + * + * To decrease (demote) p_szc of an allocated non anonymous file system large + * page one can either use the same method as used for changeing p_szc of + * anonymous large pages or if it's not possible to lock all constituent pages + * exclusively a different method can be used. In the second method one only + * has to exclusively lock one of constituent pages but then one has to + * acquire further locks by calling page_szc_lock() and + * hat_page_demote(). hat_page_demote() acquires hat level locks and then + * demotes the page. This mechanism relies on the fact that any code that + * needs to prevent p_szc of a file system large page from changeing either + * locks all constituent large pages at least SHARED or locks some pages at + * least SHARED and calls page_szc_lock() or uses hat level page locks. + * Demotion using this method is implemented by page_demote_vp_pages(). + * Please see comments in front of page_demote_vp_pages(), hat_page_demote() + * and page_szc_lock() for more details. + * + * Lock order: p_selock, page_szc_lock, ph_mutex/vph_mutex/freelist, + * hat level locks. 
+ */ + +typedef struct page { + u_offset_t p_offset; /* offset into vnode for this page */ + struct vnode *p_vnode; /* vnode that this page is named by */ + selock_t p_selock; /* shared/exclusive lock on the page */ +#if defined(_LP64) + int p_selockpad; /* pad for growing selock */ +#endif + struct page *p_hash; /* hash by [vnode, offset] */ + struct page *p_vpnext; /* next page in vnode list */ + struct page *p_vpprev; /* prev page in vnode list */ + struct page *p_next; /* next page in free/intrans lists */ + struct page *p_prev; /* prev page in free/intrans lists */ + ushort_t p_lckcnt; /* number of locks on page data */ + ushort_t p_cowcnt; /* number of copy on write lock */ + kcondvar_t p_cv; /* page struct's condition var */ + kcondvar_t p_io_cv; /* for iolock */ + uchar_t p_iolock_state; /* replaces p_iolock */ + volatile uchar_t p_szc; /* page size code */ + uchar_t p_fsdata; /* file system dependent byte */ + uchar_t p_state; /* p_free, p_noreloc */ + uchar_t p_nrm; /* non-cache, ref, mod readonly bits */ +#if defined(__sparc) + uchar_t p_vcolor; /* virtual color */ +#else + uchar_t p_embed; /* x86 - changes p_mapping & p_index */ +#endif + uchar_t p_index; /* MPSS mapping info. Not used on x86 */ + uchar_t p_toxic; /* page has an unrecoverable error */ + void *p_mapping; /* hat specific translation info */ + pfn_t p_pagenum; /* physical page number */ + + uint_t p_share; /* number of translations */ +#if defined(_LP64) + uint_t p_sharepad; /* pad for growing p_share */ +#endif + uint_t p_msresv_1; /* reserved for future use */ +#if defined(__sparc) + uint_t p_kpmref; /* number of kpm mapping sharers */ + struct kpme *p_kpmelist; /* kpm specific mapping info */ +#else + /* index of entry in p_map when p_embed is set */ + uint_t p_mlentry; +#endif + uint64_t p_msresv_2; /* page allocation debugging */ +} page_t; + + +typedef page_t devpage_t; +#define devpage page + + +/* + * Page hash table is a power-of-two in size, externally chained + * through the hash field. PAGE_HASHAVELEN is the average length + * desired for this chain, from which the size of the page_hash + * table is derived at boot time and stored in the kernel variable + * page_hashsz. In the hash function it is given by PAGE_HASHSZ. + * + * PAGE_HASH_FUNC returns an index into the page_hash[] array. This + * index is also used to derive the mutex that protects the chain. + * + * In constructing the hash function, first we dispose of unimportant bits + * (page offset from "off" and the low 3 bits of "vp" which are zero for + * struct alignment). Then shift and sum the remaining bits a couple times + * in order to get as many source bits from the two source values into the + * resulting hashed value. Note that this will perform quickly, since the + * shifting/summing are fast register to register operations with no additional + * memory references). + */ +#if NCPU < 4 +#define PH_TABLE_SIZE 16 +#define VP_SHIFT 7 +#else +#define PH_TABLE_SIZE 128 +#define VP_SHIFT 9 +#endif + +/* + * The amount to use for the successive shifts in the hash function below. + * The actual value is LOG2(PH_TABLE_SIZE), so that as many bits as + * possible will filter thru PAGE_HASH_FUNC() and PAGE_HASH_MUTEX(). 
+ */ +#define PH_SHIFT_SIZE (7) + +#define PAGE_HASHSZ page_hashsz +#define PAGE_HASHAVELEN 4 +#define PAGE_HASH_FUNC(vp, off) \ + ((((uintptr_t)(off) >> PAGESHIFT) + \ + ((uintptr_t)(off) >> (PAGESHIFT + PH_SHIFT_SIZE)) + \ + ((uintptr_t)(vp) >> 3) + \ + ((uintptr_t)(vp) >> (3 + PH_SHIFT_SIZE)) + \ + ((uintptr_t)(vp) >> (3 + 2 * PH_SHIFT_SIZE))) & \ + (PAGE_HASHSZ - 1)) +#ifdef _KERNEL + +/* + * The page hash value is re-hashed to an index for the ph_mutex array. + * + * For 64 bit kernels, the mutex array is padded out to prevent false + * sharing of cache sub-blocks (64 bytes) of adjacent mutexes. + * + * For 32 bit kernels, we don't want to waste kernel address space with + * padding, so instead we rely on the hash function to introduce skew of + * adjacent vnode/offset indexes (the left shift part of the hash function). + * Since sizeof (kmutex_t) is 8, we shift an additional 3 to skew to a different + * 64 byte sub-block. + */ +typedef struct pad_mutex { + kmutex_t pad_mutex; +#ifdef _LP64 + char pad_pad[64 - sizeof (kmutex_t)]; +#endif +} pad_mutex_t; +extern pad_mutex_t ph_mutex[]; + +#define PAGE_HASH_MUTEX(x) \ + &(ph_mutex[((x) + ((x) >> VP_SHIFT) + ((x) << 3)) & \ + (PH_TABLE_SIZE - 1)].pad_mutex) + +/* + * Flags used while creating pages. + */ +#define PG_EXCL 0x0001 +#define PG_WAIT 0x0002 +#define PG_PHYSCONTIG 0x0004 /* NOT SUPPORTED */ +#define PG_MATCH_COLOR 0x0008 /* SUPPORTED by free list routines */ +#define PG_NORELOC 0x0010 /* Non-relocatable alloc hint. */ + /* Page must be PP_ISNORELOC */ +#define PG_PANIC 0x0020 /* system will panic if alloc fails */ +#define PG_PUSHPAGE 0x0040 /* alloc may use reserve */ + +/* + * When p_selock has the SE_EWANTED bit set, threads waiting for SE_EXCL + * access are given priority over all other waiting threads. + */ +#define SE_EWANTED 0x40000000 +#define PAGE_LOCKED(pp) (((pp)->p_selock & ~SE_EWANTED) != 0) +#define PAGE_SHARED(pp) (((pp)->p_selock & ~SE_EWANTED) > 0) +#define PAGE_EXCL(pp) ((pp)->p_selock < 0) +#define PAGE_LOCKED_SE(pp, se) \ + ((se) == SE_EXCL ? PAGE_EXCL(pp) : PAGE_SHARED(pp)) + +extern long page_hashsz; +extern page_t **page_hash; + +extern kmutex_t page_llock; /* page logical lock mutex */ +extern kmutex_t freemem_lock; /* freemem lock */ + +extern pgcnt_t total_pages; /* total pages in the system */ + +/* + * Variables controlling locking of physical memory. + */ +extern pgcnt_t pages_pp_maximum; /* tuning: lock + claim <= max */ +extern void init_pages_pp_maximum(void); + +struct lgrp; + +/* page_list_{add,sub} flags */ + +/* which list */ +#define PG_FREE_LIST 0x0001 +#define PG_CACHE_LIST 0x0002 + +/* where on list */ +#define PG_LIST_TAIL 0x0010 +#define PG_LIST_HEAD 0x0020 + +/* called from */ +#define PG_LIST_ISINIT 0x1000 +#define PG_LIST_ISCAGE 0x2000 + +/* + * Flags for setting the p_toxic flag when a page has errors + * These flags may be OR'ed into the p_toxic page flag to + * indicate that error(s) have occurred on a page, + * (see page_settoxic()). If both PAGE_IS_TOXIC and + * PAGE_IS_FAILING are set, PAGE_IS_FAILING takes precedence. + * + * When an error happens on a page, the trap handler sets + * PAGE_IS_FAULTY on the page to indicate that an error has been + * seen on the page. The error could be really a memory error or + * something else (like a datapath error). When it is determined + * that it is a memory error, the page is marked as PAGE_IS_TOXIC + * or PAGE_IS_FAILING depending on the type of error and then + * retired. 
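Going back to the page hash above: PAGE_HASH_FUNC() folds the vnode pointer and offset into a chain index, and PAGE_HASH_MUTEX() re-hashes that index into the ph_mutex[] array. A user-space rehearsal with the macro bodies copied from this header; page_hashsz is sized at boot, so the 0x10000 bucket count and the NCPU >= 4 table values are demo assumptions, and PAGE_HASH_MUTEX_IDX() computes only the index rather than the mutex address:

#include <stdio.h>
#include <stdint.h>

#define PAGESHIFT      12            /* demo assumption */
#define PH_SHIFT_SIZE  7
#define PAGE_HASHSZ    0x10000       /* demo stand-in for page_hashsz */
#define PH_TABLE_SIZE  128
#define VP_SHIFT       9

#define PAGE_HASH_FUNC(vp, off) \
	((((uintptr_t)(off) >> PAGESHIFT) + \
	((uintptr_t)(off) >> (PAGESHIFT + PH_SHIFT_SIZE)) + \
	((uintptr_t)(vp) >> 3) + \
	((uintptr_t)(vp) >> (3 + PH_SHIFT_SIZE)) + \
	((uintptr_t)(vp) >> (3 + 2 * PH_SHIFT_SIZE))) & \
	(PAGE_HASHSZ - 1))

#define PAGE_HASH_MUTEX_IDX(x) \
	(((x) + ((x) >> VP_SHIFT) + ((x) << 3)) & (PH_TABLE_SIZE - 1))

int
main(void)
{
	uintptr_t vp = 0x12345678UL & ~7UL;   /* 8-byte aligned "vnode" */
	unsigned long off = 3UL << PAGESHIFT; /* third page of the "file" */
	uintptr_t h = PAGE_HASH_FUNC(vp, off);

	printf("hash chain %lu, ph_mutex[%lu]\n",
	    (unsigned long)h, (unsigned long)PAGE_HASH_MUTEX_IDX(h));
	return (0);
}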
+ * + * We use the page's 'toxic' flag to determine whether the page + * has just got a single error - PAGE_IS_TOXIC - or is being + * retired due to multiple soft errors - PAGE_IS_FAILING. In + * page_free(), a page that has been marked PAGE_IS_FAILING will + * not be cleaned, it will always be retired. A page marked + * PAGE_IS_TOXIC is cleaned and is retired only if this attempt at + * cleaning fails. + * + * When a page has been successfully retired, we set PAGE_IS_RETIRED. + */ +#define PAGE_IS_OK 0x0 +#define PAGE_IS_TOXIC 0x1 +#define PAGE_IS_FAILING 0x2 +#define PAGE_IS_RETIRED 0x4 +#define PAGE_IS_FAULTY 0x8 + +/* + * Page frame operations. + */ +page_t *page_lookup(struct vnode *, u_offset_t, se_t); +page_t *page_lookup_create(struct vnode *, u_offset_t, se_t, page_t *, + spgcnt_t *, int); +page_t *page_lookup_nowait(struct vnode *, u_offset_t, se_t); +page_t *page_find(struct vnode *, u_offset_t); +page_t *page_exists(struct vnode *, u_offset_t); +int page_exists_physcontig(vnode_t *, u_offset_t, uint_t, page_t *[]); +int page_exists_forreal(struct vnode *, u_offset_t, uint_t *); +void page_needfree(spgcnt_t); +page_t *page_create(struct vnode *, u_offset_t, size_t, uint_t); +int page_alloc_pages(struct seg *, caddr_t, page_t **, page_t **, + uint_t, int); +page_t *page_create_va_large(vnode_t *vp, u_offset_t off, size_t bytes, + uint_t flags, struct seg *seg, caddr_t vaddr, void *arg); +page_t *page_create_va(struct vnode *, u_offset_t, size_t, uint_t, + struct seg *, caddr_t); +int page_create_wait(size_t npages, uint_t flags); +void page_create_putback(ssize_t npages); +void page_free(page_t *, int); +void page_free_at_startup(page_t *); +void page_free_pages(page_t *); +void free_vp_pages(struct vnode *, u_offset_t, size_t); +int page_reclaim(page_t *, kmutex_t *); +void page_destroy(page_t *, int); +void page_destroy_pages(page_t *); +void page_destroy_free(page_t *); +void page_rename(page_t *, struct vnode *, u_offset_t); +int page_hashin(page_t *, struct vnode *, u_offset_t, kmutex_t *); +void page_hashout(page_t *, kmutex_t *); +int page_num_hashin(pfn_t, struct vnode *, u_offset_t); +void page_add(page_t **, page_t *); +void page_add_common(page_t **, page_t *); +void page_sub(page_t **, page_t *); +void page_sub_common(page_t **, page_t *); +page_t *page_get_freelist(struct vnode *, u_offset_t, struct seg *, + caddr_t, size_t, uint_t, struct lgrp *); + +page_t *page_get_cachelist(struct vnode *, u_offset_t, struct seg *, + caddr_t, uint_t, struct lgrp *); +void page_list_add(page_t *, int); +void page_boot_demote(page_t *); +void page_promote_size(page_t *, uint_t); +void page_list_add_pages(page_t *, int); +void page_list_sub(page_t *, int); +void page_list_break(page_t **, page_t **, size_t); +void page_list_concat(page_t **, page_t **); +void page_vpadd(page_t **, page_t *); +void page_vpsub(page_t **, page_t *); +int page_lock(page_t *, se_t, kmutex_t *, reclaim_t); +int page_lock_es(page_t *, se_t, kmutex_t *, reclaim_t, int); +void page_lock_clr_exclwanted(page_t *); +int page_trylock(page_t *, se_t); +int page_try_reclaim_lock(page_t *, se_t, int); +int page_tryupgrade(page_t *); +void page_downgrade(page_t *); +void page_unlock(page_t *); +void page_lock_delete(page_t *); +int page_pp_lock(page_t *, int, int); +void page_pp_unlock(page_t *, int, int); +int page_resv(pgcnt_t, uint_t); +void page_unresv(pgcnt_t); +void page_pp_useclaim(page_t *, page_t *, uint_t); +int page_addclaim(page_t *); +int page_subclaim(page_t *); +int page_addclaim_pages(page_t **); 
+int page_subclaim_pages(page_t **); +pfn_t page_pptonum(page_t *); +page_t *page_numtopp(pfn_t, se_t); +page_t *page_numtopp_noreclaim(pfn_t, se_t); +page_t *page_numtopp_nolock(pfn_t); +page_t *page_numtopp_nowait(pfn_t, se_t); +page_t *page_first(); +page_t *page_next(page_t *); +page_t *page_nextn_raw(page_t *, ulong_t); /* pp += n */ +#define page_next_raw(PP) page_nextn_raw((PP), 1) +page_t *page_list_next(page_t *); +page_t *page_nextn(page_t *, ulong_t); +page_t *page_next_scan_init(void **); +page_t *page_next_scan_large(page_t *, ulong_t *, void **); +void prefetch_page_r(void *); +void ppcopy(page_t *, page_t *); +void page_relocate_hash(page_t *, page_t *); +void pagezero(page_t *, uint_t, uint_t); +void pagescrub(page_t *, uint_t, uint_t); +void page_io_lock(page_t *); +void page_io_unlock(page_t *); +int page_io_trylock(page_t *); +int page_iolock_assert(page_t *); +void page_iolock_init(page_t *); +pgcnt_t page_busy(int); +void page_lock_init(void); +ulong_t page_share_cnt(page_t *); +int page_isshared(page_t *); +int page_isfree(page_t *); +int page_isref(page_t *); +int page_ismod(page_t *); +int page_release(page_t *, int); +int page_retire(page_t *, uchar_t); +int page_istoxic(page_t *); +int page_isfailing(page_t *); +int page_isretired(page_t *); +int page_deteriorating(page_t *); +void page_settoxic(page_t *, uchar_t); +void page_clrtoxic(page_t *); +void page_clrtoxic_flag(page_t *, uchar_t); +int page_isfaulty(page_t *); +int page_mem_avail(pgcnt_t); + +void page_set_props(page_t *, uint_t); +void page_clr_all_props(page_t *); + +kmutex_t *page_vnode_mutex(struct vnode *); +kmutex_t *page_se_mutex(struct page *); +kmutex_t *page_szc_lock(struct page *); +int page_szc_lock_assert(struct page *pp); + +/* + * Page relocation interfaces. page_relocate() is generic. + * page_get_replacement_page() is provided by the PSM. + * page_free_replacement_page() is generic. + */ +int group_page_trylock(page_t *, se_t); +void group_page_unlock(page_t *); +int page_relocate(page_t **, page_t **, int, int, spgcnt_t *, struct lgrp *); +int do_page_relocate(page_t **, page_t **, int, spgcnt_t *, struct lgrp *); +page_t *page_get_replacement_page(page_t *, struct lgrp *, uint_t); +void page_free_replacement_page(page_t *); +int page_relocate_cage(page_t **, page_t **); + +int page_try_demote_pages(page_t *); +void page_demote_free_pages(page_t *); + +struct anon_map; + +void page_mark_migrate(struct seg *, caddr_t, size_t, struct anon_map *, + ulong_t, vnode_t *, u_offset_t, int); +void page_migrate(struct seg *, caddr_t, page_t **, pgcnt_t); + +/* + * Tell the PIM we are adding physical memory + */ +void add_physmem(page_t *, size_t, pfn_t); +void add_physmem_cb(page_t *, pfn_t); /* callback for page_t part */ + +/* + * hw_page_array[] is configured with hardware supported page sizes by + * platform specific code. 
+ */ +typedef struct { + size_t hp_size; + uint_t hp_shift; + pgcnt_t hp_pgcnt; /* base pagesize cnt */ +} hw_pagesize_t; + +extern hw_pagesize_t hw_page_array[]; +extern uint_t page_colors, page_colors_mask; +extern uint_t page_coloring_shift; +extern int cpu_page_colors; + +uint_t page_num_pagesizes(void); +uint_t page_num_user_pagesizes(void); +size_t page_get_pagesize(uint_t); +size_t page_get_user_pagesize(uint_t n); +pgcnt_t page_get_pagecnt(uint_t); +uint_t page_get_shift(uint_t); +int page_szc(size_t); +int page_user_szc(size_t); + + +/* page_get_replacement page flags */ +#define PGR_SAMESZC 0x1 /* only look for page size same as orig */ +#define PGR_NORELOC 0x2 /* allocate a P_NORELOC page */ + +#endif /* _KERNEL */ + +/* + * Constants used for the p_iolock_state + */ +#define PAGE_IO_INUSE 0x1 +#define PAGE_IO_WANTED 0x2 + +/* + * Constants used for page_release status + */ +#define PGREL_NOTREL 0x1 +#define PGREL_CLEAN 0x2 +#define PGREL_MOD 0x3 + +/* + * The p_state field holds what used to be the p_age and p_free + * bits. These fields are protected by p_selock (see above). + */ +#define P_FREE 0x80 /* Page on free list */ +#define P_NORELOC 0x40 /* Page is non-relocatable */ +#define P_MIGRATE 0x20 /* Migrate page on next touch */ +#define P_SWAP 0x10 /* belongs to vnode that is V_ISSWAP */ + +#define PP_ISFREE(pp) ((pp)->p_state & P_FREE) +#define PP_ISAGED(pp) (((pp)->p_state & P_FREE) && \ + ((pp)->p_vnode == NULL)) +#define PP_ISNORELOC(pp) ((pp)->p_state & P_NORELOC) +#define PP_ISMIGRATE(pp) ((pp)->p_state & P_MIGRATE) +#define PP_ISSWAP(pp) ((pp)->p_state & P_SWAP) + +#define PP_SETFREE(pp) ((pp)->p_state = ((pp)->p_state & ~P_MIGRATE) \ + | P_FREE) +#define PP_SETAGED(pp) ASSERT(PP_ISAGED(pp)) +#define PP_SETNORELOC(pp) ((pp)->p_state |= P_NORELOC) +#define PP_SETMIGRATE(pp) ((pp)->p_state |= P_MIGRATE) +#define PP_SETSWAP(pp) ((pp)->p_state |= P_SWAP) + +#define PP_CLRFREE(pp) ((pp)->p_state &= ~P_FREE) +#define PP_CLRAGED(pp) ASSERT(!PP_ISAGED(pp)) +#define PP_CLRNORELOC(pp) ((pp)->p_state &= ~P_NORELOC) +#define PP_CLRMIGRATE(pp) ((pp)->p_state &= ~P_MIGRATE) +#define PP_CLRSWAP(pp) ((pp)->p_state &= ~P_SWAP) + + + +/* + * kpm large page description. + * The virtual address range of segkpm is divided into chunks of + * kpm_pgsz. Each chunk is controlled by a kpm_page_t. The ushort + * is sufficient for 2^^15 * PAGESIZE, so e.g. the maximum kpm_pgsz + * for 8K is 256M and 2G for 64K pages. It it kept as small as + * possible to save physical memory space. + * + * There are 2 segkpm mapping windows within in the virtual address + * space when we have to prevent VAC alias conflicts. The so called + * Alias window (mappings are always by PAGESIZE) is controlled by + * kp_refcnta. The regular window is controlled by kp_refcnt for the + * normal operation, which is to use the largest available pagesize. + * When VAC alias conflicts are present within a chunk in the regular + * window the large page mapping is broken up into smaller PAGESIZE + * mappings. kp_refcntc is used to control the pages that are invoked + * in the conflict and kp_refcnts holds the active mappings done + * with the small page size. In non vac conflict mode kp_refcntc is + * also used as "go" indication (-1) for the trap level tsbmiss + * handler. 
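On the p_state flag macros above: PP_SETFREE() deliberately clears P_MIGRATE while setting P_FREE, so a page queued for migrate-on-next-touch loses that hint when it is freed. A small illustration using a stripped-down stand-in for page_t (struct toy_page is invented for the demo):

#include <stdio.h>

#define P_FREE    0x80
#define P_NORELOC 0x40
#define P_MIGRATE 0x20
#define P_SWAP    0x10

struct toy_page { unsigned char p_state; };

#define PP_ISFREE(pp)    ((pp)->p_state & P_FREE)
#define PP_ISMIGRATE(pp) ((pp)->p_state & P_MIGRATE)
#define PP_SETFREE(pp)   ((pp)->p_state = \
	((pp)->p_state & ~P_MIGRATE) | P_FREE)

int
main(void)
{
	struct toy_page pg = { P_MIGRATE | P_NORELOC };

	PP_SETFREE(&pg);
	printf("free=%d migrate=%d state=%#x\n",
	    PP_ISFREE(&pg) != 0, PP_ISMIGRATE(&pg) != 0, pg.p_state);
	/* prints: free=1 migrate=0 state=0xc0 */
	return (0);
}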
+ */ +typedef struct kpm_page { + short kp_refcnt; /* pages mapped large */ + short kp_refcnta; /* pages mapped in Alias window */ + short kp_refcntc; /* TL-tsbmiss flag; #vac alias conflict pages */ + short kp_refcnts; /* vac alias: pages mapped small */ +} kpm_page_t; + +/* + * Note: khl_lock offset changes must be reflected in sfmmu_asm.s + */ +typedef struct kpm_hlk { + kmutex_t khl_mutex; /* kpm_page mutex */ + uint_t khl_lock; /* trap level tsbmiss handling */ +} kpm_hlk_t; + +/* + * kpm small page description. + * When kpm_pgsz is equal to PAGESIZE a smaller representation is used + * to save memory space. Alias range mappings and regular segkpm + * mappings are done in units of PAGESIZE and can share the mapping + * information and the mappings are always distinguishable by their + * virtual address. Other information neeeded for VAC conflict prevention + * is already available on a per page basis. There are basically 3 states + * a kpm_spage can have: not mapped (0), mapped in Alias range or virtually + * uncached (1) and mapped in the regular segkpm window (-1). The -1 value + * is also used as "go" indication for the segkpm trap level tsbmiss + * handler for small pages (value is kept the same as it is used for large + * mappings). + */ +typedef struct kpm_spage { + char kp_mapped; /* page mapped small */ +} kpm_spage_t; + +/* + * Note: kshl_lock offset changes must be reflected in sfmmu_asm.s + */ +typedef struct kpm_shlk { + uint_t kshl_lock; /* trap level tsbmiss handling */ +} kpm_shlk_t; + +/* + * Each segment of physical memory is described by a memseg struct. + * Within a segment, memory is considered contiguous. The members + * can be categorized as follows: + * . Platform independent: + * pages, epages, pages_base, pages_end, next, lnext. + * . 64bit only but platform independent: + * kpm_pbase, kpm_nkpmpgs, kpm_pages, kpm_spages. + * . Really platform or mmu specific: + * pagespa, epagespa, nextpa, kpm_pagespa. + * . Mixed: + * msegflags. 
+ */ +struct memseg { + page_t *pages, *epages; /* [from, to] in page array */ + pfn_t pages_base, pages_end; /* [from, to] in page numbers */ + struct memseg *next; /* next segment in list */ +#if defined(__sparc) + struct memseg *lnext; /* next segment in deleted list */ + uint64_t pagespa, epagespa; /* [from, to] page array physical */ + uint64_t nextpa; /* physical next pointer */ + pfn_t kpm_pbase; /* start of kpm range */ + pgcnt_t kpm_nkpmpgs; /* # of kpm_pgsz pages */ + union _mseg_un { + kpm_page_t *kpm_lpgs; /* ptr to kpm_page array */ + kpm_spage_t *kpm_spgs; /* ptr to kpm_spage array */ + } mseg_un; + uint64_t kpm_pagespa; /* physical ptr to kpm (s)pages array */ + uint_t msegflags; /* memseg flags */ +#endif /* __sparc */ +}; + +/* memseg union aliases */ +#define kpm_pages mseg_un.kpm_lpgs +#define kpm_spages mseg_un.kpm_spgs + +/* msegflags */ +#define MEMSEG_DYNAMIC 0x1 /* DR: memory was added dynamically */ + +/* memseg support macros */ +#define MSEG_NPAGES(SEG) ((SEG)->pages_end - (SEG)->pages_base) + +/* memseg hash */ +#define MEM_HASH_SHIFT 0x9 +#define N_MEM_SLOTS 0x200 /* must be a power of 2 */ +#define MEMSEG_PFN_HASH(pfn) (((pfn)/mhash_per_slot) & (N_MEM_SLOTS - 1)) + +/* memseg externals */ +extern struct memseg *memsegs; /* list of memory segments */ +extern ulong_t mhash_per_slot; +extern uint64_t memsegspa; /* memsegs as physical address */ + +void build_pfn_hash(); +extern struct memseg *page_numtomemseg_nolock(pfn_t pfnum); + + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_PAGE_H */ diff --git a/usr/src/uts/common/vm/page_lock.c b/usr/src/uts/common/vm/page_lock.c new file mode 100644 index 0000000000..9a2d12dd8e --- /dev/null +++ b/usr/src/uts/common/vm/page_lock.c @@ -0,0 +1,861 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * VM - page locking primitives + */ +#include <sys/param.h> +#include <sys/t_lock.h> +#include <sys/vtrace.h> +#include <sys/debug.h> +#include <sys/cmn_err.h> +#include <sys/vnode.h> +#include <sys/bitmap.h> +#include <sys/lockstat.h> +#include <sys/condvar_impl.h> +#include <vm/page.h> +#include <vm/seg_enum.h> +#include <vm/vm_dep.h> + +/* + * This global mutex is for logical page locking. + * The following fields in the page structure are protected + * by this lock: + * + * p_lckcnt + * p_cowcnt + */ +kmutex_t page_llock; + +/* + * This is a global lock for the logical page free list. The + * logical free list, in this implementation, is maintained as two + * separate physical lists - the cache list and the free list. 
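On the memseg list declared above: each segment covers the pfn range from pages_base up to pages_end (MSEG_NPAGES is the difference), and page_numtomemseg_nolock() resolves a pfn to its segment; MEMSEG_PFN_HASH only accelerates that kind of lookup. A toy user-space walk of such a list, with types and values invented for the demo:

#include <stdio.h>
#include <stddef.h>

typedef unsigned long pfn_t;

struct toy_memseg {
	pfn_t pages_base, pages_end;       /* pfn range of the segment */
	struct toy_memseg *next;
};

static struct toy_memseg seg1 = { 0x80000, 0xC0000, NULL };
static struct toy_memseg seg0 = { 0x00000, 0x40000, &seg1 };

static struct toy_memseg *
pfn_to_memseg(struct toy_memseg *list, pfn_t pfn)
{
	for (; list != NULL; list = list->next)
		if (pfn >= list->pages_base && pfn < list->pages_end)
			return (list);
	return (NULL);
}

int
main(void)
{
	struct toy_memseg *seg = pfn_to_memseg(&seg0, 0x81234);

	if (seg != NULL)
		printf("pfn 0x81234 lives in segment [%#lx, %#lx)\n",
		    seg->pages_base, seg->pages_end);
	return (0);
}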
+ */ +kmutex_t page_freelock; + +/* + * The hash table, page_hash[], the p_selock fields, and the + * list of pages associated with vnodes are protected by arrays of mutexes. + * + * Unless the hashes are changed radically, the table sizes must be + * a power of two. Also, we typically need more mutexes for the + * vnodes since these locks are occasionally held for long periods. + * And since there seem to be two special vnodes (kvp and swapvp), + * we make room for private mutexes for them. + * + * The pse_mutex[] array holds the mutexes to protect the p_selock + * fields of all page_t structures. + * + * PAGE_SE_MUTEX(pp) returns the address of the appropriate mutex + * when given a pointer to a page_t. + * + * PSE_TABLE_SIZE must be a power of two. One could argue that we + * should go to the trouble of setting it up at run time and base it + * on memory size rather than the number of compile time CPUs. + * + * XX64 We should be using physmem size to calculate PSE_TABLE_SIZE, + * PSE_SHIFT, PIO_SHIFT. + * + * These might break in 64 bit world. + */ +#define PSE_SHIFT 7 /* log2(PSE_TABLE_SIZE) */ + +#define PSE_TABLE_SIZE 128 /* number of mutexes to have */ + +#define PIO_SHIFT PSE_SHIFT /* next power of 2 bigger than page_t */ +#define PIO_TABLE_SIZE PSE_TABLE_SIZE /* number of io mutexes to have */ + +pad_mutex_t ph_mutex[PH_TABLE_SIZE]; +pad_mutex_t pse_mutex[PSE_TABLE_SIZE]; +kmutex_t pio_mutex[PIO_TABLE_SIZE]; + +#define PAGE_SE_MUTEX(pp) \ + &pse_mutex[((((uintptr_t)(pp) >> PSE_SHIFT) ^ \ + ((uintptr_t)(pp) >> (PSE_SHIFT << 1))) & \ + (PSE_TABLE_SIZE - 1))].pad_mutex + +#define PAGE_IO_MUTEX(pp) \ + &pio_mutex[(((uintptr_t)pp) >> PIO_SHIFT) & (PIO_TABLE_SIZE - 1)] + +#define PSZC_MTX_TABLE_SIZE 128 +#define PSZC_MTX_TABLE_SHIFT 7 + +static pad_mutex_t pszc_mutex[PSZC_MTX_TABLE_SIZE]; + +#define PAGE_SZC_MUTEX(_pp) \ + &pszc_mutex[((((uintptr_t)(_pp) >> PSZC_MTX_TABLE_SHIFT) ^ \ + ((uintptr_t)(_pp) >> (PSZC_MTX_TABLE_SHIFT << 1)) ^ \ + ((uintptr_t)(_pp) >> (3 * PSZC_MTX_TABLE_SHIFT))) & \ + (PSZC_MTX_TABLE_SIZE - 1))].pad_mutex + +/* + * The vph_mutex[] array holds the mutexes to protect the vnode chains, + * (i.e., the list of pages anchored by v_pages and connected via p_vpprev + * and p_vpnext). + * + * The page_vnode_mutex(vp) function returns the address of the appropriate + * mutex from this array given a pointer to a vnode. It is complicated + * by the fact that the kernel's vnode and the swapfs vnode are referenced + * frequently enough to warrent their own mutexes. + * + * The VP_HASH_FUNC returns the index into the vph_mutex array given + * an address of a vnode. + */ + +/* + * XX64 VPH_TABLE_SIZE and VP_HASH_FUNC might break in 64 bit world. + * Need to review again. + */ +#define VPH_TABLE_SIZE (2 << VP_SHIFT) + +#define VP_HASH_FUNC(vp) \ + ((((uintptr_t)(vp) >> 6) + \ + ((uintptr_t)(vp) >> 8) + \ + ((uintptr_t)(vp) >> 10) + \ + ((uintptr_t)(vp) >> 12)) \ + & (VPH_TABLE_SIZE - 1)) + +extern struct vnode kvp; + +kmutex_t vph_mutex[VPH_TABLE_SIZE + 2]; + +/* + * Initialize the locks used by the Virtual Memory Management system. + */ +void +page_lock_init() +{ +} + +/* + * At present we only use page ownership to aid debugging, so it's + * OK if the owner field isn't exact. In the 32-bit world two thread ids + * can map to the same owner because we just 'or' in 0x80000000 and + * then clear the second highest bit, so that (for example) 0x2faced00 + * and 0xafaced00 both map to 0xafaced00. 
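The aliasing described above can be reproduced with plain arithmetic: the writer encoding ORs in the sign bit and strips SE_EWANTED, so the two example thread ids collapse to the same owner value. A quick check (se_writer() is a user-space stand-in for the SE_WRITER macro defined just below):

#include <stdio.h>

#define SE_EWANTED 0x40000000

static unsigned int
se_writer(unsigned int tid)
{
	/* set the sign bit (INT_MIN), clear the SE_EWANTED bit */
	return ((tid | 0x80000000u) & ~(unsigned int)SE_EWANTED);
}

int
main(void)
{
	printf("%#x -> %#x\n", 0x2faced00, se_writer(0x2faced00));
	printf("%#x -> %#x\n", 0xafaced00, se_writer(0xafaced00));
	/* both map to 0xafaced00; as a signed selock_t either is negative */
	return (0);
}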
+ * In the 64-bit world, p_selock may not be large enough to hold a full + * thread pointer. If we ever need precise ownership (e.g. if we implement + * priority inheritance for page locks) then p_selock should become a + * uintptr_t and SE_WRITER should be -((uintptr_t)curthread >> 2). + */ +#define SE_WRITER (((selock_t)(ulong_t)curthread | INT_MIN) & ~SE_EWANTED) +#define SE_READER 1 + +/* + * A page that is deleted must be marked as such using the + * page_lock_delete() function. The page must be exclusively locked. + * The SE_DELETED marker is put in p_selock when this function is called. + * SE_DELETED must be distinct from any SE_WRITER value. + */ +#define SE_DELETED (1 | INT_MIN) + +#ifdef VM_STATS +uint_t vph_kvp_count; +uint_t vph_swapfsvp_count; +uint_t vph_other; +#endif /* VM_STATS */ + +#ifdef VM_STATS +uint_t page_lock_count; +uint_t page_lock_miss; +uint_t page_lock_miss_lock; +uint_t page_lock_reclaim; +uint_t page_lock_bad_reclaim; +uint_t page_lock_same_page; +uint_t page_lock_upgrade; +uint_t page_lock_upgrade_failed; +uint_t page_lock_deleted; + +uint_t page_trylock_locked; +uint_t page_trylock_missed; + +uint_t page_try_reclaim_upgrade; +#endif /* VM_STATS */ + + +/* + * Acquire the "shared/exclusive" lock on a page. + * + * Returns 1 on success and locks the page appropriately. + * 0 on failure and does not lock the page. + * + * If `lock' is non-NULL, it will be dropped and reacquired in the + * failure case. This routine can block, and if it does + * it will always return a failure since the page identity [vp, off] + * or state may have changed. + */ + +int +page_lock(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim) +{ + return (page_lock_es(pp, se, lock, reclaim, 0)); +} + +/* + * With the addition of reader-writer lock semantics to page_lock_es, + * callers wanting an exclusive (writer) lock may prevent shared-lock + * (reader) starvation by setting the es parameter to SE_EXCL_WANTED. + * In this case, when an exclusive lock cannot be acquired, p_selock's + * SE_EWANTED bit is set. + * This bit, along with the se and es parameters, are used to decide + * if the requested lock should be granted: + * + * Lock wanted SE_EXCL_WANTED p_selock/SE_EWANTED Action + * ---------- -------------- ------------------- --------- + * SE_EXCL no dont-care/1 deny lock + * SE_EXCL any(see note) unlocked/any grant lock, clear SE_EWANTED + * SE_EXCL yes any lock/any deny, set SE_EWANTED + * SE_EXCL no any lock/any deny + * SE_SHARED not applicable shared/0 grant + * SE_SHARED not applicable unlocked/0 grant + * SE_SHARED not applicable shared/1 deny + * SE_SHARED not applicable unlocked/1 deny + * SE_SHARED not applicable excl/any deny + * + * Note: the code grants an exclusive lock to the caller and clears + * SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED + * bit's value. This was deemed acceptable as we are not concerned about + * exclusive-lock starvation. If this ever becomes an issue, a priority or + * fifo mechanism should also be implemented. + */ +int +page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es) +{ + int retval; + kmutex_t *pse = PAGE_SE_MUTEX(pp); + int upgraded; + int reclaim_it; + + ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1); + + VM_STAT_ADD(page_lock_count); + + upgraded = 0; + reclaim_it = 0; + + mutex_enter(pse); + + /* + * Current uses of 'es': + * es == 1 page_lookup_create will attempt page relocation + * es == SE_EXCL_WANTED caller wants SE_EWANTED set (eg. 
delete + * memory thread); this prevents reader-starvation of waiting + * writer thread(s). + */ + + + ASSERT(((es & SE_EXCL_WANTED) == 0) || + ((es == SE_EXCL_WANTED) && (se == SE_EXCL))); + + if (se == SE_SHARED && es == 1 && pp->p_selock == 0) { + se = SE_EXCL; + } + + if ((reclaim == P_RECLAIM) && (PP_ISFREE(pp))) { + + reclaim_it = 1; + if (se == SE_SHARED) { + /* + * This is an interesting situation. + * + * Remember that p_free can only change if + * p_selock < 0. + * p_free does not depend on our holding `pse'. + * And, since we hold `pse', p_selock can not change. + * So, if p_free changes on us, the page is already + * exclusively held, and we would fail to get p_selock + * regardless. + * + * We want to avoid getting the share + * lock on a free page that needs to be reclaimed. + * It is possible that some other thread has the share + * lock and has left the free page on the cache list. + * pvn_vplist_dirty() does this for brief periods. + * If the se_share is currently SE_EXCL, we will fail + * to acquire p_selock anyway. Blocking is the + * right thing to do. + * If we need to reclaim this page, we must get + * exclusive access to it, force the upgrade now. + * Again, we will fail to acquire p_selock if the + * page is not free and block. + */ + upgraded = 1; + se = SE_EXCL; + VM_STAT_ADD(page_lock_upgrade); + } + } + + if (se == SE_EXCL) { + if ((es != SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) { + /* + * if the caller wants a writer lock (but did not + * specify exclusive access), and there is a pending + * writer that wants exclusive access, return failure + */ + retval = 0; + } else if ((pp->p_selock & ~SE_EWANTED) == 0) { + /* no reader/writer lock held */ + THREAD_KPRI_REQUEST(); + /* this clears our setting of the SE_EWANTED bit */ + pp->p_selock = SE_WRITER; + retval = 1; + } else { + /* page is locked */ + if (es == SE_EXCL_WANTED) { + /* set the SE_EWANTED bit */ + pp->p_selock |= SE_EWANTED; + } + retval = 0; + } + } else { + retval = 0; + if (pp->p_selock >= 0) { + /* readers are not allowed when excl wanted */ + if (!(pp->p_selock & SE_EWANTED)) { + pp->p_selock += SE_READER; + retval = 1; + } + } + } + + if (retval == 0) { + if ((pp->p_selock & ~SE_EWANTED) == SE_DELETED) { + VM_STAT_ADD(page_lock_deleted); + mutex_exit(pse); + return (retval); + } + +#ifdef VM_STATS + VM_STAT_ADD(page_lock_miss); + if (upgraded) { + VM_STAT_ADD(page_lock_upgrade_failed); + } +#endif + if (lock) { + VM_STAT_ADD(page_lock_miss_lock); + mutex_exit(lock); + } + + /* + * Now, wait for the page to be unlocked and + * release the lock protecting p_cv and p_selock. + */ + cv_wait(&pp->p_cv, pse); + mutex_exit(pse); + + /* + * The page identity may have changed while we were + * blocked. If we are willing to depend on "pp" + * still pointing to a valid page structure (i.e., + * assuming page structures are not dynamically allocated + * or freed), we could try to lock the page if its + * identity hasn't changed. + * + * This needs to be measured, since we come back from + * cv_wait holding pse (the expensive part of this + * operation) we might as well try the cheap part. + * Though we would also have to confirm that dropping + * `lock' did not cause any grief to the callers. + */ + if (lock) { + mutex_enter(lock); + } + } else { + /* + * We have the page lock. + * If we needed to reclaim the page, and the page + * needed reclaiming (ie, it was free), then we + * have the page exclusively locked. We may need + * to downgrade the page. + */ + ASSERT((upgraded) ? 
+ ((PP_ISFREE(pp)) && PAGE_EXCL(pp)) : 1); + mutex_exit(pse); + + /* + * We now hold this page's lock, either shared or + * exclusive. This will prevent its identity from changing. + * The page, however, may or may not be free. If the caller + * requested, and it is free, go reclaim it from the + * free list. If the page can't be reclaimed, return failure + * so that the caller can start all over again. + * + * NOTE:page_reclaim() releases the page lock (p_selock) + * if it can't be reclaimed. + */ + if (reclaim_it) { + if (!page_reclaim(pp, lock)) { + VM_STAT_ADD(page_lock_bad_reclaim); + retval = 0; + } else { + VM_STAT_ADD(page_lock_reclaim); + if (upgraded) { + page_downgrade(pp); + } + } + } + } + return (retval); +} + +/* + * Clear the SE_EWANTED bit from p_selock. This function allows + * callers of page_lock_es and page_try_reclaim_lock to clear + * their setting of this bit if they decide they no longer wish + * to gain exclusive access to the page. Currently only + * delete_memory_thread uses this when the delete memory + * operation is cancelled. + */ +void +page_lock_clr_exclwanted(page_t *pp) +{ + kmutex_t *pse = PAGE_SE_MUTEX(pp); + + mutex_enter(pse); + pp->p_selock &= ~SE_EWANTED; + if (CV_HAS_WAITERS(&pp->p_cv)) + cv_broadcast(&pp->p_cv); + mutex_exit(pse); +} + +/* + * Read the comments inside of page_lock_es() carefully. + * + * SE_EXCL callers specifying es == SE_EXCL_WANTED will cause the + * SE_EWANTED bit of p_selock to be set when the lock cannot be obtained. + * This is used by threads subject to reader-starvation (eg. memory delete). + * + * When a thread using SE_EXCL_WANTED does not obtain the SE_EXCL lock, + * it is expected that it will retry at a later time. Threads that will + * not retry the lock *must* call page_lock_clr_exclwanted to clear the + * SE_EWANTED bit. (When a thread using SE_EXCL_WANTED obtains the lock, + * the bit is cleared.) + */ +int +page_try_reclaim_lock(page_t *pp, se_t se, int es) +{ + kmutex_t *pse = PAGE_SE_MUTEX(pp); + selock_t old; + + mutex_enter(pse); + + old = pp->p_selock; + + ASSERT(((es & SE_EXCL_WANTED) == 0) || + ((es == SE_EXCL_WANTED) && (se == SE_EXCL))); + + if (se == SE_SHARED && es == 1 && old == 0) { + se = SE_EXCL; + } + + if (se == SE_SHARED) { + if (!PP_ISFREE(pp)) { + if (old >= 0) { + /* readers are not allowed when excl wanted */ + if (!(old & SE_EWANTED)) { + pp->p_selock = old + SE_READER; + mutex_exit(pse); + return (1); + } + } + mutex_exit(pse); + return (0); + } + /* + * The page is free, so we really want SE_EXCL (below) + */ + VM_STAT_ADD(page_try_reclaim_upgrade); + } + + /* + * The caller wants a writer lock. We try for it only if + * SE_EWANTED is not set, or if the caller specified + * SE_EXCL_WANTED. + */ + if (!(old & SE_EWANTED) || (es == SE_EXCL_WANTED)) { + if ((old & ~SE_EWANTED) == 0) { + /* no reader/writer lock held */ + THREAD_KPRI_REQUEST(); + /* this clears out our setting of the SE_EWANTED bit */ + pp->p_selock = SE_WRITER; + mutex_exit(pse); + return (1); + } + } + if (es == SE_EXCL_WANTED) { + /* page is locked, set the SE_EWANTED bit */ + pp->p_selock |= SE_EWANTED; + } + mutex_exit(pse); + return (0); +} + +/* + * Acquire a page's "shared/exclusive" lock, but never block. + * Returns 1 on success, 0 on failure. 
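 *
 * The non-blocking check below boils down to a small state test on the
 * selock encoding (0 = unlocked, negative = writer, positive = reader
 * count). A reduced user-level model (illustration only; it leaves out
 * SE_EWANTED, SE_DELETED and the kernel priority handling):
 *
 *     #include <pthread.h>
 *
 *     static pthread_mutex_t pse = PTHREAD_MUTEX_INITIALIZER;
 *     static long selock;              // 0 free, <0 writer, >0 readers
 *
 *     static int
 *     model_trylock(int exclusive)
 *     {
 *         int ok = 0;
 *
 *         pthread_mutex_lock(&pse);
 *         if (exclusive) {
 *             if (selock == 0) {
 *                 selock = -1;         // become the writer
 *                 ok = 1;
 *             }
 *         } else if (selock >= 0) {
 *             selock++;                // one more reader
 *             ok = 1;
 *         }
 *         pthread_mutex_unlock(&pse);
 *         return (ok);
 *     }
 *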
+ */ +int +page_trylock(page_t *pp, se_t se) +{ + kmutex_t *pse = PAGE_SE_MUTEX(pp); + + mutex_enter(pse); + if (pp->p_selock & SE_EWANTED) { + /* fail if a thread wants exclusive access */ + mutex_exit(pse); + return (0); + } + + if (se == SE_EXCL) { + if (pp->p_selock == 0) { + THREAD_KPRI_REQUEST(); + pp->p_selock = SE_WRITER; + mutex_exit(pse); + return (1); + } + } else { + if (pp->p_selock >= 0) { + pp->p_selock += SE_READER; + mutex_exit(pse); + return (1); + } + } + mutex_exit(pse); + return (0); +} + +/* + * Release the page's "shared/exclusive" lock and wake up anyone + * who might be waiting for it. + */ +void +page_unlock(page_t *pp) +{ + kmutex_t *pse = PAGE_SE_MUTEX(pp); + selock_t old; + + mutex_enter(pse); + old = pp->p_selock; + if ((old & ~SE_EWANTED) == SE_READER) { + pp->p_selock = old & ~SE_READER; + if (CV_HAS_WAITERS(&pp->p_cv)) + cv_broadcast(&pp->p_cv); + } else if ((old & ~SE_EWANTED) == SE_DELETED) { + panic("page_unlock: page %p is deleted", pp); + } else if (old < 0) { + THREAD_KPRI_RELEASE(); + pp->p_selock &= SE_EWANTED; + if (CV_HAS_WAITERS(&pp->p_cv)) + cv_broadcast(&pp->p_cv); + } else if ((old & ~SE_EWANTED) > SE_READER) { + pp->p_selock = old - SE_READER; + } else { + panic("page_unlock: page %p is not locked", pp); + } + mutex_exit(pse); +} + +/* + * Try to upgrade the lock on the page from a "shared" to an + * "exclusive" lock. Since this upgrade operation is done while + * holding the mutex protecting this page, no one else can acquire this page's + * lock and change the page. Thus, it is safe to drop the "shared" + * lock and attempt to acquire the "exclusive" lock. + * + * Returns 1 on success, 0 on failure. + */ +int +page_tryupgrade(page_t *pp) +{ + kmutex_t *pse = PAGE_SE_MUTEX(pp); + + mutex_enter(pse); + if (!(pp->p_selock & SE_EWANTED)) { + /* no threads want exclusive access, try upgrade */ + if (pp->p_selock == SE_READER) { + THREAD_KPRI_REQUEST(); + /* convert to exclusive lock */ + pp->p_selock = SE_WRITER; + mutex_exit(pse); + return (1); + } + } + mutex_exit(pse); + return (0); +} + +/* + * Downgrade the "exclusive" lock on the page to a "shared" lock + * while holding the mutex protecting this page's p_selock field. + */ +void +page_downgrade(page_t *pp) +{ + kmutex_t *pse = PAGE_SE_MUTEX(pp); + int excl_waiting; + + ASSERT((pp->p_selock & ~SE_EWANTED) != SE_DELETED); + ASSERT(PAGE_EXCL(pp)); + + mutex_enter(pse); + excl_waiting = pp->p_selock & SE_EWANTED; + THREAD_KPRI_RELEASE(); + pp->p_selock = SE_READER | excl_waiting; + if (CV_HAS_WAITERS(&pp->p_cv)) + cv_broadcast(&pp->p_cv); + mutex_exit(pse); +} + +void +page_lock_delete(page_t *pp) +{ + kmutex_t *pse = PAGE_SE_MUTEX(pp); + + ASSERT(PAGE_EXCL(pp)); + ASSERT(pp->p_vnode == NULL); + ASSERT(pp->p_offset == (u_offset_t)-1); + ASSERT(!PP_ISFREE(pp)); + + mutex_enter(pse); + THREAD_KPRI_RELEASE(); + pp->p_selock = SE_DELETED; + if (CV_HAS_WAITERS(&pp->p_cv)) + cv_broadcast(&pp->p_cv); + mutex_exit(pse); +} + +/* + * Implement the io lock for pages + */ +void +page_iolock_init(page_t *pp) +{ + pp->p_iolock_state = 0; + cv_init(&pp->p_io_cv, NULL, CV_DEFAULT, NULL); +} + +/* + * Acquire the i/o lock on a page. + */ +void +page_io_lock(page_t *pp) +{ + kmutex_t *pio; + + pio = PAGE_IO_MUTEX(pp); + mutex_enter(pio); + while (pp->p_iolock_state & PAGE_IO_INUSE) { + cv_wait(&(pp->p_io_cv), pio); + } + pp->p_iolock_state |= PAGE_IO_INUSE; + mutex_exit(pio); +} + +/* + * Release the i/o lock on a page. 
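 *
 * The i/o lock is a single "in use" bit guarded by a hashed mutex, with a
 * condition variable for waiters. A user-level model of the lock/unlock
 * pair (illustration only, hypothetical names):
 *
 *     #include <pthread.h>
 *
 *     static pthread_mutex_t pio = PTHREAD_MUTEX_INITIALIZER;
 *     static pthread_cond_t io_cv = PTHREAD_COND_INITIALIZER;
 *     static int io_inuse;
 *
 *     static void
 *     model_io_lock(void)
 *     {
 *         pthread_mutex_lock(&pio);
 *         while (io_inuse)             // wait for the current owner
 *             pthread_cond_wait(&io_cv, &pio);
 *         io_inuse = 1;
 *         pthread_mutex_unlock(&pio);
 *     }
 *
 *     static void
 *     model_io_unlock(void)
 *     {
 *         pthread_mutex_lock(&pio);
 *         io_inuse = 0;
 *         pthread_cond_signal(&io_cv); // wake one waiter
 *         pthread_mutex_unlock(&pio);
 *     }
 *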
+ */ +void +page_io_unlock(page_t *pp) +{ + kmutex_t *pio; + + pio = PAGE_IO_MUTEX(pp); + mutex_enter(pio); + cv_signal(&pp->p_io_cv); + pp->p_iolock_state &= ~PAGE_IO_INUSE; + mutex_exit(pio); +} + +/* + * Try to acquire the i/o lock on a page without blocking. + * Returns 1 on success, 0 on failure. + */ +int +page_io_trylock(page_t *pp) +{ + kmutex_t *pio; + + if (pp->p_iolock_state & PAGE_IO_INUSE) + return (0); + + pio = PAGE_IO_MUTEX(pp); + mutex_enter(pio); + + if (pp->p_iolock_state & PAGE_IO_INUSE) { + mutex_exit(pio); + return (0); + } + pp->p_iolock_state |= PAGE_IO_INUSE; + mutex_exit(pio); + + return (1); +} + +/* + * Assert that the i/o lock on a page is held. + * Returns 1 on success, 0 on failure. + */ +int +page_iolock_assert(page_t *pp) +{ + return (pp->p_iolock_state & PAGE_IO_INUSE); +} + +/* + * Wrapper exported to kernel routines that are built + * platform-independent (the macro is platform-dependent; + * the size of vph_mutex[] is based on NCPU). + * + * Note that you can do stress testing on this by setting the + * variable page_vnode_mutex_stress to something other than + * zero in a DEBUG kernel in a debugger after loading the kernel. + * Setting it after the kernel is running may not work correctly. + */ +#ifdef DEBUG +static int page_vnode_mutex_stress = 0; +#endif + +kmutex_t * +page_vnode_mutex(vnode_t *vp) +{ + if (vp == &kvp) + return (&vph_mutex[VPH_TABLE_SIZE + 0]); +#ifdef DEBUG + if (page_vnode_mutex_stress != 0) + return (&vph_mutex[0]); +#endif + + return (&vph_mutex[VP_HASH_FUNC(vp)]); +} + +kmutex_t * +page_se_mutex(page_t *pp) +{ + return (PAGE_SE_MUTEX(pp)); +} + +#ifdef VM_STATS +uint_t pszclck_stat[4]; +#endif +/* + * Find, take and return a mutex held by hat_page_demote(). + * Called by page_demote_vp_pages() before hat_page_demote() call and by + * routines that want to block hat_page_demote() but can't do it + * via locking all constituent pages. + * + * Return NULL if p_szc is 0. + * + * It should only be used for pages that can be demoted by hat_page_demote() + * i.e. non swapfs file system pages. The logic here is lifted from + * sfmmu_mlspl_enter() except there's no need to worry about p_szc increase + * since the page is locked and not free. + * + * Hash of the root page is used to find the lock. + * To find the root in the presense of hat_page_demote() chageing the location + * of the root this routine relies on the fact that hat_page_demote() changes + * root last. + * + * If NULL is returned pp's p_szc is guaranteed to be 0. If non NULL is + * returned pp's p_szc may be any value. + */ +kmutex_t * +page_szc_lock(page_t *pp) +{ + kmutex_t *mtx; + page_t *rootpp; + uint_t szc; + uint_t rszc; + uint_t pszc = pp->p_szc; + + ASSERT(pp != NULL); + ASSERT(PAGE_LOCKED(pp)); + ASSERT(!PP_ISFREE(pp)); + ASSERT(pp->p_vnode != NULL); + ASSERT(!IS_SWAPFSVP(pp->p_vnode)); + ASSERT(pp->p_vnode != &kvp); + +again: + if (pszc == 0) { + VM_STAT_ADD(pszclck_stat[0]); + return (NULL); + } + + /* The lock lives in the root page */ + + rootpp = PP_GROUPLEADER(pp, pszc); + mtx = PAGE_SZC_MUTEX(rootpp); + mutex_enter(mtx); + + /* + * since p_szc can only decrease if pp == rootpp + * rootpp will be always the same i.e we have the right root + * regardless of rootpp->p_szc. + * If location of pp's root didn't change after we took + * the lock we have the right root. return mutex hashed off it. 
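 *
 * "Hashed off it" is the same pattern used for pse_mutex[] and pszc_mutex[]
 * above: a fixed, power-of-two sized array of locks indexed by a hash of
 * the pointer. A stand-alone sketch of that slot selection (illustration
 * only, hypothetical table size and shift values):
 *
 *     #include <stdint.h>
 *
 *     #define TBL_SHIFT  7
 *     #define TBL_SIZE   128           // must be a power of two
 *
 *     // pick a slot in a TBL_SIZE-entry lock table for pointer p
 *     static unsigned
 *     hashed_slot(const void *p)
 *     {
 *         uintptr_t a = (uintptr_t)p;
 *
 *         return (unsigned)(((a >> TBL_SHIFT) ^
 *             (a >> (2 * TBL_SHIFT))) & (TBL_SIZE - 1));
 *     }
 *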
+ */ + if (pp == rootpp || (rszc = rootpp->p_szc) == pszc) { + VM_STAT_ADD(pszclck_stat[1]); + return (mtx); + } + + /* + * root location changed because page got demoted. + * locate the new root. + */ + if (rszc < pszc) { + szc = pp->p_szc; + ASSERT(szc < pszc); + mutex_exit(mtx); + pszc = szc; + VM_STAT_ADD(pszclck_stat[2]); + goto again; + } + + VM_STAT_ADD(pszclck_stat[3]); + /* + * current hat_page_demote not done yet. + * wait for it to finish. + */ + mutex_exit(mtx); + rootpp = PP_GROUPLEADER(rootpp, rszc); + mtx = PAGE_SZC_MUTEX(rootpp); + mutex_enter(mtx); + mutex_exit(mtx); + ASSERT(rootpp->p_szc < rszc); + goto again; +} + +int +page_szc_lock_assert(page_t *pp) +{ + page_t *rootpp = PP_PAGEROOT(pp); + kmutex_t *mtx = PAGE_SZC_MUTEX(rootpp); + + return (MUTEX_HELD(mtx)); +} diff --git a/usr/src/uts/common/vm/pvn.h b/usr/src/uts/common/vm/pvn.h new file mode 100644 index 0000000000..0467589ae6 --- /dev/null +++ b/usr/src/uts/common/vm/pvn.h @@ -0,0 +1,117 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2002 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#ifndef _VM_PVN_H +#define _VM_PVN_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/buf.h> +#include <vm/seg.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +/* + * VM - paged vnode. + * + * The VM system manages memory as a cache of paged vnodes. + * This file desribes the interfaces to common subroutines + * used to help implement the VM/file system routines. 
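 *
 * One detail worth calling out: the PVN_GETPAGE_SZ/PVN_GETPAGE_NUM limits
 * defined later in this header take whichever of a byte cap and a
 * page-count cap is smaller. The same choice written as ordinary run-time
 * C (illustration only, with an assumed 8K page size):
 *
 *     #include <stdio.h>
 *     #include <stddef.h>
 *
 *     #define MAX_GETPAGE_SZ   0x10000UL   // byte cap: 64K
 *     #define MAX_GETPAGE_NUM  8UL         // page-count cap
 *
 *     int
 *     main(void)
 *     {
 *         size_t pagesize = 8192;          // assumption for the example
 *         size_t bycount = MAX_GETPAGE_NUM * pagesize;
 *         size_t chunk = (bycount < MAX_GETPAGE_SZ) ?
 *             bycount : MAX_GETPAGE_SZ;
 *
 *         printf("kluster up to %zu bytes (%zu pages)\n",
 *             chunk, chunk / pagesize);
 *         return (0);
 *     }
 *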
+ */ + +struct page *pvn_read_kluster(struct vnode *vp, u_offset_t off, + struct seg *seg, caddr_t addr, u_offset_t *offp, + size_t *lenp, u_offset_t vp_off, size_t vp_len, + int isra); +struct page *pvn_write_kluster(struct vnode *vp, struct page *pp, + u_offset_t *offp, size_t *lenp, u_offset_t vp_off, + size_t vp_len, int flags); +void pvn_read_done(struct page *plist, int flags); +void pvn_write_done(struct page *plist, int flags); +void pvn_io_done(struct page *plist); +int pvn_vplist_dirty(struct vnode *vp, u_offset_t off, + int (*putapage)(vnode_t *, struct page *, u_offset_t *, + size_t *, int, cred_t *), + int flags, struct cred *cred); +int pvn_getdirty(struct page *pp, int flags); +void pvn_vpzero(struct vnode *vp, u_offset_t vplen, size_t zbytes); +int pvn_getpages( + int (*getpage)(vnode_t *, u_offset_t, size_t, uint_t *, + struct page *[], size_t, struct seg *, + caddr_t, enum seg_rw, cred_t *), + struct vnode *vp, u_offset_t off, size_t len, + uint_t *protp, struct page **pl, size_t plsz, + struct seg *seg, caddr_t addr, enum seg_rw rw, + struct cred *cred); +void pvn_plist_init(struct page *pp, struct page **pl, size_t plsz, + u_offset_t off, size_t io_len, enum seg_rw rw); +void pvn_init(void); + +/* + * When requesting pages from the getpage routines, pvn_getpages will + * allocate space to return PVN_GETPAGE_NUM pages which map PVN_GETPAGE_SZ + * worth of bytes. These numbers are chosen to be the minimum of the max's + * given in terms of bytes and pages. + */ +#define PVN_MAX_GETPAGE_SZ 0x10000 /* getpage size limit */ +#define PVN_MAX_GETPAGE_NUM 0x8 /* getpage page limit */ + +#if PVN_MAX_GETPAGE_SZ > PVN_MAX_GETPAGE_NUM * PAGESIZE + +#define PVN_GETPAGE_SZ ptob(PVN_MAX_GETPAGE_NUM) +#define PVN_GETPAGE_NUM PVN_MAX_GETPAGE_NUM + +#else + +#define PVN_GETPAGE_SZ PVN_MAX_GETPAGE_SZ +#define PVN_GETPAGE_NUM btop(PVN_MAX_GETPAGE_SZ) + +#endif + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_PVN_H */ diff --git a/usr/src/uts/common/vm/rm.h b/usr/src/uts/common/vm/rm.h new file mode 100644 index 0000000000..9789283993 --- /dev/null +++ b/usr/src/uts/common/vm/rm.h @@ -0,0 +1,61 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2001 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1988 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. 
+ */ + +#ifndef _VM_RM_H +#define _VM_RM_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +extern size_t rm_asrss(struct as *); +extern size_t rm_assize(struct as *); +extern ushort_t rm_pctmemory(struct as *); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_RM_H */ diff --git a/usr/src/uts/common/vm/seg.h b/usr/src/uts/common/vm/seg.h new file mode 100644 index 0000000000..2ada345960 --- /dev/null +++ b/usr/src/uts/common/vm/seg.h @@ -0,0 +1,252 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#ifndef _VM_SEG_H +#define _VM_SEG_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/vnode.h> +#include <sys/avl.h> +#include <vm/seg_enum.h> +#include <vm/faultcode.h> +#include <vm/hat.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * VM - Segments. + */ + +/* + * kstat statistics for segment advise + */ +typedef struct { + kstat_named_t MADV_FREE_hit; + kstat_named_t MADV_FREE_miss; +} segadvstat_t; + +/* + * memory object ids + */ +typedef struct memid { u_longlong_t val[2]; } memid_t; + +/* + * An address space contains a set of segments, managed by drivers. + * Drivers support mapped devices, sharing, copy-on-write, etc. + * + * The seg structure contains a lock to prevent races, the base virtual + * address and size of the segment, a back pointer to the containing + * address space, pointers to maintain an AVL tree of segments in the + * same address space, and procedure and data hooks for the driver. + * The AVL tree of segments for the address space is sorted by + * ascending base addresses and overlapping segments are not allowed. + * + * After a segment is created, faults may occur on pages of the segment. + * When a fault occurs, the fault handling code must get the desired + * object and set up the hardware translation to the object. For some + * objects, the fault handling code also implements copy-on-write. + * + * When the hat wants to unload a translation, it can call the unload + * routine which is responsible for processing reference and modify bits. + * + * Each segment is protected by it's containing address space lock. 
To + * access any field in the segment structure, the "as" must be locked. + * If a segment field is to be modified, the address space lock must be + * write locked. + */ + +struct seg { + caddr_t s_base; /* base virtual address */ + size_t s_size; /* size in bytes */ + uint_t s_szc; /* max page size code */ + uint_t s_flags; /* flags for segment, see below */ + struct as *s_as; /* containing address space */ + avl_node_t s_tree; /* AVL tree links to segs in this as */ + struct seg_ops *s_ops; /* ops vector: see below */ + void *s_data; /* private data for instance */ +}; + +#define S_PURGE (0x01) /* seg should be purged in as_gap() */ + +struct seg_ops { + int (*dup)(struct seg *, struct seg *); + int (*unmap)(struct seg *, caddr_t, size_t); + void (*free)(struct seg *); + faultcode_t (*fault)(struct hat *, struct seg *, caddr_t, size_t, + enum fault_type, enum seg_rw); + faultcode_t (*faulta)(struct seg *, caddr_t); + int (*setprot)(struct seg *, caddr_t, size_t, uint_t); + int (*checkprot)(struct seg *, caddr_t, size_t, uint_t); + int (*kluster)(struct seg *, caddr_t, ssize_t); + size_t (*swapout)(struct seg *); + int (*sync)(struct seg *, caddr_t, size_t, int, uint_t); + size_t (*incore)(struct seg *, caddr_t, size_t, char *); + int (*lockop)(struct seg *, caddr_t, size_t, int, int, ulong_t *, + size_t); + int (*getprot)(struct seg *, caddr_t, size_t, uint_t *); + u_offset_t (*getoffset)(struct seg *, caddr_t); + int (*gettype)(struct seg *, caddr_t); + int (*getvp)(struct seg *, caddr_t, struct vnode **); + int (*advise)(struct seg *, caddr_t, size_t, uint_t); + void (*dump)(struct seg *); + int (*pagelock)(struct seg *, caddr_t, size_t, struct page ***, + enum lock_type, enum seg_rw); + int (*setpagesize)(struct seg *, caddr_t, size_t, uint_t); + int (*getmemid)(struct seg *, caddr_t, memid_t *); + struct lgrp_mem_policy_info *(*getpolicy)(struct seg *, caddr_t); +}; + +#ifdef _KERNEL +/* + * Generic segment operations + */ +extern void seg_init(void); +extern struct seg *seg_alloc(struct as *as, caddr_t base, size_t size); +extern int seg_attach(struct as *as, caddr_t base, size_t size, + struct seg *seg); +extern void seg_unmap(struct seg *seg); +extern void seg_free(struct seg *seg); + +/* + * functions for pagelock cache support + */ +extern void seg_ppurge(struct seg *seg); +extern void seg_ppurge_seg(int (*callback)()); +extern void seg_pinactive(struct seg *seg, caddr_t addr, size_t len, + struct page **pp, enum seg_rw rw, int (*callback)()); +extern int seg_pinsert_check(struct seg *seg, size_t len, uint_t flags); +extern int seg_pinsert(struct seg *seg, caddr_t addr, size_t len, + struct page **pp, enum seg_rw rw, uint_t flags, + int (*callback)()); +extern struct page **seg_plookup(struct seg *seg, caddr_t addr, + size_t len, enum seg_rw rw); +extern void seg_pasync_thread(void); +extern void seg_preap(void); + +extern int seg_preapahead; +extern segadvstat_t segadvstat; +/* + * Flags for pagelock cache support + */ +#define SEGP_ASYNC_FLUSH 0x1 /* flushed by async thread */ +#define SEGP_FORCE_WIRED 0x2 /* skip check against seg_pwindow */ + +/* + * Return values for seg_pinsert and seg_pinsert_check functions. 
+ */ +#define SEGP_SUCCESS 0 /* seg_pinsert() succeeded */ +#define SEGP_FAIL 1 /* seg_pinsert() failed */ + +/* Page status bits for segop_incore */ +#define SEG_PAGE_INCORE 0x01 /* VA has a page backing it */ +#define SEG_PAGE_LOCKED 0x02 /* VA has a page that is locked */ +#define SEG_PAGE_HASCOW 0x04 /* VA has a page with a copy-on-write */ +#define SEG_PAGE_SOFTLOCK 0x08 /* VA has a page with softlock held */ +#define SEG_PAGE_VNODEBACKED 0x10 /* Segment is backed by a vnode */ +#define SEG_PAGE_ANON 0x20 /* VA has an anonymous page */ +#define SEG_PAGE_VNODE 0x40 /* VA has a vnode page backing it */ + +#define SEGOP_DUP(s, n) (*(s)->s_ops->dup)((s), (n)) +#define SEGOP_UNMAP(s, a, l) (*(s)->s_ops->unmap)((s), (a), (l)) +#define SEGOP_FREE(s) (*(s)->s_ops->free)((s)) +#define SEGOP_FAULT(h, s, a, l, t, rw) \ + (*(s)->s_ops->fault)((h), (s), (a), (l), (t), (rw)) +#define SEGOP_FAULTA(s, a) (*(s)->s_ops->faulta)((s), (a)) +#define SEGOP_SETPROT(s, a, l, p) (*(s)->s_ops->setprot)((s), (a), (l), (p)) +#define SEGOP_CHECKPROT(s, a, l, p) (*(s)->s_ops->checkprot)((s), (a), (l), (p)) +#define SEGOP_KLUSTER(s, a, d) (*(s)->s_ops->kluster)((s), (a), (d)) +#define SEGOP_SWAPOUT(s) (*(s)->s_ops->swapout)((s)) +#define SEGOP_SYNC(s, a, l, atr, f) \ + (*(s)->s_ops->sync)((s), (a), (l), (atr), (f)) +#define SEGOP_INCORE(s, a, l, v) (*(s)->s_ops->incore)((s), (a), (l), (v)) +#define SEGOP_LOCKOP(s, a, l, atr, op, b, p) \ + (*(s)->s_ops->lockop)((s), (a), (l), (atr), (op), (b), (p)) +#define SEGOP_GETPROT(s, a, l, p) (*(s)->s_ops->getprot)((s), (a), (l), (p)) +#define SEGOP_GETOFFSET(s, a) (*(s)->s_ops->getoffset)((s), (a)) +#define SEGOP_GETTYPE(s, a) (*(s)->s_ops->gettype)((s), (a)) +#define SEGOP_GETVP(s, a, vpp) (*(s)->s_ops->getvp)((s), (a), (vpp)) +#define SEGOP_ADVISE(s, a, l, b) (*(s)->s_ops->advise)((s), (a), (l), (b)) +#define SEGOP_DUMP(s) (*(s)->s_ops->dump)((s)) +#define SEGOP_PAGELOCK(s, a, l, p, t, rw) \ + (*(s)->s_ops->pagelock)((s), (a), (l), (p), (t), (rw)) +#define SEGOP_SETPAGESIZE(s, a, l, szc) \ + (*(s)->s_ops->setpagesize)((s), (a), (l), (szc)) +#define SEGOP_GETMEMID(s, a, mp) (*(s)->s_ops->getmemid)((s), (a), (mp)) +#define SEGOP_GETPOLICY(s, a) (*(s)->s_ops->getpolicy)((s), (a)) + +#define seg_page(seg, addr) \ + (((uintptr_t)((addr) - (seg)->s_base)) >> PAGESHIFT) + +#define seg_pages(seg) \ + (((uintptr_t)((seg)->s_size + PAGEOFFSET)) >> PAGESHIFT) + +#define IE_NOMEM -1 /* internal to seg layer */ +#define IE_RETRY -2 /* internal to seg layer */ +#define IE_REATTACH -3 /* internal to seg layer */ + +/* Delay/retry factors for seg_p_mem_config_pre_del */ +#define SEGP_PREDEL_DELAY_FACTOR 4 +/* + * As a workaround to being unable to purge the pagelock + * cache during a DR delete memory operation, we use + * a stall threshold that is twice the maximum seen + * during testing. This workaround will be removed + * when a suitable fix is found. 
+ */ +#define SEGP_STALL_SECONDS 25 +#define SEGP_STALL_THRESHOLD \ + (SEGP_STALL_SECONDS * SEGP_PREDEL_DELAY_FACTOR) + +#ifdef VMDEBUG + +uint_t seg_page(struct seg *, caddr_t); +uint_t seg_pages(struct seg *); + +#endif /* VMDEBUG */ + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_SEG_H */ diff --git a/usr/src/uts/common/vm/seg_dev.c b/usr/src/uts/common/vm/seg_dev.c new file mode 100644 index 0000000000..9b3733871f --- /dev/null +++ b/usr/src/uts/common/vm/seg_dev.c @@ -0,0 +1,4073 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * VM - segment of a mapped device. + * + * This segment driver is used when mapping character special devices. + */ + +#include <sys/types.h> +#include <sys/t_lock.h> +#include <sys/sysmacros.h> +#include <sys/vtrace.h> +#include <sys/systm.h> +#include <sys/vmsystm.h> +#include <sys/mman.h> +#include <sys/errno.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> +#include <sys/vnode.h> +#include <sys/proc.h> +#include <sys/conf.h> +#include <sys/debug.h> +#include <sys/ddidevmap.h> +#include <sys/lgrp.h> + +#include <vm/page.h> +#include <vm/hat.h> +#include <vm/as.h> +#include <vm/seg.h> +#include <vm/seg_dev.h> +#include <vm/seg_kp.h> +#include <vm/seg_kmem.h> +#include <vm/vpage.h> + +#include <sys/sunddi.h> +#include <sys/esunddi.h> +#include <sys/fs/snode.h> + +#if DEBUG +int segdev_debug; +#define DEBUGF(level, args) { if (segdev_debug >= (level)) cmn_err args; } +#else +#define DEBUGF(level, args) +#endif + +/* Default timeout for devmap context management */ +#define CTX_TIMEOUT_VALUE 0 + +#define HOLD_DHP_LOCK(dhp) if (dhp->dh_flags & DEVMAP_ALLOW_REMAP) \ + { mutex_enter(&dhp->dh_lock); } + +#define RELE_DHP_LOCK(dhp) if (dhp->dh_flags & DEVMAP_ALLOW_REMAP) \ + { mutex_exit(&dhp->dh_lock); } + +#define round_down_p2(a, s) ((a) & ~((s) - 1)) +#define round_up_p2(a, s) (((a) + (s) - 1) & ~((s) - 1)) + +/* + * VA_PA_ALIGNED checks to see if both VA and PA are on pgsize boundary + * VA_PA_PGSIZE_ALIGNED check to see if VA is aligned with PA w.r.t. 
pgsize + */ +#define VA_PA_ALIGNED(uvaddr, paddr, pgsize) \ + (((uvaddr | paddr) & (pgsize - 1)) == 0) +#define VA_PA_PGSIZE_ALIGNED(uvaddr, paddr, pgsize) \ + (((uvaddr ^ paddr) & (pgsize - 1)) == 0) + +#define vpgtob(n) ((n) * sizeof (struct vpage)) /* For brevity */ + +#define VTOCVP(vp) (VTOS(vp)->s_commonvp) /* we "know" it's an snode */ + +static struct devmap_ctx *devmapctx_list = NULL; +static struct devmap_softlock *devmap_slist = NULL; + +/* + * mutex, vnode and page for the page of zeros we use for the trash mappings. + * One trash page is allocated on the first ddi_umem_setup call that uses it + * XXX Eventually, we may want to combine this with what segnf does when all + * hat layers implement HAT_NOFAULT. + * + * The trash page is used when the backing store for a userland mapping is + * removed but the application semantics do not take kindly to a SIGBUS. + * In that scenario, the applications pages are mapped to some dummy page + * which returns garbage on read and writes go into a common place. + * (Perfect for NO_FAULT semantics) + * The device driver is responsible to communicating to the app with some + * other mechanism that such remapping has happened and the app should take + * corrective action. + * We can also use an anonymous memory page as there is no requirement to + * keep the page locked, however this complicates the fault code. RFE. + */ +static struct vnode trashvp; +static struct page *trashpp; + +/* Non-pageable kernel memory is allocated from the umem_np_arena. */ +static vmem_t *umem_np_arena; + +/* Set the cookie to a value we know will never be a valid umem_cookie */ +#define DEVMAP_DEVMEM_COOKIE ((ddi_umem_cookie_t)0x1) + +/* + * Macros to check if type of devmap handle + */ +#define cookie_is_devmem(c) \ + ((c) == (struct ddi_umem_cookie *)DEVMAP_DEVMEM_COOKIE) + +#define cookie_is_pmem(c) \ + ((c) == (struct ddi_umem_cookie *)DEVMAP_PMEM_COOKIE) + +#define cookie_is_kpmem(c) (!cookie_is_devmem(c) && !cookie_is_pmem(c) &&\ + ((c)->type == KMEM_PAGEABLE)) + +#define dhp_is_devmem(dhp) \ + (cookie_is_devmem((struct ddi_umem_cookie *)((dhp)->dh_cookie))) + +#define dhp_is_pmem(dhp) \ + (cookie_is_pmem((struct ddi_umem_cookie *)((dhp)->dh_cookie))) + +#define dhp_is_kpmem(dhp) \ + (cookie_is_kpmem((struct ddi_umem_cookie *)((dhp)->dh_cookie))) + +/* + * Private seg op routines. 
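 *
 * These routines are wired into the segdev_ops vector declared just below;
 * the generic segment layer never calls them directly, it always dispatches
 * through the s_ops pointer (see the SEGOP_* macros in seg.h). The shape of
 * that dispatch, reduced to a stand-alone user-level sketch (illustration
 * only, hypothetical names and a single op):
 *
 *     #include <stdio.h>
 *
 *     struct xseg;
 *     struct xseg_ops { int (*fault)(struct xseg *, unsigned long); };
 *     struct xseg { struct xseg_ops *s_ops; const char *name; };
 *
 *     #define XSEGOP_FAULT(s, a)  ((*(s)->s_ops->fault)((s), (a)))
 *
 *     static int
 *     xsegdev_fault(struct xseg *seg, unsigned long addr)
 *     {
 *         printf("%s: fault at %#lx\n", seg->name, addr);
 *         return (0);
 *     }
 *
 *     static struct xseg_ops xsegdev_ops = { xsegdev_fault };
 *
 *     int
 *     main(void)
 *     {
 *         struct xseg seg = { &xsegdev_ops, "xsegdev" };
 *
 *         return (XSEGOP_FAULT(&seg, 0x1000UL));
 *     }
 *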
+ */ +static int segdev_dup(struct seg *, struct seg *); +static int segdev_unmap(struct seg *, caddr_t, size_t); +static void segdev_free(struct seg *); +static faultcode_t segdev_fault(struct hat *, struct seg *, caddr_t, size_t, + enum fault_type, enum seg_rw); +static faultcode_t segdev_faulta(struct seg *, caddr_t); +static int segdev_setprot(struct seg *, caddr_t, size_t, uint_t); +static int segdev_checkprot(struct seg *, caddr_t, size_t, uint_t); +static void segdev_badop(void); +static int segdev_sync(struct seg *, caddr_t, size_t, int, uint_t); +static size_t segdev_incore(struct seg *, caddr_t, size_t, char *); +static int segdev_lockop(struct seg *, caddr_t, size_t, int, int, + ulong_t *, size_t); +static int segdev_getprot(struct seg *, caddr_t, size_t, uint_t *); +static u_offset_t segdev_getoffset(struct seg *, caddr_t); +static int segdev_gettype(struct seg *, caddr_t); +static int segdev_getvp(struct seg *, caddr_t, struct vnode **); +static int segdev_advise(struct seg *, caddr_t, size_t, uint_t); +static void segdev_dump(struct seg *); +static int segdev_pagelock(struct seg *, caddr_t, size_t, + struct page ***, enum lock_type, enum seg_rw); +static int segdev_setpagesize(struct seg *, caddr_t, size_t, uint_t); +static int segdev_getmemid(struct seg *, caddr_t, memid_t *); +static lgrp_mem_policy_info_t *segdev_getpolicy(struct seg *, caddr_t); + +/* + * XXX this struct is used by rootnex_map_fault to identify + * the segment it has been passed. So if you make it + * "static" you'll need to fix rootnex_map_fault. + */ +struct seg_ops segdev_ops = { + segdev_dup, + segdev_unmap, + segdev_free, + segdev_fault, + segdev_faulta, + segdev_setprot, + segdev_checkprot, + (int (*)())segdev_badop, /* kluster */ + (size_t (*)(struct seg *))NULL, /* swapout */ + segdev_sync, /* sync */ + segdev_incore, + segdev_lockop, /* lockop */ + segdev_getprot, + segdev_getoffset, + segdev_gettype, + segdev_getvp, + segdev_advise, + segdev_dump, + segdev_pagelock, + segdev_setpagesize, + segdev_getmemid, + segdev_getpolicy, +}; + +/* + * Private segdev support routines + */ +static struct segdev_data *sdp_alloc(void); + +static void segdev_softunlock(struct hat *, struct seg *, caddr_t, + size_t, enum seg_rw); + +static faultcode_t segdev_faultpage(struct hat *, struct seg *, caddr_t, + struct vpage *, enum fault_type, enum seg_rw, devmap_handle_t *); + +static faultcode_t segdev_faultpages(struct hat *, struct seg *, caddr_t, + size_t, enum fault_type, enum seg_rw, devmap_handle_t *); + +static struct devmap_ctx *devmap_ctxinit(dev_t, ulong_t); +static struct devmap_softlock *devmap_softlock_init(dev_t, ulong_t); +static void devmap_softlock_rele(devmap_handle_t *); +static void devmap_ctx_rele(devmap_handle_t *); + +static void devmap_ctxto(void *); + +static devmap_handle_t *devmap_find_handle(devmap_handle_t *dhp_head, + caddr_t addr); + +static ulong_t devmap_roundup(devmap_handle_t *dhp, ulong_t offset, size_t len, + ulong_t *opfn, ulong_t *pagesize); + +static void free_devmap_handle(devmap_handle_t *dhp); + +static int devmap_handle_dup(devmap_handle_t *dhp, devmap_handle_t **new_dhp, + struct seg *newseg); + +static devmap_handle_t *devmap_handle_unmap(devmap_handle_t *dhp); + +static void devmap_handle_unmap_head(devmap_handle_t *dhp, size_t len); + +static void devmap_handle_unmap_tail(devmap_handle_t *dhp, caddr_t addr); + +static int devmap_device(devmap_handle_t *dhp, struct as *as, caddr_t *addr, + offset_t off, size_t len, uint_t flags); + +static void 
devmap_get_large_pgsize(devmap_handle_t *dhp, size_t len, + caddr_t addr, size_t *llen, caddr_t *laddr); + +static void devmap_handle_reduce_len(devmap_handle_t *dhp, size_t len); + +static void *devmap_alloc_pages(vmem_t *vmp, size_t size, int vmflag); +static void devmap_free_pages(vmem_t *vmp, void *inaddr, size_t size); + +static void *devmap_umem_alloc_np(size_t size, size_t flags); +static void devmap_umem_free_np(void *addr, size_t size); + +/* + * routines to lock and unlock underlying segkp segment for + * KMEM_PAGEABLE type cookies. + */ +static faultcode_t acquire_kpmem_lock(struct ddi_umem_cookie *, size_t); +static void release_kpmem_lock(struct ddi_umem_cookie *, size_t); + +/* + * Routines to synchronize F_SOFTLOCK and F_INVAL faults for + * drivers with devmap_access callbacks + */ +static int devmap_softlock_enter(struct devmap_softlock *, size_t, + enum fault_type); +static void devmap_softlock_exit(struct devmap_softlock *, size_t, + enum fault_type); + +static kmutex_t devmapctx_lock; + +static kmutex_t devmap_slock; + +/* + * Initialize the thread callbacks and thread private data. + */ +static struct devmap_ctx * +devmap_ctxinit(dev_t dev, ulong_t id) +{ + struct devmap_ctx *devctx; + struct devmap_ctx *tmp; + dev_info_t *dip; + + tmp = kmem_zalloc(sizeof (struct devmap_ctx), KM_SLEEP); + + mutex_enter(&devmapctx_lock); + + dip = e_ddi_hold_devi_by_dev(dev, 0); + ASSERT(dip != NULL); + ddi_release_devi(dip); + + for (devctx = devmapctx_list; devctx != NULL; devctx = devctx->next) + if ((devctx->dip == dip) && (devctx->id == id)) + break; + + if (devctx == NULL) { + devctx = tmp; + devctx->dip = dip; + devctx->id = id; + mutex_init(&devctx->lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&devctx->cv, NULL, CV_DEFAULT, NULL); + devctx->next = devmapctx_list; + devmapctx_list = devctx; + } else + kmem_free(tmp, sizeof (struct devmap_ctx)); + + mutex_enter(&devctx->lock); + devctx->refcnt++; + mutex_exit(&devctx->lock); + mutex_exit(&devmapctx_lock); + + return (devctx); +} + +/* + * Timeout callback called if a CPU has not given up the device context + * within dhp->dh_timeout_length ticks + */ +static void +devmap_ctxto(void *data) +{ + struct devmap_ctx *devctx = data; + + TRACE_1(TR_FAC_DEVMAP, TR_DEVMAP_CTXTO, + "devmap_ctxto:timeout expired, devctx=%p", (void *)devctx); + mutex_enter(&devctx->lock); + /* + * Set oncpu = 0 so the next mapping trying to get the device context + * can. + */ + devctx->oncpu = 0; + devctx->timeout = 0; + cv_signal(&devctx->cv); + mutex_exit(&devctx->lock); +} + +/* + * Create a device segment. + */ +int +segdev_create(struct seg *seg, void *argsp) +{ + struct segdev_data *sdp; + struct segdev_crargs *a = (struct segdev_crargs *)argsp; + devmap_handle_t *dhp = (devmap_handle_t *)a->devmap_data; + int error; + + /* + * Since the address space is "write" locked, we + * don't need the segment lock to protect "segdev" data. 
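 *
 * The same idea in a user-level sketch (illustration only, hypothetical
 * names; a flag stands in for AS_WRITE_HELD): when every mutation of
 * per-segment data happens with the containing structure's rwlock held for
 * writing, the per-segment fields need no lock of their own.
 *
 *     #include <pthread.h>
 *     #include <assert.h>
 *
 *     struct xas {
 *         pthread_rwlock_t a_lock;
 *         int a_wrheld;                // set only while write-locked
 *     };
 *     struct xsegdata { int prot; int maxprot; };
 *
 *     static void
 *     xseg_setup(struct xas *as, struct xsegdata *sdp, int prot)
 *     {
 *         assert(as->a_wrheld);        // caller holds "as" as writer
 *         sdp->prot = prot;            // no per-segment lock needed
 *         sdp->maxprot = prot;
 *     }
 *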
+ */ + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); + + hat_map(seg->s_as->a_hat, seg->s_base, seg->s_size, HAT_MAP); + + sdp = sdp_alloc(); + + sdp->mapfunc = a->mapfunc; + sdp->offset = a->offset; + sdp->prot = a->prot; + sdp->maxprot = a->maxprot; + sdp->type = a->type; + sdp->pageprot = 0; + sdp->softlockcnt = 0; + sdp->vpage = NULL; + + if (sdp->mapfunc == NULL) + sdp->devmap_data = dhp; + else + sdp->devmap_data = dhp = NULL; + + sdp->hat_flags = a->hat_flags; + sdp->hat_attr = a->hat_attr; + + /* + * Currently, hat_flags supports only HAT_LOAD_NOCONSIST + */ + ASSERT(!(sdp->hat_flags & ~HAT_LOAD_NOCONSIST)); + + /* + * Hold shadow vnode -- segdev only deals with + * character (VCHR) devices. We use the common + * vp to hang pages on. + */ + sdp->vp = specfind(a->dev, VCHR); + ASSERT(sdp->vp != NULL); + + seg->s_ops = &segdev_ops; + seg->s_data = sdp; + + while (dhp != NULL) { + dhp->dh_seg = seg; + dhp = dhp->dh_next; + } + + /* + * Inform the vnode of the new mapping. + */ + /* + * It is ok to use pass sdp->maxprot to ADDMAP rather than to use + * dhp specific maxprot because spec_addmap does not use maxprot. + */ + error = VOP_ADDMAP(VTOCVP(sdp->vp), sdp->offset, + seg->s_as, seg->s_base, seg->s_size, + sdp->prot, sdp->maxprot, sdp->type, CRED()); + + if (error != 0) { + sdp->devmap_data = NULL; + hat_unload(seg->s_as->a_hat, seg->s_base, seg->s_size, + HAT_UNLOAD_UNMAP); + } + + return (error); +} + +static struct segdev_data * +sdp_alloc(void) +{ + struct segdev_data *sdp; + + sdp = kmem_zalloc(sizeof (struct segdev_data), KM_SLEEP); + mutex_init(&sdp->lock, NULL, MUTEX_DEFAULT, NULL); + + return (sdp); +} + +/* + * Duplicate seg and return new segment in newseg. + */ +static int +segdev_dup(struct seg *seg, struct seg *newseg) +{ + struct segdev_data *sdp = (struct segdev_data *)seg->s_data; + struct segdev_data *newsdp; + devmap_handle_t *dhp = (devmap_handle_t *)sdp->devmap_data; + size_t npages; + int ret; + + TRACE_2(TR_FAC_DEVMAP, TR_DEVMAP_DUP, + "segdev_dup:start dhp=%p, seg=%p", (void *)dhp, (void *)seg); + + DEBUGF(3, (CE_CONT, "segdev_dup: dhp %p seg %p\n", + (void *)dhp, (void *)seg)); + + /* + * Since the address space is "write" locked, we + * don't need the segment lock to protect "segdev" data. + */ + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); + + newsdp = sdp_alloc(); + + newseg->s_ops = seg->s_ops; + newseg->s_data = (void *)newsdp; + + VN_HOLD(sdp->vp); + newsdp->vp = sdp->vp; + newsdp->mapfunc = sdp->mapfunc; + newsdp->offset = sdp->offset; + newsdp->pageprot = sdp->pageprot; + newsdp->prot = sdp->prot; + newsdp->maxprot = sdp->maxprot; + newsdp->type = sdp->type; + newsdp->hat_attr = sdp->hat_attr; + newsdp->hat_flags = sdp->hat_flags; + newsdp->softlockcnt = 0; + + /* + * Initialize per page data if the segment we are + * dup'ing has per page information. 
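 *
 * The vpage copy that follows is the usual "duplicate an optional per-page
 * array" step. As a stand-alone sketch (illustration only, user-level
 * allocation in place of kmem_zalloc):
 *
 *     #include <stdlib.h>
 *     #include <string.h>
 *
 *     struct xvpage { unsigned char prot; };
 *
 *     // returns NULL when the source segment had no per-page data
 *     static struct xvpage *
 *     dup_vpage(const struct xvpage *src, size_t npages)
 *     {
 *         struct xvpage *dst;
 *
 *         if (src == NULL)
 *             return (NULL);
 *         dst = calloc(npages, sizeof (struct xvpage));
 *         if (dst != NULL)
 *             memcpy(dst, src, npages * sizeof (struct xvpage));
 *         return (dst);
 *     }
 *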
+ */ + npages = seg_pages(newseg); + + if (sdp->vpage != NULL) { + size_t nbytes = vpgtob(npages); + + newsdp->vpage = kmem_zalloc(nbytes, KM_SLEEP); + bcopy(sdp->vpage, newsdp->vpage, nbytes); + } else + newsdp->vpage = NULL; + + /* + * duplicate devmap handles + */ + if (dhp != NULL) { + ret = devmap_handle_dup(dhp, + (devmap_handle_t **)&newsdp->devmap_data, newseg); + if (ret != 0) { + TRACE_3(TR_FAC_DEVMAP, TR_DEVMAP_DUP_CK1, + "segdev_dup:ret1 ret=%x, dhp=%p seg=%p", + ret, (void *)dhp, (void *)seg); + DEBUGF(1, (CE_CONT, + "segdev_dup: ret %x dhp %p seg %p\n", + ret, (void *)dhp, (void *)seg)); + return (ret); + } + } + + /* + * Inform the common vnode of the new mapping. + */ + return (VOP_ADDMAP(VTOCVP(newsdp->vp), + newsdp->offset, newseg->s_as, + newseg->s_base, newseg->s_size, newsdp->prot, + newsdp->maxprot, sdp->type, CRED())); +} + +/* + * duplicate devmap handles + */ +static int +devmap_handle_dup(devmap_handle_t *dhp, devmap_handle_t **new_dhp, + struct seg *newseg) +{ + devmap_handle_t *newdhp_save = NULL; + devmap_handle_t *newdhp = NULL; + struct devmap_callback_ctl *callbackops; + + while (dhp != NULL) { + newdhp = kmem_alloc(sizeof (devmap_handle_t), KM_SLEEP); + + /* Need to lock the original dhp while copying if REMAP */ + HOLD_DHP_LOCK(dhp); + bcopy(dhp, newdhp, sizeof (devmap_handle_t)); + RELE_DHP_LOCK(dhp); + newdhp->dh_seg = newseg; + newdhp->dh_next = NULL; + if (newdhp_save != NULL) + newdhp_save->dh_next = newdhp; + else + *new_dhp = newdhp; + newdhp_save = newdhp; + + callbackops = &newdhp->dh_callbackops; + + if (dhp->dh_softlock != NULL) + newdhp->dh_softlock = devmap_softlock_init( + newdhp->dh_dev, + (ulong_t)callbackops->devmap_access); + if (dhp->dh_ctx != NULL) + newdhp->dh_ctx = devmap_ctxinit(newdhp->dh_dev, + (ulong_t)callbackops->devmap_access); + + /* + * Initialize dh_lock if we want to do remap. + */ + if (newdhp->dh_flags & DEVMAP_ALLOW_REMAP) { + mutex_init(&newdhp->dh_lock, NULL, MUTEX_DEFAULT, NULL); + newdhp->dh_flags |= DEVMAP_LOCK_INITED; + } + + if (callbackops->devmap_dup != NULL) { + int ret; + + /* + * Call the dup callback so that the driver can + * duplicate its private data. + */ + ret = (*callbackops->devmap_dup)(dhp, dhp->dh_pvtp, + (devmap_cookie_t *)newdhp, &newdhp->dh_pvtp); + + if (ret != 0) { + /* + * We want to free up this segment as the driver + * has indicated that we can't dup it. But we + * don't want to call the drivers, devmap_unmap, + * callback function as the driver does not + * think this segment exists. The caller of + * devmap_dup will call seg_free on newseg + * as it was the caller that allocated the + * segment. + */ + DEBUGF(1, (CE_CONT, "devmap_handle_dup ERROR: " + "newdhp %p dhp %p\n", (void *)newdhp, + (void *)dhp)); + callbackops->devmap_unmap = NULL; + return (ret); + } + } + + dhp = dhp->dh_next; + } + + return (0); +} + +/* + * Split a segment at addr for length len. 
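 *
 * Ignoring the driver callbacks, the address arithmetic below has four
 * outcomes: the unmap covers the whole segment, trims its head, trims its
 * tail, or splits it in two. A stand-alone sketch of just that decision
 * (illustration only, plain integer types in place of caddr_t):
 *
 *     #include <stdio.h>
 *     #include <stdint.h>
 *     #include <stddef.h>
 *
 *     static void
 *     split_ranges(uintptr_t base, size_t size, uintptr_t addr, size_t len)
 *     {
 *         uintptr_t end = base + size, uend = addr + len;
 *
 *         if (addr == base && uend == end)
 *             printf("whole segment goes away\n");
 *         else if (addr == base)
 *             printf("head trim: [%#lx, %#lx) remains\n",
 *                 (unsigned long)uend, (unsigned long)end);
 *         else if (uend == end)
 *             printf("tail trim: [%#lx, %#lx) remains\n",
 *                 (unsigned long)base, (unsigned long)addr);
 *         else
 *             printf("split: [%#lx, %#lx) and [%#lx, %#lx)\n",
 *                 (unsigned long)base, (unsigned long)addr,
 *                 (unsigned long)uend, (unsigned long)end);
 *     }
 *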
+ */ +/*ARGSUSED*/ +static int +segdev_unmap(struct seg *seg, caddr_t addr, size_t len) +{ + register struct segdev_data *sdp = (struct segdev_data *)seg->s_data; + register struct segdev_data *nsdp; + register struct seg *nseg; + register size_t opages; /* old segment size in pages */ + register size_t npages; /* new segment size in pages */ + register size_t dpages; /* pages being deleted (unmapped) */ + register size_t nbytes; + devmap_handle_t *dhp = (devmap_handle_t *)sdp->devmap_data; + devmap_handle_t *dhpp; + devmap_handle_t *newdhp; + struct devmap_callback_ctl *callbackops; + caddr_t nbase; + offset_t off; + ulong_t nsize; + size_t mlen, sz; + + TRACE_4(TR_FAC_DEVMAP, TR_DEVMAP_UNMAP, + "segdev_unmap:start dhp=%p, seg=%p addr=%p len=%lx", + (void *)dhp, (void *)seg, (void *)addr, len); + + DEBUGF(3, (CE_CONT, "segdev_unmap: dhp %p seg %p addr %p len %lx\n", + (void *)dhp, (void *)seg, (void *)addr, len)); + + /* + * Since the address space is "write" locked, we + * don't need the segment lock to protect "segdev" data. + */ + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); + + if ((sz = sdp->softlockcnt) > 0) { + /* + * Fail the unmap if pages are SOFTLOCKed through this mapping. + * softlockcnt is protected from change by the as write lock. + */ + TRACE_1(TR_FAC_DEVMAP, TR_DEVMAP_UNMAP_CK1, + "segdev_unmap:error softlockcnt = %ld", sz); + DEBUGF(1, (CE_CONT, "segdev_unmap: softlockcnt %ld\n", sz)); + return (EAGAIN); + } + + /* + * Check for bad sizes + */ + if (addr < seg->s_base || addr + len > seg->s_base + seg->s_size || + (len & PAGEOFFSET) || ((uintptr_t)addr & PAGEOFFSET)) + panic("segdev_unmap"); + + if (dhp != NULL) { + devmap_handle_t *tdhp; + /* + * If large page size was used in hat_devload(), + * the same page size must be used in hat_unload(). + */ + dhpp = tdhp = devmap_find_handle(dhp, addr); + while (tdhp != NULL) { + if (tdhp->dh_flags & DEVMAP_FLAG_LARGE) { + break; + } + tdhp = tdhp->dh_next; + } + if (tdhp != NULL) { /* found a dhp using large pages */ + size_t slen = len; + size_t mlen; + size_t soff; + + soff = (ulong_t)(addr - dhpp->dh_uvaddr); + while (slen != 0) { + mlen = MIN(slen, (dhpp->dh_len - soff)); + hat_unload(seg->s_as->a_hat, dhpp->dh_uvaddr, + dhpp->dh_len, HAT_UNLOAD_UNMAP); + dhpp = dhpp->dh_next; + ASSERT(slen >= mlen); + slen -= mlen; + soff = 0; + } + } else + hat_unload(seg->s_as->a_hat, addr, len, + HAT_UNLOAD_UNMAP); + } else { + /* + * Unload any hardware translations in the range + * to be taken out. + */ + hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD_UNMAP); + } + + /* + * get the user offset which will used in the driver callbacks + */ + off = sdp->offset + (offset_t)(addr - seg->s_base); + + /* + * Inform the vnode of the unmapping. 
+ */ + ASSERT(sdp->vp != NULL); + (void) VOP_DELMAP(VTOCVP(sdp->vp), off, seg->s_as, addr, len, + sdp->prot, sdp->maxprot, sdp->type, CRED()); + + /* + * Check for entire segment + */ + if (addr == seg->s_base && len == seg->s_size) { + seg_free(seg); + return (0); + } + + opages = seg_pages(seg); + dpages = btop(len); + npages = opages - dpages; + + /* + * Check for beginning of segment + */ + if (addr == seg->s_base) { + if (sdp->vpage != NULL) { + register struct vpage *ovpage; + + ovpage = sdp->vpage; /* keep pointer to vpage */ + + nbytes = vpgtob(npages); + sdp->vpage = kmem_alloc(nbytes, KM_SLEEP); + bcopy(&ovpage[dpages], sdp->vpage, nbytes); + + /* free up old vpage */ + kmem_free(ovpage, vpgtob(opages)); + } + + /* + * free devmap handles from the beginning of the mapping. + */ + if (dhp != NULL) + devmap_handle_unmap_head(dhp, len); + + sdp->offset += (offset_t)len; + + seg->s_base += len; + seg->s_size -= len; + + return (0); + } + + /* + * Check for end of segment + */ + if (addr + len == seg->s_base + seg->s_size) { + if (sdp->vpage != NULL) { + register struct vpage *ovpage; + + ovpage = sdp->vpage; /* keep pointer to vpage */ + + nbytes = vpgtob(npages); + sdp->vpage = kmem_alloc(nbytes, KM_SLEEP); + bcopy(ovpage, sdp->vpage, nbytes); + + /* free up old vpage */ + kmem_free(ovpage, vpgtob(opages)); + } + seg->s_size -= len; + + /* + * free devmap handles from addr to the end of the mapping. + */ + if (dhp != NULL) + devmap_handle_unmap_tail(dhp, addr); + + return (0); + } + + /* + * The section to go is in the middle of the segment, + * have to make it into two segments. nseg is made for + * the high end while seg is cut down at the low end. + */ + nbase = addr + len; /* new seg base */ + nsize = (seg->s_base + seg->s_size) - nbase; /* new seg size */ + seg->s_size = addr - seg->s_base; /* shrink old seg */ + nseg = seg_alloc(seg->s_as, nbase, nsize); + if (nseg == NULL) + panic("segdev_unmap seg_alloc"); + + TRACE_2(TR_FAC_DEVMAP, TR_DEVMAP_UNMAP_CK2, + "segdev_unmap: seg=%p nseg=%p", (void *)seg, (void *)nseg); + DEBUGF(3, (CE_CONT, "segdev_unmap: segdev_dup seg %p nseg %p\n", + (void *)seg, (void *)nseg)); + nsdp = sdp_alloc(); + + nseg->s_ops = seg->s_ops; + nseg->s_data = (void *)nsdp; + + VN_HOLD(sdp->vp); + nsdp->mapfunc = sdp->mapfunc; + nsdp->offset = sdp->offset + (offset_t)(nseg->s_base - seg->s_base); + nsdp->vp = sdp->vp; + nsdp->pageprot = sdp->pageprot; + nsdp->prot = sdp->prot; + nsdp->maxprot = sdp->maxprot; + nsdp->type = sdp->type; + nsdp->hat_attr = sdp->hat_attr; + nsdp->hat_flags = sdp->hat_flags; + nsdp->softlockcnt = 0; + + /* + * Initialize per page data if the segment we are + * dup'ing has per page information. + */ + if (sdp->vpage != NULL) { + /* need to split vpage into two arrays */ + register size_t nnbytes; + register size_t nnpages; + register struct vpage *ovpage; + + ovpage = sdp->vpage; /* keep pointer to vpage */ + + npages = seg_pages(seg); /* seg has shrunk */ + nbytes = vpgtob(npages); + nnpages = seg_pages(nseg); + nnbytes = vpgtob(nnpages); + + sdp->vpage = kmem_alloc(nbytes, KM_SLEEP); + bcopy(ovpage, sdp->vpage, nbytes); + + nsdp->vpage = kmem_alloc(nnbytes, KM_SLEEP); + bcopy(&ovpage[npages + dpages], nsdp->vpage, nnbytes); + + /* free up old vpage */ + kmem_free(ovpage, vpgtob(opages)); + } else + nsdp->vpage = NULL; + + /* + * unmap dhps. 
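 *
 * Each handle is an interval [dh_uvaddr, dh_uvaddr + dh_len) and the loop
 * that follows is an interval comparison against the unmap range
 * [addr, addr + len): a handle may lie entirely before or after it,
 * enclose it, be enclosed by it, or overlap only at its head or tail.
 * A stand-alone classifier (illustration only) makes the cases easier
 * to see:
 *
 *     #include <stdint.h>
 *     #include <stddef.h>
 *
 *     static const char *
 *     classify(uintptr_t h, size_t hlen, uintptr_t u, size_t ulen)
 *     {
 *         uintptr_t hend = h + hlen, uend = u + ulen;
 *
 *         if (hend <= u)
 *             return ("handle entirely before unmap range");
 *         if (h >= uend)
 *             return ("handle entirely after unmap range");
 *         if (u <= h && uend >= hend)
 *             return ("handle enclosed by unmap range: free it");
 *         if (u > h && uend < hend)
 *             return ("unmap range enclosed: split handle in two");
 *         if (u > h)
 *             return ("overlap at handle tail: shorten it");
 *         return ("overlap at handle head: advance it");
 *     }
 *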
+ */ + if (dhp == NULL) { + nsdp->devmap_data = NULL; + return (0); + } + while (dhp != NULL) { + callbackops = &dhp->dh_callbackops; + TRACE_2(TR_FAC_DEVMAP, TR_DEVMAP_UNMAP_CK3, + "segdev_unmap: dhp=%p addr=%p", dhp, addr); + DEBUGF(3, (CE_CONT, "unmap: dhp %p addr %p uvaddr %p len %lx\n", + (void *)dhp, (void *)addr, + (void *)dhp->dh_uvaddr, dhp->dh_len)); + + if (addr == (dhp->dh_uvaddr + dhp->dh_len)) { + dhpp = dhp->dh_next; + dhp->dh_next = NULL; + dhp = dhpp; + } else if (addr > (dhp->dh_uvaddr + dhp->dh_len)) { + dhp = dhp->dh_next; + } else if (addr > dhp->dh_uvaddr && + (addr + len) < (dhp->dh_uvaddr + dhp->dh_len)) { + /* + * <addr, addr+len> is enclosed by dhp. + * create a newdhp that begins at addr+len and + * ends at dhp->dh_uvaddr+dhp->dh_len. + */ + newdhp = kmem_alloc(sizeof (devmap_handle_t), KM_SLEEP); + HOLD_DHP_LOCK(dhp); + bcopy(dhp, newdhp, sizeof (devmap_handle_t)); + RELE_DHP_LOCK(dhp); + newdhp->dh_seg = nseg; + newdhp->dh_next = dhp->dh_next; + if (dhp->dh_softlock != NULL) + newdhp->dh_softlock = devmap_softlock_init( + newdhp->dh_dev, + (ulong_t)callbackops->devmap_access); + if (dhp->dh_ctx != NULL) + newdhp->dh_ctx = devmap_ctxinit(newdhp->dh_dev, + (ulong_t)callbackops->devmap_access); + if (newdhp->dh_flags & DEVMAP_LOCK_INITED) { + mutex_init(&newdhp->dh_lock, + NULL, MUTEX_DEFAULT, NULL); + } + if (callbackops->devmap_unmap != NULL) + (*callbackops->devmap_unmap)(dhp, dhp->dh_pvtp, + off, len, dhp, &dhp->dh_pvtp, + newdhp, &newdhp->dh_pvtp); + mlen = len + (addr - dhp->dh_uvaddr); + devmap_handle_reduce_len(newdhp, mlen); + nsdp->devmap_data = newdhp; + /* XX Changing len should recalculate LARGE flag */ + dhp->dh_len = addr - dhp->dh_uvaddr; + dhpp = dhp->dh_next; + dhp->dh_next = NULL; + dhp = dhpp; + } else if ((addr > dhp->dh_uvaddr) && + ((addr + len) >= (dhp->dh_uvaddr + dhp->dh_len))) { + mlen = dhp->dh_len + dhp->dh_uvaddr - addr; + /* + * <addr, addr+len> spans over dhps. + */ + if (callbackops->devmap_unmap != NULL) + (*callbackops->devmap_unmap)(dhp, dhp->dh_pvtp, + off, mlen, (devmap_cookie_t *)dhp, + &dhp->dh_pvtp, NULL, NULL); + /* XX Changing len should recalculate LARGE flag */ + dhp->dh_len = addr - dhp->dh_uvaddr; + dhpp = dhp->dh_next; + dhp->dh_next = NULL; + dhp = dhpp; + nsdp->devmap_data = dhp; + } else if ((addr + len) >= (dhp->dh_uvaddr + dhp->dh_len)) { + /* + * dhp is enclosed by <addr, addr+len>. + */ + dhp->dh_seg = nseg; + nsdp->devmap_data = dhp; + dhp = devmap_handle_unmap(dhp); + nsdp->devmap_data = dhp; /* XX redundant? */ + } else if (((addr + len) > dhp->dh_uvaddr) && + ((addr + len) < (dhp->dh_uvaddr + dhp->dh_len))) { + mlen = addr + len - dhp->dh_uvaddr; + if (callbackops->devmap_unmap != NULL) + (*callbackops->devmap_unmap)(dhp, dhp->dh_pvtp, + dhp->dh_uoff, mlen, NULL, + NULL, dhp, &dhp->dh_pvtp); + devmap_handle_reduce_len(dhp, mlen); + nsdp->devmap_data = dhp; + dhp->dh_seg = nseg; + dhp = dhp->dh_next; + } else { + dhp->dh_seg = nseg; + dhp = dhp->dh_next; + } + } + return (0); +} + +/* + * Utility function handles reducing the length of a devmap handle during unmap + * Note that is only used for unmapping the front portion of the handler, + * i.e., we are bumping up the offset/pfn etc up by len + * Do not use if reducing length at the tail. 
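+ * For example, trimming "len" bytes from the front advances
+ * dh_uvaddr, dh_uoff and dh_roff by len and shrinks dh_len by len;
+ * for device memory dh_pfn advances by btop(len), for umem cookies
+ * dh_cvaddr advances, and for pmem cookies only the offsets change.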
+ */ +static void +devmap_handle_reduce_len(devmap_handle_t *dhp, size_t len) +{ + struct ddi_umem_cookie *cp; + struct devmap_pmem_cookie *pcp; + /* + * adjust devmap handle fields + */ + ASSERT(len < dhp->dh_len); + + /* Make sure only page-aligned changes are done */ + ASSERT((len & PAGEOFFSET) == 0); + + dhp->dh_len -= len; + dhp->dh_uoff += (offset_t)len; + dhp->dh_roff += (offset_t)len; + dhp->dh_uvaddr += len; + /* Need to grab dhp lock if REMAP */ + HOLD_DHP_LOCK(dhp); + cp = dhp->dh_cookie; + if (!(dhp->dh_flags & DEVMAP_MAPPING_INVALID)) { + if (cookie_is_devmem(cp)) { + dhp->dh_pfn += btop(len); + } else if (cookie_is_pmem(cp)) { + pcp = (struct devmap_pmem_cookie *)dhp->dh_pcookie; + ASSERT((dhp->dh_roff & PAGEOFFSET) == 0 && + dhp->dh_roff < ptob(pcp->dp_npages)); + } else { + ASSERT(dhp->dh_roff < cp->size); + ASSERT(dhp->dh_cvaddr >= cp->cvaddr && + dhp->dh_cvaddr < (cp->cvaddr + cp->size)); + ASSERT((dhp->dh_cvaddr + len) <= + (cp->cvaddr + cp->size)); + + dhp->dh_cvaddr += len; + } + } + /* XXX - Should recalculate the DEVMAP_FLAG_LARGE after changes */ + RELE_DHP_LOCK(dhp); +} + +/* + * Free devmap handle, dhp. + * Return the next devmap handle on the linked list. + */ +static devmap_handle_t * +devmap_handle_unmap(devmap_handle_t *dhp) +{ + struct devmap_callback_ctl *callbackops = &dhp->dh_callbackops; + struct segdev_data *sdp = (struct segdev_data *)dhp->dh_seg->s_data; + devmap_handle_t *dhpp = (devmap_handle_t *)sdp->devmap_data; + + ASSERT(dhp != NULL); + + /* + * before we free up dhp, call the driver's devmap_unmap entry point + * to free resources allocated for this dhp. + */ + if (callbackops->devmap_unmap != NULL) { + (*callbackops->devmap_unmap)(dhp, dhp->dh_pvtp, dhp->dh_uoff, + dhp->dh_len, NULL, NULL, NULL, NULL); + } + + if (dhpp == dhp) { /* releasing first dhp, change sdp data */ + sdp->devmap_data = dhp->dh_next; + } else { + while (dhpp->dh_next != dhp) { + dhpp = dhpp->dh_next; + } + dhpp->dh_next = dhp->dh_next; + } + dhpp = dhp->dh_next; /* return value is next dhp in chain */ + + if (dhp->dh_softlock != NULL) + devmap_softlock_rele(dhp); + + if (dhp->dh_ctx != NULL) + devmap_ctx_rele(dhp); + + if (dhp->dh_flags & DEVMAP_LOCK_INITED) { + mutex_destroy(&dhp->dh_lock); + } + kmem_free(dhp, sizeof (devmap_handle_t)); + + return (dhpp); +} + +/* + * Free complete devmap handles from dhp for len bytes + * dhp can be either the first handle or a subsequent handle + */ +static void +devmap_handle_unmap_head(devmap_handle_t *dhp, size_t len) +{ + struct devmap_callback_ctl *callbackops; + + /* + * free the devmap handles covered by len. + */ + while (len >= dhp->dh_len) { + len -= dhp->dh_len; + dhp = devmap_handle_unmap(dhp); + } + if (len != 0) { /* partial unmap at head of first remaining dhp */ + callbackops = &dhp->dh_callbackops; + + /* + * Call the unmap callback so the drivers can make + * adjustment on its private data. 
+ */ + if (callbackops->devmap_unmap != NULL) + (*callbackops->devmap_unmap)(dhp, dhp->dh_pvtp, + dhp->dh_uoff, len, NULL, NULL, dhp, &dhp->dh_pvtp); + devmap_handle_reduce_len(dhp, len); + } +} + +/* + * Free devmap handles to truncate the mapping after addr + * RFE: Simpler to pass in dhp pointing at correct dhp (avoid find again) + * Also could then use the routine in middle unmap case too + */ +static void +devmap_handle_unmap_tail(devmap_handle_t *dhp, caddr_t addr) +{ + register struct seg *seg = dhp->dh_seg; + register struct segdev_data *sdp = (struct segdev_data *)seg->s_data; + register devmap_handle_t *dhph = (devmap_handle_t *)sdp->devmap_data; + struct devmap_callback_ctl *callbackops; + register devmap_handle_t *dhpp; + size_t maplen; + ulong_t off; + size_t len; + + maplen = (size_t)(addr - dhp->dh_uvaddr); + dhph = devmap_find_handle(dhph, addr); + + while (dhph != NULL) { + if (maplen == 0) { + dhph = devmap_handle_unmap(dhph); + } else { + callbackops = &dhph->dh_callbackops; + len = dhph->dh_len - maplen; + off = (ulong_t)sdp->offset + (addr - seg->s_base); + /* + * Call the unmap callback so the driver + * can make adjustments on its private data. + */ + if (callbackops->devmap_unmap != NULL) + (*callbackops->devmap_unmap)(dhph, + dhph->dh_pvtp, off, len, + (devmap_cookie_t *)dhph, + &dhph->dh_pvtp, NULL, NULL); + /* XXX Reducing len needs to recalculate LARGE flag */ + dhph->dh_len = maplen; + maplen = 0; + dhpp = dhph->dh_next; + dhph->dh_next = NULL; + dhph = dhpp; + } + } /* end while */ +} + +/* + * Free a segment. + */ +static void +segdev_free(struct seg *seg) +{ + register struct segdev_data *sdp = (struct segdev_data *)seg->s_data; + devmap_handle_t *dhp = (devmap_handle_t *)sdp->devmap_data; + + TRACE_2(TR_FAC_DEVMAP, TR_DEVMAP_FREE, + "segdev_free: dhp=%p seg=%p", (void *)dhp, (void *)seg); + DEBUGF(3, (CE_CONT, "segdev_free: dhp %p seg %p\n", + (void *)dhp, (void *)seg)); + + /* + * Since the address space is "write" locked, we + * don't need the segment lock to protect "segdev" data. + */ + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); + + while (dhp != NULL) + dhp = devmap_handle_unmap(dhp); + + VN_RELE(sdp->vp); + if (sdp->vpage != NULL) + kmem_free(sdp->vpage, vpgtob(seg_pages(seg))); + + mutex_destroy(&sdp->lock); + kmem_free(sdp, sizeof (*sdp)); +} + +static void +free_devmap_handle(devmap_handle_t *dhp) +{ + register devmap_handle_t *dhpp; + + /* + * free up devmap handle + */ + while (dhp != NULL) { + dhpp = dhp->dh_next; + if (dhp->dh_flags & DEVMAP_LOCK_INITED) { + mutex_destroy(&dhp->dh_lock); + } + + if (dhp->dh_softlock != NULL) + devmap_softlock_rele(dhp); + + if (dhp->dh_ctx != NULL) + devmap_ctx_rele(dhp); + + kmem_free(dhp, sizeof (devmap_handle_t)); + dhp = dhpp; + } +} + +/* + * routines to lock and unlock underlying segkp segment for + * KMEM_PAGEABLE type cookies. + * segkp only allows a single pending F_SOFTLOCK + * we keep track of number of locks in the cookie so we can + * have multiple pending faults and manage the calls to segkp. + * RFE: if segkp supports either pagelock or can support multiple + * calls to F_SOFTLOCK, then these routines can go away. + * If pagelock, segdev_faultpage can fault on a page by page basis + * and simplifies the code quite a bit. 
+ * if multiple calls allowed but not partial ranges, then need for + * cookie->lock and locked count goes away, code can call as_fault directly + */ +static faultcode_t +acquire_kpmem_lock(struct ddi_umem_cookie *cookie, size_t npages) +{ + int err = 0; + ASSERT(cookie_is_kpmem(cookie)); + /* + * Fault in pages in segkp with F_SOFTLOCK. + * We want to hold the lock until all pages have been loaded. + * segkp only allows single caller to hold SOFTLOCK, so cookie + * holds a count so we dont call into segkp multiple times + */ + mutex_enter(&cookie->lock); + + /* + * Check for overflow in locked field + */ + if ((UINT32_MAX - cookie->locked) < npages) { + err = FC_MAKE_ERR(ENOMEM); + } else if (cookie->locked == 0) { + /* First time locking */ + err = as_fault(kas.a_hat, &kas, cookie->cvaddr, + cookie->size, F_SOFTLOCK, PROT_READ|PROT_WRITE); + } + if (!err) { + cookie->locked += npages; + } + mutex_exit(&cookie->lock); + return (err); +} + +static void +release_kpmem_lock(struct ddi_umem_cookie *cookie, size_t npages) +{ + mutex_enter(&cookie->lock); + ASSERT(cookie_is_kpmem(cookie)); + ASSERT(cookie->locked >= npages); + cookie->locked -= (uint_t)npages; + if (cookie->locked == 0) { + /* Last unlock */ + if (as_fault(kas.a_hat, &kas, cookie->cvaddr, + cookie->size, F_SOFTUNLOCK, PROT_READ|PROT_WRITE)) + panic("segdev releasing kpmem lock %p", (void *)cookie); + } + mutex_exit(&cookie->lock); +} + +/* + * Routines to synchronize F_SOFTLOCK and F_INVAL faults for + * drivers with devmap_access callbacks + * slock->softlocked basically works like a rw lock + * -ve counts => F_SOFTLOCK in progress + * +ve counts => F_INVAL/F_PROT in progress + * We allow only one F_SOFTLOCK at a time + * but can have multiple pending F_INVAL/F_PROT calls + * + * This routine waits using cv_wait_sig so killing processes is more graceful + * Returns EINTR if coming out of this routine due to a signal, 0 otherwise + */ +static int devmap_softlock_enter( + struct devmap_softlock *slock, + size_t npages, + enum fault_type type) +{ + if (npages == 0) + return (0); + mutex_enter(&(slock->lock)); + switch (type) { + case F_SOFTLOCK : + while (slock->softlocked) { + if (cv_wait_sig(&(slock)->cv, &(slock)->lock) == 0) { + /* signalled */ + mutex_exit(&(slock->lock)); + return (EINTR); + } + } + slock->softlocked -= npages; /* -ve count => locked */ + break; + case F_INVAL : + case F_PROT : + while (slock->softlocked < 0) + if (cv_wait_sig(&(slock)->cv, &(slock)->lock) == 0) { + /* signalled */ + mutex_exit(&(slock->lock)); + return (EINTR); + } + slock->softlocked += npages; /* +ve count => f_invals */ + break; + default: + ASSERT(0); + } + mutex_exit(&(slock->lock)); + return (0); +} + +static void devmap_softlock_exit( + struct devmap_softlock *slock, + size_t npages, + enum fault_type type) +{ + if (slock == NULL) + return; + mutex_enter(&(slock->lock)); + switch (type) { + case F_SOFTLOCK : + ASSERT(-slock->softlocked >= npages); + slock->softlocked += npages; /* -ve count is softlocked */ + if (slock->softlocked == 0) + cv_signal(&slock->cv); + break; + case F_INVAL : + case F_PROT: + ASSERT(slock->softlocked >= npages); + slock->softlocked -= npages; + if (slock->softlocked == 0) + cv_signal(&slock->cv); + break; + default: + ASSERT(0); + } + mutex_exit(&(slock->lock)); +} + +/* + * Do a F_SOFTUNLOCK call over the range requested. + * The range must have already been F_SOFTLOCK'ed. + * The segment lock should be held, (but not the segment private lock?) 
+ * The softunlock code below does not adjust for large page sizes + * assumes the caller already did any addr/len adjustments for + * pagesize mappings before calling. + */ +/*ARGSUSED*/ +static void +segdev_softunlock( + struct hat *hat, /* the hat */ + struct seg *seg, /* seg_dev of interest */ + caddr_t addr, /* base address of range */ + size_t len, /* number of bytes */ + enum seg_rw rw) /* type of access at fault */ +{ + struct segdev_data *sdp = (struct segdev_data *)seg->s_data; + devmap_handle_t *dhp_head = (devmap_handle_t *)sdp->devmap_data; + + TRACE_4(TR_FAC_DEVMAP, TR_DEVMAP_SOFTUNLOCK, + "segdev_softunlock:dhp_head=%p sdp=%p addr=%p len=%lx", + dhp_head, sdp, addr, len); + DEBUGF(3, (CE_CONT, "segdev_softunlock: dhp %p lockcnt %lx " + "addr %p len %lx\n", + (void *)dhp_head, sdp->softlockcnt, (void *)addr, len)); + + hat_unlock(hat, addr, len); + + if (dhp_head != NULL) { + devmap_handle_t *dhp; + size_t mlen; + ulong_t off; + + dhp = devmap_find_handle(dhp_head, addr); + ASSERT(dhp != NULL); + + off = (ulong_t)(addr - dhp->dh_uvaddr); + while (len != 0) { + mlen = MIN(len, (dhp->dh_len - off)); + + /* + * unlock segkp memory, locked during F_SOFTLOCK + */ + if (dhp_is_kpmem(dhp)) { + release_kpmem_lock( + (struct ddi_umem_cookie *)dhp->dh_cookie, + btopr(mlen)); + } + + /* + * Do the softlock accounting for devmap_access + */ + if (dhp->dh_callbackops.devmap_access != NULL) { + devmap_softlock_exit(dhp->dh_softlock, + btopr(mlen), F_SOFTLOCK); + } + + len -= mlen; + dhp = dhp->dh_next; + off = 0; + } + } + + mutex_enter(&freemem_lock); + ASSERT(sdp->softlockcnt >= btopr(len)); + sdp->softlockcnt -= btopr(len); + mutex_exit(&freemem_lock); + if (sdp->softlockcnt == 0) { + /* + * All SOFTLOCKS are gone. Wakeup any waiting + * unmappers so they can try again to unmap. + * Check for waiters first without the mutex + * held so we don't always grab the mutex on + * softunlocks. + */ + if (AS_ISUNMAPWAIT(seg->s_as)) { + mutex_enter(&seg->s_as->a_contents); + if (AS_ISUNMAPWAIT(seg->s_as)) { + AS_CLRUNMAPWAIT(seg->s_as); + cv_broadcast(&seg->s_as->a_cv); + } + mutex_exit(&seg->s_as->a_contents); + } + } + +} + +/* + * Handle fault for a single page. + * Done in a separate routine so we can handle errors more easily. + * This routine is called only from segdev_faultpages() + * when looping over the range of addresses requested. The segment lock is held. + */ +static faultcode_t +segdev_faultpage( + struct hat *hat, /* the hat */ + struct seg *seg, /* seg_dev of interest */ + caddr_t addr, /* address in as */ + struct vpage *vpage, /* pointer to vpage for seg, addr */ + enum fault_type type, /* type of fault */ + enum seg_rw rw, /* type of access at fault */ + devmap_handle_t *dhp) /* devmap handle if any for this page */ +{ + struct segdev_data *sdp = (struct segdev_data *)seg->s_data; + uint_t prot; + pfn_t pfnum = PFN_INVALID; + u_offset_t offset; + uint_t hat_flags; + dev_info_t *dip; + + TRACE_3(TR_FAC_DEVMAP, TR_DEVMAP_FAULTPAGE, + "segdev_faultpage: dhp=%p seg=%p addr=%p", dhp, seg, addr); + DEBUGF(8, (CE_CONT, "segdev_faultpage: dhp %p seg %p addr %p \n", + (void *)dhp, (void *)seg, (void *)addr)); + + /* + * Initialize protection value for this page. + * If we have per page protection values check it now. 
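+ * (sdp->pageprot is normally set only after segdev_setprot() has
+ * changed the protections of a sub-range of the segment; the vpage
+ * entry then overrides the segment-wide sdp->prot for this page.)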
+ */ + if (sdp->pageprot) { + uint_t protchk; + + switch (rw) { + case S_READ: + protchk = PROT_READ; + break; + case S_WRITE: + protchk = PROT_WRITE; + break; + case S_EXEC: + protchk = PROT_EXEC; + break; + case S_OTHER: + default: + protchk = PROT_READ | PROT_WRITE | PROT_EXEC; + break; + } + + prot = VPP_PROT(vpage); + if ((prot & protchk) == 0) + return (FC_PROT); /* illegal access type */ + } else { + prot = sdp->prot; + /* caller has already done segment level protection check */ + } + + if (type == F_SOFTLOCK) { + mutex_enter(&freemem_lock); + sdp->softlockcnt++; + mutex_exit(&freemem_lock); + } + + hat_flags = ((type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD); + offset = sdp->offset + (u_offset_t)(addr - seg->s_base); + /* + * In the devmap framework, sdp->mapfunc is set to NULL. we can get + * pfnum from dhp->dh_pfn (at beginning of segment) and offset from + * seg->s_base. + */ + if (dhp == NULL) { + /* If segment has devmap_data, then dhp should be non-NULL */ + ASSERT(sdp->devmap_data == NULL); + pfnum = (pfn_t)cdev_mmap(sdp->mapfunc, sdp->vp->v_rdev, + (off_t)offset, prot); + prot |= sdp->hat_attr; + } else { + ulong_t off; + struct ddi_umem_cookie *cp; + struct devmap_pmem_cookie *pcp; + + /* ensure the dhp passed in contains addr. */ + ASSERT(dhp == devmap_find_handle( + (devmap_handle_t *)sdp->devmap_data, addr)); + + off = addr - dhp->dh_uvaddr; + + /* + * This routine assumes that the caller makes sure that the + * fields in dhp used below are unchanged due to remap during + * this call. Caller does HOLD_DHP_LOCK if neeed + */ + cp = dhp->dh_cookie; + if (dhp->dh_flags & DEVMAP_MAPPING_INVALID) { + pfnum = PFN_INVALID; + } else if (cookie_is_devmem(cp)) { + pfnum = dhp->dh_pfn + btop(off); + } else if (cookie_is_pmem(cp)) { + pcp = (struct devmap_pmem_cookie *)dhp->dh_pcookie; + ASSERT((dhp->dh_roff & PAGEOFFSET) == 0 && + dhp->dh_roff < ptob(pcp->dp_npages)); + pfnum = page_pptonum( + pcp->dp_pparray[btop(off + dhp->dh_roff)]); + } else { + ASSERT(dhp->dh_roff < cp->size); + ASSERT(dhp->dh_cvaddr >= cp->cvaddr && + dhp->dh_cvaddr < (cp->cvaddr + cp->size)); + ASSERT((dhp->dh_cvaddr + off) <= + (cp->cvaddr + cp->size)); + ASSERT((dhp->dh_cvaddr + off + PAGESIZE) <= + (cp->cvaddr + cp->size)); + + switch (cp->type) { + case UMEM_LOCKED : + if (cp->pparray != NULL) { + ASSERT((dhp->dh_roff & PAGEOFFSET) == 0); + pfnum = page_pptonum( + cp->pparray[btop(off + dhp->dh_roff)]); + } else { + pfnum = hat_getpfnum( + ((proc_t *)cp->procp)->p_as->a_hat, + cp->cvaddr + off); + } + break; + case UMEM_TRASH : + pfnum = page_pptonum(trashpp); + /* We should set hat_flags to HAT_NOFAULT also */ + /* However, not all hat layers implement this */ + break; + case KMEM_PAGEABLE: + case KMEM_NON_PAGEABLE: + pfnum = hat_getpfnum(kas.a_hat, + dhp->dh_cvaddr + off); + break; + default : + pfnum = PFN_INVALID; + break; + } + } + prot |= dhp->dh_hat_attr; + } + if (pfnum == PFN_INVALID) { + return (FC_MAKE_ERR(EFAULT)); + } + /* prot should already be OR'ed in with hat_attributes if needed */ + + TRACE_4(TR_FAC_DEVMAP, TR_DEVMAP_FAULTPAGE_CK1, + "segdev_faultpage: pfnum=%lx memory=%x prot=%x flags=%x", + pfnum, pf_is_memory(pfnum), prot, hat_flags); + DEBUGF(9, (CE_CONT, "segdev_faultpage: pfnum %lx memory %x " + "prot %x flags %x\n", pfnum, pf_is_memory(pfnum), prot, hat_flags)); + + if (pf_is_memory(pfnum) || (dhp != NULL)) { + /* + * It's not _really_ required here to pass sdp->hat_flags + * to hat_devload even though we do it. 
+ * This is because hat figures it out DEVMEM mappings + * are non-consistent, anyway. + */ + hat_devload(hat, addr, PAGESIZE, pfnum, + prot, hat_flags | sdp->hat_flags); + return (0); + } + + /* + * Fall through to the case where devmap is not used and need to call + * up the device tree to set up the mapping + */ + + dip = VTOS(VTOCVP(sdp->vp))->s_dip; + ASSERT(dip); + + /* + * When calling ddi_map_fault, we do not OR in sdp->hat_attr + * This is because this calls drivers which may not expect + * prot to have any other values than PROT_ALL + * The root nexus driver has a hack to peek into the segment + * structure and then OR in sdp->hat_attr. + * XX In case the bus_ops interfaces are ever revisited + * we need to fix this. prot should include other hat attributes + */ + if (ddi_map_fault(dip, hat, seg, addr, NULL, pfnum, prot & PROT_ALL, + (uint_t)(type == F_SOFTLOCK)) != DDI_SUCCESS) { + return (FC_MAKE_ERR(EFAULT)); + } + return (0); +} + +static faultcode_t +segdev_fault( + struct hat *hat, /* the hat */ + struct seg *seg, /* the seg_dev of interest */ + caddr_t addr, /* the address of the fault */ + size_t len, /* the length of the range */ + enum fault_type type, /* type of fault */ + enum seg_rw rw) /* type of access at fault */ +{ + struct segdev_data *sdp = (struct segdev_data *)seg->s_data; + devmap_handle_t *dhp_head = (devmap_handle_t *)sdp->devmap_data; + devmap_handle_t *dhp; + struct devmap_softlock *slock = NULL; + ulong_t slpage = 0; + ulong_t off; + caddr_t maddr = addr; + int err; + int err_is_faultcode = 0; + + TRACE_5(TR_FAC_DEVMAP, TR_DEVMAP_FAULT, + "segdev_fault: dhp_head=%p seg=%p addr=%p len=%lx type=%x", + (void *)dhp_head, (void *)seg, (void *)addr, len, type); + DEBUGF(7, (CE_CONT, "segdev_fault: dhp_head %p seg %p " + "addr %p len %lx type %x\n", + (void *)dhp_head, (void *)seg, (void *)addr, len, type)); + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + /* Handle non-devmap case */ + if (dhp_head == NULL) + return (segdev_faultpages(hat, seg, addr, len, type, rw, NULL)); + + /* Find devmap handle */ + if ((dhp = devmap_find_handle(dhp_head, addr)) == NULL) + return (FC_NOMAP); + + /* + * The seg_dev driver does not implement copy-on-write, + * and always loads translations with maximal allowed permissions + * but we got an fault trying to access the device. + * Servicing the fault is not going to result in any better result + * RFE: If we want devmap_access callbacks to be involved in F_PROT + * faults, then the code below is written for that + * Pending resolution of the following: + * - determine if the F_INVAL/F_SOFTLOCK syncing + * is needed for F_PROT also or not. The code below assumes it does + * - If driver sees F_PROT and calls devmap_load with same type, + * then segdev_faultpages will fail with FC_PROT anyway, need to + * change that so calls from devmap_load to segdev_faultpages for + * F_PROT type are retagged to F_INVAL. + * RFE: Today we dont have drivers that use devmap and want to handle + * F_PROT calls. The code in segdev_fault* is written to allow + * this case but is not tested. A driver that needs this capability + * should be able to remove the short-circuit case; resolve the + * above issues and "should" work. + */ + if (type == F_PROT) { + return (FC_PROT); + } + + /* + * Loop through dhp list calling devmap_access or segdev_faultpages for + * each devmap handle. + * drivers which implement devmap_access can interpose on faults and do + * device-appropriate special actions before calling devmap_load. 
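+ * A minimal devmap_access(9E) callback (illustrative sketch only;
+ * the name xx_devmap_access is hypothetical) just forwards to the
+ * framework:
+ *
+ *	static int
+ *	xx_devmap_access(devmap_cookie_t dhp, void *pvtp, offset_t off,
+ *	    size_t len, uint_t type, uint_t rw)
+ *	{
+ *		return (devmap_default_access(dhp, pvtp, off, len,
+ *		    type, rw));
+ *	}
+ *
+ * Drivers that manage device context instead call devmap_do_ctxmgt()
+ * with a context-management routine that does the devmap_unload()/
+ * devmap_load() calls itself.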
+ */ + + /* + * Unfortunately, this simple loop has turned out to expose a variety + * of complex problems which results in the following convoluted code. + * + * First, a desire to handle a serialization of F_SOFTLOCK calls + * to the driver within the framework. + * This results in a dh_softlock structure that is on a per device + * (or device instance) basis and serializes devmap_access calls. + * Ideally we would need to do this for underlying + * memory/device regions that are being faulted on + * but that is hard to identify and with REMAP, harder + * Second, a desire to serialize F_INVAL(and F_PROT) calls w.r.t. + * to F_SOFTLOCK calls to the driver. + * These serializations are to simplify the driver programmer model. + * To support these two features, the code first goes through the + * devmap handles and counts the pages (slpage) that are covered + * by devmap_access callbacks. + * This part ends with a devmap_softlock_enter call + * which allows only one F_SOFTLOCK active on a device instance, + * but multiple F_INVAL/F_PROTs can be active except when a + * F_SOFTLOCK is active + * + * Next, we dont short-circuit the fault code upfront to call + * segdev_softunlock for F_SOFTUNLOCK, because we must use + * the same length when we softlock and softunlock. + * + * -Hat layers may not support softunlocking lengths less than the + * original length when there is large page support. + * -kpmem locking is dependent on keeping the lengths same. + * -if drivers handled F_SOFTLOCK, they probably also expect to + * see an F_SOFTUNLOCK of the same length + * Hence, if extending lengths during softlock, + * softunlock has to make the same adjustments and goes through + * the same loop calling segdev_faultpages/segdev_softunlock + * But some of the synchronization and error handling is different + */ + + if (type != F_SOFTUNLOCK) { + devmap_handle_t *dhpp = dhp; + size_t slen = len; + + /* + * Calculate count of pages that are : + * a) within the (potentially extended) fault region + * b) AND covered by devmap handle with devmap_access + */ + off = (ulong_t)(addr - dhpp->dh_uvaddr); + while (slen != 0) { + size_t mlen; + + /* + * Softlocking on a region that allows remap is + * unsupported due to unresolved locking issues + * XXX: unclear what these are? + * One potential is that if there is a pending + * softlock, then a remap should not be allowed + * until the unlock is done. This is easily + * fixed by returning error in devmap*remap on + * checking the dh->dh_softlock->softlocked value + */ + if ((type == F_SOFTLOCK) && + (dhpp->dh_flags & DEVMAP_ALLOW_REMAP)) { + return (FC_NOSUPPORT); + } + + mlen = MIN(slen, (dhpp->dh_len - off)); + if (dhpp->dh_callbackops.devmap_access) { + size_t llen; + caddr_t laddr; + /* + * use extended length for large page mappings + */ + HOLD_DHP_LOCK(dhpp); + if ((sdp->pageprot == 0) && + (dhpp->dh_flags & DEVMAP_FLAG_LARGE)) { + devmap_get_large_pgsize(dhpp, + mlen, maddr, &llen, &laddr); + } else { + llen = mlen; + } + RELE_DHP_LOCK(dhpp); + + slpage += btopr(llen); + slock = dhpp->dh_softlock; + } + maddr += mlen; + ASSERT(slen >= mlen); + slen -= mlen; + dhpp = dhpp->dh_next; + off = 0; + } + /* + * synchonize with other faulting threads and wait till safe + * devmap_softlock_enter might return due to signal in cv_wait + * + * devmap_softlock_enter has to be called outside of while loop + * to prevent a deadlock if len spans over multiple dhps. 
+ * dh_softlock is based on device instance and if multiple dhps + * use the same device instance, the second dhp's LOCK call + * will hang waiting on the first to complete. + * devmap_setup verifies that slocks in a dhp_chain are same. + * RFE: this deadlock only hold true for F_SOFTLOCK. For + * F_INVAL/F_PROT, since we now allow multiple in parallel, + * we could have done the softlock_enter inside the loop + * and supported multi-dhp mappings with dissimilar devices + */ + if (err = devmap_softlock_enter(slock, slpage, type)) + return (FC_MAKE_ERR(err)); + } + + /* reset 'maddr' to the start addr of the range of fault. */ + maddr = addr; + + /* calculate the offset corresponds to 'addr' in the first dhp. */ + off = (ulong_t)(addr - dhp->dh_uvaddr); + + /* + * The fault length may span over multiple dhps. + * Loop until the total length is satisfied. + */ + while (len != 0) { + size_t llen; + size_t mlen; + caddr_t laddr; + + /* + * mlen is the smaller of 'len' and the length + * from addr to the end of mapping defined by dhp. + */ + mlen = MIN(len, (dhp->dh_len - off)); + + HOLD_DHP_LOCK(dhp); + /* + * Pass the extended length and address to devmap_access + * if large pagesize is used for loading address translations. + */ + if ((sdp->pageprot == 0) && + (dhp->dh_flags & DEVMAP_FLAG_LARGE)) { + devmap_get_large_pgsize(dhp, mlen, maddr, + &llen, &laddr); + ASSERT(maddr == addr || laddr == maddr); + } else { + llen = mlen; + laddr = maddr; + } + + if (dhp->dh_callbackops.devmap_access != NULL) { + offset_t aoff; + + aoff = sdp->offset + (offset_t)(laddr - seg->s_base); + + /* + * call driver's devmap_access entry point which will + * call devmap_load/contextmgmt to load the translations + * + * We drop the dhp_lock before calling access so + * drivers can call devmap_*_remap within access + */ + RELE_DHP_LOCK(dhp); + + err = (*dhp->dh_callbackops.devmap_access)( + dhp, (void *)dhp->dh_pvtp, aoff, llen, type, rw); + } else { + /* + * If no devmap_access entry point, then load mappings + * hold dhp_lock across faultpages if REMAP + */ + err = segdev_faultpages(hat, seg, laddr, llen, + type, rw, dhp); + err_is_faultcode = 1; + RELE_DHP_LOCK(dhp); + } + + if (err) { + if ((type == F_SOFTLOCK) && (maddr > addr)) { + /* + * If not first dhp, use + * segdev_fault(F_SOFTUNLOCK) for prior dhps + * While this is recursion, it is incorrect to + * call just segdev_softunlock + * if we are using either large pages + * or devmap_access. It will be more right + * to go through the same loop as above + * rather than call segdev_softunlock directly + * It will use the right lenghths as well as + * call into the driver devmap_access routines. + */ + size_t done = (size_t)(maddr - addr); + (void) segdev_fault(hat, seg, addr, done, + F_SOFTUNLOCK, S_OTHER); + /* + * reduce slpage by number of pages + * released by segdev_softunlock + */ + ASSERT(slpage >= btopr(done)); + devmap_softlock_exit(slock, + slpage - btopr(done), type); + } else { + devmap_softlock_exit(slock, slpage, type); + } + + + /* + * Segdev_faultpages() already returns a faultcode, + * hence, result from segdev_faultpages() should be + * returned directly. 
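+ * A nonzero return from the driver's devmap_access callback, by
+ * contrast, is treated as an errno and wrapped with FC_MAKE_ERR().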
+ */ + if (err_is_faultcode) + return (err); + return (FC_MAKE_ERR(err)); + } + + maddr += mlen; + ASSERT(len >= mlen); + len -= mlen; + dhp = dhp->dh_next; + off = 0; + + ASSERT(!dhp || len == 0 || maddr == dhp->dh_uvaddr); + } + /* + * release the softlock count at end of fault + * For F_SOFTLOCk this is done in the later F_SOFTUNLOCK + */ + if ((type == F_INVAL) || (type == F_PROT)) + devmap_softlock_exit(slock, slpage, type); + return (0); +} + +/* + * segdev_faultpages + * + * Used to fault in seg_dev segment pages. Called by segdev_fault or devmap_load + * This routine assumes that the callers makes sure that the fields + * in dhp used below are not changed due to remap during this call. + * Caller does HOLD_DHP_LOCK if neeed + * This routine returns a faultcode_t as a return value for segdev_fault. + */ +static faultcode_t +segdev_faultpages( + struct hat *hat, /* the hat */ + struct seg *seg, /* the seg_dev of interest */ + caddr_t addr, /* the address of the fault */ + size_t len, /* the length of the range */ + enum fault_type type, /* type of fault */ + enum seg_rw rw, /* type of access at fault */ + devmap_handle_t *dhp) /* devmap handle */ +{ + register struct segdev_data *sdp = (struct segdev_data *)seg->s_data; + register caddr_t a; + struct vpage *vpage; + struct ddi_umem_cookie *kpmem_cookie = NULL; + int err; + + TRACE_4(TR_FAC_DEVMAP, TR_DEVMAP_FAULTPAGES, + "segdev_faultpages: dhp=%p seg=%p addr=%p len=%lx", + (void *)dhp, (void *)seg, (void *)addr, len); + DEBUGF(5, (CE_CONT, "segdev_faultpages: " + "dhp %p seg %p addr %p len %lx\n", + (void *)dhp, (void *)seg, (void *)addr, len)); + + /* + * The seg_dev driver does not implement copy-on-write, + * and always loads translations with maximal allowed permissions + * but we got an fault trying to access the device. + * Servicing the fault is not going to result in any better result + * XXX: If we want to allow devmap_access to handle F_PROT calls, + * This code should be removed and let the normal fault handling + * take care of finding the error + */ + if (type == F_PROT) { + return (FC_PROT); + } + + if (type == F_SOFTUNLOCK) { + segdev_softunlock(hat, seg, addr, len, rw); + return (0); + } + + /* + * For kernel pageable memory, fault/lock segkp pages + * We hold this until the completion of this + * fault (INVAL/PROT) or till unlock (SOFTLOCK). + */ + if ((dhp != NULL) && dhp_is_kpmem(dhp)) { + kpmem_cookie = (struct ddi_umem_cookie *)dhp->dh_cookie; + if (err = acquire_kpmem_lock(kpmem_cookie, btopr(len))) + return (err); + } + + /* + * If we have the same protections for the entire segment, + * insure that the access being attempted is legitimate. 
+ */ + mutex_enter(&sdp->lock); + if (sdp->pageprot == 0) { + uint_t protchk; + + switch (rw) { + case S_READ: + protchk = PROT_READ; + break; + case S_WRITE: + protchk = PROT_WRITE; + break; + case S_EXEC: + protchk = PROT_EXEC; + break; + case S_OTHER: + default: + protchk = PROT_READ | PROT_WRITE | PROT_EXEC; + break; + } + + if ((sdp->prot & protchk) == 0) { + mutex_exit(&sdp->lock); + /* undo kpmem locking */ + if (kpmem_cookie != NULL) { + release_kpmem_lock(kpmem_cookie, btopr(len)); + } + return (FC_PROT); /* illegal access type */ + } + } + + /* + * we do a single hat_devload for the range if + * - devmap framework (dhp is not NULL), + * - pageprot == 0, i.e., no per-page protection set and + * - is device pages, irrespective of whether we are using large pages + */ + if ((sdp->pageprot == 0) && (dhp != NULL) && dhp_is_devmem(dhp)) { + pfn_t pfnum; + uint_t hat_flags; + + if (dhp->dh_flags & DEVMAP_MAPPING_INVALID) { + mutex_exit(&sdp->lock); + return (FC_NOMAP); + } + + if (type == F_SOFTLOCK) { + mutex_enter(&freemem_lock); + sdp->softlockcnt += btopr(len); + mutex_exit(&freemem_lock); + } + + hat_flags = ((type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD); + pfnum = dhp->dh_pfn + btop((uintptr_t)(addr - dhp->dh_uvaddr)); + ASSERT(!pf_is_memory(pfnum)); + + hat_devload(hat, addr, len, pfnum, sdp->prot | dhp->dh_hat_attr, + hat_flags | sdp->hat_flags); + mutex_exit(&sdp->lock); + return (0); + } + + /* Handle cases where we have to loop through fault handling per-page */ + + if (sdp->vpage == NULL) + vpage = NULL; + else + vpage = &sdp->vpage[seg_page(seg, addr)]; + + /* loop over the address range handling each fault */ + for (a = addr; a < addr + len; a += PAGESIZE) { + if (err = segdev_faultpage(hat, seg, a, vpage, type, rw, dhp)) { + break; + } + if (vpage != NULL) + vpage++; + } + mutex_exit(&sdp->lock); + if (err && (type == F_SOFTLOCK)) { /* error handling for F_SOFTLOCK */ + size_t done = (size_t)(a - addr); /* pages fault successfully */ + if (done > 0) { + /* use softunlock for those pages */ + segdev_softunlock(hat, seg, addr, done, S_OTHER); + } + if (kpmem_cookie != NULL) { + /* release kpmem lock for rest of pages */ + ASSERT(len >= done); + release_kpmem_lock(kpmem_cookie, btopr(len - done)); + } + } else if ((kpmem_cookie != NULL) && (type != F_SOFTLOCK)) { + /* for non-SOFTLOCK cases, release kpmem */ + release_kpmem_lock(kpmem_cookie, btopr(len)); + } + return (err); +} + +/* + * Asynchronous page fault. We simply do nothing since this + * entry point is not supposed to load up the translation. + */ +/*ARGSUSED*/ +static faultcode_t +segdev_faulta(struct seg *seg, caddr_t addr) +{ + TRACE_2(TR_FAC_DEVMAP, TR_DEVMAP_FAULTA, + "segdev_faulta: seg=%p addr=%p", (void *)seg, (void *)addr); + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + return (0); +} + +static int +segdev_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) +{ + register struct segdev_data *sdp = (struct segdev_data *)seg->s_data; + register devmap_handle_t *dhp; + register struct vpage *vp, *evp; + devmap_handle_t *dhp_head = (devmap_handle_t *)sdp->devmap_data; + ulong_t off; + size_t mlen, sz; + + TRACE_4(TR_FAC_DEVMAP, TR_DEVMAP_SETPROT, + "segdev_setprot:start seg=%p addr=%p len=%lx prot=%x", + (void *)seg, (void *)addr, len, prot); + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + if ((sz = sdp->softlockcnt) > 0 && dhp_head != NULL) { + /* + * Fail the setprot if pages are SOFTLOCKed through this + * mapping. 
+ * Softlockcnt is protected from change by the as read lock. + */ + TRACE_1(TR_FAC_DEVMAP, TR_DEVMAP_SETPROT_CK1, + "segdev_setprot:error softlockcnt=%lx", sz); + DEBUGF(1, (CE_CONT, "segdev_setprot: softlockcnt %ld\n", sz)); + return (EAGAIN); + } + + if (dhp_head != NULL) { + if ((dhp = devmap_find_handle(dhp_head, addr)) == NULL) + return (EINVAL); + + /* + * check if violate maxprot. + */ + off = (ulong_t)(addr - dhp->dh_uvaddr); + mlen = len; + while (dhp) { + if ((dhp->dh_maxprot & prot) != prot) + return (EACCES); /* violated maxprot */ + + if (mlen > (dhp->dh_len - off)) { + mlen -= dhp->dh_len - off; + dhp = dhp->dh_next; + off = 0; + } else + break; + } + } else { + if ((sdp->maxprot & prot) != prot) + return (EACCES); + } + + mutex_enter(&sdp->lock); + if (addr == seg->s_base && len == seg->s_size && sdp->pageprot == 0) { + if (sdp->prot == prot) { + mutex_exit(&sdp->lock); + return (0); /* all done */ + } + sdp->prot = (uchar_t)prot; + } else { + sdp->pageprot = 1; + if (sdp->vpage == NULL) { + /* + * First time through setting per page permissions, + * initialize all the vpage structures to prot + */ + sdp->vpage = kmem_zalloc(vpgtob(seg_pages(seg)), + KM_SLEEP); + evp = &sdp->vpage[seg_pages(seg)]; + for (vp = sdp->vpage; vp < evp; vp++) + VPP_SETPROT(vp, sdp->prot); + } + /* + * Now go change the needed vpages protections. + */ + evp = &sdp->vpage[seg_page(seg, addr + len)]; + for (vp = &sdp->vpage[seg_page(seg, addr)]; vp < evp; vp++) + VPP_SETPROT(vp, prot); + } + mutex_exit(&sdp->lock); + + if (dhp_head != NULL) { + devmap_handle_t *tdhp; + /* + * If large page size was used in hat_devload(), + * the same page size must be used in hat_unload(). + */ + dhp = tdhp = devmap_find_handle(dhp_head, addr); + while (tdhp != NULL) { + if (tdhp->dh_flags & DEVMAP_FLAG_LARGE) { + break; + } + tdhp = tdhp->dh_next; + } + if (tdhp) { + size_t slen = len; + size_t mlen; + size_t soff; + + soff = (ulong_t)(addr - dhp->dh_uvaddr); + while (slen != 0) { + mlen = MIN(slen, (dhp->dh_len - soff)); + hat_unload(seg->s_as->a_hat, dhp->dh_uvaddr, + dhp->dh_len, HAT_UNLOAD); + dhp = dhp->dh_next; + ASSERT(slen >= mlen); + slen -= mlen; + soff = 0; + } + return (0); + } + } + + if ((prot & ~PROT_USER) == PROT_NONE) { + hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); + } else { + /* + * RFE: the segment should keep track of all attributes + * allowing us to remove the deprecated hat_chgprot + * and use hat_chgattr. + */ + hat_chgprot(seg->s_as->a_hat, addr, len, prot); + } + + return (0); +} + +static int +segdev_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) +{ + struct segdev_data *sdp = (struct segdev_data *)seg->s_data; + struct vpage *vp, *evp; + + TRACE_4(TR_FAC_DEVMAP, TR_DEVMAP_CHECKPROT, + "segdev_checkprot:start seg=%p addr=%p len=%lx prot=%x", + (void *)seg, (void *)addr, len, prot); + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + /* + * If segment protection can be used, simply check against them + */ + mutex_enter(&sdp->lock); + if (sdp->pageprot == 0) { + register int err; + + err = ((sdp->prot & prot) != prot) ? 
EACCES : 0; + mutex_exit(&sdp->lock); + return (err); + } + + /* + * Have to check down to the vpage level + */ + evp = &sdp->vpage[seg_page(seg, addr + len)]; + for (vp = &sdp->vpage[seg_page(seg, addr)]; vp < evp; vp++) { + if ((VPP_PROT(vp) & prot) != prot) { + mutex_exit(&sdp->lock); + return (EACCES); + } + } + mutex_exit(&sdp->lock); + return (0); +} + +static int +segdev_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv) +{ + struct segdev_data *sdp = (struct segdev_data *)seg->s_data; + size_t pgno; + + TRACE_4(TR_FAC_DEVMAP, TR_DEVMAP_GETPROT, + "segdev_getprot:start seg=%p addr=%p len=%lx protv=%p", + (void *)seg, (void *)addr, len, (void *)protv); + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1; + if (pgno != 0) { + mutex_enter(&sdp->lock); + if (sdp->pageprot == 0) { + do + protv[--pgno] = sdp->prot; + while (pgno != 0); + } else { + size_t pgoff = seg_page(seg, addr); + + do { + pgno--; + protv[pgno] = + VPP_PROT(&sdp->vpage[pgno + pgoff]); + } while (pgno != 0); + } + mutex_exit(&sdp->lock); + } + return (0); +} + +static u_offset_t +segdev_getoffset(register struct seg *seg, caddr_t addr) +{ + register struct segdev_data *sdp = (struct segdev_data *)seg->s_data; + + TRACE_2(TR_FAC_DEVMAP, TR_DEVMAP_GETOFFSET, + "segdev_getoffset:start seg=%p addr=%p", (void *)seg, (void *)addr); + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + return ((u_offset_t)sdp->offset + (addr - seg->s_base)); +} + +/*ARGSUSED*/ +static int +segdev_gettype(register struct seg *seg, caddr_t addr) +{ + register struct segdev_data *sdp = (struct segdev_data *)seg->s_data; + + TRACE_2(TR_FAC_DEVMAP, TR_DEVMAP_GETTYPE, + "segdev_gettype:start seg=%p addr=%p", (void *)seg, (void *)addr); + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + return (sdp->type); +} + + +/*ARGSUSED*/ +static int +segdev_getvp(register struct seg *seg, caddr_t addr, struct vnode **vpp) +{ + register struct segdev_data *sdp = (struct segdev_data *)seg->s_data; + + TRACE_2(TR_FAC_DEVMAP, TR_DEVMAP_GETVP, + "segdev_getvp:start seg=%p addr=%p", (void *)seg, (void *)addr); + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + /* + * Note that this vp is the common_vp of the device, where the + * pages are hung .. + */ + *vpp = VTOCVP(sdp->vp); + + return (0); +} + +static void +segdev_badop(void) +{ + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_SEGDEV_BADOP, + "segdev_badop:start"); + panic("segdev_badop"); + /*NOTREACHED*/ +} + +/* + * segdev pages are not in the cache, and thus can't really be controlled. + * Hence, syncs are simply always successful. + */ +/*ARGSUSED*/ +static int +segdev_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) +{ + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_SYNC, "segdev_sync:start"); + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + return (0); +} + +/* + * segdev pages are always "in core". + */ +/*ARGSUSED*/ +static size_t +segdev_incore(struct seg *seg, caddr_t addr, size_t len, char *vec) +{ + size_t v = 0; + + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_INCORE, "segdev_incore:start"); + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + for (len = (len + PAGEOFFSET) & PAGEMASK; len; len -= PAGESIZE, + v += PAGESIZE) + *vec++ = 1; + return (v); +} + +/* + * segdev pages are not in the cache, and thus can't really be controlled. + * Hence, locks are simply always successful. 
+ */ +/*ARGSUSED*/ +static int +segdev_lockop(struct seg *seg, caddr_t addr, + size_t len, int attr, int op, ulong_t *lockmap, size_t pos) +{ + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_LOCKOP, "segdev_lockop:start"); + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + return (0); +} + +/* + * segdev pages are not in the cache, and thus can't really be controlled. + * Hence, advise is simply always successful. + */ +/*ARGSUSED*/ +static int +segdev_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) +{ + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_ADVISE, "segdev_advise:start"); + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + return (0); +} + +/* + * segdev pages are not dumped, so we just return + */ +/*ARGSUSED*/ +static void +segdev_dump(struct seg *seg) +{} + +/* + * ddi_segmap_setup: Used by drivers who wish specify mapping attributes + * for a segment. Called from a drivers segmap(9E) + * routine. + */ +/*ARGSUSED*/ +int +ddi_segmap_setup(dev_t dev, off_t offset, struct as *as, caddr_t *addrp, + off_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cred, + ddi_device_acc_attr_t *accattrp, uint_t rnumber) +{ + struct segdev_crargs dev_a; + int (*mapfunc)(dev_t dev, off_t off, int prot); + uint_t hat_attr; + pfn_t pfn; + int error, i; + + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_SEGMAP_SETUP, + "ddi_segmap_setup:start"); + + if ((mapfunc = devopsp[getmajor(dev)]->devo_cb_ops->cb_mmap) == nodev) + return (ENODEV); + + /* + * Character devices that support the d_mmap + * interface can only be mmap'ed shared. + */ + if ((flags & MAP_TYPE) != MAP_SHARED) + return (EINVAL); + + /* + * Check that this region is indeed mappable on this platform. + * Use the mapping function. + */ + if (ddi_device_mapping_check(dev, accattrp, rnumber, &hat_attr) == -1) + return (ENXIO); + + /* + * Check to ensure that the entire range is + * legal and we are not trying to map in + * more than the device will let us. + */ + for (i = 0; i < len; i += PAGESIZE) { + if (i == 0) { + /* + * Save the pfn at offset here. This pfn will be + * used later to get user address. + */ + if ((pfn = (pfn_t)cdev_mmap(mapfunc, dev, offset, + maxprot)) == PFN_INVALID) + return (ENXIO); + } else { + if (cdev_mmap(mapfunc, dev, offset + i, maxprot) == + PFN_INVALID) + return (ENXIO); + } + } + + as_rangelock(as); + if ((flags & MAP_FIXED) == 0) { + /* + * Pick an address w/o worrying about + * any vac alignment constraints. + */ + map_addr(addrp, len, ptob(pfn), 0, flags); + if (*addrp == NULL) { + as_rangeunlock(as); + return (ENOMEM); + } + } else { + /* + * User-specified address; blow away any previous mappings. 
+ */ + (void) as_unmap(as, *addrp, len); + } + + dev_a.mapfunc = mapfunc; + dev_a.dev = dev; + dev_a.offset = (offset_t)offset; + dev_a.type = flags & MAP_TYPE; + dev_a.prot = (uchar_t)prot; + dev_a.maxprot = (uchar_t)maxprot; + dev_a.hat_attr = hat_attr; + dev_a.hat_flags = 0; + dev_a.devmap_data = NULL; + + error = as_map(as, *addrp, len, segdev_create, &dev_a); + as_rangeunlock(as); + return (error); + +} + +/*ARGSUSED*/ +static int +segdev_pagelock(struct seg *seg, caddr_t addr, size_t len, + struct page ***ppp, enum lock_type type, enum seg_rw rw) +{ + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_PAGELOCK, + "segdev_pagelock:start"); + return (ENOTSUP); +} + +/*ARGSUSED*/ +static int +segdev_setpagesize(struct seg *seg, caddr_t addr, size_t len, + uint_t szc) +{ + return (ENOTSUP); +} + +/* + * devmap_device: Used by devmap framework to establish mapping + * called by devmap_seup(9F) during map setup time. + */ +/*ARGSUSED*/ +static int +devmap_device(devmap_handle_t *dhp, struct as *as, caddr_t *addr, + offset_t off, size_t len, uint_t flags) +{ + devmap_handle_t *rdhp, *maxdhp; + struct segdev_crargs dev_a; + int err; + uint_t maxprot = PROT_ALL; + offset_t offset = 0; + pfn_t pfn; + struct devmap_pmem_cookie *pcp; + + TRACE_4(TR_FAC_DEVMAP, TR_DEVMAP_DEVICE, + "devmap_device:start dhp=%p addr=%p off=%llx, len=%lx", + (void *)dhp, (void *)addr, off, len); + + DEBUGF(2, (CE_CONT, "devmap_device: dhp %p addr %p off %llx len %lx\n", + (void *)dhp, (void *)addr, off, len)); + + as_rangelock(as); + if ((flags & MAP_FIXED) == 0) { + offset_t aligned_off; + + rdhp = maxdhp = dhp; + while (rdhp != NULL) { + maxdhp = (maxdhp->dh_len > rdhp->dh_len) ? + maxdhp : rdhp; + rdhp = rdhp->dh_next; + maxprot |= dhp->dh_maxprot; + } + offset = maxdhp->dh_uoff - dhp->dh_uoff; + + /* + * Use the dhp that has the + * largest len to get user address. + */ + /* + * If MAPPING_INVALID, cannot use dh_pfn/dh_cvaddr, + * use 0 which is as good as any other. + */ + if (maxdhp->dh_flags & DEVMAP_MAPPING_INVALID) { + aligned_off = (offset_t)0; + } else if (dhp_is_devmem(maxdhp)) { + aligned_off = (offset_t)ptob(maxdhp->dh_pfn) - offset; + } else if (dhp_is_pmem(maxdhp)) { + pcp = (struct devmap_pmem_cookie *)maxdhp->dh_pcookie; + pfn = page_pptonum( + pcp->dp_pparray[btop(maxdhp->dh_roff)]); + aligned_off = (offset_t)ptob(pfn) - offset; + } else { + aligned_off = (offset_t)(uintptr_t)maxdhp->dh_cvaddr - + offset; + } + + /* + * Pick an address aligned to dh_cookie. + * for kernel memory/user memory, cookie is cvaddr. + * for device memory, cookie is physical address. + */ + map_addr(addr, len, aligned_off, 1, flags); + if (*addr == NULL) { + as_rangeunlock(as); + return (ENOMEM); + } + } else { + /* + * User-specified address; blow away any previous mappings. + */ + (void) as_unmap(as, *addr, len); + } + + dev_a.mapfunc = NULL; + dev_a.dev = dhp->dh_dev; + dev_a.type = flags & MAP_TYPE; + dev_a.offset = off; + /* + * sdp->maxprot has the least restrict protection of all dhps. + */ + dev_a.maxprot = maxprot; + dev_a.prot = dhp->dh_prot; + /* + * devmap uses dhp->dh_hat_attr for hat. 
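+ * dev_a.hat_attr is therefore left 0 here; segdev_faultpage() ORs
+ * the per-handle dh_hat_attr into the protections when it loads
+ * each translation.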
+ */ + dev_a.hat_flags = 0; + dev_a.hat_attr = 0; + dev_a.devmap_data = (void *)dhp; + + err = as_map(as, *addr, len, segdev_create, &dev_a); + as_rangeunlock(as); + return (err); +} + +int +devmap_do_ctxmgt(devmap_cookie_t dhc, void *pvtp, offset_t off, size_t len, + uint_t type, uint_t rw, int (*ctxmgt)(devmap_cookie_t, void *, offset_t, + size_t, uint_t, uint_t)) +{ + register devmap_handle_t *dhp = (devmap_handle_t *)dhc; + struct devmap_ctx *devctx; + int do_timeout = 0; + int ret; + +#ifdef lint + pvtp = pvtp; +#endif + + TRACE_3(TR_FAC_DEVMAP, TR_DEVMAP_DO_CTXMGT, + "devmap_do_ctxmgt:start dhp=%p off=%llx, len=%lx", + (void *)dhp, off, len); + DEBUGF(7, (CE_CONT, "devmap_do_ctxmgt: dhp %p off %llx len %lx\n", + (void *)dhp, off, len)); + + if (ctxmgt == NULL) + return (FC_HWERR); + + devctx = dhp->dh_ctx; + + /* + * If we are on an MP system with more than one cpu running + * and if a thread on some CPU already has the context, wait + * for it to finish if there is a hysteresis timeout. + * + * We call cv_wait() instead of cv_wait_sig() because + * it does not matter much if it returned due to a signal + * or due to a cv_signal() or cv_broadcast(). In either event + * we need to complete the mapping otherwise the processes + * will die with a SEGV. + */ + if ((dhp->dh_timeout_length > 0) && (ncpus > 1)) { + TRACE_2(TR_FAC_DEVMAP, TR_DEVMAP_DO_CTXMGT_CK1, + "devmap_do_ctxmgt:doing hysteresis, devctl %p dhp %p", + devctx, dhp); + do_timeout = 1; + mutex_enter(&devctx->lock); + while (devctx->oncpu) + cv_wait(&devctx->cv, &devctx->lock); + devctx->oncpu = 1; + mutex_exit(&devctx->lock); + } + + /* + * Call the contextmgt callback so that the driver can handle + * the fault. + */ + ret = (*ctxmgt)(dhp, dhp->dh_pvtp, off, len, type, rw); + + /* + * If devmap_access() returned -1, then there was a hardware + * error so we need to convert the return value to something + * that trap() will understand. Otherwise, the return value + * is already a fault code generated by devmap_unload() + * or devmap_load(). + */ + if (ret) { + TRACE_3(TR_FAC_DEVMAP, TR_DEVMAP_DO_CTXMGT_CK2, + "devmap_do_ctxmgt: ret=%x dhp=%p devctx=%p", + ret, dhp, devctx); + DEBUGF(1, (CE_CONT, "devmap_do_ctxmgt: ret %x dhp %p\n", + ret, (void *)dhp)); + if (devctx->oncpu) { + mutex_enter(&devctx->lock); + devctx->oncpu = 0; + cv_signal(&devctx->cv); + mutex_exit(&devctx->lock); + } + return (FC_HWERR); + } + + /* + * Setup the timeout if we need to + */ + if (do_timeout) { + mutex_enter(&devctx->lock); + if (dhp->dh_timeout_length > 0) { + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_DO_CTXMGT_CK3, + "devmap_do_ctxmgt:timeout set"); + devctx->timeout = timeout(devmap_ctxto, + devctx, dhp->dh_timeout_length); + } else { + /* + * We don't want to wait so set oncpu to + * 0 and wake up anyone waiting. 
+ */ + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_DO_CTXMGT_CK4, + "devmap_do_ctxmgt:timeout not set"); + devctx->oncpu = 0; + cv_signal(&devctx->cv); + } + mutex_exit(&devctx->lock); + } + + return (DDI_SUCCESS); +} + +/* + * end of mapping + * poff fault_offset | + * base | | | + * | | | | + * V V V V + * +-----------+---------------+-------+---------+-------+ + * ^ ^ ^ ^ + * |<--- offset--->|<-len->| | + * |<--- dh_len(size of mapping) --->| + * |<-- pg -->| + * -->|rlen|<-- + */ +static ulong_t +devmap_roundup(devmap_handle_t *dhp, ulong_t offset, size_t len, + ulong_t *opfn, ulong_t *pagesize) +{ + register int level; + ulong_t pg; + ulong_t poff; + ulong_t base; + caddr_t uvaddr; + long rlen; + + TRACE_3(TR_FAC_DEVMAP, TR_DEVMAP_ROUNDUP, + "devmap_roundup:start dhp=%p off=%lx len=%lx", + (void *)dhp, offset, len); + DEBUGF(2, (CE_CONT, "devmap_roundup: dhp %p off %lx len %lx\n", + (void *)dhp, offset, len)); + + /* + * get the max. pagesize that is aligned within the range + * <dh_pfn, dh_pfn+offset>. + * + * The calculations below use physical address to ddetermine + * the page size to use. The same calculations can use the + * virtual address to determine the page size. + */ + base = (ulong_t)ptob(dhp->dh_pfn); + for (level = dhp->dh_mmulevel; level >= 0; level--) { + pg = page_get_pagesize(level); + poff = ((base + offset) & ~(pg - 1)); + uvaddr = dhp->dh_uvaddr + (poff - base); + if ((poff >= base) && + ((poff + pg) <= (base + dhp->dh_len)) && + VA_PA_ALIGNED((uintptr_t)uvaddr, poff, pg)) + break; + } + + TRACE_3(TR_FAC_DEVMAP, TR_DEVMAP_ROUNDUP_CK1, + "devmap_roundup: base=%lx poff=%lx dhp=%p", + base, poff, dhp); + DEBUGF(2, (CE_CONT, "devmap_roundup: base %lx poff %lx pfn %lx\n", + base, poff, dhp->dh_pfn)); + + ASSERT(VA_PA_ALIGNED((uintptr_t)uvaddr, poff, pg)); + ASSERT(level >= 0); + + *pagesize = pg; + *opfn = dhp->dh_pfn + btop(poff - base); + + rlen = len + offset - (poff - base + pg); + + ASSERT(rlen < (long)len); + + TRACE_5(TR_FAC_DEVMAP, TR_DEVMAP_ROUNDUP_CK2, + "devmap_roundup:ret dhp=%p level=%x rlen=%lx psiz=%p opfn=%p", + (void *)dhp, level, rlen, pagesize, opfn); + DEBUGF(1, (CE_CONT, "devmap_roundup: dhp %p " + "level %x rlen %lx psize %lx opfn %lx\n", + (void *)dhp, level, rlen, *pagesize, *opfn)); + + return ((ulong_t)((rlen > 0) ? rlen : 0)); +} + +/* + * find the dhp that contains addr. + */ +static devmap_handle_t * +devmap_find_handle(devmap_handle_t *dhp_head, caddr_t addr) +{ + devmap_handle_t *dhp; + + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_FIND_HANDLE, + "devmap_find_handle:start"); + + dhp = dhp_head; + while (dhp) { + if (addr >= dhp->dh_uvaddr && + addr < (dhp->dh_uvaddr + dhp->dh_len)) + return (dhp); + dhp = dhp->dh_next; + } + + return ((devmap_handle_t *)NULL); +} + +/* + * devmap_unload: + * Marks a segdev segment or pages if offset->offset+len + * is not the entire segment as intercept and unloads the + * pages in the range offset -> offset+len. + */ +int +devmap_unload(devmap_cookie_t dhc, offset_t offset, size_t len) +{ + register devmap_handle_t *dhp = (devmap_handle_t *)dhc; + caddr_t addr; + ulong_t size; + ssize_t soff; + + TRACE_3(TR_FAC_DEVMAP, TR_DEVMAP_UNLOAD, + "devmap_unload:start dhp=%p offset=%llx len=%lx", + (void *)dhp, offset, len); + DEBUGF(7, (CE_CONT, "devmap_unload: dhp %p offset %llx len %lx\n", + (void *)dhp, offset, len)); + + soff = (ssize_t)(offset - dhp->dh_uoff); + soff = round_down_p2(soff, PAGESIZE); + if (soff < 0 || soff >= dhp->dh_len) + return (FC_MAKE_ERR(EINVAL)); + + /* + * Address and size must be page aligned. 
Len is set to the + * number of bytes in the number of pages that are required to + * support len. Offset is set to the byte offset of the first byte + * of the page that contains offset. + */ + len = round_up_p2(len, PAGESIZE); + + /* + * If len is == 0, then calculate the size by getting + * the number of bytes from offset to the end of the segment. + */ + if (len == 0) + size = dhp->dh_len - soff; + else { + size = len; + if ((soff + size) > dhp->dh_len) + return (FC_MAKE_ERR(EINVAL)); + } + + /* + * The address is offset bytes from the base address of + * the dhp. + */ + addr = (caddr_t)(soff + dhp->dh_uvaddr); + + /* + * If large page size was used in hat_devload(), + * the same page size must be used in hat_unload(). + */ + if (dhp->dh_flags & DEVMAP_FLAG_LARGE) { + hat_unload(dhp->dh_seg->s_as->a_hat, dhp->dh_uvaddr, + dhp->dh_len, HAT_UNLOAD|HAT_UNLOAD_OTHER); + } else { + hat_unload(dhp->dh_seg->s_as->a_hat, addr, size, + HAT_UNLOAD|HAT_UNLOAD_OTHER); + } + + return (0); +} + +/* + * calculates the optimal page size that will be used for hat_devload(). + */ +static void +devmap_get_large_pgsize(devmap_handle_t *dhp, size_t len, caddr_t addr, + size_t *llen, caddr_t *laddr) +{ + ulong_t off; + ulong_t pfn; + ulong_t pgsize; + uint_t first = 1; + + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_GET_LARGE_PGSIZE, + "devmap_get_large_pgsize:start"); + + /* + * RFE - Code only supports large page mappings for devmem + * This code could be changed in future if we want to support + * large page mappings for kernel exported memory. + */ + ASSERT(dhp_is_devmem(dhp)); + ASSERT(!(dhp->dh_flags & DEVMAP_MAPPING_INVALID)); + + *llen = 0; + off = (ulong_t)(addr - dhp->dh_uvaddr); + while ((long)len > 0) { + /* + * get the optimal pfn to minimize address translations. + * devmap_roundup() returns residue bytes for next round + * calculations. + */ + len = devmap_roundup(dhp, off, len, &pfn, &pgsize); + + if (first) { + *laddr = dhp->dh_uvaddr + ptob(pfn - dhp->dh_pfn); + first = 0; + } + + *llen += pgsize; + off = ptob(pfn - dhp->dh_pfn) + pgsize; + } + /* Large page mapping len/addr cover more range than orginal fault */ + ASSERT(*llen >= len && *laddr <= addr); + ASSERT((*laddr + *llen) >= (addr + len)); +} + +/* + * Initialize the devmap_softlock structure. + */ +static struct devmap_softlock * +devmap_softlock_init(dev_t dev, ulong_t id) +{ + struct devmap_softlock *slock; + struct devmap_softlock *tmp; + + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_SOFTLOCK_INIT, + "devmap_softlock_init:start"); + + tmp = kmem_zalloc(sizeof (struct devmap_softlock), KM_SLEEP); + mutex_enter(&devmap_slock); + + for (slock = devmap_slist; slock != NULL; slock = slock->next) + if ((slock->dev == dev) && (slock->id == id)) + break; + + if (slock == NULL) { + slock = tmp; + slock->dev = dev; + slock->id = id; + mutex_init(&slock->lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&slock->cv, NULL, CV_DEFAULT, NULL); + slock->next = devmap_slist; + devmap_slist = slock; + } else + kmem_free(tmp, sizeof (struct devmap_softlock)); + + mutex_enter(&slock->lock); + slock->refcnt++; + mutex_exit(&slock->lock); + mutex_exit(&devmap_slock); + + return (slock); +} + +/* + * Wake up processes that sleep on softlocked. + * Free dh_softlock if refcnt is 0. 
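+ * This drops the reference taken in devmap_softlock_init(); since a
+ * softlock structure is shared by every handle created with the same
+ * (dev, id) pair, it is destroyed only when the last such handle
+ * goes away.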
+ */ +static void +devmap_softlock_rele(devmap_handle_t *dhp) +{ + struct devmap_softlock *slock = dhp->dh_softlock; + struct devmap_softlock *tmp; + struct devmap_softlock *parent; + + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_SOFTLOCK_RELE, + "devmap_softlock_rele:start"); + + mutex_enter(&devmap_slock); + mutex_enter(&slock->lock); + + ASSERT(slock->refcnt > 0); + + slock->refcnt--; + + /* + * If no one is using the device, free up the slock data. + */ + if (slock->refcnt == 0) { + slock->softlocked = 0; + cv_signal(&slock->cv); + + if (devmap_slist == slock) + devmap_slist = slock->next; + else { + parent = devmap_slist; + for (tmp = devmap_slist->next; tmp != NULL; + tmp = tmp->next) { + if (tmp == slock) { + parent->next = tmp->next; + break; + } + parent = tmp; + } + } + mutex_exit(&slock->lock); + mutex_destroy(&slock->lock); + cv_destroy(&slock->cv); + kmem_free(slock, sizeof (struct devmap_softlock)); + } else + mutex_exit(&slock->lock); + + mutex_exit(&devmap_slock); +} + +/* + * Wake up processes that sleep on dh_ctx->locked. + * Free dh_ctx if refcnt is 0. + */ +static void +devmap_ctx_rele(devmap_handle_t *dhp) +{ + struct devmap_ctx *devctx = dhp->dh_ctx; + struct devmap_ctx *tmp; + struct devmap_ctx *parent; + timeout_id_t tid; + + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_CTX_RELE, + "devmap_ctx_rele:start"); + + mutex_enter(&devmapctx_lock); + mutex_enter(&devctx->lock); + + ASSERT(devctx->refcnt > 0); + + devctx->refcnt--; + + /* + * If no one is using the device, free up the devctx data. + */ + if (devctx->refcnt == 0) { + /* + * Untimeout any threads using this mapping as they are about + * to go away. + */ + if (devctx->timeout != 0) { + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_CTX_RELE_CK1, + "devmap_ctx_rele:untimeout ctx->timeout"); + + tid = devctx->timeout; + mutex_exit(&devctx->lock); + (void) untimeout(tid); + mutex_enter(&devctx->lock); + } + + devctx->oncpu = 0; + cv_signal(&devctx->cv); + + if (devmapctx_list == devctx) + devmapctx_list = devctx->next; + else { + parent = devmapctx_list; + for (tmp = devmapctx_list->next; tmp != NULL; + tmp = tmp->next) { + if (tmp == devctx) { + parent->next = tmp->next; + break; + } + parent = tmp; + } + } + mutex_exit(&devctx->lock); + mutex_destroy(&devctx->lock); + cv_destroy(&devctx->cv); + kmem_free(devctx, sizeof (struct devmap_ctx)); + } else + mutex_exit(&devctx->lock); + + mutex_exit(&devmapctx_lock); +} + +/* + * devmap_load: + * Marks a segdev segment or pages if offset->offset+len + * is not the entire segment as nointercept and faults in + * the pages in the range offset -> offset+len. + */ +int +devmap_load(devmap_cookie_t dhc, offset_t offset, size_t len, uint_t type, + uint_t rw) +{ + devmap_handle_t *dhp = (devmap_handle_t *)dhc; + struct as *asp = dhp->dh_seg->s_as; + caddr_t addr; + ulong_t size; + ssize_t soff; /* offset from the beginning of the segment */ + int rc; + + TRACE_3(TR_FAC_DEVMAP, TR_DEVMAP_LOAD, + "devmap_load:start dhp=%p offset=%llx len=%lx", + (void *)dhp, offset, len); + + DEBUGF(7, (CE_CONT, "devmap_load: dhp %p offset %llx len %lx\n", + (void *)dhp, offset, len)); + + /* + * Hat layer only supports devload to process' context for which + * the as lock is held. Verify here and return error if drivers + * inadvertently call devmap_load on a wrong devmap handle. 
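+ *
+ * As a hedged illustration (hypothetical driver code, not part of this
+ * file): a driver that needs no private context management typically
+ * forwards its devmap_access(9E) callback straight to devmap_load(),
+ * just as the framework's own devmap_default_access() does later in
+ * this file:
+ *
+ *	static int
+ *	xx_devmap_access(devmap_cookie_t dhp, void *pvtp, offset_t off,
+ *	    size_t len, uint_t type, uint_t rw)
+ *	{
+ *		return (devmap_load(dhp, off, len, type, rw));
+ *	}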
+ */ + if ((asp != &kas) && !AS_LOCK_HELD(asp, &asp->a_lock)) + return (FC_MAKE_ERR(EINVAL)); + + soff = (ssize_t)(offset - dhp->dh_uoff); + soff = round_down_p2(soff, PAGESIZE); + if (soff < 0 || soff >= dhp->dh_len) + return (FC_MAKE_ERR(EINVAL)); + + /* + * Address and size must be page aligned. Len is set to the + * number of bytes in the number of pages that are required to + * support len. Offset is set to the byte offset of the first byte + * of the page that contains offset. + */ + len = round_up_p2(len, PAGESIZE); + + /* + * If len == 0, then calculate the size by getting + * the number of bytes from offset to the end of the segment. + */ + if (len == 0) + size = dhp->dh_len - soff; + else { + size = len; + if ((soff + size) > dhp->dh_len) + return (FC_MAKE_ERR(EINVAL)); + } + + /* + * The address is offset bytes from the base address of + * the segment. + */ + addr = (caddr_t)(soff + dhp->dh_uvaddr); + + HOLD_DHP_LOCK(dhp); + rc = segdev_faultpages(asp->a_hat, + dhp->dh_seg, addr, size, type, rw, dhp); + RELE_DHP_LOCK(dhp); + return (rc); +} + +int +devmap_setup(dev_t dev, offset_t off, struct as *as, caddr_t *addrp, + size_t len, uint_t prot, uint_t maxprot, uint_t flags, struct cred *cred) +{ + register devmap_handle_t *dhp; + int (*devmap)(dev_t, devmap_cookie_t, offset_t, size_t, + size_t *, uint_t); + int (*mmap)(dev_t, off_t, int); + struct devmap_callback_ctl *callbackops; + devmap_handle_t *dhp_head = NULL; + devmap_handle_t *dhp_prev = NULL; + devmap_handle_t *dhp_curr; + caddr_t addr; + int map_flag; + int ret; + ulong_t total_len; + size_t map_len; + size_t resid_len = len; + offset_t map_off = off; + struct devmap_softlock *slock = NULL; + +#ifdef lint + cred = cred; +#endif + + TRACE_2(TR_FAC_DEVMAP, TR_DEVMAP_SETUP, + "devmap_setup:start off=%llx len=%lx", off, len); + DEBUGF(3, (CE_CONT, "devmap_setup: off %llx len %lx\n", + off, len)); + + devmap = devopsp[getmajor(dev)]->devo_cb_ops->cb_devmap; + mmap = devopsp[getmajor(dev)]->devo_cb_ops->cb_mmap; + + /* + * driver must provide devmap(9E) entry point in cb_ops to use the + * devmap framework. + */ + if (devmap == NULL || devmap == nulldev || devmap == nodev) + return (EINVAL); + + /* + * To protect from an inadvertent entry because the devmap entry point + * is not NULL, return error if D_DEVMAP bit is not set in cb_flag and + * mmap is NULL. + */ + map_flag = devopsp[getmajor(dev)]->devo_cb_ops->cb_flag; + if ((map_flag & D_DEVMAP) == 0 && (mmap == NULL || mmap == nulldev)) + return (EINVAL); + + /* + * devmap allows mmap(2) to map multiple registers. + * one devmap_handle is created for each register mapped. + */ + for (total_len = 0; total_len < len; total_len += map_len) { + dhp = kmem_zalloc(sizeof (devmap_handle_t), KM_SLEEP); + + if (dhp_prev != NULL) + dhp_prev->dh_next = dhp; + else + dhp_head = dhp; + dhp_prev = dhp; + + dhp->dh_prot = prot; + dhp->dh_orig_maxprot = dhp->dh_maxprot = maxprot; + dhp->dh_dev = dev; + dhp->dh_timeout_length = CTX_TIMEOUT_VALUE; + dhp->dh_uoff = map_off; + + /* + * Get mapping specific info from + * the driver, such as rnumber, roff, len, callbackops, + * accattrp and, if the mapping is for kernel memory, + * ddi_umem_cookie. 
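+ *
+ * A minimal sketch of such a driver devmap(9E) entry point (hypothetical
+ * code; xx_dip, XX_RNUMBER and xx_acc_attr are assumptions, not part of
+ * this file):
+ *
+ *	static int
+ *	xx_devmap(dev_t dev, devmap_cookie_t dhp, offset_t off,
+ *	    size_t len, size_t *maplen, uint_t model)
+ *	{
+ *		size_t length = ptob(btopr(len));
+ *
+ *		if (devmap_devmem_setup(dhp, xx_dip, NULL, XX_RNUMBER,
+ *		    off, length, PROT_ALL, 0, &xx_acc_attr) != DDI_SUCCESS)
+ *			return (ENXIO);
+ *		*maplen = length;
+ *		return (0);
+ *	}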
+ */ + if ((ret = cdev_devmap(dev, dhp, map_off, + resid_len, &map_len, get_udatamodel())) != 0) { + free_devmap_handle(dhp_head); + return (ENXIO); + } + + if (map_len & PAGEOFFSET) { + free_devmap_handle(dhp_head); + return (EINVAL); + } + + callbackops = &dhp->dh_callbackops; + + if ((callbackops->devmap_access == NULL) || + (callbackops->devmap_access == nulldev) || + (callbackops->devmap_access == nodev)) { + /* + * Normally devmap does not support MAP_PRIVATE unless + * the drivers provide a valid devmap_access routine. + */ + if ((flags & MAP_PRIVATE) != 0) { + free_devmap_handle(dhp_head); + return (EINVAL); + } + } else { + /* + * Initialize dhp_softlock and dh_ctx if the drivers + * provide devmap_access. + */ + dhp->dh_softlock = devmap_softlock_init(dev, + (ulong_t)callbackops->devmap_access); + dhp->dh_ctx = devmap_ctxinit(dev, + (ulong_t)callbackops->devmap_access); + + /* + * segdev_fault can only work when all + * dh_softlock in a multi-dhp mapping + * are same. see comments in segdev_fault + * This code keeps track of the first + * dh_softlock allocated in slock and + * compares all later allocations and if + * not similar, returns an error. + */ + if (slock == NULL) + slock = dhp->dh_softlock; + if (slock != dhp->dh_softlock) { + free_devmap_handle(dhp_head); + return (ENOTSUP); + } + } + + map_off += map_len; + resid_len -= map_len; + } + + /* + * get the user virtual address and establish the mapping between + * uvaddr and device physical address. + */ + if ((ret = devmap_device(dhp_head, as, addrp, off, len, flags)) + != 0) { + /* + * free devmap handles if error during the mapping. + */ + free_devmap_handle(dhp_head); + + return (ret); + } + + /* + * call the driver's devmap_map callback to do more after the mapping, + * such as to allocate driver private data for context management. + */ + dhp = dhp_head; + map_off = off; + addr = *addrp; + while (dhp != NULL) { + callbackops = &dhp->dh_callbackops; + dhp->dh_uvaddr = addr; + dhp_curr = dhp; + if (callbackops->devmap_map != NULL) { + ret = (*callbackops->devmap_map)((devmap_cookie_t)dhp, + dev, flags, map_off, + dhp->dh_len, &dhp->dh_pvtp); + if (ret != 0) { + struct segdev_data *sdp; + + /* + * call driver's devmap_unmap entry point + * to free driver resources. + */ + dhp = dhp_head; + map_off = off; + while (dhp != dhp_curr) { + callbackops = &dhp->dh_callbackops; + if (callbackops->devmap_unmap != NULL) { + (*callbackops->devmap_unmap)( + dhp, dhp->dh_pvtp, + map_off, dhp->dh_len, + NULL, NULL, NULL, NULL); + } + map_off += dhp->dh_len; + dhp = dhp->dh_next; + } + sdp = dhp_head->dh_seg->s_data; + sdp->devmap_data = NULL; + free_devmap_handle(dhp_head); + return (ENXIO); + } + } + map_off += dhp->dh_len; + addr += dhp->dh_len; + dhp = dhp->dh_next; + } + + return (0); +} + +int +ddi_devmap_segmap(dev_t dev, off_t off, ddi_as_handle_t as, caddr_t *addrp, + off_t len, uint_t prot, uint_t maxprot, uint_t flags, struct cred *cred) +{ + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_SEGMAP, + "devmap_segmap:start"); + return (devmap_setup(dev, (offset_t)off, (struct as *)as, addrp, + (size_t)len, prot, maxprot, flags, cred)); +} + +/* + * Called from devmap_devmem_setup/remap to see if can use large pages for + * this device mapping. + * Also calculate the max. page size for this mapping. + * this page size will be used in fault routine for + * optimal page size calculations. 
+ */ +static void +devmap_devmem_large_page_setup(devmap_handle_t *dhp) +{ + ASSERT(dhp_is_devmem(dhp)); + dhp->dh_mmulevel = 0; + + /* + * use large page size only if: + * 1. device memory. + * 2. mmu supports multiple page sizes, + * 3. Driver did not disallow it + * 4. dhp length is at least as big as the large pagesize + * 5. the uvaddr and pfn are large pagesize aligned + */ + if (page_num_pagesizes() > 1 && + !(dhp->dh_flags & (DEVMAP_USE_PAGESIZE | DEVMAP_MAPPING_INVALID))) { + ulong_t base; + int level; + + base = (ulong_t)ptob(dhp->dh_pfn); + for (level = 1; level < page_num_pagesizes(); level++) { + size_t pgsize = page_get_pagesize(level); + if ((dhp->dh_len < pgsize) || + (!VA_PA_PGSIZE_ALIGNED((uintptr_t)dhp->dh_uvaddr, + base, pgsize))) { + break; + } + } + dhp->dh_mmulevel = level - 1; + } + if (dhp->dh_mmulevel > 0) { + dhp->dh_flags |= DEVMAP_FLAG_LARGE; + } else { + dhp->dh_flags &= ~DEVMAP_FLAG_LARGE; + } +} + +/* + * Called by driver devmap routine to pass device specific info to + * the framework. used for device memory mapping only. + */ +int +devmap_devmem_setup(devmap_cookie_t dhc, dev_info_t *dip, + struct devmap_callback_ctl *callbackops, uint_t rnumber, offset_t roff, + size_t len, uint_t maxprot, uint_t flags, ddi_device_acc_attr_t *accattrp) +{ + devmap_handle_t *dhp = (devmap_handle_t *)dhc; + ddi_acc_handle_t handle; + ddi_map_req_t mr; + ddi_acc_hdl_t *hp; + int err; + + TRACE_4(TR_FAC_DEVMAP, TR_DEVMAP_DEVMEM_SETUP, + "devmap_devmem_setup:start dhp=%p offset=%llx rnum=%d len=%lx", + (void *)dhp, roff, rnumber, (uint_t)len); + DEBUGF(2, (CE_CONT, "devmap_devmem_setup: dhp %p offset %llx " + "rnum %d len %lx\n", (void *)dhp, roff, rnumber, len)); + + /* + * First to check if this function has been called for this dhp. + */ + if (dhp->dh_flags & DEVMAP_SETUP_DONE) + return (DDI_FAILURE); + + if ((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) != dhp->dh_prot) + return (DDI_FAILURE); + + if (flags & DEVMAP_MAPPING_INVALID) { + /* + * Don't go up the tree to get pfn if the driver specifies + * DEVMAP_MAPPING_INVALID in flags. + * + * If DEVMAP_MAPPING_INVALID is specified, we have to grant + * remap permission. + */ + if (!(flags & DEVMAP_ALLOW_REMAP)) { + return (DDI_FAILURE); + } + dhp->dh_pfn = PFN_INVALID; + } else { + handle = impl_acc_hdl_alloc(KM_SLEEP, NULL); + if (handle == NULL) + return (DDI_FAILURE); + + hp = impl_acc_hdl_get(handle); + hp->ah_vers = VERS_ACCHDL; + hp->ah_dip = dip; + hp->ah_rnumber = rnumber; + hp->ah_offset = roff; + hp->ah_len = len; + if (accattrp != NULL) + hp->ah_acc = *accattrp; + + mr.map_op = DDI_MO_MAP_LOCKED; + mr.map_type = DDI_MT_RNUMBER; + mr.map_obj.rnumber = rnumber; + mr.map_prot = maxprot & dhp->dh_orig_maxprot; + mr.map_flags = DDI_MF_DEVICE_MAPPING; + mr.map_handlep = hp; + mr.map_vers = DDI_MAP_VERSION; + + /* + * up the device tree to get pfn. + * The rootnex_map_regspec() routine in nexus drivers has been + * modified to return pfn if map_flags is DDI_MF_DEVICE_MAPPING. 
+ */ + err = ddi_map(dip, &mr, roff, len, (caddr_t *)&dhp->dh_pfn); + dhp->dh_hat_attr = hp->ah_hat_flags; + impl_acc_hdl_free(handle); + + if (err) + return (DDI_FAILURE); + } + /* Should not be using devmem setup for memory pages */ + ASSERT(!pf_is_memory(dhp->dh_pfn)); + + /* Only some of the flags bits are settable by the driver */ + dhp->dh_flags |= (flags & DEVMAP_SETUP_FLAGS); + dhp->dh_len = ptob(btopr(len)); + + dhp->dh_cookie = DEVMAP_DEVMEM_COOKIE; + dhp->dh_roff = ptob(btop(roff)); + + /* setup the dh_mmulevel and DEVMAP_FLAG_LARGE */ + devmap_devmem_large_page_setup(dhp); + dhp->dh_maxprot = maxprot & dhp->dh_orig_maxprot; + ASSERT((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) == dhp->dh_prot); + + + if (callbackops != NULL) { + bcopy(callbackops, &dhp->dh_callbackops, + sizeof (struct devmap_callback_ctl)); + } + + /* + * Initialize dh_lock if we want to do remap. + */ + if (dhp->dh_flags & DEVMAP_ALLOW_REMAP) { + mutex_init(&dhp->dh_lock, NULL, MUTEX_DEFAULT, NULL); + dhp->dh_flags |= DEVMAP_LOCK_INITED; + } + + dhp->dh_flags |= DEVMAP_SETUP_DONE; + + return (DDI_SUCCESS); +} + +int +devmap_devmem_remap(devmap_cookie_t dhc, dev_info_t *dip, + uint_t rnumber, offset_t roff, size_t len, uint_t maxprot, + uint_t flags, ddi_device_acc_attr_t *accattrp) +{ + devmap_handle_t *dhp = (devmap_handle_t *)dhc; + ddi_acc_handle_t handle; + ddi_map_req_t mr; + ddi_acc_hdl_t *hp; + pfn_t pfn; + uint_t hat_flags; + int err; + + TRACE_4(TR_FAC_DEVMAP, TR_DEVMAP_DEVMEM_REMAP, + "devmap_devmem_setup:start dhp=%p offset=%llx rnum=%d len=%lx", + (void *)dhp, roff, rnumber, (uint_t)len); + DEBUGF(2, (CE_CONT, "devmap_devmem_remap: dhp %p offset %llx " + "rnum %d len %lx\n", (void *)dhp, roff, rnumber, len)); + + /* + * Return failure if setup has not been done or no remap permission + * has been granted during the setup. + */ + if ((dhp->dh_flags & DEVMAP_SETUP_DONE) == 0 || + (dhp->dh_flags & DEVMAP_ALLOW_REMAP) == 0) + return (DDI_FAILURE); + + /* Only DEVMAP_MAPPING_INVALID flag supported for remap */ + if ((flags != 0) && (flags != DEVMAP_MAPPING_INVALID)) + return (DDI_FAILURE); + + if ((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) != dhp->dh_prot) + return (DDI_FAILURE); + + if (!(flags & DEVMAP_MAPPING_INVALID)) { + handle = impl_acc_hdl_alloc(KM_SLEEP, NULL); + if (handle == NULL) + return (DDI_FAILURE); + } + + HOLD_DHP_LOCK(dhp); + + /* + * Unload the old mapping, so next fault will setup the new mappings + * Do this while holding the dhp lock so other faults dont reestablish + * the mappings + */ + hat_unload(dhp->dh_seg->s_as->a_hat, dhp->dh_uvaddr, + dhp->dh_len, HAT_UNLOAD|HAT_UNLOAD_OTHER); + + if (flags & DEVMAP_MAPPING_INVALID) { + dhp->dh_flags |= DEVMAP_MAPPING_INVALID; + dhp->dh_pfn = PFN_INVALID; + } else { + /* clear any prior DEVMAP_MAPPING_INVALID flag */ + dhp->dh_flags &= ~DEVMAP_MAPPING_INVALID; + hp = impl_acc_hdl_get(handle); + hp->ah_vers = VERS_ACCHDL; + hp->ah_dip = dip; + hp->ah_rnumber = rnumber; + hp->ah_offset = roff; + hp->ah_len = len; + if (accattrp != NULL) + hp->ah_acc = *accattrp; + + mr.map_op = DDI_MO_MAP_LOCKED; + mr.map_type = DDI_MT_RNUMBER; + mr.map_obj.rnumber = rnumber; + mr.map_prot = maxprot & dhp->dh_orig_maxprot; + mr.map_flags = DDI_MF_DEVICE_MAPPING; + mr.map_handlep = hp; + mr.map_vers = DDI_MAP_VERSION; + + /* + * up the device tree to get pfn. + * The rootnex_map_regspec() routine in nexus drivers has been + * modified to return pfn if map_flags is DDI_MF_DEVICE_MAPPING. 
+ */ + err = ddi_map(dip, &mr, roff, len, (caddr_t *)&pfn); + hat_flags = hp->ah_hat_flags; + impl_acc_hdl_free(handle); + if (err) { + RELE_DHP_LOCK(dhp); + return (DDI_FAILURE); + } + /* + * Store result of ddi_map first in local variables, as we do + * not want to overwrite the existing dhp with wrong data. + */ + dhp->dh_pfn = pfn; + dhp->dh_hat_attr = hat_flags; + } + + /* clear the large page size flag */ + dhp->dh_flags &= ~DEVMAP_FLAG_LARGE; + + dhp->dh_cookie = DEVMAP_DEVMEM_COOKIE; + dhp->dh_roff = ptob(btop(roff)); + + /* setup the dh_mmulevel and DEVMAP_FLAG_LARGE */ + devmap_devmem_large_page_setup(dhp); + dhp->dh_maxprot = maxprot & dhp->dh_orig_maxprot; + ASSERT((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) == dhp->dh_prot); + + RELE_DHP_LOCK(dhp); + return (DDI_SUCCESS); +} + +/* + * called by driver devmap routine to pass kernel virtual address mapping + * info to the framework. used only for kernel memory + * allocated from ddi_umem_alloc(). + */ +int +devmap_umem_setup(devmap_cookie_t dhc, dev_info_t *dip, + struct devmap_callback_ctl *callbackops, ddi_umem_cookie_t cookie, + offset_t off, size_t len, uint_t maxprot, uint_t flags, + ddi_device_acc_attr_t *accattrp) +{ + devmap_handle_t *dhp = (devmap_handle_t *)dhc; + struct ddi_umem_cookie *cp = (struct ddi_umem_cookie *)cookie; + +#ifdef lint + dip = dip; + accattrp = accattrp; +#endif + + TRACE_4(TR_FAC_DEVMAP, TR_DEVMAP_UMEM_SETUP, + "devmap_umem_setup:start dhp=%p offset=%llx cookie=%p len=%lx", + (void *)dhp, off, cookie, len); + DEBUGF(2, (CE_CONT, "devmap_umem_setup: dhp %p offset %llx " + "cookie %p len %lx\n", (void *)dhp, off, (void *)cookie, len)); + + if (cookie == NULL) + return (DDI_FAILURE); + + /* For UMEM_TRASH, this restriction is not needed */ + if ((off + len) > cp->size) + return (DDI_FAILURE); + + /* + * First to check if this function has been called for this dhp. + */ + if (dhp->dh_flags & DEVMAP_SETUP_DONE) + return (DDI_FAILURE); + + if ((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) != dhp->dh_prot) + return (DDI_FAILURE); + + if (flags & DEVMAP_MAPPING_INVALID) { + /* + * If DEVMAP_MAPPING_INVALID is specified, we have to grant + * remap permission. + */ + if (!(flags & DEVMAP_ALLOW_REMAP)) { + return (DDI_FAILURE); + } + } else { + dhp->dh_cookie = cookie; + dhp->dh_roff = ptob(btop(off)); + dhp->dh_cvaddr = cp->cvaddr + dhp->dh_roff; + } + + /* + * The default is _not_ to pass HAT_LOAD_NOCONSIST to hat_devload(); + * we pass HAT_LOAD_NOCONSIST _only_ in cases where hat tries to + * create consistent mappings but our intention was to create + * non-consistent mappings. + * + * DEVMEM: hat figures it out it's DEVMEM and creates non-consistent + * mappings. + * + * kernel exported memory: hat figures it out it's memory and always + * creates consistent mappings. + * + * /dev/mem: non-consistent mappings. See comments in common/io/mem.c + * + * /dev/kmem: consistent mappings are created unless they are + * MAP_FIXED. We _explicitly_ tell hat to create non-consistent + * mappings by passing HAT_LOAD_NOCONSIST in case of MAP_FIXED + * mappings of /dev/kmem. 
See common/io/mem.c + */ + + /* Only some of the flags bits are settable by the driver */ + dhp->dh_flags |= (flags & DEVMAP_SETUP_FLAGS); + + dhp->dh_len = ptob(btopr(len)); + dhp->dh_maxprot = maxprot & dhp->dh_orig_maxprot; + ASSERT((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) == dhp->dh_prot); + + if (callbackops != NULL) { + bcopy(callbackops, &dhp->dh_callbackops, + sizeof (struct devmap_callback_ctl)); + } + /* + * Initialize dh_lock if we want to do remap. + */ + if (dhp->dh_flags & DEVMAP_ALLOW_REMAP) { + mutex_init(&dhp->dh_lock, NULL, MUTEX_DEFAULT, NULL); + dhp->dh_flags |= DEVMAP_LOCK_INITED; + } + + dhp->dh_flags |= DEVMAP_SETUP_DONE; + + return (DDI_SUCCESS); +} + +int +devmap_umem_remap(devmap_cookie_t dhc, dev_info_t *dip, + ddi_umem_cookie_t cookie, offset_t off, size_t len, uint_t maxprot, + uint_t flags, ddi_device_acc_attr_t *accattrp) +{ + devmap_handle_t *dhp = (devmap_handle_t *)dhc; + struct ddi_umem_cookie *cp = (struct ddi_umem_cookie *)cookie; + + TRACE_4(TR_FAC_DEVMAP, TR_DEVMAP_UMEM_REMAP, + "devmap_umem_remap:start dhp=%p offset=%llx cookie=%p len=%lx", + (void *)dhp, off, cookie, len); + DEBUGF(2, (CE_CONT, "devmap_umem_remap: dhp %p offset %llx " + "cookie %p len %lx\n", (void *)dhp, off, (void *)cookie, len)); + +#ifdef lint + dip = dip; + accattrp = accattrp; +#endif + /* + * Reture failure if setup has not been done or no remap permission + * has been granted during the setup. + */ + if ((dhp->dh_flags & DEVMAP_SETUP_DONE) == 0 || + (dhp->dh_flags & DEVMAP_ALLOW_REMAP) == 0) + return (DDI_FAILURE); + + /* No flags supported for remap yet */ + if (flags != 0) + return (DDI_FAILURE); + + if ((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) != dhp->dh_prot) + return (DDI_FAILURE); + + /* For UMEM_TRASH, this restriction is not needed */ + if ((off + len) > cp->size) + return (DDI_FAILURE); + + HOLD_DHP_LOCK(dhp); + /* + * Unload the old mapping, so next fault will setup the new mappings + * Do this while holding the dhp lock so other faults dont reestablish + * the mappings + */ + hat_unload(dhp->dh_seg->s_as->a_hat, dhp->dh_uvaddr, + dhp->dh_len, HAT_UNLOAD|HAT_UNLOAD_OTHER); + + dhp->dh_cookie = cookie; + dhp->dh_roff = ptob(btop(off)); + dhp->dh_cvaddr = cp->cvaddr + dhp->dh_roff; + + /* clear the large page size flag */ + dhp->dh_flags &= ~DEVMAP_FLAG_LARGE; + + dhp->dh_maxprot = maxprot & dhp->dh_orig_maxprot; + ASSERT((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) == dhp->dh_prot); + RELE_DHP_LOCK(dhp); + return (DDI_SUCCESS); +} + +/* + * to set timeout value for the driver's context management callback, e.g. + * devmap_access(). + */ +void +devmap_set_ctx_timeout(devmap_cookie_t dhc, clock_t ticks) +{ + devmap_handle_t *dhp = (devmap_handle_t *)dhc; + + TRACE_2(TR_FAC_DEVMAP, TR_DEVMAP_SET_CTX_TIMEOUT, + "devmap_set_ctx_timeout:start dhp=%p ticks=%x", + (void *)dhp, ticks); + dhp->dh_timeout_length = ticks; +} + +int +devmap_default_access(devmap_cookie_t dhp, void *pvtp, offset_t off, + size_t len, uint_t type, uint_t rw) +{ +#ifdef lint + pvtp = pvtp; +#endif + + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_DEFAULT_ACCESS, + "devmap_default_access:start"); + return (devmap_load(dhp, off, len, type, rw)); +} + +/* + * segkmem_alloc() wrapper to allocate memory which is both + * non-relocatable (for DR) and sharelocked, since the rest + * of this segment driver requires it. 
+ */ +static void * +devmap_alloc_pages(vmem_t *vmp, size_t size, int vmflag) +{ + ASSERT(vmp != NULL); + ASSERT(kvseg.s_base != NULL); + vmflag |= (VM_NORELOC | SEGKMEM_SHARELOCKED); + return (segkmem_alloc(vmp, size, vmflag)); +} + +/* + * This is where things are a bit incestrous with seg_kmem: unlike + * seg_kp, seg_kmem does not keep its pages long-term sharelocked, so + * we need to do a bit of a dance around that to prevent duplication of + * code until we decide to bite the bullet and implement a new kernel + * segment for driver-allocated memory that is exported to user space. + */ +static void +devmap_free_pages(vmem_t *vmp, void *inaddr, size_t size) +{ + page_t *pp; + caddr_t addr = inaddr; + caddr_t eaddr; + pgcnt_t npages = btopr(size); + + ASSERT(vmp != NULL); + ASSERT(kvseg.s_base != NULL); + ASSERT(((uintptr_t)addr & PAGEOFFSET) == 0); + + hat_unload(kas.a_hat, addr, size, HAT_UNLOAD_UNLOCK); + + for (eaddr = addr + size; addr < eaddr; addr += PAGESIZE) { + /* + * Use page_find() instead of page_lookup() to find the page + * since we know that it is hashed and has a shared lock. + */ + pp = page_find(&kvp, (u_offset_t)(uintptr_t)addr); + + if (pp == NULL) + panic("devmap_free_pages: page not found"); + if (!page_tryupgrade(pp)) { + page_unlock(pp); + pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)addr, + SE_EXCL); + if (pp == NULL) + panic("devmap_free_pages: page already freed"); + } + /* Clear p_lckcnt so page_destroy() doesn't update availrmem */ + pp->p_lckcnt = 0; + page_destroy(pp, 0); + } + page_unresv(npages); + + if (vmp != NULL) + vmem_free(vmp, inaddr, size); +} + +/* + * devmap_umem_alloc_np() replaces kmem_zalloc() as the method for + * allocating non-pageable kmem in response to a ddi_umem_alloc() + * default request. For now we allocate our own pages and we keep + * them long-term sharelocked, since: A) the fault routines expect the + * memory to already be locked; B) pageable umem is already long-term + * locked; C) it's a lot of work to make it otherwise, particuarly + * since the nexus layer expects the pages to never fault. An RFE is to + * not keep the pages long-term locked, but instead to be able to + * take faults on them and simply look them up in kvp in case we + * fault on them. Even then, we must take care not to let pageout + * steal them from us since the data must remain resident; if we + * do this we must come up with some way to pin the pages to prevent + * faults while a driver is doing DMA to/from them. + */ +static void * +devmap_umem_alloc_np(size_t size, size_t flags) +{ + void *buf; + int vmflags = (flags & DDI_UMEM_NOSLEEP)? VM_NOSLEEP : VM_SLEEP; + + buf = vmem_alloc(umem_np_arena, size, vmflags); + if (buf != NULL) + bzero(buf, size); + return (buf); +} + +static void +devmap_umem_free_np(void *addr, size_t size) +{ + vmem_free(umem_np_arena, addr, size); +} + +/* + * allocate page aligned kernel memory for exporting to user land. + * The devmap framework will use the cookie allocated by ddi_umem_alloc() + * to find a user virtual address that is in same color as the address + * allocated here. 
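+ *
+ * A hedged usage sketch (hypothetical driver code; xx_softc, xx_dip and
+ * the error handling are assumptions, not part of this file). The driver
+ * first allocates the memory and keeps the cookie:
+ *
+ *	xx_softc->xx_kva = ddi_umem_alloc(ptob(btopr(size)),
+ *	    DDI_UMEM_NOSLEEP, &xx_softc->xx_cookie);
+ *	if (xx_softc->xx_kva == NULL)
+ *		return (ENOMEM);
+ *
+ * and later hands the same cookie back to the framework from its
+ * devmap(9E) entry point:
+ *
+ *	if (devmap_umem_setup(dhp, xx_dip, NULL, xx_softc->xx_cookie,
+ *	    off, len, PROT_ALL, 0, NULL) != DDI_SUCCESS)
+ *		return (ENXIO);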
+ */ +void * +ddi_umem_alloc(size_t size, int flags, ddi_umem_cookie_t *cookie) +{ + register size_t len = ptob(btopr(size)); + void *buf = NULL; + struct ddi_umem_cookie *cp; + int iflags = 0; + + *cookie = NULL; + + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_UMEM_ALLOC, + "devmap_umem_alloc:start"); + if (len == 0) + return ((void *)NULL); + + /* + * allocate cookie + */ + if ((cp = kmem_zalloc(sizeof (struct ddi_umem_cookie), + flags & DDI_UMEM_NOSLEEP ? KM_NOSLEEP : KM_SLEEP)) == NULL) { + ASSERT(flags & DDI_UMEM_NOSLEEP); + return ((void *)NULL); + } + + if (flags & DDI_UMEM_PAGEABLE) { + /* Only one of the flags is allowed */ + ASSERT(!(flags & DDI_UMEM_TRASH)); + /* initialize resource with 0 */ + iflags = KPD_ZERO; + + /* + * to allocate unlocked pageable memory, use segkp_get() to + * create a segkp segment. Since segkp can only service kas, + * other segment drivers such as segdev have to do + * as_fault(segkp, SOFTLOCK) in its fault routine, + */ + if (flags & DDI_UMEM_NOSLEEP) + iflags |= KPD_NOWAIT; + + if ((buf = segkp_get(segkp, len, iflags)) == NULL) { + kmem_free(cp, sizeof (struct ddi_umem_cookie)); + return ((void *)NULL); + } + cp->type = KMEM_PAGEABLE; + mutex_init(&cp->lock, NULL, MUTEX_DEFAULT, NULL); + cp->locked = 0; + } else if (flags & DDI_UMEM_TRASH) { + /* Only one of the flags is allowed */ + ASSERT(!(flags & DDI_UMEM_PAGEABLE)); + cp->type = UMEM_TRASH; + buf = NULL; + } else { + if ((buf = devmap_umem_alloc_np(len, flags)) == NULL) { + kmem_free(cp, sizeof (struct ddi_umem_cookie)); + return ((void *)NULL); + } + + cp->type = KMEM_NON_PAGEABLE; + } + + /* + * need to save size here. size will be used when + * we do kmem_free. + */ + cp->size = len; + cp->cvaddr = (caddr_t)buf; + + *cookie = (void *)cp; + return (buf); +} + +void +ddi_umem_free(ddi_umem_cookie_t cookie) +{ + struct ddi_umem_cookie *cp; + + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_UMEM_FREE, + "devmap_umem_free:start"); + + /* + * if cookie is NULL, no effects on the system + */ + if (cookie == NULL) + return; + + cp = (struct ddi_umem_cookie *)cookie; + + switch (cp->type) { + case KMEM_PAGEABLE : + ASSERT(cp->cvaddr != NULL && cp->size != 0); + /* + * Check if there are still any pending faults on the cookie + * while the driver is deleting it, + * XXX - could change to an ASSERT but wont catch errant drivers + */ + mutex_enter(&cp->lock); + if (cp->locked) { + mutex_exit(&cp->lock); + panic("ddi_umem_free for cookie with pending faults %p", + (void *)cp); + return; + } + + segkp_release(segkp, cp->cvaddr); + + /* + * release mutex associated with this cookie. 
+ */ + mutex_destroy(&cp->lock); + break; + case KMEM_NON_PAGEABLE : + ASSERT(cp->cvaddr != NULL && cp->size != 0); + devmap_umem_free_np(cp->cvaddr, cp->size); + break; + case UMEM_TRASH : + break; + case UMEM_LOCKED : + /* Callers should use ddi_umem_unlock for this type */ + ddi_umem_unlock(cookie); + /* Frees the cookie too */ + return; + default: + /* panic so we can diagnose the underlying cause */ + panic("ddi_umem_free: illegal cookie type 0x%x\n", + cp->type); + } + + kmem_free(cookie, sizeof (struct ddi_umem_cookie)); +} + + +static int +segdev_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp) +{ + struct segdev_data *sdp = (struct segdev_data *)seg->s_data; + + /* + * It looks as if it is always mapped shared + */ + TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_GETMEMID, + "segdev_getmemid:start"); + memidp->val[0] = (uintptr_t)VTOCVP(sdp->vp); + memidp->val[1] = sdp->offset + (uintptr_t)(addr - seg->s_base); + return (0); +} + +/*ARGSUSED*/ +static lgrp_mem_policy_info_t * +segdev_getpolicy(struct seg *seg, caddr_t addr) +{ + return (NULL); +} + +/* + * ddi_umem_alloc() non-pageable quantum cache max size. + * This is just a SWAG. + */ +#define DEVMAP_UMEM_QUANTUM (8*PAGESIZE) + +/* + * Initialize seg_dev from boot. This routine sets up the trash page + * and creates the umem_np_arena used to back non-pageable memory + * requests. + */ +void +segdev_init(void) +{ + struct seg kseg; + + umem_np_arena = vmem_create("umem_np", NULL, 0, PAGESIZE, + devmap_alloc_pages, devmap_free_pages, heap_arena, + DEVMAP_UMEM_QUANTUM, VM_SLEEP); + + kseg.s_as = &kas; + trashpp = page_create_va(&trashvp, 0, PAGESIZE, + PG_NORELOC | PG_EXCL | PG_WAIT, &kseg, NULL); + if (trashpp == NULL) + panic("segdev_init: failed to create trash page"); + pagezero(trashpp, 0, PAGESIZE); + page_downgrade(trashpp); +} + +/* + * Invoke platform-dependent support routines so that /proc can have + * the platform code deal with curious hardware. + */ +int +segdev_copyfrom(struct seg *seg, + caddr_t uaddr, const void *devaddr, void *kaddr, size_t len) +{ + struct segdev_data *sdp = (struct segdev_data *)seg->s_data; + struct snode *sp = VTOS(VTOCVP(sdp->vp)); + + return (e_ddi_copyfromdev(sp->s_dip, + (off_t)(uaddr - seg->s_base), devaddr, kaddr, len)); +} + +int +segdev_copyto(struct seg *seg, + caddr_t uaddr, const void *kaddr, void *devaddr, size_t len) +{ + struct segdev_data *sdp = (struct segdev_data *)seg->s_data; + struct snode *sp = VTOS(VTOCVP(sdp->vp)); + + return (e_ddi_copytodev(sp->s_dip, + (off_t)(uaddr - seg->s_base), kaddr, devaddr, len)); +} diff --git a/usr/src/uts/common/vm/seg_dev.h b/usr/src/uts/common/vm/seg_dev.h new file mode 100644 index 0000000000..c498c06ecf --- /dev/null +++ b/usr/src/uts/common/vm/seg_dev.h @@ -0,0 +1,131 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#ifndef _VM_SEG_DEV_H +#define _VM_SEG_DEV_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/project.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Structure whose pointer is passed to the segdev_create routine + */ +struct segdev_crargs { + offset_t offset; /* starting offset */ + int (*mapfunc)(dev_t dev, off_t off, int prot); /* map function */ + dev_t dev; /* device number */ + uchar_t type; /* type of sharing done */ + uchar_t prot; /* protection */ + uchar_t maxprot; /* maximum protection */ + uint_t hat_attr; /* hat attr */ + uint_t hat_flags; /* currently, hat_flags is used ONLY for */ + /* HAT_LOAD_NOCONSIST; in future, it can be */ + /* expanded to include any flags that are */ + /* not already part of hat_attr */ + void *devmap_data; /* devmap_handle private data */ +}; + +/* + * (Semi) private data maintained by the seg_dev driver per segment mapping + * + * The segment lock is necessary to protect fields that are modified + * when the "read" version of the address space lock is held. This lock + * is not needed when the segment operation has the "write" version of + * the address space lock (it would be redundant). + * + * The following fields in segdev_data are read-only when the address + * space is "read" locked, and don't require the segment lock: + * + * vp + * offset + * mapfunc + * maxprot + */ +struct segdev_data { + offset_t offset; /* device offset for start of mapping */ + kmutex_t lock; /* protects segdev_data */ + int (*mapfunc)(dev_t dev, off_t off, int prot); + struct vnode *vp; /* vnode associated with device */ + uchar_t pageprot; /* true if per page protections present */ + uchar_t prot; /* current segment prot if pageprot == 0 */ + uchar_t maxprot; /* maximum segment protections */ + uchar_t type; /* type of sharing done */ + struct vpage *vpage; /* per-page information, if needed */ + uint_t hat_attr; /* hat attr - pass to attr in hat_devload */ + uint_t hat_flags; /* set HAT_LOAD_NOCONSIST flag in hat_devload */ + /* see comments above in segdev_crargs */ + size_t softlockcnt; /* # of SOFTLOCKED in seg */ + void *devmap_data; /* devmap_handle private data */ +}; + +/* Direct physical-userland mapping, without occupying kernel address space */ +#define DEVMAP_PMEM_COOKIE ((ddi_umem_cookie_t)0x2) + +/* + * pmem_cookie: + * Records physical memory pages to be exported to userland. 
+ */ +struct devmap_pmem_cookie { + pgcnt_t dp_npages; /* number of allocated mem pages */ + page_t **dp_pparray; /* pages allocated for this cookie */ + vnode_t *dp_vnp; /* vnode associated with this cookie */ + kproject_t *dp_projp; /* project ptr for resource ctl */ +}; + +#ifdef _KERNEL + +extern void segdev_init(void); + +extern int segdev_create(struct seg *, void *); + +extern int segdev_copyto(struct seg *, caddr_t, const void *, void *, size_t); +extern int segdev_copyfrom(struct seg *, caddr_t, const void *, void *, size_t); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_SEG_DEV_H */ diff --git a/usr/src/uts/common/vm/seg_enum.h b/usr/src/uts/common/vm/seg_enum.h new file mode 100644 index 0000000000..25922e7b40 --- /dev/null +++ b/usr/src/uts/common/vm/seg_enum.h @@ -0,0 +1,85 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#ifndef _VM_SEG_ENUM_H +#define _VM_SEG_ENUM_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * These enumerations are needed in both <vm/seg.h> and + * <sys/vnode.h> in order to declare function prototypes. + */ + +/* + * Fault information passed to the seg fault handling routine. + * The F_SOFTLOCK and F_SOFTUNLOCK are used by software + * to lock and unlock pages for physical I/O. + */ +enum fault_type { + F_INVAL, /* invalid page */ + F_PROT, /* protection fault */ + F_SOFTLOCK, /* software requested locking */ + F_SOFTUNLOCK /* software requested unlocking */ +}; + +/* + * Lock information passed to the seg pagelock handling routine. 
+ */ +enum lock_type { + L_PAGELOCK, /* lock pages */ + L_PAGEUNLOCK, /* unlock pages */ + L_PAGERECLAIM /* reclaim pages */ +}; + +/* + * seg_rw gives the access type for a fault operation + */ +enum seg_rw { + S_OTHER, /* unknown or not touched */ + S_READ, /* read access attempted */ + S_WRITE, /* write access attempted */ + S_EXEC, /* execution access attempted */ + S_CREATE, /* create if page doesn't exist */ + S_READ_NOCOW /* read access, don't do a copy on write */ +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_SEG_ENUM_H */ diff --git a/usr/src/uts/common/vm/seg_kmem.c b/usr/src/uts/common/vm/seg_kmem.c new file mode 100644 index 0000000000..6f0c8f5750 --- /dev/null +++ b/usr/src/uts/common/vm/seg_kmem.c @@ -0,0 +1,1516 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/t_lock.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/tuneable.h> +#include <sys/systm.h> +#include <sys/vm.h> +#include <sys/kmem.h> +#include <sys/vmem.h> +#include <sys/mman.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/dumphdr.h> +#include <sys/bootconf.h> +#include <sys/lgrp.h> +#include <vm/seg_kmem.h> +#include <vm/hat.h> +#include <vm/page.h> +#include <vm/vm_dep.h> +#include <vm/faultcode.h> +#include <sys/promif.h> +#include <vm/seg_kp.h> +#include <sys/bitmap.h> +#include <sys/mem_cage.h> + +/* + * seg_kmem is the primary kernel memory segment driver. It + * maps the kernel heap [kernelheap, ekernelheap), module text, + * and all memory which was allocated before the VM was initialized + * into kas. + * + * Pages which belong to seg_kmem are hashed into &kvp vnode at + * an offset equal to (u_offset_t)virt_addr, and have p_lckcnt >= 1. + * They must never be paged out since segkmem_fault() is a no-op to + * prevent recursive faults. + * + * Currently, seg_kmem pages are sharelocked (p_sharelock == 1) on + * __x86 and are unlocked (p_sharelock == 0) on __sparc. Once __x86 + * supports relocation the #ifdef kludges can be removed. + * + * seg_kmem pages may be subject to relocation by page_relocate(), + * provided that the HAT supports it; if this is so, segkmem_reloc + * will be set to a nonzero value. All boot time allocated memory as + * well as static memory is considered off limits to relocation. + * Pages are "relocatable" if p_state does not have P_NORELOC set, so + * we request P_NORELOC pages for memory that isn't safe to relocate. 
+ * + * The kernel heap is logically divided up into four pieces: + * + * heap32_arena is for allocations that require 32-bit absolute + * virtual addresses (e.g. code that uses 32-bit pointers/offsets). + * + * heap_core is for allocations that require 2GB *relative* + * offsets; in other words all memory from heap_core is within + * 2GB of all other memory from the same arena. This is a requirement + * of the addressing modes of some processors in supervisor code. + * + * heap_arena is the general heap arena. + * + * static_arena is the static memory arena. Allocations from it + * are not subject to relocation so it is safe to use the memory + * physical address as well as the virtual address (e.g. the VA to + * PA translations are static). Caches may import from static_arena; + * all other static memory allocations should use static_alloc_arena. + * + * On some platforms which have limited virtual address space, seg_kmem + * may share [kernelheap, ekernelheap) with seg_kp; if this is so, + * segkp_bitmap is non-NULL, and each bit represents a page of virtual + * address space which is actually seg_kp mapped. + */ + +extern ulong_t *segkp_bitmap; /* Is set if segkp is from the kernel heap */ + +char *kernelheap; /* start of primary kernel heap */ +char *ekernelheap; /* end of primary kernel heap */ +struct seg kvseg; /* primary kernel heap segment */ +struct seg kvseg_core; /* "core" kernel heap segment */ +vmem_t *heap_arena; /* primary kernel heap arena */ +vmem_t *heap_core_arena; /* core kernel heap arena */ +char *heap_core_base; /* start of core kernel heap arena */ +char *heap_lp_base; /* start of kernel large page heap arena */ +char *heap_lp_end; /* end of kernel large page heap arena */ +vmem_t *hat_memload_arena; /* HAT translation data */ +struct seg kvseg32; /* 32-bit kernel heap segment */ +vmem_t *heap32_arena; /* 32-bit kernel heap arena */ +vmem_t *heaptext_arena; /* heaptext arena */ +struct as kas; /* kernel address space */ +struct vnode kvp; /* vnode for all segkmem pages */ +int segkmem_reloc; /* enable/disable relocatable segkmem pages */ +vmem_t *static_arena; /* arena for caches to import static memory */ +vmem_t *static_alloc_arena; /* arena for allocating static memory */ + +/* + * seg_kmem driver can map part of the kernel heap with large pages. + * Currently this functionality is implemented for sparc platforms only. + * + * The large page size "segkmem_lpsize" for kernel heap is selected in the + * platform specific code. It can also be modified via /etc/system file. + * Setting segkmem_lpsize to PAGESIZE in /etc/system disables usage of large + * pages for kernel heap. "segkmem_lpshift" is adjusted appropriately to + * match segkmem_lpsize. + * + * At boot time we carve from kernel heap arena a range of virtual addresses + * that will be used for large page mappings. This range [heap_lp_base, + * heap_lp_end) is set up as a separate vmem arena - "heap_lp_arena". We also + * create "kmem_lp_arena" that caches memory already backed up by large + * pages. kmem_lp_arena imports virtual segments from heap_lp_arena. + */ + +size_t segkmem_lpsize; +static uint_t segkmem_lpshift = PAGESHIFT; + +size_t segkmem_kmemlp_quantum = 0x400000; /* 4MB */ +size_t segkmem_heaplp_quantum; +static vmem_t *heap_lp_arena; +static vmem_t *kmem_lp_arena; +static vmem_t *segkmem_ppa_arena; +static segkmem_lpcb_t segkmem_lpcb; + +/* + * We use "segkmem_kmemlp_max" to limit the total amount of physical memory + * consumed by the large page heap. 
By default this parameter is set to 1/4 of + * physmem but can be adjusted through /etc/system either directly or + * indirectly by setting "segkmem_kmemlp_pcnt" to the percent of physmem + * we allow for large page heap. + */ +size_t segkmem_kmemlp_max; +static uint_t segkmem_kmemlp_pcnt; + +/* + * Getting large pages for kernel heap could be problematic due to + * physical memory fragmentation. That's why we allow to preallocate + * "segkmem_kmemlp_min" bytes at boot time. + */ +static size_t segkmem_kmemlp_min; + +/* + * Throttling is used to avoid expensive tries to allocate large pages + * for kernel heap when a lot of succesive attempts to do so fail. + */ +static ulong_t segkmem_lpthrottle_max = 0x400000; +static ulong_t segkmem_lpthrottle_start = 0x40; +static ulong_t segkmem_use_lpthrottle = 1; + +/* + * Freed pages accumulate on a garbage list until segkmem is ready, + * at which point we call segkmem_gc() to free it all. + */ +typedef struct segkmem_gc_list { + struct segkmem_gc_list *gc_next; + vmem_t *gc_arena; + size_t gc_size; +} segkmem_gc_list_t; + +static segkmem_gc_list_t *segkmem_gc_list; + +/* + * Allocations from the hat_memload arena add VM_MEMLOAD to their + * vmflags so that segkmem_xalloc() can inform the hat layer that it needs + * to take steps to prevent infinite recursion. HAT allocations also + * must be non-relocatable to prevent recursive page faults. + */ +static void * +hat_memload_alloc(vmem_t *vmp, size_t size, int flags) +{ + flags |= (VM_MEMLOAD | VM_NORELOC); + return (segkmem_alloc(vmp, size, flags)); +} + +/* + * Allocations from static_arena arena (or any other arena that uses + * segkmem_alloc_permanent()) require non-relocatable (permanently + * wired) memory pages, since these pages are referenced by physical + * as well as virtual address. + */ +void * +segkmem_alloc_permanent(vmem_t *vmp, size_t size, int flags) +{ + return (segkmem_alloc(vmp, size, flags | VM_NORELOC)); +} + +/* + * Initialize kernel heap boundaries. + */ +void +kernelheap_init( + void *heap_start, + void *heap_end, + char *first_avail, + void *core_start, + void *core_end) +{ + uintptr_t textbase; + size_t core_size; + size_t heap_size; + vmem_t *heaptext_parent; + size_t heap_lp_size = 0; + + kernelheap = heap_start; + ekernelheap = heap_end; + +#ifdef __sparc + heap_lp_size = (((uintptr_t)heap_end - (uintptr_t)heap_start) / 4); + heap_lp_base = ekernelheap - heap_lp_size; + heap_lp_end = heap_lp_base + heap_lp_size; +#endif /* __sparc */ + + /* + * If this platform has a 'core' heap area, then the space for + * overflow module text should be carved out of the end of that + * heap. Otherwise, it gets carved out of the general purpose + * heap. + */ + core_size = (uintptr_t)core_end - (uintptr_t)core_start; + if (core_size > 0) { + ASSERT(core_size >= HEAPTEXT_SIZE); + textbase = (uintptr_t)core_end - HEAPTEXT_SIZE; + core_size -= HEAPTEXT_SIZE; + } +#ifndef __sparc + else { + ekernelheap -= HEAPTEXT_SIZE; + textbase = (uintptr_t)ekernelheap; + } +#endif + + heap_size = (uintptr_t)ekernelheap - (uintptr_t)kernelheap; + heap_arena = vmem_init("heap", kernelheap, heap_size, PAGESIZE, + segkmem_alloc, segkmem_free); + + if (core_size > 0) { + heap_core_arena = vmem_create("heap_core", core_start, + core_size, PAGESIZE, NULL, NULL, NULL, 0, VM_SLEEP); + heap_core_base = core_start; + } else { + heap_core_arena = heap_arena; + heap_core_base = kernelheap; + } + + /* + * reserve space for the large page heap. 
If large pages for kernel + * heap is enabled large page heap arean will be created later in the + * boot sequence in segkmem_heap_lp_init(). Otherwise the allocated + * range will be returned back to the heap_arena. + */ + if (heap_lp_size) { + (void) vmem_xalloc(heap_arena, heap_lp_size, PAGESIZE, 0, 0, + heap_lp_base, heap_lp_end, + VM_NOSLEEP | VM_BESTFIT | VM_PANIC); + } + + /* + * Remove the already-spoken-for memory range [kernelheap, first_avail). + */ + (void) vmem_xalloc(heap_arena, first_avail - kernelheap, PAGESIZE, + 0, 0, kernelheap, first_avail, VM_NOSLEEP | VM_BESTFIT | VM_PANIC); + +#ifdef __sparc + heap32_arena = vmem_create("heap32", (void *)SYSBASE32, + SYSLIMIT32 - SYSBASE32 - HEAPTEXT_SIZE, PAGESIZE, NULL, + NULL, NULL, 0, VM_SLEEP); + + textbase = SYSLIMIT32 - HEAPTEXT_SIZE; + heaptext_parent = NULL; +#else /* __sparc */ + heap32_arena = heap_core_arena; + heaptext_parent = heap_core_arena; +#endif /* __sparc */ + + heaptext_arena = vmem_create("heaptext", (void *)textbase, + HEAPTEXT_SIZE, PAGESIZE, NULL, NULL, heaptext_parent, 0, VM_SLEEP); + + /* + * Create a set of arenas for memory with static translations + * (e.g. VA -> PA translations cannot change). Since using + * kernel pages by physical address implies it isn't safe to + * walk across page boundaries, the static_arena quantum must + * be PAGESIZE. Any kmem caches that require static memory + * should source from static_arena, while direct allocations + * should only use static_alloc_arena. + */ + static_arena = vmem_create("static", NULL, 0, PAGESIZE, + segkmem_alloc_permanent, segkmem_free, heap_arena, 0, VM_SLEEP); + static_alloc_arena = vmem_create("static_alloc", NULL, 0, + sizeof (uint64_t), vmem_alloc, vmem_free, static_arena, + 0, VM_SLEEP); + + /* + * Create an arena for translation data (ptes, hmes, or hblks). + * We need an arena for this because hat_memload() is essential + * to vmem_populate() (see comments in common/os/vmem.c). + * + * Note: any kmem cache that allocates from hat_memload_arena + * must be created as a KMC_NOHASH cache (i.e. no external slab + * and bufctl structures to allocate) so that slab creation doesn't + * require anything more than a single vmem_alloc(). + */ + hat_memload_arena = vmem_create("hat_memload", NULL, 0, PAGESIZE, + hat_memload_alloc, segkmem_free, heap_arena, 0, + VM_SLEEP | VMC_POPULATOR); +} + +/* + * Grow kernel heap downward. + */ +void +kernelheap_extend(void *range_start, void *range_end) +{ + size_t len = (uintptr_t)range_end - (uintptr_t)range_start; + + ASSERT(range_start < range_end && range_end == kernelheap); + + if (vmem_add(heap_arena, range_start, len, VM_NOSLEEP) == NULL) { + cmn_err(CE_WARN, "Could not grow kernel heap below 0x%p", + (void *)kernelheap); + } else { + kernelheap = range_start; + } +} + +void +boot_mapin(caddr_t addr, size_t size) +{ + caddr_t eaddr; + page_t *pp; + pfn_t pfnum; + + if (page_resv(btop(size), KM_NOSLEEP) == 0) + panic("boot_mapin: page_resv failed"); + + for (eaddr = addr + size; addr < eaddr; addr += PAGESIZE) { + pfnum = va_to_pfn(addr); + if ((pp = page_numtopp_nolock(pfnum)) == NULL) + panic("boot_mapin(): No pp for pfnum = %lx", pfnum); + + /* + * must break up any large pages that may have constituent + * pages being utilized for BOP_ALLOC()'s before calling + * page_numtopp().The locking code (ie. 
page_reclaim()) + * can't handle them + */ + if (pp->p_szc != 0) + page_boot_demote(pp); + + pp = page_numtopp(pfnum, SE_EXCL); + if (pp == NULL || PP_ISFREE(pp)) + panic("boot_alloc: pp is NULL or free"); + + /* + * If the cage is on but doesn't yet contain this page, + * mark it as non-relocatable. + */ + if (kcage_on && !PP_ISNORELOC(pp)) + PP_SETNORELOC(pp); + + (void) page_hashin(pp, &kvp, (u_offset_t)(uintptr_t)addr, NULL); + pp->p_lckcnt = 1; +#if defined(__x86) + page_downgrade(pp); +#else + page_unlock(pp); +#endif + } +} + +/* + * Get pages from boot and hash them into the kernel's vp. + * Used after page structs have been allocated, but before segkmem is ready. + */ +void * +boot_alloc(void *inaddr, size_t size, uint_t align) +{ + caddr_t addr = inaddr; + + if (bootops == NULL) + prom_panic("boot_alloc: attempt to allocate memory after " + "BOP_GONE"); + + size = ptob(btopr(size)); + if (BOP_ALLOC(bootops, addr, size, align) != addr) + panic("boot_alloc: BOP_ALLOC failed"); + boot_mapin((caddr_t)addr, size); + return (addr); +} + +static void +segkmem_badop() +{ + panic("segkmem_badop"); +} + +#define SEGKMEM_BADOP(t) (t(*)())segkmem_badop + +/*ARGSUSED*/ +static faultcode_t +segkmem_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t size, + enum fault_type type, enum seg_rw rw) +{ + ASSERT(RW_READ_HELD(&seg->s_as->a_lock)); + + if (seg->s_as != &kas || size > seg->s_size || + addr < seg->s_base || addr + size > seg->s_base + seg->s_size) + panic("segkmem_fault: bad args"); + + if (segkp_bitmap && seg == &kvseg) { + + /* + * If it is one of segkp pages, call segkp_fault. + */ + if (BT_TEST(segkp_bitmap, + btop((uintptr_t)(addr - seg->s_base)))) + return (SEGOP_FAULT(hat, segkp, addr, size, type, rw)); + } + + switch (type) { + case F_SOFTLOCK: /* lock down already-loaded translations */ + if (rw == S_OTHER) { + hat_reserve(seg->s_as, addr, size); + return (0); + } + /*FALLTHROUGH*/ + case F_SOFTUNLOCK: + if (rw == S_READ || rw == S_WRITE) + return (0); + /*FALLTHROUGH*/ + default: + break; + } + return (FC_NOSUPPORT); +} + +static int +segkmem_setprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot) +{ + ASSERT(RW_LOCK_HELD(&seg->s_as->a_lock)); + + if (seg->s_as != &kas || size > seg->s_size || + addr < seg->s_base || addr + size > seg->s_base + seg->s_size) + panic("segkmem_setprot: bad args"); + + if (segkp_bitmap && seg == &kvseg) { + + /* + * If it is one of segkp pages, call segkp. + */ + if (BT_TEST(segkp_bitmap, + btop((uintptr_t)(addr - seg->s_base)))) + return (SEGOP_SETPROT(segkp, addr, size, prot)); + } + + if (prot == 0) + hat_unload(kas.a_hat, addr, size, HAT_UNLOAD); + else + hat_chgprot(kas.a_hat, addr, size, prot); + return (0); +} + +/* + * This is a dummy segkmem function overloaded to call segkp + * when segkp is under the heap. + */ +/* ARGSUSED */ +static int +segkmem_checkprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot) +{ + ASSERT(RW_LOCK_HELD(&seg->s_as->a_lock)); + + if (seg->s_as != &kas) + segkmem_badop(); + + if (segkp_bitmap && seg == &kvseg) { + + /* + * If it is one of segkp pages, call into segkp. + */ + if (BT_TEST(segkp_bitmap, + btop((uintptr_t)(addr - seg->s_base)))) + return (SEGOP_CHECKPROT(segkp, addr, size, prot)); + } + segkmem_badop(); + return (0); +} + +/* + * This is a dummy segkmem function overloaded to call segkp + * when segkp is under the heap. 
+ */ +/* ARGSUSED */ +static int +segkmem_kluster(struct seg *seg, caddr_t addr, ssize_t delta) +{ + ASSERT(RW_LOCK_HELD(&seg->s_as->a_lock)); + + if (seg->s_as != &kas) + segkmem_badop(); + + if (segkp_bitmap && seg == &kvseg) { + + /* + * If it is one of segkp pages, call into segkp. + */ + if (BT_TEST(segkp_bitmap, + btop((uintptr_t)(addr - seg->s_base)))) + return (SEGOP_KLUSTER(segkp, addr, delta)); + } + segkmem_badop(); + return (0); +} + +static void +segkmem_xdump_range(void *arg, void *start, size_t size) +{ + struct as *as = arg; + caddr_t addr = start; + caddr_t addr_end = addr + size; + + while (addr < addr_end) { + pfn_t pfn = hat_getpfnum(kas.a_hat, addr); + if (pfn != PFN_INVALID && pfn <= physmax && pf_is_memory(pfn)) + dump_addpage(as, addr, pfn); + addr += PAGESIZE; + dump_timeleft = dump_timeout; + } +} + +static void +segkmem_dump_range(void *arg, void *start, size_t size) +{ + caddr_t addr = start; + caddr_t addr_end = addr + size; + + /* + * If we are about to start dumping the range of addresses we + * carved out of the kernel heap for the large page heap walk + * heap_lp_arena to find what segments are actually populated + */ + if (SEGKMEM_USE_LARGEPAGES && + addr == heap_lp_base && addr_end == heap_lp_end && + vmem_size(heap_lp_arena, VMEM_ALLOC) < size) { + vmem_walk(heap_lp_arena, VMEM_ALLOC | VMEM_REENTRANT, + segkmem_xdump_range, arg); + } else { + segkmem_xdump_range(arg, start, size); + } +} + +static void +segkmem_dump(struct seg *seg) +{ + /* + * The kernel's heap_arena (represented by kvseg) is a very large + * VA space, most of which is typically unused. To speed up dumping + * we use vmem_walk() to quickly find the pieces of heap_arena that + * are actually in use. We do the same for heap32_arena and + * heap_core. + * + * We specify VMEM_REENTRANT to vmem_walk() because dump_addpage() + * may ultimately need to allocate memory. Reentrant walks are + * necessarily imperfect snapshots. The kernel heap continues + * to change during a live crash dump, for example. For a normal + * crash dump, however, we know that there won't be any other threads + * messing with the heap. Therefore, at worst, we may fail to dump + * the pages that get allocated by the act of dumping; but we will + * always dump every page that was allocated when the walk began. + * + * The other segkmem segments are dense (fully populated), so there's + * no need to use this technique when dumping them. + * + * Note: when adding special dump handling for any new sparsely- + * populated segments, be sure to add similar handling to the ::kgrep + * code in mdb. + */ + if (seg == &kvseg) { + vmem_walk(heap_arena, VMEM_ALLOC | VMEM_REENTRANT, + segkmem_dump_range, seg->s_as); +#ifndef __sparc + vmem_walk(heaptext_arena, VMEM_ALLOC | VMEM_REENTRANT, + segkmem_dump_range, seg->s_as); +#endif + } else if (seg == &kvseg_core) { + vmem_walk(heap_core_arena, VMEM_ALLOC | VMEM_REENTRANT, + segkmem_dump_range, seg->s_as); + } else if (seg == &kvseg32) { + vmem_walk(heap32_arena, VMEM_ALLOC | VMEM_REENTRANT, + segkmem_dump_range, seg->s_as); + vmem_walk(heaptext_arena, VMEM_ALLOC | VMEM_REENTRANT, + segkmem_dump_range, seg->s_as); + } else { + segkmem_dump_range(seg->s_as, seg->s_base, seg->s_size); + } +} + +/* + * lock/unlock kmem pages over a given range [addr, addr+len). + * Returns a shadow list of pages in ppp if *ppp is not NULL + * and memory can be allocated to hold the shadow list. 
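+ *
+ * A hedged illustration of the expected calling pattern (hypothetical
+ * caller; the L_PAGELOCK call fills in the shadow list and the same
+ * list is handed back on L_PAGEUNLOCK):
+ *
+ *	page_t **pplist;
+ *
+ *	if (SEGOP_PAGELOCK(seg, addr, len, &pplist, L_PAGELOCK, rw) == 0) {
+ *		... perform the I/O against the locked pages ...
+ *		(void) SEGOP_PAGELOCK(seg, addr, len, &pplist,
+ *		    L_PAGEUNLOCK, rw);
+ *	}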
+ */ +/*ARGSUSED*/ +static int +segkmem_pagelock(struct seg *seg, caddr_t addr, size_t len, + page_t ***ppp, enum lock_type type, enum seg_rw rw) +{ + page_t **pplist, *pp; + pgcnt_t npages; + size_t nb; + + if (segkp_bitmap && seg == &kvseg) { + /* + * If it is one of segkp pages, call into segkp. + */ + if (BT_TEST(segkp_bitmap, + btop((uintptr_t)(addr - seg->s_base)))) + return (SEGOP_PAGELOCK(segkp, addr, len, ppp, + type, rw)); + } + + if (type == L_PAGERECLAIM) + return (ENOTSUP); + + npages = btopr(len); + nb = sizeof (page_t *) * npages; + + if (type == L_PAGEUNLOCK) { + if ((pplist = *ppp) == NULL) { + /* + * No shadow list. Iterate over the range + * using page_find() and unlock the pages + * that we encounter. + */ + while (npages--) { + pp = page_find(&kvp, + (u_offset_t)(uintptr_t)addr); + if (pp) + page_unlock(pp); + addr += PAGESIZE; + } + return (0); + } + + while (npages--) { + pp = *pplist++; + if (pp) + page_unlock(pp); + } + kmem_free(*ppp, nb); + return (0); + } + + ASSERT(type == L_PAGELOCK); + + pplist = NULL; + if (ppp != NULL) + *ppp = pplist = kmem_alloc(nb, KM_NOSLEEP); + + while (npages--) { + pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)addr, SE_SHARED); + /* + * We'd like to ASSERT(pp != NULL) here, but we can't + * because there are legitimate cases where the address + * isn't really mapped -- for instance, attaching a + * kernel debugger and poking at a non-existent address. + */ + if (pplist) + *pplist++ = pp; + addr += PAGESIZE; + } + return (0); +} + +/* + * This is a dummy segkmem function overloaded to call segkp + * when segkp is under the heap. + */ +/* ARGSUSED */ +static int +segkmem_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp) +{ + ASSERT(RW_LOCK_HELD(&seg->s_as->a_lock)); + + if (seg->s_as != &kas) + segkmem_badop(); + + if (segkp_bitmap && seg == &kvseg) { + + /* + * If it is one of segkp pages, call into segkp. 
+ */ + if (BT_TEST(segkp_bitmap, + btop((uintptr_t)(addr - seg->s_base)))) + return (SEGOP_GETMEMID(segkp, addr, memidp)); + } + segkmem_badop(); + return (0); +} + +/*ARGSUSED*/ +static lgrp_mem_policy_info_t * +segkmem_getpolicy(struct seg *seg, caddr_t addr) +{ + return (NULL); +} + + +static struct seg_ops segkmem_ops = { + SEGKMEM_BADOP(int), /* dup */ + SEGKMEM_BADOP(int), /* unmap */ + SEGKMEM_BADOP(void), /* free */ + segkmem_fault, + SEGKMEM_BADOP(faultcode_t), /* faulta */ + segkmem_setprot, + segkmem_checkprot, + segkmem_kluster, + SEGKMEM_BADOP(size_t), /* swapout */ + SEGKMEM_BADOP(int), /* sync */ + SEGKMEM_BADOP(size_t), /* incore */ + SEGKMEM_BADOP(int), /* lockop */ + SEGKMEM_BADOP(int), /* getprot */ + SEGKMEM_BADOP(u_offset_t), /* getoffset */ + SEGKMEM_BADOP(int), /* gettype */ + SEGKMEM_BADOP(int), /* getvp */ + SEGKMEM_BADOP(int), /* advise */ + segkmem_dump, + segkmem_pagelock, + SEGKMEM_BADOP(int), /* setpgsz */ + segkmem_getmemid, + segkmem_getpolicy, /* getpolicy */ +}; + +int +segkmem_create(struct seg *seg) +{ + ASSERT(seg->s_as == &kas && RW_WRITE_HELD(&kas.a_lock)); + seg->s_ops = &segkmem_ops; + seg->s_data = NULL; + kas.a_size += seg->s_size; + return (0); +} + +/*ARGSUSED*/ +page_t * +segkmem_page_create(void *addr, size_t size, int vmflag, void *arg) +{ + struct seg kseg; + int pgflags; + + kseg.s_as = &kas; + pgflags = PG_EXCL; + + if (segkmem_reloc == 0 || (vmflag & VM_NORELOC)) + pgflags |= PG_NORELOC; + if ((vmflag & VM_NOSLEEP) == 0) + pgflags |= PG_WAIT; + if (vmflag & VM_PANIC) + pgflags |= PG_PANIC; + if (vmflag & VM_PUSHPAGE) + pgflags |= PG_PUSHPAGE; + + return (page_create_va(&kvp, (u_offset_t)(uintptr_t)addr, size, + pgflags, &kseg, addr)); +} + +/* + * Allocate pages to back the virtual address range [addr, addr + size). + * If addr is NULL, allocate the virtual address space as well. + */ +void * +segkmem_xalloc(vmem_t *vmp, void *inaddr, size_t size, int vmflag, uint_t attr, + page_t *(*page_create_func)(void *, size_t, int, void *), void *pcarg) +{ + page_t *ppl; + caddr_t addr = inaddr; + pgcnt_t npages = btopr(size); + int allocflag; + + if (inaddr == NULL && (addr = vmem_alloc(vmp, size, vmflag)) == NULL) + return (NULL); + + ASSERT(((uintptr_t)addr & PAGEOFFSET) == 0); + + if (page_resv(npages, vmflag & VM_KMFLAGS) == 0) { + if (inaddr == NULL) + vmem_free(vmp, addr, size); + return (NULL); + } + + ppl = page_create_func(addr, size, vmflag, pcarg); + if (ppl == NULL) { + if (inaddr == NULL) + vmem_free(vmp, addr, size); + page_unresv(npages); + return (NULL); + } + + /* + * Under certain conditions, we need to let the HAT layer know + * that it cannot safely allocate memory. Allocations from + * the hat_memload vmem arena always need this, to prevent + * infinite recursion. + * + * In addition, the x86 hat cannot safely do memory + * allocations while in vmem_populate(), because there + * is no simple bound on its usage. 
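+ *
+ * The flag selection that follows therefore passes HAT_NO_KALLOC to
+ * hat_memload() for VM_MEMLOAD allocations (the hat_memload arena)
+ * and, on x86, whenever the current thread is populating a vmem
+ * arena; all other allocations use the default flags.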
+ */ + if (vmflag & VM_MEMLOAD) + allocflag = HAT_NO_KALLOC; +#if defined(__x86) + else if (vmem_is_populator()) + allocflag = HAT_NO_KALLOC; +#endif + else + allocflag = 0; + + while (ppl != NULL) { + page_t *pp = ppl; + page_sub(&ppl, pp); + ASSERT(page_iolock_assert(pp)); + ASSERT(PAGE_EXCL(pp)); + page_io_unlock(pp); + hat_memload(kas.a_hat, (caddr_t)(uintptr_t)pp->p_offset, pp, + (PROT_ALL & ~PROT_USER) | HAT_NOSYNC | attr, + HAT_LOAD_LOCK | allocflag); + pp->p_lckcnt = 1; +#if defined(__x86) + page_downgrade(pp); +#else + if (vmflag & SEGKMEM_SHARELOCKED) + page_downgrade(pp); + else + page_unlock(pp); +#endif + } + + return (addr); +} + +void * +segkmem_alloc(vmem_t *vmp, size_t size, int vmflag) +{ + void *addr; + segkmem_gc_list_t *gcp, **prev_gcpp; + + if (kvseg.s_base == NULL) { +#ifndef __sparc + if (bootops->bsys_alloc == NULL) + halt("Memory allocation between bop_alloc() and " + "kmem_alloc().\n"); +#endif + + /* + * There's not a lot of memory to go around during boot, + * so recycle it if we can. + */ + for (prev_gcpp = &segkmem_gc_list; (gcp = *prev_gcpp) != NULL; + prev_gcpp = &gcp->gc_next) { + if (gcp->gc_arena == vmp && gcp->gc_size == size) { + *prev_gcpp = gcp->gc_next; + return (gcp); + } + } + + addr = vmem_alloc(vmp, size, vmflag | VM_PANIC); + if (boot_alloc(addr, size, BO_NO_ALIGN) != addr) + panic("segkmem_alloc: boot_alloc failed"); + return (addr); + } + return (segkmem_xalloc(vmp, NULL, size, vmflag, 0, + segkmem_page_create, NULL)); +} + +/* + * Any changes to this routine must also be carried over to + * devmap_free_pages() in the seg_dev driver. This is because + * we currently don't have a special kernel segment for non-paged + * kernel memory that is exported by drivers to user space. + */ +void +segkmem_free(vmem_t *vmp, void *inaddr, size_t size) +{ + page_t *pp; + caddr_t addr = inaddr; + caddr_t eaddr; + pgcnt_t npages = btopr(size); + + ASSERT(((uintptr_t)addr & PAGEOFFSET) == 0); + + if (kvseg.s_base == NULL) { + segkmem_gc_list_t *gc = inaddr; + gc->gc_arena = vmp; + gc->gc_size = size; + gc->gc_next = segkmem_gc_list; + segkmem_gc_list = gc; + return; + } + + hat_unload(kas.a_hat, addr, size, HAT_UNLOAD_UNLOCK); + + for (eaddr = addr + size; addr < eaddr; addr += PAGESIZE) { +#if defined(__x86) + pp = page_find(&kvp, (u_offset_t)(uintptr_t)addr); + if (pp == NULL) + panic("segkmem_free: page not found"); + if (!page_tryupgrade(pp)) { + /* + * Some other thread has a sharelock. Wait for + * it to drop the lock so we can free this page. + */ + page_unlock(pp); + pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)addr, + SE_EXCL); + } +#else + pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)addr, SE_EXCL); +#endif + if (pp == NULL) + panic("segkmem_free: page not found"); + /* Clear p_lckcnt so page_destroy() doesn't update availrmem */ + pp->p_lckcnt = 0; + page_destroy(pp, 0); + } + page_unresv(npages); + + if (vmp != NULL) + vmem_free(vmp, inaddr, size); +} + +void +segkmem_gc(void) +{ + ASSERT(kvseg.s_base != NULL); + while (segkmem_gc_list != NULL) { + segkmem_gc_list_t *gc = segkmem_gc_list; + segkmem_gc_list = gc->gc_next; + segkmem_free(gc->gc_arena, gc, gc->gc_size); + } +} + +/* + * Legacy entry points from here to end of file. 
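+ * These cover segkmem_mapin()/segkmem_mapout(), which establish and
+ * tear down locked device translations, and kmem_getpages()/
+ * kmem_freepages(), which are now simply wrappers around
+ * kmem_alloc()/kmem_free().  For illustration, the older idiom
+ *
+ *	buf = kmem_getpages(btopr(size), KM_SLEEP);
+ *	...
+ *	kmem_freepages(buf, btopr(size));
+ *
+ * is equivalent to a kmem_alloc()/kmem_free() of ptob(btopr(size))
+ * bytes.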
+ */ +void +segkmem_mapin(struct seg *seg, void *addr, size_t size, uint_t vprot, + pfn_t pfn, uint_t flags) +{ + hat_unload(seg->s_as->a_hat, addr, size, HAT_UNLOAD_UNLOCK); + hat_devload(seg->s_as->a_hat, addr, size, pfn, vprot, + flags | HAT_LOAD_LOCK); +} + +void +segkmem_mapout(struct seg *seg, void *addr, size_t size) +{ + hat_unload(seg->s_as->a_hat, addr, size, HAT_UNLOAD_UNLOCK); +} + +void * +kmem_getpages(pgcnt_t npages, int kmflag) +{ + return (kmem_alloc(ptob(npages), kmflag)); +} + +void +kmem_freepages(void *addr, pgcnt_t npages) +{ + kmem_free(addr, ptob(npages)); +} + +/* + * segkmem_page_create_large() allocates a large page to be used for the kmem + * caches. If kpr is enabled we ask for a relocatable page unless requested + * otherwise. If kpr is disabled we have to ask for a non-reloc page + */ +static page_t * +segkmem_page_create_large(void *addr, size_t size, int vmflag, void *arg) +{ + int pgflags; + + pgflags = PG_EXCL; + + if (segkmem_reloc == 0 || (vmflag & VM_NORELOC)) + pgflags |= PG_NORELOC; + if (!(vmflag & VM_NOSLEEP)) + pgflags |= PG_WAIT; + if (vmflag & VM_PUSHPAGE) + pgflags |= PG_PUSHPAGE; + + return (page_create_va_large(&kvp, (u_offset_t)(uintptr_t)addr, size, + pgflags, &kvseg, addr, arg)); +} + +/* + * Allocate a large page to back the virtual address range + * [addr, addr + size). If addr is NULL, allocate the virtual address + * space as well. + */ +static void * +segkmem_xalloc_lp(vmem_t *vmp, void *inaddr, size_t size, int vmflag, + uint_t attr, page_t *(*page_create_func)(void *, size_t, int, void *), + void *pcarg) +{ + caddr_t addr = inaddr, pa; + size_t lpsize = segkmem_lpsize; + pgcnt_t npages = btopr(size); + pgcnt_t nbpages = btop(lpsize); + pgcnt_t nlpages = size >> segkmem_lpshift; + size_t ppasize = nbpages * sizeof (page_t *); + page_t *pp, *rootpp, **ppa, *pplist = NULL; + int i; + + if (page_resv(npages, vmflag & VM_KMFLAGS) == 0) { + return (NULL); + } + + /* + * allocate an array we need for hat_memload_array. + * we use a separate arena to avoid recursion. + * we will not need this array when hat_memload_array learns pp++ + */ + if ((ppa = vmem_alloc(segkmem_ppa_arena, ppasize, vmflag)) == NULL) { + goto fail_array_alloc; + } + + if (inaddr == NULL && (addr = vmem_alloc(vmp, size, vmflag)) == NULL) + goto fail_vmem_alloc; + + ASSERT(((uintptr_t)addr & (lpsize - 1)) == 0); + + /* create all the pages */ + for (pa = addr, i = 0; i < nlpages; i++, pa += lpsize) { + if ((pp = page_create_func(pa, lpsize, vmflag, pcarg)) == NULL) + goto fail_page_create; + page_list_concat(&pplist, &pp); + } + + /* at this point we have all the resource to complete the request */ + while ((rootpp = pplist) != NULL) { + for (i = 0; i < nbpages; i++) { + ASSERT(pplist != NULL); + pp = pplist; + page_sub(&pplist, pp); + ASSERT(page_iolock_assert(pp)); + page_io_unlock(pp); + ppa[i] = pp; + } + /* + * Load the locked entry. It's OK to preload the entry into the + * TSB since we now support large mappings in the kernel TSB. 
+ */ + hat_memload_array(kas.a_hat, + (caddr_t)(uintptr_t)rootpp->p_offset, lpsize, + ppa, (PROT_ALL & ~PROT_USER) | HAT_NOSYNC | attr, + HAT_LOAD_LOCK); + + for (--i; i >= 0; --i) { + ppa[i]->p_lckcnt = 1; + page_unlock(ppa[i]); + } + } + + vmem_free(segkmem_ppa_arena, ppa, ppasize); + return (addr); + +fail_page_create: + while ((rootpp = pplist) != NULL) { + for (i = 0, pp = pplist; i < nbpages; i++, pp = pplist) { + ASSERT(pp != NULL); + page_sub(&pplist, pp); + ASSERT(page_iolock_assert(pp)); + page_io_unlock(pp); + } + page_destroy_pages(rootpp); + } + + if (inaddr == NULL) + vmem_free(vmp, addr, size); + +fail_vmem_alloc: + vmem_free(segkmem_ppa_arena, ppa, ppasize); + +fail_array_alloc: + page_unresv(npages); + + return (NULL); +} + +static void +segkmem_free_one_lp(caddr_t addr, size_t size) +{ + page_t *pp, *rootpp = NULL; + pgcnt_t pgs_left = btopr(size); + + ASSERT(size == segkmem_lpsize); + + hat_unload(kas.a_hat, addr, size, HAT_UNLOAD_UNLOCK); + + for (; pgs_left > 0; addr += PAGESIZE, pgs_left--) { + pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)addr, SE_EXCL); + if (pp == NULL) + panic("segkmem_free_one_lp: page not found"); + ASSERT(PAGE_EXCL(pp)); + pp->p_lckcnt = 0; + if (rootpp == NULL) + rootpp = pp; + } + ASSERT(rootpp != NULL); + page_destroy_pages(rootpp); + + /* page_unresv() is done by the caller */ +} + +/* + * This function is called to import new spans into the vmem arenas like + * kmem_default_arena and kmem_oversize_arena. It first tries to import + * spans from large page arena - kmem_lp_arena. In order to do this it might + * have to "upgrade the requested size" to kmem_lp_arena quantum. If + * it was not able to satisfy the upgraded request it then calls regular + * segkmem_alloc() that satisfies the request by importing from "*vmp" arena + */ +void * +segkmem_alloc_lp(vmem_t *vmp, size_t *sizep, int vmflag) +{ + size_t size; + kthread_t *t = curthread; + segkmem_lpcb_t *lpcb = &segkmem_lpcb; + + ASSERT(sizep != NULL); + + size = *sizep; + + if (lpcb->lp_uselp && !(t->t_flag & T_PANIC) && + !(vmflag & SEGKMEM_SHARELOCKED)) { + + size_t kmemlp_qnt = segkmem_kmemlp_quantum; + size_t asize = P2ROUNDUP(size, kmemlp_qnt); + void *addr = NULL; + ulong_t *lpthrtp = &lpcb->lp_throttle; + ulong_t lpthrt = *lpthrtp; + int dowakeup = 0; + int doalloc = 1; + + ASSERT(kmem_lp_arena != NULL); + ASSERT(asize >= size); + + if (lpthrt != 0) { + /* try to update the throttle value */ + lpthrt = atomic_add_long_nv(lpthrtp, 1); + if (lpthrt >= segkmem_lpthrottle_max) { + lpthrt = atomic_cas_ulong(lpthrtp, lpthrt, + segkmem_lpthrottle_max / 4); + } + + /* + * when we get above throttle start do an exponential + * backoff at trying large pages and reaping + */ + if (lpthrt > segkmem_lpthrottle_start && + (lpthrt & (lpthrt - 1))) { + atomic_add_64(&lpcb->allocs_throttled, 1L); + lpthrt--; + if ((lpthrt & (lpthrt - 1)) == 0) + kmem_reap(); + return (segkmem_alloc(vmp, size, vmflag)); + } + } + + if (!(vmflag & VM_NOSLEEP) && + segkmem_heaplp_quantum >= (8 * kmemlp_qnt) && + vmem_size(kmem_lp_arena, VMEM_FREE) <= kmemlp_qnt && + asize < (segkmem_heaplp_quantum - kmemlp_qnt)) { + + /* + * we are low on free memory in kmem_lp_arena + * we let only one guy to allocate heap_lp + * quantum size chunk that everybody is going to + * share + */ + mutex_enter(&lpcb->lp_lock); + + if (lpcb->lp_wait) { + + /* we are not the first one - wait */ + cv_wait(&lpcb->lp_cv, &lpcb->lp_lock); + if (vmem_size(kmem_lp_arena, VMEM_FREE) < + kmemlp_qnt) { + doalloc = 0; + } + } else if 
(vmem_size(kmem_lp_arena, VMEM_FREE) <= + kmemlp_qnt) { + + /* + * we are the first one, make sure we import + * a large page + */ + if (asize == kmemlp_qnt) + asize += kmemlp_qnt; + dowakeup = 1; + lpcb->lp_wait = 1; + } + + mutex_exit(&lpcb->lp_lock); + } + + /* + * VM_ABORT flag prevents sleeps in vmem_xalloc when + * large pages are not available. In that case this allocation + * attempt will fail and we will retry allocation with small + * pages. We also do not want to panic if this allocation fails + * because we are going to retry. + */ + if (doalloc) { + addr = vmem_alloc(kmem_lp_arena, asize, + (vmflag | VM_ABORT) & ~VM_PANIC); + + if (dowakeup) { + mutex_enter(&lpcb->lp_lock); + ASSERT(lpcb->lp_wait != 0); + lpcb->lp_wait = 0; + cv_broadcast(&lpcb->lp_cv); + mutex_exit(&lpcb->lp_lock); + } + } + + if (addr != NULL) { + *sizep = asize; + *lpthrtp = 0; + return (addr); + } + + if (vmflag & VM_NOSLEEP) + atomic_add_64(&lpcb->nosleep_allocs_failed, 1L); + else + atomic_add_64(&lpcb->sleep_allocs_failed, 1L); + atomic_add_64(&lpcb->alloc_bytes_failed, size); + + /* if large page throttling is not started yet do it */ + if (segkmem_use_lpthrottle && lpthrt == 0) { + lpthrt = atomic_cas_ulong(lpthrtp, lpthrt, 1); + } + } + return (segkmem_alloc(vmp, size, vmflag)); +} + +void +segkmem_free_lp(vmem_t *vmp, void *inaddr, size_t size) +{ + if (kmem_lp_arena == NULL || !IS_KMEM_VA_LARGEPAGE((caddr_t)inaddr)) { + segkmem_free(vmp, inaddr, size); + } else { + vmem_free(kmem_lp_arena, inaddr, size); + } +} + +/* + * segkmem_alloc_lpi() imports virtual memory from large page heap arena + * into kmem_lp arena. In the process it maps the imported segment with + * large pages + */ +static void * +segkmem_alloc_lpi(vmem_t *vmp, size_t size, int vmflag) +{ + segkmem_lpcb_t *lpcb = &segkmem_lpcb; + void *addr; + + ASSERT(size != 0); + ASSERT(vmp == heap_lp_arena); + + /* do not allow large page heap grow beyound limits */ + if (vmem_size(vmp, VMEM_ALLOC) >= segkmem_kmemlp_max) { + atomic_add_64(&lpcb->allocs_limited, 1); + return (NULL); + } + + addr = segkmem_xalloc_lp(vmp, NULL, size, vmflag, 0, + segkmem_page_create_large, NULL); + return (addr); +} + +/* + * segkmem_free_lpi() returns virtual memory back into large page heap arena + * from kmem_lp arena. Beore doing this it unmaps the segment and frees + * large pages used to map it. + */ +static void +segkmem_free_lpi(vmem_t *vmp, void *inaddr, size_t size) +{ + pgcnt_t nlpages = size >> segkmem_lpshift; + size_t lpsize = segkmem_lpsize; + caddr_t addr = inaddr; + pgcnt_t npages = btopr(size); + int i; + + ASSERT(vmp == heap_lp_arena); + ASSERT(IS_KMEM_VA_LARGEPAGE(addr)); + ASSERT(((uintptr_t)inaddr & (lpsize - 1)) == 0); + + for (i = 0; i < nlpages; i++) { + segkmem_free_one_lp(addr, lpsize); + addr += lpsize; + } + + page_unresv(npages); + + vmem_free(vmp, inaddr, size); +} + +/* + * This function is called at system boot time by kmem_init right after + * /etc/system file has been read. It checks based on hardware configuration + * and /etc/system settings if system is going to use large pages. The + * initialiazation necessary to actually start using large pages + * happens later in the process after segkmem_heap_lp_init() is called. 
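+ *
+ * The /etc/system tunables consulted here include segkmem_lpsize,
+ * segkmem_heaplp_quantum, segkmem_kmemlp_quantum, segkmem_kmemlp_max,
+ * segkmem_kmemlp_pcnt and segkmem_kmemlp_min.  For illustration only,
+ * a line such as
+ *
+ *	set segkmem_lpsize=0x400000
+ *
+ * requests 4M kernel heap pages; the value actually used is still
+ * subject to the platform checks below (get_segkmem_lpsize()).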
+ */ +int +segkmem_lpsetup() +{ + int use_large_pages = 0; + +#ifdef __sparc + + size_t memtotal = physmem * PAGESIZE; + + if (heap_lp_base == NULL) { + segkmem_lpsize = PAGESIZE; + return (0); + } + + /* get a platform dependent value of large page size for kernel heap */ + segkmem_lpsize = get_segkmem_lpsize(segkmem_lpsize); + + if (segkmem_lpsize <= PAGESIZE) { + /* + * put virtual space reserved for the large page kernel + * back to the regular heap + */ + vmem_xfree(heap_arena, heap_lp_base, + heap_lp_end - heap_lp_base); + heap_lp_base = NULL; + heap_lp_end = NULL; + segkmem_lpsize = PAGESIZE; + return (0); + } + + /* set heap_lp quantum if necessary */ + if (segkmem_heaplp_quantum == 0 || + (segkmem_heaplp_quantum & (segkmem_heaplp_quantum - 1)) || + P2PHASE(segkmem_heaplp_quantum, segkmem_lpsize)) { + segkmem_heaplp_quantum = segkmem_lpsize; + } + + /* set kmem_lp quantum if necessary */ + if (segkmem_kmemlp_quantum == 0 || + (segkmem_kmemlp_quantum & (segkmem_kmemlp_quantum - 1)) || + segkmem_kmemlp_quantum > segkmem_heaplp_quantum) { + segkmem_kmemlp_quantum = segkmem_heaplp_quantum; + } + + /* set total amount of memory allowed for large page kernel heap */ + if (segkmem_kmemlp_max == 0) { + if (segkmem_kmemlp_pcnt == 0 || segkmem_kmemlp_pcnt > 100) + segkmem_kmemlp_pcnt = 25; + segkmem_kmemlp_max = (memtotal * 100) / segkmem_kmemlp_pcnt; + } + segkmem_kmemlp_max = P2ROUNDUP(segkmem_kmemlp_max, + segkmem_heaplp_quantum); + + /* fix lp kmem preallocation request if necesssary */ + if (segkmem_kmemlp_min) { + segkmem_kmemlp_min = P2ROUNDUP(segkmem_kmemlp_min, + segkmem_heaplp_quantum); + if (segkmem_kmemlp_min > segkmem_kmemlp_max) + segkmem_kmemlp_min = segkmem_kmemlp_max; + } + + use_large_pages = 1; + segkmem_lpshift = page_get_shift(page_szc(segkmem_lpsize)); + +#endif + return (use_large_pages); +} + +#ifdef __sparc + + +static void * +segkmem_alloc_ppa(vmem_t *vmp, size_t size, int vmflag) +{ + size_t ppaquantum = btopr(segkmem_lpsize) * sizeof (page_t *); + void *addr; + + if (ppaquantum <= PAGESIZE) + return (segkmem_alloc(vmp, size, vmflag)); + + ASSERT((size & (ppaquantum - 1)) == 0); + + addr = vmem_xalloc(vmp, size, ppaquantum, 0, 0, NULL, NULL, vmflag); + if (addr != NULL && segkmem_xalloc(vmp, addr, size, vmflag, 0, + segkmem_page_create, NULL) == NULL) { + vmem_xfree(vmp, addr, size); + addr = NULL; + } + + return (addr); +} + +static void +segkmem_free_ppa(vmem_t *vmp, void *addr, size_t size) +{ + size_t ppaquantum = btopr(segkmem_lpsize) * sizeof (page_t *); + + ASSERT(addr != NULL); + + if (ppaquantum <= PAGESIZE) { + segkmem_free(vmp, addr, size); + } else { + segkmem_free(NULL, addr, size); + vmem_xfree(vmp, addr, size); + } +} + +void +segkmem_heap_lp_init() +{ + segkmem_lpcb_t *lpcb = &segkmem_lpcb; + size_t heap_lp_size = heap_lp_end - heap_lp_base; + size_t lpsize = segkmem_lpsize; + size_t ppaquantum; + void *addr; + + if (segkmem_lpsize <= PAGESIZE) { + ASSERT(heap_lp_base == NULL); + ASSERT(heap_lp_end == NULL); + return; + } + + ASSERT(segkmem_heaplp_quantum >= lpsize); + ASSERT((segkmem_heaplp_quantum & (lpsize - 1)) == 0); + ASSERT(lpcb->lp_uselp == 0); + ASSERT(heap_lp_base != NULL); + ASSERT(heap_lp_end != NULL); + ASSERT(heap_lp_base < heap_lp_end); + ASSERT(heap_lp_arena == NULL); + ASSERT(((uintptr_t)heap_lp_base & (lpsize - 1)) == 0); + ASSERT(((uintptr_t)heap_lp_end & (lpsize - 1)) == 0); + + /* create large page heap arena */ + heap_lp_arena = vmem_create("heap_lp", heap_lp_base, heap_lp_size, + segkmem_heaplp_quantum, NULL, NULL, NULL, 0, 
VM_SLEEP); + + ASSERT(heap_lp_arena != NULL); + + /* This arena caches memory already mapped by large pages */ + kmem_lp_arena = vmem_create("kmem_lp", NULL, 0, segkmem_kmemlp_quantum, + segkmem_alloc_lpi, segkmem_free_lpi, heap_lp_arena, 0, VM_SLEEP); + + ASSERT(kmem_lp_arena != NULL); + + mutex_init(&lpcb->lp_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&lpcb->lp_cv, NULL, CV_DEFAULT, NULL); + + /* + * this arena is used for the array of page_t pointers necessary + * to call hat_mem_load_array + */ + ppaquantum = btopr(lpsize) * sizeof (page_t *); + segkmem_ppa_arena = vmem_create("segkmem_ppa", NULL, 0, ppaquantum, + segkmem_alloc_ppa, segkmem_free_ppa, heap_arena, ppaquantum, + VM_SLEEP); + + ASSERT(segkmem_ppa_arena != NULL); + + /* prealloacate some memory for the lp kernel heap */ + if (segkmem_kmemlp_min) { + + ASSERT(P2PHASE(segkmem_kmemlp_min, + segkmem_heaplp_quantum) == 0); + + if ((addr = segkmem_alloc_lpi(heap_lp_arena, + segkmem_kmemlp_min, VM_SLEEP)) != NULL) { + + addr = vmem_add(kmem_lp_arena, addr, + segkmem_kmemlp_min, VM_SLEEP); + ASSERT(addr != NULL); + } + } + + lpcb->lp_uselp = 1; +} + +#endif diff --git a/usr/src/uts/common/vm/seg_kmem.h b/usr/src/uts/common/vm/seg_kmem.h new file mode 100644 index 0000000000..a1fcf43643 --- /dev/null +++ b/usr/src/uts/common/vm/seg_kmem.h @@ -0,0 +1,129 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _VM_SEG_KMEM_H +#define _VM_SEG_KMEM_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/types.h> +#include <sys/vnode.h> +#include <sys/vmem.h> +#include <vm/as.h> +#include <vm/seg.h> +#include <vm/page.h> + +/* + * VM - Kernel Segment Driver + */ + +#if defined(_KERNEL) + +extern char *kernelheap; /* start of primary kernel heap */ +extern char *ekernelheap; /* end of primary kernel heap */ +extern char *heap_lp_base; /* start of kernel large page heap arena */ +extern char *heap_lp_end; /* end of kernel large page heap arena */ +extern struct seg kvseg; /* primary kernel heap segment */ +extern struct seg kvseg_core; /* "core" kernel heap segment */ +extern vmem_t *heap_arena; /* primary kernel heap arena */ +extern vmem_t *hat_memload_arena; /* HAT translation arena */ +extern struct seg kvseg32; /* 32-bit kernel heap segment */ +extern vmem_t *heap32_arena; /* 32-bit kernel heap arena */ +extern vmem_t *heaptext_arena; /* kernel text arena, from heap */ +extern struct ctx *kctx; /* kernel context */ +extern struct as kas; /* kernel address space */ +extern struct vnode kvp; /* vnode for all segkmem pages */ +extern int segkmem_reloc; /* enable/disable segkmem relocatable pages */ +extern vmem_t *static_arena; /* arena for caches to import static memory */ +extern vmem_t *static_alloc_arena; /* arena for allocating static memory */ + +extern int segkmem_create(struct seg *); +extern page_t *segkmem_page_create(void *, size_t, int, void *); +extern void *segkmem_xalloc(vmem_t *, void *, size_t, int, uint_t, + page_t *(*page_create_func)(void *, size_t, int, void *), void *); +extern void *segkmem_alloc(vmem_t *, size_t, int); +extern void *segkmem_alloc_permanent(vmem_t *, size_t, int); +extern void segkmem_free(vmem_t *, void *, size_t); + +extern void *boot_alloc(void *, size_t, uint_t); +extern void boot_mapin(caddr_t addr, size_t size); +extern void kernelheap_init(void *, void *, char *, void *, void *); +extern void kernelheap_extend(void *, void *); +extern void segkmem_gc(void); + +/* + * Flags for segkmem_xalloc(). + * + * SEGKMEM_SHARELOCKED requests pages which are locked SE_SHARED to be + * returned rather than unlocked which is now the default. Note that + * memory returned by SEGKMEM_SHARELOCKED cannot be freed by segkmem_free(). + * This is a hack for seg_dev that should be cleaned up in the future. 
+ */ +#define SEGKMEM_SHARELOCKED 0x20000 + +/* + * Large page for kmem caches support + */ +typedef struct segkmem_lpcb { + kmutex_t lp_lock; + kcondvar_t lp_cv; + uint_t lp_wait; + uint_t lp_uselp; + ulong_t lp_throttle; + + /* stats */ + uint64_t sleep_allocs_failed; + uint64_t nosleep_allocs_failed; + uint64_t allocs_throttled; + uint64_t allocs_limited; + uint64_t alloc_bytes_failed; +} segkmem_lpcb_t; + +extern void *segkmem_alloc_lp(vmem_t *, size_t *, int); +extern void segkmem_free_lp(vmem_t *, void *, size_t); +extern int segkmem_lpsetup(); +extern void segkmem_heap_lp_init(void); + +extern size_t segkmem_lpsize; +extern size_t segkmem_heaplp_quantum; +extern size_t segkmem_kmemlp_max; + +#define SEGKMEM_USE_LARGEPAGES (segkmem_lpsize > PAGESIZE) + +#define IS_KMEM_VA_LARGEPAGE(vaddr) \ + (((vaddr) >= heap_lp_base) && ((vaddr) < heap_lp_end)) + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_SEG_KMEM_H */ diff --git a/usr/src/uts/common/vm/seg_kp.c b/usr/src/uts/common/vm/seg_kp.c new file mode 100644 index 0000000000..9c7b0710f3 --- /dev/null +++ b/usr/src/uts/common/vm/seg_kp.c @@ -0,0 +1,1444 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * segkp is a segment driver that administers the allocation and deallocation + * of pageable variable size chunks of kernel virtual address space. Each + * allocated resource is page-aligned. + * + * The user may specify whether the resource should be initialized to 0, + * include a redzone, or locked in memory. 
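+ *
+ * A typical (purely illustrative) allocation and release sequence,
+ * using the KPD_* flags from seg_kp.h, might look like:
+ *
+ *	caddr_t va;
+ *
+ *	va = segkp_get(segkp, ptob(btopr(len)),
+ *	    KPD_ZERO | KPD_HASREDZONE | KPD_LOCKED);
+ *	if (va != NULL) {
+ *		... use the resource ...
+ *		segkp_release(segkp, va);
+ *	}
+ *
+ * The flag combination shown is only an example; the valid
+ * combinations (e.g. KPD_NO_ANON requires KPD_LOCKED) are enforced in
+ * segkp_get_internal() below.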
+ */ + +#include <sys/types.h> +#include <sys/t_lock.h> +#include <sys/thread.h> +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/buf.h> +#include <sys/mman.h> +#include <sys/vnode.h> +#include <sys/cmn_err.h> +#include <sys/swap.h> +#include <sys/tuneable.h> +#include <sys/kmem.h> +#include <sys/vmem.h> +#include <sys/cred.h> +#include <sys/dumphdr.h> +#include <sys/debug.h> +#include <sys/vtrace.h> +#include <sys/stack.h> +#include <sys/atomic.h> +#include <sys/archsystm.h> +#include <sys/lgrp.h> + +#include <vm/as.h> +#include <vm/seg.h> +#include <vm/seg_kp.h> +#include <vm/seg_kmem.h> +#include <vm/anon.h> +#include <vm/page.h> +#include <vm/hat.h> +#include <sys/bitmap.h> + +/* + * Private seg op routines + */ +static void segkp_badop(void); +static void segkp_dump(struct seg *seg); +static int segkp_checkprot(struct seg *seg, caddr_t addr, size_t len, + uint_t prot); +static int segkp_kluster(struct seg *seg, caddr_t addr, ssize_t delta); +static int segkp_pagelock(struct seg *seg, caddr_t addr, size_t len, + struct page ***page, enum lock_type type, + enum seg_rw rw); +static void segkp_insert(struct seg *seg, struct segkp_data *kpd); +static void segkp_delete(struct seg *seg, struct segkp_data *kpd); +static caddr_t segkp_get_internal(struct seg *seg, size_t len, uint_t flags, + struct segkp_data **tkpd, struct anon_map *amp); +static void segkp_release_internal(struct seg *seg, + struct segkp_data *kpd, size_t len); +static int segkp_unlock(struct hat *hat, struct seg *seg, caddr_t vaddr, + size_t len, struct segkp_data *kpd, uint_t flags); +static int segkp_load(struct hat *hat, struct seg *seg, caddr_t vaddr, + size_t len, struct segkp_data *kpd, uint_t flags); +static struct segkp_data *segkp_find(struct seg *seg, caddr_t vaddr); +static int segkp_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp); +static lgrp_mem_policy_info_t *segkp_getpolicy(struct seg *seg, + caddr_t addr); + +/* + * Lock used to protect the hash table(s) and caches. + */ +static kmutex_t segkp_lock; + +/* + * The segkp caches + */ +static struct segkp_cache segkp_cache[SEGKP_MAX_CACHE]; + +#define SEGKP_BADOP(t) (t(*)())segkp_badop + +/* + * When there are fewer than red_minavail bytes left on the stack, + * segkp_map_red() will map in the redzone (if called). 5000 seems + * to work reasonably well... + */ +long red_minavail = 5000; + +/* + * will be set to 1 for 32 bit x86 systems only, in startup.c + */ +int segkp_fromheap = 0; +ulong_t *segkp_bitmap; + +/* + * If segkp_map_red() is called with the redzone already mapped and + * with less than RED_DEEP_THRESHOLD bytes available on the stack, + * then the stack situation has become quite serious; if much more stack + * is consumed, we have the potential of scrogging the next thread/LWP + * structure. To help debug the "can't happen" panics which may + * result from this condition, we record lbolt and the calling thread + * in red_deep_lbolt and red_deep_thread respectively. 
+ */ +#define RED_DEEP_THRESHOLD 2000 + +clock_t red_deep_lbolt; +kthread_t *red_deep_thread; + +uint32_t red_nmapped; +uint32_t red_closest = UINT_MAX; +uint32_t red_ndoubles; + +pgcnt_t anon_segkp_pages_locked; /* See vm/anon.h */ + +static struct seg_ops segkp_ops = { + SEGKP_BADOP(int), /* dup */ + SEGKP_BADOP(int), /* unmap */ + SEGKP_BADOP(void), /* free */ + segkp_fault, + SEGKP_BADOP(faultcode_t), /* faulta */ + SEGKP_BADOP(int), /* setprot */ + segkp_checkprot, + segkp_kluster, + SEGKP_BADOP(size_t), /* swapout */ + SEGKP_BADOP(int), /* sync */ + SEGKP_BADOP(size_t), /* incore */ + SEGKP_BADOP(int), /* lockop */ + SEGKP_BADOP(int), /* getprot */ + SEGKP_BADOP(u_offset_t), /* getoffset */ + SEGKP_BADOP(int), /* gettype */ + SEGKP_BADOP(int), /* getvp */ + SEGKP_BADOP(int), /* advise */ + segkp_dump, /* dump */ + segkp_pagelock, /* pagelock */ + SEGKP_BADOP(int), /* setpgsz */ + segkp_getmemid, /* getmemid */ + segkp_getpolicy, /* getpolicy */ +}; + + +static void +segkp_badop(void) +{ + panic("segkp_badop"); + /*NOTREACHED*/ +} + +static void segkpinit_mem_config(struct seg *); + +static uint32_t segkp_indel; + +/* + * Allocate the segment specific private data struct and fill it in + * with the per kp segment mutex, anon ptr. array and hash table. + */ +int +segkp_create(struct seg *seg) +{ + struct segkp_segdata *kpsd; + size_t np; + + ASSERT(seg != NULL && seg->s_as == &kas); + ASSERT(RW_WRITE_HELD(&seg->s_as->a_lock)); + + if (seg->s_size & PAGEOFFSET) { + panic("Bad segkp size"); + /*NOTREACHED*/ + } + + kpsd = kmem_zalloc(sizeof (struct segkp_segdata), KM_SLEEP); + + /* + * Allocate the virtual memory for segkp and initialize it + */ + if (segkp_fromheap) { + np = btop(kvseg.s_size); + segkp_bitmap = kmem_zalloc(BT_SIZEOFMAP(np), KM_SLEEP); + kpsd->kpsd_arena = vmem_create("segkp", NULL, 0, PAGESIZE, + vmem_alloc, vmem_free, heap_arena, 5 * PAGESIZE, VM_SLEEP); + } else { + segkp_bitmap = NULL; + np = btop(seg->s_size); + kpsd->kpsd_arena = vmem_create("segkp", seg->s_base, + seg->s_size, PAGESIZE, NULL, NULL, NULL, 5 * PAGESIZE, + VM_SLEEP); + } + + kpsd->kpsd_anon = anon_create(np, ANON_SLEEP | ANON_ALLOC_FORCE); + + kpsd->kpsd_hash = kmem_zalloc(SEGKP_HASHSZ * sizeof (struct segkp *), + KM_SLEEP); + seg->s_data = (void *)kpsd; + seg->s_ops = &segkp_ops; + segkpinit_mem_config(seg); + return (0); +} + + +/* + * Find a free 'freelist' and initialize it with the appropriate attributes + */ +void * +segkp_cache_init(struct seg *seg, int maxsize, size_t len, uint_t flags) +{ + int i; + + if ((flags & KPD_NO_ANON) && !(flags & KPD_LOCKED)) + return ((void *)-1); + + mutex_enter(&segkp_lock); + for (i = 0; i < SEGKP_MAX_CACHE; i++) { + if (segkp_cache[i].kpf_inuse) + continue; + segkp_cache[i].kpf_inuse = 1; + segkp_cache[i].kpf_max = maxsize; + segkp_cache[i].kpf_flags = flags; + segkp_cache[i].kpf_seg = seg; + segkp_cache[i].kpf_len = len; + mutex_exit(&segkp_lock); + return ((void *)(uintptr_t)i); + } + mutex_exit(&segkp_lock); + return ((void *)-1); +} + +/* + * Free all the cache resources. 
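+ * This is invoked from the memory-delete callback
+ * segkp_mem_config_pre_del() below so that cached resources do not
+ * keep pages held while memory is being deleted.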
+ */ +void +segkp_cache_free(void) +{ + struct segkp_data *kpd; + struct seg *seg; + int i; + + mutex_enter(&segkp_lock); + for (i = 0; i < SEGKP_MAX_CACHE; i++) { + if (!segkp_cache[i].kpf_inuse) + continue; + /* + * Disconnect the freelist and process each element + */ + kpd = segkp_cache[i].kpf_list; + seg = segkp_cache[i].kpf_seg; + segkp_cache[i].kpf_list = NULL; + segkp_cache[i].kpf_count = 0; + mutex_exit(&segkp_lock); + + while (kpd != NULL) { + struct segkp_data *next; + + next = kpd->kp_next; + segkp_release_internal(seg, kpd, kpd->kp_len); + kpd = next; + } + mutex_enter(&segkp_lock); + } + mutex_exit(&segkp_lock); +} + +/* + * There are 2 entries into segkp_get_internal. The first includes a cookie + * used to access a pool of cached segkp resources. The second does not + * use the cache. + */ +caddr_t +segkp_get(struct seg *seg, size_t len, uint_t flags) +{ + struct segkp_data *kpd = NULL; + + if (segkp_get_internal(seg, len, flags, &kpd, NULL) != NULL) { + kpd->kp_cookie = -1; + return (stom(kpd->kp_base, flags)); + } + return (NULL); +} + +/* + * Return a 'cached' segkp address + */ +caddr_t +segkp_cache_get(void *cookie) +{ + struct segkp_cache *freelist = NULL; + struct segkp_data *kpd = NULL; + int index = (int)(uintptr_t)cookie; + struct seg *seg; + size_t len; + uint_t flags; + + if (index < 0 || index >= SEGKP_MAX_CACHE) + return (NULL); + freelist = &segkp_cache[index]; + + mutex_enter(&segkp_lock); + seg = freelist->kpf_seg; + flags = freelist->kpf_flags; + if (freelist->kpf_list != NULL) { + kpd = freelist->kpf_list; + freelist->kpf_list = kpd->kp_next; + freelist->kpf_count--; + mutex_exit(&segkp_lock); + kpd->kp_next = NULL; + segkp_insert(seg, kpd); + return (stom(kpd->kp_base, flags)); + } + len = freelist->kpf_len; + mutex_exit(&segkp_lock); + if (segkp_get_internal(seg, len, flags, &kpd, NULL) != NULL) { + kpd->kp_cookie = index; + return (stom(kpd->kp_base, flags)); + } + return (NULL); +} + +caddr_t +segkp_get_withanonmap( + struct seg *seg, + size_t len, + uint_t flags, + struct anon_map *amp) +{ + struct segkp_data *kpd = NULL; + + ASSERT(amp != NULL); + flags |= KPD_HASAMP; + if (segkp_get_internal(seg, len, flags, &kpd, amp) != NULL) { + kpd->kp_cookie = -1; + return (stom(kpd->kp_base, flags)); + } + return (NULL); +} + +/* + * This does the real work of segkp allocation. + * Return to client base addr. len must be page-aligned. A null value is + * returned if there are no more vm resources (e.g. pages, swap). The len + * and base recorded in the private data structure include the redzone + * and the redzone length (if applicable). If the user requests a redzone + * either the first or last page is left unmapped depending whether stacks + * grow to low or high memory. + * + * The client may also specify a no-wait flag. If that is set then the + * request will choose a non-blocking path when requesting resources. + * The default is make the client wait. + */ +static caddr_t +segkp_get_internal( + struct seg *seg, + size_t len, + uint_t flags, + struct segkp_data **tkpd, + struct anon_map *amp) +{ + struct segkp_segdata *kpsd = (struct segkp_segdata *)seg->s_data; + struct segkp_data *kpd; + caddr_t vbase = NULL; /* always first virtual, may not be mapped */ + pgcnt_t np = 0; /* number of pages in the resource */ + pgcnt_t segkpindex; + long i; + caddr_t va; + pgcnt_t pages = 0; + ulong_t anon_idx = 0; + int kmflag = (flags & KPD_NOWAIT) ? KM_NOSLEEP : KM_SLEEP; + caddr_t s_base = (segkp_fromheap) ? 
kvseg.s_base : seg->s_base; + + if (len & PAGEOFFSET) { + panic("segkp_get: len is not page-aligned"); + /*NOTREACHED*/ + } + + ASSERT(((flags & KPD_HASAMP) == 0) == (amp == NULL)); + + /* Only allow KPD_NO_ANON if we are going to lock it down */ + if ((flags & (KPD_LOCKED|KPD_NO_ANON)) == KPD_NO_ANON) + return (NULL); + + if ((kpd = kmem_zalloc(sizeof (struct segkp_data), kmflag)) == NULL) + return (NULL); + /* + * Fix up the len to reflect the REDZONE if applicable + */ + if (flags & KPD_HASREDZONE) + len += PAGESIZE; + np = btop(len); + + vbase = vmem_alloc(SEGKP_VMEM(seg), len, kmflag | VM_BESTFIT); + if (vbase == NULL) { + kmem_free(kpd, sizeof (struct segkp_data)); + return (NULL); + } + + /* If locking, reserve physical memory */ + if (flags & KPD_LOCKED) { + pages = btop(SEGKP_MAPLEN(len, flags)); + if (page_resv(pages, kmflag) == 0) { + vmem_free(SEGKP_VMEM(seg), vbase, len); + kmem_free(kpd, sizeof (struct segkp_data)); + return (NULL); + } + if ((flags & KPD_NO_ANON) == 0) + atomic_add_long(&anon_segkp_pages_locked, pages); + } + + /* + * Reserve sufficient swap space for this vm resource. We'll + * actually allocate it in the loop below, but reserving it + * here allows us to back out more gracefully than if we + * had an allocation failure in the body of the loop. + * + * Note that we don't need swap space for the red zone page. + */ + if (amp != NULL) { + ASSERT((flags & KPD_NO_ANON) == 0); + /* The reserve has been done and the anon_hdr is separate. */ + anon_idx = 0; + kpd->kp_anon_idx = anon_idx; + kpd->kp_anon = amp->ahp; + + TRACE_5(TR_FAC_VM, TR_ANON_SEGKP, "anon segkp:%p %p %lu %u %u", + kpd, vbase, len, flags, 1); + + } else if ((flags & KPD_NO_ANON) == 0) { + if (anon_resv(SEGKP_MAPLEN(len, flags)) == 0) { + if (flags & KPD_LOCKED) { + atomic_add_long(&anon_segkp_pages_locked, + -pages); + page_unresv(pages); + } + vmem_free(SEGKP_VMEM(seg), vbase, len); + kmem_free(kpd, sizeof (struct segkp_data)); + return (NULL); + } + anon_idx = ((uintptr_t)(vbase - s_base)) >> PAGESHIFT; + kpd->kp_anon_idx = anon_idx; + kpd->kp_anon = kpsd->kpsd_anon; + + TRACE_5(TR_FAC_VM, TR_ANON_SEGKP, "anon segkp:%p %p %lu %u %u", + kpd, vbase, len, flags, 1); + } else { + kpd->kp_anon = NULL; + kpd->kp_anon_idx = 0; + } + + /* + * Allocate page and anon resources for the virtual address range + * except the redzone + */ + if (segkp_fromheap) + segkpindex = btop((uintptr_t)(vbase - kvseg.s_base)); + for (i = 0, va = vbase; i < np; i++, va += PAGESIZE) { + page_t *pl[2]; + struct vnode *vp; + anoff_t off; + int err; + page_t *pp = NULL; + + /* + * Mark this page to be a segkp page in the bitmap. + */ + if (segkp_fromheap) { + BT_ATOMIC_SET(segkp_bitmap, segkpindex); + segkpindex++; + } + + /* + * If this page is the red zone page, we don't need swap + * space for it. Note that we skip over the code that + * establishes MMU mappings, so that the page remains + * invalid. + */ + if ((flags & KPD_HASREDZONE) && KPD_REDZONE(kpd) == i) + continue; + + if (kpd->kp_anon != NULL) { + struct anon *ap; + + ASSERT(anon_get_ptr(kpd->kp_anon, anon_idx + i) + == NULL); + /* + * Determine the "vp" and "off" of the anon slot. + */ + ap = anon_alloc(NULL, 0); + if (amp != NULL) + ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); + (void) anon_set_ptr(kpd->kp_anon, anon_idx + i, + ap, ANON_SLEEP); + if (amp != NULL) + ANON_LOCK_EXIT(&->a_rwlock); + swap_xlate(ap, &vp, &off); + + /* + * Create a page with the specified identity. The + * page is returned with the "shared" lock held. 
+ */ + err = VOP_GETPAGE(vp, (offset_t)off, PAGESIZE, + NULL, pl, PAGESIZE, seg, va, S_CREATE, + kcred); + if (err) { + /* + * XXX - This should not fail. + */ + panic("segkp_get: no pages"); + /*NOTREACHED*/ + } + pp = pl[0]; + } else { + ASSERT(page_exists(&kvp, + (u_offset_t)(uintptr_t)va) == NULL); + + if ((pp = page_create_va(&kvp, + (u_offset_t)(uintptr_t)va, PAGESIZE, + (flags & KPD_NOWAIT ? 0 : PG_WAIT) | PG_EXCL | + PG_NORELOC, seg, va)) == NULL) { + /* + * Legitimize resource; then destroy it. + * Easier than trying to unwind here. + */ + kpd->kp_flags = flags; + kpd->kp_base = vbase; + kpd->kp_len = len; + segkp_release_internal(seg, kpd, va - vbase); + return (NULL); + } + page_io_unlock(pp); + } + + if (flags & KPD_ZERO) + pagezero(pp, 0, PAGESIZE); + + /* + * Load and lock an MMU translation for the page. + */ + hat_memload(seg->s_as->a_hat, va, pp, (PROT_READ|PROT_WRITE), + ((flags & KPD_LOCKED) ? HAT_LOAD_LOCK : HAT_LOAD)); + + /* + * Now, release lock on the page. + */ + if (flags & KPD_LOCKED) + page_downgrade(pp); + else + page_unlock(pp); + } + + kpd->kp_flags = flags; + kpd->kp_base = vbase; + kpd->kp_len = len; + segkp_insert(seg, kpd); + *tkpd = kpd; + return (stom(kpd->kp_base, flags)); +} + +/* + * Release the resource to cache if the pool(designate by the cookie) + * has less than the maximum allowable. If inserted in cache, + * segkp_delete insures element is taken off of active list. + */ +void +segkp_release(struct seg *seg, caddr_t vaddr) +{ + struct segkp_cache *freelist; + struct segkp_data *kpd = NULL; + + if ((kpd = segkp_find(seg, vaddr)) == NULL) { + panic("segkp_release: null kpd"); + /*NOTREACHED*/ + } + + if (kpd->kp_cookie != -1) { + freelist = &segkp_cache[kpd->kp_cookie]; + mutex_enter(&segkp_lock); + if (!segkp_indel && freelist->kpf_count < freelist->kpf_max) { + segkp_delete(seg, kpd); + kpd->kp_next = freelist->kpf_list; + freelist->kpf_list = kpd; + freelist->kpf_count++; + mutex_exit(&segkp_lock); + return; + } else { + mutex_exit(&segkp_lock); + kpd->kp_cookie = -1; + } + } + segkp_release_internal(seg, kpd, kpd->kp_len); +} + +/* + * Free the entire resource. segkp_unlock gets called with the start of the + * mapped portion of the resource. The length is the size of the mapped + * portion + */ +static void +segkp_release_internal(struct seg *seg, struct segkp_data *kpd, size_t len) +{ + caddr_t va; + long i; + long redzone; + size_t np; + page_t *pp; + struct vnode *vp; + anoff_t off; + struct anon *ap; + pgcnt_t segkpindex; + + ASSERT(kpd != NULL); + ASSERT((kpd->kp_flags & KPD_HASAMP) == 0 || kpd->kp_cookie == -1); + np = btop(len); + + /* Remove from active hash list */ + if (kpd->kp_cookie == -1) { + mutex_enter(&segkp_lock); + segkp_delete(seg, kpd); + mutex_exit(&segkp_lock); + } + + /* + * Precompute redzone page index. + */ + redzone = -1; + if (kpd->kp_flags & KPD_HASREDZONE) + redzone = KPD_REDZONE(kpd); + + + va = kpd->kp_base; + + hat_unload(seg->s_as->a_hat, va, (np << PAGESHIFT), + ((kpd->kp_flags & KPD_LOCKED) ? HAT_UNLOAD_UNLOCK : HAT_UNLOAD)); + /* + * Free up those anon resources that are quiescent. + */ + if (segkp_fromheap) + segkpindex = btop((uintptr_t)(va - kvseg.s_base)); + for (i = 0; i < np; i++, va += PAGESIZE) { + + /* + * Clear the bit for this page from the bitmap. + */ + if (segkp_fromheap) { + BT_ATOMIC_CLEAR(segkp_bitmap, segkpindex); + segkpindex++; + } + + if (i == redzone) + continue; + if (kpd->kp_anon) { + /* + * Free up anon resources and destroy the + * associated pages. 
+ * + * Release the lock if there is one. Have to get the + * page to do this, unfortunately. + */ + if (kpd->kp_flags & KPD_LOCKED) { + ap = anon_get_ptr(kpd->kp_anon, + kpd->kp_anon_idx + i); + swap_xlate(ap, &vp, &off); + /* Find the shared-locked page. */ + pp = page_find(vp, (u_offset_t)off); + if (pp == NULL) { + panic("segkp_release: " + "kp_anon: no page to unlock "); + /*NOTREACHED*/ + } + page_unlock(pp); + } + if ((kpd->kp_flags & KPD_HASAMP) == 0) { + anon_free(kpd->kp_anon, kpd->kp_anon_idx + i, + PAGESIZE); + anon_unresv(PAGESIZE); + } + TRACE_5(TR_FAC_VM, + TR_ANON_SEGKP, "anon segkp:%p %p %lu %u %u", + kpd, va, PAGESIZE, 0, 0); + } else { + if (kpd->kp_flags & KPD_LOCKED) { + pp = page_find(&kvp, (u_offset_t)(uintptr_t)va); + if (pp == NULL) { + panic("segkp_release: " + "no page to unlock"); + /*NOTREACHED*/ + } + /* + * We should just upgrade the lock here + * but there is no upgrade that waits. + */ + page_unlock(pp); + } + pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)va, + SE_EXCL); + if (pp != NULL) + page_destroy(pp, 0); + } + } + + /* If locked, release physical memory reservation */ + if (kpd->kp_flags & KPD_LOCKED) { + pgcnt_t pages = btop(SEGKP_MAPLEN(kpd->kp_len, kpd->kp_flags)); + if ((kpd->kp_flags & KPD_NO_ANON) == 0) + atomic_add_long(&anon_segkp_pages_locked, -pages); + page_unresv(pages); + } + + vmem_free(SEGKP_VMEM(seg), kpd->kp_base, kpd->kp_len); + kmem_free(kpd, sizeof (struct segkp_data)); +} + +/* + * segkp_map_red() will check the current frame pointer against the + * stack base. If the amount of stack remaining is questionable + * (less than red_minavail), then segkp_map_red() will map in the redzone + * and return 1. Otherwise, it will return 0. segkp_map_red() can + * _only_ be called when: + * + * - it is safe to sleep on page_create_va(). + * - the caller is non-swappable. + * + * It is up to the caller to remember whether segkp_map_red() successfully + * mapped the redzone, and, if so, to call segkp_unmap_red() at a later + * time. Note that the caller must _remain_ non-swappable until after + * calling segkp_unmap_red(). + * + * Currently, this routine is only called from pagefault() (which necessarily + * satisfies the above conditions). + */ +#if defined(STACK_GROWTH_DOWN) +int +segkp_map_red(void) +{ + uintptr_t fp = STACK_BIAS + (uintptr_t)getfp(); +#ifndef _LP64 + caddr_t stkbase; +#endif + + ASSERT(curthread->t_schedflag & TS_DONT_SWAP); + + /* + * Optimize for the common case where we simply return. + */ + if ((curthread->t_red_pp == NULL) && + (fp - (uintptr_t)curthread->t_stkbase >= red_minavail)) + return (0); + +#if defined(_LP64) + /* + * XXX We probably need something better than this. + */ + panic("kernel stack overflow"); + /*NOTREACHED*/ +#else /* _LP64 */ + if (curthread->t_red_pp == NULL) { + page_t *red_pp; + struct seg kseg; + + caddr_t red_va = (caddr_t) + (((uintptr_t)curthread->t_stkbase & (uintptr_t)PAGEMASK) - + PAGESIZE); + + ASSERT(page_exists(&kvp, (u_offset_t)(uintptr_t)red_va) == + NULL); + + /* + * Allocate the physical for the red page. + */ + /* + * No PG_NORELOC here to avoid waits. Unlikely to get + * a relocate happening in the short time the page exists + * and it will be OK anyway. + */ + + kseg.s_as = &kas; + red_pp = page_create_va(&kvp, (u_offset_t)(uintptr_t)red_va, + PAGESIZE, PG_WAIT | PG_EXCL, &kseg, red_va); + ASSERT(red_pp != NULL); + + /* + * So we now have a page to jam into the redzone... 
+ */ + page_io_unlock(red_pp); + + hat_memload(kas.a_hat, red_va, red_pp, + (PROT_READ|PROT_WRITE), HAT_LOAD_LOCK); + page_downgrade(red_pp); + + /* + * The page is left SE_SHARED locked so we can hold on to + * the page_t pointer. + */ + curthread->t_red_pp = red_pp; + + atomic_add_32(&red_nmapped, 1); + while (fp - (uintptr_t)curthread->t_stkbase < red_closest) { + (void) cas32(&red_closest, red_closest, + (uint32_t)(fp - (uintptr_t)curthread->t_stkbase)); + } + return (1); + } + + stkbase = (caddr_t)(((uintptr_t)curthread->t_stkbase & + (uintptr_t)PAGEMASK) - PAGESIZE); + + atomic_add_32(&red_ndoubles, 1); + + if (fp - (uintptr_t)stkbase < RED_DEEP_THRESHOLD) { + /* + * Oh boy. We're already deep within the mapped-in + * redzone page, and the caller is trying to prepare + * for a deep stack run. We're running without a + * redzone right now: if the caller plows off the + * end of the stack, it'll plow another thread or + * LWP structure. That situation could result in + * a very hard-to-debug panic, so, in the spirit of + * recording the name of one's killer in one's own + * blood, we're going to record lbolt and the calling + * thread. + */ + red_deep_lbolt = lbolt; + red_deep_thread = curthread; + } + + /* + * If this is a DEBUG kernel, and we've run too deep for comfort, toss. + */ + ASSERT(fp - (uintptr_t)stkbase >= RED_DEEP_THRESHOLD); + return (0); +#endif /* _LP64 */ +} + +void +segkp_unmap_red(void) +{ + page_t *pp; + caddr_t red_va = (caddr_t)(((uintptr_t)curthread->t_stkbase & + (uintptr_t)PAGEMASK) - PAGESIZE); + + ASSERT(curthread->t_red_pp != NULL); + ASSERT(curthread->t_schedflag & TS_DONT_SWAP); + + /* + * Because we locked the mapping down, we can't simply rely + * on page_destroy() to clean everything up; we need to call + * hat_unload() to explicitly unlock the mapping resources. + */ + hat_unload(kas.a_hat, red_va, PAGESIZE, HAT_UNLOAD_UNLOCK); + + pp = curthread->t_red_pp; + + ASSERT(pp == page_find(&kvp, (u_offset_t)(uintptr_t)red_va)); + + /* + * Need to upgrade the SE_SHARED lock to SE_EXCL. + */ + if (!page_tryupgrade(pp)) { + /* + * As there is now wait for upgrade, release the + * SE_SHARED lock and wait for SE_EXCL. + */ + page_unlock(pp); + pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)red_va, SE_EXCL); + /* pp may be NULL here, hence the test below */ + } + + /* + * Destroy the page, with dontfree set to zero (i.e. free it). + */ + if (pp != NULL) + page_destroy(pp, 0); + curthread->t_red_pp = NULL; +} +#else +#error Red stacks only supported with downwards stack growth. +#endif + +/* + * Handle a fault on an address corresponding to one of the + * resources in the segkp segment. + */ +faultcode_t +segkp_fault( + struct hat *hat, + struct seg *seg, + caddr_t vaddr, + size_t len, + enum fault_type type, + enum seg_rw rw) +{ + struct segkp_data *kpd = NULL; + int err; + + ASSERT(seg->s_as == &kas && RW_READ_HELD(&seg->s_as->a_lock)); + + /* + * Sanity checks. + */ + if (type == F_PROT) { + panic("segkp_fault: unexpected F_PROT fault"); + /*NOTREACHED*/ + } + + if ((kpd = segkp_find(seg, vaddr)) == NULL) + return (FC_NOMAP); + + mutex_enter(&kpd->kp_lock); + + if (type == F_SOFTLOCK) { + ASSERT(!(kpd->kp_flags & KPD_LOCKED)); + /* + * The F_SOFTLOCK case has more stringent + * range requirements: the given range must exactly coincide + * with the resource's mapped portion. 
Note reference to + * redzone is handled since vaddr would not equal base + */ + if (vaddr != stom(kpd->kp_base, kpd->kp_flags) || + len != SEGKP_MAPLEN(kpd->kp_len, kpd->kp_flags)) { + mutex_exit(&kpd->kp_lock); + return (FC_MAKE_ERR(EFAULT)); + } + + if ((err = segkp_load(hat, seg, vaddr, len, kpd, KPD_LOCKED))) { + mutex_exit(&kpd->kp_lock); + return (FC_MAKE_ERR(err)); + } + kpd->kp_flags |= KPD_LOCKED; + mutex_exit(&kpd->kp_lock); + return (0); + } + + if (type == F_INVAL) { + ASSERT(!(kpd->kp_flags & KPD_NO_ANON)); + + /* + * Check if we touched the redzone. Somewhat optimistic + * here if we are touching the redzone of our own stack + * since we wouldn't have a stack to get this far... + */ + if ((kpd->kp_flags & KPD_HASREDZONE) && + btop((uintptr_t)(vaddr - kpd->kp_base)) == KPD_REDZONE(kpd)) + panic("segkp_fault: accessing redzone"); + + /* + * This fault may occur while the page is being F_SOFTLOCK'ed. + * Return since a 2nd segkp_load is unnecessary and also would + * result in the page being locked twice and eventually + * hang the thread_reaper thread. + */ + if (kpd->kp_flags & KPD_LOCKED) { + mutex_exit(&kpd->kp_lock); + return (0); + } + + err = segkp_load(hat, seg, vaddr, len, kpd, kpd->kp_flags); + mutex_exit(&kpd->kp_lock); + return (err ? FC_MAKE_ERR(err) : 0); + } + + if (type == F_SOFTUNLOCK) { + uint_t flags; + + /* + * Make sure the addr is LOCKED and it has anon backing + * before unlocking + */ + if ((kpd->kp_flags & (KPD_LOCKED|KPD_NO_ANON)) == KPD_NO_ANON) { + panic("segkp_fault: bad unlock"); + /*NOTREACHED*/ + } + + if (vaddr != stom(kpd->kp_base, kpd->kp_flags) || + len != SEGKP_MAPLEN(kpd->kp_len, kpd->kp_flags)) { + panic("segkp_fault: bad range"); + /*NOTREACHED*/ + } + + if (rw == S_WRITE) + flags = kpd->kp_flags | KPD_WRITEDIRTY; + else + flags = kpd->kp_flags; + err = segkp_unlock(hat, seg, vaddr, len, kpd, flags); + kpd->kp_flags &= ~KPD_LOCKED; + mutex_exit(&kpd->kp_lock); + return (err ? FC_MAKE_ERR(err) : 0); + } + mutex_exit(&kpd->kp_lock); + panic("segkp_fault: bogus fault type: %d\n", type); + /*NOTREACHED*/ +} + +/* + * Check that the given protections suffice over the range specified by + * vaddr and len. For this segment type, the only issue is whether or + * not the range lies completely within the mapped part of an allocated + * resource. + */ +/* ARGSUSED */ +static int +segkp_checkprot(struct seg *seg, caddr_t vaddr, size_t len, uint_t prot) +{ + struct segkp_data *kpd = NULL; + caddr_t mbase; + size_t mlen; + + if ((kpd = segkp_find(seg, vaddr)) == NULL) + return (EACCES); + + mutex_enter(&kpd->kp_lock); + mbase = stom(kpd->kp_base, kpd->kp_flags); + mlen = SEGKP_MAPLEN(kpd->kp_len, kpd->kp_flags); + if (len > mlen || vaddr < mbase || + ((vaddr + len) > (mbase + mlen))) { + mutex_exit(&kpd->kp_lock); + return (EACCES); + } + mutex_exit(&kpd->kp_lock); + return (0); +} + + +/* + * Check to see if it makes sense to do kluster/read ahead to + * addr + delta relative to the mapping at addr. We assume here + * that delta is a signed PAGESIZE'd multiple (which can be negative). + * + * For seg_u we always "approve" of this action from our standpoint. + */ +/*ARGSUSED*/ +static int +segkp_kluster(struct seg *seg, caddr_t addr, ssize_t delta) +{ + return (0); +} + +/* + * Load and possibly lock intra-slot resources in the range given by + * vaddr and len. 
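+ * The length is rounded up to a PAGESIZE multiple and, for KPD_LOCKED
+ * requests, physical memory is reserved first.  Each page is then
+ * brought in synchronously via VOP_GETPAGE() on its anon slot's swap
+ * vnode before its translation is loaded (and, if requested, locked)
+ * with hat_memload().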
+ */ +static int +segkp_load( + struct hat *hat, + struct seg *seg, + caddr_t vaddr, + size_t len, + struct segkp_data *kpd, + uint_t flags) +{ + caddr_t va; + caddr_t vlim; + ulong_t i; + uint_t lock; + + ASSERT(MUTEX_HELD(&kpd->kp_lock)); + + len = P2ROUNDUP(len, PAGESIZE); + + /* If locking, reserve physical memory */ + if (flags & KPD_LOCKED) { + pgcnt_t pages = btop(len); + if ((kpd->kp_flags & KPD_NO_ANON) == 0) + atomic_add_long(&anon_segkp_pages_locked, pages); + (void) page_resv(pages, KM_SLEEP); + } + + /* + * Loop through the pages in the given range. + */ + va = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK); + vaddr = va; + vlim = va + len; + lock = flags & KPD_LOCKED; + i = ((uintptr_t)(va - kpd->kp_base)) >> PAGESHIFT; + for (; va < vlim; va += PAGESIZE, i++) { + page_t *pl[2]; /* second element NULL terminator */ + struct vnode *vp; + anoff_t off; + int err; + struct anon *ap; + + /* + * Summon the page. If it's not resident, arrange + * for synchronous i/o to pull it in. + */ + ap = anon_get_ptr(kpd->kp_anon, kpd->kp_anon_idx + i); + swap_xlate(ap, &vp, &off); + + /* + * The returned page list will have exactly one entry, + * which is returned to us already kept. + */ + err = VOP_GETPAGE(vp, (offset_t)off, PAGESIZE, NULL, + pl, PAGESIZE, seg, va, S_READ, kcred); + + if (err) { + /* + * Back out of what we've done so far. + */ + (void) segkp_unlock(hat, seg, vaddr, + (va - vaddr), kpd, flags); + return (err); + } + + /* + * Load an MMU translation for the page. + */ + hat_memload(hat, va, pl[0], (PROT_READ|PROT_WRITE), + lock ? HAT_LOAD_LOCK : HAT_LOAD); + + if (!lock) { + /* + * Now, release "shared" lock on the page. + */ + page_unlock(pl[0]); + } + } + return (0); +} + +/* + * At the very least unload the mmu-translations and unlock the range if locked + * Can be called with the following flag value KPD_WRITEDIRTY which specifies + * any dirty pages should be written to disk. + */ +static int +segkp_unlock( + struct hat *hat, + struct seg *seg, + caddr_t vaddr, + size_t len, + struct segkp_data *kpd, + uint_t flags) +{ + caddr_t va; + caddr_t vlim; + ulong_t i; + struct page *pp; + struct vnode *vp; + anoff_t off; + struct anon *ap; + +#ifdef lint + seg = seg; +#endif /* lint */ + + ASSERT(MUTEX_HELD(&kpd->kp_lock)); + + /* + * Loop through the pages in the given range. It is assumed + * segkp_unlock is called with page aligned base + */ + va = vaddr; + vlim = va + len; + i = ((uintptr_t)(va - kpd->kp_base)) >> PAGESHIFT; + hat_unload(hat, va, len, + ((flags & KPD_LOCKED) ? HAT_UNLOAD_UNLOCK : HAT_UNLOAD)); + for (; va < vlim; va += PAGESIZE, i++) { + /* + * Find the page associated with this part of the + * slot, tracking it down through its associated swap + * space. + */ + ap = anon_get_ptr(kpd->kp_anon, kpd->kp_anon_idx + i); + swap_xlate(ap, &vp, &off); + + if (flags & KPD_LOCKED) { + if ((pp = page_find(vp, off)) == NULL) { + if (flags & KPD_LOCKED) { + panic("segkp_softunlock: missing page"); + /*NOTREACHED*/ + } + } + } else { + /* + * Nothing to do if the slot is not locked and the + * page doesn't exist. + */ + if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) + continue; + } + + /* + * If the page doesn't have any translations, is + * dirty and not being shared, then push it out + * asynchronously and avoid waiting for the + * pageout daemon to do it for us. + * + * XXX - Do we really need to get the "exclusive" + * lock via an upgrade? 
+ */ + if ((flags & KPD_WRITEDIRTY) && !hat_page_is_mapped(pp) && + hat_ismod(pp) && page_tryupgrade(pp)) { + /* + * Hold the vnode before releasing the page lock to + * prevent it from being freed and re-used by some + * other thread. + */ + VN_HOLD(vp); + page_unlock(pp); + + /* + * Want most powerful credentials we can get so + * use kcred. + */ + (void) VOP_PUTPAGE(vp, (offset_t)off, PAGESIZE, + B_ASYNC | B_FREE, kcred); + VN_RELE(vp); + } else { + page_unlock(pp); + } + } + + /* If unlocking, release physical memory */ + if (flags & KPD_LOCKED) { + pgcnt_t pages = btopr(len); + if ((kpd->kp_flags & KPD_NO_ANON) == 0) + atomic_add_long(&anon_segkp_pages_locked, -pages); + page_unresv(pages); + } + return (0); +} + +/* + * Insert the kpd in the hash table. + */ +static void +segkp_insert(struct seg *seg, struct segkp_data *kpd) +{ + struct segkp_segdata *kpsd = (struct segkp_segdata *)seg->s_data; + int index; + + /* + * Insert the kpd based on the address that will be returned + * via segkp_release. + */ + index = SEGKP_HASH(stom(kpd->kp_base, kpd->kp_flags)); + mutex_enter(&segkp_lock); + kpd->kp_next = kpsd->kpsd_hash[index]; + kpsd->kpsd_hash[index] = kpd; + mutex_exit(&segkp_lock); +} + +/* + * Remove kpd from the hash table. + */ +static void +segkp_delete(struct seg *seg, struct segkp_data *kpd) +{ + struct segkp_segdata *kpsd = (struct segkp_segdata *)seg->s_data; + struct segkp_data **kpp; + int index; + + ASSERT(MUTEX_HELD(&segkp_lock)); + + index = SEGKP_HASH(stom(kpd->kp_base, kpd->kp_flags)); + for (kpp = &kpsd->kpsd_hash[index]; + *kpp != NULL; kpp = &((*kpp)->kp_next)) { + if (*kpp == kpd) { + *kpp = kpd->kp_next; + return; + } + } + panic("segkp_delete: unable to find element to delete"); + /*NOTREACHED*/ +} + +/* + * Find the kpd associated with a vaddr. + * + * Most of the callers of segkp_find will pass the vaddr that + * hashes to the desired index, but there are cases where + * this is not true in which case we have to (potentially) scan + * the whole table looking for it. This should be very rare + * (e.g. a segkp_fault(F_INVAL) on an address somewhere in the + * middle of the segkp_data region). + */ +static struct segkp_data * +segkp_find(struct seg *seg, caddr_t vaddr) +{ + struct segkp_segdata *kpsd = (struct segkp_segdata *)seg->s_data; + struct segkp_data *kpd; + int i; + int stop; + + i = stop = SEGKP_HASH(vaddr); + mutex_enter(&segkp_lock); + do { + for (kpd = kpsd->kpsd_hash[i]; kpd != NULL; + kpd = kpd->kp_next) { + if (vaddr >= kpd->kp_base && + vaddr < kpd->kp_base + kpd->kp_len) { + mutex_exit(&segkp_lock); + return (kpd); + } + } + if (--i < 0) + i = SEGKP_HASHSZ - 1; /* Wrap */ + } while (i != stop); + mutex_exit(&segkp_lock); + return (NULL); /* Not found */ +} + +/* + * returns size of swappable area. 
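+ * That is, the length of the mapped (non-redzone) part of the segkp
+ * resource containing v -- SEGKP_MAPLEN(kp_len, kp_flags) -- or 0 if v
+ * does not lie within any segkp resource.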
+ */ +size_t +swapsize(caddr_t v) +{ + struct segkp_data *kpd; + + if ((kpd = segkp_find(segkp, v)) != NULL) + return (SEGKP_MAPLEN(kpd->kp_len, kpd->kp_flags)); + else + return (NULL); +} + +/* + * Dump out all the active segkp pages + */ +static void +segkp_dump(struct seg *seg) +{ + int i; + struct segkp_data *kpd; + struct segkp_segdata *kpsd = (struct segkp_segdata *)seg->s_data; + + for (i = 0; i < SEGKP_HASHSZ; i++) { + for (kpd = kpsd->kpsd_hash[i]; + kpd != NULL; kpd = kpd->kp_next) { + pfn_t pfn; + caddr_t addr; + caddr_t eaddr; + + addr = kpd->kp_base; + eaddr = addr + kpd->kp_len; + while (addr < eaddr) { + ASSERT(seg->s_as == &kas); + pfn = hat_getpfnum(seg->s_as->a_hat, addr); + if (pfn != PFN_INVALID) + dump_addpage(seg->s_as, addr, pfn); + addr += PAGESIZE; + dump_timeleft = dump_timeout; + } + } + } +} + +/*ARGSUSED*/ +static int +segkp_pagelock(struct seg *seg, caddr_t addr, size_t len, + struct page ***ppp, enum lock_type type, enum seg_rw rw) +{ + return (ENOTSUP); +} + +/*ARGSUSED*/ +static int +segkp_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp) +{ + return (ENODEV); +} + +/*ARGSUSED*/ +static lgrp_mem_policy_info_t * +segkp_getpolicy(struct seg *seg, caddr_t addr) +{ + return (NULL); +} + +#include <sys/mem_config.h> + +/*ARGSUSED*/ +static void +segkp_mem_config_post_add(void *arg, pgcnt_t delta_pages) +{} + +/* + * During memory delete, turn off caches so that pages are not held. + * A better solution may be to unlock the pages while they are + * in the cache so that they may be collected naturally. + */ + +/*ARGSUSED*/ +static int +segkp_mem_config_pre_del(void *arg, pgcnt_t delta_pages) +{ + atomic_add_32(&segkp_indel, 1); + segkp_cache_free(); + return (0); +} + +/*ARGSUSED*/ +static void +segkp_mem_config_post_del(void *arg, pgcnt_t delta_pages, int cancelled) +{ + atomic_add_32(&segkp_indel, -1); +} + +static kphysm_setup_vector_t segkp_mem_config_vec = { + KPHYSM_SETUP_VECTOR_VERSION, + segkp_mem_config_post_add, + segkp_mem_config_pre_del, + segkp_mem_config_post_del, +}; + +static void +segkpinit_mem_config(struct seg *seg) +{ + int ret; + + ret = kphysm_setup_func_register(&segkp_mem_config_vec, (void *)seg); + ASSERT(ret == 0); +} diff --git a/usr/src/uts/common/vm/seg_kp.h b/usr/src/uts/common/vm/seg_kp.h new file mode 100644 index 0000000000..64fa883cc9 --- /dev/null +++ b/usr/src/uts/common/vm/seg_kp.h @@ -0,0 +1,165 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _VM_SEG_KP_H +#define _VM_SEG_KP_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * segkp (as in kernel pageable) is a segment driver that supports allocation + * of page-aligned variable size of vm resources. + * + * Each vm resource represents a page-aligned range of virtual addresses. + * The caller may specify whether the resource should include a redzone, + * be locked down, or be zero initialized. + */ + +#include <vm/seg.h> +#include <sys/vmem.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +/* + * Private information per overall segkp segment (as opposed + * to per resource within segment). There are as many anon slots + * allocated as there there are pages in the segment. + */ +struct segkp_segdata { + struct anon_hdr *kpsd_anon; /* anon structs */ + vmem_t *kpsd_arena; /* virtual memory descriptor */ + struct segkp_data **kpsd_hash; /* Hash table for lookups */ +}; + +#define SEGKP_VMEM(seg) (((struct segkp_segdata *)(seg)->s_data)->kpsd_arena) + +/* + * A hash table is used to aid in the lookup of a kpd's based on vaddr. + * Since the heaviest use of segkp occurs from segkp_*get and segkp_*release, + * the hashing is based on the vaddr used by these routines. + */ +#define SEGKP_HASHSZ 256 /* power of two */ +#define SEGKP_HASHMASK (SEGKP_HASHSZ - 1) +#define SEGKP_HASH(vaddr) \ + ((int)(((uintptr_t)vaddr >> PAGESHIFT) & SEGKP_HASHMASK)) + +struct segkp_data { + kmutex_t kp_lock; /* per resource lock */ + caddr_t kp_base; /* starting addr of chunk */ + size_t kp_len; /* # of bytes */ + uint_t kp_flags; /* state info */ + int kp_cookie; /* index into cache array */ + ulong_t kp_anon_idx; /* index into main anon array */ + /* in segkp_segdata */ + struct anon_hdr *kp_anon; /* anon structs */ + struct segkp_data *kp_next; /* ptr to next in hash chain */ +}; + +/* + * Flag bits + * + */ +#define KPD_ZERO 0x01 /* initialize resource with 0 */ +#define KPD_LOCKED 0x02 /* resources locked */ +#define KPD_NO_ANON 0x04 /* no swap resources required */ +#define KPD_HASREDZONE 0x08 /* include a redzone */ +#define KPD_NOWAIT 0x10 /* do not wait for res. if unavail. */ +#define KPD_WRITEDIRTY 0x20 /* dirty pages should be flushed */ +#define KPD_HASAMP 0x40 /* anon_hdr managed by caller */ + +/* + * A cache of segkp elements may be created via segkp_cache_init(). + * The elements on the freelist all have the same len and flags value. + * The cookie passed to the client is an index into the freelist array. + */ +struct segkp_cache { + int kpf_max; /* max # of elements allowed */ + int kpf_count; /* current no. of elments */ + int kpf_inuse; /* list inuse */ + uint_t kpf_flags; /* seg_kp flag value */ + size_t kpf_len; /* len of resource */ + struct seg *kpf_seg; /* segment */ + struct segkp_data *kpf_list; /* list of kpd's */ +}; +#define SEGKP_MAX_CACHE 4 /* Number of caches maintained */ + +/* + * Define redzone, and stack_to_memory macros. + * The redzone is PAGESIZE bytes. + */ +#ifdef STACK_GROWTH_DOWN +#define KPD_REDZONE(kpd) (0) +#define stom(v, flags) (((flags) & KPD_HASREDZONE) ? (v) + PAGESIZE : (v)) + +#else /* STACK_GROWTH_DOWN */ + +#define KPD_REDZONE(kpd) (btop(kpd->kp_len) - 1) +#define stom(v) (v) +#endif /* STACK_GROWTH_DOWN */ + +#define SEGKP_MAPLEN(len, flags) \ + (((flags) & KPD_HASREDZONE) ? (len) - PAGESIZE : (len)) + +extern struct seg *segkp; +/* If segkp becomes more than one seg this test will need changing. 
*/ +#define SEG_IS_SEGKP(SEG) ((SEG) == segkp) + +/* + * Public routine declarations not part of the segment ops vector go here. + */ +int segkp_create(struct seg *seg); +caddr_t segkp_get(struct seg *seg, size_t len, uint_t flags); +void segkp_release(struct seg *seg, caddr_t vaddr); +void * segkp_cache_init(struct seg *seg, int maxsize, size_t len, + uint_t flags); +void segkp_cache_free(); +caddr_t segkp_cache_get(void *cookie); +int segkp_map_red(void); +void segkp_unmap_red(void); +size_t swapsize(caddr_t v); + +/* Special currently only used by schedctl. */ +struct anon_map; /* Make the compiler happy about the next line. */ +caddr_t segkp_get_withanonmap(struct seg *, size_t, uint_t, struct anon_map *); + +/* + * We allow explicit calls to segkp_fault, even though it's part + * of the segkp ops vector. + */ +faultcode_t segkp_fault(struct hat *hat, struct seg *seg, caddr_t addr, + size_t len, enum fault_type type, enum seg_rw rw); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_SEG_KP_H */ diff --git a/usr/src/uts/common/vm/seg_kpm.c b/usr/src/uts/common/vm/seg_kpm.c new file mode 100644 index 0000000000..73b7dbe94c --- /dev/null +++ b/usr/src/uts/common/vm/seg_kpm.c @@ -0,0 +1,323 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Kernel Physical Mapping (kpm) segment driver (segkpm). + * + * This driver delivers along with the hat_kpm* interfaces an alternative + * mechanism for kernel mappings within the 64-bit Solaris operating system, + * which allows the mapping of all physical memory into the kernel address + * space at once. This is feasible in 64 bit kernels, e.g. for Ultrasparc II + * and beyond processors, since the available VA range is much larger than + * possible physical memory. Momentarily all physical memory is supported, + * that is represented by the list of memory segments (memsegs). + * + * Segkpm mappings have also very low overhead and large pages are used + * (when possible) to minimize the TLB and TSB footprint. It is also + * extentable for other than Sparc architectures (e.g. AMD64). Main + * advantage is the avoidance of the TLB-shootdown X-calls, which are + * normally needed when a kernel (global) mapping has to be removed. + * + * First example of a kernel facility that uses the segkpm mapping scheme + * is seg_map, where it is used as an alternative to hat_memload(). + * See also hat layer for more information about the hat_kpm* routines. + * The kpm facilty can be turned off at boot time (e.g. /etc/system). 
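+ *
+ * A typical client sequence, in sketch form only (it assumes a page pp
+ * that the caller already holds locked; see the segmap kpm support in
+ * seg_map.c for real usage of the hat_kpm* interface):
+ *
+ *	if (kpm_enable) {
+ *		caddr_t vaddr = hat_kpm_mapin(pp, NULL);
+ *		... access the page contents through vaddr ...
+ *		hat_kpm_mapout(pp, NULL, vaddr);
+ *	}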
+ */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/vnode.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/thread.h> +#include <sys/cpuvar.h> +#include <sys/bitmap.h> +#include <sys/atomic.h> + +#include <vm/seg_kmem.h> +#include <vm/seg_kpm.h> +#include <vm/hat.h> +#include <vm/as.h> +#include <vm/seg.h> +#include <vm/page.h> + +/* + * Global kpm controls. + * See also platform and mmu specific controls. + * + * kpm_enable -- global on/off switch for segkpm. + * . Set by default on 64bit platforms that have kpm support. + * . Will be disabled from platform layer if not supported. + * . Can be disabled via /etc/system. + * + * kpm_smallpages -- use only regular/system pagesize for kpm mappings. + * . Can be useful for critical debugging of kpm clients. + * . Set to zero by default for platforms that support kpm large pages. + * The use of kpm large pages reduces the footprint of kpm meta data + * and has all the other advantages of using large pages (e.g TLB + * miss reduction). + * . Set by default for platforms that don't support kpm large pages or + * where large pages cannot be used for other reasons (e.g. there are + * only few full associative TLB entries available for large pages). + * + * segmap_kpm -- separate on/off switch for segmap using segkpm: + * . Set by default. + * . Will be disabled when kpm_enable is zero. + * . Will be disabled when MAXBSIZE != PAGESIZE. + * . Can be disabled via /etc/system. + * + */ +int kpm_enable = 1; +int kpm_smallpages = 0; +int segmap_kpm = 1; + +/* + * Private seg op routines. + */ +faultcode_t segkpm_fault(struct hat *hat, struct seg *seg, caddr_t addr, + size_t len, enum fault_type type, enum seg_rw rw); +static void segkpm_dump(struct seg *); +static void segkpm_badop(void); +static int segkpm_notsup(void); + +#define SEGKPM_BADOP(t) (t(*)())segkpm_badop +#define SEGKPM_NOTSUP (int(*)())segkpm_notsup + +static struct seg_ops segkpm_ops = { + SEGKPM_BADOP(int), /* dup */ + SEGKPM_BADOP(int), /* unmap */ + SEGKPM_BADOP(void), /* free */ + segkpm_fault, + SEGKPM_BADOP(int), /* faulta */ + SEGKPM_BADOP(int), /* setprot */ + SEGKPM_BADOP(int), /* checkprot */ + SEGKPM_BADOP(int), /* kluster */ + SEGKPM_BADOP(size_t), /* swapout */ + SEGKPM_BADOP(int), /* sync */ + SEGKPM_BADOP(size_t), /* incore */ + SEGKPM_BADOP(int), /* lockop */ + SEGKPM_BADOP(int), /* getprot */ + SEGKPM_BADOP(u_offset_t), /* getoffset */ + SEGKPM_BADOP(int), /* gettype */ + SEGKPM_BADOP(int), /* getvp */ + SEGKPM_BADOP(int), /* advise */ + segkpm_dump, /* dump */ + SEGKPM_NOTSUP, /* pagelock */ + SEGKPM_BADOP(int), /* setpgsz */ + SEGKPM_BADOP(int), /* getmemid */ +}; + +/* + * kpm_pgsz and kpm_pgshft are set by platform layer. 
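+ * (The remaining values follow from those two: kpm_pgoff is the offset
+ * mask kpm_pgsz - 1, kpmp2pshft is kpm_pgshft - PAGESHIFT, and kpmpnpgs
+ * is 1 << kpmp2pshft, i.e. the number of system pages per kpm page.)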
+ */ +size_t kpm_pgsz; /* kpm page size */ +uint_t kpm_pgshft; /* kpm page shift */ +u_offset_t kpm_pgoff; /* kpm page offset mask */ +uint_t kpmp2pshft; /* kpm page to page shift */ +pgcnt_t kpmpnpgs; /* how many pages per kpm page */ + + +#ifdef SEGKPM_SUPPORT + +int +segkpm_create(struct seg *seg, void *argsp) +{ + struct segkpm_data *skd; + struct segkpm_crargs *b = (struct segkpm_crargs *)argsp; + ushort_t *p; + int i, j; + + ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock)); + ASSERT(btokpmp(seg->s_size) >= 1 && + kpmpageoff((uintptr_t)seg->s_base) == 0 && + kpmpageoff((uintptr_t)seg->s_base + seg->s_size) == 0); + + skd = kmem_zalloc(sizeof (struct segkpm_data), KM_SLEEP); + + seg->s_data = (void *)skd; + seg->s_ops = &segkpm_ops; + skd->skd_prot = b->prot; + + /* + * (1) Segkpm virtual addresses are based on physical adresses. + * From this and in opposite to other segment drivers it is + * often required to allocate a page first to be able to + * calculate the final segkpm virtual address. + * (2) Page allocation is done by calling page_create_va(), + * one important input argument is a virtual address (also + * expressed by the "va" in the function name). This function + * is highly optimized to select the right page for an optimal + * processor and platform support (e.g. virtual addressed + * caches (VAC), physical addressed caches, NUMA). + * + * Because of (1) the approach is to generate a faked virtual + * address for calling page_create_va(). In order to exploit + * the abilities of (2), especially to utilize the cache + * hierarchy (3) and to avoid VAC alias conflicts (4) the + * selection has to be done carefully. For each virtual color + * a separate counter is provided (4). The count values are + * used for the utilization of all cache lines (3) and are + * corresponding to the cache bins. + */ + skd->skd_nvcolors = b->nvcolors; + + p = skd->skd_va_select = + kmem_zalloc(NCPU * b->nvcolors * sizeof (ushort_t), KM_SLEEP); + + for (i = 0; i < NCPU; i++) + for (j = 0; j < b->nvcolors; j++, p++) + *p = j; + + return (0); +} + +/* + * This routine is called via a machine specific fault handling + * routine. + */ +/* ARGSUSED */ +faultcode_t +segkpm_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len, + enum fault_type type, enum seg_rw rw) +{ + faultcode_t error; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + error = hat_kpm_fault(hat, addr); + + return (error); +} + +#define addr_to_vcolor(addr, vcolors) \ + ((int)(((uintptr_t)(addr) & ((vcolors << PAGESHIFT) - 1)) >> PAGESHIFT)) + +/* + * Create a virtual address that can be used for invocations of + * page_create_va. Goal is to utilize the cache hierarchy (round + * robin bins) and to select the right color for virtual indexed + * caches. It isn't exact since we also increment the bin counter + * when the caller uses VOP_GETPAGE and gets a hit in the page + * cache, but we keep the bins turning for cache distribution + * (see also segkpm_create block comment). + */ +caddr_t +segkpm_create_va(u_offset_t off) +{ + int vcolor; + ushort_t *p; + struct segkpm_data *skd = (struct segkpm_data *)segkpm->s_data; + int nvcolors = skd->skd_nvcolors; + caddr_t va; + + vcolor = (nvcolors > 1) ? addr_to_vcolor(off, nvcolors) : 0; + p = &skd->skd_va_select[(CPU->cpu_id * nvcolors) + vcolor]; + va = (caddr_t)ptob(*p); + + atomic_add_16(p, nvcolors); + + return (va); +} + +/* + * Unload mapping if the instance has an active kpm mapping. 
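+ * The page is locked SE_SHARED and kpme->kpe_page is then re-checked, so
+ * a mapout that raced with us (and cleared kpe_page) is detected before
+ * the mapping is touched; if the page lock cannot be obtained the lookup
+ * is simply retried.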
+ */ +void +segkpm_mapout_validkpme(struct kpme *kpme) +{ + caddr_t vaddr; + page_t *pp; + +retry: + if ((pp = kpme->kpe_page) == NULL) { + return; + } + + if (page_lock(pp, SE_SHARED, (kmutex_t *)NULL, P_RECLAIM) == 0) + goto retry; + + /* + * Check if segkpm mapping is not unloaded in the meantime + */ + if (kpme->kpe_page == NULL) { + page_unlock(pp); + return; + } + + vaddr = hat_kpm_page2va(pp, 1); + hat_kpm_mapout(pp, kpme, vaddr); + page_unlock(pp); +} + +static void +segkpm_badop() +{ + panic("segkpm_badop"); +} + +#else /* SEGKPM_SUPPORT */ + +/* segkpm stubs */ + +/*ARGSUSED*/ +int segkpm_create(struct seg *seg, void *argsp) { return (0); } + +/* ARGSUSED */ +faultcode_t +segkpm_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len, + enum fault_type type, enum seg_rw rw) +{ + return ((faultcode_t)0); +} + +/* ARGSUSED */ +caddr_t segkpm_create_va(u_offset_t off) { return (NULL); } + +/* ARGSUSED */ +void segkpm_mapout_validkpme(struct kpme *kpme) {} + +static void +segkpm_badop() {} + +#endif /* SEGKPM_SUPPORT */ + +static int +segkpm_notsup() +{ + return (ENOTSUP); +} + +/* + * segkpm pages are not dumped, so we just return + */ +/*ARGSUSED*/ +static void +segkpm_dump(struct seg *seg) +{} diff --git a/usr/src/uts/common/vm/seg_kpm.h b/usr/src/uts/common/vm/seg_kpm.h new file mode 100644 index 0000000000..0b766bbaf4 --- /dev/null +++ b/usr/src/uts/common/vm/seg_kpm.h @@ -0,0 +1,118 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _VM_SEG_KPM_H +#define _VM_SEG_KPM_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Kernel Physical Mapping (segkpm) segment driver. + */ + +#include <vm/kpm.h> + +struct segkpm_data { + ushort_t *skd_va_select; /* page_create_va kpm vaddr bin count */ + short skd_nvcolors; /* VAC colors to deal with */ + uchar_t skd_prot; +}; + +/* + * segkpm create needs some platform knowledge + */ +struct segkpm_crargs { + uint_t prot; + short nvcolors; /* VAC # virtual colors, 0 for PAC. */ +}; + +extern struct seg *segkpm; +extern u_offset_t kpm_pgoff; +extern size_t kpm_pgsz; +extern uint_t kpm_pgshft; +extern uint_t kpmp2pshft; +extern pgcnt_t kpmpnpgs; + +/* kpm controls */ +extern int kpm_enable; +extern int kpm_smallpages; +extern int segmap_kpm; + +/* + * kpm_page_t macros: + * . bytes (b) to kpm pages (kpmp) + * . pages (p) to kpm pages (kpmp), and back (with and without roundup) + * . kpm page offset in bytes + * . 
pages (p) modulo kpm pages (kpmp) + */ +#define btokpmp(x) ((x) >> kpm_pgshft) +#define btokpmpr(x) (((x) + kpm_pgoff) >> kpm_pgshft) +#define ptokpmp(x) ((x) >> kpmp2pshft) +#define ptokpmpr(x) (((x) + (kpmpnpgs - 1)) >> kpmp2pshft) +#define kpmptop(x) ((x) << kpmp2pshft) +#define kpmpageoff(x) ((x) & kpm_pgoff) +#define pmodkpmp(x) ((x) & (kpmpnpgs - 1)) + +#ifdef SEGKPM_SUPPORT + +#define IS_KPM_ADDR(addr) \ + ((addr) >= segkpm->s_base && (addr) < (segkpm->s_base + segkpm->s_size)) + +#define KPMPAGE_T_SZ \ + ((kpm_smallpages == 0) ? sizeof (kpm_page_t) : sizeof (kpm_spage_t)) + +#else /* SEGKPM_SUPPORT */ + +#define IS_KPM_ADDR(addr) (segkpm != NULL) +#define KPMPAGE_T_SZ (0) + +#endif /* SEGKPM_SUPPORT */ + +#ifdef _KERNEL +/* + * Public seg_kpm segment operations. + */ +extern int segkpm_create(struct seg *, void *); +extern faultcode_t segkpm_fault(struct hat *, struct seg *, caddr_t, + size_t, enum fault_type, enum seg_rw); + +/* + * Public seg_kpm interfaces. + */ +extern caddr_t segkpm_create_va(u_offset_t); +extern void segkpm_mapout_validkpme(struct kpme *); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_SEG_KPM_H */ diff --git a/usr/src/uts/common/vm/seg_map.c b/usr/src/uts/common/vm/seg_map.c new file mode 100644 index 0000000000..d4b6a16ca4 --- /dev/null +++ b/usr/src/uts/common/vm/seg_map.c @@ -0,0 +1,2345 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * VM - generic vnode mapping segment. + * + * The segmap driver is used only by the kernel to get faster (than seg_vn) + * mappings [lower routine overhead; more persistent cache] to random + * vnode/offsets. Note than the kernel may (and does) use seg_vn as well. 
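+ *
+ * The canonical consumer pattern, in sketch form (roughly what file
+ * systems do for read(2); mapon, n, error and uio are illustrative
+ * locals, the segmap entry points are real):
+ *
+ *	mapon = off & MAXBOFFSET;
+ *	n = MIN(MAXBSIZE - mapon, uio->uio_resid);
+ *	base = segmap_getmapflt(segkmap, vp, off, n, 1, S_READ);
+ *	error = uiomove(base + mapon, n, UIO_READ, uio);
+ *	if (error == 0)
+ *		error = segmap_release(segkmap, base, 0);
+ *	else
+ *		(void) segmap_release(segkmap, base, 0);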
+ */ + +#include <sys/types.h> +#include <sys/t_lock.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/buf.h> +#include <sys/systm.h> +#include <sys/vnode.h> +#include <sys/mman.h> +#include <sys/errno.h> +#include <sys/cred.h> +#include <sys/kmem.h> +#include <sys/vtrace.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/thread.h> +#include <sys/dumphdr.h> +#include <sys/bitmap.h> +#include <sys/lgrp.h> + +#include <vm/seg_kmem.h> +#include <vm/hat.h> +#include <vm/as.h> +#include <vm/seg.h> +#include <vm/seg_kpm.h> +#include <vm/seg_map.h> +#include <vm/page.h> +#include <vm/pvn.h> +#include <vm/rm.h> + +/* + * Private seg op routines. + */ +static void segmap_free(struct seg *seg); +faultcode_t segmap_fault(struct hat *hat, struct seg *seg, caddr_t addr, + size_t len, enum fault_type type, enum seg_rw rw); +static faultcode_t segmap_faulta(struct seg *seg, caddr_t addr); +static int segmap_checkprot(struct seg *seg, caddr_t addr, size_t len, + uint_t prot); +static int segmap_kluster(struct seg *seg, caddr_t addr, ssize_t); +static int segmap_getprot(struct seg *seg, caddr_t addr, size_t len, + uint_t *protv); +static u_offset_t segmap_getoffset(struct seg *seg, caddr_t addr); +static int segmap_gettype(struct seg *seg, caddr_t addr); +static int segmap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp); +static void segmap_dump(struct seg *seg); +static int segmap_pagelock(struct seg *seg, caddr_t addr, size_t len, + struct page ***ppp, enum lock_type type, + enum seg_rw rw); +static void segmap_badop(void); +static int segmap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp); +static lgrp_mem_policy_info_t *segmap_getpolicy(struct seg *seg, + caddr_t addr); + +/* segkpm support */ +static caddr_t segmap_pagecreate_kpm(struct seg *, vnode_t *, u_offset_t, + struct smap *, enum seg_rw); +struct smap *get_smap_kpm(caddr_t, page_t **); + +#define SEGMAP_BADOP(t) (t(*)())segmap_badop + +static struct seg_ops segmap_ops = { + SEGMAP_BADOP(int), /* dup */ + SEGMAP_BADOP(int), /* unmap */ + segmap_free, + segmap_fault, + segmap_faulta, + SEGMAP_BADOP(int), /* setprot */ + segmap_checkprot, + segmap_kluster, + SEGMAP_BADOP(size_t), /* swapout */ + SEGMAP_BADOP(int), /* sync */ + SEGMAP_BADOP(size_t), /* incore */ + SEGMAP_BADOP(int), /* lockop */ + segmap_getprot, + segmap_getoffset, + segmap_gettype, + segmap_getvp, + SEGMAP_BADOP(int), /* advise */ + segmap_dump, + segmap_pagelock, /* pagelock */ + SEGMAP_BADOP(int), /* setpgsz */ + segmap_getmemid, /* getmemid */ + segmap_getpolicy, /* getpolicy */ +}; + +/* + * Private segmap routines. + */ +static void segmap_unlock(struct hat *hat, struct seg *seg, caddr_t addr, + size_t len, enum seg_rw rw, struct smap *smp); +static void segmap_smapadd(struct smap *smp); +static struct smap *segmap_hashin(struct smap *smp, struct vnode *vp, + u_offset_t off, int hashid); +static void segmap_hashout(struct smap *smp); + + +/* + * Statistics for segmap operations. + * + * No explicit locking to protect these stats. 
+ */ +struct segmapcnt segmapcnt = { + { "fault", KSTAT_DATA_ULONG }, + { "faulta", KSTAT_DATA_ULONG }, + { "getmap", KSTAT_DATA_ULONG }, + { "get_use", KSTAT_DATA_ULONG }, + { "get_reclaim", KSTAT_DATA_ULONG }, + { "get_reuse", KSTAT_DATA_ULONG }, + { "get_unused", KSTAT_DATA_ULONG }, + { "get_nofree", KSTAT_DATA_ULONG }, + { "rel_async", KSTAT_DATA_ULONG }, + { "rel_write", KSTAT_DATA_ULONG }, + { "rel_free", KSTAT_DATA_ULONG }, + { "rel_abort", KSTAT_DATA_ULONG }, + { "rel_dontneed", KSTAT_DATA_ULONG }, + { "release", KSTAT_DATA_ULONG }, + { "pagecreate", KSTAT_DATA_ULONG }, + { "free_notfree", KSTAT_DATA_ULONG }, + { "free_dirty", KSTAT_DATA_ULONG }, + { "free", KSTAT_DATA_ULONG }, + { "stolen", KSTAT_DATA_ULONG }, + { "get_nomtx", KSTAT_DATA_ULONG } +}; + +kstat_named_t *segmapcnt_ptr = (kstat_named_t *)&segmapcnt; +uint_t segmapcnt_ndata = sizeof (segmapcnt) / sizeof (kstat_named_t); + +/* + * Return number of map pages in segment. + */ +#define MAP_PAGES(seg) ((seg)->s_size >> MAXBSHIFT) + +/* + * Translate addr into smap number within segment. + */ +#define MAP_PAGE(seg, addr) (((addr) - (seg)->s_base) >> MAXBSHIFT) + +/* + * Translate addr in seg into struct smap pointer. + */ +#define GET_SMAP(seg, addr) \ + &(((struct segmap_data *)((seg)->s_data))->smd_sm[MAP_PAGE(seg, addr)]) + +/* + * Bit in map (16 bit bitmap). + */ +#define SMAP_BIT_MASK(bitindex) (1 << ((bitindex) & 0xf)) + +static int smd_colormsk = 0; +static int smd_ncolor = 0; +static int smd_nfree = 0; +static int smd_freemsk = 0; +#ifdef DEBUG +static int *colors_used; +#endif +static struct smap *smd_smap; +static struct smaphash *smd_hash; +#ifdef SEGMAP_HASHSTATS +static unsigned int *smd_hash_len; +#endif +static struct smfree *smd_free; +static ulong_t smd_hashmsk = 0; + +#define SEGMAP_MAXCOLOR 2 +#define SEGMAP_CACHE_PAD 64 + +union segmap_cpu { + struct { + uint32_t scpu_free_ndx[SEGMAP_MAXCOLOR]; + struct smap *scpu_last_smap; + ulong_t scpu_getmap; + ulong_t scpu_release; + ulong_t scpu_get_reclaim; + ulong_t scpu_fault; + ulong_t scpu_pagecreate; + ulong_t scpu_get_reuse; + } scpu; + char scpu_pad[SEGMAP_CACHE_PAD]; +}; +static union segmap_cpu *smd_cpu; + +/* + * There are three locks in seg_map: + * - per freelist mutexes + * - per hashchain mutexes + * - per smap mutexes + * + * The lock ordering is to get the smap mutex to lock down the slot + * first then the hash lock (for hash in/out (vp, off) list) or the + * freelist lock to put the slot back on the free list. + * + * The hash search is done by only holding the hashchain lock, when a wanted + * slot is found, we drop the hashchain lock then lock the slot so there + * is no overlapping of hashchain and smap locks. After the slot is + * locked, we verify again if the slot is still what we are looking + * for. + * + * Allocation of a free slot is done by holding the freelist lock, + * then locking the smap slot at the head of the freelist. This is + * in reversed lock order so mutex_tryenter() is used. + * + * The smap lock protects all fields in smap structure except for + * the link fields for hash/free lists which are protected by + * hashchain and freelist locks. 
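+ *
+ * The allocation path in sketch form (cf. get_free_smp below):
+ *
+ *	mutex_enter(&allocq->smq_mtx);
+ *	walk the queue until mutex_tryenter(SMAPMTX(smp)) succeeds
+ *	    (tryenter, never enter, because this is the reverse order);
+ *	take smp off the freelist;
+ *	mutex_exit(&allocq->smq_mtx);
+ *	return smp with SMAPMTX(smp) still held;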
+ */ + +#define SHASHMTX(hashid) (&smd_hash[hashid].sh_mtx) + +#define SMP2SMF(smp) (&smd_free[(smp - smd_smap) & smd_freemsk]) +#define SMP2SMF_NDX(smp) (ushort_t)((smp - smd_smap) & smd_freemsk) + +#define SMAPMTX(smp) (&smp->sm_mtx) + +#define SMAP_HASHFUNC(vp, off, hashid) \ + { \ + hashid = ((((uintptr_t)(vp) >> 6) + ((uintptr_t)(vp) >> 3) + \ + ((off) >> MAXBSHIFT)) & smd_hashmsk); \ + } + +/* + * The most frequently updated kstat counters are kept in the + * per cpu array to avoid hot cache blocks. The update function + * sums the cpu local counters to update the global counters. + */ + +/* ARGSUSED */ +int +segmap_kstat_update(kstat_t *ksp, int rw) +{ + int i; + ulong_t getmap, release, get_reclaim; + ulong_t fault, pagecreate, get_reuse; + + if (rw == KSTAT_WRITE) + return (EACCES); + getmap = release = get_reclaim = (ulong_t)0; + fault = pagecreate = get_reuse = (ulong_t)0; + for (i = 0; i < max_ncpus; i++) { + getmap += smd_cpu[i].scpu.scpu_getmap; + release += smd_cpu[i].scpu.scpu_release; + get_reclaim += smd_cpu[i].scpu.scpu_get_reclaim; + fault += smd_cpu[i].scpu.scpu_fault; + pagecreate += smd_cpu[i].scpu.scpu_pagecreate; + get_reuse += smd_cpu[i].scpu.scpu_get_reuse; + } + segmapcnt.smp_getmap.value.ul = getmap; + segmapcnt.smp_release.value.ul = release; + segmapcnt.smp_get_reclaim.value.ul = get_reclaim; + segmapcnt.smp_fault.value.ul = fault; + segmapcnt.smp_pagecreate.value.ul = pagecreate; + segmapcnt.smp_get_reuse.value.ul = get_reuse; + return (0); +} + +int +segmap_create(struct seg *seg, void *argsp) +{ + struct segmap_data *smd; + struct smap *smp; + struct smfree *sm; + struct segmap_crargs *a = (struct segmap_crargs *)argsp; + struct smaphash *shashp; + union segmap_cpu *scpu; + long i, npages; + size_t hashsz; + uint_t nfreelist; + extern void prefetch_smap_w(void *); + extern int max_ncpus; + + ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock)); + + if (((uintptr_t)seg->s_base | seg->s_size) & MAXBOFFSET) { + panic("segkmap not MAXBSIZE aligned"); + /*NOTREACHED*/ + } + + smd = kmem_zalloc(sizeof (struct segmap_data), KM_SLEEP); + + seg->s_data = (void *)smd; + seg->s_ops = &segmap_ops; + smd->smd_prot = a->prot; + + /* + * Scale the number of smap freelists to be + * proportional to max_ncpus * number of virtual colors. + * The caller can over-ride this scaling by providing + * a non-zero a->nfreelist argument. + */ + nfreelist = a->nfreelist; + if (nfreelist == 0) + nfreelist = max_ncpus; + else if (nfreelist < 0 || nfreelist > 4 * max_ncpus) { + cmn_err(CE_WARN, "segmap_create: nfreelist out of range " + "%d, using %d", nfreelist, max_ncpus); + nfreelist = max_ncpus; + } + if (nfreelist & (nfreelist - 1)) { + /* round up nfreelist to the next power of two. */ + nfreelist = 1 << (highbit(nfreelist)); + } + + /* + * Get the number of virtual colors - must be a power of 2. + */ + if (a->shmsize) + smd_ncolor = a->shmsize >> MAXBSHIFT; + else + smd_ncolor = 1; + ASSERT((smd_ncolor & (smd_ncolor - 1)) == 0); + ASSERT(smd_ncolor <= SEGMAP_MAXCOLOR); + smd_colormsk = smd_ncolor - 1; + smd->smd_nfree = smd_nfree = smd_ncolor * nfreelist; + smd_freemsk = smd_nfree - 1; + + /* + * Allocate and initialize the freelist headers. + * Note that sm_freeq[1] starts out as the release queue. This + * is known when the smap structures are initialized below. 
+ */ + smd_free = smd->smd_free = + kmem_zalloc(smd_nfree * sizeof (struct smfree), KM_SLEEP); + for (i = 0; i < smd_nfree; i++) { + sm = &smd->smd_free[i]; + mutex_init(&sm->sm_freeq[0].smq_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&sm->sm_freeq[1].smq_mtx, NULL, MUTEX_DEFAULT, NULL); + sm->sm_allocq = &sm->sm_freeq[0]; + sm->sm_releq = &sm->sm_freeq[1]; + } + + /* + * Allocate and initialize the smap hash chain headers. + * Compute hash size rounding down to the next power of two. + */ + npages = MAP_PAGES(seg); + smd->smd_npages = npages; + hashsz = npages / SMAP_HASHAVELEN; + hashsz = 1 << (highbit(hashsz)-1); + smd_hashmsk = hashsz - 1; + smd_hash = smd->smd_hash = + kmem_alloc(hashsz * sizeof (struct smaphash), KM_SLEEP); +#ifdef SEGMAP_HASHSTATS + smd_hash_len = + kmem_zalloc(hashsz * sizeof (unsigned int), KM_SLEEP); +#endif + for (i = 0, shashp = smd_hash; i < hashsz; i++, shashp++) { + shashp->sh_hash_list = NULL; + mutex_init(&shashp->sh_mtx, NULL, MUTEX_DEFAULT, NULL); + } + + /* + * Allocate and initialize the smap structures. + * Link all slots onto the appropriate freelist. + * The smap array is large enough to affect boot time + * on large systems, so use memory prefetching and only + * go through the array 1 time. Inline a optimized version + * of segmap_smapadd to add structures to freelists with + * knowledge that no locks are needed here. + */ + smd_smap = smd->smd_sm = + kmem_alloc(sizeof (struct smap) * npages, KM_SLEEP); + + for (smp = &smd->smd_sm[MAP_PAGES(seg) - 1]; + smp >= smd->smd_sm; smp--) { + struct smap *smpfreelist; + struct sm_freeq *releq; + + prefetch_smap_w((char *)smp); + + smp->sm_vp = NULL; + smp->sm_hash = NULL; + smp->sm_off = 0; + smp->sm_bitmap = 0; + smp->sm_refcnt = 0; + mutex_init(&smp->sm_mtx, NULL, MUTEX_DEFAULT, NULL); + smp->sm_free_ndx = SMP2SMF_NDX(smp); + + sm = SMP2SMF(smp); + releq = sm->sm_releq; + + smpfreelist = releq->smq_free; + if (smpfreelist == 0) { + releq->smq_free = smp->sm_next = smp->sm_prev = smp; + } else { + smp->sm_next = smpfreelist; + smp->sm_prev = smpfreelist->sm_prev; + smpfreelist->sm_prev = smp; + smp->sm_prev->sm_next = smp; + releq->smq_free = smp->sm_next; + } + + /* + * sm_flag = 0 (no SM_QNDX_ZERO) implies smap on sm_freeq[1] + */ + smp->sm_flags = 0; + +#ifdef SEGKPM_SUPPORT + /* + * Due to the fragile prefetch loop no + * separate function is used here. + */ + smp->sm_kpme_next = NULL; + smp->sm_kpme_prev = NULL; + smp->sm_kpme_page = NULL; +#endif + } + + /* + * Allocate the per color indices that distribute allocation + * requests over the free lists. Each cpu will have a private + * rotor index to spread the allocations even across the available + * smap freelists. Init the scpu_last_smap field to the first + * smap element so there is no need to check for NULL. + */ + smd_cpu = + kmem_zalloc(sizeof (union segmap_cpu) * max_ncpus, KM_SLEEP); + for (i = 0, scpu = smd_cpu; i < max_ncpus; i++, scpu++) { + int j; + for (j = 0; j < smd_ncolor; j++) + scpu->scpu.scpu_free_ndx[j] = j; + scpu->scpu.scpu_last_smap = smd_smap; + } + +#ifdef DEBUG + /* + * Keep track of which colors are used more often. + */ + colors_used = kmem_zalloc(smd_nfree * sizeof (int), KM_SLEEP); +#endif /* DEBUG */ + + return (0); +} + +static void +segmap_free(seg) + struct seg *seg; +{ + ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock)); +} + +/* + * Do a F_SOFTUNLOCK call over the range requested. + * The range must have already been F_SOFTLOCK'ed. 
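+ * Translations are hat_unlock()ed, ref/mod bits are updated according
+ * to rw, the corresponding bits in sm_bitmap are cleared, and each page
+ * is then unlocked.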
+ */ +static void +segmap_unlock( + struct hat *hat, + struct seg *seg, + caddr_t addr, + size_t len, + enum seg_rw rw, + struct smap *smp) +{ + page_t *pp; + caddr_t adr; + u_offset_t off; + struct vnode *vp; + kmutex_t *smtx; + + ASSERT(smp->sm_refcnt > 0); + +#ifdef lint + seg = seg; +#endif + + if (segmap_kpm && IS_KPM_ADDR(addr)) { + + /* + * We're called only from segmap_fault and this was a + * NOP in case of a kpm based smap, so dangerous things + * must have happened in the meantime. Pages are prefaulted + * and locked in segmap_getmapflt and they will not be + * unlocked until segmap_release. + */ + panic("segmap_unlock: called with kpm addr %p", (void *)addr); + /*NOTREACHED*/ + } + + vp = smp->sm_vp; + off = smp->sm_off + (u_offset_t)((uintptr_t)addr & MAXBOFFSET); + + hat_unlock(hat, addr, P2ROUNDUP(len, PAGESIZE)); + for (adr = addr; adr < addr + len; adr += PAGESIZE, off += PAGESIZE) { + ushort_t bitmask; + + /* + * Use page_find() instead of page_lookup() to + * find the page since we know that it has + * "shared" lock. + */ + pp = page_find(vp, off); + if (pp == NULL) { + panic("segmap_unlock: page not found"); + /*NOTREACHED*/ + } + + if (rw == S_WRITE) { + hat_setrefmod(pp); + } else if (rw != S_OTHER) { + TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT, + "segmap_fault:pp %p vp %p offset %llx", + pp, vp, off); + hat_setref(pp); + } + + /* + * Clear bitmap, if the bit corresponding to "off" is set, + * since the page and translation are being unlocked. + */ + bitmask = SMAP_BIT_MASK((off - smp->sm_off) >> PAGESHIFT); + + /* + * Large Files: Following assertion is to verify + * the correctness of the cast to (int) above. + */ + ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX); + smtx = SMAPMTX(smp); + mutex_enter(smtx); + if (smp->sm_bitmap & bitmask) { + smp->sm_bitmap &= ~bitmask; + } + mutex_exit(smtx); + + page_unlock(pp); + } +} + +#define MAXPPB (MAXBSIZE/4096) /* assumes minimum page size of 4k */ + +/* + * This routine is called via a machine specific fault handling + * routine. It is also called by software routines wishing to + * lock or unlock a range of addresses. + * + * Note that this routine expects a page-aligned "addr". + */ +faultcode_t +segmap_fault( + struct hat *hat, + struct seg *seg, + caddr_t addr, + size_t len, + enum fault_type type, + enum seg_rw rw) +{ + struct segmap_data *smd = (struct segmap_data *)seg->s_data; + struct smap *smp; + page_t *pp, **ppp; + struct vnode *vp; + u_offset_t off; + page_t *pl[MAXPPB + 1]; + uint_t prot; + u_offset_t addroff; + caddr_t adr; + int err; + u_offset_t sm_off; + int hat_flag; + + if (segmap_kpm && IS_KPM_ADDR(addr)) { + int newpage; + kmutex_t *smtx; + + /* + * Pages are successfully prefaulted and locked in + * segmap_getmapflt and can't be unlocked until + * segmap_release. No hat mappings have to be locked + * and they also can't be unlocked as long as the + * caller owns an active kpm addr. + */ +#ifndef DEBUG + if (type != F_SOFTUNLOCK) + return (0); +#endif + + if ((smp = get_smap_kpm(addr, NULL)) == NULL) { + panic("segmap_fault: smap not found " + "for addr %p", (void *)addr); + /*NOTREACHED*/ + } + + smtx = SMAPMTX(smp); +#ifdef DEBUG + newpage = smp->sm_flags & SM_KPM_NEWPAGE; + if (newpage) { + cmn_err(CE_WARN, "segmap_fault: newpage? 
smp %p", + (void *)smp); + } + + if (type != F_SOFTUNLOCK) { + mutex_exit(smtx); + return (0); + } +#endif + mutex_exit(smtx); + vp = smp->sm_vp; + sm_off = smp->sm_off; + + if (vp == NULL) + return (FC_MAKE_ERR(EIO)); + + ASSERT(smp->sm_refcnt > 0); + + addroff = (u_offset_t)((uintptr_t)addr & MAXBOFFSET); + if (addroff + len > MAXBSIZE) + panic("segmap_fault: endaddr %p exceeds MAXBSIZE chunk", + (void *)(addr + len)); + + off = sm_off + addroff; + + pp = page_find(vp, off); + + if (pp == NULL) + panic("segmap_fault: softunlock page not found"); + + /* + * Set ref bit also here in case of S_OTHER to avoid the + * overhead of supporting other cases than F_SOFTUNLOCK + * with segkpm. We can do this because the underlying + * pages are locked anyway. + */ + if (rw == S_WRITE) { + hat_setrefmod(pp); + } else { + TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT, + "segmap_fault:pp %p vp %p offset %llx", + pp, vp, off); + hat_setref(pp); + } + + return (0); + } + + smd_cpu[CPU->cpu_seqid].scpu.scpu_fault++; + smp = GET_SMAP(seg, addr); + vp = smp->sm_vp; + sm_off = smp->sm_off; + + if (vp == NULL) + return (FC_MAKE_ERR(EIO)); + + ASSERT(smp->sm_refcnt > 0); + + addroff = (u_offset_t)((uintptr_t)addr & MAXBOFFSET); + if (addroff + len > MAXBSIZE) { + panic("segmap_fault: endaddr %p " + "exceeds MAXBSIZE chunk", (void *)(addr + len)); + /*NOTREACHED*/ + } + off = sm_off + addroff; + + /* + * First handle the easy stuff + */ + if (type == F_SOFTUNLOCK) { + segmap_unlock(hat, seg, addr, len, rw, smp); + return (0); + } + + TRACE_3(TR_FAC_VM, TR_SEGMAP_GETPAGE, + "segmap_getpage:seg %p addr %p vp %p", seg, addr, vp); + err = VOP_GETPAGE(vp, (offset_t)off, len, &prot, pl, MAXBSIZE, + seg, addr, rw, CRED()); + + if (err) + return (FC_MAKE_ERR(err)); + + prot &= smd->smd_prot; + + /* + * Handle all pages returned in the pl[] array. + * This loop is coded on the assumption that if + * there was no error from the VOP_GETPAGE routine, + * that the page list returned will contain all the + * needed pages for the vp from [off..off + len]. + */ + ppp = pl; + while ((pp = *ppp++) != NULL) { + u_offset_t poff; + ASSERT(pp->p_vnode == vp); + hat_flag = HAT_LOAD; + + /* + * Verify that the pages returned are within the range + * of this segmap region. Note that it is theoretically + * possible for pages outside this range to be returned, + * but it is not very likely. If we cannot use the + * page here, just release it and go on to the next one. + */ + if (pp->p_offset < sm_off || + pp->p_offset >= sm_off + MAXBSIZE) { + (void) page_release(pp, 1); + continue; + } + + ASSERT(hat == kas.a_hat); + poff = pp->p_offset; + adr = addr + (poff - off); + if (adr >= addr && adr < addr + len) { + hat_setref(pp); + TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT, + "segmap_fault:pp %p vp %p offset %llx", + pp, vp, poff); + if (type == F_SOFTLOCK) + hat_flag = HAT_LOAD_LOCK; + } + + /* + * Deal with VMODSORT pages here. If we know this is a write + * do the setmod now and allow write protection. + * As long as it's modified or not S_OTHER, remove write + * protection. With S_OTHER it's up to the FS to deal with this. + */ + if (IS_VMODSORT(vp)) { + if (rw == S_WRITE) + hat_setmod(pp); + else if (rw != S_OTHER && !hat_ismod(pp)) + prot &= ~PROT_WRITE; + } + + hat_memload(hat, adr, pp, prot, hat_flag); + if (hat_flag != HAT_LOAD_LOCK) + page_unlock(pp); + } + return (0); +} + +/* + * This routine is used to start I/O on pages asynchronously. 
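+ * (VOP_GETPAGE is called with a NULL page list and a zero plsz, so the
+ * file system starts the read and returns without waiting for it --
+ * which is what makes segmap_faulta asynchronous.)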
+ */ +static faultcode_t +segmap_faulta(struct seg *seg, caddr_t addr) +{ + struct smap *smp; + struct vnode *vp; + u_offset_t off; + int err; + + if (segmap_kpm && IS_KPM_ADDR(addr)) { + int newpage; + kmutex_t *smtx; + + /* + * Pages are successfully prefaulted and locked in + * segmap_getmapflt and can't be unlocked until + * segmap_release. No hat mappings have to be locked + * and they also can't be unlocked as long as the + * caller owns an active kpm addr. + */ +#ifdef DEBUG + if ((smp = get_smap_kpm(addr, NULL)) == NULL) { + panic("segmap_faulta: smap not found " + "for addr %p", (void *)addr); + /*NOTREACHED*/ + } + + smtx = SMAPMTX(smp); + newpage = smp->sm_flags & SM_KPM_NEWPAGE; + mutex_exit(smtx); + if (newpage) + cmn_err(CE_WARN, "segmap_faulta: newpage? smp %p", + (void *)smp); +#endif + return (0); + } + + segmapcnt.smp_faulta.value.ul++; + smp = GET_SMAP(seg, addr); + + ASSERT(smp->sm_refcnt > 0); + + vp = smp->sm_vp; + off = smp->sm_off; + + if (vp == NULL) { + cmn_err(CE_WARN, "segmap_faulta - no vp"); + return (FC_MAKE_ERR(EIO)); + } + + TRACE_3(TR_FAC_VM, TR_SEGMAP_GETPAGE, + "segmap_getpage:seg %p addr %p vp %p", seg, addr, vp); + + err = VOP_GETPAGE(vp, (offset_t)(off + ((offset_t)((uintptr_t)addr + & MAXBOFFSET))), PAGESIZE, (uint_t *)NULL, (page_t **)NULL, 0, + seg, addr, S_READ, CRED()); + + if (err) + return (FC_MAKE_ERR(err)); + return (0); +} + +/*ARGSUSED*/ +static int +segmap_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) +{ + struct segmap_data *smd = (struct segmap_data *)seg->s_data; + + ASSERT(seg->s_as && RW_LOCK_HELD(&seg->s_as->a_lock)); + + /* + * Need not acquire the segment lock since + * "smd_prot" is a read-only field. + */ + return (((smd->smd_prot & prot) != prot) ? EACCES : 0); +} + +static int +segmap_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv) +{ + struct segmap_data *smd = (struct segmap_data *)seg->s_data; + size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + if (pgno != 0) { + do + protv[--pgno] = smd->smd_prot; + while (pgno != 0); + } + return (0); +} + +static u_offset_t +segmap_getoffset(struct seg *seg, caddr_t addr) +{ + struct segmap_data *smd = (struct segmap_data *)seg->s_data; + + ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock)); + + return ((u_offset_t)smd->smd_sm->sm_off + (addr - seg->s_base)); +} + +/*ARGSUSED*/ +static int +segmap_gettype(struct seg *seg, caddr_t addr) +{ + ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock)); + + return (MAP_SHARED); +} + +/*ARGSUSED*/ +static int +segmap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp) +{ + struct segmap_data *smd = (struct segmap_data *)seg->s_data; + + ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock)); + + /* XXX - This doesn't make any sense */ + *vpp = smd->smd_sm->sm_vp; + return (0); +} + +/* + * Check to see if it makes sense to do kluster/read ahead to + * addr + delta relative to the mapping at addr. We assume here + * that delta is a signed PAGESIZE'd multiple (which can be negative). + * + * For segmap we always "approve" of this action from our standpoint. + */ +/*ARGSUSED*/ +static int +segmap_kluster(struct seg *seg, caddr_t addr, ssize_t delta) +{ + return (0); +} + +static void +segmap_badop() +{ + panic("segmap_badop"); + /*NOTREACHED*/ +} + +/* + * Special private segmap operations + */ + +/* + * Add smap to the appropriate free list. 
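+ * (The slot goes on the tail of the current release queue; if another
+ * thread is sleeping in get_free_smp waiting for a free slot, it is
+ * woken through the sm_want / sm_free_cv handshake.)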
+ */ +static void +segmap_smapadd(struct smap *smp) +{ + struct smfree *sm; + struct smap *smpfreelist; + struct sm_freeq *releq; + + ASSERT(MUTEX_HELD(SMAPMTX(smp))); + + if (smp->sm_refcnt != 0) { + panic("segmap_smapadd"); + /*NOTREACHED*/ + } + + sm = &smd_free[smp->sm_free_ndx]; + /* + * Add to the tail of the release queue + * Note that sm_releq and sm_allocq could toggle + * before we get the lock. This does not affect + * correctness as the 2 queues are only maintained + * to reduce lock pressure. + */ + releq = sm->sm_releq; + if (releq == &sm->sm_freeq[0]) + smp->sm_flags |= SM_QNDX_ZERO; + else + smp->sm_flags &= ~SM_QNDX_ZERO; + mutex_enter(&releq->smq_mtx); + smpfreelist = releq->smq_free; + if (smpfreelist == 0) { + int want; + + releq->smq_free = smp->sm_next = smp->sm_prev = smp; + /* + * Both queue mutexes held to set sm_want; + * snapshot the value before dropping releq mutex. + * If sm_want appears after the releq mutex is dropped, + * then the smap just freed is already gone. + */ + want = sm->sm_want; + mutex_exit(&releq->smq_mtx); + /* + * See if there was a waiter before dropping the releq mutex + * then recheck after obtaining sm_freeq[0] mutex as + * the another thread may have already signaled. + */ + if (want) { + mutex_enter(&sm->sm_freeq[0].smq_mtx); + if (sm->sm_want) + cv_signal(&sm->sm_free_cv); + mutex_exit(&sm->sm_freeq[0].smq_mtx); + } + } else { + smp->sm_next = smpfreelist; + smp->sm_prev = smpfreelist->sm_prev; + smpfreelist->sm_prev = smp; + smp->sm_prev->sm_next = smp; + mutex_exit(&releq->smq_mtx); + } +} + + +static struct smap * +segmap_hashin(struct smap *smp, struct vnode *vp, u_offset_t off, int hashid) +{ + struct smap **hpp; + struct smap *tmp; + kmutex_t *hmtx; + + ASSERT(MUTEX_HELD(SMAPMTX(smp))); + ASSERT(smp->sm_vp == NULL); + ASSERT(smp->sm_hash == NULL); + ASSERT(smp->sm_prev == NULL); + ASSERT(smp->sm_next == NULL); + ASSERT(hashid >= 0 && hashid <= smd_hashmsk); + + hmtx = SHASHMTX(hashid); + + mutex_enter(hmtx); + /* + * First we need to verify that no one has created a smp + * with (vp,off) as its tag before we us. + */ + for (tmp = smd_hash[hashid].sh_hash_list; + tmp != NULL; tmp = tmp->sm_hash) + if (tmp->sm_vp == vp && tmp->sm_off == off) + break; + + if (tmp == NULL) { + /* + * No one created one yet. + * + * Funniness here - we don't increment the ref count on the + * vnode * even though we have another pointer to it here. + * The reason for this is that we don't want the fact that + * a seg_map entry somewhere refers to a vnode to prevent the + * vnode * itself from going away. This is because this + * reference to the vnode is a "soft one". In the case where + * a mapping is being used by a rdwr [or directory routine?] + * there already has to be a non-zero ref count on the vnode. + * In the case where the vp has been freed and the the smap + * structure is on the free list, there are no pages in memory + * that can refer to the vnode. Thus even if we reuse the same + * vnode/smap structure for a vnode which has the same + * address but represents a different object, we are ok. 
+ */ + smp->sm_vp = vp; + smp->sm_off = off; + + hpp = &smd_hash[hashid].sh_hash_list; + smp->sm_hash = *hpp; + *hpp = smp; +#ifdef SEGMAP_HASHSTATS + smd_hash_len[hashid]++; +#endif + } + mutex_exit(hmtx); + + return (tmp); +} + +static void +segmap_hashout(struct smap *smp) +{ + struct smap **hpp, *hp; + struct vnode *vp; + kmutex_t *mtx; + int hashid; + u_offset_t off; + + ASSERT(MUTEX_HELD(SMAPMTX(smp))); + + vp = smp->sm_vp; + off = smp->sm_off; + + SMAP_HASHFUNC(vp, off, hashid); /* macro assigns hashid */ + mtx = SHASHMTX(hashid); + mutex_enter(mtx); + + hpp = &smd_hash[hashid].sh_hash_list; + for (;;) { + hp = *hpp; + if (hp == NULL) { + panic("segmap_hashout"); + /*NOTREACHED*/ + } + if (hp == smp) + break; + hpp = &hp->sm_hash; + } + + *hpp = smp->sm_hash; + smp->sm_hash = NULL; +#ifdef SEGMAP_HASHSTATS + smd_hash_len[hashid]--; +#endif + mutex_exit(mtx); + + smp->sm_vp = NULL; + smp->sm_off = (u_offset_t)0; + +} + +/* + * Attempt to free unmodified, unmapped, and non locked segmap + * pages. + */ +void +segmap_pagefree(struct vnode *vp, u_offset_t off) +{ + u_offset_t pgoff; + page_t *pp; + + for (pgoff = off; pgoff < off + MAXBSIZE; pgoff += PAGESIZE) { + + if ((pp = page_lookup_nowait(vp, pgoff, SE_EXCL)) == NULL) + continue; + + switch (page_release(pp, 1)) { + case PGREL_NOTREL: + segmapcnt.smp_free_notfree.value.ul++; + break; + case PGREL_MOD: + segmapcnt.smp_free_dirty.value.ul++; + break; + case PGREL_CLEAN: + segmapcnt.smp_free.value.ul++; + break; + } + } +} + +/* + * Locks held on entry: smap lock + * Locks held on exit : smap lock. + */ + +static void +grab_smp(struct smap *smp, page_t *pp) +{ + ASSERT(MUTEX_HELD(SMAPMTX(smp))); + ASSERT(smp->sm_refcnt == 0); + + if (smp->sm_vp != (struct vnode *)NULL) { + struct vnode *vp = smp->sm_vp; + u_offset_t off = smp->sm_off; + /* + * Destroy old vnode association and + * unload any hardware translations to + * the old object. + */ + smd_cpu[CPU->cpu_seqid].scpu.scpu_get_reuse++; + segmap_hashout(smp); + + /* + * This node is off freelist and hashlist, + * so there is no reason to drop/reacquire sm_mtx + * across calls to hat_unload. + */ + if (segmap_kpm) { + caddr_t vaddr; + int hat_unload_needed = 0; + + /* + * unload kpm mapping + */ + if (pp != NULL) { + vaddr = hat_kpm_page2va(pp, 1); + hat_kpm_mapout(pp, GET_KPME(smp), vaddr); + page_unlock(pp); + } + + /* + * Check if we have (also) the rare case of a + * non kpm mapping. + */ + if (smp->sm_flags & SM_NOTKPM_RELEASED) { + hat_unload_needed = 1; + smp->sm_flags &= ~SM_NOTKPM_RELEASED; + } + + if (hat_unload_needed) { + hat_unload(kas.a_hat, segkmap->s_base + + ((smp - smd_smap) * MAXBSIZE), + MAXBSIZE, HAT_UNLOAD); + } + + } else { + ASSERT(smp->sm_flags & SM_NOTKPM_RELEASED); + smp->sm_flags &= ~SM_NOTKPM_RELEASED; + hat_unload(kas.a_hat, segkmap->s_base + + ((smp - smd_smap) * MAXBSIZE), + MAXBSIZE, HAT_UNLOAD); + } + segmap_pagefree(vp, off); + } +} + +static struct smap * +get_free_smp(int free_ndx) +{ + struct smfree *sm; + kmutex_t *smtx; + struct smap *smp, *first; + struct sm_freeq *allocq, *releq; + struct kpme *kpme; + page_t *pp = NULL; + int end_ndx, page_locked = 0; + + end_ndx = free_ndx; + sm = &smd_free[free_ndx]; + +retry_queue: + allocq = sm->sm_allocq; + mutex_enter(&allocq->smq_mtx); + + if ((smp = allocq->smq_free) == NULL) { + +skip_queue: + /* + * The alloc list is empty or this queue is being skipped; + * first see if the allocq toggled. 
+ */ + if (sm->sm_allocq != allocq) { + /* queue changed */ + mutex_exit(&allocq->smq_mtx); + goto retry_queue; + } + releq = sm->sm_releq; + if (!mutex_tryenter(&releq->smq_mtx)) { + /* cannot get releq; a free smp may be there now */ + mutex_exit(&allocq->smq_mtx); + + /* + * This loop could spin forever if this thread has + * higher priority than the thread that is holding + * releq->smq_mtx. In order to force the other thread + * to run, we'll lock/unlock the mutex which is safe + * since we just unlocked the allocq mutex. + */ + mutex_enter(&releq->smq_mtx); + mutex_exit(&releq->smq_mtx); + goto retry_queue; + } + if (releq->smq_free == NULL) { + /* + * This freelist is empty. + * This should not happen unless clients + * are failing to release the segmap + * window after accessing the data. + * Before resorting to sleeping, try + * the next list of the same color. + */ + free_ndx = (free_ndx + smd_ncolor) & smd_freemsk; + if (free_ndx != end_ndx) { + mutex_exit(&releq->smq_mtx); + mutex_exit(&allocq->smq_mtx); + sm = &smd_free[free_ndx]; + goto retry_queue; + } + /* + * Tried all freelists of the same color once, + * wait on this list and hope something gets freed. + */ + segmapcnt.smp_get_nofree.value.ul++; + sm->sm_want++; + mutex_exit(&sm->sm_freeq[1].smq_mtx); + cv_wait(&sm->sm_free_cv, + &sm->sm_freeq[0].smq_mtx); + sm->sm_want--; + mutex_exit(&sm->sm_freeq[0].smq_mtx); + sm = &smd_free[free_ndx]; + goto retry_queue; + } else { + /* + * Something on the rele queue; flip the alloc + * and rele queues and retry. + */ + sm->sm_allocq = releq; + sm->sm_releq = allocq; + mutex_exit(&allocq->smq_mtx); + mutex_exit(&releq->smq_mtx); + if (page_locked) { + delay(hz >> 2); + page_locked = 0; + } + goto retry_queue; + } + } else { + /* + * Fastpath the case we get the smap mutex + * on the first try. + */ + first = smp; +next_smap: + smtx = SMAPMTX(smp); + if (!mutex_tryenter(smtx)) { + /* + * Another thread is trying to reclaim this slot. + * Skip to the next queue or smap. + */ + if ((smp = smp->sm_next) == first) { + goto skip_queue; + } else { + goto next_smap; + } + } else { + /* + * if kpme exists, get shared lock on the page + */ + if (segmap_kpm && smp->sm_vp != NULL) { + + kpme = GET_KPME(smp); + pp = kpme->kpe_page; + + if (pp != NULL) { + if (!page_trylock(pp, SE_SHARED)) { + smp = smp->sm_next; + mutex_exit(smtx); + page_locked = 1; + + pp = NULL; + + if (smp == first) { + goto skip_queue; + } else { + goto next_smap; + } + } else { + if (kpme->kpe_page == NULL) { + page_unlock(pp); + pp = NULL; + } + } + } + } + + /* + * At this point, we've selected smp. Remove smp + * from its freelist. If smp is the first one in + * the freelist, update the head of the freelist. + */ + if (first == smp) { + ASSERT(first == allocq->smq_free); + allocq->smq_free = smp->sm_next; + } + + /* + * if the head of the freelist still points to smp, + * then there are no more free smaps in that list. + */ + if (allocq->smq_free == smp) + /* + * Took the last one + */ + allocq->smq_free = NULL; + else { + smp->sm_prev->sm_next = smp->sm_next; + smp->sm_next->sm_prev = smp->sm_prev; + } + mutex_exit(&allocq->smq_mtx); + smp->sm_prev = smp->sm_next = NULL; + + /* + * if pp != NULL, pp must have been locked; + * grab_smp() unlocks pp. + */ + ASSERT((pp == NULL) || PAGE_LOCKED(pp)); + grab_smp(smp, pp); + /* return smp locked. 
*/ + ASSERT(SMAPMTX(smp) == smtx); + ASSERT(MUTEX_HELD(smtx)); + return (smp); + } + } +} + +/* + * Special public segmap operations + */ + +/* + * Create pages (without using VOP_GETPAGE) and load up tranlations to them. + * If softlock is TRUE, then set things up so that it looks like a call + * to segmap_fault with F_SOFTLOCK. + * + * Returns 1, if a page is created by calling page_create_va(), or 0 otherwise. + * + * All fields in the generic segment (struct seg) are considered to be + * read-only for "segmap" even though the kernel address space (kas) may + * not be locked, hence no lock is needed to access them. + */ +int +segmap_pagecreate(struct seg *seg, caddr_t addr, size_t len, int softlock) +{ + struct segmap_data *smd = (struct segmap_data *)seg->s_data; + page_t *pp; + u_offset_t off; + struct smap *smp; + struct vnode *vp; + caddr_t eaddr; + int newpage = 0; + uint_t prot; + kmutex_t *smtx; + int hat_flag; + + ASSERT(seg->s_as == &kas); + + if (segmap_kpm && IS_KPM_ADDR(addr)) { + /* + * Pages are successfully prefaulted and locked in + * segmap_getmapflt and can't be unlocked until + * segmap_release. The SM_KPM_NEWPAGE flag is set + * in segmap_pagecreate_kpm when new pages are created. + * and it is returned as "newpage" indication here. + */ + if ((smp = get_smap_kpm(addr, NULL)) == NULL) { + panic("segmap_pagecreate: smap not found " + "for addr %p", (void *)addr); + /*NOTREACHED*/ + } + + smtx = SMAPMTX(smp); + newpage = smp->sm_flags & SM_KPM_NEWPAGE; + smp->sm_flags &= ~SM_KPM_NEWPAGE; + mutex_exit(smtx); + + return (newpage); + } + + smd_cpu[CPU->cpu_seqid].scpu.scpu_pagecreate++; + + eaddr = addr + len; + addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); + + smp = GET_SMAP(seg, addr); + + /* + * We don't grab smp mutex here since we assume the smp + * has a refcnt set already which prevents the slot from + * changing its id. + */ + ASSERT(smp->sm_refcnt > 0); + + vp = smp->sm_vp; + off = smp->sm_off + ((u_offset_t)((uintptr_t)addr & MAXBOFFSET)); + prot = smd->smd_prot; + + for (; addr < eaddr; addr += PAGESIZE, off += PAGESIZE) { + hat_flag = HAT_LOAD; + pp = page_lookup(vp, off, SE_SHARED); + if (pp == NULL) { + ushort_t bitindex; + + if ((pp = page_create_va(vp, off, + PAGESIZE, PG_WAIT, seg, addr)) == NULL) { + panic("segmap_pagecreate: page_create failed"); + /*NOTREACHED*/ + } + newpage = 1; + page_io_unlock(pp); + + /* + * Since pages created here do not contain valid + * data until the caller writes into them, the + * "exclusive" lock will not be dropped to prevent + * other users from accessing the page. We also + * have to lock the translation to prevent a fault + * from occuring when the virtual address mapped by + * this page is written into. This is necessary to + * avoid a deadlock since we haven't dropped the + * "exclusive" lock. + */ + bitindex = (ushort_t)((off - smp->sm_off) >> PAGESHIFT); + + /* + * Large Files: The following assertion is to + * verify the cast above. 
+ */ + ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX); + smtx = SMAPMTX(smp); + mutex_enter(smtx); + smp->sm_bitmap |= SMAP_BIT_MASK(bitindex); + mutex_exit(smtx); + + hat_flag = HAT_LOAD_LOCK; + } else if (softlock) { + hat_flag = HAT_LOAD_LOCK; + } + + if (IS_VMODSORT(pp->p_vnode) && (prot & PROT_WRITE)) + hat_setmod(pp); + + hat_memload(kas.a_hat, addr, pp, prot, hat_flag); + + if (hat_flag != HAT_LOAD_LOCK) + page_unlock(pp); + + TRACE_5(TR_FAC_VM, TR_SEGMAP_PAGECREATE, + "segmap_pagecreate:seg %p addr %p pp %p vp %p offset %llx", + seg, addr, pp, vp, off); + } + + return (newpage); +} + +void +segmap_pageunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw) +{ + struct smap *smp; + ushort_t bitmask; + page_t *pp; + struct vnode *vp; + u_offset_t off; + caddr_t eaddr; + kmutex_t *smtx; + + ASSERT(seg->s_as == &kas); + + eaddr = addr + len; + addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); + + if (segmap_kpm && IS_KPM_ADDR(addr)) { + /* + * Pages are successfully prefaulted and locked in + * segmap_getmapflt and can't be unlocked until + * segmap_release, so no pages or hat mappings have + * to be unlocked at this point. + */ +#ifdef DEBUG + if ((smp = get_smap_kpm(addr, NULL)) == NULL) { + panic("segmap_pageunlock: smap not found " + "for addr %p", (void *)addr); + /*NOTREACHED*/ + } + + ASSERT(smp->sm_refcnt > 0); + mutex_exit(SMAPMTX(smp)); +#endif + return; + } + + smp = GET_SMAP(seg, addr); + smtx = SMAPMTX(smp); + + ASSERT(smp->sm_refcnt > 0); + + vp = smp->sm_vp; + off = smp->sm_off + ((u_offset_t)((uintptr_t)addr & MAXBOFFSET)); + + for (; addr < eaddr; addr += PAGESIZE, off += PAGESIZE) { + bitmask = SMAP_BIT_MASK((int)(off - smp->sm_off) >> PAGESHIFT); + + /* + * Large Files: Following assertion is to verify + * the correctness of the cast to (int) above. + */ + ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX); + + /* + * If the bit corresponding to "off" is set, + * clear this bit in the bitmap, unlock translations, + * and release the "exclusive" lock on the page. + */ + if (smp->sm_bitmap & bitmask) { + mutex_enter(smtx); + smp->sm_bitmap &= ~bitmask; + mutex_exit(smtx); + + hat_unlock(kas.a_hat, addr, PAGESIZE); + + /* + * Use page_find() instead of page_lookup() to + * find the page since we know that it has + * "exclusive" lock. + */ + pp = page_find(vp, off); + if (pp == NULL) { + panic("segmap_pageunlock: page not found"); + /*NOTREACHED*/ + } + if (rw == S_WRITE) { + hat_setrefmod(pp); + } else if (rw != S_OTHER) { + hat_setref(pp); + } + + page_unlock(pp); + } + } +} + +caddr_t +segmap_getmap(struct seg *seg, struct vnode *vp, u_offset_t off) +{ + return (segmap_getmapflt(seg, vp, off, MAXBSIZE, 0, S_OTHER)); +} + +/* + * This is the magic virtual address that offset 0 of an ELF + * file gets mapped to in user space. This is used to pick + * the vac color on the freelist. + */ +#define ELF_OFFZERO_VA (0x10000) +/* + * segmap_getmap allocates a MAXBSIZE big slot to map the vnode vp + * in the range <off, off + len). off doesn't need to be MAXBSIZE aligned. + * The return address is always MAXBSIZE aligned. + * + * If forcefault is nonzero and the MMU translations haven't yet been created, + * segmap_getmap will call segmap_fault(..., F_INVAL, rw) to create them. 
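+ *
+ * For illustration only (a rough sketch, not a definitive consumer;
+ * the flags passed to segmap_release() differ per file system): a
+ * read path that copies file data out to a uio typically looks like
+ *
+ *	base = segmap_getmapflt(segkmap, vp, uoff, n, 1, S_READ);
+ *	error = uiomove(base + (uoff & MAXBOFFSET), n, UIO_READ, uiop);
+ *	(void) segmap_release(segkmap, base, error ? 0 : SM_FREE);
+ *
+ * i.e. every successful getmap is paired with exactly one release.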
+ */ +caddr_t +segmap_getmapflt( + struct seg *seg, + struct vnode *vp, + u_offset_t off, + size_t len, + int forcefault, + enum seg_rw rw) +{ + struct smap *smp, *nsmp; + extern struct vnode *common_specvp(); + caddr_t baseaddr; /* MAXBSIZE aligned */ + u_offset_t baseoff; + int newslot; + caddr_t vaddr; + int color, hashid; + kmutex_t *hashmtx, *smapmtx; + struct smfree *sm; + page_t *pp; + struct kpme *kpme; + uint_t prot; + caddr_t base; + page_t *pl[MAXPPB + 1]; + int error; + int is_kpm = 1; + + ASSERT(seg->s_as == &kas); + ASSERT(seg == segkmap); + + baseoff = off & (offset_t)MAXBMASK; + if (off + len > baseoff + MAXBSIZE) { + panic("segmap_getmap bad len"); + /*NOTREACHED*/ + } + + /* + * If this is a block device we have to be sure to use the + * "common" block device vnode for the mapping. + */ + if (vp->v_type == VBLK) + vp = common_specvp(vp); + + smd_cpu[CPU->cpu_seqid].scpu.scpu_getmap++; + + if (segmap_kpm == 0 || + (forcefault == SM_PAGECREATE && rw != S_WRITE)) { + is_kpm = 0; + } + + SMAP_HASHFUNC(vp, off, hashid); /* macro assigns hashid */ + hashmtx = SHASHMTX(hashid); + +retry_hash: + mutex_enter(hashmtx); + for (smp = smd_hash[hashid].sh_hash_list; + smp != NULL; smp = smp->sm_hash) + if (smp->sm_vp == vp && smp->sm_off == baseoff) + break; + mutex_exit(hashmtx); + +vrfy_smp: + if (smp != NULL) { + + ASSERT(vp->v_count != 0); + + /* + * Get smap lock and recheck its tag. The hash lock + * is dropped since the hash is based on (vp, off) + * and (vp, off) won't change when we have smap mtx. + */ + smapmtx = SMAPMTX(smp); + mutex_enter(smapmtx); + if (smp->sm_vp != vp || smp->sm_off != baseoff) { + mutex_exit(smapmtx); + goto retry_hash; + } + + if (smp->sm_refcnt == 0) { + + smd_cpu[CPU->cpu_seqid].scpu.scpu_get_reclaim++; + + /* + * Could still be on the free list. However, this + * could also be an smp that is transitioning from + * the free list when we have too much contention + * for the smapmtx's. In this case, we have an + * unlocked smp that is not on the free list any + * longer, but still has a 0 refcnt. The only way + * to be sure is to check the freelist pointers. + * Since we now have the smapmtx, we are guaranteed + * that the (vp, off) won't change, so we are safe + * to reclaim it. get_free_smp() knows that this + * can happen, and it will check the refcnt. + */ + + if ((smp->sm_next != NULL)) { + struct sm_freeq *freeq; + + ASSERT(smp->sm_prev != NULL); + sm = &smd_free[smp->sm_free_ndx]; + + if (smp->sm_flags & SM_QNDX_ZERO) + freeq = &sm->sm_freeq[0]; + else + freeq = &sm->sm_freeq[1]; + + mutex_enter(&freeq->smq_mtx); + if (freeq->smq_free != smp) { + /* + * fastpath normal case + */ + smp->sm_prev->sm_next = smp->sm_next; + smp->sm_next->sm_prev = smp->sm_prev; + } else if (smp == smp->sm_next) { + /* + * Taking the last smap on freelist + */ + freeq->smq_free = NULL; + } else { + /* + * Reclaiming 1st smap on list + */ + freeq->smq_free = smp->sm_next; + smp->sm_prev->sm_next = smp->sm_next; + smp->sm_next->sm_prev = smp->sm_prev; + } + mutex_exit(&freeq->smq_mtx); + smp->sm_prev = smp->sm_next = NULL; + } else { + ASSERT(smp->sm_prev == NULL); + segmapcnt.smp_stolen.value.ul++; + } + + } else { + segmapcnt.smp_get_use.value.ul++; + } + smp->sm_refcnt++; /* another user */ + + /* + * We don't invoke segmap_fault via TLB miss, so we set ref + * and mod bits in advance. For S_OTHER we set them in + * segmap_fault F_SOFTUNLOCK. 
+ */ + if (is_kpm) { + if (rw == S_WRITE) { + smp->sm_flags |= SM_WRITE_DATA; + } else if (rw == S_READ) { + smp->sm_flags |= SM_READ_DATA; + } + } + mutex_exit(smapmtx); + + newslot = 0; + } else { + + uint32_t free_ndx, *free_ndxp; + union segmap_cpu *scpu; + + /* + * On a PAC machine or a machine with anti-alias + * hardware, smd_colormsk will be zero. + * + * On a VAC machine- pick color by offset in the file + * so we won't get VAC conflicts on elf files. + * On data files, color does not matter but we + * don't know what kind of file it is so we always + * pick color by offset. This causes color + * corresponding to file offset zero to be used more + * heavily. + */ + color = (baseoff >> MAXBSHIFT) & smd_colormsk; + scpu = smd_cpu+CPU->cpu_seqid; + free_ndxp = &scpu->scpu.scpu_free_ndx[color]; + free_ndx = (*free_ndxp += smd_ncolor) & smd_freemsk; +#ifdef DEBUG + colors_used[free_ndx]++; +#endif /* DEBUG */ + + /* + * Get a locked smp slot from the free list. + */ + smp = get_free_smp(free_ndx); + smapmtx = SMAPMTX(smp); + + ASSERT(smp->sm_vp == NULL); + + if ((nsmp = segmap_hashin(smp, vp, baseoff, hashid)) != NULL) { + /* + * Failed to hashin, there exists one now. + * Return the smp we just allocated. + */ + segmap_smapadd(smp); + mutex_exit(smapmtx); + + smp = nsmp; + goto vrfy_smp; + } + smp->sm_refcnt++; /* another user */ + + /* + * We don't invoke segmap_fault via TLB miss, so we set ref + * and mod bits in advance. For S_OTHER we set them in + * segmap_fault F_SOFTUNLOCK. + */ + if (is_kpm) { + if (rw == S_WRITE) { + smp->sm_flags |= SM_WRITE_DATA; + } else if (rw == S_READ) { + smp->sm_flags |= SM_READ_DATA; + } + } + mutex_exit(smapmtx); + + newslot = 1; + } + + if (!is_kpm) + goto use_segmap_range; + + /* + * Use segkpm + */ + ASSERT(PAGESIZE == MAXBSIZE); + + /* + * remember the last smp faulted on this cpu. + */ + (smd_cpu+CPU->cpu_seqid)->scpu.scpu_last_smap = smp; + + if (forcefault == SM_PAGECREATE) { + baseaddr = segmap_pagecreate_kpm(seg, vp, baseoff, smp, rw); + return (baseaddr); + } + + if (newslot == 0 && + (pp = GET_KPME(smp)->kpe_page) != NULL) { + + /* fastpath */ + switch (rw) { + case S_READ: + case S_WRITE: + if (page_trylock(pp, SE_SHARED)) { + if (PP_ISFREE(pp) || + !(pp->p_vnode == vp && + pp->p_offset == baseoff)) { + page_unlock(pp); + pp = page_lookup(vp, baseoff, + SE_SHARED); + } + } else { + pp = page_lookup(vp, baseoff, SE_SHARED); + } + + if (pp == NULL) { + ASSERT(GET_KPME(smp)->kpe_page == NULL); + break; + } + + if (rw == S_WRITE && + hat_page_getattr(pp, P_MOD | P_REF) != + (P_MOD | P_REF)) { + page_unlock(pp); + break; + } + + /* + * We have the p_selock as reader, grab_smp + * can't hit us, we have bumped the smap + * refcnt and hat_pageunload needs the + * p_selock exclusive. + */ + kpme = GET_KPME(smp); + if (kpme->kpe_page == pp) { + baseaddr = hat_kpm_page2va(pp, 0); + } else if (kpme->kpe_page == NULL) { + baseaddr = hat_kpm_mapin(pp, kpme); + } else { + panic("segmap_getmapflt: stale " + "kpme page, kpme %p", (void *)kpme); + /*NOTREACHED*/ + } + + /* + * We don't invoke segmap_fault via TLB miss, + * so we set ref and mod bits in advance. + * For S_OTHER and we set them in segmap_fault + * F_SOFTUNLOCK. 
+ */ + if (rw == S_READ && !hat_isref(pp)) + hat_setref(pp); + + return (baseaddr); + default: + break; + } + } + + base = segkpm_create_va(baseoff); + error = VOP_GETPAGE(vp, (offset_t)baseoff, len, &prot, pl, MAXBSIZE, + seg, base, rw, CRED()); + + pp = pl[0]; + if (error || pp == NULL) { + /* + * Use segmap address slot and let segmap_fault deal + * with the error cases. There is no error return + * possible here. + */ + goto use_segmap_range; + } + + ASSERT(pl[1] == NULL); + + /* + * When prot is not returned w/ PROT_ALL the returned pages + * are not backed by fs blocks. For most of the segmap users + * this is no problem, they don't write to the pages in the + * same request and therefore don't rely on a following + * trap driven segmap_fault. With SM_LOCKPROTO users it + * is more secure to use segkmap adresses to allow + * protection segmap_fault's. + */ + if (prot != PROT_ALL && forcefault == SM_LOCKPROTO) { + /* + * Use segmap address slot and let segmap_fault + * do the error return. + */ + ASSERT(rw != S_WRITE); + ASSERT(PAGE_LOCKED(pp)); + page_unlock(pp); + forcefault = 0; + goto use_segmap_range; + } + + /* + * We have the p_selock as reader, grab_smp can't hit us, we + * have bumped the smap refcnt and hat_pageunload needs the + * p_selock exclusive. + */ + kpme = GET_KPME(smp); + if (kpme->kpe_page == pp) { + baseaddr = hat_kpm_page2va(pp, 0); + } else if (kpme->kpe_page == NULL) { + baseaddr = hat_kpm_mapin(pp, kpme); + } else { + panic("segmap_getmapflt: stale kpme page after " + "VOP_GETPAGE, kpme %p", (void *)kpme); + /*NOTREACHED*/ + } + + smd_cpu[CPU->cpu_seqid].scpu.scpu_fault++; + + return (baseaddr); + + +use_segmap_range: + baseaddr = seg->s_base + ((smp - smd_smap) * MAXBSIZE); + TRACE_4(TR_FAC_VM, TR_SEGMAP_GETMAP, + "segmap_getmap:seg %p addr %p vp %p offset %llx", + seg, baseaddr, vp, baseoff); + + /* + * Prefault the translations + */ + vaddr = baseaddr + (off - baseoff); + if (forcefault && (newslot || !hat_probe(kas.a_hat, vaddr))) { + + caddr_t pgaddr = (caddr_t)((uintptr_t)vaddr & + (uintptr_t)PAGEMASK); + + (void) segmap_fault(kas.a_hat, seg, pgaddr, + (vaddr + len - pgaddr + PAGESIZE - 1) & (uintptr_t)PAGEMASK, + F_INVAL, rw); + } + + return (baseaddr); +} + +int +segmap_release(struct seg *seg, caddr_t addr, uint_t flags) +{ + struct smap *smp; + int error; + int bflags = 0; + struct vnode *vp; + u_offset_t offset; + kmutex_t *smtx; + int is_kpm = 0; + page_t *pp; + + if (segmap_kpm && IS_KPM_ADDR(addr)) { + + if (((uintptr_t)addr & MAXBOFFSET) != 0) { + panic("segmap_release: addr %p not " + "MAXBSIZE aligned", (void *)addr); + /*NOTREACHED*/ + } + + if ((smp = get_smap_kpm(addr, &pp)) == NULL) { + panic("segmap_release: smap not found " + "for addr %p", (void *)addr); + /*NOTREACHED*/ + } + + TRACE_3(TR_FAC_VM, TR_SEGMAP_RELMAP, + "segmap_relmap:seg %p addr %p smp %p", + seg, addr, smp); + + smtx = SMAPMTX(smp); + + /* + * For compatibilty reasons segmap_pagecreate_kpm sets this + * flag to allow a following segmap_pagecreate to return + * this as "newpage" flag. When segmap_pagecreate is not + * called at all we clear it now. 
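+ *
+ * In other words the flag never outlives one getmap/release cycle
+ * of the slot:
+ *
+ *	segmap_pagecreate_kpm()	sets SM_KPM_NEWPAGE
+ *	segmap_pagecreate()	consumes it and reports it as its
+ *				"newpage" return value
+ *	segmap_release()	clears it here when segmap_pagecreate()
+ *				was skipped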
+ */ + smp->sm_flags &= ~SM_KPM_NEWPAGE; + is_kpm = 1; + if (smp->sm_flags & SM_WRITE_DATA) { + hat_setrefmod(pp); + } else if (smp->sm_flags & SM_READ_DATA) { + hat_setref(pp); + } + } else { + if (addr < seg->s_base || addr >= seg->s_base + seg->s_size || + ((uintptr_t)addr & MAXBOFFSET) != 0) { + panic("segmap_release: bad addr %p", (void *)addr); + /*NOTREACHED*/ + } + smp = GET_SMAP(seg, addr); + + TRACE_3(TR_FAC_VM, TR_SEGMAP_RELMAP, + "segmap_relmap:seg %p addr %p smp %p", + seg, addr, smp); + + smtx = SMAPMTX(smp); + mutex_enter(smtx); + smp->sm_flags |= SM_NOTKPM_RELEASED; + } + + ASSERT(smp->sm_refcnt > 0); + + /* + * Need to call VOP_PUTPAGE() if any flags (except SM_DONTNEED) + * are set. + */ + if ((flags & ~SM_DONTNEED) != 0) { + if (flags & SM_WRITE) + segmapcnt.smp_rel_write.value.ul++; + if (flags & SM_ASYNC) { + bflags |= B_ASYNC; + segmapcnt.smp_rel_async.value.ul++; + } + if (flags & SM_INVAL) { + bflags |= B_INVAL; + segmapcnt.smp_rel_abort.value.ul++; + } + if (flags & SM_DESTROY) { + bflags |= (B_INVAL|B_TRUNC); + segmapcnt.smp_rel_abort.value.ul++; + } + if (smp->sm_refcnt == 1) { + /* + * We only bother doing the FREE and DONTNEED flags + * if no one else is still referencing this mapping. + */ + if (flags & SM_FREE) { + bflags |= B_FREE; + segmapcnt.smp_rel_free.value.ul++; + } + if (flags & SM_DONTNEED) { + bflags |= B_DONTNEED; + segmapcnt.smp_rel_dontneed.value.ul++; + } + } + } else { + smd_cpu[CPU->cpu_seqid].scpu.scpu_release++; + } + + vp = smp->sm_vp; + offset = smp->sm_off; + + if (--smp->sm_refcnt == 0) { + + if (is_kpm) { + smp->sm_flags &= ~(SM_WRITE_DATA | SM_READ_DATA); + } + if (flags & (SM_INVAL|SM_DESTROY)) { + segmap_hashout(smp); /* remove map info */ + if (is_kpm) { + hat_kpm_mapout(pp, GET_KPME(smp), addr); + if (smp->sm_flags & SM_NOTKPM_RELEASED) { + smp->sm_flags &= ~SM_NOTKPM_RELEASED; + hat_unload(kas.a_hat, addr, MAXBSIZE, + HAT_UNLOAD); + } + + } else { + if (segmap_kpm) + segkpm_mapout_validkpme(GET_KPME(smp)); + + smp->sm_flags &= ~SM_NOTKPM_RELEASED; + hat_unload(kas.a_hat, addr, MAXBSIZE, + HAT_UNLOAD); + } + } + segmap_smapadd(smp); /* add to free list */ + } + + mutex_exit(smtx); + + if (is_kpm) + page_unlock(pp); + /* + * Now invoke VOP_PUTPAGE() if any flags (except SM_DONTNEED) + * are set. + */ + if ((flags & ~SM_DONTNEED) != 0) { + error = VOP_PUTPAGE(vp, offset, MAXBSIZE, + bflags, CRED()); + } else { + error = 0; + } + + return (error); +} + +/* + * Dump the pages belonging to this segmap segment. + */ +static void +segmap_dump(struct seg *seg) +{ + struct segmap_data *smd; + struct smap *smp, *smp_end; + page_t *pp; + pfn_t pfn; + u_offset_t off; + caddr_t addr; + + smd = (struct segmap_data *)seg->s_data; + addr = seg->s_base; + for (smp = smd->smd_sm, smp_end = smp + smd->smd_npages; + smp < smp_end; smp++) { + + if (smp->sm_refcnt) { + for (off = 0; off < MAXBSIZE; off += PAGESIZE) { + int we_own_it = 0; + + /* + * If pp == NULL, the page either does + * not exist or is exclusively locked. + * So determine if it exists before + * searching for it. 
+ */ + if ((pp = page_lookup_nowait(smp->sm_vp, + smp->sm_off + off, SE_SHARED))) + we_own_it = 1; + else + pp = page_exists(smp->sm_vp, + smp->sm_off + off); + + if (pp) { + pfn = page_pptonum(pp); + dump_addpage(seg->s_as, + addr + off, pfn); + if (we_own_it) + page_unlock(pp); + } + dump_timeleft = dump_timeout; + } + } + addr += MAXBSIZE; + } +} + +/*ARGSUSED*/ +static int +segmap_pagelock(struct seg *seg, caddr_t addr, size_t len, + struct page ***ppp, enum lock_type type, enum seg_rw rw) +{ + return (ENOTSUP); +} + +static int +segmap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp) +{ + struct segmap_data *smd = (struct segmap_data *)seg->s_data; + + memidp->val[0] = (uintptr_t)smd->smd_sm->sm_vp; + memidp->val[1] = smd->smd_sm->sm_off + (uintptr_t)(addr - seg->s_base); + return (0); +} + +/*ARGSUSED*/ +static lgrp_mem_policy_info_t * +segmap_getpolicy(struct seg *seg, caddr_t addr) +{ + return (NULL); +} + + +#ifdef SEGKPM_SUPPORT + +/* + * segkpm support routines + */ + +static caddr_t +segmap_pagecreate_kpm(struct seg *seg, vnode_t *vp, u_offset_t off, + struct smap *smp, enum seg_rw rw) +{ + caddr_t base; + page_t *pp; + int newpage = 0; + struct kpme *kpme; + + ASSERT(smp->sm_refcnt > 0); + + if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) { + kmutex_t *smtx; + + base = segkpm_create_va(off); + + if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT, + seg, base)) == NULL) { + panic("segmap_pagecreate_kpm: " + "page_create failed"); + /*NOTREACHED*/ + } + + newpage = 1; + page_io_unlock(pp); + ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX); + + /* + * Mark this here until the following segmap_pagecreate + * or segmap_release. + */ + smtx = SMAPMTX(smp); + mutex_enter(smtx); + smp->sm_flags |= SM_KPM_NEWPAGE; + mutex_exit(smtx); + } + + kpme = GET_KPME(smp); + if (!newpage && kpme->kpe_page == pp) + base = hat_kpm_page2va(pp, 0); + else + base = hat_kpm_mapin(pp, kpme); + + /* + * FS code may decide not to call segmap_pagecreate and we + * don't invoke segmap_fault via TLB miss, so we have to set + * ref and mod bits in advance. + */ + if (rw == S_WRITE) { + hat_setrefmod(pp); + } else { + ASSERT(rw == S_READ); + hat_setref(pp); + } + + smd_cpu[CPU->cpu_seqid].scpu.scpu_pagecreate++; + + return (base); +} + +/* + * Find the smap structure corresponding to the + * KPM addr and return it locked. + */ +struct smap * +get_smap_kpm(caddr_t addr, page_t **ppp) +{ + struct smap *smp; + struct vnode *vp; + u_offset_t offset; + caddr_t baseaddr = (caddr_t)((uintptr_t)addr & MAXBMASK); + int hashid; + kmutex_t *hashmtx; + page_t *pp; + union segmap_cpu *scpu; + + pp = hat_kpm_vaddr2page(baseaddr); + + ASSERT(pp && !PP_ISFREE(pp)); + ASSERT(PAGE_LOCKED(pp)); + ASSERT(((uintptr_t)pp->p_offset & MAXBOFFSET) == 0); + + vp = pp->p_vnode; + offset = pp->p_offset; + ASSERT(vp != NULL); + + /* + * Assume the last smap used on this cpu is the one needed. + */ + scpu = smd_cpu+CPU->cpu_seqid; + smp = scpu->scpu.scpu_last_smap; + mutex_enter(&smp->sm_mtx); + if (smp->sm_vp == vp && smp->sm_off == offset) { + ASSERT(smp->sm_refcnt > 0); + } else { + /* + * Assumption wrong, find the smap on the hash chain. 
+ */ + mutex_exit(&smp->sm_mtx); + SMAP_HASHFUNC(vp, offset, hashid); /* macro assigns hashid */ + hashmtx = SHASHMTX(hashid); + + mutex_enter(hashmtx); + smp = smd_hash[hashid].sh_hash_list; + for (; smp != NULL; smp = smp->sm_hash) { + if (smp->sm_vp == vp && smp->sm_off == offset) + break; + } + mutex_exit(hashmtx); + if (smp) { + mutex_enter(&smp->sm_mtx); + ASSERT(smp->sm_vp == vp && smp->sm_off == offset); + } + } + + if (ppp) + *ppp = smp ? pp : NULL; + + return (smp); +} + +#else /* SEGKPM_SUPPORT */ + +/* segkpm stubs */ + +/*ARGSUSED*/ +static caddr_t +segmap_pagecreate_kpm(struct seg *seg, vnode_t *vp, u_offset_t off, + struct smap *smp, enum seg_rw rw) +{ + return (NULL); +} + +/*ARGSUSED*/ +struct smap * +get_smap_kpm(caddr_t addr, page_t **ppp) +{ + return (NULL); +} + +#endif /* SEGKPM_SUPPORT */ diff --git a/usr/src/uts/common/vm/seg_map.h b/usr/src/uts/common/vm/seg_map.h new file mode 100644 index 0000000000..339dabe674 --- /dev/null +++ b/usr/src/uts/common/vm/seg_map.h @@ -0,0 +1,294 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#ifndef _VM_SEG_MAP_H +#define _VM_SEG_MAP_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * When segmap is created it is possible to program its behavior, + * using the create args [needed for performance reasons]. + * Segmap creates n lists of pages. + * For VAC machines, there will be at least one free list + * per color. If more than one free list per color is needed, + * set nfreelist as needed. + * + * For PAC machines, it will be treated as VAC with only one + * color- every page is of the same color. Again, set nfreelist + * to get more than one free list. + */ +struct segmap_crargs { + uint_t prot; + uint_t shmsize; /* shm_alignment for VAC, 0 for PAC. */ + uint_t nfreelist; /* number of freelist per color, >= 1 */ +}; + +#include <vm/kpm.h> + +/* + * Each smap struct represents a MAXBSIZE sized mapping to the + * <sm_vp, sm_off> given in the structure. The location of the + * the structure in the array gives the virtual address of the + * mapping. Structure rearranged for 64bit sm_off. 
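+ *
+ * For example (illustrative arithmetic only): with MAXBSIZE at 8k,
+ * slot i of the smap array covers the kernel virtual range starting at
+ *
+ *	segkmap->s_base + i * MAXBSIZE
+ *
+ * which is the same expression, seg->s_base + ((smp - smd_smap) *
+ * MAXBSIZE), that grab_smp() and segmap_getmapflt() compute in
+ * seg_map.c; the GET_SMAP() macro there performs the inverse
+ * address-to-slot lookup.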
+ */ +struct smap { + kmutex_t sm_mtx; /* protect non-list fields */ + struct vnode *sm_vp; /* vnode pointer (if mapped) */ + struct smap *sm_hash; /* hash pointer */ + struct smap *sm_next; /* next pointer */ + struct smap *sm_prev; /* previous pointer */ + u_offset_t sm_off; /* file offset for mapping */ + ushort_t sm_bitmap; /* bit map for locked translations */ + ushort_t sm_refcnt; /* reference count for uses */ + ushort_t sm_flags; /* smap flags */ + ushort_t sm_free_ndx; /* freelist */ +#ifdef SEGKPM_SUPPORT + struct kpme sm_kpme; /* segkpm */ +#endif +}; + +#ifdef SEGKPM_SUPPORT +#define GET_KPME(smp) (&(smp)->sm_kpme) +#define sm_kpme_next sm_kpme.kpe_next +#define sm_kpme_prev sm_kpme.kpe_prev +#define sm_kpme_page sm_kpme.kpe_page +#else +#define GET_KPME(smp) ((struct kpme *)NULL) +#endif + +/* sm_flags */ +#define SM_KPM_NEWPAGE 0x00000001 /* page created in segmap_getmapft */ +#define SM_NOTKPM_RELEASED 0x00000002 /* released smap not in segkpm mode */ +#define SM_QNDX_ZERO 0x00000004 /* on the index 0 freelist */ +#define SM_READ_DATA 0x00000010 /* page created for read */ +#define SM_WRITE_DATA 0x00000020 /* page created for write */ + +/* + * Multiple smap free lists are maintained so that allocations + * will scale with cpu count. Each free list is made up of 2 queues + * so that allocations and deallocations can proceed concurrently. + * Each queue structure is padded to 64 bytes to avoid false sharing. + */ +#define SM_FREEQ_PAD (64 - sizeof (struct smap *) - sizeof (kmutex_t)) +struct sm_freeq { + struct smap *smq_free; /* points into freelist */ + kmutex_t smq_mtx; /* protects smq_free */ + char smq_pad[SM_FREEQ_PAD]; +}; + +struct smfree { + struct sm_freeq sm_freeq[2]; /* alloc and release queues */ + struct sm_freeq *sm_allocq; /* current allocq */ + struct sm_freeq *sm_releq; /* current releq */ + kcondvar_t sm_free_cv; + ushort_t sm_want; /* someone wants a slot of this color */ +}; + +/* + * Cached smaps are kept on hash chains to enable fast reclaim lookups. + */ +struct smaphash { + kmutex_t sh_mtx; /* protects this hash chain */ + struct smap *sh_hash_list; /* start of hash chain */ +}; + +/* + * (Semi) private data maintained by the segmap driver per SEGMENT mapping + * All fields in segmap_data are read-only after the segment is created. + * + */ + +struct segmap_data { + struct smap *smd_sm; /* array of smap structures */ + long smd_npages; /* size of smap array */ + struct smfree *smd_free; /* ptr to freelist header array */ + struct smaphash *smd_hash; /* ptr to hash header array */ + int smd_nfree; /* number of free lists */ + uchar_t smd_prot; /* protections for all smap's */ +}; + +/* + * Statistics for segmap operations. + * + * No explicit locking to protect these stats. 
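+ * They are updated with plain unlocked increments, e.g.
+ *
+ *	segmapcnt.smp_get_nofree.value.ul++;
+ *
+ * in get_free_smp(), so an occasional lost update is possible and
+ * tolerable for counters that are only statistics.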
+ */
+struct segmapcnt {
+ kstat_named_t smp_fault; /* number of segmap_faults */
+ kstat_named_t smp_faulta; /* number of segmap_faultas */
+ kstat_named_t smp_getmap; /* number of segmap_getmaps */
+ kstat_named_t smp_get_use; /* getmaps that reuse existing map */
+ kstat_named_t smp_get_reclaim; /* getmaps that do a reclaim */
+ kstat_named_t smp_get_reuse; /* getmaps that reuse a slot */
+ kstat_named_t smp_get_unused; /* getmaps that reuse existing map */
+ kstat_named_t smp_get_nofree; /* getmaps with no free slots */
+ kstat_named_t smp_rel_async; /* releases that are async */
+ kstat_named_t smp_rel_write; /* releases that write */
+ kstat_named_t smp_rel_free; /* releases that free */
+ kstat_named_t smp_rel_abort; /* releases that abort */
+ kstat_named_t smp_rel_dontneed; /* releases with dontneed set */
+ kstat_named_t smp_release; /* releases with no other action */
+ kstat_named_t smp_pagecreate; /* pagecreates */
+ kstat_named_t smp_free_notfree; /* pages not freed in */
+ /* segmap_pagefree */
+ kstat_named_t smp_free_dirty; /* dirty pages freed */
+ /* in segmap_pagefree */
+ kstat_named_t smp_free; /* clean pages freed in */
+ /* segmap_pagefree */
+ kstat_named_t smp_stolen; /* segmap_getmapflt() stole */
+ /* from get_free_smp() */
+ kstat_named_t smp_get_nomtx; /* free smaps but no mutex */
+};
+
+/*
+ * These are flags used on release. Some of these might get handled
+ * by segment operations needed for msync (when we figure them out).
+ * SM_ASYNC modifies SM_WRITE. SM_DONTNEED modifies SM_FREE. SM_FREE
+ * and SM_INVAL as well as SM_FREE and SM_DESTROY are mutually exclusive.
+ * SM_DESTROY behaves like SM_INVAL but also forces the pages to be
+ * destroyed -- this prevents them from being written to the backing
+ * store.
+ */
+#define SM_WRITE 0x01 /* write back the pages upon release */
+#define SM_ASYNC 0x02 /* do the write asynchronously */
+#define SM_FREE 0x04 /* put pages back on free list */
+#define SM_INVAL 0x08 /* invalidate page (no caching) */
+#define SM_DONTNEED 0x10 /* less likely to be needed soon */
+#define SM_DESTROY 0x20 /* invalidate page, don't write back */
+
+/*
+ * These are the forcefault flags used on getmapflt.
+ *
+ * The original semantics were extended to allow using the segkpm mapping
+ * scheme w/o a major segmap interface change for MAXBSIZE == PAGESIZE
+ * (which is required to enable segkpm for MAXBSIZE > PAGESIZE).
+ * Most segmap consumers needn't be changed at all or only need to
+ * be changed slightly to take advantage of segkpm. Because the segkpm
+ * virtual address is based on the physical address of a page, a page is
+ * required to determine the virtual address (return value). Pages mapped
+ * with segkpm are always at least read locked and are hence protected
+ * from pageout or fsflush from segmap_getmap until segmap_release. This
+ * implies that the segkpm mappings are locked within this period too.
+ * No trap driven segmap_fault's are possible in segkpm mode.
+ *
+ * The following combinations of "forcefault" and "rw" allow segkpm mode.
+ * (1) SM_FAULT, S_READ
+ * (2) SM_FAULT, S_WRITE
+ * (3) SM_PAGECREATE, S_WRITE
+ * (4) SM_LOCKPROTO, {S_READ, S_WRITE, S_OTHER}
+ *
+ * The regular additional operations (which come in pairs in most cases):
+ * . segmap_pagecreate/segmap_pageunlock
+ * . segmap_fault(F_SOFTLOCK)/segmap_fault(F_SOFTUNLOCK)
+ *
+ * are mostly a no-op in segkpm mode with the following exceptions:
+ * . 
The "newpage" return value of segmap_pagecreate is still supported + * for zeroout operations needed on newly created pages. + * + * . segmap_fault() must follow when a error could be expected in + * the VOP_GETPAGE. In segkpm mode this error is recognized in + * segmap_getmapflt and returned from the following segmap_fault() + * call. The "hole" optimization (read only after first VOP_GETPAGE + * mapping in segmap_getmapflt followed by a trap driven protection + * fault and a second VOP_GETPAGE via segmap_fault) cannot be used. + * + * . segmap_fault(F_SOFTUNLOCK) must follow when segmap_getmapflt was + * called w/ (SM_LOCKPROTO, S_OTHER). S_WRITE has to be applied, when + * the page should be marked "dirty". Otherwise the page is not + * written to the backing store later (as mentioned above, no page + * or protection faults are possible in segkpm mode). Caller cannot + * use only S_OTHER and rely on a protection fault to force the page + * to become dirty. + * + * . The segmap_pagecreate parameter softlock is ignored, pages and + * mappings are locked anyway. + * + * SM_LOCKPROTO is used in the fbio layer and some special segmap consumers. + */ +#define SM_PAGECREATE 0x00 /* create page in segkpm mode, no I/O */ +#define SM_FAULT 0x01 /* fault in page if necessary */ +#define SM_LOCKPROTO 0x02 /* lock/unlock protocol used */ + +#define MAXBSHIFT 13 /* log2(MAXBSIZE) */ + +#define MAXBOFFSET (MAXBSIZE - 1) +#define MAXBMASK (~MAXBOFFSET) + +/* + * SMAP_HASHAVELEN is the average length desired for this chain, from + * which the size of the smd_hash table is derived at segment create time. + * SMAP_HASHVPSHIFT is defined so that 1 << SMAP_HASHVPSHIFT is the + * approximate size of a vnode struct. + */ +#define SMAP_HASHAVELEN 4 +#define SMAP_HASHVPSHIFT 6 + + +#ifdef _KERNEL +/* + * The kernel generic mapping segment. + */ +extern struct seg *segkmap; + +/* + * Public seg_map segment operations. + */ +extern int segmap_create(struct seg *, void *); +extern int segmap_pagecreate(struct seg *, caddr_t, size_t, int); +extern void segmap_pageunlock(struct seg *, caddr_t, size_t, enum seg_rw); +extern faultcode_t segmap_fault(struct hat *, struct seg *, caddr_t, size_t, + enum fault_type, enum seg_rw); +extern caddr_t segmap_getmap(struct seg *, struct vnode *, u_offset_t); +extern caddr_t segmap_getmapflt(struct seg *, struct vnode *, u_offset_t, + size_t, int, enum seg_rw); +extern int segmap_release(struct seg *, caddr_t, uint_t); +extern void segmap_flush(struct seg *, struct vnode *); +extern void segmap_inval(struct seg *, struct vnode *, u_offset_t); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_SEG_MAP_H */ diff --git a/usr/src/uts/common/vm/seg_spt.c b/usr/src/uts/common/vm/seg_spt.c new file mode 100644 index 0000000000..a97719ad5f --- /dev/null +++ b/usr/src/uts/common/vm/seg_spt.c @@ -0,0 +1,2701 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/user.h> +#include <sys/mman.h> +#include <sys/kmem.h> +#include <sys/sysmacros.h> +#include <sys/cmn_err.h> +#include <sys/systm.h> +#include <sys/tuneable.h> +#include <vm/hat.h> +#include <vm/seg.h> +#include <vm/as.h> +#include <vm/anon.h> +#include <vm/page.h> +#include <sys/buf.h> +#include <sys/swap.h> +#include <sys/atomic.h> +#include <vm/seg_spt.h> +#include <sys/debug.h> +#include <sys/vtrace.h> +#include <sys/shm.h> +#include <sys/lgrp.h> +#include <sys/vmsystm.h> + +#include <sys/tnf_probe.h> + +#define SEGSPTADDR (caddr_t)0x0 + +/* + * # pages used for spt + */ +static size_t spt_used; + +/* + * segspt_minfree is the memory left for system after ISM + * locked its pages; it is set up to 5% of availrmem in + * sptcreate when ISM is created. ISM should not use more + * than ~90% of availrmem; if it does, then the performance + * of the system may decrease. Machines with large memories may + * be able to use up more memory for ISM so we set the default + * segspt_minfree to 5% (which gives ISM max 95% of availrmem. + * If somebody wants even more memory for ISM (risking hanging + * the system) they can patch the segspt_minfree to smaller number. + */ +pgcnt_t segspt_minfree = 0; + +static int segspt_create(struct seg *seg, caddr_t argsp); +static int segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize); +static void segspt_free(struct seg *seg); +static void segspt_free_pages(struct seg *seg, caddr_t addr, size_t len); +static lgrp_mem_policy_info_t *segspt_getpolicy(struct seg *seg, caddr_t addr); + +static void +segspt_badop() +{ + panic("segspt_badop called"); + /*NOTREACHED*/ +} + +#define SEGSPT_BADOP(t) (t(*)())segspt_badop + +struct seg_ops segspt_ops = { + SEGSPT_BADOP(int), /* dup */ + segspt_unmap, + segspt_free, + SEGSPT_BADOP(int), /* fault */ + SEGSPT_BADOP(faultcode_t), /* faulta */ + SEGSPT_BADOP(int), /* setprot */ + SEGSPT_BADOP(int), /* checkprot */ + SEGSPT_BADOP(int), /* kluster */ + SEGSPT_BADOP(size_t), /* swapout */ + SEGSPT_BADOP(int), /* sync */ + SEGSPT_BADOP(size_t), /* incore */ + SEGSPT_BADOP(int), /* lockop */ + SEGSPT_BADOP(int), /* getprot */ + SEGSPT_BADOP(u_offset_t), /* getoffset */ + SEGSPT_BADOP(int), /* gettype */ + SEGSPT_BADOP(int), /* getvp */ + SEGSPT_BADOP(int), /* advise */ + SEGSPT_BADOP(void), /* dump */ + SEGSPT_BADOP(int), /* pagelock */ + SEGSPT_BADOP(int), /* setpgsz */ + SEGSPT_BADOP(int), /* getmemid */ + segspt_getpolicy, /* getpolicy */ +}; + +static int segspt_shmdup(struct seg *seg, struct seg *newseg); +static int segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize); +static void segspt_shmfree(struct seg *seg); +static faultcode_t segspt_shmfault(struct hat *hat, struct seg *seg, + caddr_t addr, size_t len, enum fault_type type, enum seg_rw rw); +static faultcode_t segspt_shmfaulta(struct seg *seg, caddr_t addr); +static int segspt_shmsetprot(register struct seg *seg, register caddr_t addr, + register size_t len, register uint_t prot); +static int segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size, + uint_t prot); +static int segspt_shmkluster(struct seg *seg, caddr_t addr, 
ssize_t delta); +static size_t segspt_shmswapout(struct seg *seg); +static size_t segspt_shmincore(struct seg *seg, caddr_t addr, size_t len, + register char *vec); +static int segspt_shmsync(struct seg *seg, register caddr_t addr, size_t len, + int attr, uint_t flags); +static int segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len, + int attr, int op, ulong_t *lockmap, size_t pos); +static int segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len, + uint_t *protv); +static u_offset_t segspt_shmgetoffset(struct seg *seg, caddr_t addr); +static int segspt_shmgettype(struct seg *seg, caddr_t addr); +static int segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp); +static int segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len, + uint_t behav); +static void segspt_shmdump(struct seg *seg); +static int segspt_shmpagelock(struct seg *, caddr_t, size_t, + struct page ***, enum lock_type, enum seg_rw); +static int segspt_shmsetpgsz(struct seg *, caddr_t, size_t, uint_t); +static int segspt_shmgetmemid(struct seg *, caddr_t, memid_t *); +static lgrp_mem_policy_info_t *segspt_shmgetpolicy(struct seg *, caddr_t); + +struct seg_ops segspt_shmops = { + segspt_shmdup, + segspt_shmunmap, + segspt_shmfree, + segspt_shmfault, + segspt_shmfaulta, + segspt_shmsetprot, + segspt_shmcheckprot, + segspt_shmkluster, + segspt_shmswapout, + segspt_shmsync, + segspt_shmincore, + segspt_shmlockop, + segspt_shmgetprot, + segspt_shmgetoffset, + segspt_shmgettype, + segspt_shmgetvp, + segspt_shmadvise, /* advise */ + segspt_shmdump, + segspt_shmpagelock, + segspt_shmsetpgsz, + segspt_shmgetmemid, + segspt_shmgetpolicy, +}; + +static void segspt_purge(struct seg *seg); +static int segspt_reclaim(struct seg *, caddr_t, size_t, struct page **, + enum seg_rw); +static int spt_anon_getpages(struct seg *seg, caddr_t addr, size_t len, + page_t **ppa); + + + +/*ARGSUSED*/ +int +sptcreate(size_t size, struct seg **sptseg, struct anon_map *amp, + uint_t prot, uint_t flags, uint_t share_szc) +{ + int err; + struct as *newas; + struct segspt_crargs sptcargs; + +#ifdef DEBUG + TNF_PROBE_1(sptcreate, "spt", /* CSTYLED */, + tnf_ulong, size, size ); +#endif + if (segspt_minfree == 0) /* leave min 5% of availrmem for */ + segspt_minfree = availrmem/20; /* for the system */ + + if (!hat_supported(HAT_SHARED_PT, (void *)0)) + return (EINVAL); + + /* + * get a new as for this shared memory segment + */ + newas = as_alloc(); + sptcargs.amp = amp; + sptcargs.prot = prot; + sptcargs.flags = flags; + sptcargs.szc = share_szc; + + /* + * create a shared page table (spt) segment + */ + + if (err = as_map(newas, SEGSPTADDR, size, segspt_create, &sptcargs)) { + as_free(newas); + return (err); + } + *sptseg = sptcargs.seg_spt; + return (0); +} + +void +sptdestroy(struct as *as, struct anon_map *amp) +{ + +#ifdef DEBUG + TNF_PROBE_0(sptdestroy, "spt", /* CSTYLED */); +#endif + (void) as_unmap(as, SEGSPTADDR, amp->size); + as_free(as); +} + +/* + * called from seg_free(). + * free (i.e., unlock, unmap, return to free list) + * all the pages in the given seg. 
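+ *
+ * The usual way to get here is the teardown chain
+ *
+ *	sptdestroy() -> as_unmap() -> segspt_unmap() -> seg_free()
+ *
+ * which mirrors the sptcreate()/as_map()/segspt_create() path that
+ * built the segment.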
+ */ +void +segspt_free(struct seg *seg) +{ + struct spt_data *sptd = (struct spt_data *)seg->s_data; + + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); + + if (sptd != NULL) { + if (sptd->spt_realsize) + segspt_free_pages(seg, seg->s_base, sptd->spt_realsize); + + if (sptd->spt_ppa_lckcnt) + kmem_free(sptd->spt_ppa_lckcnt, + sizeof (*sptd->spt_ppa_lckcnt) + * btopr(sptd->spt_amp->size)); + kmem_free(sptd->spt_vp, sizeof (*sptd->spt_vp)); + mutex_destroy(&sptd->spt_lock); + kmem_free(sptd, sizeof (*sptd)); + } +} + +/*ARGSUSED*/ +static int +segspt_shmsync(struct seg *seg, caddr_t addr, size_t len, int attr, + uint_t flags) +{ + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + return (0); +} + +/*ARGSUSED*/ +static size_t +segspt_shmincore(struct seg *seg, caddr_t addr, size_t len, char *vec) +{ + caddr_t eo_seg; + pgcnt_t npages; + struct shm_data *shmd = (struct shm_data *)seg->s_data; + struct seg *sptseg; + struct spt_data *sptd; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); +#ifdef lint + seg = seg; +#endif + sptseg = shmd->shm_sptseg; + sptd = sptseg->s_data; + + if ((sptd->spt_flags & SHM_PAGEABLE) == 0) { + eo_seg = addr + len; + while (addr < eo_seg) { + /* page exists, and it's locked. */ + *vec++ = SEG_PAGE_INCORE | SEG_PAGE_LOCKED | + SEG_PAGE_ANON; + addr += PAGESIZE; + } + return (len); + } else { + struct anon_map *amp = shmd->shm_amp; + struct anon *ap; + page_t *pp; + pgcnt_t anon_index; + struct vnode *vp; + u_offset_t off; + ulong_t i; + int ret; + anon_sync_obj_t cookie; + + addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); + anon_index = seg_page(seg, addr); + npages = btopr(len); + if (anon_index + npages > btopr(shmd->shm_amp->size)) { + return (EINVAL); + } + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + for (i = 0; i < npages; i++, anon_index++) { + ret = 0; + anon_array_enter(amp, anon_index, &cookie); + ap = anon_get_ptr(amp->ahp, anon_index); + if (ap != NULL) { + swap_xlate(ap, &vp, &off); + anon_array_exit(&cookie); + pp = page_lookup_nowait(vp, off, SE_SHARED); + if (pp != NULL) { + ret |= SEG_PAGE_INCORE | SEG_PAGE_ANON; + page_unlock(pp); + } + } else { + anon_array_exit(&cookie); + } + if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) { + ret |= SEG_PAGE_LOCKED; + } + *vec++ = (char)ret; + } + ANON_LOCK_EXIT(&->a_rwlock); + return (len); + } +} + +static int +segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize) +{ + size_t share_size; + + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); + + /* + * seg.s_size may have been rounded up to the largest page size + * in shmat(). + * XXX This should be cleanedup. sptdestroy should take a length + * argument which should be the same as sptcreate. Then + * this rounding would not be needed (or is done in shm.c) + * Only the check for full segment will be needed. + * + * XXX -- shouldn't raddr == 0 always? These tests don't seem + * to be useful at all. 
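+ *
+ * Illustrative arithmetic only: with a 4M shared page size,
+ * page_get_pagesize(seg->s_szc) below returns 4M, so a raw ssize of
+ * 6M is rounded up by P2ROUNDUP() to 8M, and the unmap succeeds only
+ * if that rounded length equals seg->s_size exactly (and raddr equals
+ * seg->s_base).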
+ */ + share_size = page_get_pagesize(seg->s_szc); + ssize = P2ROUNDUP(ssize, share_size); + + if (raddr == seg->s_base && ssize == seg->s_size) { + seg_free(seg); + return (0); + } else + return (EINVAL); +} + +int +segspt_create(struct seg *seg, caddr_t argsp) +{ + int err; + caddr_t addr = seg->s_base; + struct spt_data *sptd; + struct segspt_crargs *sptcargs = (struct segspt_crargs *)argsp; + struct anon_map *amp = sptcargs->amp; + struct cred *cred = CRED(); + ulong_t i, j, anon_index = 0; + pgcnt_t npages = btopr(amp->size); + struct vnode *vp; + page_t **ppa; + uint_t hat_flags; + + /* + * We are holding the a_lock on the underlying dummy as, + * so we can make calls to the HAT layer. + */ + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); + +#ifdef DEBUG + TNF_PROBE_2(segspt_create, "spt", /* CSTYLED */, + tnf_opaque, addr, addr, + tnf_ulong, len, seg->s_size); +#endif + if ((sptcargs->flags & SHM_PAGEABLE) == 0) { + if (err = anon_swap_adjust(npages)) + return (err); + } + err = ENOMEM; + + if ((sptd = kmem_zalloc(sizeof (*sptd), KM_NOSLEEP)) == NULL) + goto out1; + + if ((sptcargs->flags & SHM_PAGEABLE) == 0) { + if ((ppa = kmem_zalloc(((sizeof (page_t *)) * npages), + KM_NOSLEEP)) == NULL) + goto out2; + } + + mutex_init(&sptd->spt_lock, NULL, MUTEX_DEFAULT, NULL); + + if ((vp = kmem_zalloc(sizeof (*vp), KM_NOSLEEP)) == NULL) + goto out3; + + seg->s_ops = &segspt_ops; + sptd->spt_vp = vp; + sptd->spt_amp = amp; + sptd->spt_prot = sptcargs->prot; + sptd->spt_flags = sptcargs->flags; + seg->s_data = (caddr_t)sptd; + sptd->spt_ppa = NULL; + sptd->spt_ppa_lckcnt = NULL; + seg->s_szc = sptcargs->szc; + + ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); + amp->a_szc = seg->s_szc; + ANON_LOCK_EXIT(&->a_rwlock); + + /* + * Set policy to affect initial allocation of pages in + * anon_map_createpages() + */ + (void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, amp, anon_index, + NULL, 0, ptob(npages)); + + if (sptcargs->flags & SHM_PAGEABLE) { + size_t share_sz; + pgcnt_t new_npgs, more_pgs; + struct anon_hdr *nahp; + + share_sz = page_get_pagesize(seg->s_szc); + if (!IS_P2ALIGNED(amp->size, share_sz)) { + /* + * We are rounding up the size of the anon array + * on 4 M boundary because we always create 4 M + * of page(s) when locking, faulting pages and we + * don't have to check for all corner cases e.g. + * if there is enough space to allocate 4 M + * page. 
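+ *
+ * Illustrative arithmetic only: with a 4M share_sz, an amp->size
+ * of 6M gives new_npgs = btop(P2ROUNDUP(6M, 4M)) = btop(8M), and
+ * swap for the extra 2M worth of anon slots is reserved through
+ * anon_resv() just below.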
+ */ + new_npgs = btop(P2ROUNDUP(amp->size, share_sz)); + more_pgs = new_npgs - npages; + + if (anon_resv(ptob(more_pgs)) == 0) { + err = ENOMEM; + goto out4; + } + nahp = anon_create(new_npgs, ANON_SLEEP); + ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); + (void) anon_copy_ptr(amp->ahp, 0, nahp, 0, npages, + ANON_SLEEP); + anon_release(amp->ahp, npages); + amp->ahp = nahp; + amp->swresv = amp->size = ptob(new_npgs); + ANON_LOCK_EXIT(&->a_rwlock); + npages = new_npgs; + } + + sptd->spt_ppa_lckcnt = kmem_zalloc(npages * + sizeof (*sptd->spt_ppa_lckcnt), KM_SLEEP); + sptd->spt_pcachecnt = 0; + sptd->spt_realsize = ptob(npages); + sptcargs->seg_spt = seg; + return (0); + } + + /* + * get array of pages for each anon slot in amp + */ + if ((err = anon_map_createpages(amp, anon_index, ptob(npages), ppa, + seg, addr, S_CREATE, cred)) != 0) + goto out4; + + /* + * addr is initial address corresponding to the first page on ppa list + */ + for (i = 0; i < npages; i++) { + /* attempt to lock all pages */ + if (!page_pp_lock(ppa[i], 0, 1)) { + /* + * if unable to lock any page, unlock all + * of them and return error + */ + for (j = 0; j < i; j++) + page_pp_unlock(ppa[j], 0, 1); + for (i = 0; i < npages; i++) { + page_unlock(ppa[i]); + } + err = ENOMEM; + goto out4; + } + } + + /* + * Some platforms assume that ISM mappings are HAT_LOAD_LOCK + * for the entire life of the segment. For example platforms + * that do not support Dynamic Reconfiguration. + */ + hat_flags = HAT_LOAD_SHARE; + if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, NULL)) + hat_flags |= HAT_LOAD_LOCK; + + hat_memload_array(seg->s_as->a_hat, addr, ptob(npages), + ppa, sptd->spt_prot, hat_flags); + + /* + * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP, + * we will leave the pages locked SE_SHARED for the life + * of the ISM segment. This will prevent any calls to + * hat_pageunload() on this ISM segment for those platforms. + */ + if (!(hat_flags & HAT_LOAD_LOCK)) { + /* + * On platforms that support HAT_DYNAMIC_ISM_UNMAP, + * we no longer need to hold the SE_SHARED lock on the pages, + * since L_PAGELOCK and F_SOFTLOCK calls will grab the + * SE_SHARED lock on the pages as necessary. 
+ */ + for (i = 0; i < npages; i++) + page_unlock(ppa[i]); + } + sptd->spt_pcachecnt = 0; + kmem_free(ppa, ((sizeof (page_t *)) * npages)); + sptd->spt_realsize = ptob(npages); + atomic_add_long(&spt_used, npages); + sptcargs->seg_spt = seg; + return (0); + +out4: + seg->s_data = NULL; + kmem_free(vp, sizeof (*vp)); +out3: + mutex_destroy(&sptd->spt_lock); + if ((sptcargs->flags & SHM_PAGEABLE) == 0) + kmem_free(ppa, (sizeof (*ppa) * npages)); +out2: + kmem_free(sptd, sizeof (*sptd)); +out1: + if ((sptcargs->flags & SHM_PAGEABLE) == 0) + anon_swap_restore(npages); + return (err); +} + +/*ARGSUSED*/ +void +segspt_free_pages(struct seg *seg, caddr_t addr, size_t len) +{ + struct page *pp; + struct spt_data *sptd = (struct spt_data *)seg->s_data; + pgcnt_t npages; + ulong_t anon_idx; + struct anon_map *amp; + struct anon *ap; + struct vnode *vp; + u_offset_t off; + uint_t hat_flags; + int root = 0; + pgcnt_t pgs, curnpgs = 0; + page_t *rootpp; + + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); + + len = P2ROUNDUP(len, PAGESIZE); + + npages = btop(len); + + hat_flags = HAT_UNLOAD_UNLOCK; + if ((hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) || + (sptd->spt_flags & SHM_PAGEABLE)) { + hat_flags = HAT_UNLOAD; + } + + hat_unload(seg->s_as->a_hat, addr, len, hat_flags); + + amp = sptd->spt_amp; + if (sptd->spt_flags & SHM_PAGEABLE) + npages = btop(amp->size); + + ASSERT(amp); + for (anon_idx = 0; anon_idx < npages; anon_idx++) { + if ((sptd->spt_flags & SHM_PAGEABLE) == 0) { + if ((ap = anon_get_ptr(amp->ahp, anon_idx)) == NULL) { + panic("segspt_free_pages: null app"); + /*NOTREACHED*/ + } + } else { + if ((ap = anon_get_next_ptr(amp->ahp, &anon_idx)) + == NULL) + continue; + } + ASSERT(ANON_ISBUSY(anon_get_slot(amp->ahp, anon_idx)) == 0); + swap_xlate(ap, &vp, &off); + + /* + * If this platform supports HAT_DYNAMIC_ISM_UNMAP, + * the pages won't be having SE_SHARED lock at this + * point. + * + * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP, + * the pages are still held SE_SHARED locked from the + * original segspt_create() + * + * Our goal is to get SE_EXCL lock on each page, remove + * permanent lock on it and invalidate the page. + */ + if ((sptd->spt_flags & SHM_PAGEABLE) == 0) { + if (hat_flags == HAT_UNLOAD) + pp = page_lookup(vp, off, SE_EXCL); + else { + if ((pp = page_find(vp, off)) == NULL) { + panic("segspt_free_pages: " + "page not locked"); + /*NOTREACHED*/ + } + if (!page_tryupgrade(pp)) { + page_unlock(pp); + pp = page_lookup(vp, off, SE_EXCL); + } + } + if (pp == NULL) { + panic("segspt_free_pages: " + "page not in the system"); + /*NOTREACHED*/ + } + page_pp_unlock(pp, 0, 1); + } else { + if ((pp = page_lookup(vp, off, SE_EXCL)) == NULL) + continue; + page_pp_unlock(pp, 0, 0); + } + /* + * It's logical to invalidate the pages here as in most cases + * these were created by segspt. + */ + if (pp->p_szc != 0) { + /* + * For DISM swap is released in shm_rm_amp. 
+ */ + if ((sptd->spt_flags & SHM_PAGEABLE) == 0 && + ap->an_pvp != NULL) { + panic("segspt_free_pages: pvp non NULL"); + /*NOTREACHED*/ + } + if (root == 0) { + ASSERT(curnpgs == 0); + root = 1; + rootpp = pp; + pgs = curnpgs = page_get_pagecnt(pp->p_szc); + ASSERT(pgs > 1); + ASSERT(IS_P2ALIGNED(pgs, pgs)); + ASSERT(!(page_pptonum(pp) & (pgs - 1))); + curnpgs--; + } else if ((page_pptonum(pp) & (pgs - 1)) == pgs - 1) { + ASSERT(curnpgs == 1); + ASSERT(page_pptonum(pp) == + page_pptonum(rootpp) + (pgs - 1)); + page_destroy_pages(rootpp); + root = 0; + curnpgs = 0; + } else { + ASSERT(curnpgs > 1); + ASSERT(page_pptonum(pp) == + page_pptonum(rootpp) + (pgs - curnpgs)); + curnpgs--; + } + } else { + if (root != 0 || curnpgs != 0) { + panic("segspt_free_pages: bad large page"); + /*NOTREACHED*/ + } + /*LINTED: constant in conditional context */ + VN_DISPOSE(pp, B_INVAL, 0, kcred); + } + } + + if (root != 0 || curnpgs != 0) { + panic("segspt_free_pages: bad large page"); + /*NOTREACHED*/ + } + + /* + * mark that pages have been released + */ + sptd->spt_realsize = 0; + + if ((sptd->spt_flags & SHM_PAGEABLE) == 0) { + atomic_add_long(&spt_used, -npages); + anon_swap_restore(npages); + } +} + +/* + * Get memory allocation policy info for specified address in given segment + */ +static lgrp_mem_policy_info_t * +segspt_getpolicy(struct seg *seg, caddr_t addr) +{ + struct anon_map *amp; + ulong_t anon_index; + lgrp_mem_policy_info_t *policy_info; + struct spt_data *spt_data; + + ASSERT(seg != NULL); + + /* + * Get anon_map from segspt + * + * Assume that no lock needs to be held on anon_map, since + * it should be protected by its reference count which must be + * nonzero for an existing segment + * Need to grab readers lock on policy tree though + */ + spt_data = (struct spt_data *)seg->s_data; + if (spt_data == NULL) + return (NULL); + amp = spt_data->spt_amp; + ASSERT(amp->refcnt != 0); + + /* + * Get policy info + * + * Assume starting anon index of 0 + */ + anon_index = seg_page(seg, addr); + policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0); + + return (policy_info); +} + +/* + * DISM only. + * Return locked pages over a given range. + * + * We will cache all DISM locked pages and save the pplist for the + * entire segment in the ppa field of the underlying DISM segment structure. + * Later, during a call to segspt_reclaim() we will use this ppa array + * to page_unlock() all of the pages and then we will free this ppa list. + */ +/*ARGSUSED*/ +static int +segspt_dismpagelock(struct seg *seg, caddr_t addr, size_t len, + struct page ***ppp, enum lock_type type, enum seg_rw rw) +{ + struct shm_data *shmd = (struct shm_data *)seg->s_data; + struct seg *sptseg = shmd->shm_sptseg; + struct spt_data *sptd = sptseg->s_data; + pgcnt_t pg_idx, npages, tot_npages, npgs; + struct page **pplist, **pl, **ppa, *pp; + struct anon_map *amp; + spgcnt_t an_idx; + int ret = ENOTSUP; + uint_t pl_built = 0; + struct anon *ap; + struct vnode *vp; + u_offset_t off; + pgcnt_t claim_availrmem = 0; + uint_t szc; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + /* + * We want to lock/unlock the entire ISM segment. Therefore, + * we will be using the underlying sptseg and it's base address + * and length for the caching arguments. 
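+ *
+ * Consequently every pcache call below describes the whole
+ * segment, e.g.
+ *
+ *	seg_plookup(seg, seg->s_base, sptd->spt_amp->size, ...)
+ *	seg_pinsert(seg, seg->s_base, sptd->spt_amp->size, ...)
+ *
+ * and only the returned *ppp is narrowed to the requested range,
+ * by pointing it at &sptd->spt_ppa[pg_idx].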
+ */ + ASSERT(sptseg); + ASSERT(sptd); + + pg_idx = seg_page(seg, addr); + npages = btopr(len); + + /* + * check if the request is larger than number of pages covered + * by amp + */ + if (pg_idx + npages > btopr(sptd->spt_amp->size)) { + *ppp = NULL; + return (ENOTSUP); + } + + if (type == L_PAGEUNLOCK) { + ASSERT(sptd->spt_ppa != NULL); + + seg_pinactive(seg, seg->s_base, sptd->spt_amp->size, + sptd->spt_ppa, sptd->spt_prot, segspt_reclaim); + + /* + * If someone is blocked while unmapping, we purge + * segment page cache and thus reclaim pplist synchronously + * without waiting for seg_pasync_thread. This speeds up + * unmapping in cases where munmap(2) is called, while + * raw async i/o is still in progress or where a thread + * exits on data fault in a multithreaded application. + */ + if (AS_ISUNMAPWAIT(seg->s_as) && (shmd->shm_softlockcnt > 0)) { + segspt_purge(seg); + } + return (0); + } else if (type == L_PAGERECLAIM) { + ASSERT(sptd->spt_ppa != NULL); + (void) segspt_reclaim(seg, seg->s_base, sptd->spt_amp->size, + sptd->spt_ppa, sptd->spt_prot); + return (0); + } + + if (sptd->spt_flags & DISM_PPA_CHANGED) { + segspt_purge(seg); + /* + * for DISM ppa needs to be rebuild since + * number of locked pages could be changed + */ + *ppp = NULL; + return (ENOTSUP); + } + + /* + * First try to find pages in segment page cache, without + * holding the segment lock. + */ + pplist = seg_plookup(seg, seg->s_base, sptd->spt_amp->size, + sptd->spt_prot); + if (pplist != NULL) { + ASSERT(sptd->spt_ppa != NULL); + ASSERT(sptd->spt_ppa == pplist); + ppa = sptd->spt_ppa; + for (an_idx = pg_idx; an_idx < pg_idx + npages; ) { + if (ppa[an_idx] == NULL) { + seg_pinactive(seg, seg->s_base, + sptd->spt_amp->size, ppa, + sptd->spt_prot, segspt_reclaim); + *ppp = NULL; + return (ENOTSUP); + } + if ((szc = ppa[an_idx]->p_szc) != 0) { + npgs = page_get_pagecnt(szc); + an_idx = P2ROUNDUP(an_idx + 1, npgs); + } else { + an_idx++; + } + } + /* + * Since we cache the entire DISM segment, we want to + * set ppp to point to the first slot that corresponds + * to the requested addr, i.e. pg_idx. + */ + *ppp = &(sptd->spt_ppa[pg_idx]); + return (0); + } + + /* The L_PAGELOCK case... */ + mutex_enter(&sptd->spt_lock); + /* + * try to find pages in segment page cache with mutex + */ + pplist = seg_plookup(seg, seg->s_base, sptd->spt_amp->size, + sptd->spt_prot); + if (pplist != NULL) { + ASSERT(sptd->spt_ppa != NULL); + ASSERT(sptd->spt_ppa == pplist); + ppa = sptd->spt_ppa; + for (an_idx = pg_idx; an_idx < pg_idx + npages; ) { + if (ppa[an_idx] == NULL) { + mutex_exit(&sptd->spt_lock); + seg_pinactive(seg, seg->s_base, + sptd->spt_amp->size, ppa, + sptd->spt_prot, segspt_reclaim); + *ppp = NULL; + return (ENOTSUP); + } + if ((szc = ppa[an_idx]->p_szc) != 0) { + npgs = page_get_pagecnt(szc); + an_idx = P2ROUNDUP(an_idx + 1, npgs); + } else { + an_idx++; + } + } + /* + * Since we cache the entire DISM segment, we want to + * set ppp to point to the first slot that corresponds + * to the requested addr, i.e. pg_idx. + */ + mutex_exit(&sptd->spt_lock); + *ppp = &(sptd->spt_ppa[pg_idx]); + return (0); + } + if (seg_pinsert_check(seg, sptd->spt_amp->size, SEGP_FORCE_WIRED) == + SEGP_FAIL) { + mutex_exit(&sptd->spt_lock); + *ppp = NULL; + return (ENOTSUP); + } + + /* + * No need to worry about protections because DISM pages are always rw. + */ + pl = pplist = NULL; + amp = sptd->spt_amp; + + /* + * Do we need to build the ppa array? 
+ */ + if (sptd->spt_ppa == NULL) { + pgcnt_t lpg_cnt = 0; + + pl_built = 1; + tot_npages = btopr(sptd->spt_amp->size); + + ASSERT(sptd->spt_pcachecnt == 0); + pplist = kmem_zalloc(sizeof (page_t *) * tot_npages, KM_SLEEP); + pl = pplist; + + ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); + for (an_idx = 0; an_idx < tot_npages; ) { + ap = anon_get_ptr(amp->ahp, an_idx); + /* + * Cache only mlocked pages. For large pages + * if one (constituent) page is mlocked + * all pages for that large page + * are cached also. This is for quick + * lookups of ppa array; + */ + if ((ap != NULL) && (lpg_cnt != 0 || + (sptd->spt_ppa_lckcnt[an_idx] != 0))) { + + swap_xlate(ap, &vp, &off); + pp = page_lookup(vp, off, SE_SHARED); + ASSERT(pp != NULL); + if (lpg_cnt == 0) { + npgs = page_get_pagecnt(pp->p_szc); + if (!IS_P2ALIGNED(an_idx, npgs)) { + an_idx = P2ALIGN(an_idx, npgs); + page_unlock(pp); + continue; + } + } + if (++lpg_cnt == npgs) + lpg_cnt = 0; + + /* + * availrmem is decremented only + * for unlocked pages + */ + if (sptd->spt_ppa_lckcnt[an_idx] == 0) + claim_availrmem++; + pplist[an_idx] = pp; + } + an_idx++; + } + ANON_LOCK_EXIT(&->a_rwlock); + + mutex_enter(&freemem_lock); + if (availrmem < tune.t_minarmem + claim_availrmem) { + mutex_exit(&freemem_lock); + ret = FC_MAKE_ERR(ENOMEM); + claim_availrmem = 0; + goto insert_fail; + } else { + availrmem -= claim_availrmem; + } + mutex_exit(&freemem_lock); + + sptd->spt_ppa = pl; + } else { + /* + * We already have a valid ppa[]. + */ + pl = sptd->spt_ppa; + } + + ASSERT(pl != NULL); + + ret = seg_pinsert(seg, seg->s_base, sptd->spt_amp->size, + pl, sptd->spt_prot, SEGP_FORCE_WIRED | SEGP_ASYNC_FLUSH, + segspt_reclaim); + if (ret == SEGP_FAIL) { + /* + * seg_pinsert failed. We return + * ENOTSUP, so that the as_pagelock() code will + * then try the slower F_SOFTLOCK path. + */ + sptd->spt_ppa = NULL; + ret = ENOTSUP; + goto insert_fail; + } + + /* + * In either case, we increment softlockcnt on the 'real' segment. + */ + sptd->spt_pcachecnt++; + atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), 1); + + ppa = sptd->spt_ppa; + for (an_idx = pg_idx; an_idx < pg_idx + npages; ) { + if (ppa[an_idx] == NULL) { + mutex_exit(&sptd->spt_lock); + seg_pinactive(seg, seg->s_base, sptd->spt_amp->size, + pl, sptd->spt_prot, segspt_reclaim); + *ppp = NULL; + return (ENOTSUP); + } + if ((szc = ppa[an_idx]->p_szc) != 0) { + npgs = page_get_pagecnt(szc); + an_idx = P2ROUNDUP(an_idx + 1, npgs); + } else { + an_idx++; + } + } + /* + * We can now drop the sptd->spt_lock since the ppa[] + * exists and he have incremented pacachecnt. + */ + mutex_exit(&sptd->spt_lock); + + /* + * Since we cache the entire segment, we want to + * set ppp to point to the first slot that corresponds + * to the requested addr, i.e. pg_idx. + */ + *ppp = &(sptd->spt_ppa[pg_idx]); + return (ret); + +insert_fail: + /* + * We will only reach this code if we tried and failed. + * + * And we can drop the lock on the dummy seg, once we've failed + * to set up a new ppa[]. + */ + mutex_exit(&sptd->spt_lock); + + if (pl_built) { + mutex_enter(&freemem_lock); + availrmem += claim_availrmem; + mutex_exit(&freemem_lock); + + /* + * We created pl and we need to destroy it. 
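+ * The unwind mirrors what was built: each page entered in pl still
+ * holds the SE_SHARED lock taken by page_lookup() and is unlocked
+ * here, and the availrmem claimed for it was just given back under
+ * freemem_lock above.  The caller then sees *ppp == NULL and falls
+ * back to the slower F_SOFTLOCK path in as_pagelock().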
+ */ + pplist = pl; + for (an_idx = 0; an_idx < tot_npages; an_idx++) { + if (pplist[an_idx] != NULL) + page_unlock(pplist[an_idx]); + } + kmem_free(pl, sizeof (page_t *) * tot_npages); + } + + if (shmd->shm_softlockcnt <= 0) { + if (AS_ISUNMAPWAIT(seg->s_as)) { + mutex_enter(&seg->s_as->a_contents); + if (AS_ISUNMAPWAIT(seg->s_as)) { + AS_CLRUNMAPWAIT(seg->s_as); + cv_broadcast(&seg->s_as->a_cv); + } + mutex_exit(&seg->s_as->a_contents); + } + } + *ppp = NULL; + return (ret); +} + + + +/* + * return locked pages over a given range. + * + * We will cache the entire ISM segment and save the pplist for the + * entire segment in the ppa field of the underlying ISM segment structure. + * Later, during a call to segspt_reclaim() we will use this ppa array + * to page_unlock() all of the pages and then we will free this ppa list. + */ +/*ARGSUSED*/ +static int +segspt_shmpagelock(struct seg *seg, caddr_t addr, size_t len, + struct page ***ppp, enum lock_type type, enum seg_rw rw) +{ + struct shm_data *shmd = (struct shm_data *)seg->s_data; + struct seg *sptseg = shmd->shm_sptseg; + struct spt_data *sptd = sptseg->s_data; + pgcnt_t np, page_index, npages; + caddr_t a, spt_base; + struct page **pplist, **pl, *pp; + struct anon_map *amp; + ulong_t anon_index; + int ret = ENOTSUP; + uint_t pl_built = 0; + struct anon *ap; + struct vnode *vp; + u_offset_t off; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + /* + * We want to lock/unlock the entire ISM segment. Therefore, + * we will be using the underlying sptseg and it's base address + * and length for the caching arguments. + */ + ASSERT(sptseg); + ASSERT(sptd); + + if (sptd->spt_flags & SHM_PAGEABLE) { + return (segspt_dismpagelock(seg, addr, len, ppp, type, rw)); + } + + page_index = seg_page(seg, addr); + npages = btopr(len); + + /* + * check if the request is larger than number of pages covered + * by amp + */ + if (page_index + npages > btopr(sptd->spt_amp->size)) { + *ppp = NULL; + return (ENOTSUP); + } + + if (type == L_PAGEUNLOCK) { + + ASSERT(sptd->spt_ppa != NULL); + + seg_pinactive(seg, seg->s_base, sptd->spt_amp->size, + sptd->spt_ppa, sptd->spt_prot, segspt_reclaim); + + /* + * If someone is blocked while unmapping, we purge + * segment page cache and thus reclaim pplist synchronously + * without waiting for seg_pasync_thread. This speeds up + * unmapping in cases where munmap(2) is called, while + * raw async i/o is still in progress or where a thread + * exits on data fault in a multithreaded application. + */ + if (AS_ISUNMAPWAIT(seg->s_as) && (shmd->shm_softlockcnt > 0)) { + segspt_purge(seg); + } + return (0); + } else if (type == L_PAGERECLAIM) { + ASSERT(sptd->spt_ppa != NULL); + + (void) segspt_reclaim(seg, seg->s_base, sptd->spt_amp->size, + sptd->spt_ppa, sptd->spt_prot); + return (0); + } + + /* + * First try to find pages in segment page cache, without + * holding the segment lock. + */ + pplist = seg_plookup(seg, seg->s_base, sptd->spt_amp->size, + sptd->spt_prot); + if (pplist != NULL) { + ASSERT(sptd->spt_ppa == pplist); + ASSERT(sptd->spt_ppa[page_index]); + /* + * Since we cache the entire ISM segment, we want to + * set ppp to point to the first slot that corresponds + * to the requested addr, i.e. page_index. + */ + *ppp = &(sptd->spt_ppa[page_index]); + return (0); + } + + /* The L_PAGELOCK case... 
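+ *
+ * Under sptd->spt_lock we either find a pplist that a racing thread
+ * installed after our lock-free seg_plookup() above missed, or we
+ * build the whole-segment list ourselves and seg_pinsert() it.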
*/ + mutex_enter(&sptd->spt_lock); + + /* + * try to find pages in segment page cache + */ + pplist = seg_plookup(seg, seg->s_base, sptd->spt_amp->size, + sptd->spt_prot); + if (pplist != NULL) { + ASSERT(sptd->spt_ppa == pplist); + /* + * Since we cache the entire segment, we want to + * set ppp to point to the first slot that corresponds + * to the requested addr, i.e. page_index. + */ + mutex_exit(&sptd->spt_lock); + *ppp = &(sptd->spt_ppa[page_index]); + return (0); + } + + if (seg_pinsert_check(seg, sptd->spt_amp->size, SEGP_FORCE_WIRED) == + SEGP_FAIL) { + mutex_exit(&sptd->spt_lock); + *ppp = NULL; + return (ENOTSUP); + } + + /* + * No need to worry about protections because ISM pages + * are always rw. + */ + pl = pplist = NULL; + + /* + * Do we need to build the ppa array? + */ + if (sptd->spt_ppa == NULL) { + ASSERT(sptd->spt_ppa == pplist); + + spt_base = sptseg->s_base; + pl_built = 1; + + /* + * availrmem is decremented once during anon_swap_adjust() + * and is incremented during the anon_unresv(), which is + * called from shm_rm_amp() when the segment is destroyed. + */ + amp = sptd->spt_amp; + ASSERT(amp != NULL); + + /* pcachecnt is protected by sptd->spt_lock */ + ASSERT(sptd->spt_pcachecnt == 0); + pplist = kmem_zalloc(sizeof (page_t *) + * btopr(sptd->spt_amp->size), KM_SLEEP); + pl = pplist; + + anon_index = seg_page(sptseg, spt_base); + + ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); + for (a = spt_base; a < (spt_base + sptd->spt_amp->size); + a += PAGESIZE, anon_index++, pplist++) { + ap = anon_get_ptr(amp->ahp, anon_index); + ASSERT(ap != NULL); + swap_xlate(ap, &vp, &off); + pp = page_lookup(vp, off, SE_SHARED); + ASSERT(pp != NULL); + *pplist = pp; + } + ANON_LOCK_EXIT(&->a_rwlock); + + if (a < (spt_base + sptd->spt_amp->size)) { + ret = ENOTSUP; + goto insert_fail; + } + sptd->spt_ppa = pl; + } else { + /* + * We already have a valid ppa[]. + */ + pl = sptd->spt_ppa; + } + + ASSERT(pl != NULL); + + ret = seg_pinsert(seg, seg->s_base, sptd->spt_amp->size, + pl, sptd->spt_prot, SEGP_FORCE_WIRED, segspt_reclaim); + if (ret == SEGP_FAIL) { + /* + * seg_pinsert failed. We return + * ENOTSUP, so that the as_pagelock() code will + * then try the slower F_SOFTLOCK path. + */ + if (pl_built) { + /* + * No one else has referenced the ppa[]. + * We created it and we need to destroy it. + */ + sptd->spt_ppa = NULL; + } + ret = ENOTSUP; + goto insert_fail; + } + + /* + * In either case, we increment softlockcnt on the 'real' segment. + */ + sptd->spt_pcachecnt++; + atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), 1); + + /* + * We can now drop the sptd->spt_lock since the ppa[] + * exists and he have incremented pacachecnt. + */ + mutex_exit(&sptd->spt_lock); + + /* + * Since we cache the entire segment, we want to + * set ppp to point to the first slot that corresponds + * to the requested addr, i.e. page_index. + */ + *ppp = &(sptd->spt_ppa[page_index]); + return (ret); + +insert_fail: + /* + * We will only reach this code if we tried and failed. + * + * And we can drop the lock on the dummy seg, once we've failed + * to set up a new ppa[]. + */ + mutex_exit(&sptd->spt_lock); + + if (pl_built) { + /* + * We created pl and we need to destroy it. 
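+ * Only the pages that were actually looked up are unlocked here; the
+ * count is recovered from how far the fill loop advanced, roughly
+ * np = (a - spt_base) >> PAGESHIFT (illustrative restatement of the
+ * code below).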
+ */ + pplist = pl; + np = (((uintptr_t)(a - spt_base)) >> PAGESHIFT); + while (np) { + page_unlock(*pplist); + np--; + pplist++; + } + kmem_free(pl, sizeof (page_t *) * + btopr(sptd->spt_amp->size)); + } + if (shmd->shm_softlockcnt <= 0) { + if (AS_ISUNMAPWAIT(seg->s_as)) { + mutex_enter(&seg->s_as->a_contents); + if (AS_ISUNMAPWAIT(seg->s_as)) { + AS_CLRUNMAPWAIT(seg->s_as); + cv_broadcast(&seg->s_as->a_cv); + } + mutex_exit(&seg->s_as->a_contents); + } + } + *ppp = NULL; + return (ret); +} + +/* + * purge any cached pages in the I/O page cache + */ +static void +segspt_purge(struct seg *seg) +{ + seg_ppurge(seg); +} + +static int +segspt_reclaim(struct seg *seg, caddr_t addr, size_t len, struct page **pplist, + enum seg_rw rw) +{ + struct shm_data *shmd = (struct shm_data *)seg->s_data; + struct seg *sptseg; + struct spt_data *sptd; + pgcnt_t npages, i, free_availrmem = 0; + int done = 0; + +#ifdef lint + addr = addr; +#endif + sptseg = shmd->shm_sptseg; + sptd = sptseg->s_data; + npages = (len >> PAGESHIFT); + ASSERT(npages); + ASSERT(sptd->spt_pcachecnt != 0); + ASSERT(sptd->spt_ppa == pplist); + ASSERT(npages == btopr(sptd->spt_amp->size)); + + /* + * Acquire the lock on the dummy seg and destroy the + * ppa array IF this is the last pcachecnt. + */ + mutex_enter(&sptd->spt_lock); + if (--sptd->spt_pcachecnt == 0) { + for (i = 0; i < npages; i++) { + if (pplist[i] == NULL) { + continue; + } + if (rw == S_WRITE) { + hat_setrefmod(pplist[i]); + } else { + hat_setref(pplist[i]); + } + if ((sptd->spt_flags & SHM_PAGEABLE) && + (sptd->spt_ppa_lckcnt[i] == 0)) + free_availrmem++; + page_unlock(pplist[i]); + } + if (sptd->spt_flags & SHM_PAGEABLE) { + mutex_enter(&freemem_lock); + availrmem += free_availrmem; + mutex_exit(&freemem_lock); + } + /* + * Since we want to cach/uncache the entire ISM segment, + * we will track the pplist in a segspt specific field + * ppa, that is initialized at the time we add an entry to + * the cache. + */ + ASSERT(sptd->spt_pcachecnt == 0); + kmem_free(pplist, sizeof (page_t *) * npages); + sptd->spt_ppa = NULL; + sptd->spt_flags &= ~DISM_PPA_CHANGED; + done = 1; + } + mutex_exit(&sptd->spt_lock); + /* + * Now decrement softlockcnt. + */ + atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), -1); + + if (shmd->shm_softlockcnt <= 0) { + if (AS_ISUNMAPWAIT(seg->s_as)) { + mutex_enter(&seg->s_as->a_contents); + if (AS_ISUNMAPWAIT(seg->s_as)) { + AS_CLRUNMAPWAIT(seg->s_as); + cv_broadcast(&seg->s_as->a_cv); + } + mutex_exit(&seg->s_as->a_contents); + } + } + return (done); +} + +/* + * Do a F_SOFTUNLOCK call over the range requested. + * The range must have already been F_SOFTLOCK'ed. + * + * The calls to acquire and release the anon map lock mutex were + * removed in order to avoid a deadly embrace during a DR + * memory delete operation. (Eg. DR blocks while waiting for a + * exclusive lock on a page that is being used for kaio; the + * thread that will complete the kaio and call segspt_softunlock + * blocks on the anon map lock; another thread holding the anon + * map lock blocks on another page lock via the segspt_shmfault + * -> page_lookup -> page_lookup_create -> page_lock_es code flow.) + * + * The appropriateness of the removal is based upon the following: + * 1. If we are holding a segment's reader lock and the page is held + * shared, then the corresponding element in anonmap which points to + * anon struct cannot change and there is no need to acquire the + * anonymous map lock. + * 2. 
Threads in segspt_softunlock have a reader lock on the segment + * and already have the shared page lock, so we are guaranteed that + * the anon map slot cannot change and therefore can call anon_get_ptr() + * without grabbing the anonymous map lock. + * 3. Threads that softlock a shared page break copy-on-write, even if + * its a read. Thus cow faults can be ignored with respect to soft + * unlocking, since the breaking of cow means that the anon slot(s) will + * not be shared. + */ +static void +segspt_softunlock(struct seg *seg, caddr_t sptseg_addr, + size_t len, enum seg_rw rw) +{ + struct shm_data *shmd = (struct shm_data *)seg->s_data; + struct seg *sptseg; + struct spt_data *sptd; + page_t *pp; + caddr_t adr; + struct vnode *vp; + u_offset_t offset; + ulong_t anon_index; + struct anon_map *amp; /* XXX - for locknest */ + struct anon *ap = NULL; + pgcnt_t npages; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + sptseg = shmd->shm_sptseg; + sptd = sptseg->s_data; + + /* + * Some platforms assume that ISM mappings are HAT_LOAD_LOCK + * and therefore their pages are SE_SHARED locked + * for the entire life of the segment. + */ + if ((!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) && + ((sptd->spt_flags & SHM_PAGEABLE) == 0)) { + goto softlock_decrement; + } + + /* + * Any thread is free to do a page_find and + * page_unlock() on the pages within this seg. + * + * We are already holding the as->a_lock on the user's + * real segment, but we need to hold the a_lock on the + * underlying dummy as. This is mostly to satisfy the + * underlying HAT layer. + */ + AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER); + hat_unlock(sptseg->s_as->a_hat, sptseg_addr, len); + AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock); + + amp = sptd->spt_amp; + ASSERT(amp != NULL); + anon_index = seg_page(sptseg, sptseg_addr); + + for (adr = sptseg_addr; adr < sptseg_addr + len; adr += PAGESIZE) { + ap = anon_get_ptr(amp->ahp, anon_index++); + ASSERT(ap != NULL); + swap_xlate(ap, &vp, &offset); + + /* + * Use page_find() instead of page_lookup() to + * find the page since we know that it has a + * "shared" lock. + */ + pp = page_find(vp, offset); + ASSERT(ap == anon_get_ptr(amp->ahp, anon_index - 1)); + if (pp == NULL) { + panic("segspt_softunlock: " + "addr %p, ap %p, vp %p, off %llx", + (void *)adr, (void *)ap, (void *)vp, offset); + /*NOTREACHED*/ + } + + if (rw == S_WRITE) { + hat_setrefmod(pp); + } else if (rw != S_OTHER) { + hat_setref(pp); + } + page_unlock(pp); + } + +softlock_decrement: + npages = btopr(len); + atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), -npages); + if (shmd->shm_softlockcnt == 0) { + /* + * All SOFTLOCKS are gone. Wakeup any waiting + * unmappers so they can try again to unmap. + * Check for waiters first without the mutex + * held so we don't always grab the mutex on + * softunlocks. 
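+ * Taking as->a_contents only when the flag appears set keeps the
+ * common softunlock path lock-free; the flag is tested again under
+ * the mutex before it is cleared and the waiters are broadcast.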
+ */ + if (AS_ISUNMAPWAIT(seg->s_as)) { + mutex_enter(&seg->s_as->a_contents); + if (AS_ISUNMAPWAIT(seg->s_as)) { + AS_CLRUNMAPWAIT(seg->s_as); + cv_broadcast(&seg->s_as->a_cv); + } + mutex_exit(&seg->s_as->a_contents); + } + } +} + +int +segspt_shmattach(struct seg *seg, caddr_t *argsp) +{ + struct shm_data *shmd_arg = (struct shm_data *)argsp; + struct shm_data *shmd; + struct anon_map *shm_amp = shmd_arg->shm_amp; + struct spt_data *sptd; + int error = 0; + + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); + + shmd = kmem_zalloc((sizeof (*shmd)), KM_NOSLEEP); + if (shmd == NULL) + return (ENOMEM); + + shmd->shm_sptas = shmd_arg->shm_sptas; + shmd->shm_amp = shm_amp; + shmd->shm_sptseg = shmd_arg->shm_sptseg; + + (void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, shm_amp, 0, + NULL, 0, seg->s_size); + + seg->s_data = (void *)shmd; + seg->s_ops = &segspt_shmops; + seg->s_szc = shmd->shm_sptseg->s_szc; + sptd = shmd->shm_sptseg->s_data; + + if (sptd->spt_flags & SHM_PAGEABLE) { + if ((shmd->shm_vpage = kmem_zalloc(btopr(shm_amp->size), + KM_NOSLEEP)) == NULL) { + seg->s_data = (void *)NULL; + kmem_free(shmd, (sizeof (*shmd))); + return (ENOMEM); + } + shmd->shm_lckpgs = 0; + if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) { + if ((error = hat_share(seg->s_as->a_hat, seg->s_base, + shmd_arg->shm_sptas->a_hat, SEGSPTADDR, + seg->s_size, seg->s_szc)) != 0) { + kmem_free(shmd->shm_vpage, + btopr(shm_amp->size)); + } + } + } else { + error = hat_share(seg->s_as->a_hat, seg->s_base, + shmd_arg->shm_sptas->a_hat, SEGSPTADDR, + seg->s_size, seg->s_szc); + } + if (error) { + seg->s_szc = 0; + seg->s_data = (void *)NULL; + kmem_free(shmd, (sizeof (*shmd))); + } else { + ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER); + shm_amp->refcnt++; + ANON_LOCK_EXIT(&shm_amp->a_rwlock); + } + return (error); +} + +int +segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize) +{ + struct shm_data *shmd = (struct shm_data *)seg->s_data; + int reclaim = 1; + + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); +retry: + if (shmd->shm_softlockcnt > 0) { + if (reclaim == 1) { + segspt_purge(seg); + reclaim = 0; + goto retry; + } + return (EAGAIN); + } + + if (ssize != seg->s_size) { +#ifdef DEBUG + cmn_err(CE_WARN, "Incompatible ssize %lx s_size %lx\n", + ssize, seg->s_size); +#endif + return (EINVAL); + } + + (void) segspt_shmlockop(seg, raddr, shmd->shm_amp->size, 0, MC_UNLOCK, + NULL, 0); + hat_unshare(seg->s_as->a_hat, raddr, ssize, seg->s_szc); + + seg_free(seg); + + return (0); +} + +void +segspt_shmfree(struct seg *seg) +{ + struct shm_data *shmd = (struct shm_data *)seg->s_data; + struct anon_map *shm_amp = shmd->shm_amp; + + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); + + (void) segspt_shmlockop(seg, seg->s_base, shm_amp->size, 0, + MC_UNLOCK, NULL, 0); + + /* + * Need to increment refcnt when attaching + * and decrement when detaching because of dup(). + */ + ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER); + shm_amp->refcnt--; + ANON_LOCK_EXIT(&shm_amp->a_rwlock); + + if (shmd->shm_vpage) { /* only for DISM */ + kmem_free(shmd->shm_vpage, btopr(shm_amp->size)); + shmd->shm_vpage = NULL; + } + kmem_free(shmd, sizeof (*shmd)); +} + +/*ARGSUSED*/ +int +segspt_shmsetprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) +{ + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + /* + * Shared page table is more than shared mapping. 
+ * Individual process sharing page tables can't change prot + * because there is only one set of page tables. + * This will be allowed after private page table is + * supported. + */ +/* need to return correct status error? */ + return (0); +} + + +faultcode_t +segspt_dismfault(struct hat *hat, struct seg *seg, caddr_t addr, + size_t len, enum fault_type type, enum seg_rw rw) +{ + struct shm_data *shmd = (struct shm_data *)seg->s_data; + struct seg *sptseg = shmd->shm_sptseg; + struct as *curspt = shmd->shm_sptas; + struct spt_data *sptd = sptseg->s_data; + pgcnt_t npages; + size_t share_sz, size; + caddr_t segspt_addr, shm_addr; + page_t **ppa; + int i; + ulong_t an_idx = 0; + int err = 0; + +#ifdef lint + hat = hat; +#endif + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + /* + * Because of the way spt is implemented + * the realsize of the segment does not have to be + * equal to the segment size itself. The segment size is + * often in multiples of a page size larger than PAGESIZE. + * The realsize is rounded up to the nearest PAGESIZE + * based on what the user requested. This is a bit of + * ungliness that is historical but not easily fixed + * without re-designing the higher levels of ISM. + */ + ASSERT(addr >= seg->s_base); + if (((addr + len) - seg->s_base) > sptd->spt_realsize) + return (FC_NOMAP); + /* + * For all of the following cases except F_PROT, we need to + * make any necessary adjustments to addr and len + * and get all of the necessary page_t's into an array called ppa[]. + * + * The code in shmat() forces base addr and len of ISM segment + * to be aligned to largest page size supported. Therefore, + * we are able to handle F_SOFTLOCK and F_INVAL calls in "large + * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK + * in large pagesize chunks, or else we will screw up the HAT + * layer by calling hat_memload_array() with differing page sizes + * over a given virtual range. + */ + share_sz = page_get_pagesize(sptseg->s_szc); + shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_sz); + size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), share_sz); + npages = btopr(size); + + /* + * Now we need to convert from addr in segshm to addr in segspt. + */ + an_idx = seg_page(seg, shm_addr); + segspt_addr = sptseg->s_base + ptob(an_idx); + + ASSERT((segspt_addr + ptob(npages)) <= + (sptseg->s_base + sptd->spt_realsize)); + ASSERT(segspt_addr < (sptseg->s_base + sptseg->s_size)); + + switch (type) { + + case F_SOFTLOCK: + + mutex_enter(&freemem_lock); + if (availrmem < tune.t_minarmem + npages) { + mutex_exit(&freemem_lock); + return (FC_MAKE_ERR(ENOMEM)); + } else { + availrmem -= npages; + } + mutex_exit(&freemem_lock); + atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages); + /* + * Fall through to the F_INVAL case to load up the hat layer + * entries with the HAT_LOAD_LOCK flag. + */ + /* FALLTHRU */ + case F_INVAL: + + if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC)) + return (FC_NOMAP); + + ppa = kmem_zalloc(npages * sizeof (page_t *), KM_SLEEP); + + err = spt_anon_getpages(sptseg, segspt_addr, size, ppa); + if (err != 0) { + if (type == F_SOFTLOCK) { + mutex_enter(&freemem_lock); + availrmem += npages; + mutex_exit(&freemem_lock); + atomic_add_long((ulong_t *)( + &(shmd->shm_softlockcnt)), -npages); + } + goto dism_err; + } + AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER); + if (type == F_SOFTLOCK) { + + /* + * Load up the translation keeping it + * locked and don't unlock the page. 
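+ * With HAT_LOAD_LOCK the translation stays resident and the pages
+ * stay SE_SHARED locked until the matching F_SOFTUNLOCK (handled by
+ * segspt_softunlock()); the F_INVAL path below loads the same ppa[]
+ * with HAT_LOAD_SHARE only and then drops the page locks.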
+ */ + hat_memload_array(sptseg->s_as->a_hat, segspt_addr, + size, ppa, sptd->spt_prot, + HAT_LOAD_LOCK | HAT_LOAD_SHARE); + } else { + if (hat == seg->s_as->a_hat) { + + /* + * Migrate pages marked for migration + */ + if (lgrp_optimizations()) + page_migrate(seg, shm_addr, ppa, + npages); + + /* CPU HAT */ + hat_memload_array(sptseg->s_as->a_hat, + segspt_addr, size, ppa, sptd->spt_prot, + HAT_LOAD_SHARE); + } else { + /* XHAT. Pass real address */ + hat_memload_array(hat, shm_addr, + size, ppa, sptd->spt_prot, HAT_LOAD_SHARE); + } + + /* + * And now drop the SE_SHARED lock(s). + */ + for (i = 0; i < npages; i++) + page_unlock(ppa[i]); + } + + if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) { + if (hat_share(seg->s_as->a_hat, shm_addr, + curspt->a_hat, segspt_addr, ptob(npages), + seg->s_szc) != 0) { + panic("hat_share err in DISM fault"); + /* NOTREACHED */ + } + } + AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock); +dism_err: + kmem_free(ppa, npages * sizeof (page_t *)); + return (err); + + case F_SOFTUNLOCK: + + mutex_enter(&freemem_lock); + availrmem += npages; + mutex_exit(&freemem_lock); + + /* + * This is a bit ugly, we pass in the real seg pointer, + * but the segspt_addr is the virtual address within the + * dummy seg. + */ + segspt_softunlock(seg, segspt_addr, size, rw); + return (0); + + case F_PROT: + + /* + * This takes care of the unusual case where a user + * allocates a stack in shared memory and a register + * window overflow is written to that stack page before + * it is otherwise modified. + * + * We can get away with this because ISM segments are + * always rw. Other than this unusual case, there + * should be no instances of protection violations. + */ + return (0); + + default: +#ifdef DEBUG + panic("segspt_dismfault default type?"); +#else + return (FC_NOMAP); +#endif + } +} + + +faultcode_t +segspt_shmfault(struct hat *hat, struct seg *seg, caddr_t addr, + size_t len, enum fault_type type, enum seg_rw rw) +{ + struct shm_data *shmd = (struct shm_data *)seg->s_data; + struct seg *sptseg = shmd->shm_sptseg; + struct as *curspt = shmd->shm_sptas; + struct spt_data *sptd = sptseg->s_data; + pgcnt_t npages; + size_t share_size, size; + caddr_t sptseg_addr, shm_addr; + page_t *pp, **ppa; + int i; + u_offset_t offset; + ulong_t anon_index = 0; + struct vnode *vp; + struct anon_map *amp; /* XXX - for locknest */ + struct anon *ap = NULL; + anon_sync_obj_t cookie; + +#ifdef lint + hat = hat; +#endif + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + if (sptd->spt_flags & SHM_PAGEABLE) { + return (segspt_dismfault(hat, seg, addr, len, type, rw)); + } + + /* + * Because of the way spt is implemented + * the realsize of the segment does not have to be + * equal to the segment size itself. The segment size is + * often in multiples of a page size larger than PAGESIZE. + * The realsize is rounded up to the nearest PAGESIZE + * based on what the user requested. This is a bit of + * ungliness that is historical but not easily fixed + * without re-designing the higher levels of ISM. + */ + ASSERT(addr >= seg->s_base); + if (((addr + len) - seg->s_base) > sptd->spt_realsize) + return (FC_NOMAP); + /* + * For all of the following cases except F_PROT, we need to + * make any necessary adjustments to addr and len + * and get all of the necessary page_t's into an array called ppa[]. + * + * The code in shmat() forces base addr and len of ISM segment + * to be aligned to largest page size supported. 
Therefore, + * we are able to handle F_SOFTLOCK and F_INVAL calls in "large + * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK + * in large pagesize chunks, or else we will screw up the HAT + * layer by calling hat_memload_array() with differing page sizes + * over a given virtual range. + */ + share_size = page_get_pagesize(sptseg->s_szc); + shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_size); + size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), share_size); + npages = btopr(size); + + /* + * Now we need to convert from addr in segshm to addr in segspt. + */ + anon_index = seg_page(seg, shm_addr); + sptseg_addr = sptseg->s_base + ptob(anon_index); + + /* + * And now we may have to adjust npages downward if we have + * exceeded the realsize of the segment or initial anon + * allocations. + */ + if ((sptseg_addr + ptob(npages)) > + (sptseg->s_base + sptd->spt_realsize)) + size = (sptseg->s_base + sptd->spt_realsize) - sptseg_addr; + + npages = btopr(size); + + ASSERT(sptseg_addr < (sptseg->s_base + sptseg->s_size)); + ASSERT((sptd->spt_flags & SHM_PAGEABLE) == 0); + + switch (type) { + + case F_SOFTLOCK: + + /* + * availrmem is decremented once during anon_swap_adjust() + * and is incremented during the anon_unresv(), which is + * called from shm_rm_amp() when the segment is destroyed. + */ + atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages); + /* + * Some platforms assume that ISM pages are SE_SHARED + * locked for the entire life of the segment. + */ + if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) + return (0); + /* + * Fall through to the F_INVAL case to load up the hat layer + * entries with the HAT_LOAD_LOCK flag. + */ + + /* FALLTHRU */ + case F_INVAL: + + if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC)) + return (FC_NOMAP); + + /* + * Some platforms that do NOT support DYNAMIC_ISM_UNMAP + * may still rely on this call to hat_share(). That + * would imply that those hat's can fault on a + * HAT_LOAD_LOCK translation, which would seem + * contradictory. + */ + if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) { + if (hat_share(seg->s_as->a_hat, seg->s_base, + curspt->a_hat, sptseg->s_base, + sptseg->s_size, sptseg->s_szc) != 0) { + panic("hat_share error in ISM fault"); + /*NOTREACHED*/ + } + return (0); + } + ppa = kmem_zalloc(sizeof (page_t *) * npages, KM_SLEEP); + + /* + * I see no need to lock the real seg, + * here, because all of our work will be on the underlying + * dummy seg. + * + * sptseg_addr and npages now account for large pages. + */ + amp = sptd->spt_amp; + ASSERT(amp != NULL); + anon_index = seg_page(sptseg, sptseg_addr); + + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + for (i = 0; i < npages; i++) { + anon_array_enter(amp, anon_index, &cookie); + ap = anon_get_ptr(amp->ahp, anon_index++); + ASSERT(ap != NULL); + swap_xlate(ap, &vp, &offset); + anon_array_exit(&cookie); + pp = page_lookup(vp, offset, SE_SHARED); + ASSERT(pp != NULL); + ppa[i] = pp; + } + ANON_LOCK_EXIT(&->a_rwlock); + ASSERT(i == npages); + + /* + * We are already holding the as->a_lock on the user's + * real segment, but we need to hold the a_lock on the + * underlying dummy as. This is mostly to satisfy the + * underlying HAT layer. + */ + AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER); + if (type == F_SOFTLOCK) { + /* + * Load up the translation keeping it + * locked and don't unlock the page. 
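+ * Unlike the pageable (DISM) case above, these pages were found with
+ * page_lookup() and exist for the life of the segment, so only the
+ * translation lock and the SE_SHARED page locks need to be carried
+ * until the matching F_SOFTUNLOCK; availrmem was already accounted
+ * for once at segment creation, as noted above.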
+ */ + hat_memload_array(sptseg->s_as->a_hat, sptseg_addr, + ptob(npages), ppa, sptd->spt_prot, + HAT_LOAD_LOCK | HAT_LOAD_SHARE); + } else { + if (hat == seg->s_as->a_hat) { + + /* + * Migrate pages marked for migration. + */ + if (lgrp_optimizations()) + page_migrate(seg, shm_addr, ppa, + npages); + + /* CPU HAT */ + hat_memload_array(sptseg->s_as->a_hat, + sptseg_addr, ptob(npages), ppa, + sptd->spt_prot, HAT_LOAD_SHARE); + } else { + /* XHAT. Pass real address */ + hat_memload_array(hat, shm_addr, + ptob(npages), ppa, sptd->spt_prot, + HAT_LOAD_SHARE); + } + + /* + * And now drop the SE_SHARED lock(s). + */ + for (i = 0; i < npages; i++) + page_unlock(ppa[i]); + } + AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock); + + kmem_free(ppa, sizeof (page_t *) * npages); + return (0); + case F_SOFTUNLOCK: + + /* + * This is a bit ugly, we pass in the real seg pointer, + * but the sptseg_addr is the virtual address within the + * dummy seg. + */ + segspt_softunlock(seg, sptseg_addr, ptob(npages), rw); + return (0); + + case F_PROT: + + /* + * This takes care of the unusual case where a user + * allocates a stack in shared memory and a register + * window overflow is written to that stack page before + * it is otherwise modified. + * + * We can get away with this because ISM segments are + * always rw. Other than this unusual case, there + * should be no instances of protection violations. + */ + return (0); + + default: +#ifdef DEBUG + cmn_err(CE_WARN, "segspt_shmfault default type?"); +#endif + return (FC_NOMAP); + } +} + +/*ARGSUSED*/ +static faultcode_t +segspt_shmfaulta(struct seg *seg, caddr_t addr) +{ + return (0); +} + +/*ARGSUSED*/ +static int +segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta) +{ + return (0); +} + +/*ARGSUSED*/ +static size_t +segspt_shmswapout(struct seg *seg) +{ + return (0); +} + +/* + * duplicate the shared page tables + */ +int +segspt_shmdup(struct seg *seg, struct seg *newseg) +{ + struct shm_data *shmd = (struct shm_data *)seg->s_data; + struct anon_map *amp = shmd->shm_amp; + struct shm_data *shmd_new; + struct seg *spt_seg = shmd->shm_sptseg; + struct spt_data *sptd = spt_seg->s_data; + + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); + + shmd_new = kmem_zalloc((sizeof (*shmd_new)), KM_SLEEP); + newseg->s_data = (void *)shmd_new; + shmd_new->shm_sptas = shmd->shm_sptas; + shmd_new->shm_amp = amp; + shmd_new->shm_sptseg = shmd->shm_sptseg; + newseg->s_ops = &segspt_shmops; + newseg->s_szc = seg->s_szc; + ASSERT(seg->s_szc == shmd->shm_sptseg->s_szc); + + ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); + amp->refcnt++; + ANON_LOCK_EXIT(&->a_rwlock); + + if (sptd->spt_flags & SHM_PAGEABLE) { + shmd_new->shm_vpage = kmem_zalloc(btopr(amp->size), KM_SLEEP); + shmd_new->shm_lckpgs = 0; + } + return (hat_share(newseg->s_as->a_hat, newseg->s_base, + shmd->shm_sptas->a_hat, SEGSPTADDR, seg->s_size, seg->s_szc)); +} + +/*ARGSUSED*/ +int +segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot) +{ + struct shm_data *shmd = (struct shm_data *)seg->s_data; + struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + /* + * ISM segment is always rw. + */ + return (((sptd->spt_prot & prot) != prot) ? EACCES : 0); +} + +/* + * Return an array of locked large pages, for empty slots allocate + * private zero-filled anon pages. 
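+ * The work is done one large-page chunk at a time through
+ * anon_map_getpages(); when a page of the current size cannot be
+ * obtained the loop resizes and retries rather than failing.  In
+ * outline (illustrative, not verbatim):
+ *
+ *	ierr == -1	size down (e.g. szc - 1 when segvn_anypgsz),
+ *			a large page of this size was not available
+ *	ierr == -2	size up (e.g. szc + 1), another process already
+ *			holds a larger page here
+ *	ierr > 0	hard error: unlock what is held and return
+ *			FC_MAKE_ERR(ierr)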
+ */ +static int +spt_anon_getpages( + struct seg *sptseg, + caddr_t sptaddr, + size_t len, + page_t *ppa[]) +{ + struct spt_data *sptd = sptseg->s_data; + struct anon_map *amp = sptd->spt_amp; + enum seg_rw rw = sptd->spt_prot; + uint_t szc = sptseg->s_szc; + size_t pg_sz, share_sz = page_get_pagesize(szc); + pgcnt_t lp_npgs; + caddr_t lp_addr, e_sptaddr; + uint_t vpprot, ppa_szc = 0; + struct vpage *vpage = NULL; + ulong_t j, ppa_idx; + int err, ierr = 0; + pgcnt_t an_idx; + anon_sync_obj_t cookie; + + ASSERT(IS_P2ALIGNED(sptaddr, share_sz) && IS_P2ALIGNED(len, share_sz)); + ASSERT(len != 0); + + pg_sz = share_sz; + lp_npgs = btop(pg_sz); + lp_addr = sptaddr; + e_sptaddr = sptaddr + len; + an_idx = seg_page(sptseg, sptaddr); + ppa_idx = 0; + + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + /*CONSTCOND*/ + while (1) { + for (; lp_addr < e_sptaddr; + an_idx += lp_npgs, lp_addr += pg_sz, + ppa_idx += lp_npgs) { + + anon_array_enter(amp, an_idx, &cookie); + ppa_szc = (uint_t)-1; + ierr = anon_map_getpages(amp, an_idx, szc, sptseg, + lp_addr, sptd->spt_prot, &vpprot, &ppa[ppa_idx], + &ppa_szc, vpage, rw, 0, segvn_anypgsz, kcred); + anon_array_exit(&cookie); + + if (ierr != 0) { + if (ierr > 0) { + err = FC_MAKE_ERR(ierr); + goto lpgs_err; + } + break; + } + } + if (lp_addr == e_sptaddr) { + break; + } + ASSERT(lp_addr < e_sptaddr); + + /* + * ierr == -1 means we failed to allocate a large page. + * so do a size down operation. + * + * ierr == -2 means some other process that privately shares + * pages with this process has allocated a larger page and we + * need to retry with larger pages. So do a size up + * operation. This relies on the fact that large pages are + * never partially shared i.e. if we share any constituent + * page of a large page with another process we must share the + * entire large page. Note this cannot happen for SOFTLOCK + * case, unless current address (lpaddr) is at the beginning + * of the next page size boundary because the other process + * couldn't have relocated locked pages. + */ + ASSERT(ierr == -1 || ierr == -2); + if (segvn_anypgsz) { + ASSERT(ierr == -2 || szc != 0); + ASSERT(ierr == -1 || szc < sptseg->s_szc); + szc = (ierr == -1) ? szc - 1 : szc + 1; + } else { + /* + * For faults and segvn_anypgsz == 0 + * we need to be careful not to loop forever + * if existing page is found with szc other + * than 0 or seg->s_szc. This could be due + * to page relocations on behalf of DR or + * more likely large page creation. For this + * case simply re-size to existing page's szc + * if returned by anon_map_getpages(). + */ + if (ppa_szc == (uint_t)-1) { + szc = (ierr == -1) ? 
0 : sptseg->s_szc; + } else { + ASSERT(ppa_szc <= sptseg->s_szc); + ASSERT(ierr == -2 || ppa_szc < szc); + ASSERT(ierr == -1 || ppa_szc > szc); + szc = ppa_szc; + } + } + pg_sz = page_get_pagesize(szc); + lp_npgs = btop(pg_sz); + ASSERT(IS_P2ALIGNED(lp_addr, pg_sz)); + } + ANON_LOCK_EXIT(&->a_rwlock); + return (0); + +lpgs_err: + ANON_LOCK_EXIT(&->a_rwlock); + for (j = 0; j < ppa_idx; j++) + page_unlock(ppa[j]); + return (err); +} + +int +spt_lockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages, + page_t **ppa, ulong_t *lockmap, size_t pos) +{ + struct shm_data *shmd = seg->s_data; + struct spt_data *sptd = shmd->shm_sptseg->s_data; + ulong_t i; + int kernel; + + for (i = 0; i < npages; anon_index++, pos++, i++) { + if (!(shmd->shm_vpage[anon_index] & DISM_PG_LOCKED)) { + if (sptd->spt_ppa_lckcnt[anon_index] < + (ushort_t)DISM_LOCK_MAX) { + if (++sptd->spt_ppa_lckcnt[anon_index] == + (ushort_t)DISM_LOCK_MAX) { + cmn_err(CE_WARN, + "DISM page lock limit " + "reached on DISM offset 0x%lx\n", + anon_index << PAGESHIFT); + } + kernel = (sptd->spt_ppa && + sptd->spt_ppa[anon_index]) ? 1 : 0; + if (!page_pp_lock(ppa[i], 0, kernel)) { + /* unlock rest of the pages */ + for (; i < npages; i++) + page_unlock(ppa[i]); + sptd->spt_ppa_lckcnt[anon_index]--; + return (EAGAIN); + } + shmd->shm_lckpgs++; + shmd->shm_vpage[anon_index] |= DISM_PG_LOCKED; + if (lockmap != NULL) + BT_SET(lockmap, pos); + } + } + page_unlock(ppa[i]); + } + return (0); +} + +/*ARGSUSED*/ +static int +segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len, + int attr, int op, ulong_t *lockmap, size_t pos) +{ + struct shm_data *shmd = seg->s_data; + struct seg *sptseg = shmd->shm_sptseg; + struct spt_data *sptd = sptseg->s_data; + pgcnt_t npages, a_npages; + page_t **ppa; + pgcnt_t an_idx, a_an_idx, ppa_idx; + caddr_t spt_addr, a_addr; /* spt and aligned address */ + size_t a_len; /* aligned len */ + size_t share_sz; + ulong_t i; + int sts = 0; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + if ((sptd->spt_flags & SHM_PAGEABLE) == 0) { + return (0); + } + + addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); + an_idx = seg_page(seg, addr); + npages = btopr(len); + + if (an_idx + npages > btopr(shmd->shm_amp->size)) { + return (ENOMEM); + } + + if (op == MC_LOCK) { + /* + * Need to align addr and size request if they are not + * aligned so we can always allocate large page(s) however + * we only lock what was requested in initial request. + */ + share_sz = page_get_pagesize(sptseg->s_szc); + a_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_sz); + a_len = P2ROUNDUP((uintptr_t)(((addr + len) - a_addr)), + share_sz); + a_npages = btop(a_len); + a_an_idx = seg_page(seg, a_addr); + spt_addr = sptseg->s_base + ptob(a_an_idx); + ppa_idx = an_idx - a_an_idx; + + if ((ppa = kmem_zalloc(((sizeof (page_t *)) * a_npages), + KM_NOSLEEP)) == NULL) { + return (ENOMEM); + } + + /* + * Don't cache any new pages for IO and + * flush any cached pages. 
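+ * Setting DISM_PPA_CHANGED (both before and after the lock operation,
+ * whenever spt_ppa exists) makes segspt_dismpagelock() purge the
+ * cached list so it is rebuilt to match the new spt_ppa_lckcnt[]
+ * lock counts.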
+ */ + mutex_enter(&sptd->spt_lock); + if (sptd->spt_ppa != NULL) + sptd->spt_flags |= DISM_PPA_CHANGED; + + sts = spt_anon_getpages(sptseg, spt_addr, a_len, ppa); + if (sts != 0) { + mutex_exit(&sptd->spt_lock); + kmem_free(ppa, ((sizeof (page_t *)) * a_npages)); + return (sts); + } + + sts = spt_lockpages(seg, an_idx, npages, + &ppa[ppa_idx], lockmap, pos); + /* + * unlock remaining pages for requests which are not + * aligned or not in 4 M chunks + */ + for (i = 0; i < ppa_idx; i++) + page_unlock(ppa[i]); + for (i = ppa_idx + npages; i < a_npages; i++) + page_unlock(ppa[i]); + if (sptd->spt_ppa != NULL) + sptd->spt_flags |= DISM_PPA_CHANGED; + mutex_exit(&sptd->spt_lock); + + kmem_free(ppa, ((sizeof (page_t *)) * a_npages)); + + } else if (op == MC_UNLOCK) { /* unlock */ + struct anon_map *amp; + struct anon *ap; + struct vnode *vp; + u_offset_t off; + struct page *pp; + int kernel; + anon_sync_obj_t cookie; + + amp = sptd->spt_amp; + mutex_enter(&sptd->spt_lock); + if (shmd->shm_lckpgs == 0) { + mutex_exit(&sptd->spt_lock); + return (0); + } + /* + * Don't cache new IO pages. + */ + if (sptd->spt_ppa != NULL) + sptd->spt_flags |= DISM_PPA_CHANGED; + + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + for (i = 0; i < npages; i++, an_idx++) { + if (shmd->shm_vpage[an_idx] & DISM_PG_LOCKED) { + anon_array_enter(amp, an_idx, &cookie); + ap = anon_get_ptr(amp->ahp, an_idx); + ASSERT(ap); + ASSERT(sptd->spt_ppa_lckcnt[an_idx] > 0); + + swap_xlate(ap, &vp, &off); + anon_array_exit(&cookie); + pp = page_lookup(vp, off, SE_SHARED); + ASSERT(pp); + /* + * the availrmem is decremented only for + * pages which are not in seg pcache, + * for pages in seg pcache availrmem was + * decremented in _dismpagelock() (if + * they were not locked here) + */ + kernel = (sptd->spt_ppa && + sptd->spt_ppa[an_idx]) ? 1 : 0; + page_pp_unlock(pp, 0, kernel); + page_unlock(pp); + shmd->shm_vpage[an_idx] &= ~DISM_PG_LOCKED; + sptd->spt_ppa_lckcnt[an_idx]--; + shmd->shm_lckpgs--; + } + } + ANON_LOCK_EXIT(&->a_rwlock); + if (sptd->spt_ppa != NULL) + sptd->spt_flags |= DISM_PPA_CHANGED; + mutex_exit(&sptd->spt_lock); + } + return (sts); +} + +/*ARGSUSED*/ +int +segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv) +{ + struct shm_data *shmd = (struct shm_data *)seg->s_data; + struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data; + spgcnt_t pgno = seg_page(seg, addr+len) - seg_page(seg, addr) + 1; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + /* + * ISM segment is always rw. + */ + while (--pgno >= 0) + *protv++ = sptd->spt_prot; + return (0); +} + +/*ARGSUSED*/ +u_offset_t +segspt_shmgetoffset(struct seg *seg, caddr_t addr) +{ + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + /* Offset does not matter in ISM memory */ + + return ((u_offset_t)0); +} + +/* ARGSUSED */ +int +segspt_shmgettype(struct seg *seg, caddr_t addr) +{ + struct shm_data *shmd = (struct shm_data *)seg->s_data; + struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + /* + * The shared memory mapping is always MAP_SHARED, SWAP is only + * reserved for DISM + */ + return (MAP_SHARED | + ((sptd->spt_flags & SHM_PAGEABLE) ? 
0 : MAP_NORESERVE)); +} + +/*ARGSUSED*/ +int +segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp) +{ + struct shm_data *shmd = (struct shm_data *)seg->s_data; + struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + *vpp = sptd->spt_vp; + return (0); +} + +/*ARGSUSED*/ +static int +segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) +{ + struct shm_data *shmd = (struct shm_data *)seg->s_data; + struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data; + struct anon_map *amp; + pgcnt_t pg_idx; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + if (behav == MADV_FREE) { + if ((sptd->spt_flags & SHM_PAGEABLE) == 0) + return (0); + + amp = sptd->spt_amp; + pg_idx = seg_page(seg, addr); + + mutex_enter(&sptd->spt_lock); + if (sptd->spt_ppa != NULL) + sptd->spt_flags |= DISM_PPA_CHANGED; + mutex_exit(&sptd->spt_lock); + + /* + * Purge all DISM cached pages + */ + seg_ppurge_seg(segspt_reclaim); + + mutex_enter(&sptd->spt_lock); + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + anon_disclaim(amp, pg_idx, len, ANON_PGLOOKUP_BLK); + ANON_LOCK_EXIT(&->a_rwlock); + mutex_exit(&sptd->spt_lock); + } else if (lgrp_optimizations() && (behav == MADV_ACCESS_LWP || + behav == MADV_ACCESS_MANY || behav == MADV_ACCESS_DEFAULT)) { + int already_set; + ulong_t anon_index; + lgrp_mem_policy_t policy; + caddr_t shm_addr; + size_t share_size; + size_t size; + struct seg *sptseg = shmd->shm_sptseg; + caddr_t sptseg_addr; + + /* + * Align address and length to page size of underlying segment + */ + share_size = page_get_pagesize(shmd->shm_sptseg->s_szc); + shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_size); + size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), + share_size); + + amp = shmd->shm_amp; + anon_index = seg_page(seg, shm_addr); + + /* + * And now we may have to adjust size downward if we have + * exceeded the realsize of the segment or initial anon + * allocations. + */ + sptseg_addr = sptseg->s_base + ptob(anon_index); + if ((sptseg_addr + size) > + (sptseg->s_base + sptd->spt_realsize)) + size = (sptseg->s_base + sptd->spt_realsize) - + sptseg_addr; + + /* + * Set memory allocation policy for this segment + */ + policy = lgrp_madv_to_policy(behav, len, MAP_SHARED); + already_set = lgrp_shm_policy_set(policy, amp, anon_index, + NULL, 0, len); + + /* + * If random memory allocation policy set already, + * don't bother reapplying it. 
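+ * ("already_set" is what lgrp_shm_policy_set() reported above;
+ * LGRP_MEM_POLICY_REAPPLICABLE() singles out the policies that are
+ * still worth pushing down onto existing pages via the migration
+ * marking below.)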
+ */ + if (already_set && !LGRP_MEM_POLICY_REAPPLICABLE(policy)) + return (0); + + /* + * Mark any existing pages in the given range for + * migration, flushing the I/O page cache, and using + * underlying segment to calculate anon index and get + * anonmap and vnode pointer from + */ + if (shmd->shm_softlockcnt > 0) + segspt_purge(seg); + + page_mark_migrate(seg, shm_addr, size, amp, 0, NULL, 0, 0); + } + + return (0); +} + +/*ARGSUSED*/ +void +segspt_shmdump(struct seg *seg) +{ + /* no-op for ISM segment */ +} + +/*ARGSUSED*/ +static faultcode_t +segspt_shmsetpgsz(struct seg *seg, caddr_t addr, size_t len, uint_t szc) +{ + return (ENOTSUP); +} + +/* + * get a memory ID for an addr in a given segment + */ +static int +segspt_shmgetmemid(struct seg *seg, caddr_t addr, memid_t *memidp) +{ + struct shm_data *shmd = (struct shm_data *)seg->s_data; + struct anon *ap; + size_t anon_index; + struct anon_map *amp = shmd->shm_amp; + struct spt_data *sptd = shmd->shm_sptseg->s_data; + struct seg *sptseg = shmd->shm_sptseg; + anon_sync_obj_t cookie; + + anon_index = seg_page(seg, addr); + + if (addr > (seg->s_base + sptd->spt_realsize)) { + return (EFAULT); + } + + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + anon_array_enter(amp, anon_index, &cookie); + ap = anon_get_ptr(amp->ahp, anon_index); + if (ap == NULL) { + struct page *pp; + caddr_t spt_addr = sptseg->s_base + ptob(anon_index); + + pp = anon_zero(sptseg, spt_addr, &ap, kcred); + if (pp == NULL) { + anon_array_exit(&cookie); + ANON_LOCK_EXIT(&->a_rwlock); + return (ENOMEM); + } + (void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP); + page_unlock(pp); + } + anon_array_exit(&cookie); + ANON_LOCK_EXIT(&->a_rwlock); + memidp->val[0] = (uintptr_t)ap; + memidp->val[1] = (uintptr_t)addr & PAGEOFFSET; + return (0); +} + +/* + * Get memory allocation policy info for specified address in given segment + */ +static lgrp_mem_policy_info_t * +segspt_shmgetpolicy(struct seg *seg, caddr_t addr) +{ + struct anon_map *amp; + ulong_t anon_index; + lgrp_mem_policy_info_t *policy_info; + struct shm_data *shm_data; + + ASSERT(seg != NULL); + + /* + * Get anon_map from segshm + * + * Assume that no lock needs to be held on anon_map, since + * it should be protected by its reference count which must be + * nonzero for an existing segment + * Need to grab readers lock on policy tree though + */ + shm_data = (struct shm_data *)seg->s_data; + if (shm_data == NULL) + return (NULL); + amp = shm_data->shm_amp; + ASSERT(amp->refcnt != 0); + + /* + * Get policy info + * + * Assume starting anon index of 0 + */ + anon_index = seg_page(seg, addr); + policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0); + + return (policy_info); +} diff --git a/usr/src/uts/common/vm/seg_spt.h b/usr/src/uts/common/vm/seg_spt.h new file mode 100644 index 0000000000..fb97c77fcf --- /dev/null +++ b/usr/src/uts/common/vm/seg_spt.h @@ -0,0 +1,155 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _VM_SEG_SPT_H +#define _VM_SEG_SPT_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef _ASM + +#include <sys/types.h> +#include <sys/t_lock.h> +#include <sys/lgrp.h> + +/* + * Passed data when creating spt segment. + */ +struct segspt_crargs { + struct seg *seg_spt; + struct anon_map *amp; + uint_t prot; + uint_t flags; + uint_t szc; +}; + +typedef struct spt_data { + struct vnode *spt_vp; + struct anon_map *spt_amp; + size_t spt_realsize; + struct page **spt_ppa; + ushort_t *spt_ppa_lckcnt; + uint_t spt_prot; + kmutex_t spt_lock; + size_t spt_pcachecnt; /* # of times in pcache */ + uint_t spt_flags; /* Dynamic ISM or regular ISM */ + /* + * Initial memory allocation policy + * used during pre-allocation done in shmat() + */ + lgrp_mem_policy_info_t spt_policy_info; +} spt_data_t; + +/* + * Private data for spt_shm segment. + */ +typedef struct shm_data { + struct as *shm_sptas; + struct anon_map *shm_amp; + size_t shm_softlockcnt; /* # outstanding lock operations */ + struct seg *shm_sptseg; /* pointer to spt segment */ + char *shm_vpage; /* indicating locked pages */ + spgcnt_t shm_lckpgs; /* # of locked pages per attached seg */ + /* + * Memory allocation policy after shmat() + */ + lgrp_mem_policy_info_t shm_policy_info; +} shm_data_t; + +#define DISM_PG_LOCKED 0x1 /* DISM page is locked */ +#define DISM_PPA_CHANGED 0x2 /* DISM new lock, need to rebuild ppa */ + +#define DISM_LOCK_MAX 0xfffe /* max number of locks per DISM page */ +#endif + +#ifdef _KERNEL + +#ifndef _ASM + +/* + * Functions used in shm.c to call ISM. + */ +int sptcreate(size_t size, struct seg **sptseg, struct anon_map *amp, + uint_t prot, uint_t flags, uint_t szc); +void sptdestroy(struct as *, struct anon_map *); +int segspt_shmattach(struct seg *, caddr_t *); + +#define isspt(sp) ((sp)->shm_sptinfo ? (sp)->shm_sptinfo->sptas : NULL) +#define spt_locked(a) ((a) & SHM_SHARE_MMU) +#define spt_pageable(a) ((a) & SHM_PAGEABLE) +#define spt_invalid(a) (spt_locked((a)) && spt_pageable((a))) + +/* + * This can be applied to a segment with seg->s_ops == &segspt_shmops + * to determine the real size of the ISM segment. + */ +#define spt_realsize(seg) (((struct spt_data *)(((struct shm_data *)\ + ((seg)->s_data))->shm_sptseg->s_data))->spt_realsize) + +/* + * This can be applied to a segment with seg->s_ops == &segspt_ops + * to determine the flags of the {D}ISM segment. + */ +#define spt_flags(seg) (((struct spt_data *)((seg)->s_data))->spt_flags) + +/* + * For large page support + */ +extern int segvn_anypgsz; + +#endif + +/* + * In a 64-bit address space, we'll try to put ISM segments between + * PREDISM_BASE and PREDISM_BOUND. The HAT may use these constants to + * predict that a VA is contained by an ISM segment, which may optimize + * translation. The range must _only_ be treated as advisory; ISM segments + * may fall outside of the range, and non-ISM segments may be contained + * within the range. + * In order to avoid collision between ISM/DISM addresses with e.g. + * process heap addresses we will try to put ISM/DISM segments above + * PREDISM_1T_BASESHIFT (1T). 
+ * The HAT is still expecting that any VA larger than PREDISM_BASESHIFT + * may belong to ISM/DISM (so on tlb miss it will probe first for 4M + * translation) + */ +#define PREDISM_BASESHIFT 33 +#define PREDISM_1T_BASESHIFT 40 +#define PREDISM_BASE ((uintptr_t)1 << PREDISM_BASESHIFT) +#define PREDISM_1T_BASE ((uintptr_t)1 << PREDISM_1T_BASESHIFT) +#define PREDISM_BOUND ((uintptr_t)1 << 63) + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_SEG_SPT_H */ diff --git a/usr/src/uts/common/vm/seg_vn.c b/usr/src/uts/common/vm/seg_vn.c new file mode 100644 index 0000000000..86e57227f8 --- /dev/null +++ b/usr/src/uts/common/vm/seg_vn.c @@ -0,0 +1,7745 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * VM - shared or copy-on-write from a vnode/anonymous memory. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/t_lock.h> +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/mman.h> +#include <sys/debug.h> +#include <sys/cred.h> +#include <sys/vmsystm.h> +#include <sys/tuneable.h> +#include <sys/bitmap.h> +#include <sys/swap.h> +#include <sys/kmem.h> +#include <sys/sysmacros.h> +#include <sys/vtrace.h> +#include <sys/cmn_err.h> +#include <sys/vm.h> +#include <sys/dumphdr.h> +#include <sys/lgrp.h> + +#include <vm/hat.h> +#include <vm/as.h> +#include <vm/seg.h> +#include <vm/seg_vn.h> +#include <vm/pvn.h> +#include <vm/anon.h> +#include <vm/page.h> +#include <vm/vpage.h> + +/* + * Private seg op routines. 
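+ *
+ * They are wired into the segvn_ops vector below and are normally
+ * reached via the generic SEGOP_*() wrappers in <vm/seg.h>, e.g.
+ * (illustrative):
+ *
+ *	SEGOP_FAULT(hat, seg, addr, len, type, rw)
+ *		-> (*seg->s_ops->fault)(...) -> segvn_fault(...)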
+ */ +static int segvn_dup(struct seg *seg, struct seg *newseg); +static int segvn_unmap(struct seg *seg, caddr_t addr, size_t len); +static void segvn_free(struct seg *seg); +static faultcode_t segvn_fault(struct hat *hat, struct seg *seg, + caddr_t addr, size_t len, enum fault_type type, + enum seg_rw rw); +static faultcode_t segvn_faulta(struct seg *seg, caddr_t addr); +static int segvn_setprot(struct seg *seg, caddr_t addr, + size_t len, uint_t prot); +static int segvn_checkprot(struct seg *seg, caddr_t addr, + size_t len, uint_t prot); +static int segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta); +static size_t segvn_swapout(struct seg *seg); +static int segvn_sync(struct seg *seg, caddr_t addr, size_t len, + int attr, uint_t flags); +static size_t segvn_incore(struct seg *seg, caddr_t addr, size_t len, + char *vec); +static int segvn_lockop(struct seg *seg, caddr_t addr, size_t len, + int attr, int op, ulong_t *lockmap, size_t pos); +static int segvn_getprot(struct seg *seg, caddr_t addr, size_t len, + uint_t *protv); +static u_offset_t segvn_getoffset(struct seg *seg, caddr_t addr); +static int segvn_gettype(struct seg *seg, caddr_t addr); +static int segvn_getvp(struct seg *seg, caddr_t addr, + struct vnode **vpp); +static int segvn_advise(struct seg *seg, caddr_t addr, size_t len, + uint_t behav); +static void segvn_dump(struct seg *seg); +static int segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, + struct page ***ppp, enum lock_type type, enum seg_rw rw); +static int segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, + uint_t szc); +static int segvn_getmemid(struct seg *seg, caddr_t addr, + memid_t *memidp); +static lgrp_mem_policy_info_t *segvn_getpolicy(struct seg *, caddr_t); + +struct seg_ops segvn_ops = { + segvn_dup, + segvn_unmap, + segvn_free, + segvn_fault, + segvn_faulta, + segvn_setprot, + segvn_checkprot, + segvn_kluster, + segvn_swapout, + segvn_sync, + segvn_incore, + segvn_lockop, + segvn_getprot, + segvn_getoffset, + segvn_gettype, + segvn_getvp, + segvn_advise, + segvn_dump, + segvn_pagelock, + segvn_setpagesize, + segvn_getmemid, + segvn_getpolicy, +}; + +/* + * Common zfod structures, provided as a shorthand for others to use. 
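+ *
+ * A caller that wants anonymous zero-fill-on-demand memory simply
+ * hands one of the *_argsp pointers below to as_map(), e.g.
+ * (illustrative):
+ *
+ *	(void) as_map(as, addr, len, segvn_create, zfod_argsp);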
+ */ +static segvn_crargs_t zfod_segvn_crargs = + SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL); +static segvn_crargs_t kzfod_segvn_crargs = + SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_USER, + PROT_ALL & ~PROT_USER); +static segvn_crargs_t stack_noexec_crargs = + SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_EXEC, PROT_ALL); + +caddr_t zfod_argsp = (caddr_t)&zfod_segvn_crargs; /* user zfod argsp */ +caddr_t kzfod_argsp = (caddr_t)&kzfod_segvn_crargs; /* kernel zfod argsp */ +caddr_t stack_exec_argsp = (caddr_t)&zfod_segvn_crargs; /* executable stack */ +caddr_t stack_noexec_argsp = (caddr_t)&stack_noexec_crargs; /* noexec stack */ + +#define vpgtob(n) ((n) * sizeof (struct vpage)) /* For brevity */ + +size_t segvn_comb_thrshld = UINT_MAX; /* patchable -- see 1196681 */ + +static int segvn_concat(struct seg *, struct seg *, int); +static int segvn_extend_prev(struct seg *, struct seg *, + struct segvn_crargs *, size_t); +static int segvn_extend_next(struct seg *, struct seg *, + struct segvn_crargs *, size_t); +static void segvn_softunlock(struct seg *, caddr_t, size_t, enum seg_rw); +static void segvn_pagelist_rele(page_t **); +static void segvn_setvnode_mpss(vnode_t *); +static void segvn_relocate_pages(page_t **, page_t *); +static int segvn_full_szcpages(page_t **, uint_t, int *, uint_t *); +static int segvn_fill_vp_pages(struct segvn_data *, vnode_t *, u_offset_t, + uint_t, page_t **, page_t **, uint_t *, int *); +static faultcode_t segvn_fault_vnodepages(struct hat *, struct seg *, caddr_t, + caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int); +static faultcode_t segvn_fault_anonpages(struct hat *, struct seg *, caddr_t, + caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int); +static faultcode_t segvn_faultpage(struct hat *, struct seg *, caddr_t, + u_offset_t, struct vpage *, page_t **, uint_t, + enum fault_type, enum seg_rw, int); +static void segvn_vpage(struct seg *); + +static void segvn_purge(struct seg *seg); +static int segvn_reclaim(struct seg *, caddr_t, size_t, struct page **, + enum seg_rw); + +static int sameprot(struct seg *, caddr_t, size_t); + +static int segvn_demote_range(struct seg *, caddr_t, size_t, int); +static int segvn_clrszc(struct seg *); +static struct seg *segvn_split_seg(struct seg *, caddr_t); +static int segvn_claim_pages(struct seg *, struct vpage *, u_offset_t, + ulong_t, uint_t); + +static struct kmem_cache *segvn_cache; + +#ifdef VM_STATS +static struct segvnvmstats_str { + ulong_t fill_vp_pages[31]; + ulong_t fltvnpages[49]; + ulong_t fullszcpages[10]; + ulong_t relocatepages[3]; + ulong_t fltanpages[17]; + ulong_t pagelock[3]; + ulong_t demoterange[3]; +} segvnvmstats; +#endif /* VM_STATS */ + +#define SDR_RANGE 1 /* demote entire range */ +#define SDR_END 2 /* demote non aligned ends only */ + +#define CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr) { \ + if ((len) != 0) { \ + lpgaddr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz); \ + ASSERT(lpgaddr >= (seg)->s_base); \ + lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)((addr) + \ + (len)), pgsz); \ + ASSERT(lpgeaddr > lpgaddr); \ + ASSERT(lpgeaddr <= (seg)->s_base + (seg)->s_size); \ + } else { \ + lpgeaddr = lpgaddr = (addr); \ + } \ + } + +/*ARGSUSED*/ +static int +segvn_cache_constructor(void *buf, void *cdrarg, int kmflags) +{ + struct segvn_data *svd = buf; + + rw_init(&svd->lock, NULL, RW_DEFAULT, NULL); + mutex_init(&svd->segp_slock, NULL, MUTEX_DEFAULT, NULL); + return (0); +} + +/*ARGSUSED1*/ +static void +segvn_cache_destructor(void *buf, void *cdrarg) +{ + struct segvn_data *svd = buf; + + 
rw_destroy(&svd->lock); + mutex_destroy(&svd->segp_slock); +} + +/* + * Patching this variable to non-zero allows the system to run with + * stacks marked as "not executable". It's a bit of a kludge, but is + * provided as a tweakable for platforms that export those ABIs + * (e.g. sparc V8) that have executable stacks enabled by default. + * There are also some restrictions for platforms that don't actually + * implement 'noexec' protections. + * + * Once enabled, the system is (therefore) unable to provide a fully + * ABI-compliant execution environment, though practically speaking, + * most everything works. The exceptions are generally some interpreters + * and debuggers that create executable code on the stack and jump + * into it (without explicitly mprotecting the address range to include + * PROT_EXEC). + * + * One important class of applications that are disabled are those + * that have been transformed into malicious agents using one of the + * numerous "buffer overflow" attacks. See 4007890. + */ +int noexec_user_stack = 0; +int noexec_user_stack_log = 1; + +int segvn_lpg_disable = 0; +uint_t segvn_maxpgszc = 0; + +ulong_t segvn_fltvnpages_clrszc_err; +ulong_t segvn_setpgsz_align_err; +ulong_t segvn_setpgsz_getattr_err; +ulong_t segvn_setpgsz_eof_err; +ulong_t segvn_faultvnmpss_align_err1; +ulong_t segvn_faultvnmpss_align_err2; +ulong_t segvn_faultvnmpss_align_err3; +ulong_t segvn_faultvnmpss_align_err4; +ulong_t segvn_faultvnmpss_align_err5; +ulong_t segvn_vmpss_pageio_deadlk_err; + +/* + * Initialize segvn data structures + */ +void +segvn_init(void) +{ + uint_t maxszc; + uint_t szc; + size_t pgsz; + + segvn_cache = kmem_cache_create("segvn_cache", + sizeof (struct segvn_data), 0, + segvn_cache_constructor, segvn_cache_destructor, NULL, + NULL, NULL, 0); + + if (segvn_lpg_disable != 0) + return; + szc = maxszc = page_num_pagesizes() - 1; + if (szc == 0) { + segvn_lpg_disable = 1; + return; + } + if (page_get_pagesize(0) != PAGESIZE) { + panic("segvn_init: bad szc 0"); + /*NOTREACHED*/ + } + while (szc != 0) { + pgsz = page_get_pagesize(szc); + if (pgsz <= PAGESIZE || !IS_P2ALIGNED(pgsz, pgsz)) { + panic("segvn_init: bad szc %d", szc); + /*NOTREACHED*/ + } + szc--; + } + if (segvn_maxpgszc == 0 || segvn_maxpgszc > maxszc) + segvn_maxpgszc = maxszc; +} + +#define SEGVN_PAGEIO ((void *)0x1) +#define SEGVN_NOPAGEIO ((void *)0x2) + +static void +segvn_setvnode_mpss(vnode_t *vp) +{ + int err; + + ASSERT(vp->v_mpssdata == NULL || + vp->v_mpssdata == SEGVN_PAGEIO || + vp->v_mpssdata == SEGVN_NOPAGEIO); + + if (vp->v_mpssdata == NULL) { + if (vn_vmpss_usepageio(vp)) { + err = VOP_PAGEIO(vp, (page_t *)NULL, + (u_offset_t)0, 0, 0, CRED()); + } else { + err = ENOSYS; + } + /* + * set v_mpssdata just once per vnode life + * so that it never changes. + */ + mutex_enter(&vp->v_lock); + if (vp->v_mpssdata == NULL) { + if (err == EINVAL) { + vp->v_mpssdata = SEGVN_PAGEIO; + } else { + vp->v_mpssdata = SEGVN_NOPAGEIO; + } + } + mutex_exit(&vp->v_lock); + } +} + +int +segvn_create(struct seg *seg, void *argsp) +{ + struct segvn_crargs *a = (struct segvn_crargs *)argsp; + struct segvn_data *svd; + size_t swresv = 0; + struct cred *cred; + struct anon_map *amp; + int error = 0; + size_t pgsz; + lgrp_mem_policy_t mpolicy = LGRP_MEM_POLICY_DEFAULT; + + + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); + + if (a->type != MAP_PRIVATE && a->type != MAP_SHARED) { + panic("segvn_create type"); + /*NOTREACHED*/ + } + + /* + * Check arguments. 
If a shared anon structure is given then + * it is illegal to also specify a vp. + */ + if (a->amp != NULL && a->vp != NULL) { + panic("segvn_create anon_map"); + /*NOTREACHED*/ + } + + /* MAP_NORESERVE on a MAP_SHARED segment is meaningless. */ + if (a->type == MAP_SHARED) + a->flags &= ~MAP_NORESERVE; + + if (a->szc != 0) { + if (segvn_lpg_disable != 0 || a->amp != NULL || + (a->type == MAP_SHARED && a->vp == NULL) || + (a->flags & MAP_NORESERVE) || seg->s_as == &kas) { + a->szc = 0; + } else { + if (a->szc > segvn_maxpgszc) + a->szc = segvn_maxpgszc; + pgsz = page_get_pagesize(a->szc); + if (!IS_P2ALIGNED(seg->s_base, pgsz) || + !IS_P2ALIGNED(seg->s_size, pgsz)) { + a->szc = 0; + } else if (a->vp != NULL) { + extern struct vnode kvp; + if (IS_SWAPFSVP(a->vp) || a->vp == &kvp) { + /* + * paranoid check. + * hat_page_demote() is not supported + * on swapfs pages. + */ + a->szc = 0; + } else if (map_addr_vacalign_check(seg->s_base, + a->offset & PAGEMASK)) { + a->szc = 0; + } + } + } + } + + /* + * If segment may need private pages, reserve them now. + */ + if (!(a->flags & MAP_NORESERVE) && ((a->vp == NULL && a->amp == NULL) || + (a->type == MAP_PRIVATE && (a->prot & PROT_WRITE)))) { + if (anon_resv(seg->s_size) == 0) + return (EAGAIN); + swresv = seg->s_size; + TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", + seg, swresv, 1); + } + + /* + * Reserve any mapping structures that may be required. + */ + hat_map(seg->s_as->a_hat, seg->s_base, seg->s_size, HAT_MAP); + + if (a->cred) { + cred = a->cred; + crhold(cred); + } else { + crhold(cred = CRED()); + } + + /* Inform the vnode of the new mapping */ + if (a->vp) { + error = VOP_ADDMAP(a->vp, a->offset & PAGEMASK, + seg->s_as, seg->s_base, seg->s_size, a->prot, + a->maxprot, a->type, cred); + if (error) { + if (swresv != 0) { + anon_unresv(swresv); + TRACE_3(TR_FAC_VM, TR_ANON_PROC, + "anon proc:%p %lu %u", + seg, swresv, 0); + } + crfree(cred); + hat_unload(seg->s_as->a_hat, seg->s_base, + seg->s_size, HAT_UNLOAD_UNMAP); + return (error); + } + } + + /* + * If more than one segment in the address space, and + * they're adjacent virtually, try to concatenate them. + * Don't concatenate if an explicit anon_map structure + * was supplied (e.g., SystemV shared memory). + */ + if (a->amp == NULL) { + struct seg *pseg, *nseg; + struct segvn_data *psvd, *nsvd; + lgrp_mem_policy_t ppolicy, npolicy; + uint_t lgrp_mem_policy_flags = 0; + extern lgrp_mem_policy_t lgrp_mem_default_policy; + + /* + * Memory policy flags (lgrp_mem_policy_flags) is valid when + * extending stack/heap segments. + */ + if ((a->vp == NULL) && (a->type == MAP_PRIVATE) && + !(a->flags & MAP_NORESERVE) && (seg->s_as != &kas)) { + lgrp_mem_policy_flags = a->lgrp_mem_policy_flags; + } else { + /* + * Get policy when not extending it from another segment + */ + mpolicy = lgrp_mem_policy_default(seg->s_size, a->type); + } + + /* + * First, try to concatenate the previous and new segments + */ + pseg = AS_SEGPREV(seg->s_as, seg); + if (pseg != NULL && + pseg->s_base + pseg->s_size == seg->s_base && + pseg->s_ops == &segvn_ops) { + /* + * Get memory allocation policy from previous segment. + * When extension is specified (e.g. for heap) apply + * this policy to the new segment regardless of the + * outcome of segment concatenation. Extension occurs + * for non-default policy otherwise default policy is + * used and is based on extended segment size. 
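+ * That is, when extending upward (LGRP_MP_FLAG_EXTEND_UP): if the
+ * previous segment carries a non-default policy the new piece inherits
+ * it, otherwise the default policy is recomputed for the combined size
+ * of the two segments.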
+ */ + psvd = (struct segvn_data *)pseg->s_data; + ppolicy = psvd->policy_info.mem_policy; + if (lgrp_mem_policy_flags == + LGRP_MP_FLAG_EXTEND_UP) { + if (ppolicy != lgrp_mem_default_policy) { + mpolicy = ppolicy; + } else { + mpolicy = lgrp_mem_policy_default( + pseg->s_size + seg->s_size, + a->type); + } + } + + if (mpolicy == ppolicy && + (pseg->s_size + seg->s_size <= + segvn_comb_thrshld || psvd->amp == NULL) && + segvn_extend_prev(pseg, seg, a, swresv) == 0) { + /* + * success! now try to concatenate + * with following seg + */ + crfree(cred); + nseg = AS_SEGNEXT(pseg->s_as, pseg); + if (nseg != NULL && + nseg != pseg && + nseg->s_ops == &segvn_ops && + pseg->s_base + pseg->s_size == + nseg->s_base) + (void) segvn_concat(pseg, nseg, 0); + ASSERT(pseg->s_szc == 0 || + (a->szc == pseg->s_szc && + IS_P2ALIGNED(pseg->s_base, pgsz) && + IS_P2ALIGNED(pseg->s_size, pgsz))); + return (0); + } + } + + /* + * Failed, so try to concatenate with following seg + */ + nseg = AS_SEGNEXT(seg->s_as, seg); + if (nseg != NULL && + seg->s_base + seg->s_size == nseg->s_base && + nseg->s_ops == &segvn_ops) { + /* + * Get memory allocation policy from next segment. + * When extension is specified (e.g. for stack) apply + * this policy to the new segment regardless of the + * outcome of segment concatenation. Extension occurs + * for non-default policy otherwise default policy is + * used and is based on extended segment size. + */ + nsvd = (struct segvn_data *)nseg->s_data; + npolicy = nsvd->policy_info.mem_policy; + if (lgrp_mem_policy_flags == + LGRP_MP_FLAG_EXTEND_DOWN) { + if (npolicy != lgrp_mem_default_policy) { + mpolicy = npolicy; + } else { + mpolicy = lgrp_mem_policy_default( + nseg->s_size + seg->s_size, + a->type); + } + } + + if (mpolicy == npolicy && + segvn_extend_next(seg, nseg, a, swresv) == 0) { + crfree(cred); + ASSERT(nseg->s_szc == 0 || + (a->szc == nseg->s_szc && + IS_P2ALIGNED(nseg->s_base, pgsz) && + IS_P2ALIGNED(nseg->s_size, pgsz))); + return (0); + } + } + } + + if (a->vp != NULL) { + VN_HOLD(a->vp); + if (a->type == MAP_SHARED) + lgrp_shm_policy_init(NULL, a->vp); + } + svd = kmem_cache_alloc(segvn_cache, KM_SLEEP); + + seg->s_ops = &segvn_ops; + seg->s_data = (void *)svd; + seg->s_szc = a->szc; + + svd->vp = a->vp; + /* + * Anonymous mappings have no backing file so the offset is meaningless. + */ + svd->offset = a->vp ? (a->offset & PAGEMASK) : 0; + svd->prot = a->prot; + svd->maxprot = a->maxprot; + svd->pageprot = 0; + svd->type = a->type; + svd->vpage = NULL; + svd->cred = cred; + svd->advice = MADV_NORMAL; + svd->pageadvice = 0; + svd->flags = (ushort_t)a->flags; + svd->softlockcnt = 0; + if (a->szc != 0 && a->vp != NULL) { + segvn_setvnode_mpss(a->vp); + } + + amp = a->amp; + if ((svd->amp = amp) == NULL) { + svd->anon_index = 0; + if (svd->type == MAP_SHARED) { + svd->swresv = 0; + /* + * Shared mappings to a vp need no other setup. + * If we have a shared mapping to an anon_map object + * which hasn't been allocated yet, allocate the + * struct now so that it will be properly shared + * by remembering the swap reservation there. + */ + if (a->vp == NULL) { + svd->amp = anonmap_alloc(seg->s_size, swresv); + svd->amp->a_szc = seg->s_szc; + } + } else { + /* + * Private mapping (with or without a vp). + * Allocate anon_map when needed. + */ + svd->swresv = swresv; + } + } else { + pgcnt_t anon_num; + + /* + * Mapping to an existing anon_map structure without a vp. + * For now we will insure that the segment size isn't larger + * than the size - offset gives us. 
Later on we may wish to + * have the anon array dynamically allocated itself so that + * we don't always have to allocate all the anon pointer slots. + * This of course involves adding extra code to check that we + * aren't trying to use an anon pointer slot beyond the end + * of the currently allocated anon array. + */ + if ((amp->size - a->offset) < seg->s_size) { + panic("segvn_create anon_map size"); + /*NOTREACHED*/ + } + + anon_num = btopr(a->offset); + + if (a->type == MAP_SHARED) { + /* + * SHARED mapping to a given anon_map. + */ + ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); + amp->refcnt++; + ANON_LOCK_EXIT(&->a_rwlock); + svd->anon_index = anon_num; + svd->swresv = 0; + } else { + /* + * PRIVATE mapping to a given anon_map. + * Make sure that all the needed anon + * structures are created (so that we will + * share the underlying pages if nothing + * is written by this mapping) and then + * duplicate the anon array as is done + * when a privately mapped segment is dup'ed. + */ + struct anon *ap; + caddr_t addr; + caddr_t eaddr; + ulong_t anon_idx; + int hat_flag = HAT_LOAD; + + if (svd->flags & MAP_TEXT) { + hat_flag |= HAT_LOAD_TEXT; + } + + svd->amp = anonmap_alloc(seg->s_size, 0); + svd->amp->a_szc = seg->s_szc; + svd->anon_index = 0; + svd->swresv = swresv; + + /* + * Prevent 2 threads from allocating anon + * slots simultaneously. + */ + ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); + eaddr = seg->s_base + seg->s_size; + + for (anon_idx = anon_num, addr = seg->s_base; + addr < eaddr; addr += PAGESIZE, anon_idx++) { + page_t *pp; + + if ((ap = anon_get_ptr(amp->ahp, + anon_idx)) != NULL) + continue; + + /* + * Allocate the anon struct now. + * Might as well load up translation + * to the page while we're at it... + */ + pp = anon_zero(seg, addr, &ap, cred); + if (ap == NULL || pp == NULL) { + panic("segvn_create anon_zero"); + /*NOTREACHED*/ + } + + /* + * Re-acquire the anon_map lock and + * initialize the anon array entry. + */ + ASSERT(anon_get_ptr(amp->ahp, + anon_idx) == NULL); + (void) anon_set_ptr(amp->ahp, anon_idx, ap, + ANON_SLEEP); + + ASSERT(seg->s_szc == 0); + ASSERT(!IS_VMODSORT(pp->p_vnode)); + + hat_memload(seg->s_as->a_hat, addr, pp, + svd->prot & ~PROT_WRITE, hat_flag); + + page_unlock(pp); + } + ASSERT(seg->s_szc == 0); + anon_dup(amp->ahp, anon_num, svd->amp->ahp, + 0, seg->s_size); + ANON_LOCK_EXIT(&->a_rwlock); + } + } + + /* + * Set default memory allocation policy for segment + * + * Always set policy for private memory at least for initialization + * even if this is a shared memory segment + */ + (void) lgrp_privm_policy_set(mpolicy, &svd->policy_info, seg->s_size); + + if (svd->type == MAP_SHARED) + (void) lgrp_shm_policy_set(mpolicy, svd->amp, svd->anon_index, + svd->vp, svd->offset, seg->s_size); + + return (0); +} + +/* + * Concatenate two existing segments, if possible. + * Return 0 on success, -1 if two segments are not compatible + * or -2 on memory allocation failure. + * If private == 1 then try and concat segments with private pages. 
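+ * "Compatible" means the segments share the same vnode, protections,
+ * type, credentials, flags, page size and memory policy, and that the
+ * second segment has no SOFTLOCKed pages; see the incompat() checks
+ * below.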
+ */ +static int +segvn_concat(struct seg *seg1, struct seg *seg2, int private) +{ + struct segvn_data *svd1 = seg1->s_data; + struct segvn_data *svd2 = seg2->s_data; + struct anon_map *amp1 = svd1->amp; + struct anon_map *amp2 = svd2->amp; + struct vpage *vpage1 = svd1->vpage; + struct vpage *vpage2 = svd2->vpage, *nvpage = NULL; + size_t size, nvpsize; + pgcnt_t npages1, npages2; + + ASSERT(seg1->s_as && seg2->s_as && seg1->s_as == seg2->s_as); + ASSERT(AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock)); + ASSERT(seg1->s_ops == seg2->s_ops); + + /* both segments exist, try to merge them */ +#define incompat(x) (svd1->x != svd2->x) + if (incompat(vp) || incompat(maxprot) || + (!svd1->pageadvice && !svd2->pageadvice && incompat(advice)) || + (!svd1->pageprot && !svd2->pageprot && incompat(prot)) || + incompat(type) || incompat(cred) || incompat(flags) || + seg1->s_szc != seg2->s_szc || incompat(policy_info.mem_policy) || + (svd2->softlockcnt > 0)) + return (-1); +#undef incompat + + /* + * vp == NULL implies zfod, offset doesn't matter + */ + if (svd1->vp != NULL && + svd1->offset + seg1->s_size != svd2->offset) { + return (-1); + } + + /* + * Fail early if we're not supposed to concatenate + * private pages. + */ + if ((private == 0 || svd1->type != MAP_PRIVATE) && + (amp1 != NULL || amp2 != NULL)) { + return (-1); + } + + /* + * If either seg has vpages, create a new merged vpage array. + */ + if (vpage1 != NULL || vpage2 != NULL) { + struct vpage *vp; + + npages1 = seg_pages(seg1); + npages2 = seg_pages(seg2); + nvpsize = vpgtob(npages1 + npages2); + + if ((nvpage = kmem_zalloc(nvpsize, KM_NOSLEEP)) == NULL) { + return (-2); + } + if (vpage1 != NULL) { + bcopy(vpage1, nvpage, vpgtob(npages1)); + } + if (vpage2 != NULL) { + bcopy(vpage2, nvpage + npages1, vpgtob(npages2)); + } + for (vp = nvpage; vp < nvpage + npages1; vp++) { + if (svd2->pageprot && !svd1->pageprot) { + VPP_SETPROT(vp, svd1->prot); + } + if (svd2->pageadvice && !svd1->pageadvice) { + VPP_SETADVICE(vp, svd1->advice); + } + } + for (vp = nvpage + npages1; + vp < nvpage + npages1 + npages2; vp++) { + if (svd1->pageprot && !svd2->pageprot) { + VPP_SETPROT(vp, svd2->prot); + } + if (svd1->pageadvice && !svd2->pageadvice) { + VPP_SETADVICE(vp, svd2->advice); + } + } + } + + /* + * If either segment has private pages, create a new merged anon + * array. + */ + if (amp1 != NULL || amp2 != NULL) { + struct anon_hdr *nahp; + struct anon_map *namp = NULL; + size_t asize = seg1->s_size + seg2->s_size; + + if ((nahp = anon_create(btop(asize), ANON_NOSLEEP)) == NULL) { + if (nvpage != NULL) { + kmem_free(nvpage, nvpsize); + } + return (-2); + } + if (amp1 != NULL) { + /* + * XXX anon rwlock is not really needed because + * this is a private segment and we are writers. 
+ */ + ANON_LOCK_ENTER(&1->a_rwlock, RW_WRITER); + ASSERT(amp1->refcnt == 1); + if (anon_copy_ptr(amp1->ahp, svd1->anon_index, + nahp, 0, btop(seg1->s_size), ANON_NOSLEEP)) { + anon_release(nahp, btop(asize)); + ANON_LOCK_EXIT(&1->a_rwlock); + if (nvpage != NULL) { + kmem_free(nvpage, nvpsize); + } + return (-2); + } + } + if (amp2 != NULL) { + ANON_LOCK_ENTER(&2->a_rwlock, RW_WRITER); + ASSERT(amp2->refcnt == 1); + if (anon_copy_ptr(amp2->ahp, svd2->anon_index, + nahp, btop(seg1->s_size), btop(seg2->s_size), + ANON_NOSLEEP)) { + anon_release(nahp, btop(asize)); + ANON_LOCK_EXIT(&2->a_rwlock); + if (amp1 != NULL) { + ANON_LOCK_EXIT(&1->a_rwlock); + } + if (nvpage != NULL) { + kmem_free(nvpage, nvpsize); + } + return (-2); + } + } + if (amp1 != NULL) { + namp = amp1; + anon_release(amp1->ahp, btop(amp1->size)); + } + if (amp2 != NULL) { + if (namp == NULL) { + ASSERT(amp1 == NULL); + namp = amp2; + anon_release(amp2->ahp, btop(amp2->size)); + } else { + amp2->refcnt--; + ANON_LOCK_EXIT(&2->a_rwlock); + anonmap_free(amp2); + } + svd2->amp = NULL; /* needed for seg_free */ + } + namp->ahp = nahp; + namp->size = asize; + svd1->amp = namp; + svd1->anon_index = 0; + ANON_LOCK_EXIT(&namp->a_rwlock); + } + /* + * Now free the old vpage structures. + */ + if (nvpage != NULL) { + if (vpage1 != NULL) { + kmem_free(vpage1, vpgtob(npages1)); + } + if (vpage2 != NULL) { + svd2->vpage = NULL; + kmem_free(vpage2, vpgtob(npages2)); + } + if (svd2->pageprot) { + svd1->pageprot = 1; + } + if (svd2->pageadvice) { + svd1->pageadvice = 1; + } + svd1->vpage = nvpage; + } + + /* all looks ok, merge segments */ + svd1->swresv += svd2->swresv; + svd2->swresv = 0; /* so seg_free doesn't release swap space */ + size = seg2->s_size; + seg_free(seg2); + seg1->s_size += size; + return (0); +} + +/* + * Extend the previous segment (seg1) to include the + * new segment (seg2 + a), if possible. + * Return 0 on success. + */ +static int +segvn_extend_prev(seg1, seg2, a, swresv) + struct seg *seg1, *seg2; + struct segvn_crargs *a; + size_t swresv; +{ + struct segvn_data *svd1 = (struct segvn_data *)seg1->s_data; + size_t size; + struct anon_map *amp1; + struct vpage *new_vpage; + + /* + * We don't need any segment level locks for "segvn" data + * since the address space is "write" locked. + */ + ASSERT(seg1->s_as && AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock)); + + /* second segment is new, try to extend first */ + /* XXX - should also check cred */ + if (svd1->vp != a->vp || svd1->maxprot != a->maxprot || + (!svd1->pageprot && (svd1->prot != a->prot)) || + svd1->type != a->type || svd1->flags != a->flags || + seg1->s_szc != a->szc) + return (-1); + + /* vp == NULL implies zfod, offset doesn't matter */ + if (svd1->vp != NULL && + svd1->offset + seg1->s_size != (a->offset & PAGEMASK)) + return (-1); + + amp1 = svd1->amp; + if (amp1) { + pgcnt_t newpgs; + + /* + * Segment has private pages, can data structures + * be expanded? + * + * Acquire the anon_map lock to prevent it from changing, + * if it is shared. This ensures that the anon_map + * will not change while a thread which has a read/write + * lock on an address space references it. + * XXX - Don't need the anon_map lock at all if "refcnt" + * is 1. + * + * Can't grow a MAP_SHARED segment with an anonmap because + * there may be existing anon slots where we want to extend + * the segment and we wouldn't know what to do with them + * (e.g., for tmpfs right thing is to just leave them there, + * for /dev/zero they should be cleared out). 
+ */
+ if (svd1->type == MAP_SHARED)
+ return (-1);
+
+ ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
+ if (amp1->refcnt > 1) {
+ ANON_LOCK_EXIT(&amp1->a_rwlock);
+ return (-1);
+ }
+ newpgs = anon_grow(amp1->ahp, &svd1->anon_index,
+ btop(seg1->s_size), btop(seg2->s_size), ANON_NOSLEEP);
+
+ if (newpgs == 0) {
+ ANON_LOCK_EXIT(&amp1->a_rwlock);
+ return (-1);
+ }
+ amp1->size = ptob(newpgs);
+ ANON_LOCK_EXIT(&amp1->a_rwlock);
+ }
+ if (svd1->vpage != NULL) {
+ new_vpage =
+ kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)),
+ KM_NOSLEEP);
+ if (new_vpage == NULL)
+ return (-1);
+ bcopy(svd1->vpage, new_vpage, vpgtob(seg_pages(seg1)));
+ kmem_free(svd1->vpage, vpgtob(seg_pages(seg1)));
+ svd1->vpage = new_vpage;
+ if (svd1->pageprot) {
+ struct vpage *vp, *evp;
+
+ vp = new_vpage + seg_pages(seg1);
+ evp = vp + seg_pages(seg2);
+ for (; vp < evp; vp++)
+ VPP_SETPROT(vp, a->prot);
+ }
+ }
+ size = seg2->s_size;
+ seg_free(seg2);
+ seg1->s_size += size;
+ svd1->swresv += swresv;
+ return (0);
+}
+
+/*
+ * Extend the next segment (seg2) to include the
+ * new segment (seg1 + a), if possible.
+ * Return 0 on success.
+ */
+static int
+segvn_extend_next(
+ struct seg *seg1,
+ struct seg *seg2,
+ struct segvn_crargs *a,
+ size_t swresv)
+{
+ struct segvn_data *svd2 = (struct segvn_data *)seg2->s_data;
+ size_t size;
+ struct anon_map *amp2;
+ struct vpage *new_vpage;
+
+ /*
+ * We don't need any segment level locks for "segvn" data
+ * since the address space is "write" locked.
+ */
+ ASSERT(seg2->s_as && AS_WRITE_HELD(seg2->s_as, &seg2->s_as->a_lock));
+
+ /* first segment is new, try to extend second */
+ /* XXX - should also check cred */
+ if (svd2->vp != a->vp || svd2->maxprot != a->maxprot ||
+ (!svd2->pageprot && (svd2->prot != a->prot)) ||
+ svd2->type != a->type || svd2->flags != a->flags ||
+ seg2->s_szc != a->szc)
+ return (-1);
+ /* vp == NULL implies zfod, offset doesn't matter */
+ if (svd2->vp != NULL &&
+ (a->offset & PAGEMASK) + seg1->s_size != svd2->offset)
+ return (-1);
+
+ amp2 = svd2->amp;
+ if (amp2) {
+ pgcnt_t newpgs;
+
+ /*
+ * Segment has private pages, can data structures
+ * be expanded?
+ *
+ * Acquire the anon_map lock to prevent it from changing,
+ * if it is shared. This ensures that the anon_map
+ * will not change while a thread which has a read/write
+ * lock on an address space references it.
+ *
+ * XXX - Don't need the anon_map lock at all if "refcnt"
+ * is 1.
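+ *
+ * Since the segment is being extended toward lower addresses, anon_grow()
+ * is called with ANON_GROWDOWN below, which shifts anon_index so the
+ * existing slots keep their positions relative to the new, lower base.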
+ */
+ if (svd2->type == MAP_SHARED)
+ return (-1);
+
+ ANON_LOCK_ENTER(&amp2->a_rwlock, RW_WRITER);
+ if (amp2->refcnt > 1) {
+ ANON_LOCK_EXIT(&amp2->a_rwlock);
+ return (-1);
+ }
+ newpgs = anon_grow(amp2->ahp, &svd2->anon_index,
+ btop(seg2->s_size), btop(seg1->s_size),
+ ANON_NOSLEEP | ANON_GROWDOWN);
+
+ if (newpgs == 0) {
+ ANON_LOCK_EXIT(&amp2->a_rwlock);
+ return (-1);
+ }
+ amp2->size = ptob(newpgs);
+ ANON_LOCK_EXIT(&amp2->a_rwlock);
+ }
+ if (svd2->vpage != NULL) {
+ new_vpage =
+ kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)),
+ KM_NOSLEEP);
+ if (new_vpage == NULL) {
+ /* Not merging segments so adjust anon_index back */
+ if (amp2)
+ svd2->anon_index += seg_pages(seg1);
+ return (-1);
+ }
+ bcopy(svd2->vpage, new_vpage + seg_pages(seg1),
+ vpgtob(seg_pages(seg2)));
+ kmem_free(svd2->vpage, vpgtob(seg_pages(seg2)));
+ svd2->vpage = new_vpage;
+ if (svd2->pageprot) {
+ struct vpage *vp, *evp;
+
+ vp = new_vpage;
+ evp = vp + seg_pages(seg1);
+ for (; vp < evp; vp++)
+ VPP_SETPROT(vp, a->prot);
+ }
+ }
+ size = seg1->s_size;
+ seg_free(seg1);
+ seg2->s_size += size;
+ seg2->s_base -= size;
+ svd2->offset -= size;
+ svd2->swresv += swresv;
+ return (0);
+}
+
+static int
+segvn_dup(struct seg *seg, struct seg *newseg)
+{
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+ struct segvn_data *newsvd;
+ pgcnt_t npages = seg_pages(seg);
+ int error = 0;
+ uint_t prot;
+ size_t len;
+
+ ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ /*
+ * If segment has anon reserved, reserve more for the new seg.
+ * For a MAP_NORESERVE segment swresv will be a count of all the
+ * allocated anon slots; thus we reserve for the child as many slots
+ * as the parent has allocated. This semantic prevents the child or
+ * parent from dying during a copy-on-write fault caused by trying
+ * to write a shared pre-existing anon page.
+ */
+ if ((len = svd->swresv) != 0) {
+ if (anon_resv(svd->swresv) == 0)
+ return (ENOMEM);
+
+ TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
+ seg, len, 0);
+ }
+
+ newsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
+
+ newseg->s_ops = &segvn_ops;
+ newseg->s_data = (void *)newsvd;
+ newseg->s_szc = seg->s_szc;
+
+ if ((newsvd->vp = svd->vp) != NULL) {
+ VN_HOLD(svd->vp);
+ if (svd->type == MAP_SHARED)
+ lgrp_shm_policy_init(NULL, svd->vp);
+ }
+ newsvd->offset = svd->offset;
+ newsvd->prot = svd->prot;
+ newsvd->maxprot = svd->maxprot;
+ newsvd->pageprot = svd->pageprot;
+ newsvd->type = svd->type;
+ newsvd->cred = svd->cred;
+ crhold(newsvd->cred);
+ newsvd->advice = svd->advice;
+ newsvd->pageadvice = svd->pageadvice;
+ newsvd->swresv = svd->swresv;
+ newsvd->flags = svd->flags;
+ newsvd->softlockcnt = 0;
+ newsvd->policy_info = svd->policy_info;
+ if ((newsvd->amp = svd->amp) == NULL) {
+ /*
+ * Not attaching to a shared anon object.
+ */
+ newsvd->anon_index = 0;
+ } else {
+ struct anon_map *amp;
+
+ amp = svd->amp;
+ if (svd->type == MAP_SHARED) {
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
+ amp->refcnt++;
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ newsvd->anon_index = svd->anon_index;
+ } else {
+ int reclaim = 1;
+
+ /*
+ * Allocate and initialize new anon_map structure.
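+ * The copy starts out empty; the code below either breaks COW sharing
+ * page by page (when the parent has SOFTLOCKed pages) or duplicates the
+ * parent's anon pointers wholesale via anon_dup() and
+ * anon_dup_fill_holes().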
+ */ + newsvd->amp = anonmap_alloc(newseg->s_size, 0); + newsvd->amp->a_szc = newseg->s_szc; + newsvd->anon_index = 0; + + /* + * We don't have to acquire the anon_map lock + * for the new segment (since it belongs to an + * address space that is still not associated + * with any process), or the segment in the old + * address space (since all threads in it + * are stopped while duplicating the address space). + */ + + /* + * The goal of the following code is to make sure that + * softlocked pages do not end up as copy on write + * pages. This would cause problems where one + * thread writes to a page that is COW and a different + * thread in the same process has softlocked it. The + * softlock lock would move away from this process + * because the write would cause this process to get + * a copy (without the softlock). + * + * The strategy here is to just break the + * sharing on pages that could possibly be + * softlocked. + */ +retry: + if (svd->softlockcnt) { + struct anon *ap, *newap; + size_t i; + uint_t vpprot; + page_t *anon_pl[1+1], *pp; + caddr_t addr; + ulong_t anon_idx = 0; + + /* + * The softlock count might be non zero + * because some pages are still stuck in the + * cache for lazy reclaim. Flush the cache + * now. This should drop the count to zero. + * [or there is really I/O going on to these + * pages]. Note, we have the writers lock so + * nothing gets inserted during the flush. + */ + if (reclaim == 1) { + segvn_purge(seg); + reclaim = 0; + goto retry; + } + i = btopr(seg->s_size); + addr = seg->s_base; + /* + * XXX break cow sharing using PAGESIZE + * pages. They will be relocated into larger + * pages at fault time. + */ + while (i-- > 0) { + if (ap = anon_get_ptr(amp->ahp, + anon_idx)) { + error = anon_getpage(&ap, + &vpprot, anon_pl, PAGESIZE, + seg, addr, S_READ, + svd->cred); + if (error) { + newsvd->vpage = NULL; + goto out; + } + /* + * prot need not be computed + * below 'cause anon_private is + * going to ignore it anyway + * as child doesn't inherit + * pagelock from parent. + */ + prot = svd->pageprot ? + VPP_PROT( + &svd->vpage[ + seg_page(seg, addr)]) + : svd->prot; + pp = anon_private(&newap, + newseg, addr, prot, + anon_pl[0], 0, + newsvd->cred); + if (pp == NULL) { + /* no mem abort */ + newsvd->vpage = NULL; + error = ENOMEM; + goto out; + } + (void) anon_set_ptr( + newsvd->amp->ahp, anon_idx, + newap, ANON_SLEEP); + page_unlock(pp); + } + addr += PAGESIZE; + anon_idx++; + } + } else { /* common case */ + if (seg->s_szc != 0) { + /* + * If at least one of anon slots of a + * large page exists then make sure + * all anon slots of a large page + * exist to avoid partial cow sharing + * of a large page in the future. + */ + anon_dup_fill_holes(amp->ahp, + svd->anon_index, newsvd->amp->ahp, + 0, seg->s_size, seg->s_szc, + svd->vp != NULL); + } else { + anon_dup(amp->ahp, svd->anon_index, + newsvd->amp->ahp, 0, seg->s_size); + } + + hat_clrattr(seg->s_as->a_hat, seg->s_base, + seg->s_size, PROT_WRITE); + } + } + } + /* + * If necessary, create a vpage structure for the new segment. + * Do not copy any page lock indications. 
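+ * Page locks are not inherited by the child, so VPP_CLRPPLOCK() is
+ * applied to every vpage entry as it is copied.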
+ */ + if (svd->vpage != NULL) { + uint_t i; + struct vpage *ovp = svd->vpage; + struct vpage *nvp; + + nvp = newsvd->vpage = + kmem_alloc(vpgtob(npages), KM_SLEEP); + for (i = 0; i < npages; i++) { + *nvp = *ovp++; + VPP_CLRPPLOCK(nvp++); + } + } else + newsvd->vpage = NULL; + + /* Inform the vnode of the new mapping */ + if (newsvd->vp != NULL) { + error = VOP_ADDMAP(newsvd->vp, (offset_t)newsvd->offset, + newseg->s_as, newseg->s_base, newseg->s_size, newsvd->prot, + newsvd->maxprot, newsvd->type, newsvd->cred); + } +out: + return (error); +} + + +/* + * callback function used by segvn_unmap to invoke free_vp_pages() for only + * those pages actually processed by the HAT + */ +extern int free_pages; + +static void +segvn_hat_unload_callback(hat_callback_t *cb) +{ + struct seg *seg = cb->hcb_data; + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + size_t len; + u_offset_t off; + + ASSERT(svd->vp != NULL); + ASSERT(cb->hcb_end_addr > cb->hcb_start_addr); + ASSERT(cb->hcb_start_addr >= seg->s_base); + + len = cb->hcb_end_addr - cb->hcb_start_addr; + off = cb->hcb_start_addr - seg->s_base; + free_vp_pages(svd->vp, svd->offset + off, len); +} + + +static int +segvn_unmap(struct seg *seg, caddr_t addr, size_t len) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + struct segvn_data *nsvd; + struct seg *nseg; + struct anon_map *amp; + pgcnt_t opages; /* old segment size in pages */ + pgcnt_t npages; /* new segment size in pages */ + pgcnt_t dpages; /* pages being deleted (unmapped) */ + hat_callback_t callback; /* used for free_vp_pages() */ + hat_callback_t *cbp = NULL; + caddr_t nbase; + size_t nsize; + size_t oswresv; + int reclaim = 1; + + /* + * We don't need any segment level locks for "segvn" data + * since the address space is "write" locked. + */ + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); + + /* + * Fail the unmap if pages are SOFTLOCKed through this mapping. + * softlockcnt is protected from change by the as write lock. + */ +retry: + if (svd->softlockcnt > 0) { + /* + * since we do have the writers lock nobody can fill + * the cache during the purge. The flush either succeeds + * or we still have pending I/Os. + */ + if (reclaim == 1) { + segvn_purge(seg); + reclaim = 0; + goto retry; + } + return (EAGAIN); + } + + /* + * Check for bad sizes + */ + if (addr < seg->s_base || addr + len > seg->s_base + seg->s_size || + (len & PAGEOFFSET) || ((uintptr_t)addr & PAGEOFFSET)) { + panic("segvn_unmap"); + /*NOTREACHED*/ + } + + if (seg->s_szc != 0) { + size_t pgsz = page_get_pagesize(seg->s_szc); + int err; + if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { + ASSERT(seg->s_base != addr || seg->s_size != len); + VM_STAT_ADD(segvnvmstats.demoterange[0]); + err = segvn_demote_range(seg, addr, len, SDR_END); + if (err == 0) { + return (IE_RETRY); + } + return (err); + } + } + + /* Inform the vnode of the unmapping. */ + if (svd->vp) { + int error; + + error = VOP_DELMAP(svd->vp, + (offset_t)svd->offset + (uintptr_t)(addr - seg->s_base), + seg->s_as, addr, len, svd->prot, svd->maxprot, + svd->type, svd->cred); + + if (error == EAGAIN) + return (error); + } + /* + * Remove any page locks set through this mapping. + */ + (void) segvn_lockop(seg, addr, len, 0, MC_UNLOCK, NULL, 0); + + /* + * Unload any hardware translations in the range to be taken out. + * Use a callback to invoke free_vp_pages() effectively. 
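+ * The callback (segvn_hat_unload_callback() above) releases the backing
+ * vnode pages only for the ranges the HAT actually processed; it is
+ * skipped for anonymous segments and when free_pages is zero.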
+ */ + if (svd->vp != NULL && free_pages != 0) { + callback.hcb_data = seg; + callback.hcb_function = segvn_hat_unload_callback; + cbp = &callback; + } + hat_unload_callback(seg->s_as->a_hat, addr, len, HAT_UNLOAD_UNMAP, cbp); + + /* + * Check for entire segment + */ + if (addr == seg->s_base && len == seg->s_size) { + seg_free(seg); + return (0); + } + + opages = seg_pages(seg); + dpages = btop(len); + npages = opages - dpages; + amp = svd->amp; + + /* + * Check for beginning of segment + */ + if (addr == seg->s_base) { + if (svd->vpage != NULL) { + size_t nbytes; + struct vpage *ovpage; + + ovpage = svd->vpage; /* keep pointer to vpage */ + + nbytes = vpgtob(npages); + svd->vpage = kmem_alloc(nbytes, KM_SLEEP); + bcopy(&ovpage[dpages], svd->vpage, nbytes); + + /* free up old vpage */ + kmem_free(ovpage, vpgtob(opages)); + } + if (amp != NULL) { + ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); + if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { + /* + * Free up now unused parts of anon_map array. + */ + if (seg->s_szc != 0) { + anon_free_pages(amp->ahp, + svd->anon_index, len, seg->s_szc); + } else { + anon_free(amp->ahp, svd->anon_index, + len); + } + + /* + * Unreserve swap space for the unmapped chunk + * of this segment in case it's MAP_SHARED + */ + if (svd->type == MAP_SHARED) { + anon_unresv(len); + amp->swresv -= len; + } + } + ANON_LOCK_EXIT(&->a_rwlock); + svd->anon_index += dpages; + } + if (svd->vp != NULL) + svd->offset += len; + + if (svd->swresv) { + if (svd->flags & MAP_NORESERVE) { + ASSERT(amp); + oswresv = svd->swresv; + + svd->swresv = ptob(anon_pages(amp->ahp, + svd->anon_index, npages)); + anon_unresv(oswresv - svd->swresv); + } else { + anon_unresv(len); + svd->swresv -= len; + } + TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", + seg, len, 0); + } + + seg->s_base += len; + seg->s_size -= len; + return (0); + } + + /* + * Check for end of segment + */ + if (addr + len == seg->s_base + seg->s_size) { + if (svd->vpage != NULL) { + size_t nbytes; + struct vpage *ovpage; + + ovpage = svd->vpage; /* keep pointer to vpage */ + + nbytes = vpgtob(npages); + svd->vpage = kmem_alloc(nbytes, KM_SLEEP); + bcopy(ovpage, svd->vpage, nbytes); + + /* free up old vpage */ + kmem_free(ovpage, vpgtob(opages)); + + } + if (amp != NULL) { + ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); + if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { + /* + * Free up now unused parts of anon_map array + */ + if (seg->s_szc != 0) { + ulong_t an_idx = svd->anon_index + + npages; + anon_free_pages(amp->ahp, an_idx, + len, seg->s_szc); + } else { + anon_free(amp->ahp, + svd->anon_index + npages, len); + } + /* + * Unreserve swap space for the unmapped chunk + * of this segment in case it's MAP_SHARED + */ + if (svd->type == MAP_SHARED) { + anon_unresv(len); + amp->swresv -= len; + } + } + ANON_LOCK_EXIT(&->a_rwlock); + } + + if (svd->swresv) { + if (svd->flags & MAP_NORESERVE) { + ASSERT(amp); + oswresv = svd->swresv; + svd->swresv = ptob(anon_pages(amp->ahp, + svd->anon_index, npages)); + anon_unresv(oswresv - svd->swresv); + } else { + anon_unresv(len); + svd->swresv -= len; + } + TRACE_3(TR_FAC_VM, TR_ANON_PROC, + "anon proc:%p %lu %u", seg, len, 0); + } + + seg->s_size -= len; + return (0); + } + + /* + * The section to go is in the middle of the segment, + * have to make it into two segments. nseg is made for + * the high end while seg is cut down at the low end. 
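+ *
+ * After the split, seg covers [s_base, addr) and the newly allocated
+ * nseg covers [addr + len, original s_base + s_size), with the vpage
+ * array, anon map and swap reservation divided between the two below.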
+ */ + nbase = addr + len; /* new seg base */ + nsize = (seg->s_base + seg->s_size) - nbase; /* new seg size */ + seg->s_size = addr - seg->s_base; /* shrink old seg */ + nseg = seg_alloc(seg->s_as, nbase, nsize); + if (nseg == NULL) { + panic("segvn_unmap seg_alloc"); + /*NOTREACHED*/ + } + nseg->s_ops = seg->s_ops; + nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); + nseg->s_data = (void *)nsvd; + nseg->s_szc = seg->s_szc; + *nsvd = *svd; + nsvd->offset = svd->offset + (uintptr_t)(nseg->s_base - seg->s_base); + nsvd->swresv = 0; + nsvd->softlockcnt = 0; + + if (svd->vp != NULL) { + VN_HOLD(nsvd->vp); + if (nsvd->type == MAP_SHARED) + lgrp_shm_policy_init(NULL, nsvd->vp); + } + crhold(svd->cred); + + if (svd->vpage == NULL) { + nsvd->vpage = NULL; + } else { + /* need to split vpage into two arrays */ + size_t nbytes; + struct vpage *ovpage; + + ovpage = svd->vpage; /* keep pointer to vpage */ + + npages = seg_pages(seg); /* seg has shrunk */ + nbytes = vpgtob(npages); + svd->vpage = kmem_alloc(nbytes, KM_SLEEP); + + bcopy(ovpage, svd->vpage, nbytes); + + npages = seg_pages(nseg); + nbytes = vpgtob(npages); + nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP); + + bcopy(&ovpage[opages - npages], nsvd->vpage, nbytes); + + /* free up old vpage */ + kmem_free(ovpage, vpgtob(opages)); + } + + if (amp == NULL) { + nsvd->amp = NULL; + nsvd->anon_index = 0; + } else { + /* + * Need to create a new anon map for the new segment. + * We'll also allocate a new smaller array for the old + * smaller segment to save space. + */ + opages = btop((uintptr_t)(addr - seg->s_base)); + ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); + if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { + /* + * Free up now unused parts of anon_map array + */ + if (seg->s_szc != 0) { + ulong_t an_idx = svd->anon_index + opages; + anon_free_pages(amp->ahp, an_idx, len, + seg->s_szc); + } else { + anon_free(amp->ahp, svd->anon_index + opages, + len); + } + + /* + * Unreserve swap space for the unmapped chunk + * of this segment in case it's MAP_SHARED + */ + if (svd->type == MAP_SHARED) { + anon_unresv(len); + amp->swresv -= len; + } + } + + nsvd->anon_index = svd->anon_index + + btop((uintptr_t)(nseg->s_base - seg->s_base)); + if (svd->type == MAP_SHARED) { + ASSERT(seg->s_szc == 0); + amp->refcnt++; + nsvd->amp = amp; + } else { + struct anon_map *namp; + struct anon_hdr *nahp; + + ASSERT(svd->type == MAP_PRIVATE); + nahp = anon_create(btop(seg->s_size), ANON_SLEEP); + namp = anonmap_alloc(nseg->s_size, 0); + namp->a_szc = seg->s_szc; + (void) anon_copy_ptr(amp->ahp, svd->anon_index, nahp, + 0, btop(seg->s_size), ANON_SLEEP); + (void) anon_copy_ptr(amp->ahp, nsvd->anon_index, + namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP); + anon_release(amp->ahp, btop(amp->size)); + svd->anon_index = 0; + nsvd->anon_index = 0; + amp->ahp = nahp; + amp->size = seg->s_size; + nsvd->amp = namp; + } + ANON_LOCK_EXIT(&->a_rwlock); + } + if (svd->swresv) { + if (svd->flags & MAP_NORESERVE) { + ASSERT(amp); + oswresv = svd->swresv; + svd->swresv = ptob(anon_pages(amp->ahp, + svd->anon_index, btop(seg->s_size))); + nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp, + nsvd->anon_index, btop(nseg->s_size))); + ASSERT(oswresv >= (svd->swresv + nsvd->swresv)); + anon_unresv(oswresv - (svd->swresv + nsvd->swresv)); + } else { + if (seg->s_size + nseg->s_size + len != svd->swresv) { + panic("segvn_unmap: " + "cannot split swap reservation"); + /*NOTREACHED*/ + } + anon_unresv(len); + svd->swresv = seg->s_size; + nsvd->swresv = nseg->s_size; + } + TRACE_3(TR_FAC_VM, 
TR_ANON_PROC, "anon proc:%p %lu %u", + seg, len, 0); + } + + return (0); /* I'm glad that's all over with! */ +} + +static void +segvn_free(struct seg *seg) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + pgcnt_t npages = seg_pages(seg); + struct anon_map *amp; + size_t len; + + /* + * We don't need any segment level locks for "segvn" data + * since the address space is "write" locked. + */ + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); + + /* + * Be sure to unlock pages. XXX Why do things get free'ed instead + * of unmapped? XXX + */ + (void) segvn_lockop(seg, seg->s_base, seg->s_size, + 0, MC_UNLOCK, NULL, 0); + + /* + * Deallocate the vpage and anon pointers if necessary and possible. + */ + if (svd->vpage != NULL) { + kmem_free(svd->vpage, vpgtob(npages)); + svd->vpage = NULL; + } + if ((amp = svd->amp) != NULL) { + /* + * If there are no more references to this anon_map + * structure, then deallocate the structure after freeing + * up all the anon slot pointers that we can. + */ + ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); + if (--amp->refcnt == 0) { + if (svd->type == MAP_PRIVATE) { + /* + * Private - we only need to anon_free + * the part that this segment refers to. + */ + if (seg->s_szc != 0) { + anon_free_pages(amp->ahp, + svd->anon_index, seg->s_size, + seg->s_szc); + } else { + anon_free(amp->ahp, svd->anon_index, + seg->s_size); + } + } else { + /* + * Shared - anon_free the entire + * anon_map's worth of stuff and + * release any swap reservation. + */ + ASSERT(seg->s_szc == 0); + anon_free(amp->ahp, 0, amp->size); + if ((len = amp->swresv) != 0) { + anon_unresv(len); + TRACE_3(TR_FAC_VM, TR_ANON_PROC, + "anon proc:%p %lu %u", + seg, len, 0); + } + } + svd->amp = NULL; + ANON_LOCK_EXIT(&->a_rwlock); + anonmap_free(amp); + } else if (svd->type == MAP_PRIVATE) { + /* + * We had a private mapping which still has + * a held anon_map so just free up all the + * anon slot pointers that we were using. + */ + if (seg->s_szc != 0) { + anon_free_pages(amp->ahp, svd->anon_index, + seg->s_size, seg->s_szc); + } else { + anon_free(amp->ahp, svd->anon_index, + seg->s_size); + } + ANON_LOCK_EXIT(&->a_rwlock); + } else { + ANON_LOCK_EXIT(&->a_rwlock); + } + } + + /* + * Release swap reservation. + */ + if ((len = svd->swresv) != 0) { + anon_unresv(svd->swresv); + TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", + seg, len, 0); + svd->swresv = 0; + } + /* + * Release claim on vnode, credentials, and finally free the + * private data. + */ + if (svd->vp != NULL) { + if (svd->type == MAP_SHARED) + lgrp_shm_policy_fini(NULL, svd->vp); + VN_RELE(svd->vp); + svd->vp = NULL; + } + crfree(svd->cred); + svd->cred = NULL; + + seg->s_data = NULL; + kmem_cache_free(segvn_cache, svd); +} + +/* + * Do a F_SOFTUNLOCK call over the range requested. The range must have + * already been F_SOFTLOCK'ed. + * Caller must always match addr and len of a softunlock with a previous + * softlock with exactly the same addr and len. 
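+ *
+ * The accounting below undoes what segvn_faultpage() did for F_SOFTLOCK:
+ * availrmem is given back, and segvn_pages_locked and the per-segment
+ * softlockcnt are decremented.  Once softlockcnt reaches zero, any
+ * unmapper waiting on the address space (AS_ISUNMAPWAIT) is woken so it
+ * can retry.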
+ */ +static void +segvn_softunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + page_t *pp; + caddr_t adr; + struct vnode *vp; + u_offset_t offset; + ulong_t anon_index; + struct anon_map *amp; + struct anon *ap = NULL; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); + + if ((amp = svd->amp) != NULL) + anon_index = svd->anon_index + seg_page(seg, addr); + + hat_unlock(seg->s_as->a_hat, addr, len); + for (adr = addr; adr < addr + len; adr += PAGESIZE) { + if (amp != NULL) { + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + if ((ap = anon_get_ptr(amp->ahp, anon_index++)) + != NULL) { + swap_xlate(ap, &vp, &offset); + } else { + vp = svd->vp; + offset = svd->offset + + (uintptr_t)(adr - seg->s_base); + } + ANON_LOCK_EXIT(&->a_rwlock); + } else { + vp = svd->vp; + offset = svd->offset + + (uintptr_t)(adr - seg->s_base); + } + + /* + * Use page_find() instead of page_lookup() to + * find the page since we know that it is locked. + */ + pp = page_find(vp, offset); + if (pp == NULL) { + panic( + "segvn_softunlock: addr %p, ap %p, vp %p, off %llx", + (void *)adr, (void *)ap, (void *)vp, offset); + /*NOTREACHED*/ + } + + if (rw == S_WRITE) { + hat_setrefmod(pp); + if (seg->s_as->a_vbits) + hat_setstat(seg->s_as, adr, PAGESIZE, + P_REF | P_MOD); + } else if (rw != S_OTHER) { + hat_setref(pp); + if (seg->s_as->a_vbits) + hat_setstat(seg->s_as, adr, PAGESIZE, P_REF); + } + TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT, + "segvn_fault:pp %p vp %p offset %llx", pp, vp, offset); + page_unlock(pp); + } + mutex_enter(&freemem_lock); /* for availrmem */ + availrmem += btop(len); + segvn_pages_locked -= btop(len); + svd->softlockcnt -= btop(len); + mutex_exit(&freemem_lock); + if (svd->softlockcnt == 0) { + /* + * All SOFTLOCKS are gone. Wakeup any waiting + * unmappers so they can try again to unmap. + * Check for waiters first without the mutex + * held so we don't always grab the mutex on + * softunlocks. + */ + if (AS_ISUNMAPWAIT(seg->s_as)) { + mutex_enter(&seg->s_as->a_contents); + if (AS_ISUNMAPWAIT(seg->s_as)) { + AS_CLRUNMAPWAIT(seg->s_as); + cv_broadcast(&seg->s_as->a_cv); + } + mutex_exit(&seg->s_as->a_contents); + } + } +} + +#define PAGE_HANDLED ((page_t *)-1) + +/* + * Release all the pages in the NULL terminated ppp list + * which haven't already been converted to PAGE_HANDLED. + */ +static void +segvn_pagelist_rele(page_t **ppp) +{ + for (; *ppp != NULL; ppp++) { + if (*ppp != PAGE_HANDLED) + page_unlock(*ppp); + } +} + +static int stealcow = 1; + +/* + * Workaround for viking chip bug. See bug id 1220902. + * To fix this down in pagefault() would require importing so + * much as and segvn code as to be unmaintainable. + */ +int enable_mbit_wa = 0; + +/* + * Handles all the dirty work of getting the right + * anonymous pages and loading up the translations. + * This routine is called only from segvn_fault() + * when looping over the range of addresses requested. 
+ * + * The basic algorithm here is: + * If this is an anon_zero case + * Call anon_zero to allocate page + * Load up translation + * Return + * endif + * If this is an anon page + * Use anon_getpage to get the page + * else + * Find page in pl[] list passed in + * endif + * If not a cow + * Load up the translation to the page + * return + * endif + * Call anon_private to handle cow + * Load up (writable) translation to new page + */ +static faultcode_t +segvn_faultpage( + struct hat *hat, /* the hat to use for mapping */ + struct seg *seg, /* seg_vn of interest */ + caddr_t addr, /* address in as */ + u_offset_t off, /* offset in vp */ + struct vpage *vpage, /* pointer to vpage for vp, off */ + page_t *pl[], /* object source page pointer */ + uint_t vpprot, /* access allowed to object pages */ + enum fault_type type, /* type of fault */ + enum seg_rw rw, /* type of access at fault */ + int brkcow) /* we may need to break cow */ +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + page_t *pp, **ppp; + uint_t pageflags = 0; + page_t *anon_pl[1 + 1]; + page_t *opp = NULL; /* original page */ + uint_t prot; + int err; + int cow; + int claim; + int steal = 0; + ulong_t anon_index; + struct anon *ap, *oldap; + struct anon_map *amp; + int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; + int anon_lock = 0; + anon_sync_obj_t cookie; + + if (svd->flags & MAP_TEXT) { + hat_flag |= HAT_LOAD_TEXT; + } + + ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock)); + ASSERT(seg->s_szc == 0); + + /* + * Initialize protection value for this page. + * If we have per page protection values check it now. + */ + if (svd->pageprot) { + uint_t protchk; + + switch (rw) { + case S_READ: + protchk = PROT_READ; + break; + case S_WRITE: + protchk = PROT_WRITE; + break; + case S_EXEC: + protchk = PROT_EXEC; + break; + case S_OTHER: + default: + protchk = PROT_READ | PROT_WRITE | PROT_EXEC; + break; + } + + prot = VPP_PROT(vpage); + if ((prot & protchk) == 0) + return (FC_PROT); /* illegal access type */ + } else { + prot = svd->prot; + } + + if (type == F_SOFTLOCK) { + mutex_enter(&freemem_lock); + if (availrmem <= tune.t_minarmem) { + mutex_exit(&freemem_lock); + return (FC_MAKE_ERR(ENOMEM)); /* out of real memory */ + } else { + svd->softlockcnt++; + availrmem--; + segvn_pages_locked++; + } + mutex_exit(&freemem_lock); + } + + /* + * Always acquire the anon array lock to prevent 2 threads from + * allocating separate anon slots for the same "addr". + */ + + if ((amp = svd->amp) != NULL) { + ASSERT(RW_READ_HELD(&->a_rwlock)); + anon_index = svd->anon_index + seg_page(seg, addr); + anon_array_enter(amp, anon_index, &cookie); + anon_lock = 1; + } + + if (svd->vp == NULL && amp != NULL) { + if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) { + /* + * Allocate a (normally) writable anonymous page of + * zeroes. If no advance reservations, reserve now. + */ + if (svd->flags & MAP_NORESERVE) { + if (anon_resv(ptob(1))) { + svd->swresv += ptob(1); + } else { + err = ENOMEM; + goto out; + } + } + if ((pp = anon_zero(seg, addr, &ap, + svd->cred)) == NULL) { + err = ENOMEM; + goto out; /* out of swap space */ + } + /* + * Re-acquire the anon_map lock and + * initialize the anon array entry. 
+ */ + (void) anon_set_ptr(amp->ahp, anon_index, ap, + ANON_SLEEP); + if (enable_mbit_wa) { + if (rw == S_WRITE) + hat_setmod(pp); + else if (!hat_ismod(pp)) + prot &= ~PROT_WRITE; + } + /* + * If AS_PAGLCK is set in a_flags (via memcntl(2) + * with MC_LOCKAS, MCL_FUTURE) and this is a + * MAP_NORESERVE segment, we may need to + * permanently lock the page as it is being faulted + * for the first time. The following text applies + * only to MAP_NORESERVE segments: + * + * As per memcntl(2), if this segment was created + * after MCL_FUTURE was applied (a "future" + * segment), its pages must be locked. If this + * segment existed at MCL_FUTURE application (a + * "past" segment), the interface is unclear. + * + * We decide to lock only if vpage is present: + * + * - "future" segments will have a vpage array (see + * as_map), and so will be locked as required + * + * - "past" segments may not have a vpage array, + * depending on whether events (such as + * mprotect) have occurred. Locking if vpage + * exists will preserve legacy behavior. Not + * locking if vpage is absent, will not break + * the interface or legacy behavior. Note that + * allocating vpage here if it's absent requires + * upgrading the segvn reader lock, the cost of + * which does not seem worthwhile. + */ + if (AS_ISPGLCK(seg->s_as) && vpage != NULL && + (svd->flags & MAP_NORESERVE)) { + claim = VPP_PROT(vpage) & PROT_WRITE; + ASSERT(svd->type == MAP_PRIVATE); + if (page_pp_lock(pp, claim, 0)) + VPP_SETPPLOCK(vpage); + } + + + /* + * Handle pages that have been marked for migration + */ + if (lgrp_optimizations()) + page_migrate(seg, addr, &pp, 1); + hat_memload(hat, addr, pp, prot, hat_flag); + + if (!(hat_flag & HAT_LOAD_LOCK)) + page_unlock(pp); + + anon_array_exit(&cookie); + return (0); + } + } + + /* + * Obtain the page structure via anon_getpage() if it is + * a private copy of an object (the result of a previous + * copy-on-write). + */ + if (amp != NULL) { + if ((ap = anon_get_ptr(amp->ahp, anon_index)) != NULL) { + err = anon_getpage(&ap, &vpprot, anon_pl, PAGESIZE, + seg, addr, rw, svd->cred); + if (err) + goto out; + + if (svd->type == MAP_SHARED) { + /* + * If this is a shared mapping to an + * anon_map, then ignore the write + * permissions returned by anon_getpage(). + * They apply to the private mappings + * of this anon_map. + */ + vpprot |= PROT_WRITE; + } + opp = anon_pl[0]; + } + } + + /* + * Search the pl[] list passed in if it is from the + * original object (i.e., not a private copy). + */ + if (opp == NULL) { + /* + * Find original page. We must be bringing it in + * from the list in pl[]. + */ + for (ppp = pl; (opp = *ppp) != NULL; ppp++) { + if (opp == PAGE_HANDLED) + continue; + ASSERT(opp->p_vnode == svd->vp); /* XXX */ + if (opp->p_offset == off) + break; + } + if (opp == NULL) { + panic("segvn_faultpage not found"); + /*NOTREACHED*/ + } + *ppp = PAGE_HANDLED; + + } + + ASSERT(PAGE_LOCKED(opp)); + + TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT, + "segvn_fault:pp %p vp %p offset %llx", + opp, NULL, 0); + + /* + * The fault is treated as a copy-on-write fault if a + * write occurs on a private segment and the object + * page (i.e., mapping) is write protected. We assume + * that fatal protection checks have already been made. + */ + + cow = brkcow && ((vpprot & PROT_WRITE) == 0); + + /* + * If not a copy-on-write case load the translation + * and return. 
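+ * cow is zero when the caller did not ask to break COW (brkcow == 0) or
+ * when the object page is already writable, as is the case for shared
+ * anonymous mappings (see above).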
+ */ + if (cow == 0) { + if (IS_VMODSORT(opp->p_vnode) || enable_mbit_wa) { + if (rw == S_WRITE) + hat_setmod(opp); + else if (rw != S_OTHER && !hat_ismod(opp)) + prot &= ~PROT_WRITE; + } + + /* + * Handle pages that have been marked for migration + */ + if (lgrp_optimizations()) + page_migrate(seg, addr, &opp, 1); + + hat_memload(hat, addr, opp, prot & vpprot, hat_flag); + + if (!(hat_flag & HAT_LOAD_LOCK)) + page_unlock(opp); + + if (anon_lock) { + anon_array_exit(&cookie); + } + return (0); + } + + hat_setref(opp); + + ASSERT(amp != NULL && anon_lock); + + /* + * Steal the page only if it isn't a private page + * since stealing a private page is not worth the effort. + */ + if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) + steal = 1; + + /* + * Steal the original page if the following conditions are true: + * + * We are low on memory, the page is not private, page is not + * shared, not modified, not `locked' or if we have it `locked' + * (i.e., p_cowcnt == 1 and p_lckcnt == 0, which also implies + * that the page is not shared) and if it doesn't have any + * translations. page_struct_lock isn't needed to look at p_cowcnt + * and p_lckcnt because we first get exclusive lock on page. + */ + (void) hat_pagesync(opp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD); + + if (stealcow && freemem < minfree && steal && + page_tryupgrade(opp) && !hat_ismod(opp) && + ((opp->p_lckcnt == 0 && opp->p_cowcnt == 0) || + (opp->p_lckcnt == 0 && opp->p_cowcnt == 1 && + vpage != NULL && VPP_ISPPLOCK(vpage)))) { + /* + * Check if this page has other translations + * after unloading our translation. + */ + if (hat_page_is_mapped(opp)) { + hat_unload(seg->s_as->a_hat, addr, PAGESIZE, + HAT_UNLOAD); + } + + /* + * hat_unload() might sync back someone else's recent + * modification, so check again. + */ + if (!hat_ismod(opp) && !hat_page_is_mapped(opp)) + pageflags |= STEAL_PAGE; + } + + /* + * If we have a vpage pointer, see if it indicates that we have + * ``locked'' the page we map -- if so, tell anon_private to + * transfer the locking resource to the new page. + * + * See Statement at the beginning of segvn_lockop regarding + * the way lockcnts/cowcnts are handled during COW. + * + */ + if (vpage != NULL && VPP_ISPPLOCK(vpage)) + pageflags |= LOCK_PAGE; + + /* + * Allocate a private page and perform the copy. + * For MAP_NORESERVE reserve swap space now, unless this + * is a cow fault on an existing anon page in which case + * MAP_NORESERVE will have made advance reservations. + */ + if ((svd->flags & MAP_NORESERVE) && (ap == NULL)) { + if (anon_resv(ptob(1))) { + svd->swresv += ptob(1); + } else { + page_unlock(opp); + err = ENOMEM; + goto out; + } + } + oldap = ap; + pp = anon_private(&ap, seg, addr, prot, opp, pageflags, svd->cred); + if (pp == NULL) { + err = ENOMEM; /* out of swap space */ + goto out; + } + + /* + * If we copied away from an anonymous page, then + * we are one step closer to freeing up an anon slot. + * + * NOTE: The original anon slot must be released while + * holding the "anon_map" lock. This is necessary to prevent + * other threads from obtaining a pointer to the anon slot + * which may be freed if its "refcnt" is 1. 
+ */ + if (oldap != NULL) + anon_decref(oldap); + + (void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP); + + ASSERT(!IS_VMODSORT(pp->p_vnode)); + if (enable_mbit_wa) { + if (rw == S_WRITE) + hat_setmod(pp); + else if (!hat_ismod(pp)) + prot &= ~PROT_WRITE; + } + + + /* + * Handle pages that have been marked for migration + */ + if (lgrp_optimizations()) + page_migrate(seg, addr, &pp, 1); + hat_memload(hat, addr, pp, prot, hat_flag); + + if (!(hat_flag & HAT_LOAD_LOCK)) + page_unlock(pp); + + ASSERT(anon_lock); + anon_array_exit(&cookie); + return (0); +out: + if (anon_lock) + anon_array_exit(&cookie); + + if (type == F_SOFTLOCK) { + mutex_enter(&freemem_lock); + availrmem++; + segvn_pages_locked--; + svd->softlockcnt--; + mutex_exit(&freemem_lock); + } + return (FC_MAKE_ERR(err)); +} + +/* + * relocate a bunch of smaller targ pages into one large repl page. all targ + * pages must be complete pages smaller than replacement pages. + * it's assumed that no page's szc can change since they are all PAGESIZE or + * complete large pages locked SHARED. + */ +static void +segvn_relocate_pages(page_t **targ, page_t *replacement) +{ + page_t *pp; + pgcnt_t repl_npgs, curnpgs; + pgcnt_t i; + uint_t repl_szc = replacement->p_szc; + page_t *first_repl = replacement; + page_t *repl; + spgcnt_t npgs; + + VM_STAT_ADD(segvnvmstats.relocatepages[0]); + + ASSERT(repl_szc != 0); + npgs = repl_npgs = page_get_pagecnt(repl_szc); + + i = 0; + while (repl_npgs) { + spgcnt_t nreloc; + int err; + ASSERT(replacement != NULL); + pp = targ[i]; + ASSERT(pp->p_szc < repl_szc); + ASSERT(PAGE_EXCL(pp)); + ASSERT(!PP_ISFREE(pp)); + curnpgs = page_get_pagecnt(pp->p_szc); + if (curnpgs == 1) { + VM_STAT_ADD(segvnvmstats.relocatepages[1]); + repl = replacement; + page_sub(&replacement, repl); + ASSERT(PAGE_EXCL(repl)); + ASSERT(!PP_ISFREE(repl)); + ASSERT(repl->p_szc == repl_szc); + } else { + page_t *repl_savepp; + int j; + VM_STAT_ADD(segvnvmstats.relocatepages[2]); + repl_savepp = replacement; + for (j = 0; j < curnpgs; j++) { + repl = replacement; + page_sub(&replacement, repl); + ASSERT(PAGE_EXCL(repl)); + ASSERT(!PP_ISFREE(repl)); + ASSERT(repl->p_szc == repl_szc); + ASSERT(page_pptonum(targ[i + j]) == + page_pptonum(targ[i]) + j); + } + repl = repl_savepp; + ASSERT(IS_P2ALIGNED(page_pptonum(repl), curnpgs)); + } + err = page_relocate(&pp, &repl, 0, 1, &nreloc, NULL); + if (err || nreloc != curnpgs) { + panic("segvn_relocate_pages: " + "page_relocate failed err=%d curnpgs=%ld " + "nreloc=%ld", err, curnpgs, nreloc); + } + ASSERT(curnpgs <= repl_npgs); + repl_npgs -= curnpgs; + i += curnpgs; + } + ASSERT(replacement == NULL); + + repl = first_repl; + repl_npgs = npgs; + for (i = 0; i < repl_npgs; i++) { + ASSERT(PAGE_EXCL(repl)); + ASSERT(!PP_ISFREE(repl)); + targ[i] = repl; + page_downgrade(targ[i]); + repl = page_next(repl); + } +} + +/* + * Check if all pages in ppa array are complete smaller than szc pages and + * their roots will still be aligned relative to their current size if the + * entire ppa array is relocated into one szc page. If these conditions are + * not met return 0. + * + * If all pages are properly aligned attempt to upgrade their locks + * to exclusive mode. If it fails set *upgrdfail to 1 and return 0. + * upgrdfail was set to 0 by caller. + * + * Return 1 if all pages are aligned and locked exclusively. 
+ *
+ * If all pages in ppa array happen to be physically contiguous to make one
+ * szc page and all exclusive locks are successfully obtained promote the page
+ * size to szc and set *pszc to szc. Return 1 with pages locked shared.
+ */
+static int
+segvn_full_szcpages(page_t **ppa, uint_t szc, int *upgrdfail, uint_t *pszc)
+{
+ page_t *pp;
+ pfn_t pfn;
+ pgcnt_t totnpgs = page_get_pagecnt(szc);
+ pfn_t first_pfn;
+ int contig = 1;
+ pgcnt_t i;
+ pgcnt_t j;
+ uint_t curszc;
+ pgcnt_t curnpgs;
+ int root = 0;
+
+ ASSERT(szc > 0);
+
+ VM_STAT_ADD(segvnvmstats.fullszcpages[0]);
+
+ for (i = 0; i < totnpgs; i++) {
+ pp = ppa[i];
+ ASSERT(PAGE_SHARED(pp));
+ ASSERT(!PP_ISFREE(pp));
+ pfn = page_pptonum(pp);
+ if (i == 0) {
+ if (!IS_P2ALIGNED(pfn, totnpgs)) {
+ contig = 0;
+ } else {
+ first_pfn = pfn;
+ }
+ } else if (contig && pfn != first_pfn + i) {
+ contig = 0;
+ }
+ if (pp->p_szc == 0) {
+ if (root) {
+ VM_STAT_ADD(segvnvmstats.fullszcpages[1]);
+ return (0);
+ }
+ } else if (!root) {
+ if ((curszc = pp->p_szc) >= szc) {
+ VM_STAT_ADD(segvnvmstats.fullszcpages[2]);
+ return (0);
+ }
+ if (curszc == 0) {
+ /*
+ * p_szc changed means we don't have all pages
+ * locked. Return failure.
+ */
+ VM_STAT_ADD(segvnvmstats.fullszcpages[3]);
+ return (0);
+ }
+ curnpgs = page_get_pagecnt(curszc);
+ if (!IS_P2ALIGNED(pfn, curnpgs) ||
+ !IS_P2ALIGNED(i, curnpgs)) {
+ VM_STAT_ADD(segvnvmstats.fullszcpages[4]);
+ return (0);
+ }
+ root = 1;
+ } else {
+ ASSERT(i > 0);
+ VM_STAT_ADD(segvnvmstats.fullszcpages[5]);
+ if (pp->p_szc != curszc) {
+ VM_STAT_ADD(segvnvmstats.fullszcpages[6]);
+ return (0);
+ }
+ if (pfn - 1 != page_pptonum(ppa[i - 1])) {
+ panic("segvn_full_szcpages: "
+ "large page not physically contiguous");
+ }
+ if (P2PHASE(pfn, curnpgs) == curnpgs - 1) {
+ root = 0;
+ }
+ }
+ }
+
+ for (i = 0; i < totnpgs; i++) {
+ ASSERT(ppa[i]->p_szc < szc);
+ if (!page_tryupgrade(ppa[i])) {
+ for (j = 0; j < i; j++) {
+ page_downgrade(ppa[j]);
+ }
+ *pszc = ppa[i]->p_szc;
+ *upgrdfail = 1;
+ VM_STAT_ADD(segvnvmstats.fullszcpages[7]);
+ return (0);
+ }
+ }
+
+ /*
+ * When a page is put on a free cachelist its szc is set to 0. If the
+ * file system reclaimed pages from the cachelist, targ pages will be
+ * physically contiguous with 0 p_szc. In this case just upgrade the
+ * szc of the targ pages without any relocations.
+ * To avoid any hat issues with previous small mappings
+ * hat_pageunload() the target pages first.
+ */
+ if (contig) {
+ VM_STAT_ADD(segvnvmstats.fullszcpages[8]);
+ for (i = 0; i < totnpgs; i++) {
+ (void) hat_pageunload(ppa[i], HAT_FORCE_PGUNLOAD);
+ }
+ for (i = 0; i < totnpgs; i++) {
+ ppa[i]->p_szc = szc;
+ }
+ for (i = 0; i < totnpgs; i++) {
+ ASSERT(PAGE_EXCL(ppa[i]));
+ page_downgrade(ppa[i]);
+ }
+ if (pszc != NULL) {
+ *pszc = szc;
+ }
+ }
+ VM_STAT_ADD(segvnvmstats.fullszcpages[9]);
+ return (1);
+}
+
+/*
+ * Create physically contiguous pages for [vp, off] - [vp, off +
+ * page_size(szc)) range and for private segment return them in ppa array.
+ * Pages are created either via IO or relocations.
+ *
+ * Return 1 on success and 0 on failure.
+ *
+ * If physically contiguous pages already exist for this range return 1 without
+ * filling ppa array. Caller initializes ppa[0] as NULL to detect that ppa
+ * array wasn't filled. In this case caller fills ppa array via VOP_GETPAGE().
+ */ + +static int +segvn_fill_vp_pages(struct segvn_data *svd, vnode_t *vp, u_offset_t off, + uint_t szc, page_t **ppa, page_t **ppplist, uint_t *ret_pszc, + int *downsize) + +{ + page_t *pplist = *ppplist; + size_t pgsz = page_get_pagesize(szc); + pgcnt_t pages = btop(pgsz); + ulong_t start_off = off; + u_offset_t eoff = off + pgsz; + spgcnt_t nreloc; + u_offset_t io_off = off; + size_t io_len; + page_t *io_pplist = NULL; + page_t *done_pplist = NULL; + pgcnt_t pgidx = 0; + page_t *pp; + page_t *newpp; + page_t *targpp; + int io_err = 0; + int i; + pfn_t pfn; + ulong_t ppages; + page_t *targ_pplist = NULL; + page_t *repl_pplist = NULL; + page_t *tmp_pplist; + int nios = 0; + uint_t pszc; + struct vattr va; + + VM_STAT_ADD(segvnvmstats.fill_vp_pages[0]); + + ASSERT(szc != 0); + ASSERT(pplist->p_szc == szc); + + /* + * downsize will be set to 1 only if we fail to lock pages. this will + * allow subsequent faults to try to relocate the page again. If we + * fail due to misalignment don't downsize and let the caller map the + * whole region with small mappings to avoid more faults into the area + * where we can't get large pages anyway. + */ + *downsize = 0; + + while (off < eoff) { + newpp = pplist; + ASSERT(newpp != NULL); + ASSERT(PAGE_EXCL(newpp)); + ASSERT(!PP_ISFREE(newpp)); + /* + * we pass NULL for nrelocp to page_lookup_create() + * so that it doesn't relocate. We relocate here + * later only after we make sure we can lock all + * pages in the range we handle and they are all + * aligned. + */ + pp = page_lookup_create(vp, off, SE_SHARED, newpp, NULL, 0); + ASSERT(pp != NULL); + ASSERT(!PP_ISFREE(pp)); + ASSERT(pp->p_vnode == vp); + ASSERT(pp->p_offset == off); + if (pp == newpp) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[1]); + page_sub(&pplist, pp); + ASSERT(PAGE_EXCL(pp)); + ASSERT(page_iolock_assert(pp)); + page_list_concat(&io_pplist, &pp); + off += PAGESIZE; + continue; + } + VM_STAT_ADD(segvnvmstats.fill_vp_pages[2]); + pfn = page_pptonum(pp); + pszc = pp->p_szc; + if (pszc >= szc && targ_pplist == NULL && io_pplist == NULL && + IS_P2ALIGNED(pfn, pages)) { + ASSERT(repl_pplist == NULL); + ASSERT(done_pplist == NULL); + ASSERT(pplist == *ppplist); + page_unlock(pp); + page_free_replacement_page(pplist); + page_create_putback(pages); + *ppplist = NULL; + VM_STAT_ADD(segvnvmstats.fill_vp_pages[3]); + return (1); + } + if (pszc >= szc) { + page_unlock(pp); + segvn_faultvnmpss_align_err1++; + goto out; + } + ppages = page_get_pagecnt(pszc); + if (!IS_P2ALIGNED(pfn, ppages)) { + ASSERT(pszc > 0); + /* + * sizing down to pszc won't help. + */ + page_unlock(pp); + segvn_faultvnmpss_align_err2++; + goto out; + } + pfn = page_pptonum(newpp); + if (!IS_P2ALIGNED(pfn, ppages)) { + ASSERT(pszc > 0); + /* + * sizing down to pszc won't help. + */ + page_unlock(pp); + segvn_faultvnmpss_align_err3++; + goto out; + } + if (!PAGE_EXCL(pp)) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[4]); + page_unlock(pp); + *downsize = 1; + *ret_pszc = pp->p_szc; + goto out; + } + targpp = pp; + if (io_pplist != NULL) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[5]); + io_len = off - io_off; + /* + * Some file systems like NFS don't check EOF + * conditions in VOP_PAGEIO(). Check it here + * now that pages are locked SE_EXCL. Any file + * truncation will wait until the pages are + * unlocked so no need to worry that file will + * be truncated after we check its size here. + * XXX fix NFS to remove this check. 
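+ * The check below simply fetches the current file size with
+ * VOP_GETATTR(AT_SIZE) and gives up on the large page (downsize)
+ * if the pending I/O range would extend past EOF.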
+ */ + va.va_mask = AT_SIZE; + if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred) != 0) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[6]); + page_unlock(targpp); + goto out; + } + if (btopr(va.va_size) < btopr(io_off + io_len)) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[7]); + *downsize = 1; + *ret_pszc = 0; + page_unlock(targpp); + goto out; + } + io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len, + B_READ, svd->cred); + if (io_err) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[8]); + page_unlock(targpp); + if (io_err == EDEADLK) { + segvn_vmpss_pageio_deadlk_err++; + } + goto out; + } + nios++; + VM_STAT_ADD(segvnvmstats.fill_vp_pages[9]); + while (io_pplist != NULL) { + pp = io_pplist; + page_sub(&io_pplist, pp); + ASSERT(page_iolock_assert(pp)); + page_io_unlock(pp); + pgidx = (pp->p_offset - start_off) >> + PAGESHIFT; + ASSERT(pgidx < pages); + ppa[pgidx] = pp; + page_list_concat(&done_pplist, &pp); + } + } + pp = targpp; + ASSERT(PAGE_EXCL(pp)); + ASSERT(pp->p_szc <= pszc); + if (pszc != 0 && !group_page_trylock(pp, SE_EXCL)) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[10]); + page_unlock(pp); + *downsize = 1; + *ret_pszc = pp->p_szc; + goto out; + } + VM_STAT_ADD(segvnvmstats.fill_vp_pages[11]); + /* + * page szc chould have changed before the entire group was + * locked. reread page szc. + */ + pszc = pp->p_szc; + ppages = page_get_pagecnt(pszc); + + /* link just the roots */ + page_list_concat(&targ_pplist, &pp); + page_sub(&pplist, newpp); + page_list_concat(&repl_pplist, &newpp); + off += PAGESIZE; + while (--ppages != 0) { + newpp = pplist; + page_sub(&pplist, newpp); + off += PAGESIZE; + } + io_off = off; + } + if (io_pplist != NULL) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[12]); + io_len = eoff - io_off; + va.va_mask = AT_SIZE; + if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred) != 0) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[13]); + goto out; + } + if (btopr(va.va_size) < btopr(io_off + io_len)) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[14]); + *downsize = 1; + *ret_pszc = 0; + goto out; + } + io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len, + B_READ, svd->cred); + if (io_err) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[15]); + if (io_err == EDEADLK) { + segvn_vmpss_pageio_deadlk_err++; + } + goto out; + } + nios++; + while (io_pplist != NULL) { + pp = io_pplist; + page_sub(&io_pplist, pp); + ASSERT(page_iolock_assert(pp)); + page_io_unlock(pp); + pgidx = (pp->p_offset - start_off) >> PAGESHIFT; + ASSERT(pgidx < pages); + ppa[pgidx] = pp; + } + } + /* + * we're now bound to succeed or panic. + * remove pages from done_pplist. it's not needed anymore. 
+ */ + while (done_pplist != NULL) { + pp = done_pplist; + page_sub(&done_pplist, pp); + } + VM_STAT_ADD(segvnvmstats.fill_vp_pages[16]); + ASSERT(pplist == NULL); + *ppplist = NULL; + while (targ_pplist != NULL) { + int ret; + VM_STAT_ADD(segvnvmstats.fill_vp_pages[17]); + ASSERT(repl_pplist); + pp = targ_pplist; + page_sub(&targ_pplist, pp); + pgidx = (pp->p_offset - start_off) >> PAGESHIFT; + newpp = repl_pplist; + page_sub(&repl_pplist, newpp); +#ifdef DEBUG + pfn = page_pptonum(pp); + pszc = pp->p_szc; + ppages = page_get_pagecnt(pszc); + ASSERT(IS_P2ALIGNED(pfn, ppages)); + pfn = page_pptonum(newpp); + ASSERT(IS_P2ALIGNED(pfn, ppages)); + ASSERT(P2PHASE(pfn, pages) == pgidx); +#endif + nreloc = 0; + ret = page_relocate(&pp, &newpp, 0, 1, &nreloc, NULL); + if (ret != 0 || nreloc == 0) { + panic("segvn_fill_vp_pages: " + "page_relocate failed"); + } + pp = newpp; + while (nreloc-- != 0) { + ASSERT(PAGE_EXCL(pp)); + ASSERT(pp->p_vnode == vp); + ASSERT(pgidx == + ((pp->p_offset - start_off) >> PAGESHIFT)); + ppa[pgidx++] = pp; + pp = page_next(pp); + } + } + + if (svd->type == MAP_PRIVATE) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[18]); + for (i = 0; i < pages; i++) { + ASSERT(ppa[i] != NULL); + ASSERT(PAGE_EXCL(ppa[i])); + ASSERT(ppa[i]->p_vnode == vp); + ASSERT(ppa[i]->p_offset == + start_off + (i << PAGESHIFT)); + page_downgrade(ppa[i]); + } + ppa[pages] = NULL; + } else { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[19]); + /* + * the caller will still call VOP_GETPAGE() for shared segments + * to check FS write permissions. For private segments we map + * file read only anyway. so no VOP_GETPAGE is needed. + */ + for (i = 0; i < pages; i++) { + ASSERT(ppa[i] != NULL); + ASSERT(PAGE_EXCL(ppa[i])); + ASSERT(ppa[i]->p_vnode == vp); + ASSERT(ppa[i]->p_offset == + start_off + (i << PAGESHIFT)); + page_unlock(ppa[i]); + } + ppa[0] = NULL; + } + + return (1); +out: + /* + * Do the cleanup. Unlock target pages we didn't relocate. They are + * linked on targ_pplist by root pages. reassemble unused replacement + * and io pages back to pplist. + */ + if (io_pplist != NULL) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[20]); + pp = io_pplist; + do { + ASSERT(pp->p_vnode == vp); + ASSERT(pp->p_offset == io_off); + ASSERT(page_iolock_assert(pp)); + page_io_unlock(pp); + page_hashout(pp, NULL); + io_off += PAGESIZE; + } while ((pp = pp->p_next) != io_pplist); + page_list_concat(&io_pplist, &pplist); + pplist = io_pplist; + } + tmp_pplist = NULL; + while (targ_pplist != NULL) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[21]); + pp = targ_pplist; + ASSERT(PAGE_EXCL(pp)); + page_sub(&targ_pplist, pp); + + pszc = pp->p_szc; + ppages = page_get_pagecnt(pszc); + ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages)); + + if (pszc != 0) { + group_page_unlock(pp); + } + page_unlock(pp); + + pp = repl_pplist; + ASSERT(pp != NULL); + ASSERT(PAGE_EXCL(pp)); + ASSERT(pp->p_szc == szc); + page_sub(&repl_pplist, pp); + + ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages)); + + /* relink replacement page */ + page_list_concat(&tmp_pplist, &pp); + while (--ppages != 0) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[22]); + pp = page_next(pp); + ASSERT(PAGE_EXCL(pp)); + ASSERT(pp->p_szc == szc); + page_list_concat(&tmp_pplist, &pp); + } + } + if (tmp_pplist != NULL) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[23]); + page_list_concat(&tmp_pplist, &pplist); + pplist = tmp_pplist; + } + /* + * at this point all pages are either on done_pplist or + * pplist. They can't be all on done_pplist otherwise + * we'd've been done. 
+ */ + ASSERT(pplist != NULL); + if (nios != 0) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[24]); + pp = pplist; + do { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[25]); + ASSERT(pp->p_szc == szc); + ASSERT(PAGE_EXCL(pp)); + ASSERT(pp->p_vnode != vp); + pp->p_szc = 0; + } while ((pp = pp->p_next) != pplist); + + pp = done_pplist; + do { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[26]); + ASSERT(pp->p_szc == szc); + ASSERT(PAGE_EXCL(pp)); + ASSERT(pp->p_vnode == vp); + pp->p_szc = 0; + } while ((pp = pp->p_next) != done_pplist); + + while (pplist != NULL) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[27]); + pp = pplist; + page_sub(&pplist, pp); + page_free(pp, 0); + } + + while (done_pplist != NULL) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[28]); + pp = done_pplist; + page_sub(&done_pplist, pp); + page_unlock(pp); + } + *ppplist = NULL; + return (0); + } + ASSERT(pplist == *ppplist); + if (io_err) { + VM_STAT_ADD(segvnvmstats.fill_vp_pages[29]); + /* + * don't downsize on io error. + * see if vop_getpage succeeds. + * pplist may still be used in this case + * for relocations. + */ + return (0); + } + VM_STAT_ADD(segvnvmstats.fill_vp_pages[30]); + page_free_replacement_page(pplist); + page_create_putback(pages); + *ppplist = NULL; + return (0); +} + +int segvn_anypgsz = 0; + +#define SEGVN_RESTORE_SOFTLOCK(type, pages) \ + if ((type) == F_SOFTLOCK) { \ + mutex_enter(&freemem_lock); \ + availrmem += (pages); \ + segvn_pages_locked -= (pages); \ + svd->softlockcnt -= (pages); \ + mutex_exit(&freemem_lock); \ + } + +#define SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot) \ + if (IS_VMODSORT((ppa)[0]->p_vnode)) { \ + if ((rw) == S_WRITE) { \ + for (i = 0; i < (pages); i++) { \ + ASSERT((ppa)[i]->p_vnode == \ + (ppa)[0]->p_vnode); \ + hat_setmod((ppa)[i]); \ + } \ + } else if ((rw) != S_OTHER && \ + ((prot) & (vpprot) & PROT_WRITE)) { \ + for (i = 0; i < (pages); i++) { \ + ASSERT((ppa)[i]->p_vnode == \ + (ppa)[0]->p_vnode); \ + if (!hat_ismod((ppa)[i])) { \ + prot &= ~PROT_WRITE; \ + break; \ + } \ + } \ + } \ + } + +#ifdef VM_STATS + +#define SEGVN_VMSTAT_FLTVNPAGES(idx) \ + VM_STAT_ADD(segvnvmstats.fltvnpages[(idx)]); + +#else /* VM_STATS */ + +#define SEGVN_VMSTAT_FLTVNPAGES(idx) + +#endif + +static faultcode_t +segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr, + caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr, + caddr_t eaddr, int brkcow) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + struct anon_map *amp = svd->amp; + uchar_t segtype = svd->type; + uint_t szc = seg->s_szc; + size_t pgsz = page_get_pagesize(szc); + size_t maxpgsz = pgsz; + pgcnt_t pages = btop(pgsz); + pgcnt_t maxpages = pages; + size_t ppasize = (pages + 1) * sizeof (page_t *); + caddr_t a = lpgaddr; + caddr_t maxlpgeaddr = lpgeaddr; + u_offset_t off = svd->offset + (uintptr_t)(a - seg->s_base); + ulong_t aindx = svd->anon_index + seg_page(seg, a); + struct vpage *vpage = (svd->vpage != NULL) ? + &svd->vpage[seg_page(seg, a)] : NULL; + vnode_t *vp = svd->vp; + page_t **ppa; + uint_t pszc; + size_t ppgsz; + pgcnt_t ppages; + faultcode_t err = 0; + int ierr; + int vop_size_err = 0; + uint_t protchk, prot, vpprot; + ulong_t i; + int hat_flag = (type == F_SOFTLOCK) ? 
HAT_LOAD_LOCK : HAT_LOAD; + anon_sync_obj_t an_cookie; + enum seg_rw arw; + int alloc_failed = 0; + int adjszc_chk; + struct vattr va; + int xhat = 0; + page_t *pplist; + pfn_t pfn; + int physcontig; + int upgrdfail; + int segvn_anypgsz_vnode = 0; /* for now map vnode with 2 page sizes */ + + ASSERT(szc != 0); + ASSERT(vp != NULL); + ASSERT(brkcow == 0 || amp != NULL); + ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */ + ASSERT(!(svd->flags & MAP_NORESERVE)); + ASSERT(type != F_SOFTUNLOCK); + ASSERT(IS_P2ALIGNED(a, maxpgsz)); + ASSERT(amp == NULL || IS_P2ALIGNED(aindx, maxpages)); + ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); + ASSERT(seg->s_szc < NBBY * sizeof (int)); + + VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltvnpages[0]); + VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltvnpages[1]); + + if (svd->flags & MAP_TEXT) { + hat_flag |= HAT_LOAD_TEXT; + } + + if (svd->pageprot) { + switch (rw) { + case S_READ: + protchk = PROT_READ; + break; + case S_WRITE: + protchk = PROT_WRITE; + break; + case S_EXEC: + protchk = PROT_EXEC; + break; + case S_OTHER: + default: + protchk = PROT_READ | PROT_WRITE | PROT_EXEC; + break; + } + } else { + prot = svd->prot; + /* caller has already done segment level protection check. */ + } + + if (seg->s_as->a_hat != hat) { + xhat = 1; + } + + if (rw == S_WRITE && segtype == MAP_PRIVATE) { + SEGVN_VMSTAT_FLTVNPAGES(2); + arw = S_READ; + } else { + arw = rw; + } + + ppa = kmem_alloc(ppasize, KM_SLEEP); + + VM_STAT_COND_ADD(amp != NULL, segvnvmstats.fltvnpages[3]); + + for (;;) { + adjszc_chk = 0; + for (; a < lpgeaddr; a += pgsz, off += pgsz, aindx += pages) { + if (adjszc_chk) { + while (szc < seg->s_szc) { + uintptr_t e; + uint_t tszc; + tszc = segvn_anypgsz_vnode ? szc + 1 : + seg->s_szc; + ppgsz = page_get_pagesize(tszc); + if (!IS_P2ALIGNED(a, ppgsz) || + ((alloc_failed >> tszc) & + 0x1)) { + break; + } + SEGVN_VMSTAT_FLTVNPAGES(4); + szc = tszc; + pgsz = ppgsz; + pages = btop(pgsz); + e = P2ROUNDUP((uintptr_t)eaddr, pgsz); + lpgeaddr = (caddr_t)e; + } + } + + again: + if (IS_P2ALIGNED(a, maxpgsz) && amp != NULL) { + ASSERT(IS_P2ALIGNED(aindx, maxpages)); + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + anon_array_enter(amp, aindx, &an_cookie); + if (anon_get_ptr(amp->ahp, aindx) != NULL) { + SEGVN_VMSTAT_FLTVNPAGES(5); + if (anon_pages(amp->ahp, aindx, + maxpages) != maxpages) { + panic("segvn_fault_vnodepages:" + " empty anon slots\n"); + } + anon_array_exit(&an_cookie); + ANON_LOCK_EXIT(&->a_rwlock); + err = segvn_fault_anonpages(hat, seg, + a, a + maxpgsz, type, rw, + MAX(a, addr), + MIN(a + maxpgsz, eaddr), brkcow); + if (err != 0) { + SEGVN_VMSTAT_FLTVNPAGES(6); + goto out; + } + if (szc < seg->s_szc) { + szc = seg->s_szc; + pgsz = maxpgsz; + pages = maxpages; + lpgeaddr = maxlpgeaddr; + } + goto next; + } else if (anon_pages(amp->ahp, aindx, + maxpages)) { + panic("segvn_fault_vnodepages:" + " non empty anon slots\n"); + } else { + SEGVN_VMSTAT_FLTVNPAGES(7); + anon_array_exit(&an_cookie); + ANON_LOCK_EXIT(&->a_rwlock); + } + } + ASSERT(!brkcow || IS_P2ALIGNED(a, maxpgsz)); + + if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) { + ASSERT(vpage != NULL); + prot = VPP_PROT(vpage); + ASSERT(sameprot(seg, a, maxpgsz)); + if ((prot & protchk) == 0) { + SEGVN_VMSTAT_FLTVNPAGES(8); + err = FC_PROT; + goto out; + } + } + if (type == F_SOFTLOCK) { + mutex_enter(&freemem_lock); + if (availrmem < tune.t_minarmem + pages) { + mutex_exit(&freemem_lock); + err = FC_MAKE_ERR(ENOMEM); + goto out; + } else { + availrmem -= 
pages; + segvn_pages_locked += pages; + svd->softlockcnt += pages; + } + mutex_exit(&freemem_lock); + } + + pplist = NULL; + physcontig = 0; + ppa[0] = NULL; + if (!brkcow && szc && + !page_exists_physcontig(vp, off, szc, + segtype == MAP_PRIVATE ? ppa : NULL)) { + SEGVN_VMSTAT_FLTVNPAGES(9); + if (page_alloc_pages(seg, a, &pplist, NULL, + szc, 0)) { + SEGVN_RESTORE_SOFTLOCK(type, pages); + SEGVN_VMSTAT_FLTVNPAGES(10); + pszc = 0; + ierr = -1; + alloc_failed |= (1 << szc); + break; + } + if (vp->v_mpssdata == SEGVN_PAGEIO) { + int downsize; + SEGVN_VMSTAT_FLTVNPAGES(11); + physcontig = segvn_fill_vp_pages(svd, + vp, off, szc, ppa, &pplist, + &pszc, &downsize); + ASSERT(!physcontig || pplist == NULL); + if (!physcontig && downsize) { + SEGVN_RESTORE_SOFTLOCK(type, + pages); + ASSERT(pplist == NULL); + SEGVN_VMSTAT_FLTVNPAGES(12); + ierr = -1; + break; + } + ASSERT(!physcontig || + segtype == MAP_PRIVATE || + ppa[0] == NULL); + if (physcontig && ppa[0] == NULL) { + physcontig = 0; + } + } + } else if (!brkcow && szc && ppa[0] != NULL) { + SEGVN_VMSTAT_FLTVNPAGES(13); + ASSERT(segtype == MAP_PRIVATE); + physcontig = 1; + } + + if (!physcontig) { + SEGVN_VMSTAT_FLTVNPAGES(14); + ppa[0] = NULL; + ierr = VOP_GETPAGE(vp, (offset_t)off, pgsz, + &vpprot, ppa, pgsz, seg, a, arw, + svd->cred); + if (segtype == MAP_PRIVATE) { + SEGVN_VMSTAT_FLTVNPAGES(15); + vpprot &= ~PROT_WRITE; + } + } else { + ASSERT(segtype == MAP_PRIVATE); + SEGVN_VMSTAT_FLTVNPAGES(16); + vpprot = PROT_ALL & ~PROT_WRITE; + ierr = 0; + } + + if (ierr != 0) { + SEGVN_VMSTAT_FLTVNPAGES(17); + if (pplist != NULL) { + SEGVN_VMSTAT_FLTVNPAGES(18); + page_free_replacement_page(pplist); + page_create_putback(pages); + } + SEGVN_RESTORE_SOFTLOCK(type, pages); + if (a + pgsz <= eaddr) { + SEGVN_VMSTAT_FLTVNPAGES(19); + err = FC_MAKE_ERR(ierr); + goto out; + } + va.va_mask = AT_SIZE; + if (VOP_GETATTR(vp, &va, 0, svd->cred) != 0) { + SEGVN_VMSTAT_FLTVNPAGES(20); + err = FC_MAKE_ERR(EIO); + goto out; + } + if (btopr(va.va_size) >= btopr(off + pgsz)) { + SEGVN_VMSTAT_FLTVNPAGES(21); + err = FC_MAKE_ERR(EIO); + goto out; + } + if (btopr(va.va_size) < + btopr(off + (eaddr - a))) { + SEGVN_VMSTAT_FLTVNPAGES(22); + err = FC_MAKE_ERR(EIO); + goto out; + } + if (brkcow || type == F_SOFTLOCK) { + /* can't reduce map area */ + SEGVN_VMSTAT_FLTVNPAGES(23); + vop_size_err = 1; + goto out; + } + SEGVN_VMSTAT_FLTVNPAGES(24); + ASSERT(szc != 0); + pszc = 0; + ierr = -1; + break; + } + + if (amp != NULL) { + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + anon_array_enter(amp, aindx, &an_cookie); + } + if (amp != NULL && + anon_get_ptr(amp->ahp, aindx) != NULL) { + ulong_t taindx = P2ALIGN(aindx, maxpages); + + SEGVN_VMSTAT_FLTVNPAGES(25); + if (anon_pages(amp->ahp, taindx, maxpages) != + maxpages) { + panic("segvn_fault_vnodepages:" + " empty anon slots\n"); + } + for (i = 0; i < pages; i++) { + page_unlock(ppa[i]); + } + anon_array_exit(&an_cookie); + ANON_LOCK_EXIT(&->a_rwlock); + if (pplist != NULL) { + page_free_replacement_page(pplist); + page_create_putback(pages); + } + SEGVN_RESTORE_SOFTLOCK(type, pages); + if (szc < seg->s_szc) { + SEGVN_VMSTAT_FLTVNPAGES(26); + /* + * For private segments SOFTLOCK + * either always breaks cow (any rw + * type except S_READ_NOCOW) or + * address space is locked as writer + * (S_READ_NOCOW case) and anon slots + * can't show up on second check. + * Therefore if we are here for + * SOFTLOCK case it must be a cow + * break but cow break never reduces + * szc. Thus the assert below. 
+ */
+ ASSERT(!brkcow && type != F_SOFTLOCK);
+ pszc = seg->s_szc;
+ ierr = -2;
+ break;
+ }
+ ASSERT(IS_P2ALIGNED(a, maxpgsz));
+ goto again;
+ }
+#ifdef DEBUG
+ if (amp != NULL) {
+ ulong_t taindx = P2ALIGN(aindx, maxpages);
+ ASSERT(!anon_pages(amp->ahp, taindx, maxpages));
+ }
+#endif /* DEBUG */
+
+ if (brkcow) {
+ ASSERT(amp != NULL);
+ ASSERT(pplist == NULL);
+ ASSERT(szc == seg->s_szc);
+ ASSERT(IS_P2ALIGNED(a, maxpgsz));
+ ASSERT(IS_P2ALIGNED(aindx, maxpages));
+ SEGVN_VMSTAT_FLTVNPAGES(27);
+ ierr = anon_map_privatepages(amp, aindx, szc,
+ seg, a, prot, ppa, vpage, segvn_anypgsz,
+ svd->cred);
+ if (ierr != 0) {
+ SEGVN_VMSTAT_FLTVNPAGES(28);
+ anon_array_exit(&an_cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ SEGVN_RESTORE_SOFTLOCK(type, pages);
+ err = FC_MAKE_ERR(ierr);
+ goto out;
+ }
+
+ ASSERT(!IS_VMODSORT(ppa[0]->p_vnode));
+ /*
+ * p_szc can't be changed for locked
+ * swapfs pages.
+ */
+ hat_memload_array(hat, a, pgsz, ppa, prot,
+ hat_flag);
+
+ if (!(hat_flag & HAT_LOAD_LOCK)) {
+ SEGVN_VMSTAT_FLTVNPAGES(29);
+ for (i = 0; i < pages; i++) {
+ page_unlock(ppa[i]);
+ }
+ }
+ anon_array_exit(&an_cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ goto next;
+ }
+
+ pfn = page_pptonum(ppa[0]);
+ /*
+ * hat_page_demote() needs an EXCL lock on one of the
+ * constituent page_t's and it decreases the root's p_szc
+ * last. This means that if the root's p_szc is equal to
+ * szc and all its constituent pages are locked, any
+ * hat_page_demote() that could have changed p_szc to
+ * szc has already completed and no new hat_page_demote()
+ * can start for this large page.
+ */
+
+ /*
+ * We need to make sure the same mapping size is used for
+ * the same address range if there's a possibility the
+ * address is already mapped, because the hat layer panics
+ * when a translation is loaded for a range already
+ * mapped with a different page size. We achieve this
+ * by always using the largest page size possible subject
+ * to the constraints of page size, segment page size
+ * and page alignment. Since mappings are invalidated
+ * when those constraints change and make it
+ * impossible to use the previously used mapping size, no
+ * mapping size conflicts should happen.
+ */
+
+ chkszc:
+ if ((pszc = ppa[0]->p_szc) == szc &&
+ IS_P2ALIGNED(pfn, pages)) {
+
+ SEGVN_VMSTAT_FLTVNPAGES(30);
+#ifdef DEBUG
+ for (i = 0; i < pages; i++) {
+ ASSERT(PAGE_LOCKED(ppa[i]));
+ ASSERT(!PP_ISFREE(ppa[i]));
+ ASSERT(page_pptonum(ppa[i]) ==
+ pfn + i);
+ ASSERT(ppa[i]->p_szc == szc);
+ ASSERT(ppa[i]->p_vnode == vp);
+ ASSERT(ppa[i]->p_offset ==
+ off + (i << PAGESHIFT));
+ }
+#endif
+ /*
+ * All pages are of the szc we need and they are
+ * all locked so they can't change szc. Load the
+ * translations.
+ *
+ * If the page got promoted since the last check
+ * we don't need pplist.
+ */
+ if (pplist != NULL) {
+ page_free_replacement_page(pplist);
+ page_create_putback(pages);
+ }
+ if (PP_ISMIGRATE(ppa[0])) {
+ page_migrate(seg, a, ppa, pages);
+ }
+ SEGVN_UPDATE_MODBITS(ppa, pages, rw,
+ prot, vpprot);
+ if (!xhat) {
+ hat_memload_array(hat, a, pgsz, ppa,
+ prot & vpprot, hat_flag);
+ } else {
+ /*
+ * Avoid large xhat mappings to FS
+ * pages so that hat_page_demote()
+ * doesn't need to check for xhat
+ * large mappings.
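+ * Instead each constituent page is loaded below with
+ * an individual PAGESIZE hat_memload().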
+ */ + for (i = 0; i < pages; i++) { + hat_memload(hat, + a + (i << PAGESHIFT), + ppa[i], prot & vpprot, + hat_flag); + } + } + + if (!(hat_flag & HAT_LOAD_LOCK)) { + for (i = 0; i < pages; i++) { + page_unlock(ppa[i]); + } + } + if (amp != NULL) { + anon_array_exit(&an_cookie); + ANON_LOCK_EXIT(&->a_rwlock); + } + goto next; + } + + /* + * See if upsize is possible. + */ + if (pszc > szc && szc < seg->s_szc && + (segvn_anypgsz_vnode || pszc >= seg->s_szc)) { + pgcnt_t aphase; + uint_t pszc1 = MIN(pszc, seg->s_szc); + ppgsz = page_get_pagesize(pszc1); + ppages = btop(ppgsz); + aphase = btop(P2PHASE((uintptr_t)a, ppgsz)); + + SEGVN_VMSTAT_FLTVNPAGES(31); + if (aphase != P2PHASE(pfn, ppages)) { + segvn_faultvnmpss_align_err4++; + } else if (type == F_SOFTLOCK && + a != lpgaddr && + !IS_P2ALIGNED(pfn, + page_get_pagecnt(ppa[0]->p_szc))) { + /* + * if we locked previous offsets for + * smaller szc page larger page can't + * be here since one needs excl locks + * to promote page size. + */ + panic("segvn_fault_vnodepages: " + "unexpected larger than szc page" + " found after SOFTLOCK"); + } else { + SEGVN_VMSTAT_FLTVNPAGES(32); + if (pplist != NULL) { + page_t *pl = pplist; + page_free_replacement_page(pl); + page_create_putback(pages); + } + for (i = 0; i < pages; i++) { + page_unlock(ppa[i]); + } + if (amp != NULL) { + anon_array_exit(&an_cookie); + ANON_LOCK_EXIT(&->a_rwlock); + } + SEGVN_RESTORE_SOFTLOCK(type, pages); + pszc = pszc1; + ierr = -2; + break; + } + } + + /* + * check if we should use smallest mapping size. + */ + upgrdfail = 0; + if (szc == 0 || xhat || + (pszc >= szc && + !IS_P2ALIGNED(pfn, pages)) || + (pszc < szc && + !segvn_full_szcpages(ppa, szc, &upgrdfail, + &pszc))) { + + if (upgrdfail) { + /* + * segvn_full_szcpages failed to lock + * all pages EXCL. Size down. + */ + ASSERT(pszc < szc); + + SEGVN_VMSTAT_FLTVNPAGES(33); + + if (pplist != NULL) { + page_t *pl = pplist; + page_free_replacement_page(pl); + page_create_putback(pages); + } + + for (i = 0; i < pages; i++) { + page_unlock(ppa[i]); + } + if (amp != NULL) { + anon_array_exit(&an_cookie); + ANON_LOCK_EXIT(&->a_rwlock); + } + SEGVN_RESTORE_SOFTLOCK(type, pages); + ierr = -1; + break; + } + if (szc != 0 && !xhat) { + segvn_faultvnmpss_align_err5++; + } + SEGVN_VMSTAT_FLTVNPAGES(34); + if (pplist != NULL) { + page_free_replacement_page(pplist); + page_create_putback(pages); + } + SEGVN_UPDATE_MODBITS(ppa, pages, rw, + prot, vpprot); + for (i = 0; i < pages; i++) { + hat_memload(hat, a + (i << PAGESHIFT), + ppa[i], prot & vpprot, hat_flag); + } + if (!(hat_flag & HAT_LOAD_LOCK)) { + for (i = 0; i < pages; i++) { + page_unlock(ppa[i]); + } + } + if (amp != NULL) { + anon_array_exit(&an_cookie); + ANON_LOCK_EXIT(&->a_rwlock); + } + goto next; + } + + if (pszc == szc) { + /* + * segvn_full_szcpages() upgraded pages szc. + */ + ASSERT(pszc == ppa[0]->p_szc); + ASSERT(IS_P2ALIGNED(pfn, pages)); + goto chkszc; + } + + if (pszc > szc) { + kmutex_t *szcmtx; + SEGVN_VMSTAT_FLTVNPAGES(35); + /* + * p_szc of ppa[0] can change since we haven't + * locked all constituent pages. Call + * page_lock_szc() to prevent szc changes. + * This should be a rare case that happens when + * multiple segments use a different page size + * to map the same file offsets. 
+ */
+ szcmtx = page_szc_lock(ppa[0]);
+ pszc = ppa[0]->p_szc;
+ ASSERT(szcmtx != NULL || pszc == 0);
+ ASSERT(ppa[0]->p_szc <= pszc);
+ if (pszc <= szc) {
+ SEGVN_VMSTAT_FLTVNPAGES(36);
+ if (szcmtx != NULL) {
+ mutex_exit(szcmtx);
+ }
+ goto chkszc;
+ }
+ if (pplist != NULL) {
+ /*
+ * Page got promoted since the last check.
+ * We don't need the preallocated large
+ * page.
+ */
+ SEGVN_VMSTAT_FLTVNPAGES(37);
+ page_free_replacement_page(pplist);
+ page_create_putback(pages);
+ }
+ SEGVN_UPDATE_MODBITS(ppa, pages, rw,
+ prot, vpprot);
+ hat_memload_array(hat, a, pgsz, ppa,
+ prot & vpprot, hat_flag);
+ mutex_exit(szcmtx);
+ if (!(hat_flag & HAT_LOAD_LOCK)) {
+ for (i = 0; i < pages; i++) {
+ page_unlock(ppa[i]);
+ }
+ }
+ if (amp != NULL) {
+ anon_array_exit(&an_cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ }
+ goto next;
+ }
+
+ /*
+ * If the page got demoted since the last check
+ * we may not have allocated a larger page.
+ * Allocate one now.
+ */
+ if (pplist == NULL &&
+ page_alloc_pages(seg, a, &pplist, NULL, szc, 0)) {
+ SEGVN_VMSTAT_FLTVNPAGES(38);
+ for (i = 0; i < pages; i++) {
+ page_unlock(ppa[i]);
+ }
+ if (amp != NULL) {
+ anon_array_exit(&an_cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ }
+ SEGVN_RESTORE_SOFTLOCK(type, pages);
+ ierr = -1;
+ alloc_failed |= (1 << szc);
+ break;
+ }
+
+ SEGVN_VMSTAT_FLTVNPAGES(39);
+
+ segvn_relocate_pages(ppa, pplist);
+
+ SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot);
+ hat_memload_array(hat, a, pgsz, ppa, prot & vpprot,
+ hat_flag);
+ if (!(hat_flag & HAT_LOAD_LOCK)) {
+ for (i = 0; i < pages; i++) {
+ ASSERT(PAGE_SHARED(ppa[i]));
+ page_unlock(ppa[i]);
+ }
+ }
+ if (amp != NULL) {
+ anon_array_exit(&an_cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ }
+
+ next:
+ if (vpage != NULL) {
+ vpage += pages;
+ }
+ adjszc_chk = 1;
+ }
+ if (a == lpgeaddr)
+ break;
+ ASSERT(a < lpgeaddr);
+ /*
+ * ierr == -1 means we failed to map with a large page
+ * (either due to allocation/relocation failures or
+ * misalignment with other mappings to this file).
+ *
+ * ierr == -2 means some other thread allocated a large page
+ * after we gave up trying to map with a large page. Retry
+ * with a larger mapping.
+ */
+ ASSERT(ierr == -1 || ierr == -2);
+ ASSERT(ierr == -2 || szc != 0);
+ ASSERT(ierr == -1 || szc < seg->s_szc);
+ if (ierr == -2) {
+ SEGVN_VMSTAT_FLTVNPAGES(40);
+ ASSERT(pszc > szc && pszc <= seg->s_szc);
+ szc = pszc;
+ } else if (segvn_anypgsz_vnode) {
+ SEGVN_VMSTAT_FLTVNPAGES(41);
+ szc--;
+ } else {
+ SEGVN_VMSTAT_FLTVNPAGES(42);
+ ASSERT(pszc < szc);
+ /*
+ * Another process created a pszc large page,
+ * but we still have to drop to 0 szc.
+ */
+ szc = 0;
+ }
+
+ pgsz = page_get_pagesize(szc);
+ pages = btop(pgsz);
+ ASSERT(type != F_SOFTLOCK || ierr == -1 ||
+ (IS_P2ALIGNED(a, pgsz) && IS_P2ALIGNED(lpgeaddr, pgsz)));
+ if (type == F_SOFTLOCK) {
+ /*
+ * For softlocks we cannot reduce the fault area
+ * (calculated based on the largest page size for this
+ * segment) for size down and a is already next
+ * page size aligned as asserted above for size
+ * ups. Therefore just continue in case of softlock.
+ */
+ SEGVN_VMSTAT_FLTVNPAGES(43);
+ continue; /* keep lint happy */
+ } else if (ierr == -2) {
+
+ /*
+ * Size up case. Note lpgaddr may only be needed for
+ * softlock case so we don't adjust it here.
+ */
+ a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz);
+ ASSERT(a >= lpgaddr);
+ lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz);
+ off = svd->offset + (uintptr_t)(a - seg->s_base);
+ aindx = svd->anon_index + seg_page(seg, a);
+ vpage = (svd->vpage != NULL) ?
+ &svd->vpage[seg_page(seg, a)] : NULL; + } else { + /* + * Size down case. Note lpgaddr may only be needed for + * softlock case so we don't adjust it here. + */ + ASSERT(IS_P2ALIGNED(a, pgsz)); + ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz)); + lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); + ASSERT(a < lpgeaddr); + if (a < addr) { + SEGVN_VMSTAT_FLTVNPAGES(44); + /* + * The beginning of the large page region can + * be pulled to the right to make a smaller + * region. We haven't yet faulted a single + * page. + */ + a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); + ASSERT(a >= lpgaddr); + off = svd->offset + + (uintptr_t)(a - seg->s_base); + aindx = svd->anon_index + seg_page(seg, a); + vpage = (svd->vpage != NULL) ? + &svd->vpage[seg_page(seg, a)] : NULL; + } + } + } +out: + kmem_free(ppa, ppasize); + if (!err && !vop_size_err) { + SEGVN_VMSTAT_FLTVNPAGES(45); + return (0); + } + if (type == F_SOFTLOCK && a > lpgaddr) { + SEGVN_VMSTAT_FLTVNPAGES(46); + segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER); + } + if (!vop_size_err) { + SEGVN_VMSTAT_FLTVNPAGES(47); + return (err); + } + ASSERT(brkcow || type == F_SOFTLOCK); + /* + * Large page end is mapped beyond the end of file and it's a cow + * fault or softlock so we can't reduce the map area. For now just + * demote the segment. This should really only happen if the end of + * the file changed after the mapping was established since when large + * page segments are created we make sure they don't extend beyond the + * end of the file. + */ + SEGVN_VMSTAT_FLTVNPAGES(48); + + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); + err = 0; + if (seg->s_szc != 0) { + err = segvn_clrszc(seg); + if (err != 0) { + segvn_fltvnpages_clrszc_err++; + } + } + ASSERT(err || seg->s_szc == 0); + SEGVN_LOCK_DOWNGRADE(seg->s_as, &svd->lock); + /* segvn_fault will do its job as if szc had been zero to begin with */ + return (err == 0 ? IE_RETRY : FC_MAKE_ERR(err)); +} + +/* + * This routine will attempt to fault in one large page. + * it will use smaller pages if that fails. + * It should only be called for pure anonymous segments. + */ +static faultcode_t +segvn_fault_anonpages(struct hat *hat, struct seg *seg, caddr_t lpgaddr, + caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr, + caddr_t eaddr, int brkcow) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + struct anon_map *amp = svd->amp; + uchar_t segtype = svd->type; + uint_t szc = seg->s_szc; + size_t pgsz = page_get_pagesize(szc); + size_t maxpgsz = pgsz; + pgcnt_t pages = btop(pgsz); + size_t ppasize = pages * sizeof (page_t *); + caddr_t a = lpgaddr; + ulong_t aindx = svd->anon_index + seg_page(seg, a); + struct vpage *vpage = (svd->vpage != NULL) ? + &svd->vpage[seg_page(seg, a)] : NULL; + page_t **ppa; + uint_t ppa_szc; + faultcode_t err; + int ierr; + uint_t protchk, prot, vpprot; + int i; + int hat_flag = (type == F_SOFTLOCK) ? 
HAT_LOAD_LOCK : HAT_LOAD; + anon_sync_obj_t cookie; + + ASSERT(szc != 0); + ASSERT(amp != NULL); + ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */ + ASSERT(!(svd->flags & MAP_NORESERVE)); + ASSERT(type != F_SOFTUNLOCK); + ASSERT(segtype == MAP_PRIVATE); + ASSERT(IS_P2ALIGNED(a, maxpgsz)); + + ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); + + VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltanpages[0]); + VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltanpages[1]); + + if (svd->flags & MAP_TEXT) { + hat_flag |= HAT_LOAD_TEXT; + } + + if (svd->pageprot) { + switch (rw) { + case S_READ: + protchk = PROT_READ; + break; + case S_WRITE: + protchk = PROT_WRITE; + break; + case S_EXEC: + protchk = PROT_EXEC; + break; + case S_OTHER: + default: + protchk = PROT_READ | PROT_WRITE | PROT_EXEC; + break; + } + VM_STAT_ADD(segvnvmstats.fltanpages[2]); + } else { + prot = svd->prot; + /* caller has already done segment level protection check. */ + } + + ppa = kmem_alloc(ppasize, KM_SLEEP); + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + for (;;) { + for (; a < lpgeaddr; a += pgsz, aindx += pages) { + if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) { + VM_STAT_ADD(segvnvmstats.fltanpages[3]); + ASSERT(vpage != NULL); + prot = VPP_PROT(vpage); + ASSERT(sameprot(seg, a, maxpgsz)); + if ((prot & protchk) == 0) { + err = FC_PROT; + goto error; + } + } + if (type == F_SOFTLOCK) { + mutex_enter(&freemem_lock); + if (availrmem < tune.t_minarmem + pages) { + mutex_exit(&freemem_lock); + err = FC_MAKE_ERR(ENOMEM); + goto error; + } else { + availrmem -= pages; + segvn_pages_locked += pages; + svd->softlockcnt += pages; + } + mutex_exit(&freemem_lock); + } + anon_array_enter(amp, aindx, &cookie); + ppa_szc = (uint_t)-1; + ierr = anon_map_getpages(amp, aindx, szc, seg, a, + prot, &vpprot, ppa, &ppa_szc, vpage, rw, brkcow, + segvn_anypgsz, svd->cred); + if (ierr != 0) { + anon_array_exit(&cookie); + VM_STAT_ADD(segvnvmstats.fltanpages[4]); + if (type == F_SOFTLOCK) { + VM_STAT_ADD(segvnvmstats.fltanpages[5]); + mutex_enter(&freemem_lock); + availrmem += pages; + segvn_pages_locked -= pages; + svd->softlockcnt -= pages; + mutex_exit(&freemem_lock); + } + if (ierr > 0) { + VM_STAT_ADD(segvnvmstats.fltanpages[6]); + err = FC_MAKE_ERR(ierr); + goto error; + } + break; + } + + ASSERT(!IS_VMODSORT(ppa[0]->p_vnode)); + + /* + * Handle pages that have been marked for migration + */ + if (lgrp_optimizations()) + page_migrate(seg, a, ppa, pages); + + hat_memload_array(hat, a, pgsz, ppa, + prot & vpprot, hat_flag); + + if (hat_flag & HAT_LOAD_LOCK) { + VM_STAT_ADD(segvnvmstats.fltanpages[7]); + } else { + VM_STAT_ADD(segvnvmstats.fltanpages[8]); + for (i = 0; i < pages; i++) + page_unlock(ppa[i]); + } + if (vpage != NULL) + vpage += pages; + + anon_array_exit(&cookie); + } + if (a == lpgeaddr) + break; + ASSERT(a < lpgeaddr); + /* + * ierr == -1 means we failed to allocate a large page. + * so do a size down operation. + * + * ierr == -2 means some other process that privately shares + * pages with this process has allocated a larger page and we + * need to retry with larger pages. So do a size up + * operation. This relies on the fact that large pages are + * never partially shared i.e. if we share any constituent + * page of a large page with another process we must share the + * entire large page. 
Note this cannot happen for SOFTLOCK + * case, unless current address (a) is at the beginning of the + * next page size boundary because the other process couldn't + * have relocated locked pages. + */ + ASSERT(ierr == -1 || ierr == -2); + if (segvn_anypgsz) { + ASSERT(ierr == -2 || szc != 0); + ASSERT(ierr == -1 || szc < seg->s_szc); + szc = (ierr == -1) ? szc - 1 : szc + 1; + } else { + /* + * For non COW faults and segvn_anypgsz == 0 + * we need to be careful not to loop forever + * if existing page is found with szc other + * than 0 or seg->s_szc. This could be due + * to page relocations on behalf of DR or + * more likely large page creation. For this + * case simply re-size to existing page's szc + * if returned by anon_map_getpages(). + */ + if (ppa_szc == (uint_t)-1) { + szc = (ierr == -1) ? 0 : seg->s_szc; + } else { + ASSERT(ppa_szc <= seg->s_szc); + ASSERT(ierr == -2 || ppa_szc < szc); + ASSERT(ierr == -1 || ppa_szc > szc); + szc = ppa_szc; + } + } + + pgsz = page_get_pagesize(szc); + pages = btop(pgsz); + ASSERT(type != F_SOFTLOCK || ierr == -1 || + (IS_P2ALIGNED(a, pgsz) && IS_P2ALIGNED(lpgeaddr, pgsz))); + if (type == F_SOFTLOCK) { + /* + * For softlocks we cannot reduce the fault area + * (calculated based on the largest page size for this + * segment) for size down and a is already next + * page size aligned as assertted above for size + * ups. Therefore just continue in case of softlock. + */ + VM_STAT_ADD(segvnvmstats.fltanpages[9]); + continue; /* keep lint happy */ + } else if (ierr == -2) { + + /* + * Size up case. Note lpgaddr may only be needed for + * softlock case so we don't adjust it here. + */ + VM_STAT_ADD(segvnvmstats.fltanpages[10]); + a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz); + ASSERT(a >= lpgaddr); + lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); + aindx = svd->anon_index + seg_page(seg, a); + vpage = (svd->vpage != NULL) ? + &svd->vpage[seg_page(seg, a)] : NULL; + } else { + /* + * Size down case. Note lpgaddr may only be needed for + * softlock case so we don't adjust it here. + */ + VM_STAT_ADD(segvnvmstats.fltanpages[11]); + ASSERT(IS_P2ALIGNED(a, pgsz)); + ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz)); + lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); + ASSERT(a < lpgeaddr); + if (a < addr) { + /* + * The beginning of the large page region can + * be pulled to the right to make a smaller + * region. We haven't yet faulted a single + * page. + */ + VM_STAT_ADD(segvnvmstats.fltanpages[12]); + a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); + ASSERT(a >= lpgaddr); + aindx = svd->anon_index + seg_page(seg, a); + vpage = (svd->vpage != NULL) ? + &svd->vpage[seg_page(seg, a)] : NULL; + } + } + } + VM_STAT_ADD(segvnvmstats.fltanpages[13]); + ANON_LOCK_EXIT(&->a_rwlock); + kmem_free(ppa, ppasize); + return (0); +error: + VM_STAT_ADD(segvnvmstats.fltanpages[14]); + ANON_LOCK_EXIT(&->a_rwlock); + kmem_free(ppa, ppasize); + if (type == F_SOFTLOCK && a > lpgaddr) { + VM_STAT_ADD(segvnvmstats.fltanpages[15]); + segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER); + } + return (err); +} + +int fltadvice = 1; /* set to free behind pages for sequential access */ + +/* + * This routine is called via a machine specific fault handling routine. + * It is also called by software routines wishing to lock or unlock + * a range of addresses. 
+ *
+ * Here is the basic algorithm:
+ * If unlocking
+ * Call segvn_softunlock
+ * Return
+ * endif
+ * Checking and set up work
+ * If we will need some non-anonymous pages
+ * Call VOP_GETPAGE over the range of non-anonymous pages
+ * endif
+ * Loop over all addresses requested
+ * Call segvn_faultpage passing in page list
+ * to load up translations and handle anonymous pages
+ * endloop
+ * Load up translation to any additional pages in page list not
+ * already handled that fit into this segment
+ */
+static faultcode_t
+segvn_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len,
+ enum fault_type type, enum seg_rw rw)
+{
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+ page_t **plp, **ppp, *pp;
+ u_offset_t off;
+ caddr_t a;
+ struct vpage *vpage;
+ uint_t vpprot, prot;
+ int err;
+ page_t *pl[PVN_GETPAGE_NUM + 1];
+ size_t plsz, pl_alloc_sz;
+ size_t page;
+ ulong_t anon_index;
+ struct anon_map *amp;
+ int dogetpage = 0;
+ caddr_t lpgaddr, lpgeaddr;
+ size_t pgsz;
+ anon_sync_obj_t cookie;
+ int brkcow = BREAK_COW_SHARE(rw, type, svd->type);
+
+ /*
+ * S_READ_NOCOW is like S_READ
+ * except the caller advises there is no need
+ * to copy-on-write for softlock
+ * because it holds the address space
+ * locked as writer and thus prevents
+ * any copy-on-write of a softlocked
+ * page by another thread.
+ * The S_READ_NOCOW vs S_READ distinction is
+ * only needed for BREAK_COW_SHARE(). After
+ * that we treat S_READ_NOCOW as just S_READ.
+ */
+ if (rw == S_READ_NOCOW) {
+ rw = S_READ;
+ ASSERT(type == F_SOFTLOCK &&
+ AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
+ }
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ /*
+ * First handle the easy stuff
+ */
+ if (type == F_SOFTUNLOCK) {
+ SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
+ pgsz = (seg->s_szc == 0) ? PAGESIZE :
+ page_get_pagesize(seg->s_szc);
+ VM_STAT_COND_ADD(pgsz > PAGESIZE, segvnvmstats.fltanpages[16]);
+ CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr);
+ segvn_softunlock(seg, lpgaddr, lpgeaddr - lpgaddr, rw);
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (0);
+ }
+
+top:
+ SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
+
+ /*
+ * If we have the same protections for the entire segment,
+ * insure that the access being attempted is legitimate.
+ */
+
+ if (svd->pageprot == 0) {
+ uint_t protchk;
+
+ switch (rw) {
+ case S_READ:
+ protchk = PROT_READ;
+ break;
+ case S_WRITE:
+ protchk = PROT_WRITE;
+ break;
+ case S_EXEC:
+ protchk = PROT_EXEC;
+ break;
+ case S_OTHER:
+ default:
+ protchk = PROT_READ | PROT_WRITE | PROT_EXEC;
+ break;
+ }
+
+ if ((svd->prot & protchk) == 0) {
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (FC_PROT); /* illegal access type */
+ }
+ }
+
+ /*
+ * Check to see if we need to allocate an anon_map structure.
+ */
+ if (svd->amp == NULL && (svd->vp == NULL || brkcow)) {
+ /*
+ * Drop the "read" lock on the segment and acquire
+ * the "write" version since we have to allocate the
+ * anon_map.
+ */
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
+
+ if (svd->amp == NULL) {
+ svd->amp = anonmap_alloc(seg->s_size, 0);
+ svd->amp->a_szc = seg->s_szc;
+ }
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+
+ /*
+ * Start all over again since segment protections
+ * may have changed after we dropped the "read" lock.
+ */
+ goto top;
+ }
+
+ amp = svd->amp;
+
+ /*
+ * MADV_SEQUENTIAL work is ignored for large page segments.
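+ * The MADV_SEQUENTIAL freebehind code further below only deals
+ * with PAGESIZE pages; large page faults are handed off to
+ * segvn_fault_anonpages()/segvn_fault_vnodepages() instead.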
+ */ + if (seg->s_szc != 0) { + pgsz = page_get_pagesize(seg->s_szc); + ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); + /* + * We may need to do relocations so purge seg_pcache to allow + * pages to be locked exclusively. + */ + if (svd->softlockcnt != 0) + segvn_purge(seg); + CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); + if (svd->vp == NULL) { + ASSERT(svd->type == MAP_PRIVATE); + err = segvn_fault_anonpages(hat, seg, lpgaddr, + lpgeaddr, type, rw, addr, addr + len, brkcow); + } else { + err = segvn_fault_vnodepages(hat, seg, lpgaddr, + lpgeaddr, type, rw, addr, addr + len, brkcow); + if (err == IE_RETRY) { + ASSERT(seg->s_szc == 0); + ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock)); + goto cont; + } + } + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (err); + } + +cont: + page = seg_page(seg, addr); + if (amp != NULL) { + anon_index = svd->anon_index + page; + + if ((type == F_PROT) && (rw == S_READ) && + svd->type == MAP_PRIVATE && svd->pageprot == 0) { + size_t index = anon_index; + struct anon *ap; + + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + /* + * The fast path could apply to S_WRITE also, except + * that the protection fault could be caused by lazy + * tlb flush when ro->rw. In this case, the pte is + * RW already. But RO in the other cpu's tlb causes + * the fault. Since hat_chgprot won't do anything if + * pte doesn't change, we may end up faulting + * indefinitely until the RO tlb entry gets replaced. + */ + for (a = addr; a < addr + len; a += PAGESIZE, index++) { + anon_array_enter(amp, index, &cookie); + ap = anon_get_ptr(amp->ahp, index); + anon_array_exit(&cookie); + if ((ap == NULL) || (ap->an_refcnt != 1)) { + ANON_LOCK_EXIT(&->a_rwlock); + goto slow; + } + } + hat_chgprot(seg->s_as->a_hat, addr, len, svd->prot); + ANON_LOCK_EXIT(&->a_rwlock); + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (0); + } + } +slow: + + if (svd->vpage == NULL) + vpage = NULL; + else + vpage = &svd->vpage[page]; + + off = svd->offset + (uintptr_t)(addr - seg->s_base); + + /* + * If MADV_SEQUENTIAL has been set for the particular page we + * are faulting on, free behind all pages in the segment and put + * them on the free list. + */ + if ((page != 0) && fltadvice) { /* not if first page in segment */ + struct vpage *vpp; + ulong_t fanon_index; + size_t fpage; + u_offset_t pgoff, fpgoff; + struct vnode *fvp; + struct anon *fap = NULL; + + if (svd->advice == MADV_SEQUENTIAL || + (svd->pageadvice && + VPP_ADVICE(vpage) == MADV_SEQUENTIAL)) { + pgoff = off - PAGESIZE; + fpage = page - 1; + if (vpage != NULL) + vpp = &svd->vpage[fpage]; + if (amp != NULL) + fanon_index = svd->anon_index + fpage; + + while (pgoff > svd->offset) { + if (svd->advice != MADV_SEQUENTIAL && + (!svd->pageadvice || (vpage && + VPP_ADVICE(vpp) != MADV_SEQUENTIAL))) + break; + + /* + * If this is an anon page, we must find the + * correct <vp, offset> for it + */ + fap = NULL; + if (amp != NULL) { + ANON_LOCK_ENTER(&->a_rwlock, + RW_READER); + anon_array_enter(amp, fanon_index, + &cookie); + fap = anon_get_ptr(amp->ahp, + fanon_index); + if (fap != NULL) { + swap_xlate(fap, &fvp, &fpgoff); + } else { + fpgoff = pgoff; + fvp = svd->vp; + } + anon_array_exit(&cookie); + ANON_LOCK_EXIT(&->a_rwlock); + } else { + fpgoff = pgoff; + fvp = svd->vp; + } + if (fvp == NULL) + break; /* XXX */ + /* + * Skip pages that are free or have an + * "exclusive" lock. 
+ */ + pp = page_lookup_nowait(fvp, fpgoff, SE_SHARED); + if (pp == NULL) + break; + /* + * We don't need the page_struct_lock to test + * as this is only advisory; even if we + * acquire it someone might race in and lock + * the page after we unlock and before the + * PUTPAGE, then VOP_PUTPAGE will do nothing. + */ + if (pp->p_lckcnt == 0 && pp->p_cowcnt == 0) { + /* + * Hold the vnode before releasing + * the page lock to prevent it from + * being freed and re-used by some + * other thread. + */ + VN_HOLD(fvp); + page_unlock(pp); + /* + * We should build a page list + * to kluster putpages XXX + */ + (void) VOP_PUTPAGE(fvp, + (offset_t)fpgoff, PAGESIZE, + (B_DONTNEED|B_FREE|B_ASYNC), + svd->cred); + VN_RELE(fvp); + } else { + /* + * XXX - Should the loop terminate if + * the page is `locked'? + */ + page_unlock(pp); + } + --vpp; + --fanon_index; + pgoff -= PAGESIZE; + } + } + } + + plp = pl; + *plp = NULL; + pl_alloc_sz = 0; + + /* + * See if we need to call VOP_GETPAGE for + * *any* of the range being faulted on. + * We can skip all of this work if there + * was no original vnode. + */ + if (svd->vp != NULL) { + u_offset_t vp_off; + size_t vp_len; + struct anon *ap; + vnode_t *vp; + + vp_off = off; + vp_len = len; + + if (amp == NULL) + dogetpage = 1; + else { + /* + * Only acquire reader lock to prevent amp->ahp + * from being changed. It's ok to miss pages, + * hence we don't do anon_array_enter + */ + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + ap = anon_get_ptr(amp->ahp, anon_index); + + if (len <= PAGESIZE) + /* inline non_anon() */ + dogetpage = (ap == NULL); + else + dogetpage = non_anon(amp->ahp, anon_index, + &vp_off, &vp_len); + ANON_LOCK_EXIT(&->a_rwlock); + } + + if (dogetpage) { + enum seg_rw arw; + struct as *as = seg->s_as; + + if (len > ptob((sizeof (pl) / sizeof (pl[0])) - 1)) { + /* + * Page list won't fit in local array, + * allocate one of the needed size. + */ + pl_alloc_sz = + (btop(len) + 1) * sizeof (page_t *); + plp = kmem_alloc(pl_alloc_sz, KM_SLEEP); + plp[0] = NULL; + plsz = len; + } else if (rw == S_WRITE && svd->type == MAP_PRIVATE || + rw == S_OTHER || + (((size_t)(addr + PAGESIZE) < + (size_t)(seg->s_base + seg->s_size)) && + hat_probe(as->a_hat, addr + PAGESIZE))) { + /* + * Ask VOP_GETPAGE to return the exact number + * of pages if + * (a) this is a COW fault, or + * (b) this is a software fault, or + * (c) next page is already mapped. + */ + plsz = len; + } else { + /* + * Ask VOP_GETPAGE to return adjacent pages + * within the segment. + */ + plsz = MIN((size_t)PVN_GETPAGE_SZ, (size_t) + ((seg->s_base + seg->s_size) - addr)); + ASSERT((addr + plsz) <= + (seg->s_base + seg->s_size)); + } + + /* + * Need to get some non-anonymous pages. + * We need to make only one call to GETPAGE to do + * this to prevent certain deadlocking conditions + * when we are doing locking. In this case + * non_anon() should have picked up the smallest + * range which includes all the non-anonymous + * pages in the requested range. We have to + * be careful regarding which rw flag to pass in + * because on a private mapping, the underlying + * object is never allowed to be written. 
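+ * Hence for MAP_PRIVATE segments an S_WRITE fault is passed
+ * down to VOP_GETPAGE() as S_READ below.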
+ */ + if (rw == S_WRITE && svd->type == MAP_PRIVATE) { + arw = S_READ; + } else { + arw = rw; + } + vp = svd->vp; + TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE, + "segvn_getpage:seg %p addr %p vp %p", + seg, addr, vp); + err = VOP_GETPAGE(vp, (offset_t)vp_off, vp_len, + &vpprot, plp, plsz, seg, addr + (vp_off - off), arw, + svd->cred); + if (err) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + segvn_pagelist_rele(plp); + if (pl_alloc_sz) + kmem_free(plp, pl_alloc_sz); + return (FC_MAKE_ERR(err)); + } + if (svd->type == MAP_PRIVATE) + vpprot &= ~PROT_WRITE; + } + } + + /* + * N.B. at this time the plp array has all the needed non-anon + * pages in addition to (possibly) having some adjacent pages. + */ + + /* + * Always acquire the anon_array_lock to prevent + * 2 threads from allocating separate anon slots for + * the same "addr". + * + * If this is a copy-on-write fault and we don't already + * have the anon_array_lock, acquire it to prevent the + * fault routine from handling multiple copy-on-write faults + * on the same "addr" in the same address space. + * + * Only one thread should deal with the fault since after + * it is handled, the other threads can acquire a translation + * to the newly created private page. This prevents two or + * more threads from creating different private pages for the + * same fault. + * + * We grab "serialization" lock here if this is a MAP_PRIVATE segment + * to prevent deadlock between this thread and another thread + * which has soft-locked this page and wants to acquire serial_lock. + * ( bug 4026339 ) + * + * The fix for bug 4026339 becomes unnecessary when using the + * locking scheme with per amp rwlock and a global set of hash + * lock, anon_array_lock. If we steal a vnode page when low + * on memory and upgrad the page lock through page_rename, + * then the page is PAGE_HANDLED, nothing needs to be done + * for this page after returning from segvn_faultpage. + * + * But really, the page lock should be downgraded after + * the stolen page is page_rename'd. + */ + + if (amp != NULL) + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + + /* + * Ok, now loop over the address range and handle faults + */ + for (a = addr; a < addr + len; a += PAGESIZE, off += PAGESIZE) { + err = segvn_faultpage(hat, seg, a, off, vpage, plp, vpprot, + type, rw, brkcow); + if (err) { + if (amp != NULL) + ANON_LOCK_EXIT(&->a_rwlock); + if (type == F_SOFTLOCK && a > addr) + segvn_softunlock(seg, addr, (a - addr), + S_OTHER); + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + segvn_pagelist_rele(plp); + if (pl_alloc_sz) + kmem_free(plp, pl_alloc_sz); + return (err); + } + if (vpage) { + vpage++; + } else if (svd->vpage) { + page = seg_page(seg, addr); + vpage = &svd->vpage[++page]; + } + } + + /* Didn't get pages from the underlying fs so we're done */ + if (!dogetpage) + goto done; + + /* + * Now handle any other pages in the list returned. + * If the page can be used, load up the translations now. + * Note that the for loop will only be entered if "plp" + * is pointing to a non-NULL page pointer which means that + * VOP_GETPAGE() was called and vpprot has been initialized. + */ + if (svd->pageprot == 0) + prot = svd->prot & vpprot; + + + /* + * Large Files: diff should be unsigned value because we started + * supporting > 2GB segment sizes from 2.5.1 and when a + * large file of size > 2GB gets mapped to address space + * the diff value can be > 2GB. 
+ */ + + for (ppp = plp; (pp = *ppp) != NULL; ppp++) { + size_t diff; + struct anon *ap; + int anon_index; + anon_sync_obj_t cookie; + int hat_flag = HAT_LOAD_ADV; + + if (svd->flags & MAP_TEXT) { + hat_flag |= HAT_LOAD_TEXT; + } + + if (pp == PAGE_HANDLED) + continue; + + if (pp->p_offset >= svd->offset && + (pp->p_offset < svd->offset + seg->s_size)) { + + diff = pp->p_offset - svd->offset; + + /* + * Large Files: Following is the assertion + * validating the above cast. + */ + ASSERT(svd->vp == pp->p_vnode); + + page = btop(diff); + if (svd->pageprot) + prot = VPP_PROT(&svd->vpage[page]) & vpprot; + + /* + * Prevent other threads in the address space from + * creating private pages (i.e., allocating anon slots) + * while we are in the process of loading translations + * to additional pages returned by the underlying + * object. + */ + if (amp != NULL) { + anon_index = svd->anon_index + page; + anon_array_enter(amp, anon_index, &cookie); + ap = anon_get_ptr(amp->ahp, anon_index); + } + if ((amp == NULL) || (ap == NULL)) { + if (IS_VMODSORT(pp->p_vnode) || + enable_mbit_wa) { + if (rw == S_WRITE) + hat_setmod(pp); + else if (rw != S_OTHER && + !hat_ismod(pp)) + prot &= ~PROT_WRITE; + } + /* + * Skip mapping read ahead pages marked + * for migration, so they will get migrated + * properly on fault + */ + if ((prot & PROT_READ) && !PP_ISMIGRATE(pp)) { + hat_memload(hat, seg->s_base + diff, + pp, prot, hat_flag); + } + } + if (amp != NULL) + anon_array_exit(&cookie); + } + page_unlock(pp); + } +done: + if (amp != NULL) + ANON_LOCK_EXIT(&->a_rwlock); + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + if (pl_alloc_sz) + kmem_free(plp, pl_alloc_sz); + return (0); +} + +/* + * This routine is used to start I/O on pages asynchronously. XXX it will + * only create PAGESIZE pages. At fault time they will be relocated into + * larger pages. + */ +static faultcode_t +segvn_faulta(struct seg *seg, caddr_t addr) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + int err; + struct anon_map *amp; + vnode_t *vp; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); + if ((amp = svd->amp) != NULL) { + struct anon *ap; + + /* + * Reader lock to prevent amp->ahp from being changed. + * This is advisory, it's ok to miss a page, so + * we don't do anon_array_enter lock. 
+ */ + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + if ((ap = anon_get_ptr(amp->ahp, + svd->anon_index + seg_page(seg, addr))) != NULL) { + + err = anon_getpage(&ap, NULL, NULL, + 0, seg, addr, S_READ, svd->cred); + + ANON_LOCK_EXIT(&->a_rwlock); + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + if (err) + return (FC_MAKE_ERR(err)); + return (0); + } + ANON_LOCK_EXIT(&->a_rwlock); + } + + if (svd->vp == NULL) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (0); /* zfod page - do nothing now */ + } + + vp = svd->vp; + TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE, + "segvn_getpage:seg %p addr %p vp %p", seg, addr, vp); + err = VOP_GETPAGE(vp, + (offset_t)(svd->offset + (uintptr_t)(addr - seg->s_base)), + PAGESIZE, NULL, NULL, 0, seg, addr, + S_OTHER, svd->cred); + + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + if (err) + return (FC_MAKE_ERR(err)); + return (0); +} + +static int +segvn_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + struct vpage *svp, *evp; + struct vnode *vp; + size_t pgsz; + pgcnt_t pgcnt; + anon_sync_obj_t cookie; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + if ((svd->maxprot & prot) != prot) + return (EACCES); /* violated maxprot */ + + SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); + + /* return if prot is the same */ + if (!svd->pageprot && svd->prot == prot) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (0); + } + + /* + * Since we change protections we first have to flush the cache. + * This makes sure all the pagelock calls have to recheck + * protections. + */ + if (svd->softlockcnt > 0) { + /* + * Since we do have the segvn writers lock nobody can fill + * the cache with entries belonging to this seg during + * the purge. The flush either succeeds or we still have + * pending I/Os. + */ + segvn_purge(seg); + if (svd->softlockcnt > 0) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (EAGAIN); + } + } + + if (seg->s_szc != 0) { + int err; + pgsz = page_get_pagesize(seg->s_szc); + pgcnt = pgsz >> PAGESHIFT; + ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); + if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + ASSERT(seg->s_base != addr || seg->s_size != len); + /* + * If we are holding the as lock as a reader then + * we need to return IE_RETRY and let the as + * layer drop and re-aquire the lock as a writer. + */ + if (AS_READ_HELD(seg->s_as, &seg->s_as->a_lock)) + return (IE_RETRY); + VM_STAT_ADD(segvnvmstats.demoterange[1]); + err = segvn_demote_range(seg, addr, len, SDR_END); + if (err == 0) + return (IE_RETRY); + if (err == ENOMEM) + return (IE_NOMEM); + return (err); + } + } + + + /* + * If it's a private mapping and we're making it writable + * and no swap space has been reserved, have to reserve + * it all now. If it's a private mapping to a file (i.e., vp != NULL) + * and we're removing write permission on the entire segment and + * we haven't modified any pages, we can release the swap space. 
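+	 * For example, making an 8MB MAP_PRIVATE file mapping writable
+	 * reserves 8MB of swap up front, since every page could later be
+	 * copied on write; removing PROT_WRITE from the whole, still
+	 * unmodified segment lets that reservation go again.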
+ */ + if (svd->type == MAP_PRIVATE) { + if (prot & PROT_WRITE) { + size_t sz; + if (svd->swresv == 0 && !(svd->flags & MAP_NORESERVE)) { + if (anon_resv(seg->s_size) == 0) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (IE_NOMEM); + } + sz = svd->swresv = seg->s_size; + TRACE_3(TR_FAC_VM, TR_ANON_PROC, + "anon proc:%p %lu %u", + seg, sz, 1); + } + } else { + /* + * Swap space is released only if this segment + * does not map anonymous memory, since read faults + * on such segments still need an anon slot to read + * in the data. + */ + if (svd->swresv != 0 && svd->vp != NULL && + svd->amp == NULL && addr == seg->s_base && + len == seg->s_size && svd->pageprot == 0) { + anon_unresv(svd->swresv); + svd->swresv = 0; + TRACE_3(TR_FAC_VM, TR_ANON_PROC, + "anon proc:%p %lu %u", + seg, 0, 0); + } + } + } + + if (addr == seg->s_base && len == seg->s_size && svd->pageprot == 0) { + if (svd->prot == prot) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (0); /* all done */ + } + svd->prot = (uchar_t)prot; + } else { + struct anon *ap = NULL; + page_t *pp; + u_offset_t offset, off; + struct anon_map *amp; + ulong_t anon_idx = 0; + + /* + * A vpage structure exists or else the change does not + * involve the entire segment. Establish a vpage structure + * if none is there. Then, for each page in the range, + * adjust its individual permissions. Note that write- + * enabling a MAP_PRIVATE page can affect the claims for + * locked down memory. Overcommitting memory terminates + * the operation. + */ + segvn_vpage(seg); + if ((amp = svd->amp) != NULL) { + anon_idx = svd->anon_index + seg_page(seg, addr); + ASSERT(seg->s_szc == 0 || + IS_P2ALIGNED(anon_idx, pgcnt)); + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + } + + offset = svd->offset + (uintptr_t)(addr - seg->s_base); + evp = &svd->vpage[seg_page(seg, addr + len)]; + + /* + * See Statement at the beginning of segvn_lockop regarding + * the way cowcnts and lckcnts are handled. + */ + for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) { + + ASSERT(seg->s_szc == 0 || + (svd->vp != NULL || svd->type == MAP_PRIVATE)); + + if (seg->s_szc != 0 && svd->type == MAP_PRIVATE) { + if (amp != NULL) { + anon_array_enter(amp, anon_idx, + &cookie); + } + if (IS_P2ALIGNED(anon_idx, pgcnt) && + !segvn_claim_pages(seg, svp, offset, + anon_idx, prot)) { + if (amp != NULL) { + anon_array_exit(&cookie); + } + break; + } + if (amp != NULL) { + anon_array_exit(&cookie); + } + anon_idx++; + } else { + if (amp != NULL) { + anon_array_enter(amp, anon_idx, + &cookie); + ap = anon_get_ptr(amp->ahp, anon_idx++); + } + + if (VPP_ISPPLOCK(svp) && + (VPP_PROT(svp) != prot) && + (svd->type == MAP_PRIVATE)) { + + if (amp == NULL || ap == NULL) { + vp = svd->vp; + off = offset; + } else + swap_xlate(ap, &vp, &off); + if (amp != NULL) + anon_array_exit(&cookie); + + if ((pp = page_lookup(vp, off, + SE_SHARED)) == NULL) { + panic("segvn_setprot: no page"); + /*NOTREACHED*/ + } + ASSERT(seg->s_szc == 0); + if ((VPP_PROT(svp) ^ prot) & + PROT_WRITE) { + if (prot & PROT_WRITE) { + if (!page_addclaim(pp)) { + page_unlock(pp); + break; + } + } else { + if (!page_subclaim(pp)) { + page_unlock(pp); + break; + } + } + } + page_unlock(pp); + } else if (amp != NULL) + anon_array_exit(&cookie); + } + VPP_SETPROT(svp, prot); + offset += PAGESIZE; + } + if (amp != NULL) + ANON_LOCK_EXIT(&->a_rwlock); + + /* + * Did we terminate prematurely? If so, simply unload + * the translations to the things we've updated so far. 
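+		 * For example, if page_addclaim() fails part way through
+		 * the loop, len is recomputed to cover only the vpages
+		 * already updated and their translations are unloaded
+		 * before returning IE_NOMEM.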
+ */ + if (svp != evp) { + len = (svp - &svd->vpage[seg_page(seg, addr)]) * + PAGESIZE; + ASSERT(seg->s_szc == 0 || IS_P2ALIGNED(len, pgsz)); + if (len != 0) + hat_unload(seg->s_as->a_hat, addr, + len, HAT_UNLOAD); + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (IE_NOMEM); + } + } + + if ((prot & PROT_WRITE) != 0 || (prot & ~PROT_USER) == PROT_NONE) { + /* + * Either private or shared data with write access (in + * which case we need to throw out all former translations + * so that we get the right translations set up on fault + * and we don't allow write access to any copy-on-write pages + * that might be around or to prevent write access to pages + * representing holes in a file), or we don't have permission + * to access the memory at all (in which case we have to + * unload any current translations that might exist). + */ + hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); + } else { + /* + * A shared mapping or a private mapping in which write + * protection is going to be denied - just change all the + * protections over the range of addresses in question. + * segvn does not support any other attributes other + * than prot so we can use hat_chgattr. + */ + hat_chgattr(seg->s_as->a_hat, addr, len, prot); + } + + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + + return (0); +} + +/* + * segvn_setpagesize is called via SEGOP_SETPAGESIZE from as_setpagesize, + * to determine if the seg is capable of mapping the requested szc. + */ +static int +segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + struct segvn_data *nsvd; + struct anon_map *amp = svd->amp; + struct seg *nseg; + caddr_t eaddr = addr + len, a; + size_t pgsz = page_get_pagesize(szc); + int err; + u_offset_t off = svd->offset + (uintptr_t)(addr - seg->s_base); + extern struct vnode kvp; + + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); + ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size); + + if (seg->s_szc == szc || segvn_lpg_disable != 0) { + return (0); + } + + /* + * addr should always be pgsz aligned but eaddr may be misaligned if + * it's at the end of the segment. + * + * XXX we should assert this condition since as_setpagesize() logic + * guarantees it. + */ + if (!IS_P2ALIGNED(addr, pgsz) || + (!IS_P2ALIGNED(eaddr, pgsz) && + eaddr != seg->s_base + seg->s_size)) { + + segvn_setpgsz_align_err++; + return (EINVAL); + } + + if ((svd->vp == NULL && svd->type == MAP_SHARED) || + (svd->flags & MAP_NORESERVE) || seg->s_as == &kas || + szc > segvn_maxpgszc) { + return (EINVAL); + } + + /* paranoid check */ + if (svd->vp != NULL && + (IS_SWAPFSVP(svd->vp) || svd->vp == &kvp)) { + return (EINVAL); + } + + if (seg->s_szc == 0 && svd->vp != NULL && + map_addr_vacalign_check(addr, off)) { + return (EINVAL); + } + + /* + * Check that protections are the same within new page + * size boundaries. + */ + if (svd->pageprot) { + for (a = addr; a < eaddr; a += pgsz) { + if ((a + pgsz) > eaddr) { + if (!sameprot(seg, a, eaddr - a)) { + return (EINVAL); + } + } else { + if (!sameprot(seg, a, pgsz)) { + return (EINVAL); + } + } + } + } + + /* + * Since we are changing page size we first have to flush + * the cache. This makes sure all the pagelock calls have + * to recheck protections. + */ + if (svd->softlockcnt > 0) { + /* + * Since we do have the segvn writers lock nobody can fill + * the cache with entries belonging to this seg during + * the purge. The flush either succeeds or we still have + * pending I/Os. 
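+		 * A nonzero softlockcnt after the purge means some pages
+		 * of this segment are still SOFTLOCKed for I/O, so the
+		 * page size change fails with EAGAIN and the caller may
+		 * retry later.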
+ */ + segvn_purge(seg); + if (svd->softlockcnt > 0) { + return (EAGAIN); + } + } + + /* + * Operation for sub range of existing segment. + */ + if (addr != seg->s_base || eaddr != (seg->s_base + seg->s_size)) { + if (szc < seg->s_szc) { + VM_STAT_ADD(segvnvmstats.demoterange[2]); + err = segvn_demote_range(seg, addr, len, SDR_RANGE); + if (err == 0) { + return (IE_RETRY); + } + if (err == ENOMEM) { + return (IE_NOMEM); + } + return (err); + } + if (addr != seg->s_base) { + nseg = segvn_split_seg(seg, addr); + if (eaddr != (nseg->s_base + nseg->s_size)) { + /* eaddr is szc aligned */ + (void) segvn_split_seg(nseg, eaddr); + } + return (IE_RETRY); + } + if (eaddr != (seg->s_base + seg->s_size)) { + /* eaddr is szc aligned */ + (void) segvn_split_seg(seg, eaddr); + } + return (IE_RETRY); + } + + /* + * Break any low level sharing and reset seg->s_szc to 0. + */ + if ((err = segvn_clrszc(seg)) != 0) { + if (err == ENOMEM) { + err = IE_NOMEM; + } + return (err); + } + ASSERT(seg->s_szc == 0); + + /* + * If the end of the current segment is not pgsz aligned + * then attempt to concatenate with the next segment. + */ + if (!IS_P2ALIGNED(eaddr, pgsz)) { + nseg = AS_SEGNEXT(seg->s_as, seg); + if (nseg == NULL || nseg == seg || eaddr != nseg->s_base) { + return (ENOMEM); + } + if (nseg->s_ops != &segvn_ops) { + return (EINVAL); + } + nsvd = (struct segvn_data *)nseg->s_data; + if (nsvd->softlockcnt > 0) { + segvn_purge(nseg); + if (nsvd->softlockcnt > 0) { + return (EAGAIN); + } + } + err = segvn_clrszc(nseg); + if (err == ENOMEM) { + err = IE_NOMEM; + } + if (err != 0) { + return (err); + } + err = segvn_concat(seg, nseg, 1); + if (err == -1) { + return (EINVAL); + } + if (err == -2) { + return (IE_NOMEM); + } + return (IE_RETRY); + } + + /* + * May need to re-align anon array to + * new szc. + */ + if (amp != NULL) { + pgcnt_t pgcnt = pgsz >> PAGESHIFT; + if (!IS_P2ALIGNED(svd->anon_index, pgcnt)) { + struct anon_hdr *nahp; + + ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); + ASSERT(amp->refcnt == 1); + nahp = anon_create(btop(amp->size), ANON_NOSLEEP); + if (nahp == NULL) { + ANON_LOCK_EXIT(&->a_rwlock); + return (IE_NOMEM); + } + if (anon_copy_ptr(amp->ahp, svd->anon_index, + nahp, 0, btop(seg->s_size), ANON_NOSLEEP)) { + anon_release(nahp, btop(amp->size)); + ANON_LOCK_EXIT(&->a_rwlock); + return (IE_NOMEM); + } + anon_release(amp->ahp, btop(amp->size)); + amp->ahp = nahp; + svd->anon_index = 0; + ANON_LOCK_EXIT(&->a_rwlock); + } + } + if (svd->vp != NULL && szc != 0) { + struct vattr va; + u_offset_t eoffpage = svd->offset; + va.va_mask = AT_SIZE; + eoffpage += seg->s_size; + eoffpage = btopr(eoffpage); + if (VOP_GETATTR(svd->vp, &va, 0, svd->cred) != 0) { + segvn_setpgsz_getattr_err++; + return (EINVAL); + } + if (btopr(va.va_size) < eoffpage) { + segvn_setpgsz_eof_err++; + return (EINVAL); + } + if (amp != NULL) { + /* + * anon_fill_cow_holes() may call VOP_GETPAGE(). + * don't take anon map lock here to avoid holding it + * across VOP_GETPAGE() calls that may call back into + * segvn for klsutering checks. We don't really need + * anon map lock here since it's a private segment and + * we hold as level lock as writers. 
+ */ + if ((err = anon_fill_cow_holes(seg, seg->s_base, + amp->ahp, svd->anon_index, svd->vp, svd->offset, + seg->s_size, szc, svd->prot, svd->vpage, + svd->cred)) != 0) { + return (EINVAL); + } + } + segvn_setvnode_mpss(svd->vp); + } + + if (amp != NULL) { + ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); + amp->a_szc = szc; + ANON_LOCK_EXIT(&->a_rwlock); + } + + seg->s_szc = szc; + + return (0); +} + +static int +segvn_clrszc(struct seg *seg) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + struct anon_map *amp = svd->amp; + size_t pgsz; + pgcnt_t pages; + int err = 0; + caddr_t a = seg->s_base; + caddr_t ea = a + seg->s_size; + ulong_t an_idx = svd->anon_index; + vnode_t *vp = svd->vp; + struct vpage *vpage = svd->vpage; + page_t *anon_pl[1 + 1], *pp; + struct anon *ap, *oldap; + uint_t prot = svd->prot, vpprot; + + ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) || + SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); + ASSERT(svd->type == MAP_PRIVATE || + (vp != NULL && svd->amp == NULL)); + + if (vp == NULL && amp == NULL) { + seg->s_szc = 0; + return (0); + } + + /* + * do HAT_UNLOAD_UNMAP since we are changing the pagesize. + * unload argument is 0 when we are freeing the segment + * and unload was already done. + */ + hat_unload(seg->s_as->a_hat, seg->s_base, seg->s_size, + HAT_UNLOAD_UNMAP); + + if (amp == NULL) { + seg->s_szc = 0; + return (0); + } + + pgsz = page_get_pagesize(seg->s_szc); + pages = btop(pgsz); + + /* + * XXX anon rwlock is not really needed because this is a + * private segment and we are writers. + */ + ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); + + for (; a < ea; a += pgsz, an_idx += pages) { + if ((oldap = anon_get_ptr(amp->ahp, an_idx)) != NULL) { + if (svd->pageprot != 0) { + ASSERT(vpage != NULL); + prot = VPP_PROT(vpage); + ASSERT(sameprot(seg, a, pgsz)); + } + if (seg->s_szc != 0) { + ASSERT(vp == NULL || anon_pages(amp->ahp, + an_idx, pages) == pages); + if ((err = anon_map_demotepages(amp, an_idx, + seg, a, prot, vpage, svd->cred)) != 0) { + goto out; + } + } else { + if (oldap->an_refcnt == 1) { + continue; + } + if ((err = anon_getpage(&oldap, &vpprot, + anon_pl, PAGESIZE, seg, a, S_READ, + svd->cred))) { + goto out; + } + if ((pp = anon_private(&ap, seg, a, prot, + anon_pl[0], 0, svd->cred)) == NULL) { + err = ENOMEM; + goto out; + } + anon_decref(oldap); + (void) anon_set_ptr(amp->ahp, an_idx, ap, + ANON_SLEEP); + page_unlock(pp); + } + } + vpage = (vpage == NULL) ? NULL : vpage + pages; + } + + amp->a_szc = 0; + seg->s_szc = 0; +out: + ANON_LOCK_EXIT(&->a_rwlock); + return (err); +} + +static int +segvn_claim_pages( + struct seg *seg, + struct vpage *svp, + u_offset_t off, + ulong_t anon_idx, + uint_t prot) +{ + pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc); + size_t ppasize = (pgcnt + 1) * sizeof (page_t *); + page_t **ppa; + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + struct anon_map *amp = svd->amp; + struct vpage *evp = svp + pgcnt; + caddr_t addr = ((uintptr_t)(svp - svd->vpage) << PAGESHIFT) + + seg->s_base; + struct anon *ap; + struct vnode *vp = svd->vp; + page_t *pp; + pgcnt_t pg_idx, i; + int err = 0; + anoff_t aoff; + int anon = (amp != NULL) ? 
1 : 0; + + ASSERT(svd->type == MAP_PRIVATE); + ASSERT(svd->vpage != NULL); + ASSERT(seg->s_szc != 0); + ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); + ASSERT(amp == NULL || IS_P2ALIGNED(anon_idx, pgcnt)); + ASSERT(sameprot(seg, addr, pgcnt << PAGESHIFT)); + + if (VPP_PROT(svp) == prot) + return (1); + if (!((VPP_PROT(svp) ^ prot) & PROT_WRITE)) + return (1); + + ppa = kmem_alloc(ppasize, KM_SLEEP); + if (anon && vp != NULL) { + if (anon_get_ptr(amp->ahp, anon_idx) == NULL) { + anon = 0; + ASSERT(!anon_pages(amp->ahp, anon_idx, pgcnt)); + } + ASSERT(!anon || + anon_pages(amp->ahp, anon_idx, pgcnt) == pgcnt); + } + + for (*ppa = NULL, pg_idx = 0; svp < evp; svp++, anon_idx++) { + if (!VPP_ISPPLOCK(svp)) + continue; + if (anon) { + ap = anon_get_ptr(amp->ahp, anon_idx); + if (ap == NULL) { + panic("segvn_claim_pages: no anon slot"); + } + swap_xlate(ap, &vp, &aoff); + off = (u_offset_t)aoff; + } + ASSERT(vp != NULL); + if ((pp = page_lookup(vp, + (u_offset_t)off, SE_SHARED)) == NULL) { + panic("segvn_claim_pages: no page"); + } + ppa[pg_idx++] = pp; + off += PAGESIZE; + } + + if (ppa[0] == NULL) { + kmem_free(ppa, ppasize); + return (1); + } + + ASSERT(pg_idx <= pgcnt); + ppa[pg_idx] = NULL; + + if (prot & PROT_WRITE) + err = page_addclaim_pages(ppa); + else + err = page_subclaim_pages(ppa); + + for (i = 0; i < pg_idx; i++) { + ASSERT(ppa[i] != NULL); + page_unlock(ppa[i]); + } + + kmem_free(ppa, ppasize); + return (err); +} + +/* + * Returns right (upper address) segment if split occured. + * If the address is equal to the beginning or end of its segment it returns + * the current segment. + */ +static struct seg * +segvn_split_seg(struct seg *seg, caddr_t addr) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + struct seg *nseg; + size_t nsize; + struct segvn_data *nsvd; + + ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); + ASSERT(svd->type == MAP_PRIVATE || svd->amp == NULL); + ASSERT(addr >= seg->s_base); + ASSERT(addr <= seg->s_base + seg->s_size); + + if (addr == seg->s_base || addr == seg->s_base + seg->s_size) + return (seg); + + nsize = seg->s_base + seg->s_size - addr; + seg->s_size = addr - seg->s_base; + nseg = seg_alloc(seg->s_as, addr, nsize); + ASSERT(nseg != NULL); + nseg->s_ops = seg->s_ops; + nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); + nseg->s_data = (void *)nsvd; + nseg->s_szc = seg->s_szc; + *nsvd = *svd; + rw_init(&nsvd->lock, NULL, RW_DEFAULT, NULL); + + if (nsvd->vp != NULL) { + VN_HOLD(nsvd->vp); + nsvd->offset = svd->offset + + (uintptr_t)(nseg->s_base - seg->s_base); + if (nsvd->type == MAP_SHARED) + lgrp_shm_policy_init(NULL, nsvd->vp); + } else { + /* + * The offset for an anonymous segment has no signifigance in + * terms of an offset into a file. If we were to use the above + * calculation instead, the structures read out of + * /proc/<pid>/xmap would be more difficult to decipher since + * it would be unclear whether two seemingly contiguous + * prxmap_t structures represented different segments or a + * single segment that had been split up into multiple prxmap_t + * structures (e.g. if some part of the segment had not yet + * been faulted in). 
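+		 * With the offset reset to 0, each anonymous segment
+		 * instead appears as an independent, zero-based mapping
+		 * in those structures.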
+ */ + nsvd->offset = 0; + } + + ASSERT(svd->softlockcnt == 0); + crhold(svd->cred); + + if (svd->vpage != NULL) { + size_t bytes = vpgtob(seg_pages(seg)); + size_t nbytes = vpgtob(seg_pages(nseg)); + struct vpage *ovpage = svd->vpage; + + svd->vpage = kmem_alloc(bytes, KM_SLEEP); + bcopy(ovpage, svd->vpage, bytes); + nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP); + bcopy(ovpage + seg_pages(seg), nsvd->vpage, nbytes); + kmem_free(ovpage, bytes + nbytes); + } + if (svd->amp != NULL) { + struct anon_map *oamp = svd->amp, *namp; + struct anon_hdr *nahp; + + ANON_LOCK_ENTER(&oamp->a_rwlock, RW_WRITER); + ASSERT(oamp->refcnt == 1); + nahp = anon_create(btop(seg->s_size), ANON_SLEEP); + (void) anon_copy_ptr(oamp->ahp, svd->anon_index, + nahp, 0, btop(seg->s_size), ANON_SLEEP); + + namp = anonmap_alloc(nseg->s_size, 0); + namp->a_szc = nseg->s_szc; + (void) anon_copy_ptr(oamp->ahp, + svd->anon_index + btop(seg->s_size), + namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP); + anon_release(oamp->ahp, btop(oamp->size)); + oamp->ahp = nahp; + oamp->size = seg->s_size; + svd->anon_index = 0; + nsvd->amp = namp; + nsvd->anon_index = 0; + ANON_LOCK_EXIT(&oamp->a_rwlock); + } + + /* + * Split amount of swap reserve + */ + if (svd->swresv) { + /* + * For MAP_NORESERVE, only allocate swap reserve for pages + * being used. Other segments get enough to cover whole + * segment. + */ + if (svd->flags & MAP_NORESERVE) { + size_t oswresv; + + ASSERT(svd->amp); + oswresv = svd->swresv; + svd->swresv = ptob(anon_pages(svd->amp->ahp, + svd->anon_index, btop(seg->s_size))); + nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp, + nsvd->anon_index, btop(nseg->s_size))); + ASSERT(oswresv >= (svd->swresv + nsvd->swresv)); + } else { + ASSERT(svd->swresv == seg->s_size + nseg->s_size); + svd->swresv = seg->s_size; + nsvd->swresv = nseg->s_size; + } + } + + return (nseg); +} + + +/* + * called on memory operations (unmap, setprot, setpagesize) for a subset + * of a large page segment to either demote the memory range (SDR_RANGE) + * or the ends (SDR_END) by addr/len. + * + * returns 0 on success. returns errno, including ENOMEM, on failure. 
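+ * For illustration, assume 4M large pages on a segment mapping [0, 16M)
+ * and a request with addr == 6M and len == 8M (so eaddr == 14M): lpgaddr
+ * is 4M and lpgeaddr is 16M. SDR_END demotes only the large pages that
+ * contain the unaligned ends, [4M, 8M) and [12M, 16M), leaving [8M, 12M)
+ * at the large page size, while SDR_RANGE demotes every large page in
+ * [lpgaddr, lpgeaddr).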
+ */ +static int +segvn_demote_range(struct seg *seg, caddr_t addr, size_t len, int flag) +{ + caddr_t eaddr = addr + len; + caddr_t lpgaddr, lpgeaddr; + struct seg *nseg; + struct seg *badseg1 = NULL; + struct seg *badseg2 = NULL; + size_t pgsz; + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + int err; + + ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); + ASSERT(seg->s_szc != 0); + pgsz = page_get_pagesize(seg->s_szc); + ASSERT(seg->s_base != addr || seg->s_size != len); + ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size); + ASSERT(svd->softlockcnt == 0); + ASSERT(svd->type == MAP_PRIVATE || + (svd->vp != NULL && svd->amp == NULL)); + + CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); + ASSERT(flag == SDR_RANGE || eaddr < lpgeaddr || addr > lpgaddr); + if (flag == SDR_RANGE) { + /* demote entire range */ + badseg1 = nseg = segvn_split_seg(seg, lpgaddr); + (void) segvn_split_seg(nseg, lpgeaddr); + ASSERT(badseg1->s_base == lpgaddr); + ASSERT(badseg1->s_size == lpgeaddr - lpgaddr); + } else if (addr != lpgaddr) { + ASSERT(flag == SDR_END); + badseg1 = nseg = segvn_split_seg(seg, lpgaddr); + if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz && + eaddr < lpgaddr + 2 * pgsz) { + (void) segvn_split_seg(nseg, lpgeaddr); + ASSERT(badseg1->s_base == lpgaddr); + ASSERT(badseg1->s_size == 2 * pgsz); + } else { + nseg = segvn_split_seg(nseg, lpgaddr + pgsz); + ASSERT(badseg1->s_base == lpgaddr); + ASSERT(badseg1->s_size == pgsz); + if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz) { + ASSERT(lpgeaddr - lpgaddr > 2 * pgsz); + nseg = segvn_split_seg(nseg, lpgeaddr - pgsz); + badseg2 = nseg; + (void) segvn_split_seg(nseg, lpgeaddr); + ASSERT(badseg2->s_base == lpgeaddr - pgsz); + ASSERT(badseg2->s_size == pgsz); + } + } + } else { + ASSERT(flag == SDR_END); + ASSERT(eaddr < lpgeaddr); + badseg1 = nseg = segvn_split_seg(seg, lpgeaddr - pgsz); + (void) segvn_split_seg(nseg, lpgeaddr); + ASSERT(badseg1->s_base == lpgeaddr - pgsz); + ASSERT(badseg1->s_size == pgsz); + } + + ASSERT(badseg1 != NULL); + ASSERT(badseg1->s_szc != 0); + ASSERT(page_get_pagesize(badseg1->s_szc) == pgsz); + ASSERT(flag == SDR_RANGE || badseg1->s_size == pgsz || + badseg1->s_size == 2 * pgsz); + if (err = segvn_clrszc(badseg1)) { + return (err); + } + ASSERT(badseg1->s_szc == 0); + + if (badseg2 == NULL) + return (0); + ASSERT(badseg2->s_szc != 0); + ASSERT(page_get_pagesize(badseg2->s_szc) == pgsz); + ASSERT(badseg2->s_size == pgsz); + ASSERT(sameprot(badseg2, badseg2->s_base, badseg2->s_size)); + if (err = segvn_clrszc(badseg2)) { + return (err); + } + ASSERT(badseg2->s_szc == 0); + return (0); +} + +static int +segvn_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + struct vpage *vp, *evp; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); + /* + * If segment protection can be used, simply check against them. + */ + if (svd->pageprot == 0) { + int err; + + err = ((svd->prot & prot) != prot) ? EACCES : 0; + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (err); + } + + /* + * Have to check down to the vpage level. 
+ */ + evp = &svd->vpage[seg_page(seg, addr + len)]; + for (vp = &svd->vpage[seg_page(seg, addr)]; vp < evp; vp++) { + if ((VPP_PROT(vp) & prot) != prot) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (EACCES); + } + } + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (0); +} + +static int +segvn_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + if (pgno != 0) { + SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); + if (svd->pageprot == 0) { + do + protv[--pgno] = svd->prot; + while (pgno != 0); + } else { + size_t pgoff = seg_page(seg, addr); + + do { + pgno--; + protv[pgno] = VPP_PROT(&svd->vpage[pgno+pgoff]); + } while (pgno != 0); + } + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + } + return (0); +} + +static u_offset_t +segvn_getoffset(struct seg *seg, caddr_t addr) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + return (svd->offset + (uintptr_t)(addr - seg->s_base)); +} + +/*ARGSUSED*/ +static int +segvn_gettype(struct seg *seg, caddr_t addr) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + return (svd->type | (svd->flags & MAP_NORESERVE)); +} + +/*ARGSUSED*/ +static int +segvn_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + *vpp = svd->vp; + return (0); +} + +/* + * Check to see if it makes sense to do kluster/read ahead to + * addr + delta relative to the mapping at addr. We assume here + * that delta is a signed PAGESIZE'd multiple (which can be negative). + * + * For segvn, we currently "approve" of the action if we are + * still in the segment and it maps from the same vp/off, + * or if the advice stored in segvn_data or vpages allows it. + * Currently, klustering is not allowed only if MADV_RANDOM is set. + */ +static int +segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + struct anon *oap, *ap; + ssize_t pd; + size_t page; + struct vnode *vp1, *vp2; + u_offset_t off1, off2; + struct anon_map *amp; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) || + SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); + + if (addr + delta < seg->s_base || + addr + delta >= (seg->s_base + seg->s_size)) + return (-1); /* exceeded segment bounds */ + + pd = delta / (ssize_t)PAGESIZE; /* divide to preserve sign bit */ + page = seg_page(seg, addr); + + /* + * Check to see if either of the pages addr or addr + delta + * have advice set that prevents klustering (if MADV_RANDOM advice + * is set for entire segment, or MADV_SEQUENTIAL is set and delta + * is negative). 
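+	 * For example, with MADV_SEQUENTIAL set a request for the page at
+	 * addr - PAGESIZE (delta < 0) is refused, since pages behind the
+	 * current access are presumed to be no longer needed.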
+ */ + if (svd->advice == MADV_RANDOM || + svd->advice == MADV_SEQUENTIAL && delta < 0) + return (-1); + else if (svd->pageadvice && svd->vpage) { + struct vpage *bvpp, *evpp; + + bvpp = &svd->vpage[page]; + evpp = &svd->vpage[page + pd]; + if (VPP_ADVICE(bvpp) == MADV_RANDOM || + VPP_ADVICE(evpp) == MADV_SEQUENTIAL && delta < 0) + return (-1); + if (VPP_ADVICE(bvpp) != VPP_ADVICE(evpp) && + VPP_ADVICE(evpp) == MADV_RANDOM) + return (-1); + } + + if (svd->type == MAP_SHARED) + return (0); /* shared mapping - all ok */ + + if ((amp = svd->amp) == NULL) + return (0); /* off original vnode */ + + page += svd->anon_index; + + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + + oap = anon_get_ptr(amp->ahp, page); + ap = anon_get_ptr(amp->ahp, page + pd); + + ANON_LOCK_EXIT(&->a_rwlock); + + if ((oap == NULL && ap != NULL) || (oap != NULL && ap == NULL)) { + return (-1); /* one with and one without an anon */ + } + + if (oap == NULL) { /* implies that ap == NULL */ + return (0); /* off original vnode */ + } + + /* + * Now we know we have two anon pointers - check to + * see if they happen to be properly allocated. + */ + + /* + * XXX We cheat here and don't lock the anon slots. We can't because + * we may have been called from the anon layer which might already + * have locked them. We are holding a refcnt on the slots so they + * can't disappear. The worst that will happen is we'll get the wrong + * names (vp, off) for the slots and make a poor klustering decision. + */ + swap_xlate(ap, &vp1, &off1); + swap_xlate(oap, &vp2, &off2); + + + if (!VOP_CMP(vp1, vp2) || off1 - off2 != delta) + return (-1); + return (0); +} + +/* + * Swap the pages of seg out to secondary storage, returning the + * number of bytes of storage freed. + * + * The basic idea is first to unload all translations and then to call + * VOP_PUTPAGE() for all newly-unmapped pages, to push them out to the + * swap device. Pages to which other segments have mappings will remain + * mapped and won't be swapped. Our caller (as_swapout) has already + * performed the unloading step. + * + * The value returned is intended to correlate well with the process's + * memory requirements. However, there are some caveats: + * 1) When given a shared segment as argument, this routine will + * only succeed in swapping out pages for the last sharer of the + * segment. (Previous callers will only have decremented mapping + * reference counts.) + * 2) We assume that the hat layer maintains a large enough translation + * cache to capture process reference patterns. + */ +static size_t +segvn_swapout(struct seg *seg) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + struct anon_map *amp; + pgcnt_t pgcnt = 0; + pgcnt_t npages; + pgcnt_t page; + ulong_t anon_index; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); + /* + * Find pages unmapped by our caller and force them + * out to the virtual swap device. + */ + if ((amp = svd->amp) != NULL) + anon_index = svd->anon_index; + npages = seg->s_size >> PAGESHIFT; + for (page = 0; page < npages; page++) { + page_t *pp; + struct anon *ap; + struct vnode *vp; + u_offset_t off; + anon_sync_obj_t cookie; + + /* + * Obtain <vp, off> pair for the page, then look it up. + * + * Note that this code is willing to consider regular + * pages as well as anon pages. Is this appropriate here? 
+ */ + ap = NULL; + if (amp != NULL) { + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + anon_array_enter(amp, anon_index + page, &cookie); + ap = anon_get_ptr(amp->ahp, anon_index + page); + if (ap != NULL) { + swap_xlate(ap, &vp, &off); + } else { + vp = svd->vp; + off = svd->offset + ptob(page); + } + anon_array_exit(&cookie); + ANON_LOCK_EXIT(&->a_rwlock); + } else { + vp = svd->vp; + off = svd->offset + ptob(page); + } + if (vp == NULL) { /* untouched zfod page */ + ASSERT(ap == NULL); + continue; + } + + pp = page_lookup_nowait(vp, off, SE_SHARED); + if (pp == NULL) + continue; + + + /* + * Examine the page to see whether it can be tossed out, + * keeping track of how many we've found. + */ + if (!page_tryupgrade(pp)) { + /* + * If the page has an i/o lock and no mappings, + * it's very likely that the page is being + * written out as a result of klustering. + * Assume this is so and take credit for it here. + */ + if (!page_io_trylock(pp)) { + if (!hat_page_is_mapped(pp)) + pgcnt++; + } else { + page_io_unlock(pp); + } + page_unlock(pp); + continue; + } + ASSERT(!page_iolock_assert(pp)); + + + /* + * Skip if page is locked or has mappings. + * We don't need the page_struct_lock to look at lckcnt + * and cowcnt because the page is exclusive locked. + */ + if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0 || + hat_page_is_mapped(pp)) { + page_unlock(pp); + continue; + } + + /* + * dispose skips large pages so try to demote first. + */ + if (pp->p_szc != 0 && !page_try_demote_pages(pp)) { + page_unlock(pp); + /* + * XXX should skip the remaining page_t's of this + * large page. + */ + continue; + } + + ASSERT(pp->p_szc == 0); + + /* + * No longer mapped -- we can toss it out. How + * we do so depends on whether or not it's dirty. + */ + if (hat_ismod(pp) && pp->p_vnode) { + /* + * We must clean the page before it can be + * freed. Setting B_FREE will cause pvn_done + * to free the page when the i/o completes. + * XXX: This also causes it to be accounted + * as a pageout instead of a swap: need + * B_SWAPOUT bit to use instead of B_FREE. + * + * Hold the vnode before releasing the page lock + * to prevent it from being freed and re-used by + * some other thread. + */ + VN_HOLD(vp); + page_unlock(pp); + + /* + * Queue all i/o requests for the pageout thread + * to avoid saturating the pageout devices. + */ + if (!queue_io_request(vp, off)) + VN_RELE(vp); + } else { + /* + * The page was clean, free it. + * + * XXX: Can we ever encounter modified pages + * with no associated vnode here? + */ + ASSERT(pp->p_vnode != NULL); + /*LINTED: constant in conditional context*/ + VN_DISPOSE(pp, B_FREE, 0, kcred); + } + + /* + * Credit now even if i/o is in progress. + */ + pgcnt++; + } + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + + /* + * Wakeup pageout to initiate i/o on all queued requests. + */ + cv_signal_pageout(); + return (ptob(pgcnt)); +} + +/* + * Synchronize primary storage cache with real object in virtual memory. + * + * XXX - Anonymous pages should not be sync'ed out at all. 
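+ * The caller's MS_* flags map onto pageout flags below: MS_ASYNC becomes
+ * B_ASYNC and MS_INVALIDATE becomes B_INVAL, while MS_SYNC adds neither,
+ * so the VOP_PUTPAGE() calls then run synchronously.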
+ */ +static int +segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + struct vpage *vpp; + page_t *pp; + u_offset_t offset; + struct vnode *vp; + u_offset_t off; + caddr_t eaddr; + int bflags; + int err = 0; + int segtype; + int pageprot; + int prot; + ulong_t anon_index; + struct anon_map *amp; + struct anon *ap; + anon_sync_obj_t cookie; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); + + if (svd->softlockcnt > 0) { + /* + * flush all pages from seg cache + * otherwise we may deadlock in swap_putpage + * for B_INVAL page (4175402). + * + * Even if we grab segvn WRITER's lock or segp_slock + * here, there might be another thread which could've + * successfully performed lookup/insert just before + * we acquired the lock here. So, grabbing either + * lock here is of not much use. Until we devise + * a strategy at upper layers to solve the + * synchronization issues completely, we expect + * applications to handle this appropriately. + */ + segvn_purge(seg); + if (svd->softlockcnt > 0) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (EAGAIN); + } + } + + vpp = svd->vpage; + offset = svd->offset + (uintptr_t)(addr - seg->s_base); + bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) | + ((flags & MS_INVALIDATE) ? B_INVAL : 0); + + if (attr) { + pageprot = attr & ~(SHARED|PRIVATE); + segtype = (attr & SHARED) ? MAP_SHARED : MAP_PRIVATE; + + /* + * We are done if the segment types don't match + * or if we have segment level protections and + * they don't match. + */ + if (svd->type != segtype) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (0); + } + if (vpp == NULL) { + if (svd->prot != pageprot) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (0); + } + prot = svd->prot; + } else + vpp = &svd->vpage[seg_page(seg, addr)]; + + } else if (svd->vp && svd->amp == NULL && + (flags & MS_INVALIDATE) == 0) { + + /* + * No attributes, no anonymous pages and MS_INVALIDATE flag + * is not on, just use one big request. + */ + err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len, + bflags, svd->cred); + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (err); + } + + if ((amp = svd->amp) != NULL) + anon_index = svd->anon_index + seg_page(seg, addr); + + for (eaddr = addr + len; addr < eaddr; addr += PAGESIZE) { + ap = NULL; + if (amp != NULL) { + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + anon_array_enter(amp, anon_index, &cookie); + ap = anon_get_ptr(amp->ahp, anon_index++); + if (ap != NULL) { + swap_xlate(ap, &vp, &off); + } else { + vp = svd->vp; + off = offset; + } + anon_array_exit(&cookie); + ANON_LOCK_EXIT(&->a_rwlock); + } else { + vp = svd->vp; + off = offset; + } + offset += PAGESIZE; + + if (vp == NULL) /* untouched zfod page */ + continue; + + if (attr) { + if (vpp) { + prot = VPP_PROT(vpp); + vpp++; + } + if (prot != pageprot) { + continue; + } + } + + /* + * See if any of these pages are locked -- if so, then we + * will have to truncate an invalidate request at the first + * locked one. We don't need the page_struct_lock to test + * as this is only advisory; even if we acquire it someone + * might race in and lock the page after we unlock and before + * we do the PUTPAGE, then PUTPAGE simply does nothing. 
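+		 * In practice the check below does not truncate the
+		 * request: an MS_INVALIDATE sync that finds a locked page
+		 * fails outright with EBUSY.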
+ */ + if (flags & MS_INVALIDATE) { + if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) { + if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { + page_unlock(pp); + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (EBUSY); + } + if (ap != NULL && pp->p_szc != 0 && + page_tryupgrade(pp)) { + if (pp->p_lckcnt == 0 && + pp->p_cowcnt == 0) { + /* + * swapfs VN_DISPOSE() won't + * invalidate large pages. + * Attempt to demote. + * XXX can't help it if it + * fails. But for swapfs + * pages it is no big deal. + */ + (void) page_try_demote_pages( + pp); + } + } + page_unlock(pp); + } + } else if (svd->type == MAP_SHARED && amp != NULL) { + /* + * Avoid writting out to disk ISM's large pages + * because segspt_free_pages() relies on NULL an_pvp + * of anon slots of such pages. + */ + + ASSERT(svd->vp == NULL); + /* + * swapfs uses page_lookup_nowait if not freeing or + * invalidating and skips a page if + * page_lookup_nowait returns NULL. + */ + pp = page_lookup_nowait(vp, off, SE_SHARED); + if (pp == NULL) { + continue; + } + if (pp->p_szc != 0) { + page_unlock(pp); + continue; + } + + /* + * Note ISM pages are created large so (vp, off)'s + * page cannot suddenly become large after we unlock + * pp. + */ + page_unlock(pp); + } + /* + * XXX - Should ultimately try to kluster + * calls to VOP_PUTPAGE() for performance. + */ + VN_HOLD(vp); + err = VOP_PUTPAGE(vp, (offset_t)off, PAGESIZE, + bflags, svd->cred); + VN_RELE(vp); + if (err) + break; + } + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (err); +} + +/* + * Determine if we have data corresponding to pages in the + * primary storage virtual memory cache (i.e., "in core"). + */ +static size_t +segvn_incore(struct seg *seg, caddr_t addr, size_t len, char *vec) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + struct vnode *vp, *avp; + u_offset_t offset, aoffset; + size_t p, ep; + int ret; + struct vpage *vpp; + page_t *pp; + uint_t start; + struct anon_map *amp; /* XXX - for locknest */ + struct anon *ap; + uint_t attr; + anon_sync_obj_t cookie; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); + if (svd->amp == NULL && svd->vp == NULL) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + bzero(vec, btopr(len)); + return (len); /* no anonymous pages created yet */ + } + + p = seg_page(seg, addr); + ep = seg_page(seg, addr + len); + start = svd->vp ? SEG_PAGE_VNODEBACKED : 0; + + amp = svd->amp; + for (; p < ep; p++, addr += PAGESIZE) { + vpp = (svd->vpage) ? &svd->vpage[p]: NULL; + ret = start; + ap = NULL; + avp = NULL; + /* Grab the vnode/offset for the anon slot */ + if (amp != NULL) { + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + anon_array_enter(amp, svd->anon_index + p, &cookie); + ap = anon_get_ptr(amp->ahp, svd->anon_index + p); + if (ap != NULL) { + swap_xlate(ap, &avp, &aoffset); + } + anon_array_exit(&cookie); + ANON_LOCK_EXIT(&->a_rwlock); + } + if ((avp != NULL) && page_exists(avp, aoffset)) { + /* A page exists for the anon slot */ + ret |= SEG_PAGE_INCORE; + + /* + * If page is mapped and writable + */ + attr = (uint_t)0; + if ((hat_getattr(seg->s_as->a_hat, addr, + &attr) != -1) && (attr & PROT_WRITE)) { + ret |= SEG_PAGE_ANON; + } + /* + * Don't get page_struct lock for lckcnt and cowcnt, + * since this is purely advisory. 
+ */ + if ((pp = page_lookup_nowait(avp, aoffset, + SE_SHARED)) != NULL) { + if (pp->p_lckcnt) + ret |= SEG_PAGE_SOFTLOCK; + if (pp->p_cowcnt) + ret |= SEG_PAGE_HASCOW; + page_unlock(pp); + } + } + + /* Gather vnode statistics */ + vp = svd->vp; + offset = svd->offset + (uintptr_t)(addr - seg->s_base); + + if (vp != NULL) { + /* + * Try to obtain a "shared" lock on the page + * without blocking. If this fails, determine + * if the page is in memory. + */ + pp = page_lookup_nowait(vp, offset, SE_SHARED); + if ((pp == NULL) && (page_exists(vp, offset))) { + /* Page is incore, and is named */ + ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE); + } + /* + * Don't get page_struct lock for lckcnt and cowcnt, + * since this is purely advisory. + */ + if (pp != NULL) { + ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE); + if (pp->p_lckcnt) + ret |= SEG_PAGE_SOFTLOCK; + if (pp->p_cowcnt) + ret |= SEG_PAGE_HASCOW; + page_unlock(pp); + } + } + + /* Gather virtual page information */ + if (vpp) { + if (VPP_ISPPLOCK(vpp)) + ret |= SEG_PAGE_LOCKED; + vpp++; + } + + *vec++ = (char)ret; + } + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (len); +} + +/* + * Statement for p_cowcnts/p_lckcnts. + * + * p_cowcnt is updated while mlock/munlocking MAP_PRIVATE and PROT_WRITE region + * irrespective of the following factors or anything else: + * + * (1) anon slots are populated or not + * (2) cow is broken or not + * (3) refcnt on ap is 1 or greater than 1 + * + * If it's not MAP_PRIVATE and PROT_WRITE, p_lckcnt is updated during mlock + * and munlock. + * + * + * Handling p_cowcnts/p_lckcnts during copy-on-write fault: + * + * if vpage has PROT_WRITE + * transfer cowcnt on the oldpage -> cowcnt on the newpage + * else + * transfer lckcnt on the oldpage -> lckcnt on the newpage + * + * During copy-on-write, decrement p_cowcnt on the oldpage and increment + * p_cowcnt on the newpage *if* the corresponding vpage has PROT_WRITE. + * + * We may also break COW if softlocking on read access in the physio case. + * In this case, vpage may not have PROT_WRITE. So, we need to decrement + * p_lckcnt on the oldpage and increment p_lckcnt on the newpage *if* the + * vpage doesn't have PROT_WRITE. + * + * + * Handling p_cowcnts/p_lckcnts during mprotect on mlocked region: + * + * If a MAP_PRIVATE region loses PROT_WRITE, we decrement p_cowcnt and + * increment p_lckcnt by calling page_subclaim() which takes care of + * availrmem accounting and p_lckcnt overflow. + * + * If a MAP_PRIVATE region gains PROT_WRITE, we decrement p_lckcnt and + * increment p_cowcnt by calling page_addclaim() which takes care of + * availrmem availability and p_cowcnt overflow. + */ + +/* + * Lock down (or unlock) pages mapped by this segment. + * + * XXX only creates PAGESIZE pages if anon slots are not initialized. + * At fault time they will be relocated into larger pages. 
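+ * For example, mlock(3C) of a page in a MAP_PRIVATE, PROT_WRITE mapping
+ * bumps p_cowcnt on whichever page currently backs the address (file
+ * page or private anon copy), while locking a shared or read-only
+ * mapping bumps p_lckcnt instead.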
+ */ +static int +segvn_lockop(struct seg *seg, caddr_t addr, size_t len, + int attr, int op, ulong_t *lockmap, size_t pos) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + struct vpage *vpp; + struct vpage *evp; + page_t *pp; + u_offset_t offset; + u_offset_t off; + int segtype; + int pageprot; + int claim; + struct vnode *vp; + ulong_t anon_index; + struct anon_map *amp; + struct anon *ap; + struct vattr va; + anon_sync_obj_t cookie; + + /* + * Hold write lock on address space because may split or concatenate + * segments + */ + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); + if (attr) { + pageprot = attr & ~(SHARED|PRIVATE); + segtype = attr & SHARED ? MAP_SHARED : MAP_PRIVATE; + + /* + * We are done if the segment types don't match + * or if we have segment level protections and + * they don't match. + */ + if (svd->type != segtype) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (0); + } + if (svd->pageprot == 0 && svd->prot != pageprot) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (0); + } + } + + /* + * If we're locking, then we must create a vpage structure if + * none exists. If we're unlocking, then check to see if there + * is a vpage -- if not, then we could not have locked anything. + */ + + if ((vpp = svd->vpage) == NULL) { + if (op == MC_LOCK) + segvn_vpage(seg); + else { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (0); + } + } + + /* + * The anonymous data vector (i.e., previously + * unreferenced mapping to swap space) can be allocated + * by lazily testing for its existence. + */ + if (op == MC_LOCK && svd->amp == NULL && svd->vp == NULL) { + svd->amp = anonmap_alloc(seg->s_size, 0); + svd->amp->a_szc = seg->s_szc; + } + + if ((amp = svd->amp) != NULL) { + anon_index = svd->anon_index + seg_page(seg, addr); + } + + offset = svd->offset + (uintptr_t)(addr - seg->s_base); + evp = &svd->vpage[seg_page(seg, addr + len)]; + + /* + * Loop over all pages in the range. Process if we're locking and + * page has not already been locked in this mapping; or if we're + * unlocking and the page has been locked. + */ + for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp; + vpp++, pos++, addr += PAGESIZE, offset += PAGESIZE, anon_index++) { + if ((attr == 0 || VPP_PROT(vpp) == pageprot) && + ((op == MC_LOCK && !VPP_ISPPLOCK(vpp)) || + (op == MC_UNLOCK && VPP_ISPPLOCK(vpp)))) { + + if (amp != NULL) + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + /* + * If this isn't a MAP_NORESERVE segment and + * we're locking, allocate anon slots if they + * don't exist. The page is brought in later on. + */ + if (op == MC_LOCK && svd->vp == NULL && + ((svd->flags & MAP_NORESERVE) == 0) && + amp != NULL && + ((ap = anon_get_ptr(amp->ahp, anon_index)) + == NULL)) { + anon_array_enter(amp, anon_index, &cookie); + + if ((ap = anon_get_ptr(amp->ahp, + anon_index)) == NULL) { + pp = anon_zero(seg, addr, &ap, + svd->cred); + if (pp == NULL) { + anon_array_exit(&cookie); + ANON_LOCK_EXIT(&->a_rwlock); + SEGVN_LOCK_EXIT(seg->s_as, + &svd->lock); + return (ENOMEM); + } + ASSERT(anon_get_ptr(amp->ahp, + anon_index) == NULL); + (void) anon_set_ptr(amp->ahp, + anon_index, ap, ANON_SLEEP); + page_unlock(pp); + } + anon_array_exit(&cookie); + } + + /* + * Get name for page, accounting for + * existence of private copy. 
+ */ + ap = NULL; + if (amp != NULL) { + anon_array_enter(amp, anon_index, &cookie); + ap = anon_get_ptr(amp->ahp, anon_index); + if (ap != NULL) { + swap_xlate(ap, &vp, &off); + } else { + if (svd->vp == NULL && + (svd->flags & MAP_NORESERVE)) { + anon_array_exit(&cookie); + ANON_LOCK_EXIT(&->a_rwlock); + continue; + } + vp = svd->vp; + off = offset; + } + anon_array_exit(&cookie); + ANON_LOCK_EXIT(&->a_rwlock); + } else { + vp = svd->vp; + off = offset; + } + + /* + * Get page frame. It's ok if the page is + * not available when we're unlocking, as this + * may simply mean that a page we locked got + * truncated out of existence after we locked it. + * + * Invoke VOP_GETPAGE() to obtain the page struct + * since we may need to read it from disk if its + * been paged out. + */ + if (op != MC_LOCK) + pp = page_lookup(vp, off, SE_SHARED); + else { + page_t *pl[1 + 1]; + int error; + + ASSERT(vp != NULL); + + error = VOP_GETPAGE(vp, (offset_t)off, PAGESIZE, + (uint_t *)NULL, pl, PAGESIZE, seg, addr, + S_OTHER, svd->cred); + + /* + * If the error is EDEADLK then we must bounce + * up and drop all vm subsystem locks and then + * retry the operation later + * This behavior is a temporary measure because + * ufs/sds logging is badly designed and will + * deadlock if we don't allow this bounce to + * happen. The real solution is to re-design + * the logging code to work properly. See bug + * 4125102 for details of the problem. + */ + if (error == EDEADLK) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (error); + } + /* + * Quit if we fail to fault in the page. Treat + * the failure as an error, unless the addr + * is mapped beyond the end of a file. + */ + if (error && svd->vp) { + va.va_mask = AT_SIZE; + if (VOP_GETATTR(svd->vp, &va, 0, + svd->cred) != 0) { + SEGVN_LOCK_EXIT(seg->s_as, + &svd->lock); + return (EIO); + } + if (btopr(va.va_size) >= + btopr(off + 1)) { + SEGVN_LOCK_EXIT(seg->s_as, + &svd->lock); + return (EIO); + } + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (0); + } else if (error) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (EIO); + } + pp = pl[0]; + ASSERT(pp != NULL); + } + + /* + * See Statement at the beginning of this routine. + * + * claim is always set if MAP_PRIVATE and PROT_WRITE + * irrespective of following factors: + * + * (1) anon slots are populated or not + * (2) cow is broken or not + * (3) refcnt on ap is 1 or greater than 1 + * + * See 4140683 for details + */ + claim = ((VPP_PROT(vpp) & PROT_WRITE) && + (svd->type == MAP_PRIVATE)); + + /* + * Perform page-level operation appropriate to + * operation. If locking, undo the SOFTLOCK + * performed to bring the page into memory + * after setting the lock. If unlocking, + * and no page was found, account for the claim + * separately. + */ + if (op == MC_LOCK) { + int ret = 1; /* Assume success */ + + /* + * Make sure another thread didn't lock + * the page after we released the segment + * lock. 
+ */ + if ((attr == 0 || VPP_PROT(vpp) == pageprot) && + !VPP_ISPPLOCK(vpp)) { + ret = page_pp_lock(pp, claim, 0); + if (ret != 0) { + VPP_SETPPLOCK(vpp); + if (lockmap != (ulong_t *)NULL) + BT_SET(lockmap, pos); + } + } + page_unlock(pp); + if (ret == 0) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (EAGAIN); + } + } else { + if (pp != NULL) { + if ((attr == 0 || + VPP_PROT(vpp) == pageprot) && + VPP_ISPPLOCK(vpp)) + page_pp_unlock(pp, claim, 0); + page_unlock(pp); + } + VPP_CLRPPLOCK(vpp); + } + } + } + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (0); +} + +/* + * Set advice from user for specified pages + * There are 5 types of advice: + * MADV_NORMAL - Normal (default) behavior (whatever that is) + * MADV_RANDOM - Random page references + * do not allow readahead or 'klustering' + * MADV_SEQUENTIAL - Sequential page references + * Pages previous to the one currently being + * accessed (determined by fault) are 'not needed' + * and are freed immediately + * MADV_WILLNEED - Pages are likely to be used (fault ahead in mctl) + * MADV_DONTNEED - Pages are not needed (synced out in mctl) + * MADV_FREE - Contents can be discarded + * MADV_ACCESS_DEFAULT- Default access + * MADV_ACCESS_LWP - Next LWP will access heavily + * MADV_ACCESS_MANY- Many LWPs or processes will access heavily + */ +static int +segvn_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + size_t page; + int err = 0; + int already_set; + struct anon_map *amp; + ulong_t anon_index; + struct seg *next; + lgrp_mem_policy_t policy; + struct seg *prev; + struct vnode *vp; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + /* + * In case of MADV_FREE, we won't be modifying any segment private + * data structures; so, we only need to grab READER's lock + */ + if (behav != MADV_FREE) + SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); + else + SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); + + /* + * Large pages are assumed to be only turned on when accesses to the + * segment's address range have spatial and temporal locality. That + * justifies ignoring MADV_SEQUENTIAL for large page segments. + * Also, ignore advice affecting lgroup memory allocation + * if don't need to do lgroup optimizations on this system + */ + + if ((behav == MADV_SEQUENTIAL && seg->s_szc != 0) || + (!lgrp_optimizations() && (behav == MADV_ACCESS_DEFAULT || + behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY))) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (0); + } + + if (behav == MADV_SEQUENTIAL || behav == MADV_ACCESS_DEFAULT || + behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY) { + /* + * Since we are going to unload hat mappings + * we first have to flush the cache. Otherwise + * this might lead to system panic if another + * thread is doing physio on the range whose + * mappings are unloaded by madvise(3C). + */ + if (svd->softlockcnt > 0) { + /* + * Since we do have the segvn writers lock + * nobody can fill the cache with entries + * belonging to this seg during the purge. + * The flush either succeeds or we still + * have pending I/Os. In the later case, + * madvise(3C) fails. + */ + segvn_purge(seg); + if (svd->softlockcnt > 0) { + /* + * Since madvise(3C) is advisory and + * it's not part of UNIX98, madvise(3C) + * failure here doesn't cause any hardship. + * Note that we don't block in "as" layer. 
+ */ + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (EAGAIN); + } + } + } + + amp = svd->amp; + vp = svd->vp; + if (behav == MADV_FREE) { + /* + * MADV_FREE is not supported for segments with + * underlying object; if anonmap is NULL, anon slots + * are not yet populated and there is nothing for + * us to do. As MADV_FREE is advisory, we don't + * return error in either case. + */ + if (vp || amp == NULL) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (0); + } + + page = seg_page(seg, addr); + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + anon_disclaim(amp, svd->anon_index + page, len, 0); + ANON_LOCK_EXIT(&->a_rwlock); + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (0); + } + + /* + * If advice is to be applied to entire segment, + * use advice field in seg_data structure + * otherwise use appropriate vpage entry. + */ + if ((addr == seg->s_base) && (len == seg->s_size)) { + switch (behav) { + case MADV_ACCESS_LWP: + case MADV_ACCESS_MANY: + case MADV_ACCESS_DEFAULT: + /* + * Set memory allocation policy for this segment + */ + policy = lgrp_madv_to_policy(behav, len, svd->type); + if (svd->type == MAP_SHARED) + already_set = lgrp_shm_policy_set(policy, amp, + svd->anon_index, vp, svd->offset, len); + else { + /* + * For private memory, need writers lock on + * address space because the segment may be + * split or concatenated when changing policy + */ + if (AS_READ_HELD(seg->s_as, + &seg->s_as->a_lock)) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (IE_RETRY); + } + + already_set = lgrp_privm_policy_set(policy, + &svd->policy_info, len); + } + + /* + * If policy set already and it shouldn't be reapplied, + * don't do anything. + */ + if (already_set && + !LGRP_MEM_POLICY_REAPPLICABLE(policy)) + break; + + /* + * Mark any existing pages in given range for + * migration + */ + page_mark_migrate(seg, addr, len, amp, svd->anon_index, + vp, svd->offset, 1); + + /* + * If same policy set already or this is a shared + * memory segment, don't need to try to concatenate + * segment with adjacent ones. + */ + if (already_set || svd->type == MAP_SHARED) + break; + + /* + * Try to concatenate this segment with previous + * one and next one, since we changed policy for + * this one and it may be compatible with adjacent + * ones now. 
+ */ + prev = AS_SEGPREV(seg->s_as, seg); + next = AS_SEGNEXT(seg->s_as, seg); + + if (next && next->s_ops == &segvn_ops && + addr + len == next->s_base) + (void) segvn_concat(seg, next, 1); + + if (prev && prev->s_ops == &segvn_ops && + addr == prev->s_base + prev->s_size) { + /* + * Drop lock for private data of current + * segment before concatenating (deleting) it + * and return IE_REATTACH to tell as_ctl() that + * current segment has changed + */ + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + if (!segvn_concat(prev, seg, 1)) + err = IE_REATTACH; + + return (err); + } + break; + + case MADV_SEQUENTIAL: + /* + * unloading mapping guarantees + * detection in segvn_fault + */ + ASSERT(seg->s_szc == 0); + hat_unload(seg->s_as->a_hat, addr, len, + HAT_UNLOAD); + /* FALLTHROUGH */ + case MADV_NORMAL: + case MADV_RANDOM: + svd->advice = (uchar_t)behav; + svd->pageadvice = 0; + break; + case MADV_WILLNEED: /* handled in memcntl */ + case MADV_DONTNEED: /* handled in memcntl */ + case MADV_FREE: /* handled above */ + break; + default: + err = EINVAL; + } + } else { + caddr_t eaddr; + struct seg *new_seg; + struct segvn_data *new_svd; + u_offset_t off; + caddr_t oldeaddr; + + page = seg_page(seg, addr); + + segvn_vpage(seg); + + switch (behav) { + struct vpage *bvpp, *evpp; + + case MADV_ACCESS_LWP: + case MADV_ACCESS_MANY: + case MADV_ACCESS_DEFAULT: + /* + * Set memory allocation policy for portion of this + * segment + */ + + /* + * Align address and length of advice to page + * boundaries for large pages + */ + if (seg->s_szc != 0) { + size_t pgsz; + + pgsz = page_get_pagesize(seg->s_szc); + addr = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); + len = P2ROUNDUP(len, pgsz); + } + + /* + * Check to see whether policy is set already + */ + policy = lgrp_madv_to_policy(behav, len, svd->type); + + anon_index = svd->anon_index + page; + off = svd->offset + (uintptr_t)(addr - seg->s_base); + + if (svd->type == MAP_SHARED) + already_set = lgrp_shm_policy_set(policy, amp, + anon_index, vp, off, len); + else + already_set = + (policy == svd->policy_info.mem_policy); + + /* + * If policy set already and it shouldn't be reapplied, + * don't do anything. + */ + if (already_set && + !LGRP_MEM_POLICY_REAPPLICABLE(policy)) + break; + + /* + * For private memory, need writers lock on + * address space because the segment may be + * split or concatenated when changing policy + */ + if (svd->type == MAP_PRIVATE && + AS_READ_HELD(seg->s_as, &seg->s_as->a_lock)) { + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (IE_RETRY); + } + + /* + * Mark any existing pages in given range for + * migration + */ + page_mark_migrate(seg, addr, len, amp, svd->anon_index, + vp, svd->offset, 1); + + /* + * Don't need to try to split or concatenate + * segments, since policy is same or this is a shared + * memory segment + */ + if (already_set || svd->type == MAP_SHARED) + break; + + /* + * Split off new segment if advice only applies to a + * portion of existing segment starting in middle + */ + new_seg = NULL; + eaddr = addr + len; + oldeaddr = seg->s_base + seg->s_size; + if (addr > seg->s_base) { + /* + * Must flush I/O page cache + * before splitting segment + */ + if (svd->softlockcnt > 0) + segvn_purge(seg); + + /* + * Split segment and return IE_REATTACH to tell + * as_ctl() that current segment changed + */ + new_seg = segvn_split_seg(seg, addr); + new_svd = (struct segvn_data *)new_seg->s_data; + err = IE_REATTACH; + + /* + * If new segment ends where old one + * did, try to concatenate the new + * segment with next one. 
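[Editorial sketch, not part of this commit] The MADV_ACCESS_* branch above widens the advised range to the segment's large page size with P2ALIGN()/P2ROUNDUP(). A stand-alone rendering of that arithmetic; the macro bodies are assumed to mirror the usual power-of-two helpers from <sys/sysmacros.h>, and the 4M large page and addresses are invented example values:

#include <stdio.h>

/* Assumed to mirror the power-of-two helpers in <sys/sysmacros.h>. */
#define P2ALIGN(x, align)       ((x) & -(align))
#define P2ROUNDUP(x, align)     (-(-(x) & -(align)))

int
main(void)
{
        unsigned long addr = 0x2345678UL;       /* hypothetical advice start */
        unsigned long len = 0x5000UL;           /* hypothetical advice length */
        unsigned long pgsz = 0x400000UL;        /* example 4M large page */

        /* 0x2345678 aligns down to 0x2000000; 0x5000 rounds up to 0x400000. */
        (void) printf("addr 0x%lx -> 0x%lx, len 0x%lx -> 0x%lx\n",
            addr, P2ALIGN(addr, pgsz), len, P2ROUNDUP(len, pgsz));
        return (0);
}

The pagelock path later in this file does the analogous widening (via CALC_LPG_REGION) when it snaps a lock request to enclosing large-page boundaries.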
+ */ + if (eaddr == oldeaddr) { + /* + * Set policy for new segment + */ + (void) lgrp_privm_policy_set(policy, + &new_svd->policy_info, + new_seg->s_size); + + next = AS_SEGNEXT(new_seg->s_as, + new_seg); + + if (next && + next->s_ops == &segvn_ops && + eaddr == next->s_base) + (void) segvn_concat(new_seg, + next, 1); + } + } + + /* + * Split off end of existing segment if advice only + * applies to a portion of segment ending before + * end of the existing segment + */ + if (eaddr < oldeaddr) { + /* + * Must flush I/O page cache + * before splitting segment + */ + if (svd->softlockcnt > 0) + segvn_purge(seg); + + /* + * If beginning of old segment was already + * split off, use new segment to split end off + * from. + */ + if (new_seg != NULL && new_seg != seg) { + /* + * Split segment + */ + (void) segvn_split_seg(new_seg, eaddr); + + /* + * Set policy for new segment + */ + (void) lgrp_privm_policy_set(policy, + &new_svd->policy_info, + new_seg->s_size); + } else { + /* + * Split segment and return IE_REATTACH + * to tell as_ctl() that current + * segment changed + */ + (void) segvn_split_seg(seg, eaddr); + err = IE_REATTACH; + + (void) lgrp_privm_policy_set(policy, + &svd->policy_info, seg->s_size); + + /* + * If new segment starts where old one + * did, try to concatenate it with + * previous segment. + */ + if (addr == seg->s_base) { + prev = AS_SEGPREV(seg->s_as, + seg); + + /* + * Drop lock for private data + * of current segment before + * concatenating (deleting) it + */ + if (prev && + prev->s_ops == + &segvn_ops && + addr == prev->s_base + + prev->s_size) { + SEGVN_LOCK_EXIT( + seg->s_as, + &svd->lock); + (void) segvn_concat( + prev, seg, 1); + return (err); + } + } + } + } + break; + case MADV_SEQUENTIAL: + ASSERT(seg->s_szc == 0); + hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); + /* FALLTHROUGH */ + case MADV_NORMAL: + case MADV_RANDOM: + bvpp = &svd->vpage[page]; + evpp = &svd->vpage[page + (len >> PAGESHIFT)]; + for (; bvpp < evpp; bvpp++) + VPP_SETADVICE(bvpp, behav); + svd->advice = MADV_NORMAL; + break; + case MADV_WILLNEED: /* handled in memcntl */ + case MADV_DONTNEED: /* handled in memcntl */ + case MADV_FREE: /* handled above */ + break; + default: + err = EINVAL; + } + } + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (err); +} + +/* + * Create a vpage structure for this seg. + */ +static void +segvn_vpage(struct seg *seg) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + struct vpage *vp, *evp; + + ASSERT(SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); + + /* + * If no vpage structure exists, allocate one. Copy the protections + * and the advice from the segment itself to the individual pages. + */ + if (svd->vpage == NULL) { + svd->pageprot = 1; + svd->pageadvice = 1; + svd->vpage = kmem_zalloc(seg_pages(seg) * sizeof (struct vpage), + KM_SLEEP); + evp = &svd->vpage[seg_page(seg, seg->s_base + seg->s_size)]; + for (vp = svd->vpage; vp < evp; vp++) { + VPP_SETPROT(vp, svd->prot); + VPP_SETADVICE(vp, svd->advice); + } + } +} + +/* + * Dump the pages belonging to this segvn segment. 
+ */ +static void +segvn_dump(struct seg *seg) +{ + struct segvn_data *svd; + page_t *pp; + struct anon_map *amp; + ulong_t anon_index; + struct vnode *vp; + u_offset_t off, offset; + pfn_t pfn; + pgcnt_t page, npages; + caddr_t addr; + + npages = seg_pages(seg); + svd = (struct segvn_data *)seg->s_data; + vp = svd->vp; + off = offset = svd->offset; + addr = seg->s_base; + + if ((amp = svd->amp) != NULL) { + anon_index = svd->anon_index; + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + } + + for (page = 0; page < npages; page++, offset += PAGESIZE) { + struct anon *ap; + int we_own_it = 0; + + if (amp && (ap = anon_get_ptr(svd->amp->ahp, anon_index++))) { + swap_xlate_nopanic(ap, &vp, &off); + } else { + vp = svd->vp; + off = offset; + } + + /* + * If pp == NULL, the page either does not exist + * or is exclusively locked. So determine if it + * exists before searching for it. + */ + + if ((pp = page_lookup_nowait(vp, off, SE_SHARED))) + we_own_it = 1; + else + pp = page_exists(vp, off); + + if (pp) { + pfn = page_pptonum(pp); + dump_addpage(seg->s_as, addr, pfn); + if (we_own_it) + page_unlock(pp); + } + addr += PAGESIZE; + dump_timeleft = dump_timeout; + } + + if (amp != NULL) + ANON_LOCK_EXIT(&->a_rwlock); +} + +/* + * lock/unlock anon pages over a given range. Return shadow list + */ +static int +segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp, + enum lock_type type, enum seg_rw rw) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + size_t np, adjustpages = 0, npages = (len >> PAGESHIFT); + ulong_t anon_index; + uint_t protchk; + uint_t error; + struct anon_map *amp; + struct page **pplist, **pl, *pp; + caddr_t a; + size_t page; + caddr_t lpgaddr, lpgeaddr; + + TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_START, + "segvn_pagelock: start seg %p addr %p", seg, addr); + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + if (seg->s_szc != 0 && (type == L_PAGELOCK || type == L_PAGEUNLOCK)) { + /* + * We are adjusting the pagelock region to the large page size + * boundary because the unlocked part of a large page cannot + * be freed anyway unless all constituent pages of a large + * page are locked. Therefore this adjustment allows us to + * decrement availrmem by the right value (note we don't want + * to just decrement availrem by the large page size without + * adjusting addr and len because then we may end up + * decrementing availrmem by large page size for every + * constituent page locked by a new as_pagelock call). + * as_pageunlock caller must always match as_pagelock call's + * addr and len. + * + * Note segment's page size cannot change while we are holding + * as lock. And then it cannot change while softlockcnt is + * not 0. This will allow us to correctly recalculate large + * page size region for the matching pageunlock/reclaim call. + * + * for pageunlock *ppp points to the pointer of page_t that + * corresponds to the real unadjusted start address. Similar + * for pagelock *ppp must point to the pointer of page_t that + * corresponds to the real unadjusted start address. + */ + size_t pgsz = page_get_pagesize(seg->s_szc); + CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); + adjustpages = ((uintptr_t)(addr - lpgaddr)) >> PAGESHIFT; + } + + if (type == L_PAGEUNLOCK) { + + /* + * update hat ref bits for /proc. We need to make sure + * that threads tracing the ref and mod bits of the + * address space get the right data. 
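[Editorial sketch, not part of this commit] The adjustpages value computed above is the page offset of the caller's real start address inside the widened large-page region: the shadow list covers the whole region, the caller is handed a pointer offset into it (*ppp = pplist + adjustpages), and the matching unlock subtracts the same offset. A toy illustration using an int array in place of the page_t * shadow list, with invented addresses and an invented 8K base page:

#include <stdio.h>

#define EX_PAGESHIFT    13                      /* example 8K base page */

int
main(void)
{
        unsigned long lpgaddr = 0x2000000UL;    /* large-page-aligned start */
        unsigned long addr = 0x2010000UL;       /* caller's unadjusted start */
        int pages[16];                          /* stands in for the shadow list */
        int *pl = pages;                        /* covers [lpgaddr, lpgeaddr) */
        unsigned long adjustpages = (addr - lpgaddr) >> EX_PAGESHIFT;

        /* What pagelock hands back: the entry for the unadjusted addr... */
        int *caller_list = pl + adjustpages;
        /* ...and what the matching unlock subtracts to recover the full list. */
        int *full_list = caller_list - adjustpages;

        (void) printf("adjustpages %lu, recovered full list: %s\n",
            adjustpages, (full_list == pl) ? "yes" : "no");
        return (0);
}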
+ * Note: page ref and mod bits are updated at reclaim time + */ + if (seg->s_as->a_vbits) { + for (a = addr; a < addr + len; a += PAGESIZE) { + if (rw == S_WRITE) { + hat_setstat(seg->s_as, a, + PAGESIZE, P_REF | P_MOD); + } else { + hat_setstat(seg->s_as, a, + PAGESIZE, P_REF); + } + } + } + SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); + if (seg->s_szc != 0) { + VM_STAT_ADD(segvnvmstats.pagelock[0]); + seg_pinactive(seg, lpgaddr, lpgeaddr - lpgaddr, + *ppp - adjustpages, rw, segvn_reclaim); + } else { + seg_pinactive(seg, addr, len, *ppp, rw, segvn_reclaim); + } + + /* + * If someone is blocked while unmapping, we purge + * segment page cache and thus reclaim pplist synchronously + * without waiting for seg_pasync_thread. This speeds up + * unmapping in cases where munmap(2) is called, while + * raw async i/o is still in progress or where a thread + * exits on data fault in a multithreaded application. + */ + if (AS_ISUNMAPWAIT(seg->s_as) && (svd->softlockcnt > 0)) { + /* + * Even if we grab segvn WRITER's lock or segp_slock + * here, there might be another thread which could've + * successfully performed lookup/insert just before + * we acquired the lock here. So, grabbing either + * lock here is of not much use. Until we devise + * a strategy at upper layers to solve the + * synchronization issues completely, we expect + * applications to handle this appropriately. + */ + segvn_purge(seg); + } + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, + "segvn_pagelock: unlock seg %p addr %p", seg, addr); + return (0); + } else if (type == L_PAGERECLAIM) { + VM_STAT_COND_ADD(seg->s_szc != 0, segvnvmstats.pagelock[1]); + SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); + (void) segvn_reclaim(seg, addr, len, *ppp, rw); + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, + "segvn_pagelock: reclaim seg %p addr %p", seg, addr); + return (0); + } + + if (seg->s_szc != 0) { + VM_STAT_ADD(segvnvmstats.pagelock[2]); + addr = lpgaddr; + len = lpgeaddr - lpgaddr; + npages = (len >> PAGESHIFT); + } + + /* + * for now we only support pagelock to anon memory. We've to check + * protections for vnode objects and call into the vnode driver. + * That's too much for a fast path. Let the fault entry point handle it. + */ + if (svd->vp != NULL) { + TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, + "segvn_pagelock: mapped vnode seg %p addr %p", seg, addr); + *ppp = NULL; + return (ENOTSUP); + } + + /* + * if anonmap is not yet created, let the fault entry point populate it + * with anon ptrs. 
+ */ + if ((amp = svd->amp) == NULL) { + TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, + "segvn_pagelock: anonmap null seg %p addr %p", seg, addr); + *ppp = NULL; + return (EFAULT); + } + + SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); + + /* + * we acquire segp_slock to prevent duplicate entries + * in seg_pcache + */ + mutex_enter(&svd->segp_slock); + + /* + * try to find pages in segment page cache + */ + pplist = seg_plookup(seg, addr, len, rw); + if (pplist != NULL) { + mutex_exit(&svd->segp_slock); + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + *ppp = pplist + adjustpages; + TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_HIT_END, + "segvn_pagelock: cache hit seg %p addr %p", seg, addr); + return (0); + } + + if (rw == S_READ) { + protchk = PROT_READ; + } else { + protchk = PROT_WRITE; + } + + if (svd->pageprot == 0) { + if ((svd->prot & protchk) == 0) { + mutex_exit(&svd->segp_slock); + error = EFAULT; + goto out; + } + } else { + /* + * check page protections + */ + for (a = addr; a < addr + len; a += PAGESIZE) { + struct vpage *vp; + + vp = &svd->vpage[seg_page(seg, a)]; + if ((VPP_PROT(vp) & protchk) == 0) { + mutex_exit(&svd->segp_slock); + error = EFAULT; + goto out; + } + } + } + + mutex_enter(&freemem_lock); + if (availrmem < tune.t_minarmem + npages) { + mutex_exit(&freemem_lock); + mutex_exit(&svd->segp_slock); + error = ENOMEM; + goto out; + } else { + svd->softlockcnt += npages; + availrmem -= npages; + segvn_pages_locked += npages; + } + mutex_exit(&freemem_lock); + + pplist = kmem_alloc(sizeof (page_t *) * npages, KM_SLEEP); + pl = pplist; + *ppp = pplist + adjustpages; + + page = seg_page(seg, addr); + anon_index = svd->anon_index + page; + + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + for (a = addr; a < addr + len; a += PAGESIZE, anon_index++) { + struct anon *ap; + struct vnode *vp; + u_offset_t off; + anon_sync_obj_t cookie; + + anon_array_enter(amp, anon_index, &cookie); + ap = anon_get_ptr(amp->ahp, anon_index); + if (ap == NULL) { + anon_array_exit(&cookie); + break; + } else { + /* + * We must never use seg_pcache for COW pages + * because we might end up with original page still + * lying in seg_pcache even after private page is + * created. This leads to data corruption as + * aio_write refers to the page still in cache + * while all other accesses refer to the private + * page. 
+ */ + if (ap->an_refcnt != 1) { + anon_array_exit(&cookie); + break; + } + } + swap_xlate(ap, &vp, &off); + anon_array_exit(&cookie); + + pp = page_lookup_nowait(vp, off, SE_SHARED); + if (pp == NULL) { + break; + } + *pplist++ = pp; + } + ANON_LOCK_EXIT(&->a_rwlock); + + if (a >= addr + len) { + (void) seg_pinsert(seg, addr, len, pl, rw, SEGP_ASYNC_FLUSH, + segvn_reclaim); + mutex_exit(&svd->segp_slock); + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_FILL_END, + "segvn_pagelock: cache fill seg %p addr %p", seg, addr); + return (0); + } + + mutex_exit(&svd->segp_slock); + error = EFAULT; + pplist = pl; + np = ((uintptr_t)(a - addr)) >> PAGESHIFT; + while (np > (uint_t)0) { + page_unlock(*pplist); + np--; + pplist++; + } + kmem_free(pl, sizeof (page_t *) * npages); + mutex_enter(&freemem_lock); + svd->softlockcnt -= npages; + availrmem += npages; + segvn_pages_locked -= npages; + mutex_exit(&freemem_lock); + if (svd->softlockcnt <= 0) { + if (AS_ISUNMAPWAIT(seg->s_as)) { + mutex_enter(&seg->s_as->a_contents); + if (AS_ISUNMAPWAIT(seg->s_as)) { + AS_CLRUNMAPWAIT(seg->s_as); + cv_broadcast(&seg->s_as->a_cv); + } + mutex_exit(&seg->s_as->a_contents); + } + } + +out: + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + *ppp = NULL; + TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, + "segvn_pagelock: cache miss seg %p addr %p", seg, addr); + return (error); +} + +/* + * purge any cached pages in the I/O page cache + */ +static void +segvn_purge(struct seg *seg) +{ + seg_ppurge(seg); +} + +static int +segvn_reclaim(struct seg *seg, caddr_t addr, size_t len, struct page **pplist, + enum seg_rw rw) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + pgcnt_t np, npages; + struct page **pl; + +#ifdef lint + addr = addr; +#endif + + npages = np = (len >> PAGESHIFT); + ASSERT(npages); + pl = pplist; + if (seg->s_szc != 0) { + size_t pgsz = page_get_pagesize(seg->s_szc); + if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { + panic("segvn_reclaim: unaligned addr or len"); + /*NOTREACHED*/ + } + } + + while (np > (uint_t)0) { + if (rw == S_WRITE) { + hat_setrefmod(*pplist); + } else { + hat_setref(*pplist); + } + page_unlock(*pplist); + np--; + pplist++; + } + kmem_free(pl, sizeof (page_t *) * npages); + + mutex_enter(&freemem_lock); + availrmem += npages; + segvn_pages_locked -= npages; + svd->softlockcnt -= npages; + mutex_exit(&freemem_lock); + if (svd->softlockcnt <= 0) { + if (AS_ISUNMAPWAIT(seg->s_as)) { + mutex_enter(&seg->s_as->a_contents); + if (AS_ISUNMAPWAIT(seg->s_as)) { + AS_CLRUNMAPWAIT(seg->s_as); + cv_broadcast(&seg->s_as->a_cv); + } + mutex_exit(&seg->s_as->a_contents); + } + } + return (0); +} +/* + * get a memory ID for an addr in a given segment + * + * XXX only creates PAGESIZE pages if anon slots are not initialized. + * At fault time they will be relocated into larger pages. 
+ */ +static int +segvn_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + struct anon *ap = NULL; + ulong_t anon_index; + struct anon_map *amp; + anon_sync_obj_t cookie; + + if (svd->type == MAP_PRIVATE) { + memidp->val[0] = (uintptr_t)seg->s_as; + memidp->val[1] = (uintptr_t)addr; + return (0); + } + + if (svd->type == MAP_SHARED) { + if (svd->vp) { + memidp->val[0] = (uintptr_t)svd->vp; + memidp->val[1] = (u_longlong_t)svd->offset + + (uintptr_t)(addr - seg->s_base); + return (0); + } else { + + SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); + if ((amp = svd->amp) != NULL) { + anon_index = svd->anon_index + + seg_page(seg, addr); + } + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + + ASSERT(amp != NULL); + + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + anon_array_enter(amp, anon_index, &cookie); + ap = anon_get_ptr(amp->ahp, anon_index); + if (ap == NULL) { + page_t *pp; + + pp = anon_zero(seg, addr, &ap, svd->cred); + if (pp == NULL) { + anon_array_exit(&cookie); + ANON_LOCK_EXIT(&->a_rwlock); + return (ENOMEM); + } + ASSERT(anon_get_ptr(amp->ahp, anon_index) + == NULL); + (void) anon_set_ptr(amp->ahp, anon_index, + ap, ANON_SLEEP); + page_unlock(pp); + } + + anon_array_exit(&cookie); + ANON_LOCK_EXIT(&->a_rwlock); + + memidp->val[0] = (uintptr_t)ap; + memidp->val[1] = (uintptr_t)addr & PAGEOFFSET; + return (0); + } + } + return (EINVAL); +} + +static int +sameprot(struct seg *seg, caddr_t a, size_t len) +{ + struct segvn_data *svd = (struct segvn_data *)seg->s_data; + struct vpage *vpage; + spgcnt_t pages = btop(len); + uint_t prot; + + if (svd->pageprot == 0) + return (1); + + ASSERT(svd->vpage != NULL); + + vpage = &svd->vpage[seg_page(seg, a)]; + prot = VPP_PROT(vpage); + vpage++; + pages--; + while (pages-- > 0) { + if (prot != VPP_PROT(vpage)) + return (0); + vpage++; + } + return (1); +} + +/* + * Get memory allocation policy info for specified address in given segment + */ +static lgrp_mem_policy_info_t * +segvn_getpolicy(struct seg *seg, caddr_t addr) +{ + struct anon_map *amp; + ulong_t anon_index; + lgrp_mem_policy_info_t *policy_info; + struct segvn_data *svn_data; + u_offset_t vn_off; + vnode_t *vp; + + ASSERT(seg != NULL); + + svn_data = (struct segvn_data *)seg->s_data; + if (svn_data == NULL) + return (NULL); + + /* + * Get policy info for private or shared memory + */ + if (svn_data->type != MAP_SHARED) + policy_info = &svn_data->policy_info; + else { + amp = svn_data->amp; + anon_index = svn_data->anon_index + seg_page(seg, addr); + vp = svn_data->vp; + vn_off = svn_data->offset + (uintptr_t)(addr - seg->s_base); + policy_info = lgrp_shm_policy_get(amp, anon_index, vp, vn_off); + } + + return (policy_info); +} diff --git a/usr/src/uts/common/vm/seg_vn.h b/usr/src/uts/common/vm/seg_vn.h new file mode 100644 index 0000000000..4f66d495dd --- /dev/null +++ b/usr/src/uts/common/vm/seg_vn.h @@ -0,0 +1,168 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#ifndef _VM_SEG_VN_H +#define _VM_SEG_VN_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/lgrp.h> +#include <vm/anon.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * A pointer to this structure is passed to segvn_create(). + */ +typedef struct segvn_crargs { + struct vnode *vp; /* vnode mapped from */ + struct cred *cred; /* credentials */ + u_offset_t offset; /* starting offset of vnode for mapping */ + uchar_t type; /* type of sharing done */ + uchar_t prot; /* protections */ + uchar_t maxprot; /* maximum protections */ + uint_t flags; /* flags */ + struct anon_map *amp; /* anon mapping to map to */ + uint_t szc; /* max preferred page size code */ + uint_t lgrp_mem_policy_flags; +} segvn_crargs_t; + +/* + * (Semi) private data maintained by the seg_vn driver per segment mapping. + * + * The read/write segment lock protects all of segvn_data including the + * vpage array. All fields in segvn_data are treated as read-only when + * the "read" version of the address space and the segment locks are held. + * The "write" version of the segment lock, however, is required in order to + * update the following fields: + * + * pageprot + * prot + * amp + * vpage + * + * softlockcnt + * is written by acquiring either the readers lock on the segment and + * freemem lock, or any lock combination which guarantees exclusive use + * of this segment (e.g., adress space writers lock, + * address space readers lock + segment writers lock). + */ +typedef struct segvn_data { + krwlock_t lock; /* protect segvn_data and vpage array */ + kmutex_t segp_slock; /* serialize insertions into seg_pcache */ + uchar_t pageprot; /* true if per page protections present */ + uchar_t prot; /* current segment prot if pageprot == 0 */ + uchar_t maxprot; /* maximum segment protections */ + uchar_t type; /* type of sharing done */ + u_offset_t offset; /* starting offset of vnode for mapping */ + struct vnode *vp; /* vnode that segment mapping is to */ + ulong_t anon_index; /* starting index into anon_map anon array */ + struct anon_map *amp; /* pointer to anon share structure, if needed */ + struct vpage *vpage; /* per-page information, if needed */ + struct cred *cred; /* mapping credentials */ + size_t swresv; /* swap space reserved for this segment */ + uchar_t advice; /* madvise flags for segment */ + uchar_t pageadvice; /* true if per page advice set */ + ushort_t flags; /* flags - from sys/mman.h */ + ssize_t softlockcnt; /* # of pages SOFTLOCKED in seg */ + lgrp_mem_policy_info_t policy_info; /* memory allocation policy */ +} segvn_data_t; + +#ifdef _KERNEL + +/* + * Macros for segvn segment driver locking. 
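[Editorial sketch, not part of this commit] segvn_crargs_t above is filled positionally by callers; the SEGVN_ZFOD_ARGS initializer further below is one such use. A field-by-field rendering of that zero-fill, private-anonymous case; the struct here is a trimmed stand-in so the sketch builds on its own, and the read/write protections are example choices only:

#include <sys/mman.h>
#include <stdio.h>

/* Trimmed stand-in for segvn_crargs_t; the real type is in <vm/seg_vn.h>. */
struct vnode;
struct cred;
struct anon_map;

typedef struct ex_segvn_crargs {
        struct vnode *vp;               /* vnode mapped from */
        struct cred *cred;              /* credentials */
        unsigned long long offset;      /* starting offset of vnode */
        unsigned char type;             /* type of sharing done */
        unsigned char prot;             /* protections */
        unsigned char maxprot;          /* maximum protections */
        unsigned int flags;
        struct anon_map *amp;           /* anon mapping to map to */
        unsigned int szc;               /* max preferred page size code */
        unsigned int lgrp_mem_policy_flags;
} ex_segvn_crargs_t;

int
main(void)
{
        /* Same values, in the same field order, as the positional zfod form. */
        ex_segvn_crargs_t zfod = {
                .vp = NULL,                     /* anonymous: no vnode */
                .cred = NULL,
                .offset = 0,
                .type = MAP_PRIVATE,            /* zfod pages are private */
                .prot = PROT_READ | PROT_WRITE,
                .maxprot = PROT_READ | PROT_WRITE | PROT_EXEC,
                .flags = 0,
                .amp = NULL,                    /* anon map created at fault time */
                .szc = 0,
                .lgrp_mem_policy_flags = 0,
        };

        (void) printf("type %u prot %u maxprot %u\n",
            zfod.type, zfod.prot, zfod.maxprot);
        return (0);
}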
+ */ +#define SEGVN_LOCK_ENTER(as, lock, type) rw_enter((lock), (type)) +#define SEGVN_LOCK_EXIT(as, lock) rw_exit((lock)) +#define SEGVN_LOCK_DOWNGRADE(as, lock) rw_downgrade((lock)) + +/* + * Macros to test lock states. + */ +#define SEGVN_LOCK_HELD(as, lock) RW_LOCK_HELD((lock)) +#define SEGVN_READ_HELD(as, lock) RW_READ_HELD((lock)) +#define SEGVN_WRITE_HELD(as, lock) RW_WRITE_HELD((lock)) + +/* + * Macro used to detect the need to Break the sharing of COW pages + * + * The rw == S_WRITE is for the COW case + * rw == S_READ and type == SOFTLOCK is for the physio case + * We don't want to share a softlocked page because it can cause problems + * with multithreaded apps but if rw == S_READ_NOCOW it's ok to not break + * sharing of COW pages even in SOFTLOCK case. + */ +#define BREAK_COW_SHARE(rw, type, seg_type) ((rw == S_WRITE || \ + (type == F_SOFTLOCK && rw != S_READ_NOCOW)) && \ + seg_type == MAP_PRIVATE) + +#define SEGVN_ZFOD_ARGS(prot, max) \ + { NULL, NULL, 0, MAP_PRIVATE, prot, max, 0, NULL, 0, 0 } + +#define AS_MAP_VNSEGS_USELPGS(crfp, argsp) \ + ((crfp) == (int (*)())segvn_create && \ + (((struct segvn_crargs *)(argsp))->flags & \ + (MAP_TEXT | MAP_INITDATA)) && \ + ((struct segvn_crargs *)(argsp))->vp != NULL && \ + ((struct segvn_crargs *)(argsp))->amp == NULL) + + +extern void segvn_init(void); +extern int segvn_create(struct seg *, void *); + +extern struct seg_ops segvn_ops; + +/* + * Provided as shorthand for creating user zfod segments. + */ +extern caddr_t zfod_argsp; +extern caddr_t kzfod_argsp; +extern caddr_t stack_exec_argsp; +extern caddr_t stack_noexec_argsp; + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_SEG_VN_H */ diff --git a/usr/src/uts/common/vm/vm_anon.c b/usr/src/uts/common/vm/vm_anon.c new file mode 100644 index 0000000000..b8da5c97c2 --- /dev/null +++ b/usr/src/uts/common/vm/vm_anon.c @@ -0,0 +1,3197 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * VM - anonymous pages. + * + * This layer sits immediately above the vm_swap layer. 
It manages + * physical pages that have no permanent identity in the file system + * name space, using the services of the vm_swap layer to allocate + * backing storage for these pages. Since these pages have no external + * identity, they are discarded when the last reference is removed. + * + * An important function of this layer is to manage low-level sharing + * of pages that are logically distinct but that happen to be + * physically identical (e.g., the corresponding pages of the processes + * resulting from a fork before one process or the other changes their + * contents). This pseudo-sharing is present only as an optimization + * and is not to be confused with true sharing in which multiple + * address spaces deliberately contain references to the same object; + * such sharing is managed at a higher level. + * + * The key data structure here is the anon struct, which contains a + * reference count for its associated physical page and a hint about + * the identity of that page. Anon structs typically live in arrays, + * with an instance's position in its array determining where the + * corresponding backing storage is allocated; however, the swap_xlate() + * routine abstracts away this representation information so that the + * rest of the anon layer need not know it. (See the swap layer for + * more details on anon struct layout.) + * + * In the future versions of the system, the association between an + * anon struct and its position on backing store will change so that + * we don't require backing store all anonymous pages in the system. + * This is important for consideration for large memory systems. + * We can also use this technique to delay binding physical locations + * to anonymous pages until pageout/swapout time where we can make + * smarter allocation decisions to improve anonymous klustering. + * + * Many of the routines defined here take a (struct anon **) argument, + * which allows the code at this level to manage anon pages directly, + * so that callers can regard anon structs as opaque objects and not be + * concerned with assigning or inspecting their contents. + * + * Clients of this layer refer to anon pages indirectly. That is, they + * maintain arrays of pointers to anon structs rather than maintaining + * anon structs themselves. The (struct anon **) arguments mentioned + * above are pointers to entries in these arrays. It is these arrays + * that capture the mapping between offsets within a given segment and + * the corresponding anonymous backing storage address. 
+ */ + +#ifdef DEBUG +#define ANON_DEBUG +#endif + +#include <sys/types.h> +#include <sys/t_lock.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/mman.h> +#include <sys/cred.h> +#include <sys/thread.h> +#include <sys/vnode.h> +#include <sys/cpuvar.h> +#include <sys/swap.h> +#include <sys/cmn_err.h> +#include <sys/vtrace.h> +#include <sys/kmem.h> +#include <sys/sysmacros.h> +#include <sys/bitmap.h> +#include <sys/vmsystm.h> +#include <sys/debug.h> +#include <sys/tnf_probe.h> +#include <sys/lgrp.h> +#include <sys/policy.h> +#include <sys/condvar_impl.h> +#include <sys/mutex_impl.h> + +#include <vm/as.h> +#include <vm/hat.h> +#include <vm/anon.h> +#include <vm/page.h> +#include <vm/vpage.h> +#include <vm/seg.h> +#include <vm/rm.h> + +#include <fs/fs_subr.h> + +int anon_debug; + +kmutex_t anoninfo_lock; +struct k_anoninfo k_anoninfo; +ani_free_t ani_free_pool[ANI_MAX_POOL]; +pad_mutex_t anon_array_lock[ANON_LOCKSIZE]; +kcondvar_t anon_array_cv[ANON_LOCKSIZE]; + +/* + * Global hash table for (vp, off) -> anon slot + */ +extern int swap_maxcontig; +size_t anon_hash_size; +struct anon **anon_hash; + +static struct kmem_cache *anon_cache; +static struct kmem_cache *anonmap_cache; + +#ifdef VM_STATS +static struct anonvmstats_str { + ulong_t getpages[30]; + ulong_t privatepages[10]; + ulong_t demotepages[9]; + ulong_t decrefpages[9]; + ulong_t dupfillholes[4]; + ulong_t freepages[1]; +} anonvmstats; +#endif /* VM_STATS */ + + +/*ARGSUSED*/ +static int +anonmap_cache_constructor(void *buf, void *cdrarg, int kmflags) +{ + struct anon_map *amp = buf; + + rw_init(&->a_rwlock, NULL, RW_DEFAULT, NULL); + return (0); +} + +/*ARGSUSED1*/ +static void +anonmap_cache_destructor(void *buf, void *cdrarg) +{ + struct anon_map *amp = buf; + + rw_destroy(&->a_rwlock); +} + +kmutex_t anonhash_lock[AH_LOCK_SIZE]; +kmutex_t anonpages_hash_lock[AH_LOCK_SIZE]; + +void +anon_init(void) +{ + int i; + + anon_hash_size = 1L << highbit(physmem / ANON_HASHAVELEN); + + for (i = 0; i < AH_LOCK_SIZE; i++) { + mutex_init(&anonhash_lock[i], NULL, MUTEX_DEFAULT, NULL); + mutex_init(&anonpages_hash_lock[i], NULL, MUTEX_DEFAULT, NULL); + } + + for (i = 0; i < ANON_LOCKSIZE; i++) { + mutex_init(&anon_array_lock[i].pad_mutex, NULL, + MUTEX_DEFAULT, NULL); + cv_init(&anon_array_cv[i], NULL, CV_DEFAULT, NULL); + } + + anon_hash = (struct anon **) + kmem_zalloc(sizeof (struct anon *) * anon_hash_size, KM_SLEEP); + anon_cache = kmem_cache_create("anon_cache", sizeof (struct anon), + AN_CACHE_ALIGN, NULL, NULL, NULL, NULL, NULL, 0); + anonmap_cache = kmem_cache_create("anonmap_cache", + sizeof (struct anon_map), 0, + anonmap_cache_constructor, anonmap_cache_destructor, NULL, + NULL, NULL, 0); + swap_maxcontig = (1024 * 1024) >> PAGESHIFT; /* 1MB of pages */ +} + +/* + * Global anon slot hash table manipulation. + */ + +static void +anon_addhash(struct anon *ap) +{ + int index; + + ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)])); + index = ANON_HASH(ap->an_vp, ap->an_off); + ap->an_hash = anon_hash[index]; + anon_hash[index] = ap; +} + +static void +anon_rmhash(struct anon *ap) +{ + struct anon **app; + + ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)])); + + for (app = &anon_hash[ANON_HASH(ap->an_vp, ap->an_off)]; + *app; app = &((*app)->an_hash)) { + if (*app == ap) { + *app = ap->an_hash; + break; + } + } +} + +/* + * The anon array interfaces. Functions allocating, + * freeing array of pointers, and returning/setting + * entries in the array of pointers for a given offset. 
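[Editorial sketch, not part of this commit] anon_rmhash() above unlinks a slot from its hash chain by walking a pointer to each an_hash link field rather than keeping a trailing pointer, so the bucket head needs no special case. The same idiom in isolation, with a throwaway node type standing in for struct anon:

#include <stdio.h>

/* Throwaway node; its "next" plays the role of struct anon's an_hash. */
struct node {
        int id;
        struct node *next;
};

static void
unlink_node(struct node **bucket, struct node *target)
{
        struct node **npp;

        /* Walk the link fields themselves, exactly as anon_rmhash() does. */
        for (npp = bucket; *npp != NULL; npp = &((*npp)->next)) {
                if (*npp == target) {
                        *npp = target->next;
                        break;
                }
        }
}

int
main(void)
{
        struct node c = { 3, NULL };
        struct node b = { 2, &c };
        struct node a = { 1, &b };
        struct node *bucket = &a;
        struct node *np;

        unlink_node(&bucket, &b);
        for (np = bucket; np != NULL; np = np->next)
                (void) printf("%d\n", np->id);          /* prints 1 then 3 */
        return (0);
}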
+ * + * Create the list of pointers + */ +struct anon_hdr * +anon_create(pgcnt_t npages, int flags) +{ + struct anon_hdr *ahp; + ulong_t nchunks; + int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; + + if ((ahp = kmem_zalloc(sizeof (struct anon_hdr), kmemflags)) == NULL) { + return (NULL); + } + + mutex_init(&ahp->serial_lock, NULL, MUTEX_DEFAULT, NULL); + /* + * Single level case. + */ + ahp->size = npages; + if (npages <= ANON_CHUNK_SIZE || (flags & ANON_ALLOC_FORCE)) { + + if (flags & ANON_ALLOC_FORCE) + ahp->flags |= ANON_ALLOC_FORCE; + + ahp->array_chunk = kmem_zalloc( + ahp->size * sizeof (struct anon *), kmemflags); + + if (ahp->array_chunk == NULL) { + kmem_free(ahp, sizeof (struct anon_hdr)); + return (NULL); + } + } else { + /* + * 2 Level case. + */ + nchunks = (ahp->size + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT; + + ahp->array_chunk = kmem_zalloc(nchunks * sizeof (ulong_t *), + kmemflags); + + if (ahp->array_chunk == NULL) { + kmem_free(ahp, sizeof (struct anon_hdr)); + return (NULL); + } + } + return (ahp); +} + +/* + * Free the array of pointers + */ +void +anon_release(struct anon_hdr *ahp, pgcnt_t npages) +{ + ulong_t i; + void **ppp; + ulong_t nchunks; + + ASSERT(npages == ahp->size); + + /* + * Single level case. + */ + if (npages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { + kmem_free(ahp->array_chunk, ahp->size * sizeof (struct anon *)); + } else { + /* + * 2 level case. + */ + nchunks = (ahp->size + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT; + for (i = 0; i < nchunks; i++) { + ppp = &ahp->array_chunk[i]; + if (*ppp != NULL) + kmem_free(*ppp, PAGESIZE); + } + kmem_free(ahp->array_chunk, nchunks * sizeof (ulong_t *)); + } + mutex_destroy(&ahp->serial_lock); + kmem_free(ahp, sizeof (struct anon_hdr)); +} + +/* + * Return the pointer from the list for a + * specified anon index. + */ +struct anon * +anon_get_ptr(struct anon_hdr *ahp, ulong_t an_idx) +{ + struct anon **app; + + ASSERT(an_idx < ahp->size); + + /* + * Single level case. + */ + if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { + return ((struct anon *) + ((uintptr_t)ahp->array_chunk[an_idx] & ANON_PTRMASK)); + } else { + + /* + * 2 level case. + */ + app = ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; + if (app) { + return ((struct anon *) + ((uintptr_t)app[an_idx & ANON_CHUNK_OFF] & + ANON_PTRMASK)); + } else { + return (NULL); + } + } +} + +/* + * Return the anon pointer for the first valid entry in the anon list, + * starting from the given index. 
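[Editorial sketch, not part of this commit] anon_get_ptr() above resolves a two-level array by splitting the index into a top-level chunk number and a slot within that chunk; each second-level chunk is a single PAGESIZE allocation of pointers (see anon_release() and anon_set_ptr()). The split in isolation; the EX_* constants are invented, the real ANON_CHUNK_* values come from <vm/anon.h>:

#include <stdio.h>

/* Invented example values; the real ANON_CHUNK_* live in <vm/anon.h>. */
#define EX_CHUNK_SHIFT  10
#define EX_CHUNK_SIZE   (1UL << EX_CHUNK_SHIFT)
#define EX_CHUNK_OFF    (EX_CHUNK_SIZE - 1)

int
main(void)
{
        unsigned long an_idx = 3000;

        /* The same split a two-level anon_get_ptr() lookup performs. */
        unsigned long level1 = an_idx >> EX_CHUNK_SHIFT;        /* which chunk */
        unsigned long level2 = an_idx & EX_CHUNK_OFF;           /* slot in chunk */

        (void) printf("index %lu -> chunk %lu, slot %lu\n",
            an_idx, level1, level2);            /* chunk 2, slot 952 */
        return (0);
}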
+ */ +struct anon * +anon_get_next_ptr(struct anon_hdr *ahp, ulong_t *index) +{ + struct anon *ap; + struct anon **app; + ulong_t chunkoff; + ulong_t i; + ulong_t j; + pgcnt_t size; + + i = *index; + size = ahp->size; + + ASSERT(i < size); + + if ((size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { + /* + * 1 level case + */ + while (i < size) { + ap = (struct anon *) + ((uintptr_t)ahp->array_chunk[i] & ANON_PTRMASK); + if (ap) { + *index = i; + return (ap); + } + i++; + } + } else { + /* + * 2 level case + */ + chunkoff = i & ANON_CHUNK_OFF; + while (i < size) { + app = ahp->array_chunk[i >> ANON_CHUNK_SHIFT]; + if (app) + for (j = chunkoff; j < ANON_CHUNK_SIZE; j++) { + ap = (struct anon *) + ((uintptr_t)app[j] & + ANON_PTRMASK); + if (ap) { + *index = i + (j - chunkoff); + return (ap); + } + } + chunkoff = 0; + i = (i + ANON_CHUNK_SIZE) & ~ANON_CHUNK_OFF; + } + } + *index = size; + return (NULL); +} + +/* + * Set list entry with a given pointer for a specified offset + */ +int +anon_set_ptr(struct anon_hdr *ahp, ulong_t an_idx, struct anon *ap, int flags) +{ + void **ppp; + struct anon **app; + int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; + uintptr_t *ap_addr; + + ASSERT(an_idx < ahp->size); + + /* + * Single level case. + */ + if (ahp->size <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { + ap_addr = (uintptr_t *)&ahp->array_chunk[an_idx]; + } else { + + /* + * 2 level case. + */ + ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; + + ASSERT(ppp != NULL); + if (*ppp == NULL) { + mutex_enter(&ahp->serial_lock); + ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; + if (*ppp == NULL) { + *ppp = kmem_zalloc(PAGESIZE, kmemflags); + if (*ppp == NULL) { + mutex_exit(&ahp->serial_lock); + return (ENOMEM); + } + } + mutex_exit(&ahp->serial_lock); + } + app = *ppp; + ap_addr = (uintptr_t *)&app[an_idx & ANON_CHUNK_OFF]; + } + *ap_addr = (*ap_addr & ~ANON_PTRMASK) | (uintptr_t)ap; + return (0); +} + +/* + * Copy anon array into a given new anon array + */ +int +anon_copy_ptr(struct anon_hdr *sahp, ulong_t s_idx, + struct anon_hdr *dahp, ulong_t d_idx, + pgcnt_t npages, int flags) +{ + void **sapp, **dapp; + void *ap; + int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; + + ASSERT((s_idx < sahp->size) && (d_idx < dahp->size)); + ASSERT((npages <= sahp->size) && (npages <= dahp->size)); + + /* + * Both arrays are 1 level. + */ + if (((sahp->size <= ANON_CHUNK_SIZE) && + (dahp->size <= ANON_CHUNK_SIZE)) || + ((sahp->flags & ANON_ALLOC_FORCE) && + (dahp->flags & ANON_ALLOC_FORCE))) { + + bcopy(&sahp->array_chunk[s_idx], &dahp->array_chunk[d_idx], + npages * sizeof (struct anon *)); + return (0); + } + + /* + * Both arrays are 2 levels. 
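[Editorial sketch, not part of this commit] The two-level copy loop that follows advances in runs bounded by whichever of the source or destination chunk boundaries comes first (chknp). A worked example of that length computation with invented indices and an invented chunk size:

#include <stdio.h>

#define EX_CHUNK_SIZE   1024UL          /* invented example chunk size */
#define EX_CHUNK_OFF    (EX_CHUNK_SIZE - 1)
#define MAX(a, b)       ((a) > (b) ? (a) : (b))

int
main(void)
{
        unsigned long s_idx = 1000;     /* source: slot 1000 of chunk 0 */
        unsigned long d_idx = 2060;     /* destination: slot 12 of chunk 2 */
        unsigned long npages = 500;     /* slots still to copy */

        unsigned long sapidx = s_idx & EX_CHUNK_OFF;
        unsigned long dapidx = d_idx & EX_CHUNK_OFF;
        unsigned long chknp = EX_CHUNK_SIZE - MAX(sapidx, dapidx);

        if (chknp > npages)
                chknp = npages;

        /* Only 24 slots fit before the source chunk boundary is hit. */
        (void) printf("copy %lu slots this pass\n", chknp);
        return (0);
}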
+ */ + if (sahp->size > ANON_CHUNK_SIZE && + dahp->size > ANON_CHUNK_SIZE && + ((sahp->flags & ANON_ALLOC_FORCE) == 0) && + ((dahp->flags & ANON_ALLOC_FORCE) == 0)) { + + ulong_t sapidx, dapidx; + ulong_t *sap, *dap; + ulong_t chknp; + + while (npages != 0) { + + sapidx = s_idx & ANON_CHUNK_OFF; + dapidx = d_idx & ANON_CHUNK_OFF; + chknp = ANON_CHUNK_SIZE - MAX(sapidx, dapidx); + if (chknp > npages) + chknp = npages; + + sapp = &sahp->array_chunk[s_idx >> ANON_CHUNK_SHIFT]; + if ((sap = *sapp) != NULL) { + dapp = &dahp->array_chunk[d_idx + >> ANON_CHUNK_SHIFT]; + if ((dap = *dapp) == NULL) { + *dapp = kmem_zalloc(PAGESIZE, + kmemflags); + if ((dap = *dapp) == NULL) + return (ENOMEM); + } + bcopy((sap + sapidx), (dap + dapidx), + chknp << ANON_PTRSHIFT); + } + s_idx += chknp; + d_idx += chknp; + npages -= chknp; + } + return (0); + } + + /* + * At least one of the arrays is 2 level. + */ + while (npages--) { + if ((ap = anon_get_ptr(sahp, s_idx)) != NULL) { + ASSERT(!ANON_ISBUSY(anon_get_slot(sahp, s_idx))); + if (anon_set_ptr(dahp, d_idx, ap, flags) == ENOMEM) + return (ENOMEM); + } + s_idx++; + d_idx++; + } + return (0); +} + + +/* + * ANON_INITBUF is a convenience macro for anon_grow() below. It + * takes a buffer dst, which is at least as large as buffer src. It + * does a bcopy from src into dst, and then bzeros the extra bytes + * of dst. If tail is set, the data in src is tail aligned within + * dst instead of head aligned. + */ + +#define ANON_INITBUF(src, srclen, dst, dstsize, tail) \ + if (tail) { \ + bzero((dst), (dstsize) - (srclen)); \ + bcopy((src), (char *)(dst) + (dstsize) - (srclen), (srclen)); \ + } else { \ + bcopy((src), (dst), (srclen)); \ + bzero((char *)(dst) + (srclen), (dstsize) - (srclen)); \ + } + +#define ANON_1_LEVEL_INC (ANON_CHUNK_SIZE / 8) +#define ANON_2_LEVEL_INC (ANON_1_LEVEL_INC * ANON_CHUNK_SIZE) + +/* + * anon_grow() is used to efficiently extend an existing anon array. + * startidx_p points to the index into the anon array of the first page + * that is in use. curpages is the number of pages in use, starting at + * *startidx_p. newpages is the number of additional pages desired. + * + * If startidx_p == NULL, startidx is taken to be 0 and cannot be changed. + * + * The growth is done by creating a new top level of the anon array, + * and (if the array is 2-level) reusing the existing second level arrays. + * + * flags can be used to specify ANON_NOSLEEP and ANON_GROWDOWN. + * + * Returns the new number of pages in the anon array. + */ + +pgcnt_t +anon_grow(struct anon_hdr *ahp, ulong_t *startidx_p, pgcnt_t curpages, + pgcnt_t newpages, int flags) +{ + ulong_t startidx = startidx_p ? *startidx_p : 0; + pgcnt_t osz = ahp->size, nsz; + pgcnt_t oelems, nelems, totpages; + void **level1; + int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; + int growdown = (flags & ANON_GROWDOWN); + size_t newarrsz, oldarrsz; + void *level2; + + ASSERT(!(startidx_p == NULL && growdown)); + ASSERT(startidx + curpages <= ahp->size); + + /* + * Determine the total number of pages needed in the new + * anon array. If growing down, totpages is all pages from + * startidx through the end of the array, plus <newpages> + * pages. If growing up, keep all pages from page 0 through + * the last page currently in use, plus <newpages> pages. + */ + + if (growdown) + totpages = osz - startidx + newpages; + else + totpages = startidx + curpages + newpages; + + /* If the array is already large enough, just return. 
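[Editorial sketch, not part of this commit] ANON_INITBUF above copies the old pointer array into the new one either head- or tail-aligned and zeroes the remainder, which is how anon_grow() supports growing in both directions. The same behaviour in user space, with the bcopy/bzero calls rewritten as their libc equivalents:

#include <stdio.h>
#include <string.h>

/* ANON_INITBUF, rewritten with memcpy/memset so it builds in user space. */
#define EX_INITBUF(src, srclen, dst, dstsize, tail)                     \
        if (tail) {                                                     \
                memset((dst), 0, (dstsize) - (srclen));                 \
                memcpy((char *)(dst) + (dstsize) - (srclen),            \
                    (src), (srclen));                                   \
        } else {                                                        \
                memcpy((dst), (src), (srclen));                         \
                memset((char *)(dst) + (srclen), 0,                     \
                    (dstsize) - (srclen));                              \
        }

int
main(void)
{
        char src[4] = { 'a', 'b', 'c', 'd' };
        char up[8], down[8];
        int i;

        EX_INITBUF(src, sizeof (src), up, sizeof (up), 0);      /* grow up */
        EX_INITBUF(src, sizeof (src), down, sizeof (down), 1);  /* grow down */

        for (i = 0; i < 8; i++)                 /* prints abcd.... */
                (void) printf("%c", up[i] != '\0' ? up[i] : '.');
        (void) printf(" ");
        for (i = 0; i < 8; i++)                 /* prints ....abcd */
                (void) printf("%c", down[i] != '\0' ? down[i] : '.');
        (void) printf("\n");
        return (0);
}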
*/ + + if (osz >= totpages) { + nsz = osz; + goto out; + } + + /* + * osz/nsz are the total numbers of pages represented by the array. + * oelems/nelems are the number of pointers in the top level array. + * + * Will the new anon array be one level or two levels? + */ + + if (totpages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { + nsz = P2ROUNDUP(totpages, ANON_1_LEVEL_INC); + oelems = osz; + nelems = nsz; + } else { + nsz = P2ROUNDUP(totpages, ANON_2_LEVEL_INC); + oelems = (osz + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT; + nelems = nsz >> ANON_CHUNK_SHIFT; + } + + newarrsz = nelems * sizeof (void *); + level1 = kmem_alloc(newarrsz, kmemflags); + if (level1 == NULL) + return (0); + + /* Are we converting from a one level to a two level anon array? */ + + if (nsz > ANON_CHUNK_SIZE && osz <= ANON_CHUNK_SIZE && + !(ahp->flags & ANON_ALLOC_FORCE)) { + /* + * Yes, we're converting to a two level. Reuse old level 1 + * as new level 2 if it is exactly PAGESIZE. Otherwise + * alloc a new level 2 and copy the old level 1 data into it. + */ + + if (osz == ANON_CHUNK_SIZE) { + level2 = (void *)ahp->array_chunk; + } else { + level2 = kmem_alloc(PAGESIZE, kmemflags); + if (level2 == NULL) { + kmem_free(level1, newarrsz); + return (0); + } + oldarrsz = osz * sizeof (void *); + + ANON_INITBUF(ahp->array_chunk, oldarrsz, + level2, PAGESIZE, growdown); + kmem_free(ahp->array_chunk, oldarrsz); + } + bzero(level1, newarrsz); + if (growdown) + level1[nelems - 1] = level2; + else + level1[0] = level2; + } else { + oldarrsz = oelems * sizeof (void *); + + ANON_INITBUF(ahp->array_chunk, oldarrsz, + level1, newarrsz, growdown); + kmem_free(ahp->array_chunk, oldarrsz); + } + + ahp->array_chunk = level1; + ahp->size = nsz; +out: + if (growdown) + *startidx_p = nsz - totpages; + return (nsz); +} + +/* + * Called from clock handler to sync ani_free value. + */ + +void +set_anoninfo(void) +{ + int ix; + pgcnt_t total = 0; + + for (ix = 0; ix < ANI_MAX_POOL; ix++) { + total += ani_free_pool[ix].ani_count; + } + k_anoninfo.ani_free = total; +} + +/* + * Reserve anon space. + * + * It's no longer simply a matter of incrementing ani_resv to + * reserve swap space, we need to check memory-based as well + * as disk-backed (physical) swap. The following algorithm + * is used: + * Check the space on physical swap + * i.e. amount needed < ani_max - ani_phys_resv + * If we are swapping on swapfs check + * amount needed < (availrmem - swapfs_minfree) + * Since the algorithm to check for the quantity of swap space is + * almost the same as that for reserving it, we'll just use anon_resvmem + * with a flag to decrement availrmem. + * + * Return non-zero on success. + */ +int +anon_resvmem(size_t size, uint_t takemem) +{ + pgcnt_t npages = btopr(size); + pgcnt_t mswap_pages = 0; + pgcnt_t pswap_pages = 0; + + mutex_enter(&anoninfo_lock); + + /* + * pswap_pages is the number of pages we can take from + * physical (i.e. disk-backed) swap. 
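[Editorial sketch, not part of this commit] ani_free is not maintained as one global counter: ANI_ADD() updates spread across the ani_free_pool[] buckets declared earlier, and set_anoninfo() above folds them into k_anoninfo.ani_free from the clock handler. A generic rendering of that pattern; indexing the buckets by CPU is an assumption here, only the bucket-plus-periodic-sum shape is taken from the code:

#include <stdio.h>

#define EX_NPOOL        16              /* stands in for ANI_MAX_POOL */

static long ex_pool[EX_NPOOL];          /* stands in for ani_free_pool[] */

/* Update one bucket instead of a single contended global counter. */
static void
ex_add(unsigned int bucket, long delta)
{
        ex_pool[bucket & (EX_NPOOL - 1)] += delta;
}

/* Periodic sweep, as set_anoninfo() does from the clock handler. */
static long
ex_sum(void)
{
        long total = 0;
        int i;

        for (i = 0; i < EX_NPOOL; i++)
                total += ex_pool[i];
        return (total);
}

int
main(void)
{
        ex_add(0, 5);
        ex_add(3, -2);
        ex_add(17, 4);                  /* wraps onto bucket 1 */
        (void) printf("ani_free would read %ld\n", ex_sum());   /* 7 */
        return (0);
}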
+ */ + ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); + pswap_pages = k_anoninfo.ani_max - k_anoninfo.ani_phys_resv; + + ANON_PRINT(A_RESV, + ("anon_resvmem: npages %lu takemem %u pswap %lu caller %p\n", + npages, takemem, pswap_pages, (void *)caller())); + + if (npages <= pswap_pages) { + /* + * we have enough space on a physical swap + */ + if (takemem) + k_anoninfo.ani_phys_resv += npages; + mutex_exit(&anoninfo_lock); + return (1); + } else if (pswap_pages != 0) { + /* + * we have some space on a physical swap + */ + if (takemem) { + /* + * use up remainder of phys swap + */ + k_anoninfo.ani_phys_resv += pswap_pages; + ASSERT(k_anoninfo.ani_phys_resv == k_anoninfo.ani_max); + } + } + /* + * since (npages > pswap_pages) we need mem swap + * mswap_pages is the number of pages needed from availrmem + */ + ASSERT(npages > pswap_pages); + mswap_pages = npages - pswap_pages; + + ANON_PRINT(A_RESV, ("anon_resvmem: need %ld pages from memory\n", + mswap_pages)); + + /* + * priv processes can reserve memory as swap as long as availrmem + * remains greater than swapfs_minfree; in the case of non-priv + * processes, memory can be reserved as swap only if availrmem + * doesn't fall below (swapfs_minfree + swapfs_reserve). Thus, + * swapfs_reserve amount of memswap is not available to non-priv + * processes. This protects daemons such as automounter dying + * as a result of application processes eating away almost entire + * membased swap. This safeguard becomes useless if apps are run + * with root access. + * + * swapfs_reserve is minimum of 4Mb or 1/16 of physmem. + * + */ + mutex_enter(&freemem_lock); + if (availrmem > (swapfs_minfree + swapfs_reserve + mswap_pages) || + (availrmem > (swapfs_minfree + mswap_pages) && + secpolicy_resource(CRED()) == 0)) { + + if (takemem) { + /* + * Take the memory from the rest of the system. + */ + availrmem -= mswap_pages; + mutex_exit(&freemem_lock); + k_anoninfo.ani_mem_resv += mswap_pages; + ANI_ADD(mswap_pages); + ANON_PRINT((A_RESV | A_MRESV), + ("anon_resvmem: took %ld pages of availrmem\n", + mswap_pages)); + } else { + mutex_exit(&freemem_lock); + } + + ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); + mutex_exit(&anoninfo_lock); + return (1); + + } else { + /* + * Fail if not enough memory + */ + + if (takemem) { + k_anoninfo.ani_phys_resv -= pswap_pages; + } + + mutex_exit(&freemem_lock); + mutex_exit(&anoninfo_lock); + ANON_PRINT(A_RESV, + ("anon_resvmem: not enough space from swapfs\n")); + return (0); + } +} + + +/* + * Give back an anon reservation. + */ +void +anon_unresv(size_t size) +{ + pgcnt_t npages = btopr(size); + spgcnt_t mem_free_pages = 0; + pgcnt_t phys_free_slots; +#ifdef ANON_DEBUG + pgcnt_t mem_resv; +#endif + + mutex_enter(&anoninfo_lock); + + ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); + /* + * If some of this reservation belonged to swapfs + * give it back to availrmem. + * ani_mem_resv is the amount of availrmem swapfs has reserved. 
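[Editorial sketch, not part of this commit] The reservation check above can be read as a small pure function: physical swap is consumed first, and the memory-backed remainder is granted only while availrmem stays above swapfs_minfree plus, for unprivileged callers, swapfs_reserve as well. A user-space model of just that decision; the page counts are invented and the privileged flag stands in for the secpolicy_resource() check:

#include <stdio.h>

static int
can_reserve(unsigned long npages, unsigned long ani_max,
    unsigned long ani_phys_resv, unsigned long availrmem,
    unsigned long swapfs_minfree, unsigned long swapfs_reserve,
    int privileged)
{
        unsigned long pswap_pages = ani_max - ani_phys_resv;
        unsigned long mswap_pages;

        if (npages <= pswap_pages)
                return (1);     /* fits entirely on physical swap */

        mswap_pages = npages - pswap_pages;     /* rest must come from memory */

        if (availrmem > swapfs_minfree + swapfs_reserve + mswap_pages)
                return (1);     /* anyone may take this much memory swap */
        if (privileged && availrmem > swapfs_minfree + mswap_pages)
                return (1);     /* swapfs_reserve is kept for privileged use */
        return (0);
}

int
main(void)
{
        /* 10000 pages wanted, 4000 left on physical swap, tight memory. */
        (void) printf("unprivileged: %d\n",
            can_reserve(10000, 20000, 16000, 7000, 500, 1024, 0));      /* 0 */
        (void) printf("privileged:   %d\n",
            can_reserve(10000, 20000, 16000, 7000, 500, 1024, 1));      /* 1 */
        return (0);
}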
+ * but some of that memory could be locked by segspt so we can only + * return non locked ani_mem_resv back to availrmem + */ + if (k_anoninfo.ani_mem_resv > k_anoninfo.ani_locked_swap) { + ANON_PRINT((A_RESV | A_MRESV), + ("anon_unresv: growing availrmem by %ld pages\n", + MIN(k_anoninfo.ani_mem_resv, npages))); + + mem_free_pages = MIN((spgcnt_t)(k_anoninfo.ani_mem_resv - + k_anoninfo.ani_locked_swap), npages); + mutex_enter(&freemem_lock); + availrmem += mem_free_pages; + mutex_exit(&freemem_lock); + k_anoninfo.ani_mem_resv -= mem_free_pages; + + ANI_ADD(-mem_free_pages); + } + /* + * The remainder of the pages is returned to phys swap + */ + ASSERT(npages >= mem_free_pages); + phys_free_slots = npages - mem_free_pages; + + if (phys_free_slots) { + k_anoninfo.ani_phys_resv -= phys_free_slots; + } + +#ifdef ANON_DEBUG + mem_resv = k_anoninfo.ani_mem_resv; +#endif + + ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); + ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); + + mutex_exit(&anoninfo_lock); + + ANON_PRINT(A_RESV, ("anon_unresv: %lu, tot %lu, caller %p\n", + npages, mem_resv, (void *)caller())); +} + +/* + * Allocate an anon slot and return it with the lock held. + */ +struct anon * +anon_alloc(struct vnode *vp, anoff_t off) +{ + struct anon *ap; + kmutex_t *ahm; + + ap = kmem_cache_alloc(anon_cache, KM_SLEEP); + if (vp == NULL) { + swap_alloc(ap); + } else { + ap->an_vp = vp; + ap->an_off = off; + } + ap->an_refcnt = 1; + ap->an_pvp = NULL; + ap->an_poff = 0; + ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; + mutex_enter(ahm); + anon_addhash(ap); + mutex_exit(ahm); + ANI_ADD(-1); + ANON_PRINT(A_ANON, ("anon_alloc: returning ap %p, vp %p\n", + (void *)ap, (ap ? (void *)ap->an_vp : NULL))); + return (ap); +} + +/* + * Decrement the reference count of an anon page. + * If reference count goes to zero, free it and + * its associated page (if any). + */ +void +anon_decref(struct anon *ap) +{ + page_t *pp; + struct vnode *vp; + anoff_t off; + kmutex_t *ahm; + + ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; + mutex_enter(ahm); + ASSERT(ap->an_refcnt != 0); + if (ap->an_refcnt == 0) + panic("anon_decref: slot count 0"); + if (--ap->an_refcnt == 0) { + swap_xlate(ap, &vp, &off); + mutex_exit(ahm); + + /* + * If there is a page for this anon slot we will need to + * call VN_DISPOSE to get rid of the vp association and + * put the page back on the free list as really free. + * Acquire the "exclusive" lock to ensure that any + * pending i/o always completes before the swap slot + * is freed. + */ + pp = page_lookup(vp, (u_offset_t)off, SE_EXCL); + + /* + * If there was a page, we've synchronized on it (getting + * the exclusive lock is as good as gettting the iolock) + * so now we can free the physical backing store. Also, this + * is where we would free the name of the anonymous page + * (swap_free(ap)), a no-op in the current implementation. 
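[Editorial sketch, not part of this commit] anon_decref() follows a common release shape: the count is dropped under the hash-bucket lock, and only the caller that takes it to zero continues with teardown (unhash, free the physical swap slot, dispose of the page). A stripped-down user-space rendering of that shape, with a toy object and a pthread mutex in place of the anonhash_lock bucket:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy object standing in for struct anon plus its hash-bucket lock. */
struct obj {
        pthread_mutex_t *bucket_lock;
        int refcnt;
};

static void
obj_rele(struct obj *op)
{
        int last;

        (void) pthread_mutex_lock(op->bucket_lock);
        if (op->refcnt == 0)
                abort();        /* mirrors the "slot count 0" panic above */
        last = (--op->refcnt == 0);
        (void) pthread_mutex_unlock(op->bucket_lock);

        if (last) {
                /* Only the last reference holder performs the teardown. */
                (void) printf("last reference dropped, freeing\n");
        }
}

int
main(void)
{
        pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
        struct obj o = { &lock, 2 };

        obj_rele(&o);           /* object still referenced */
        obj_rele(&o);           /* prints the teardown message */
        return (0);
}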
+ */ + mutex_enter(ahm); + ASSERT(ap->an_refcnt == 0); + anon_rmhash(ap); + if (ap->an_pvp) + swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE); + mutex_exit(ahm); + + if (pp != NULL) { + /*LINTED: constant in conditional context */ + VN_DISPOSE(pp, B_INVAL, 0, kcred); + } + ANON_PRINT(A_ANON, ("anon_decref: free ap %p, vp %p\n", + (void *)ap, (void *)ap->an_vp)); + kmem_cache_free(anon_cache, ap); + + ANI_ADD(1); + } else { + mutex_exit(ahm); + } +} + +static int +anon_share(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots) +{ + struct anon *ap; + + while (nslots-- > 0) { + if ((ap = anon_get_ptr(ahp, anon_index)) != NULL && + ap->an_refcnt > 1) + return (1); + anon_index++; + } + + return (0); +} + +static void +anon_decref_pages( + struct anon_hdr *ahp, + ulong_t an_idx, + uint_t szc) +{ + struct anon *ap = anon_get_ptr(ahp, an_idx); + kmutex_t *ahmpages = NULL; + page_t *pp; + pgcnt_t pgcnt = page_get_pagecnt(szc); + pgcnt_t i; + struct vnode *vp; + anoff_t off; + kmutex_t *ahm; +#ifdef DEBUG + int refcnt = 1; +#endif + + ASSERT(szc != 0); + ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); + ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); + + VM_STAT_ADD(anonvmstats.decrefpages[0]); + + if (ap != NULL) { + ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; + mutex_enter(ahmpages); + ASSERT((refcnt = ap->an_refcnt) != 0); + VM_STAT_ADD(anonvmstats.decrefpages[1]); + if (ap->an_refcnt == 1) { + VM_STAT_ADD(anonvmstats.decrefpages[2]); + ASSERT(!anon_share(ahp, an_idx, pgcnt)); + mutex_exit(ahmpages); + ahmpages = NULL; + } + } + + i = 0; + while (i < pgcnt) { + if ((ap = anon_get_ptr(ahp, an_idx + i)) == NULL) { + ASSERT(refcnt == 1 && ahmpages == NULL); + i++; + continue; + } + ASSERT(ap->an_refcnt == refcnt); + ASSERT(ahmpages != NULL || ap->an_refcnt == 1); + ASSERT(ahmpages == NULL || ap->an_refcnt > 1); + + if (ahmpages == NULL) { + swap_xlate(ap, &vp, &off); + pp = page_lookup(vp, (u_offset_t)off, SE_EXCL); + if (pp == NULL || pp->p_szc == 0) { + VM_STAT_ADD(anonvmstats.decrefpages[3]); + ahm = &anonhash_lock[AH_LOCK(ap->an_vp, + ap->an_off)]; + (void) anon_set_ptr(ahp, an_idx + i, NULL, + ANON_SLEEP); + mutex_enter(ahm); + ap->an_refcnt--; + ASSERT(ap->an_refcnt == 0); + anon_rmhash(ap); + if (ap->an_pvp) + swap_phys_free(ap->an_pvp, ap->an_poff, + PAGESIZE); + mutex_exit(ahm); + if (pp != NULL) { + VM_STAT_ADD(anonvmstats.decrefpages[4]); + /*LINTED*/ + VN_DISPOSE(pp, B_INVAL, 0, kcred); + } + kmem_cache_free(anon_cache, ap); + ANI_ADD(1); + i++; + } else { + pgcnt_t j; + pgcnt_t curpgcnt = + page_get_pagecnt(pp->p_szc); + size_t ppasize = curpgcnt * sizeof (page_t *); + page_t **ppa = kmem_alloc(ppasize, KM_SLEEP); + int dispose = 0; + + VM_STAT_ADD(anonvmstats.decrefpages[5]); + + ASSERT(pp->p_szc <= szc); + ASSERT(IS_P2ALIGNED(curpgcnt, curpgcnt)); + ASSERT(IS_P2ALIGNED(i, curpgcnt)); + ASSERT(i + curpgcnt <= pgcnt); + ASSERT(!(page_pptonum(pp) & (curpgcnt - 1))); + ppa[0] = pp; + for (j = i + 1; j < i + curpgcnt; j++) { + ap = anon_get_ptr(ahp, an_idx + j); + ASSERT(ap != NULL && + ap->an_refcnt == 1); + swap_xlate(ap, &vp, &off); + pp = page_lookup(vp, (u_offset_t)off, + SE_EXCL); + if (pp == NULL) + panic("anon_decref_pages: " + "no page"); + + (void) hat_pageunload(pp, + HAT_FORCE_PGUNLOAD); + ASSERT(pp->p_szc == ppa[0]->p_szc); + ASSERT(page_pptonum(pp) - 1 == + page_pptonum(ppa[j - i - 1])); + ppa[j - i] = pp; + if (ap->an_pvp != NULL && + !vn_matchopval(ap->an_pvp, + VOPNAME_DISPOSE, + (fs_generic_func_p)fs_dispose)) + dispose = 1; + } + if (!dispose) { + 
VM_STAT_ADD(anonvmstats.decrefpages[6]); + page_destroy_pages(ppa[0]); + } else { + VM_STAT_ADD(anonvmstats.decrefpages[7]); + for (j = 0; j < curpgcnt; j++) { + ASSERT(PAGE_EXCL(ppa[j])); + ppa[j]->p_szc = 0; + } + for (j = 0; j < curpgcnt; j++) { + ASSERT(!hat_page_is_mapped( + ppa[j])); + /*LINTED*/ + VN_DISPOSE(ppa[j], B_INVAL, 0, + kcred); + } + } + kmem_free(ppa, ppasize); + for (j = i; j < i + curpgcnt; j++) { + ap = anon_get_ptr(ahp, an_idx + j); + ASSERT(ap != NULL && + ap->an_refcnt == 1); + ahm = &anonhash_lock[AH_LOCK(ap->an_vp, + ap->an_off)]; + (void) anon_set_ptr(ahp, an_idx + j, + NULL, ANON_SLEEP); + mutex_enter(ahm); + ap->an_refcnt--; + ASSERT(ap->an_refcnt == 0); + anon_rmhash(ap); + if (ap->an_pvp) + swap_phys_free(ap->an_pvp, + ap->an_poff, PAGESIZE); + mutex_exit(ahm); + kmem_cache_free(anon_cache, ap); + ANI_ADD(1); + } + i += curpgcnt; + } + } else { + VM_STAT_ADD(anonvmstats.decrefpages[8]); + (void) anon_set_ptr(ahp, an_idx + i, NULL, ANON_SLEEP); + ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; + mutex_enter(ahm); + ap->an_refcnt--; + mutex_exit(ahm); + i++; + } + } + + if (ahmpages != NULL) { + mutex_exit(ahmpages); + } +} + +/* + * Duplicate references to size bytes worth of anon pages. + * Used when duplicating a segment that contains private anon pages. + * This code assumes that procedure calling this one has already used + * hat_chgprot() to disable write access to the range of addresses that + * that *old actually refers to. + */ +void +anon_dup(struct anon_hdr *old, ulong_t old_idx, struct anon_hdr *new, + ulong_t new_idx, size_t size) +{ + spgcnt_t npages; + kmutex_t *ahm; + struct anon *ap; + ulong_t off; + ulong_t index; + + npages = btopr(size); + while (npages > 0) { + index = old_idx; + if ((ap = anon_get_next_ptr(old, &index)) == NULL) + break; + + ASSERT(!ANON_ISBUSY(anon_get_slot(old, index))); + off = index - old_idx; + npages -= off; + if (npages <= 0) + break; + + (void) anon_set_ptr(new, new_idx + off, ap, ANON_SLEEP); + ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; + + mutex_enter(ahm); + ap->an_refcnt++; + mutex_exit(ahm); + + off++; + new_idx += off; + old_idx += off; + npages--; + } +} + +/* + * Just like anon_dup but also guarantees there are no holes (unallocated anon + * slots) within any large page region. That means if a large page region is + * empty in the old array it will skip it. If there are 1 or more valid slots + * in the large page region of the old array it will make sure to fill in any + * unallocated ones and also copy them to the new array. If noalloc is 1 large + * page region should either have no valid anon slots or all slots should be + * valid. + */ +void +anon_dup_fill_holes( + struct anon_hdr *old, + ulong_t old_idx, + struct anon_hdr *new, + ulong_t new_idx, + size_t size, + uint_t szc, + int noalloc) +{ + struct anon *ap; + spgcnt_t npages; + kmutex_t *ahm, *ahmpages = NULL; + pgcnt_t pgcnt, i; + ulong_t index, off; +#ifdef DEBUG + int refcnt; +#endif + + ASSERT(szc != 0); + pgcnt = page_get_pagecnt(szc); + ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); + npages = btopr(size); + ASSERT(IS_P2ALIGNED(npages, pgcnt)); + ASSERT(IS_P2ALIGNED(old_idx, pgcnt)); + + VM_STAT_ADD(anonvmstats.dupfillholes[0]); + + while (npages > 0) { + index = old_idx; + + /* + * Find the next valid slot. + */ + if (anon_get_next_ptr(old, &index) == NULL) + break; + + ASSERT(!ANON_ISBUSY(anon_get_slot(old, index))); + /* + * Now backup index to the beginning of the + * current large page region of the old array. 
+ */ + index = P2ALIGN(index, pgcnt); + off = index - old_idx; + ASSERT(IS_P2ALIGNED(off, pgcnt)); + npages -= off; + if (npages <= 0) + break; + + /* + * Fill and copy a large page regions worth + * of anon slots. + */ + for (i = 0; i < pgcnt; i++) { + if ((ap = anon_get_ptr(old, index + i)) == NULL) { + if (noalloc) { + panic("anon_dup_fill_holes: " + "empty anon slot\n"); + } + VM_STAT_ADD(anonvmstats.dupfillholes[1]); + ap = anon_alloc(NULL, 0); + (void) anon_set_ptr(old, index + i, ap, + ANON_SLEEP); + } else if (i == 0) { + /* + * make the increment of all refcnts of all + * anon slots of a large page appear atomic by + * getting an anonpages_hash_lock for the + * first anon slot of a large page. + */ + int hash = AH_LOCK(ap->an_vp, ap->an_off); + + VM_STAT_ADD(anonvmstats.dupfillholes[2]); + + ahmpages = &anonpages_hash_lock[hash]; + mutex_enter(ahmpages); + /*LINTED*/ + ASSERT(refcnt = ap->an_refcnt); + + VM_STAT_COND_ADD(ap->an_refcnt > 1, + anonvmstats.dupfillholes[3]); + } + (void) anon_set_ptr(new, new_idx + off + i, ap, + ANON_SLEEP); + ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; + mutex_enter(ahm); + ASSERT(ahmpages != NULL || ap->an_refcnt == 1); + ASSERT(i == 0 || ahmpages == NULL || + refcnt == ap->an_refcnt); + ap->an_refcnt++; + mutex_exit(ahm); + } + if (ahmpages != NULL) { + mutex_exit(ahmpages); + ahmpages = NULL; + } + off += pgcnt; + new_idx += off; + old_idx += off; + npages -= pgcnt; + } +} + +/* + * Used when a segment with a vnode changes szc. similarly to + * anon_dup_fill_holes() makes sure each large page region either has no anon + * slots or all of them. but new slots are created by COWing the file + * pages. on entrance no anon slots should be shared. + */ +int +anon_fill_cow_holes( + struct seg *seg, + caddr_t addr, + struct anon_hdr *ahp, + ulong_t an_idx, + struct vnode *vp, + u_offset_t vp_off, + size_t size, + uint_t szc, + uint_t prot, + struct vpage vpage[], + struct cred *cred) +{ + struct anon *ap; + spgcnt_t npages; + pgcnt_t pgcnt, i; + ulong_t index, off; + int err = 0; + int pageflags = 0; + + ASSERT(szc != 0); + pgcnt = page_get_pagecnt(szc); + ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); + npages = btopr(size); + ASSERT(IS_P2ALIGNED(npages, pgcnt)); + ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); + + while (npages > 0) { + index = an_idx; + + /* + * Find the next valid slot. + */ + if (anon_get_next_ptr(ahp, &index) == NULL) { + break; + } + + ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); + /* + * Now backup index to the beginning of the + * current large page region of the anon array. + */ + index = P2ALIGN(index, pgcnt); + off = index - an_idx; + ASSERT(IS_P2ALIGNED(off, pgcnt)); + npages -= off; + if (npages <= 0) + break; + an_idx += off; + vp_off += ptob(off); + addr += ptob(off); + if (vpage != NULL) { + vpage += off; + } + + for (i = 0; i < pgcnt; i++, an_idx++, vp_off += PAGESIZE) { + if ((ap = anon_get_ptr(ahp, an_idx)) == NULL) { + page_t *pl[1 + 1]; + page_t *pp; + + err = VOP_GETPAGE(vp, vp_off, PAGESIZE, NULL, + pl, PAGESIZE, seg, addr, S_READ, cred); + if (err) { + break; + } + if (vpage != NULL) { + prot = VPP_PROT(vpage); + pageflags = VPP_ISPPLOCK(vpage) ? 
+ LOCK_PAGE : 0; + } + pp = anon_private(&ap, seg, addr, prot, pl[0], + pageflags, cred); + if (pp == NULL) { + err = ENOMEM; + break; + } + (void) anon_set_ptr(ahp, an_idx, ap, + ANON_SLEEP); + page_unlock(pp); + } + ASSERT(ap->an_refcnt == 1); + addr += PAGESIZE; + if (vpage != NULL) { + vpage++; + } + } + npages -= pgcnt; + } + + return (err); +} + +/* + * Free a group of "size" anon pages, size in bytes, + * and clear out the pointers to the anon entries. + */ +void +anon_free(struct anon_hdr *ahp, ulong_t index, size_t size) +{ + spgcnt_t npages; + struct anon *ap; + ulong_t old; + + npages = btopr(size); + + while (npages > 0) { + old = index; + if ((ap = anon_get_next_ptr(ahp, &index)) == NULL) + break; + + ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); + npages -= index - old; + if (npages <= 0) + break; + + (void) anon_set_ptr(ahp, index, NULL, ANON_SLEEP); + anon_decref(ap); + /* + * Bump index and decrement page count + */ + index++; + npages--; + } +} + +void +anon_free_pages( + struct anon_hdr *ahp, + ulong_t an_idx, + size_t size, + uint_t szc) +{ + spgcnt_t npages; + pgcnt_t pgcnt; + ulong_t index, off; + + ASSERT(szc != 0); + pgcnt = page_get_pagecnt(szc); + ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); + npages = btopr(size); + ASSERT(IS_P2ALIGNED(npages, pgcnt)); + ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); + + VM_STAT_ADD(anonvmstats.freepages[0]); + + while (npages > 0) { + index = an_idx; + + /* + * Find the next valid slot. + */ + if (anon_get_next_ptr(ahp, &index) == NULL) + break; + + ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); + /* + * Now backup index to the beginning of the + * current large page region of the old array. + */ + index = P2ALIGN(index, pgcnt); + off = index - an_idx; + ASSERT(IS_P2ALIGNED(off, pgcnt)); + npages -= off; + if (npages <= 0) + break; + + anon_decref_pages(ahp, index, szc); + + off += pgcnt; + an_idx += off; + npages -= pgcnt; + } +} + +/* + * Make anonymous pages discardable + */ +void +anon_disclaim(struct anon_map *amp, ulong_t index, size_t size, int flags) +{ + spgcnt_t npages = btopr(size); + struct anon *ap; + struct vnode *vp; + anoff_t off; + page_t *pp, *root_pp; + kmutex_t *ahm; + pgcnt_t pgcnt; + ulong_t old_idx, idx, i; + struct anon_hdr *ahp = amp->ahp; + anon_sync_obj_t cookie; + + ASSERT(RW_READ_HELD(&->a_rwlock)); + pgcnt = 1; + for (; npages > 0; index = (pgcnt == 1) ? index + 1: + P2ROUNDUP(index + 1, pgcnt), npages -= pgcnt) { + + /* + * get anon pointer and index for the first valid entry + * in the anon list, starting from "index" + */ + old_idx = index; + if ((ap = anon_get_next_ptr(ahp, &index)) == NULL) + break; + + /* + * decrement npages by number of NULL anon slots we skipped + */ + npages -= index - old_idx; + if (npages <= 0) + break; + + anon_array_enter(amp, index, &cookie); + ap = anon_get_ptr(ahp, index); + ASSERT(ap != NULL); + + /* + * Get anonymous page and try to lock it SE_EXCL; + * For non blocking case if we couldn't grab the lock + * we skip to next page. + * For blocking case (ANON_PGLOOKUP_BLK) block + * until we grab SE_EXCL lock. + */ + swap_xlate(ap, &vp, &off); + if (flags & ANON_PGLOOKUP_BLK) + pp = page_lookup_create(vp, (u_offset_t)off, + SE_EXCL, NULL, NULL, SE_EXCL_WANTED); + else + pp = page_lookup_nowait(vp, (u_offset_t)off, SE_EXCL); + if (pp == NULL) { + segadvstat.MADV_FREE_miss.value.ul++; + pgcnt = 1; + anon_array_exit(&cookie); + continue; + } + pgcnt = page_get_pagecnt(pp->p_szc); + + /* + * we cannot free a page which is permanently locked. 
+ * The page_struct_lock need not be acquired to examine + * these fields since the page has an "exclusive" lock. + */ + if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { + page_unlock(pp); + segadvstat.MADV_FREE_miss.value.ul++; + anon_array_exit(&cookie); + continue; + } + + ahm = &anonhash_lock[AH_LOCK(vp, off)]; + mutex_enter(ahm); + ASSERT(ap->an_refcnt != 0); + /* + * skip this one if copy-on-write is not yet broken. + */ + if (ap->an_refcnt > 1) { + mutex_exit(ahm); + page_unlock(pp); + segadvstat.MADV_FREE_miss.value.ul++; + anon_array_exit(&cookie); + continue; + } + + if (pp->p_szc == 0) { + pgcnt = 1; + + /* + * free swap slot; + */ + if (ap->an_pvp) { + swap_phys_free(ap->an_pvp, ap->an_poff, + PAGESIZE); + ap->an_pvp = NULL; + ap->an_poff = 0; + } + mutex_exit(ahm); + segadvstat.MADV_FREE_hit.value.ul++; + + /* + * while we are at it, unload all the translations + * and attempt to free the page. + */ + (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); + /*LINTED: constant in conditional context */ + VN_DISPOSE(pp, B_FREE, 0, kcred); + anon_array_exit(&cookie); + continue; + } + + pgcnt = page_get_pagecnt(pp->p_szc); + if (!IS_P2ALIGNED(index, pgcnt)) { + if (!page_try_demote_pages(pp)) { + mutex_exit(ahm); + page_unlock(pp); + segadvstat.MADV_FREE_miss.value.ul++; + anon_array_exit(&cookie); + continue; + } else { + pgcnt = 1; + if (ap->an_pvp) { + swap_phys_free(ap->an_pvp, + ap->an_poff, PAGESIZE); + ap->an_pvp = NULL; + ap->an_poff = 0; + } + mutex_exit(ahm); + (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); + /*LINTED*/ + VN_DISPOSE(pp, B_FREE, 0, kcred); + segadvstat.MADV_FREE_hit.value.ul++; + anon_array_exit(&cookie); + continue; + } + } + mutex_exit(ahm); + root_pp = pp; + + /* + * try to lock remaining pages + */ + for (idx = 1; idx < pgcnt; idx++) { + pp = page_next(pp); + if (!page_trylock(pp, SE_EXCL)) + break; + if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { + page_unlock(pp); + break; + } + } + + if (idx == pgcnt) { + for (i = 0; i < pgcnt; i++) { + ap = anon_get_ptr(ahp, index + i); + if (ap == NULL) + break; + swap_xlate(ap, &vp, &off); + ahm = &anonhash_lock[AH_LOCK(vp, off)]; + mutex_enter(ahm); + ASSERT(ap->an_refcnt != 0); + + /* + * skip this one if copy-on-write + * is not yet broken. + */ + if (ap->an_refcnt > 1) { + mutex_exit(ahm); + goto skiplp; + } + if (ap->an_pvp) { + swap_phys_free(ap->an_pvp, + ap->an_poff, PAGESIZE); + ap->an_pvp = NULL; + ap->an_poff = 0; + } + mutex_exit(ahm); + } + page_destroy_pages(root_pp); + segadvstat.MADV_FREE_hit.value.ul += pgcnt; + anon_array_exit(&cookie); + continue; + } +skiplp: + segadvstat.MADV_FREE_miss.value.ul += pgcnt; + for (i = 0, pp = root_pp; i < idx; pp = page_next(pp), i++) + page_unlock(pp); + anon_array_exit(&cookie); + } +} + +/* + * Return the kept page(s) and protections back to the segment driver. + */ +int +anon_getpage( + struct anon **app, + uint_t *protp, + page_t *pl[], + size_t plsz, + struct seg *seg, + caddr_t addr, + enum seg_rw rw, + struct cred *cred) +{ + page_t *pp; + struct anon *ap = *app; + struct vnode *vp; + anoff_t off; + int err; + kmutex_t *ahm; + + swap_xlate(ap, &vp, &off); + + /* + * Lookup the page. If page is being paged in, + * wait for it to finish as we must return a list of + * pages since this routine acts like the VOP_GETPAGE + * routine does. 
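The rule applied just below (and again after the VOP_GETPAGE call) is that an anon page whose slot refcnt is greater than one must never be handed out writable, so the first store faults and breaks copy-on-write. A tiny self-contained sketch of that protection rule follows; the constants are renamed stand-ins so the example does not collide with system headers.

#include <stdio.h>

#define	XPROT_READ	0x1
#define	XPROT_WRITE	0x2
#define	XPROT_ALL	(XPROT_READ | XPROT_WRITE)

/*
 * A shared (refcnt > 1) anonymous page must be mapped read-only so
 * that the first write triggers a copy-on-write fault.
 */
static unsigned int
anon_prot(unsigned int refcnt)
{
	return (refcnt == 1 ? XPROT_ALL : (XPROT_ALL & ~XPROT_WRITE));
}

int
main(void)
{
	/* Prints: refcnt 1 -> 0x3, refcnt 3 -> 0x1 */
	printf("refcnt 1 -> %#x, refcnt 3 -> %#x\n",
	    anon_prot(1), anon_prot(3));
	return (0);
}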
+ */ + if (pl != NULL && (pp = page_lookup(vp, (u_offset_t)off, SE_SHARED))) { + ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; + mutex_enter(ahm); + if (ap->an_refcnt == 1) + *protp = PROT_ALL; + else + *protp = PROT_ALL & ~PROT_WRITE; + mutex_exit(ahm); + pl[0] = pp; + pl[1] = NULL; + return (0); + } + + /* + * Simply treat it as a vnode fault on the anon vp. + */ + + TRACE_3(TR_FAC_VM, TR_ANON_GETPAGE, + "anon_getpage:seg %x addr %x vp %x", + seg, addr, vp); + + err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, protp, pl, plsz, + seg, addr, rw, cred); + + if (err == 0 && pl != NULL) { + ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; + mutex_enter(ahm); + if (ap->an_refcnt != 1) + *protp &= ~PROT_WRITE; /* make read-only */ + mutex_exit(ahm); + } + return (err); +} + +/* + * Creates or returns kept pages to the segment driver. returns -1 if a large + * page cannot be allocated. returns -2 if some other process has allocated a + * larger page. + * + * For cowfault it will alocate any size pages to fill the requested area to + * avoid partially overwritting anon slots (i.e. sharing only some of the anon + * slots within a large page with other processes). This policy greatly + * simplifies large page freeing (which is only freed when all anon slot + * refcnts are 0). + */ +int +anon_map_getpages( + struct anon_map *amp, + ulong_t start_idx, + uint_t szc, + struct seg *seg, + caddr_t addr, + uint_t prot, + uint_t *protp, + page_t *ppa[], + uint_t *ppa_szc, + struct vpage vpage[], + enum seg_rw rw, + int brkcow, + int anypgsz, + struct cred *cred) +{ + pgcnt_t pgcnt; + struct anon *ap; + struct vnode *vp; + anoff_t off; + page_t *pp, *pl[2], *conpp = NULL; + caddr_t vaddr; + ulong_t pg_idx, an_idx, i; + spgcnt_t nreloc = 0; + int prealloc = 1; + int err, slotcreate; + uint_t vpprot; + +#if !defined(__i386) && !defined(__amd64) + ASSERT(seg->s_szc != 0); +#endif + ASSERT(szc <= seg->s_szc); + ASSERT(ppa_szc != NULL); + ASSERT(rw != S_CREATE); + + *protp = PROT_ALL; + + VM_STAT_ADD(anonvmstats.getpages[0]); + + if (szc == 0) { + VM_STAT_ADD(anonvmstats.getpages[1]); + if ((ap = anon_get_ptr(amp->ahp, start_idx)) != NULL) { + err = anon_getpage(&ap, protp, pl, PAGESIZE, seg, + addr, rw, cred); + if (err) + return (err); + ppa[0] = pl[0]; + if (brkcow == 0 || (*protp & PROT_WRITE)) { + VM_STAT_ADD(anonvmstats.getpages[2]); + if (ppa[0]->p_szc != 0) { + VM_STAT_ADD(anonvmstats.getpages[3]); + *ppa_szc = ppa[0]->p_szc; + page_unlock(ppa[0]); + return (-2); + } + return (0); + } + panic("anon_map_getpages: cowfault for szc 0"); + } else { + VM_STAT_ADD(anonvmstats.getpages[4]); + ppa[0] = anon_zero(seg, addr, &ap, cred); + if (ppa[0] == NULL) + return (ENOMEM); + (void) anon_set_ptr(amp->ahp, start_idx, ap, + ANON_SLEEP); + return (0); + } + } + + pgcnt = page_get_pagecnt(szc); + ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); + ASSERT(IS_P2ALIGNED(start_idx, pgcnt)); + + /* + * First we check for the case that the requtested large + * page or larger page already exists in the system. + * Actually we only check if the first constituent page + * exists and only preallocate if it's not found. 
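The check that follows looks up only the first constituent page and compares its size code against the request before deciding whether to preallocate. A condensed sketch of that decision, with invented enum names standing in for the function's -2 return and its prealloc flag:

#include <stdio.h>

enum getpages_action { PREALLOC, USE_EXISTING, RETRY_LARGER };

/*
 * Only the first constituent page is looked up; its size code decides
 * whether to preallocate a fresh large page.
 */
static enum getpages_action
prealloc_decision(int found, unsigned int existing_szc, unsigned int want_szc)
{
	if (!found)
		return (PREALLOC);	/* nothing cached: take a fresh large page */
	if (existing_szc > want_szc)
		return (RETRY_LARGER);	/* corresponds to the -2 return */
	if (existing_szc == want_szc)
		return (USE_EXISTING);	/* let VOP_GETPAGE return the pages */
	return (PREALLOC);		/* only a smaller page exists */
}

int
main(void)
{
	/* Prints: 0 2 1 */
	printf("%d %d %d\n",
	    prealloc_decision(0, 0, 2),
	    prealloc_decision(1, 3, 2),
	    prealloc_decision(1, 2, 2));
	return (0);
}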
+ */ + ap = anon_get_ptr(amp->ahp, start_idx); + if (ap) { + uint_t pszc; + swap_xlate(ap, &vp, &off); + if (page_exists_forreal(vp, (u_offset_t)off, &pszc)) { + if (pszc > szc) { + *ppa_szc = pszc; + return (-2); + } + if (pszc == szc) { + prealloc = 0; + } + } + } + + VM_STAT_COND_ADD(prealloc == 0, anonvmstats.getpages[5]); + VM_STAT_COND_ADD(prealloc != 0, anonvmstats.getpages[6]); + +top: + /* + * If a smaller page or no page at all was found, + * grab a large page off the freelist. + */ + if (prealloc) { + ASSERT(conpp == NULL); + if (page_alloc_pages(seg, addr, NULL, ppa, szc, 0) != 0) { + VM_STAT_ADD(anonvmstats.getpages[7]); + if (brkcow == 0 || + !anon_share(amp->ahp, start_idx, pgcnt)) { + /* + * If the refcnt's of all anon slots are <= 1 + * they can't increase since we are holding + * the address space's lock. So segvn can + * safely decrease szc without risking to + * generate a cow fault for the region smaller + * than the segment's largest page size. + */ + VM_STAT_ADD(anonvmstats.getpages[8]); + return (-1); + } + docow: + /* + * This is a cow fault. Copy away the entire 1 large + * page region of this segment. + */ + if (szc != seg->s_szc) + panic("anon_map_getpages: cowfault for szc %d", + szc); + vaddr = addr; + for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt; + pg_idx++, an_idx++, vaddr += PAGESIZE) { + if ((ap = anon_get_ptr(amp->ahp, an_idx)) != + NULL) { + err = anon_getpage(&ap, &vpprot, pl, + PAGESIZE, seg, vaddr, rw, cred); + if (err) { + for (i = 0; i < pg_idx; i++) { + if ((pp = ppa[i]) != + NULL) + page_unlock(pp); + } + return (err); + } + ppa[pg_idx] = pl[0]; + } else { + /* + * Since this is a cowfault we know + * that this address space has a + * parent or children which means + * anon_dup_fill_holes() has initialized + * all anon slots within a large page + * region that had at least one anon + * slot at the time of fork(). + */ + panic("anon_map_getpages: " + "cowfault but anon slot is empty"); + } + } + VM_STAT_ADD(anonvmstats.getpages[9]); + *protp = PROT_ALL; + return (anon_map_privatepages(amp, start_idx, szc, seg, + addr, prot, ppa, vpage, anypgsz, cred)); + } + } + + VM_STAT_ADD(anonvmstats.getpages[10]); + + an_idx = start_idx; + pg_idx = 0; + vaddr = addr; + while (pg_idx < pgcnt) { + slotcreate = 0; + if ((ap = anon_get_ptr(amp->ahp, an_idx)) == NULL) { + VM_STAT_ADD(anonvmstats.getpages[11]); + /* + * For us to have decided not to preallocate + * would have meant that a large page + * was found. Which also means that all of the + * anon slots for that page would have been + * already created for us. + */ + if (prealloc == 0) + panic("anon_map_getpages: prealloc = 0"); + + slotcreate = 1; + ap = anon_alloc(NULL, 0); + } + swap_xlate(ap, &vp, &off); + + /* + * Now setup our preallocated page to pass down + * to swap_getpage(). + */ + if (prealloc) { + ASSERT(ppa[pg_idx]->p_szc == szc); + conpp = ppa[pg_idx]; + } + ASSERT(prealloc || conpp == NULL); + + /* + * If we just created this anon slot then call + * with S_CREATE to prevent doing IO on the page. + * Similar to the anon_zero case. + */ + err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, + NULL, pl, PAGESIZE, conpp, &nreloc, seg, vaddr, + slotcreate == 1 ? 
S_CREATE : rw, cred); + + if (err) { + VM_STAT_ADD(anonvmstats.getpages[12]); + ASSERT(slotcreate == 0); + goto io_err; + } + + pp = pl[0]; + + if (pp->p_szc != szc) { + VM_STAT_ADD(anonvmstats.getpages[13]); + ASSERT(slotcreate == 0); + ASSERT(prealloc == 0); + ASSERT(pg_idx == 0); + if (pp->p_szc > szc) { + page_unlock(pp); + VM_STAT_ADD(anonvmstats.getpages[14]); + return (-2); + } + page_unlock(pp); + prealloc = 1; + goto top; + } + + /* + * If we decided to preallocate but VOP_GETPAGE + * found a page in the system that satisfies our + * request then free up our preallocated large page + * and continue looping accross the existing large + * page via VOP_GETPAGE. + */ + if (prealloc && pp != ppa[pg_idx]) { + VM_STAT_ADD(anonvmstats.getpages[15]); + ASSERT(slotcreate == 0); + ASSERT(pg_idx == 0); + conpp = NULL; + prealloc = 0; + page_free_pages(ppa[0]); + } + + if (prealloc && nreloc > 1) { + /* + * we have relocated out of a smaller large page. + * skip npgs - 1 iterations and continue which will + * increment by one the loop indices. + */ + spgcnt_t npgs = nreloc; + + VM_STAT_ADD(anonvmstats.getpages[16]); + + ASSERT(pp == ppa[pg_idx]); + ASSERT(slotcreate == 0); + ASSERT(pg_idx + npgs <= pgcnt); + if ((*protp & PROT_WRITE) && + anon_share(amp->ahp, an_idx, npgs)) { + *protp &= ~PROT_WRITE; + } + pg_idx += npgs; + an_idx += npgs; + vaddr += PAGESIZE * npgs; + continue; + } + + VM_STAT_ADD(anonvmstats.getpages[17]); + + /* + * Anon_zero case. + */ + if (slotcreate) { + ASSERT(prealloc); + pagezero(pp, 0, PAGESIZE); + CPU_STATS_ADD_K(vm, zfod, 1); + hat_setrefmod(pp); + } + + ASSERT(prealloc == 0 || ppa[pg_idx] == pp); + ASSERT(prealloc != 0 || PAGE_SHARED(pp)); + ASSERT(prealloc == 0 || PAGE_EXCL(pp)); + + if (pg_idx > 0 && + ((page_pptonum(pp) != page_pptonum(ppa[pg_idx - 1]) + 1) || + (pp->p_szc != ppa[pg_idx - 1]->p_szc))) + panic("anon_map_getpages: unexpected page"); + + if (prealloc == 0) { + ppa[pg_idx] = pp; + } + + if (ap->an_refcnt > 1) { + VM_STAT_ADD(anonvmstats.getpages[18]); + *protp &= ~PROT_WRITE; + } + + /* + * If this is a new anon slot then initialize + * the anon array entry. + */ + if (slotcreate) { + (void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP); + } + pg_idx++; + an_idx++; + vaddr += PAGESIZE; + } + + /* + * Since preallocated pages come off the freelist + * they are locked SE_EXCL. Simply downgrade and return. + */ + if (prealloc) { + VM_STAT_ADD(anonvmstats.getpages[19]); + conpp = NULL; + for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { + page_downgrade(ppa[pg_idx]); + } + } + ASSERT(conpp == NULL); + + if (brkcow == 0 || (*protp & PROT_WRITE)) { + VM_STAT_ADD(anonvmstats.getpages[20]); + return (0); + } + + if (szc < seg->s_szc) + panic("anon_map_getpages: cowfault for szc %d", szc); + + VM_STAT_ADD(anonvmstats.getpages[21]); + + *protp = PROT_ALL; + return (anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, + ppa, vpage, anypgsz, cred)); +io_err: + /* + * We got an IO error somewhere in our large page. + * If we were using a preallocated page then just demote + * all the constituent pages that we've succeeded with sofar + * to PAGESIZE pages and leave them in the system + * unlocked. 
+ */ + + ASSERT(err != -2 || pg_idx == 0); + + VM_STAT_COND_ADD(err > 0, anonvmstats.getpages[22]); + VM_STAT_COND_ADD(err == -1, anonvmstats.getpages[23]); + VM_STAT_COND_ADD(err == -2, anonvmstats.getpages[24]); + + if (prealloc) { + conpp = NULL; + if (pg_idx > 0) { + VM_STAT_ADD(anonvmstats.getpages[25]); + for (i = 0; i < pgcnt; i++) { + pp = ppa[i]; + ASSERT(PAGE_EXCL(pp)); + ASSERT(pp->p_szc == szc); + pp->p_szc = 0; + } + for (i = 0; i < pg_idx; i++) { + ASSERT(!hat_page_is_mapped(ppa[i])); + page_unlock(ppa[i]); + } + /* + * Now free up the remaining unused constituent + * pages. + */ + while (pg_idx < pgcnt) { + ASSERT(!hat_page_is_mapped(ppa[pg_idx])); + page_free(ppa[pg_idx], 0); + pg_idx++; + } + } else { + VM_STAT_ADD(anonvmstats.getpages[26]); + page_free_pages(ppa[0]); + } + } else { + VM_STAT_ADD(anonvmstats.getpages[27]); + ASSERT(err > 0); + for (i = 0; i < pg_idx; i++) + page_unlock(ppa[i]); + } + ASSERT(conpp == NULL); + if (err != -1) + return (err); + /* + * we are here because we failed to relocate. + */ + ASSERT(prealloc); + if (brkcow == 0 || !anon_share(amp->ahp, start_idx, pgcnt)) { + VM_STAT_ADD(anonvmstats.getpages[28]); + return (-1); + } + VM_STAT_ADD(anonvmstats.getpages[29]); + goto docow; +} + + +/* + * Turn a reference to an object or shared anon page + * into a private page with a copy of the data from the + * original page which is always locked by the caller. + * This routine unloads the translation and unlocks the + * original page, if it isn't being stolen, before returning + * to the caller. + * + * NOTE: The original anon slot is not freed by this routine + * It must be freed by the caller while holding the + * "anon_map" lock to prevent races which can occur if + * a process has multiple lwps in its address space. + */ +page_t * +anon_private( + struct anon **app, + struct seg *seg, + caddr_t addr, + uint_t prot, + page_t *opp, + int oppflags, + struct cred *cred) +{ + struct anon *old = *app; + struct anon *new; + page_t *pp = NULL; + struct vnode *vp; + anoff_t off; + page_t *anon_pl[1 + 1]; + int err; + + if (oppflags & STEAL_PAGE) + ASSERT(PAGE_EXCL(opp)); + else + ASSERT(PAGE_LOCKED(opp)); + + CPU_STATS_ADD_K(vm, cow_fault, 1); + + /* Kernel probe */ + TNF_PROBE_1(anon_private, "vm pagefault", /* CSTYLED */, + tnf_opaque, address, addr); + + *app = new = anon_alloc(NULL, 0); + swap_xlate(new, &vp, &off); + + if (oppflags & STEAL_PAGE) { + page_rename(opp, vp, (u_offset_t)off); + pp = opp; + TRACE_5(TR_FAC_VM, TR_ANON_PRIVATE, + "anon_private:seg %p addr %x pp %p vp %p off %lx", + seg, addr, pp, vp, off); + hat_setmod(pp); + + /* bug 4026339 */ + page_downgrade(pp); + return (pp); + } + + /* + * Call the VOP_GETPAGE routine to create the page, thereby + * enabling the vnode driver to allocate any filesystem + * space (e.g., disk block allocation for UFS). This also + * prevents more than one page from being added to the + * vnode at the same time. + */ + err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, NULL, + anon_pl, PAGESIZE, seg, addr, S_CREATE, cred); + if (err) + goto out; + + pp = anon_pl[0]; + + /* + * If the original page was locked, we need to move the lock + * to the new page by transfering 'cowcnt/lckcnt' of the original + * page to 'cowcnt/lckcnt' of the new page. + * + * See Statement at the beginning of segvn_lockop() and + * comments in page_pp_useclaim() regarding the way + * cowcnts/lckcnts are handled. + * + * Also availrmem must be decremented up front for read only mapping + * before calling page_pp_useclaim. 
page_pp_useclaim will bump it back + * if availrmem did not need to be decremented after all. + */ + if (oppflags & LOCK_PAGE) { + if ((prot & PROT_WRITE) == 0) { + mutex_enter(&freemem_lock); + if (availrmem > pages_pp_maximum) { + availrmem--; + pages_useclaim++; + } else { + mutex_exit(&freemem_lock); + goto out; + } + mutex_exit(&freemem_lock); + } + page_pp_useclaim(opp, pp, prot & PROT_WRITE); + } + + /* + * Now copy the contents from the original page, + * which is locked and loaded in the MMU by + * the caller to prevent yet another page fault. + */ + ppcopy(opp, pp); /* XXX - should set mod bit in here */ + + hat_setrefmod(pp); /* mark as modified */ + + /* + * Unload the old translation. + */ + hat_unload(seg->s_as->a_hat, addr, PAGESIZE, HAT_UNLOAD); + + /* + * Free unmapped, unmodified original page. + * or release the lock on the original page, + * otherwise the process will sleep forever in + * anon_decref() waiting for the "exclusive" lock + * on the page. + */ + (void) page_release(opp, 1); + + /* + * we are done with page creation so downgrade the new + * page's selock to shared, this helps when multiple + * as_fault(...SOFTLOCK...) are done to the same + * page(aio) + */ + page_downgrade(pp); + + /* + * NOTE: The original anon slot must be freed by the + * caller while holding the "anon_map" lock, if we + * copied away from an anonymous page. + */ + return (pp); + +out: + *app = old; + if (pp) + page_unlock(pp); + anon_decref(new); + page_unlock(opp); + return ((page_t *)NULL); +} + +int +anon_map_privatepages( + struct anon_map *amp, + ulong_t start_idx, + uint_t szc, + struct seg *seg, + caddr_t addr, + uint_t prot, + page_t *ppa[], + struct vpage vpage[], + int anypgsz, + struct cred *cred) +{ + pgcnt_t pgcnt; + struct vnode *vp; + anoff_t off; + page_t *pl[2], *conpp = NULL; + int err; + int prealloc = 1; + struct anon *ap, *oldap; + caddr_t vaddr; + page_t *pplist, *pp; + ulong_t pg_idx, an_idx; + spgcnt_t nreloc = 0; + int pagelock = 0; + kmutex_t *ahmpages = NULL; +#ifdef DEBUG + int refcnt; +#endif + + ASSERT(szc != 0); + ASSERT(szc == seg->s_szc); + + VM_STAT_ADD(anonvmstats.privatepages[0]); + + pgcnt = page_get_pagecnt(szc); + ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); + ASSERT(IS_P2ALIGNED(start_idx, pgcnt)); + + ASSERT(amp != NULL); + ap = anon_get_ptr(amp->ahp, start_idx); + ASSERT(ap == NULL || ap->an_refcnt >= 1); + + VM_STAT_COND_ADD(ap == NULL, anonvmstats.privatepages[1]); + + /* + * Now try and allocate the large page. If we fail then just + * let VOP_GETPAGE give us PAGESIZE pages. Normally we let + * the caller make this decision but to avoid added complexity + * it's simplier to handle that case here. + */ + if (anypgsz == -1) { + VM_STAT_ADD(anonvmstats.privatepages[2]); + prealloc = 0; + } else if (page_alloc_pages(seg, addr, &pplist, NULL, szc, + anypgsz) != 0) { + VM_STAT_ADD(anonvmstats.privatepages[3]); + prealloc = 0; + } + + /* + * make the decrement of all refcnts of all + * anon slots of a large page appear atomic by + * getting an anonpages_hash_lock for the + * first anon slot of a large page. 
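The "appear atomic" trick described in the comment above relies on hashing the identity of the first anon slot onto one mutex out of a fixed array, so every thread working on that large page contends on the same lock. A user-space pthreads sketch of that hashed-lock pattern follows; the array size, hash function, and names are invented for illustration and do not reproduce AH_LOCK() or anonpages_hash_lock.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define	NLOCKS	64			/* power of two, like the hash array above */
static pthread_mutex_t hash_lock[NLOCKS];

/*
 * Pick one lock out of a fixed array based on an (object, offset) pair,
 * so that all operations on the same pair serialize on the same mutex.
 * The hash below is arbitrary and only for demonstration.
 */
static pthread_mutex_t *
pick_lock(const void *obj, uint64_t off)
{
	uintptr_t h = (uintptr_t)obj ^ (uintptr_t)(off >> 12);
	return (&hash_lock[h & (NLOCKS - 1)]);
}

int
main(void)
{
	int dummy;
	int i;

	for (i = 0; i < NLOCKS; i++)
		pthread_mutex_init(&hash_lock[i], NULL);

	pthread_mutex_t *m = pick_lock(&dummy, 0x2000);
	pthread_mutex_lock(m);
	/* ... refcnt updates for every slot of the large page go here ... */
	pthread_mutex_unlock(m);
	printf("picked lock index %ld\n", (long)(m - hash_lock));
	return (0);
}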
+ */ + if (ap != NULL) { + ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, + ap->an_off)]; + mutex_enter(ahmpages); + if (ap->an_refcnt == 1) { + VM_STAT_ADD(anonvmstats.privatepages[4]); + ASSERT(!anon_share(amp->ahp, start_idx, pgcnt)); + mutex_exit(ahmpages); + + if (prealloc) { + page_free_replacement_page(pplist); + page_create_putback(pgcnt); + } + ASSERT(ppa[0]->p_szc <= szc); + if (ppa[0]->p_szc == szc) { + VM_STAT_ADD(anonvmstats.privatepages[5]); + return (0); + } + for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { + ASSERT(ppa[pg_idx] != NULL); + page_unlock(ppa[pg_idx]); + } + return (-1); + } + } + + /* + * If we are passed in the vpage array and this is + * not PROT_WRITE then we need to decrement availrmem + * up front before we try anything. If we need to and + * can't decrement availrmem then its better to fail now + * than in the middle of processing the new large page. + * page_pp_usclaim() on behalf of each constituent page + * below will adjust availrmem back for the cases not needed. + */ + if (vpage != NULL && (prot & PROT_WRITE) == 0) { + for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { + if (VPP_ISPPLOCK(&vpage[pg_idx])) { + pagelock = 1; + break; + } + } + if (pagelock) { + VM_STAT_ADD(anonvmstats.privatepages[6]); + mutex_enter(&freemem_lock); + if (availrmem >= pages_pp_maximum + pgcnt) { + availrmem -= pgcnt; + pages_useclaim += pgcnt; + } else { + VM_STAT_ADD(anonvmstats.privatepages[7]); + mutex_exit(&freemem_lock); + if (ahmpages != NULL) { + mutex_exit(ahmpages); + } + if (prealloc) { + page_free_replacement_page(pplist); + page_create_putback(pgcnt); + } + for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) + if (ppa[pg_idx] != NULL) + page_unlock(ppa[pg_idx]); + return (ENOMEM); + } + mutex_exit(&freemem_lock); + } + } + + CPU_STATS_ADD_K(vm, cow_fault, pgcnt); + + VM_STAT_ADD(anonvmstats.privatepages[8]); + + an_idx = start_idx; + pg_idx = 0; + vaddr = addr; + for (; pg_idx < pgcnt; pg_idx++, an_idx++, vaddr += PAGESIZE) { + ASSERT(ppa[pg_idx] != NULL); + oldap = anon_get_ptr(amp->ahp, an_idx); + ASSERT(ahmpages != NULL || oldap == NULL); + ASSERT(ahmpages == NULL || oldap != NULL); + ASSERT(ahmpages == NULL || oldap->an_refcnt > 1); + ASSERT(ahmpages == NULL || pg_idx != 0 || + (refcnt = oldap->an_refcnt)); + ASSERT(ahmpages == NULL || pg_idx == 0 || + refcnt == oldap->an_refcnt); + + ap = anon_alloc(NULL, 0); + + swap_xlate(ap, &vp, &off); + + /* + * Now setup our preallocated page to pass down to + * swap_getpage(). + */ + if (prealloc) { + pp = pplist; + page_sub(&pplist, pp); + conpp = pp; + } + + err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, NULL, pl, + PAGESIZE, conpp, &nreloc, seg, vaddr, S_CREATE, cred); + + /* + * Impossible to fail this is S_CREATE. + */ + if (err) + panic("anon_map_privatepages: VOP_GETPAGE failed"); + + ASSERT(prealloc ? pp == pl[0] : pl[0]->p_szc == 0); + ASSERT(prealloc == 0 || nreloc == 1); + + pp = pl[0]; + + /* + * If the original page was locked, we need to move + * the lock to the new page by transfering + * 'cowcnt/lckcnt' of the original page to 'cowcnt/lckcnt' + * of the new page. pg_idx can be used to index + * into the vpage array since the caller will guarentee + * that vpage struct passed in corresponds to addr + * and forward. + */ + if (vpage != NULL && VPP_ISPPLOCK(&vpage[pg_idx])) { + page_pp_useclaim(ppa[pg_idx], pp, prot & PROT_WRITE); + } else if (pagelock) { + mutex_enter(&freemem_lock); + availrmem++; + pages_useclaim--; + mutex_exit(&freemem_lock); + } + + /* + * Now copy the contents from the original page. 
+ */ + ppcopy(ppa[pg_idx], pp); + + hat_setrefmod(pp); /* mark as modified */ + + /* + * Release the lock on the original page, + * derement the old slot, and down grade the lock + * on the new copy. + */ + page_unlock(ppa[pg_idx]); + + if (!prealloc) + page_downgrade(pp); + + ppa[pg_idx] = pp; + + /* + * Now reflect the copy in the new anon array. + */ + ASSERT(ahmpages == NULL || oldap->an_refcnt > 1); + if (oldap != NULL) + anon_decref(oldap); + (void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP); + } + if (ahmpages != NULL) { + mutex_exit(ahmpages); + } + ASSERT(prealloc == 0 || pplist == NULL); + if (prealloc) { + VM_STAT_ADD(anonvmstats.privatepages[9]); + for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { + page_downgrade(ppa[pg_idx]); + } + } + + /* + * Unload the old large page translation. + */ + hat_unload(seg->s_as->a_hat, addr, pgcnt << PAGESHIFT, HAT_UNLOAD); + return (0); +} + +/* + * Allocate a private zero-filled anon page. + */ +page_t * +anon_zero(struct seg *seg, caddr_t addr, struct anon **app, struct cred *cred) +{ + struct anon *ap; + page_t *pp; + struct vnode *vp; + anoff_t off; + page_t *anon_pl[1 + 1]; + int err; + + /* Kernel probe */ + TNF_PROBE_1(anon_zero, "vm pagefault", /* CSTYLED */, + tnf_opaque, address, addr); + + *app = ap = anon_alloc(NULL, 0); + swap_xlate(ap, &vp, &off); + + /* + * Call the VOP_GETPAGE routine to create the page, thereby + * enabling the vnode driver to allocate any filesystem + * dependent structures (e.g., disk block allocation for UFS). + * This also prevents more than on page from being added to + * the vnode at the same time since it is locked. + */ + err = VOP_GETPAGE(vp, off, PAGESIZE, NULL, + anon_pl, PAGESIZE, seg, addr, S_CREATE, cred); + if (err) { + *app = NULL; + anon_decref(ap); + return (NULL); + } + pp = anon_pl[0]; + + pagezero(pp, 0, PAGESIZE); /* XXX - should set mod bit */ + page_downgrade(pp); + CPU_STATS_ADD_K(vm, zfod, 1); + hat_setrefmod(pp); /* mark as modified so pageout writes back */ + return (pp); +} + + +/* + * Allocate array of private zero-filled anon pages for empty slots + * and kept pages for non empty slots within given range. + * + * NOTE: This rontine will try and use large pages + * if available and supported by underlying platform. + */ +int +anon_map_createpages( + struct anon_map *amp, + ulong_t start_index, + size_t len, + page_t *ppa[], + struct seg *seg, + caddr_t addr, + enum seg_rw rw, + struct cred *cred) +{ + + struct anon *ap; + struct vnode *ap_vp; + page_t *pp, *pplist, *anon_pl[1 + 1], *conpp = NULL; + int err = 0; + ulong_t p_index, index; + pgcnt_t npgs, pg_cnt; + spgcnt_t nreloc = 0; + uint_t l_szc, szc, prot; + anoff_t ap_off; + size_t pgsz; + lgrp_t *lgrp; + + /* + * XXX For now only handle S_CREATE. + */ + ASSERT(rw == S_CREATE); + + index = start_index; + p_index = 0; + npgs = btopr(len); + + /* + * If this platform supports multiple page sizes + * then try and allocate directly from the free + * list for pages larger than PAGESIZE. + * + * NOTE:When we have page_create_ru we can stop + * directly allocating from the freelist. + */ + l_szc = seg->s_szc; + ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); + while (npgs) { + + /* + * if anon slot already exists + * (means page has been created) + * so 1) look up the page + * 2) if the page is still in memory, get it. + * 3) if not, create a page and + * page in from physical swap device. + * These are done in anon_getpage(). 
+ */ + ap = anon_get_ptr(amp->ahp, index); + if (ap) { + err = anon_getpage(&ap, &prot, anon_pl, PAGESIZE, + seg, addr, S_READ, cred); + if (err) { + ANON_LOCK_EXIT(&->a_rwlock); + panic("anon_map_createpages: anon_getpage"); + } + pp = anon_pl[0]; + ppa[p_index++] = pp; + + addr += PAGESIZE; + index++; + npgs--; + continue; + } + /* + * Now try and allocate the largest page possible + * for the current address and range. + * Keep dropping down in page size until: + * + * 1) Properly aligned + * 2) Does not overlap existing anon pages + * 3) Fits in remaining range. + * 4) able to allocate one. + * + * NOTE: XXX When page_create_ru is completed this code + * will change. + */ + szc = l_szc; + pplist = NULL; + pg_cnt = 0; + while (szc) { + pgsz = page_get_pagesize(szc); + pg_cnt = pgsz >> PAGESHIFT; + if (IS_P2ALIGNED(addr, pgsz) && pg_cnt <= npgs && + anon_pages(amp->ahp, index, pg_cnt) == 0) { + /* + * XXX + * Since we are faking page_create() + * we also need to do the freemem and + * pcf accounting. + */ + (void) page_create_wait(pg_cnt, PG_WAIT); + + /* + * Get lgroup to allocate next page of shared + * memory from and use it to specify where to + * allocate the physical memory + */ + lgrp = lgrp_mem_choose(seg, addr, pgsz); + + pplist = page_get_freelist( + (struct vnode *)NULL, (u_offset_t)0, seg, + addr, pgsz, 0, lgrp); + + if (pplist == NULL) { + page_create_putback(pg_cnt); + } + + /* + * If a request for a page of size + * larger than PAGESIZE failed + * then don't try that size anymore. + */ + if (pplist == NULL) { + l_szc = szc - 1; + } else { + break; + } + } + szc--; + } + + /* + * If just using PAGESIZE pages then don't + * directly allocate from the free list. + */ + if (pplist == NULL) { + ASSERT(szc == 0); + pp = anon_zero(seg, addr, &ap, cred); + if (pp == NULL) { + ANON_LOCK_EXIT(&->a_rwlock); + panic("anon_map_createpages: anon_zero"); + } + ppa[p_index++] = pp; + + ASSERT(anon_get_ptr(amp->ahp, index) == NULL); + (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP); + + addr += PAGESIZE; + index++; + npgs--; + continue; + } + + /* + * pplist is a list of pg_cnt PAGESIZE pages. + * These pages are locked SE_EXCL since they + * came directly off the free list. 
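The page-size descent a few lines above keeps trying smaller size codes until the candidate size is aligned at the address, fits in the remaining range, and does not overlap existing anon slots. A simplified sketch of that loop; the per-szc sizes are hypothetical, and the overlap test is collapsed to a flag for brevity.

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define	XPAGESHIFT	12	/* assumed 4K base page for the sketch */

/* Hypothetical per-szc page sizes: szc 0 = 4K, 1 = 64K, 2 = 4M. */
static const size_t pagesize_by_szc[] = { 1 << 12, 1 << 16, 1 << 22 };

/*
 * Starting from the segment's largest size code, keep dropping until
 * the candidate size is aligned, fits, and does not overlap.
 */
static unsigned int
pick_szc(unsigned int l_szc, uintptr_t addr, size_t npgs_left, int overlap)
{
	unsigned int szc;

	for (szc = l_szc; szc > 0; szc--) {
		size_t pgsz = pagesize_by_szc[szc];
		size_t pg_cnt = pgsz >> XPAGESHIFT;

		if ((addr & (pgsz - 1)) == 0 && pg_cnt <= npgs_left && !overlap)
			return (szc);
	}
	return (0);	/* fall back to PAGESIZE pages */
}

int
main(void)
{
	/* 0x40010000 is 64K-aligned but not 4M-aligned, so szc 1 is picked. */
	printf("szc picked: %u\n", pick_szc(2, 0x40010000, 1024, 0));
	return (0);
}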
+ */ + ASSERT(IS_P2ALIGNED(pg_cnt, pg_cnt)); + ASSERT(IS_P2ALIGNED(index, pg_cnt)); + ASSERT(conpp == NULL); + while (pg_cnt--) { + + ap = anon_alloc(NULL, 0); + swap_xlate(ap, &ap_vp, &ap_off); + + ASSERT(pplist != NULL); + pp = pplist; + page_sub(&pplist, pp); + PP_CLRFREE(pp); + PP_CLRAGED(pp); + conpp = pp; + + err = swap_getconpage(ap_vp, ap_off, PAGESIZE, + (uint_t *)NULL, anon_pl, PAGESIZE, conpp, &nreloc, + seg, addr, S_CREATE, cred); + + if (err) { + ANON_LOCK_EXIT(&->a_rwlock); + panic("anon_map_createpages: S_CREATE"); + } + + ASSERT(anon_pl[0] == pp); + ASSERT(nreloc == 1); + pagezero(pp, 0, PAGESIZE); + CPU_STATS_ADD_K(vm, zfod, 1); + hat_setrefmod(pp); + + ASSERT(anon_get_ptr(amp->ahp, index) == NULL); + (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP); + + ppa[p_index++] = pp; + + addr += PAGESIZE; + index++; + npgs--; + } + conpp = NULL; + pg_cnt = pgsz >> PAGESHIFT; + p_index = p_index - pg_cnt; + while (pg_cnt--) { + page_downgrade(ppa[p_index++]); + } + } + ANON_LOCK_EXIT(&->a_rwlock); + return (0); +} + +int +anon_map_demotepages( + struct anon_map *amp, + ulong_t start_idx, + struct seg *seg, + caddr_t addr, + uint_t prot, + struct vpage vpage[], + struct cred *cred) +{ + struct anon *ap; + uint_t szc = seg->s_szc; + pgcnt_t pgcnt = page_get_pagecnt(szc); + size_t ppasize = pgcnt * sizeof (page_t *); + page_t **ppa = kmem_alloc(ppasize, KM_SLEEP); + page_t *pp; + page_t *pl[2]; + pgcnt_t i, pg_idx; + ulong_t an_idx; + caddr_t vaddr; + kmutex_t *ahmpages = NULL; + int err; + int retry = 0; + uint_t vpprot; + + ASSERT(RW_WRITE_HELD(&->a_rwlock)); + ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); + ASSERT(IS_P2ALIGNED(start_idx, pgcnt)); + ASSERT(ppa != NULL); + + VM_STAT_ADD(anonvmstats.demotepages[0]); + + ap = anon_get_ptr(amp->ahp, start_idx); + if (ap != NULL) { + VM_STAT_ADD(anonvmstats.demotepages[1]); + ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; + mutex_enter(ahmpages); + } +top: + if (ap == NULL || ap->an_refcnt <= 1) { + int root = 0; + pgcnt_t npgs, curnpgs = 0; + + VM_STAT_ADD(anonvmstats.demotepages[2]); + + ASSERT(retry == 0 || ap != NULL); + + if (ahmpages != NULL) + mutex_exit(ahmpages); + an_idx = start_idx; + for (i = 0; i < pgcnt; i++, an_idx++) { + ap = anon_get_ptr(amp->ahp, an_idx); + if (ap != NULL) { + ASSERT(ap->an_refcnt == 1); + pp = ppa[i] = page_lookup(ap->an_vp, ap->an_off, + SE_EXCL); + if (pp != NULL) { + (void) hat_pageunload(pp, + HAT_FORCE_PGUNLOAD); + } + } else { + ppa[i] = NULL; + } + } + for (i = 0; i < pgcnt; i++) { + if ((pp = ppa[i]) != NULL && pp->p_szc != 0) { + ASSERT(pp->p_szc <= szc); + if (!root) { + VM_STAT_ADD(anonvmstats.demotepages[3]); + if (curnpgs != 0) + panic("anon_map_demotepages: " + "bad large page"); + + root = 1; + curnpgs = npgs = + page_get_pagecnt(pp->p_szc); + + ASSERT(npgs <= pgcnt); + ASSERT(IS_P2ALIGNED(npgs, npgs)); + ASSERT(!(page_pptonum(pp) & + (npgs - 1))); + } else { + ASSERT(i > 0); + ASSERT(page_pptonum(pp) - 1 == + page_pptonum(ppa[i - 1])); + if ((page_pptonum(pp) & (npgs - 1)) == + npgs - 1) + root = 0; + } + ASSERT(PAGE_EXCL(pp)); + pp->p_szc = 0; + curnpgs--; + } + } + if (root != 0 || curnpgs != 0) + panic("anon_map_demotepages: bad large page"); + + for (i = 0; i < pgcnt; i++) { + if ((pp = ppa[i]) != NULL) { + ASSERT(!hat_page_is_mapped(pp)); + ASSERT(pp->p_szc == 0); + page_unlock(pp); + } + } + kmem_free(ppa, ppasize); + return (0); + } + ASSERT(ahmpages != NULL); + mutex_exit(ahmpages); + ahmpages = NULL; + + VM_STAT_ADD(anonvmstats.demotepages[4]); + + ASSERT(retry == 
0); /* we can be here only once */ + + vaddr = addr; + for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt; + pg_idx++, an_idx++, vaddr += PAGESIZE) { + ap = anon_get_ptr(amp->ahp, an_idx); + if (ap == NULL) + panic("anon_map_demotepages: no anon slot"); + err = anon_getpage(&ap, &vpprot, pl, PAGESIZE, seg, vaddr, + S_READ, cred); + if (err) { + for (i = 0; i < pg_idx; i++) { + if ((pp = ppa[i]) != NULL) + page_unlock(pp); + } + kmem_free(ppa, ppasize); + return (err); + } + ppa[pg_idx] = pl[0]; + } + + err = anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, ppa, + vpage, -1, cred); + if (err > 0) { + VM_STAT_ADD(anonvmstats.demotepages[5]); + kmem_free(ppa, ppasize); + return (err); + } + ASSERT(err == 0 || err == -1); + if (err == -1) { + VM_STAT_ADD(anonvmstats.demotepages[6]); + retry = 1; + goto top; + } + for (i = 0; i < pgcnt; i++) { + ASSERT(ppa[i] != NULL); + if (ppa[i]->p_szc != 0) + retry = 1; + page_unlock(ppa[i]); + } + if (retry) { + VM_STAT_ADD(anonvmstats.demotepages[7]); + goto top; + } + + VM_STAT_ADD(anonvmstats.demotepages[8]); + + kmem_free(ppa, ppasize); + + return (0); +} + +/* + * Allocate and initialize an anon_map structure for seg + * associating the given swap reservation with the new anon_map. + */ +struct anon_map * +anonmap_alloc(size_t size, size_t swresv) +{ + struct anon_map *amp; + + amp = kmem_cache_alloc(anonmap_cache, KM_SLEEP); + + amp->refcnt = 1; + amp->size = size; + + amp->ahp = anon_create(btopr(size), ANON_SLEEP); + amp->swresv = swresv; + amp->locality = 0; + amp->a_szc = 0; + return (amp); +} + +void +anonmap_free(struct anon_map *amp) +{ + ASSERT(amp->ahp); + ASSERT(amp->refcnt == 0); + + lgrp_shm_policy_fini(amp, NULL); + anon_release(amp->ahp, btopr(amp->size)); + kmem_cache_free(anonmap_cache, amp); +} + +/* + * Returns true if the app array has some empty slots. + * The offp and lenp paramters are in/out paramters. On entry + * these values represent the starting offset and length of the + * mapping. When true is returned, these values may be modified + * to be the largest range which includes empty slots. + */ +int +non_anon(struct anon_hdr *ahp, ulong_t anon_idx, u_offset_t *offp, + size_t *lenp) +{ + ulong_t i, el; + ssize_t low, high; + struct anon *ap; + + low = -1; + for (i = 0, el = *lenp; i < el; i += PAGESIZE, anon_idx++) { + ap = anon_get_ptr(ahp, anon_idx); + if (ap == NULL) { + if (low == -1) + low = i; + high = i; + } + } + if (low != -1) { + /* + * Found at least one non-anon page. + * Set up the off and len return values. + */ + if (low != 0) + *offp += low; + *lenp = high - low + PAGESIZE; + return (1); + } + return (0); +} + +/* + * Return a count of the number of existing anon pages in the anon array + * app in the range (off, off+len). The array and slots must be guaranteed + * stable by the caller. + */ +pgcnt_t +anon_pages(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots) +{ + pgcnt_t cnt = 0; + + while (nslots-- > 0) { + if ((anon_get_ptr(ahp, anon_index)) != NULL) + cnt++; + anon_index++; + } + return (cnt); +} + +/* + * Move reserved phys swap into memory swap (unreserve phys swap + * and reserve mem swap by the same amount). 
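anon_swap_adjust() below implements the transfer described in this comment with a handful of counters. The following is a simplified, self-contained sketch of that accounting; the field and parameter names are stand-ins for k_anoninfo, availrmem, and segspt_minfree, and no locking is shown.

#include <stdio.h>

/* Simplified stand-ins for the swap accounting counters used below. */
struct swapinfo {
	long mem_resv;		/* swap reserved against physical memory */
	long phys_resv;		/* swap reserved against swap devices */
	long locked_swap;	/* memory swap locked in core */
	long availrmem;		/* pageable memory still available */
};

/*
 * Ensure npages of memory swap are available to be locked, shifting
 * reservation from the physical pool if necessary.  Returns 0 on
 * success, -1 if that would dip below minfree.
 */
static int
swap_adjust(struct swapinfo *si, long npages, long minfree)
{
	long unlocked = si->mem_resv - si->locked_swap;

	if (npages > unlocked) {
		long adjust = npages - unlocked;

		if (si->availrmem < adjust + minfree)
			return (-1);
		si->availrmem -= adjust;
		si->mem_resv += adjust;
		si->phys_resv -= adjust;
	}
	si->locked_swap += npages;
	return (0);
}

int
main(void)
{
	struct swapinfo si = { 100, 500, 80, 1000 };

	/* Prints: rc=0 mem_resv=130 phys_resv=470 locked=130 availrmem=970 */
	printf("rc=%d mem_resv=%ld phys_resv=%ld locked=%ld availrmem=%ld\n",
	    swap_adjust(&si, 50, 100), si.mem_resv, si.phys_resv,
	    si.locked_swap, si.availrmem);
	return (0);
}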
+ * Used by segspt when it needs to lock resrved swap npages in memory + */ +int +anon_swap_adjust(pgcnt_t npages) +{ + pgcnt_t unlocked_mem_swap; + + mutex_enter(&anoninfo_lock); + + ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); + ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); + + unlocked_mem_swap = k_anoninfo.ani_mem_resv + - k_anoninfo.ani_locked_swap; + if (npages > unlocked_mem_swap) { + spgcnt_t adjusted_swap = npages - unlocked_mem_swap; + + /* + * if there is not enough unlocked mem swap we take missing + * amount from phys swap and give it to mem swap + */ + mutex_enter(&freemem_lock); + if (availrmem < adjusted_swap + segspt_minfree) { + mutex_exit(&freemem_lock); + mutex_exit(&anoninfo_lock); + return (ENOMEM); + } + availrmem -= adjusted_swap; + mutex_exit(&freemem_lock); + + k_anoninfo.ani_mem_resv += adjusted_swap; + ASSERT(k_anoninfo.ani_phys_resv >= adjusted_swap); + k_anoninfo.ani_phys_resv -= adjusted_swap; + + ANI_ADD(adjusted_swap); + } + k_anoninfo.ani_locked_swap += npages; + + ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); + ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); + + mutex_exit(&anoninfo_lock); + + return (0); +} + +/* + * 'unlocked' reserved mem swap so when it is unreserved it + * can be moved back phys (disk) swap + */ +void +anon_swap_restore(pgcnt_t npages) +{ + mutex_enter(&anoninfo_lock); + + ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv); + + ASSERT(k_anoninfo.ani_locked_swap >= npages); + k_anoninfo.ani_locked_swap -= npages; + + ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv); + + mutex_exit(&anoninfo_lock); +} + +/* + * Return the pointer from the list for a + * specified anon index. + */ +ulong_t * +anon_get_slot(struct anon_hdr *ahp, ulong_t an_idx) +{ + struct anon **app; + void **ppp; + + ASSERT(an_idx < ahp->size); + + /* + * Single level case. + */ + if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { + return ((ulong_t *)&ahp->array_chunk[an_idx]); + } else { + + /* + * 2 level case. + */ + ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; + if (*ppp == NULL) { + mutex_enter(&ahp->serial_lock); + ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; + if (*ppp == NULL) + *ppp = kmem_zalloc(PAGESIZE, KM_SLEEP); + mutex_exit(&ahp->serial_lock); + } + app = *ppp; + return ((ulong_t *)&app[an_idx & ANON_CHUNK_OFF]); + } +} + +void +anon_array_enter(struct anon_map *amp, ulong_t an_idx, anon_sync_obj_t *sobj) +{ + ulong_t *ap_slot; + kmutex_t *mtx; + kcondvar_t *cv; + int hash; + + /* + * Use szc to determine anon slot(s) to appear atomic. + * If szc = 0, then lock the anon slot and mark it busy. + * If szc > 0, then lock the range of slots by getting the + * anon_array_lock for the first anon slot, and mark only the + * first anon slot busy to represent whole range being busy. 
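anon_array_enter() and anon_array_exit() below implement a per-slot busy flag guarded by a hashed mutex/condvar pair. A condensed pthreads sketch of that protocol with one explicit slot follows; the hashing by index and the szc-based range handling are omitted, and the names are illustrative only.

#include <pthread.h>
#include <stdio.h>

/* One slot of a simplified "busy" protocol: a mutex/condvar pair guards
 * a per-slot busy flag. */
struct slot_sync {
	pthread_mutex_t	mtx;
	pthread_cond_t	cv;
	int		busy;
};

static void
slot_enter(struct slot_sync *s)
{
	pthread_mutex_lock(&s->mtx);
	while (s->busy)			/* wait for the current holder */
		pthread_cond_wait(&s->cv, &s->mtx);
	s->busy = 1;			/* mark the slot (or range) busy */
	pthread_mutex_unlock(&s->mtx);
}

static void
slot_exit(struct slot_sync *s)
{
	pthread_mutex_lock(&s->mtx);
	s->busy = 0;
	pthread_cond_broadcast(&s->cv);	/* wake any waiters */
	pthread_mutex_unlock(&s->mtx);
}

int
main(void)
{
	struct slot_sync s = { PTHREAD_MUTEX_INITIALIZER,
	    PTHREAD_COND_INITIALIZER, 0 };

	slot_enter(&s);
	/* ... operate on the anon slot ... */
	slot_exit(&s);
	printf("busy=%d\n", s.busy);
	return (0);
}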
+ */ + + ASSERT(RW_READ_HELD(&->a_rwlock)); + an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc)); + hash = ANON_ARRAY_HASH(amp, an_idx); + sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex; + sobj->sync_cv = cv = &anon_array_cv[hash]; + mutex_enter(mtx); + ap_slot = anon_get_slot(amp->ahp, an_idx); + while (ANON_ISBUSY(ap_slot)) + cv_wait(cv, mtx); + ANON_SETBUSY(ap_slot); + sobj->sync_data = ap_slot; + mutex_exit(mtx); +} + +void +anon_array_exit(anon_sync_obj_t *sobj) +{ + mutex_enter(sobj->sync_mutex); + ASSERT(ANON_ISBUSY(sobj->sync_data)); + ANON_CLRBUSY(sobj->sync_data); + if (CV_HAS_WAITERS(sobj->sync_cv)) + cv_broadcast(sobj->sync_cv); + mutex_exit(sobj->sync_mutex); +} diff --git a/usr/src/uts/common/vm/vm_as.c b/usr/src/uts/common/vm/vm_as.c new file mode 100644 index 0000000000..f54ae54359 --- /dev/null +++ b/usr/src/uts/common/vm/vm_as.c @@ -0,0 +1,2898 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * VM - address spaces. + */ + +#include <sys/types.h> +#include <sys/t_lock.h> +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/mman.h> +#include <sys/sysmacros.h> +#include <sys/cpuvar.h> +#include <sys/sysinfo.h> +#include <sys/kmem.h> +#include <sys/vnode.h> +#include <sys/vmsystm.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/tnf_probe.h> +#include <sys/vtrace.h> + +#include <vm/hat.h> +#include <vm/xhat.h> +#include <vm/as.h> +#include <vm/seg.h> +#include <vm/seg_vn.h> +#include <vm/seg_dev.h> +#include <vm/seg_kmem.h> +#include <vm/seg_map.h> +#include <vm/seg_spt.h> +#include <vm/page.h> + +clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */ + +static struct kmem_cache *as_cache; + +static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t); +static void as_clearwatchprot(struct as *, caddr_t, size_t); + + +/* + * Verifying the segment lists is very time-consuming; it may not be + * desirable always to define VERIFY_SEGLIST when DEBUG is set. 
+ */ +#ifdef DEBUG +#define VERIFY_SEGLIST +int do_as_verify = 0; +#endif + +/* + * Allocate a new callback data structure entry and fill in the events of + * interest, the address range of interest, and the callback argument. + * Link the entry on the as->a_callbacks list. A callback entry for the + * entire address space may be specified with vaddr = 0 and size = -1. + * + * CALLERS RESPONSIBILITY: If not calling from within the process context for + * the specified as, the caller must guarantee persistence of the specified as + * for the duration of this function (eg. pages being locked within the as + * will guarantee persistence). + */ +int +as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events, + caddr_t vaddr, size_t size, int sleepflag) +{ + struct as_callback *current_head, *cb; + caddr_t saddr; + size_t rsize; + + /* callback function and an event are mandatory */ + if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0)) + return (EINVAL); + + /* Adding a callback after as_free has been called is not allowed */ + if (as == &kas) + return (ENOMEM); + + /* + * vaddr = 0 and size = -1 is used to indicate that the callback range + * is the entire address space so no rounding is done in that case. + */ + if (size != -1) { + saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK); + rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) - + (size_t)saddr; + /* check for wraparound */ + if (saddr + rsize < saddr) + return (ENOMEM); + } else { + if (vaddr != 0) + return (EINVAL); + saddr = vaddr; + rsize = size; + } + + /* Allocate and initialize a callback entry */ + cb = kmem_zalloc(sizeof (struct as_callback), sleepflag); + if (cb == NULL) + return (EAGAIN); + + cb->ascb_func = cb_func; + cb->ascb_arg = arg; + cb->ascb_events = events; + cb->ascb_saddr = saddr; + cb->ascb_len = rsize; + + /* Add the entry to the list */ + mutex_enter(&as->a_contents); + current_head = as->a_callbacks; + as->a_callbacks = cb; + cb->ascb_next = current_head; + + /* + * The call to this function may lose in a race with + * a pertinent event - eg. a thread does long term memory locking + * but before the callback is added another thread executes as_unmap. + * A broadcast here resolves that. + */ + if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) { + AS_CLRUNMAPWAIT(as); + cv_broadcast(&as->a_cv); + } + + mutex_exit(&as->a_contents); + return (0); +} + +/* + * Search the callback list for an entry which pertains to arg. + * + * This is called from within the client upon completion of the callback. + * RETURN VALUES: + * AS_CALLBACK_DELETED (callback entry found and deleted) + * AS_CALLBACK_NOTFOUND (no callback entry found - this is ok) + * AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this + * entry will be made in as_do_callbacks) + * + * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED + * set, it indicates that as_do_callbacks is processing this entry. The + * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made + * to unblock as_do_callbacks, in case it is blocked. + * + * CALLERS RESPONSIBILITY: If not calling from within the process context for + * the specified as, the caller must guarantee persistence of the specified as + * for the duration of this function (eg. pages being locked within the as + * will guarantee persistence). 
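as_delete_callback() below walks the singly linked a_callbacks list with a pointer-to-pointer, so an entry can be unlinked without tracking a separate previous node, and it defers the free when the callback is mid-flight. A minimal sketch of that walk; the struct fields and return codes are simplified stand-ins for the as_callback fields and AS_CALLBACK_* values.

#include <stdio.h>
#include <stdlib.h>

/* Minimal singly linked callback list to illustrate unlink-by-arg. */
struct cb {
	struct cb	*next;
	void		*arg;
	int		in_progress;	/* stands in for AS_CALLBACK_CALLED */
};

enum { CB_NOTFOUND, CB_DELETED, CB_DEFERRED };

static int
cb_delete(struct cb **headp, void *arg)
{
	struct cb **prev, *c;

	for (prev = headp; (c = *prev) != NULL; prev = &c->next) {
		if (c->arg != arg)
			continue;
		if (c->in_progress)
			return (CB_DEFERRED);	/* the runner frees it later */
		*prev = c->next;		/* unlink without a prev node */
		free(c);
		return (CB_DELETED);
	}
	return (CB_NOTFOUND);
}

int
main(void)
{
	int a, b;
	struct cb *head = NULL;
	struct cb *c1 = calloc(1, sizeof (*c1));
	struct cb *c2 = calloc(1, sizeof (*c2));

	c1->arg = &a; c1->next = head; head = c1;
	c2->arg = &b; c2->next = head; head = c2;

	/* Prints: delete &a: 1, delete &a again: 0 */
	printf("delete &a: %d, delete &a again: %d\n",
	    cb_delete(&head, &a), cb_delete(&head, &a));
	free(head);
	return (0);
}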
+ */ +uint_t +as_delete_callback(struct as *as, void *arg) +{ + struct as_callback **prevcb = &as->a_callbacks; + struct as_callback *cb; + uint_t rc = AS_CALLBACK_NOTFOUND; + + mutex_enter(&as->a_contents); + for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) { + if (cb->ascb_arg != arg) + continue; + + /* + * If the events indicate AS_CALLBACK_CALLED, just clear + * AS_ALL_EVENT in the events field and wakeup the thread + * that may be waiting in as_do_callbacks. as_do_callbacks + * will take care of removing this entry from the list. In + * that case, return AS_CALLBACK_DELETE_DEFERRED. Otherwise + * (AS_CALLBACK_CALLED not set), just remove it from the + * list, return the memory and return AS_CALLBACK_DELETED. + */ + if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) { + /* leave AS_CALLBACK_CALLED */ + cb->ascb_events &= ~AS_ALL_EVENT; + rc = AS_CALLBACK_DELETE_DEFERRED; + cv_broadcast(&as->a_cv); + } else { + *prevcb = cb->ascb_next; + kmem_free(cb, sizeof (struct as_callback)); + rc = AS_CALLBACK_DELETED; + } + break; + } + mutex_exit(&as->a_contents); + return (rc); +} + +/* + * Searches the as callback list for a matching entry. + * Returns a pointer to the first matching callback, or NULL if + * nothing is found. + * This function never sleeps so it is ok to call it with more + * locks held but the (required) a_contents mutex. + * + * See also comment on as_do_callbacks below. + */ +static struct as_callback * +as_find_callback(struct as *as, uint_t events, caddr_t event_addr, + size_t event_len) +{ + struct as_callback *cb; + + ASSERT(MUTEX_HELD(&as->a_contents)); + for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) { + /* + * If the callback has not already been called, then + * check if events or address range pertains. An event_len + * of zero means do an unconditional callback. + */ + if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) || + ((event_len != 0) && (((cb->ascb_events & events) == 0) || + (event_addr + event_len < cb->ascb_saddr) || + (event_addr > (cb->ascb_saddr + cb->ascb_len))))) { + continue; + } + break; + } + return (cb); +} + +/* + * Executes a given callback and removes it from the callback list for + * this address space. + * This function may sleep so the caller must drop all locks except + * a_contents before calling this func. + * + * See also comments on as_do_callbacks below. + */ +static void +as_execute_callback(struct as *as, struct as_callback *cb, + uint_t events) +{ + struct as_callback **prevcb; + void *cb_arg; + + ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events)); + cb->ascb_events |= AS_CALLBACK_CALLED; + mutex_exit(&as->a_contents); + (*cb->ascb_func)(as, cb->ascb_arg, events); + mutex_enter(&as->a_contents); + /* + * the callback function is required to delete the callback + * when the callback function determines it is OK for + * this thread to continue. as_delete_callback will clear + * the AS_ALL_EVENT in the events field when it is deleted. + * If the callback function called as_delete_callback, + * events will already be cleared and there will be no blocking. + */ + while ((cb->ascb_events & events) != 0) { + cv_wait(&as->a_cv, &as->a_contents); + } + /* + * This entry needs to be taken off the list. 
Normally, the + * callback func itself does that, but unfortunately the list + * may have changed while the callback was running because the + * a_contents mutex was dropped and someone else other than the + * callback func itself could have called as_delete_callback, + * so we have to search to find this entry again. The entry + * must have AS_CALLBACK_CALLED, and have the same 'arg'. + */ + cb_arg = cb->ascb_arg; + prevcb = &as->a_callbacks; + for (cb = as->a_callbacks; cb != NULL; + prevcb = &cb->ascb_next, cb = *prevcb) { + if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) || + (cb_arg != cb->ascb_arg)) { + continue; + } + *prevcb = cb->ascb_next; + kmem_free(cb, sizeof (struct as_callback)); + break; + } +} + +/* + * Check the callback list for a matching event and intersection of + * address range. If there is a match invoke the callback. Skip an entry if: + * - a callback is already in progress for this entry (AS_CALLBACK_CALLED) + * - not event of interest + * - not address range of interest + * + * An event_len of zero indicates a request for an unconditional callback + * (regardless of event), only the AS_CALLBACK_CALLED is checked. The + * a_contents lock must be dropped before a callback, so only one callback + * can be done before returning. Return -1 (true) if a callback was + * executed and removed from the list, else return 0 (false). + * + * The logically separate parts, i.e. finding a matching callback and + * executing a given callback have been separated into two functions + * so that they can be called with different sets of locks held beyond + * the always-required a_contents. as_find_callback does not sleep so + * it is ok to call it if more locks than a_contents (i.e. the a_lock + * rwlock) are held. as_execute_callback on the other hand may sleep + * so all locks beyond a_contents must be dropped by the caller if one + * does not want to end comatose. + */ +static int +as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr, + size_t event_len) +{ + struct as_callback *cb; + + if ((cb = as_find_callback(as, events, event_addr, event_len))) { + as_execute_callback(as, cb, events); + return (-1); + } + return (0); +} + +/* + * Search for the segment containing addr. If a segment containing addr + * exists, that segment is returned. If no such segment exists, and + * the list spans addresses greater than addr, then the first segment + * whose base is greater than addr is returned; otherwise, NULL is + * returned unless tail is true, in which case the last element of the + * list is returned. + * + * a_seglast is used to cache the last found segment for repeated + * searches to the same addr (which happens frequently). 
+ */ +struct seg * +as_findseg(struct as *as, caddr_t addr, int tail) +{ + struct seg *seg = as->a_seglast; + avl_index_t where; + + ASSERT(AS_LOCK_HELD(as, &as->a_lock)); + + if (seg != NULL && + seg->s_base <= addr && + addr < seg->s_base + seg->s_size) + return (seg); + + seg = avl_find(&as->a_segtree, &addr, &where); + if (seg != NULL) + return (as->a_seglast = seg); + + seg = avl_nearest(&as->a_segtree, where, AVL_AFTER); + if (seg == NULL && tail) + seg = avl_last(&as->a_segtree); + return (as->a_seglast = seg); +} + +#ifdef VERIFY_SEGLIST +/* + * verify that the linked list is coherent + */ +static void +as_verify(struct as *as) +{ + struct seg *seg, *seglast, *p, *n; + uint_t nsegs = 0; + + if (do_as_verify == 0) + return; + + seglast = as->a_seglast; + + for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { + ASSERT(seg->s_as == as); + p = AS_SEGPREV(as, seg); + n = AS_SEGNEXT(as, seg); + ASSERT(p == NULL || p->s_as == as); + ASSERT(p == NULL || p->s_base < seg->s_base); + ASSERT(n == NULL || n->s_base > seg->s_base); + ASSERT(n != NULL || seg == avl_last(&as->a_segtree)); + if (seg == seglast) + seglast = NULL; + nsegs++; + } + ASSERT(seglast == NULL); + ASSERT(avl_numnodes(&as->a_segtree) == nsegs); +} +#endif /* VERIFY_SEGLIST */ + +/* + * Add a new segment to the address space. The avl_find() + * may be expensive so we attempt to use last segment accessed + * in as_gap() as an insertion point. + */ +int +as_addseg(struct as *as, struct seg *newseg) +{ + struct seg *seg; + caddr_t addr; + caddr_t eaddr; + avl_index_t where; + + ASSERT(AS_WRITE_HELD(as, &as->a_lock)); + + as->a_updatedir = 1; /* inform /proc */ + gethrestime(&as->a_updatetime); + + if (as->a_lastgaphl != NULL) { + struct seg *hseg = NULL; + struct seg *lseg = NULL; + + if (as->a_lastgaphl->s_base > newseg->s_base) { + hseg = as->a_lastgaphl; + lseg = AVL_PREV(&as->a_segtree, hseg); + } else { + lseg = as->a_lastgaphl; + hseg = AVL_NEXT(&as->a_segtree, lseg); + } + + if (hseg && lseg && lseg->s_base < newseg->s_base && + hseg->s_base > newseg->s_base) { + avl_insert_here(&as->a_segtree, newseg, lseg, + AVL_AFTER); + as->a_lastgaphl = NULL; + as->a_seglast = newseg; + return (0); + } + as->a_lastgaphl = NULL; + } + + addr = newseg->s_base; + eaddr = addr + newseg->s_size; +again: + + seg = avl_find(&as->a_segtree, &addr, &where); + + if (seg == NULL) + seg = avl_nearest(&as->a_segtree, where, AVL_AFTER); + + if (seg == NULL) + seg = avl_last(&as->a_segtree); + + if (seg != NULL) { + caddr_t base = seg->s_base; + + /* + * If top of seg is below the requested address, then + * the insertion point is at the end of the linked list, + * and seg points to the tail of the list. Otherwise, + * the insertion point is immediately before seg. + */ + if (base + seg->s_size > addr) { + if (addr >= base || eaddr > base) { +#ifdef __sparc + extern struct seg_ops segnf_ops; + + /* + * no-fault segs must disappear if overlaid. 
+ * XXX need new segment type so + * we don't have to check s_ops + */ + if (seg->s_ops == &segnf_ops) { + seg_unmap(seg); + goto again; + } +#endif + return (-1); /* overlapping segment */ + } + } + } + as->a_seglast = newseg; + avl_insert(&as->a_segtree, newseg, where); + +#ifdef VERIFY_SEGLIST + as_verify(as); +#endif + return (0); +} + +struct seg * +as_removeseg(struct as *as, struct seg *seg) +{ + avl_tree_t *t; + + ASSERT(AS_WRITE_HELD(as, &as->a_lock)); + + as->a_updatedir = 1; /* inform /proc */ + gethrestime(&as->a_updatetime); + + if (seg == NULL) + return (NULL); + + t = &as->a_segtree; + if (as->a_seglast == seg) + as->a_seglast = NULL; + as->a_lastgaphl = NULL; + + /* + * if this segment is at an address higher than + * a_lastgap, set a_lastgap to the next segment (NULL if last segment) + */ + if (as->a_lastgap && + (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base)) + as->a_lastgap = AVL_NEXT(t, seg); + + /* + * remove the segment from the seg tree + */ + avl_remove(t, seg); + +#ifdef VERIFY_SEGLIST + as_verify(as); +#endif + return (seg); +} + +/* + * Find a segment containing addr. + */ +struct seg * +as_segat(struct as *as, caddr_t addr) +{ + struct seg *seg = as->a_seglast; + + ASSERT(AS_LOCK_HELD(as, &as->a_lock)); + + if (seg != NULL && seg->s_base <= addr && + addr < seg->s_base + seg->s_size) + return (seg); + + seg = avl_find(&as->a_segtree, &addr, NULL); + return (seg); +} + +/* + * Serialize all searches for holes in an address space to + * prevent two or more threads from allocating the same virtual + * address range. The address space must not be "read/write" + * locked by the caller since we may block. + */ +void +as_rangelock(struct as *as) +{ + mutex_enter(&as->a_contents); + while (AS_ISCLAIMGAP(as)) + cv_wait(&as->a_cv, &as->a_contents); + AS_SETCLAIMGAP(as); + mutex_exit(&as->a_contents); +} + +/* + * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads. + */ +void +as_rangeunlock(struct as *as) +{ + mutex_enter(&as->a_contents); + AS_CLRCLAIMGAP(as); + cv_signal(&as->a_cv); + mutex_exit(&as->a_contents); +} + +/* + * compar segments (or just an address) by segment address range + */ +static int +as_segcompar(const void *x, const void *y) +{ + struct seg *a = (struct seg *)x; + struct seg *b = (struct seg *)y; + + if (a->s_base < b->s_base) + return (-1); + if (a->s_base >= b->s_base + b->s_size) + return (1); + return (0); +} + + +void +as_avlinit(struct as *as) +{ + avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg), + offsetof(struct seg, s_tree)); + avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page), + offsetof(struct watched_page, wp_link)); +} + +/*ARGSUSED*/ +static int +as_constructor(void *buf, void *cdrarg, int kmflags) +{ + struct as *as = buf; + + mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL); + cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL); + rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL); + as_avlinit(as); + return (0); +} + +/*ARGSUSED1*/ +static void +as_destructor(void *buf, void *cdrarg) +{ + struct as *as = buf; + + avl_destroy(&as->a_segtree); + mutex_destroy(&as->a_contents); + cv_destroy(&as->a_cv); + rw_destroy(&as->a_lock); +} + +void +as_init(void) +{ + as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0, + as_constructor, as_destructor, NULL, NULL, NULL, 0); +} + +/* + * Allocate and initialize an address space data structure. + * We call hat_alloc to allow any machine dependent + * information in the hat structure to be initialized. 
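+ *
+ * Minimal lifecycle sketch (illustrative only; crfp and argsp stand
+ * for whatever segment-create routine and argument the caller uses):
+ *
+ *        struct as *as = as_alloc();
+ *        error = as_map(as, addr, size, crfp, argsp);
+ *        ...
+ *        as_free(as);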
+ */ +struct as * +as_alloc(void) +{ + struct as *as; + + as = kmem_cache_alloc(as_cache, KM_SLEEP); + + as->a_flags = 0; + as->a_vbits = 0; + as->a_hrm = NULL; + as->a_seglast = NULL; + as->a_size = 0; + as->a_updatedir = 0; + gethrestime(&as->a_updatetime); + as->a_objectdir = NULL; + as->a_sizedir = 0; + as->a_userlimit = (caddr_t)USERLIMIT; + as->a_lastgap = NULL; + as->a_lastgaphl = NULL; + as->a_callbacks = NULL; + + AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); + as->a_hat = hat_alloc(as); /* create hat for default system mmu */ + AS_LOCK_EXIT(as, &as->a_lock); + + as->a_xhat = NULL; + + return (as); +} + +/* + * Free an address space data structure. + * Need to free the hat first and then + * all the segments on this as and finally + * the space for the as struct itself. + */ +void +as_free(struct as *as) +{ + struct hat *hat = as->a_hat; + struct seg *seg, *next; + int called = 0; + +top: + /* + * Invoke ALL callbacks. as_do_callbacks will do one callback + * per call, and not return (-1) until the callback has completed. + * When as_do_callbacks returns zero, all callbacks have completed. + */ + mutex_enter(&as->a_contents); + while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0)); + + /* This will prevent new XHATs from attaching to as */ + if (!called) + AS_SETBUSY(as); + mutex_exit(&as->a_contents); + AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); + + if (!called) { + called = 1; + hat_free_start(hat); + if (as->a_xhat != NULL) + xhat_free_start_all(as); + } + for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) { + int err; + + next = AS_SEGNEXT(as, seg); + err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size); + if (err == EAGAIN) { + mutex_enter(&as->a_contents); + if (as->a_callbacks) { + AS_LOCK_EXIT(as, &as->a_lock); + } else { + /* + * Memory is currently locked. Wait for a + * cv_signal that it has been unlocked, then + * try the operation again. + */ + if (AS_ISUNMAPWAIT(as) == 0) + cv_broadcast(&as->a_cv); + AS_SETUNMAPWAIT(as); + AS_LOCK_EXIT(as, &as->a_lock); + while (AS_ISUNMAPWAIT(as)) + cv_wait(&as->a_cv, &as->a_contents); + } + mutex_exit(&as->a_contents); + goto top; + } else { + /* + * We do not expect any other error return at this + * time. This is similar to an ASSERT in seg_unmap() + */ + ASSERT(err == 0); + } + } + hat_free_end(hat); + if (as->a_xhat != NULL) + xhat_free_end_all(as); + AS_LOCK_EXIT(as, &as->a_lock); + + /* /proc stuff */ + ASSERT(avl_numnodes(&as->a_wpage) == 0); + if (as->a_objectdir) { + kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *)); + as->a_objectdir = NULL; + as->a_sizedir = 0; + } + + /* + * Free the struct as back to kmem. Assert it has no segments. 
+ */ + ASSERT(avl_numnodes(&as->a_segtree) == 0); + kmem_cache_free(as_cache, as); +} + +int +as_dup(struct as *as, struct as **outas) +{ + struct as *newas; + struct seg *seg, *newseg; + int error; + + AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); + as_clearwatch(as); + newas = as_alloc(); + newas->a_userlimit = as->a_userlimit; + AS_LOCK_ENTER(newas, &newas->a_lock, RW_WRITER); + + /* This will prevent new XHATs from attaching */ + mutex_enter(&as->a_contents); + AS_SETBUSY(as); + mutex_exit(&as->a_contents); + mutex_enter(&newas->a_contents); + AS_SETBUSY(newas); + mutex_exit(&newas->a_contents); + + + for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { + + if (seg->s_flags & S_PURGE) + continue; + + newseg = seg_alloc(newas, seg->s_base, seg->s_size); + if (newseg == NULL) { + AS_LOCK_EXIT(newas, &newas->a_lock); + as_setwatch(as); + mutex_enter(&as->a_contents); + AS_CLRBUSY(as); + mutex_exit(&as->a_contents); + AS_LOCK_EXIT(as, &as->a_lock); + as_free(newas); + return (-1); + } + if ((error = SEGOP_DUP(seg, newseg)) != 0) { + /* + * We call seg_free() on the new seg + * because the segment is not set up + * completely; i.e. it has no ops. + */ + as_setwatch(as); + mutex_enter(&as->a_contents); + AS_CLRBUSY(as); + mutex_exit(&as->a_contents); + AS_LOCK_EXIT(as, &as->a_lock); + seg_free(newseg); + AS_LOCK_EXIT(newas, &newas->a_lock); + as_free(newas); + return (error); + } + newas->a_size += seg->s_size; + } + + error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL); + if (as->a_xhat != NULL) + error |= xhat_dup_all(as, newas, NULL, 0, HAT_DUP_ALL); + + mutex_enter(&newas->a_contents); + AS_CLRBUSY(newas); + mutex_exit(&newas->a_contents); + AS_LOCK_EXIT(newas, &newas->a_lock); + + as_setwatch(as); + mutex_enter(&as->a_contents); + AS_CLRBUSY(as); + mutex_exit(&as->a_contents); + AS_LOCK_EXIT(as, &as->a_lock); + if (error != 0) { + as_free(newas); + return (error); + } + *outas = newas; + return (0); +} + +/* + * Handle a ``fault'' at addr for size bytes. + */ +faultcode_t +as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size, + enum fault_type type, enum seg_rw rw) +{ + struct seg *seg; + caddr_t raddr; /* rounded down addr */ + size_t rsize; /* rounded up size */ + size_t ssize; + faultcode_t res = 0; + caddr_t addrsav; + struct seg *segsav; + int as_lock_held; + klwp_t *lwp = ttolwp(curthread); + int is_xhat = 0; + int holding_wpage = 0; + extern struct seg_ops segdev_ops; + + + + if (as->a_hat != hat) { + /* This must be an XHAT then */ + is_xhat = 1; + + if ((type != F_INVAL) || (as == &kas)) + return (FC_NOSUPPORT); + } + +retry: + if (!is_xhat) { + /* + * Indicate that the lwp is not to be stopped while waiting + * for a pagefault. This is to avoid deadlock while debugging + * a process via /proc over NFS (in particular). + */ + if (lwp != NULL) + lwp->lwp_nostop++; + + /* + * same length must be used when we softlock and softunlock. + * We don't support softunlocking lengths less than + * the original length when there is largepage support. + * See seg_dev.c for more comments. 
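+ *
+ * Sketch of the expected pairing (illustrative; len and rw are
+ * placeholders); as_pagelock()/as_pageunlock() below follow it:
+ *
+ *        (void) as_fault(hat, as, addr, len, F_SOFTLOCK, rw);
+ *        ...access the pages...
+ *        (void) as_fault(hat, as, addr, len, F_SOFTUNLOCK, rw);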
+ */ + switch (type) { + + case F_SOFTLOCK: + CPU_STATS_ADD_K(vm, softlock, 1); + break; + + case F_SOFTUNLOCK: + break; + + case F_PROT: + CPU_STATS_ADD_K(vm, prot_fault, 1); + break; + + case F_INVAL: + CPU_STATS_ENTER_K(); + CPU_STATS_ADDQ(CPU, vm, as_fault, 1); + if (as == &kas) + CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1); + CPU_STATS_EXIT_K(); + break; + } + } + + /* Kernel probe */ + TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */, + tnf_opaque, address, addr, + tnf_fault_type, fault_type, type, + tnf_seg_access, access, rw); + + raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); + rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - + (size_t)raddr; + + /* + * XXX -- Don't grab the as lock for segkmap. We should grab it for + * correctness, but then we could be stuck holding this lock for + * a LONG time if the fault needs to be resolved on a slow + * filesystem, and then no-one will be able to exec new commands, + * as exec'ing requires the write lock on the as. + */ + if (as == &kas && segkmap && segkmap->s_base <= raddr && + raddr + size < segkmap->s_base + segkmap->s_size) { + /* + * if (as==&kas), this can't be XHAT: we've already returned + * FC_NOSUPPORT. + */ + seg = segkmap; + as_lock_held = 0; + } else { + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + if (is_xhat && avl_numnodes(&as->a_wpage) != 0) { + /* + * Grab and hold the writers' lock on the as + * if the fault is to a watched page. + * This will keep CPUs from "peeking" at the + * address range while we're temporarily boosting + * the permissions for the XHAT device to + * resolve the fault in the segment layer. + * + * We could check whether faulted address + * is within a watched page and only then grab + * the writer lock, but this is simpler. + */ + AS_LOCK_EXIT(as, &as->a_lock); + AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); + } + + seg = as_segat(as, raddr); + if (seg == NULL) { + AS_LOCK_EXIT(as, &as->a_lock); + if ((lwp != NULL) && (!is_xhat)) + lwp->lwp_nostop--; + return (FC_NOMAP); + } + + as_lock_held = 1; + } + + addrsav = raddr; + segsav = seg; + + for (; rsize != 0; rsize -= ssize, raddr += ssize) { + if (raddr >= seg->s_base + seg->s_size) { + seg = AS_SEGNEXT(as, seg); + if (seg == NULL || raddr != seg->s_base) { + res = FC_NOMAP; + break; + } + } + if (raddr + rsize > seg->s_base + seg->s_size) + ssize = seg->s_base + seg->s_size - raddr; + else + ssize = rsize; + + if (!is_xhat || (seg->s_ops != &segdev_ops)) { + + if (is_xhat && avl_numnodes(&as->a_wpage) != 0 && + pr_is_watchpage_as(raddr, rw, as)) { + /* + * Handle watch pages. If we're faulting on a + * watched page from an X-hat, we have to + * restore the original permissions while we + * handle the fault. + */ + as_clearwatch(as); + holding_wpage = 1; + } + + res = SEGOP_FAULT(hat, seg, raddr, ssize, type, rw); + + /* Restore watchpoints */ + if (holding_wpage) { + as_setwatch(as); + holding_wpage = 0; + } + + if (res != 0) + break; + } else { + /* XHAT does not support seg_dev */ + res = FC_NOSUPPORT; + break; + } + } + + /* + * If we were SOFTLOCKing and encountered a failure, + * we must SOFTUNLOCK the range we already did. (Maybe we + * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing + * right here...) 
+ */ + if (res != 0 && type == F_SOFTLOCK) { + for (seg = segsav; addrsav < raddr; addrsav += ssize) { + if (addrsav >= seg->s_base + seg->s_size) + seg = AS_SEGNEXT(as, seg); + ASSERT(seg != NULL); + /* + * Now call the fault routine again to perform the + * unlock using S_OTHER instead of the rw variable + * since we never got a chance to touch the pages. + */ + if (raddr > seg->s_base + seg->s_size) + ssize = seg->s_base + seg->s_size - addrsav; + else + ssize = raddr - addrsav; + (void) SEGOP_FAULT(hat, seg, addrsav, ssize, + F_SOFTUNLOCK, S_OTHER); + } + } + if (as_lock_held) + AS_LOCK_EXIT(as, &as->a_lock); + if ((lwp != NULL) && (!is_xhat)) + lwp->lwp_nostop--; + /* + * If the lower levels returned EDEADLK for a fault, + * It means that we should retry the fault. Let's wait + * a bit also to let the deadlock causing condition clear. + * This is part of a gross hack to work around a design flaw + * in the ufs/sds logging code and should go away when the + * logging code is re-designed to fix the problem. See bug + * 4125102 for details of the problem. + */ + if (FC_ERRNO(res) == EDEADLK) { + delay(deadlk_wait); + res = 0; + goto retry; + } + return (res); +} + + + +/* + * Asynchronous ``fault'' at addr for size bytes. + */ +faultcode_t +as_faulta(struct as *as, caddr_t addr, size_t size) +{ + struct seg *seg; + caddr_t raddr; /* rounded down addr */ + size_t rsize; /* rounded up size */ + faultcode_t res = 0; + klwp_t *lwp = ttolwp(curthread); + +retry: + /* + * Indicate that the lwp is not to be stopped while waiting + * for a pagefault. This is to avoid deadlock while debugging + * a process via /proc over NFS (in particular). + */ + if (lwp != NULL) + lwp->lwp_nostop++; + + raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); + rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - + (size_t)raddr; + + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + seg = as_segat(as, raddr); + if (seg == NULL) { + AS_LOCK_EXIT(as, &as->a_lock); + if (lwp != NULL) + lwp->lwp_nostop--; + return (FC_NOMAP); + } + + for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) { + if (raddr >= seg->s_base + seg->s_size) { + seg = AS_SEGNEXT(as, seg); + if (seg == NULL || raddr != seg->s_base) { + res = FC_NOMAP; + break; + } + } + res = SEGOP_FAULTA(seg, raddr); + if (res != 0) + break; + } + AS_LOCK_EXIT(as, &as->a_lock); + if (lwp != NULL) + lwp->lwp_nostop--; + /* + * If the lower levels returned EDEADLK for a fault, + * It means that we should retry the fault. Let's wait + * a bit also to let the deadlock causing condition clear. + * This is part of a gross hack to work around a design flaw + * in the ufs/sds logging code and should go away when the + * logging code is re-designed to fix the problem. See bug + * 4125102 for details of the problem. + */ + if (FC_ERRNO(res) == EDEADLK) { + delay(deadlk_wait); + res = 0; + goto retry; + } + return (res); +} + +/* + * Set the virtual mapping for the interval from [addr : addr + size) + * in address space `as' to have the specified protection. + * It is ok for the range to cross over several segments, + * as long as they are contiguous. 
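+ *
+ * Hypothetical example call (for illustration only):
+ *
+ *        error = as_setprot(as, addr, len, PROT_READ | PROT_WRITE);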
+ */ +int +as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot) +{ + struct seg *seg; + struct as_callback *cb; + size_t ssize; + caddr_t raddr; /* rounded down addr */ + size_t rsize; /* rounded up size */ + int error = 0, writer = 0; + caddr_t saveraddr; + size_t saversize; + +setprot_top: + raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); + rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - + (size_t)raddr; + + if (raddr + rsize < raddr) /* check for wraparound */ + return (ENOMEM); + + saveraddr = raddr; + saversize = rsize; + + /* + * Normally we only lock the as as a reader. But + * if due to setprot the segment driver needs to split + * a segment it will return IE_RETRY. Therefore we re-aquire + * the as lock as a writer so the segment driver can change + * the seg list. Also the segment driver will return IE_RETRY + * after it has changed the segment list so we therefore keep + * locking as a writer. Since these opeartions should be rare + * want to only lock as a writer when necessary. + */ + if (writer || avl_numnodes(&as->a_wpage) != 0) { + AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); + } else { + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + } + + as_clearwatchprot(as, raddr, rsize); + seg = as_segat(as, raddr); + if (seg == NULL) { + as_setwatch(as); + AS_LOCK_EXIT(as, &as->a_lock); + return (ENOMEM); + } + + for (; rsize != 0; rsize -= ssize, raddr += ssize) { + if (raddr >= seg->s_base + seg->s_size) { + seg = AS_SEGNEXT(as, seg); + if (seg == NULL || raddr != seg->s_base) { + error = ENOMEM; + break; + } + } + if ((raddr + rsize) > (seg->s_base + seg->s_size)) + ssize = seg->s_base + seg->s_size - raddr; + else + ssize = rsize; + error = SEGOP_SETPROT(seg, raddr, ssize, prot); + + if (error == IE_NOMEM) { + error = EAGAIN; + break; + } + + if (error == IE_RETRY) { + AS_LOCK_EXIT(as, &as->a_lock); + writer = 1; + goto setprot_top; + } + + if (error == EAGAIN) { + /* + * Make sure we have a_lock as writer. + */ + if (writer == 0) { + AS_LOCK_EXIT(as, &as->a_lock); + writer = 1; + goto setprot_top; + } + + /* + * Memory is currently locked. It must be unlocked + * before this operation can succeed through a retry. + * The possible reasons for locked memory and + * corresponding strategies for unlocking are: + * (1) Normal I/O + * wait for a signal that the I/O operation + * has completed and the memory is unlocked. + * (2) Asynchronous I/O + * The aio subsystem does not unlock pages when + * the I/O is completed. Those pages are unlocked + * when the application calls aiowait/aioerror. + * So, to prevent blocking forever, cv_broadcast() + * is done to wake up aio_cleanup_thread. + * Subsequently, segvn_reclaim will be called, and + * that will do AS_CLRUNMAPWAIT() and wake us up. + * (3) Long term page locking: + * Drivers intending to have pages locked for a + * period considerably longer than for normal I/O + * (essentially forever) may have registered for a + * callback so they may unlock these pages on + * request. This is needed to allow this operation + * to succeed. Each entry on the callback list is + * examined. If the event or address range pertains + * the callback is invoked (unless it already is in + * progress). The a_contents lock must be dropped + * before the callback, so only one callback can + * be done at a time. Go to the top and do more + * until zero is returned. If zero is returned, + * either there were no callbacks for this event + * or they were already in progress. 
+ */ + mutex_enter(&as->a_contents); + if (as->a_callbacks && + (cb = as_find_callback(as, AS_SETPROT_EVENT, + seg->s_base, seg->s_size))) { + AS_LOCK_EXIT(as, &as->a_lock); + as_execute_callback(as, cb, AS_SETPROT_EVENT); + } else { + if (AS_ISUNMAPWAIT(as) == 0) + cv_broadcast(&as->a_cv); + AS_SETUNMAPWAIT(as); + AS_LOCK_EXIT(as, &as->a_lock); + while (AS_ISUNMAPWAIT(as)) + cv_wait(&as->a_cv, &as->a_contents); + } + mutex_exit(&as->a_contents); + goto setprot_top; + } else if (error != 0) + break; + } + if (error != 0) { + as_setwatch(as); + } else { + as_setwatchprot(as, saveraddr, saversize, prot); + } + AS_LOCK_EXIT(as, &as->a_lock); + return (error); +} + +/* + * Check to make sure that the interval [addr, addr + size) + * in address space `as' has at least the specified protection. + * It is ok for the range to cross over several segments, as long + * as they are contiguous. + */ +int +as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot) +{ + struct seg *seg; + size_t ssize; + caddr_t raddr; /* rounded down addr */ + size_t rsize; /* rounded up size */ + int error = 0; + + raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); + rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - + (size_t)raddr; + + if (raddr + rsize < raddr) /* check for wraparound */ + return (ENOMEM); + + /* + * This is ugly as sin... + * Normally, we only acquire the address space readers lock. + * However, if the address space has watchpoints present, + * we must acquire the writer lock on the address space for + * the benefit of as_clearwatchprot() and as_setwatchprot(). + */ + if (avl_numnodes(&as->a_wpage) != 0) + AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); + else + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + as_clearwatchprot(as, raddr, rsize); + seg = as_segat(as, raddr); + if (seg == NULL) { + as_setwatch(as); + AS_LOCK_EXIT(as, &as->a_lock); + return (ENOMEM); + } + + for (; rsize != 0; rsize -= ssize, raddr += ssize) { + if (raddr >= seg->s_base + seg->s_size) { + seg = AS_SEGNEXT(as, seg); + if (seg == NULL || raddr != seg->s_base) { + error = ENOMEM; + break; + } + } + if ((raddr + rsize) > (seg->s_base + seg->s_size)) + ssize = seg->s_base + seg->s_size - raddr; + else + ssize = rsize; + + error = SEGOP_CHECKPROT(seg, raddr, ssize, prot); + if (error != 0) + break; + } + as_setwatch(as); + AS_LOCK_EXIT(as, &as->a_lock); + return (error); +} + +int +as_unmap(struct as *as, caddr_t addr, size_t size) +{ + struct seg *seg, *seg_next; + struct as_callback *cb; + caddr_t raddr, eaddr; + size_t ssize; + int err; + +top: + raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); + eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) & + (uintptr_t)PAGEMASK); + + AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); + + as->a_updatedir = 1; /* inform /proc */ + gethrestime(&as->a_updatetime); + + /* + * Use as_findseg to find the first segment in the range, then + * step through the segments in order, following s_next. + */ + as_clearwatchprot(as, raddr, eaddr - raddr); + + for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) { + if (eaddr <= seg->s_base) + break; /* eaddr was in a gap; all done */ + + /* this is implied by the test above */ + ASSERT(raddr < eaddr); + + if (raddr < seg->s_base) + raddr = seg->s_base; /* raddr was in a gap */ + + if (eaddr > (seg->s_base + seg->s_size)) + ssize = seg->s_base + seg->s_size - raddr; + else + ssize = eaddr - raddr; + + /* + * Save next segment pointer since seg can be + * destroyed during the segment unmap operation. 
+ */ + seg_next = AS_SEGNEXT(as, seg); + + err = SEGOP_UNMAP(seg, raddr, ssize); + if (err == EAGAIN) { + /* + * Memory is currently locked. It must be unlocked + * before this operation can succeed through a retry. + * The possible reasons for locked memory and + * corresponding strategies for unlocking are: + * (1) Normal I/O + * wait for a signal that the I/O operation + * has completed and the memory is unlocked. + * (2) Asynchronous I/O + * The aio subsystem does not unlock pages when + * the I/O is completed. Those pages are unlocked + * when the application calls aiowait/aioerror. + * So, to prevent blocking forever, cv_broadcast() + * is done to wake up aio_cleanup_thread. + * Subsequently, segvn_reclaim will be called, and + * that will do AS_CLRUNMAPWAIT() and wake us up. + * (3) Long term page locking: + * Drivers intending to have pages locked for a + * period considerably longer than for normal I/O + * (essentially forever) may have registered for a + * callback so they may unlock these pages on + * request. This is needed to allow this operation + * to succeed. Each entry on the callback list is + * examined. If the event or address range pertains + * the callback is invoked (unless it already is in + * progress). The a_contents lock must be dropped + * before the callback, so only one callback can + * be done at a time. Go to the top and do more + * until zero is returned. If zero is returned, + * either there were no callbacks for this event + * or they were already in progress. + */ + as_setwatch(as); + mutex_enter(&as->a_contents); + if (as->a_callbacks && + (cb = as_find_callback(as, AS_UNMAP_EVENT, + seg->s_base, seg->s_size))) { + AS_LOCK_EXIT(as, &as->a_lock); + as_execute_callback(as, cb, AS_UNMAP_EVENT); + } else { + if (AS_ISUNMAPWAIT(as) == 0) + cv_broadcast(&as->a_cv); + AS_SETUNMAPWAIT(as); + AS_LOCK_EXIT(as, &as->a_lock); + while (AS_ISUNMAPWAIT(as)) + cv_wait(&as->a_cv, &as->a_contents); + } + mutex_exit(&as->a_contents); + goto top; + } else if (err == IE_RETRY) { + as_setwatch(as); + AS_LOCK_EXIT(as, &as->a_lock); + goto top; + } else if (err) { + as_setwatch(as); + AS_LOCK_EXIT(as, &as->a_lock); + return (-1); + } + + as->a_size -= ssize; + raddr += ssize; + } + AS_LOCK_EXIT(as, &as->a_lock); + return (0); +} + +static int +as_map_vnsegs(struct as *as, caddr_t addr, size_t size, + int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated) +{ + int text = vn_a->flags & MAP_TEXT; + uint_t szcvec = map_execseg_pgszcvec(text, addr, size); + uint_t szc; + uint_t nszc; + int error; + caddr_t a; + caddr_t eaddr; + size_t segsize; + struct seg *seg; + uint_t save_szcvec; + size_t pgsz; + struct vattr va; + u_offset_t eoff; + size_t save_size = 0; + + ASSERT(AS_WRITE_HELD(as, &as->a_lock)); + ASSERT(IS_P2ALIGNED(addr, PAGESIZE)); + ASSERT(IS_P2ALIGNED(size, PAGESIZE)); + ASSERT(vn_a->vp != NULL); + ASSERT(vn_a->amp == NULL); + +again: + if (szcvec <= 1) { + seg = seg_alloc(as, addr, size); + if (seg == NULL) { + return (ENOMEM); + } + vn_a->szc = 0; + error = (*crfp)(seg, vn_a); + if (error != 0) { + seg_free(seg); + } + return (error); + } + + va.va_mask = AT_SIZE; + if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred) != 0) { + szcvec = 0; + goto again; + } + eoff = vn_a->offset & PAGEMASK; + if (eoff >= va.va_size) { + szcvec = 0; + goto again; + } + eoff += size; + if (btopr(va.va_size) < btopr(eoff)) { + save_size = size; + size = va.va_size - (vn_a->offset & PAGEMASK); + size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t); + szcvec = map_execseg_pgszcvec(text, 
addr, size); + if (szcvec <= 1) { + size = save_size; + goto again; + } + } + + eaddr = addr + size; + save_szcvec = szcvec; + szcvec >>= 1; + szc = 0; + nszc = 0; + while (szcvec) { + if ((szcvec & 0x1) == 0) { + nszc++; + szcvec >>= 1; + continue; + } + nszc++; + pgsz = page_get_pagesize(nszc); + a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz); + if (a != addr) { + ASSERT(a < eaddr); + segsize = a - addr; + seg = seg_alloc(as, addr, segsize); + if (seg == NULL) { + return (ENOMEM); + } + vn_a->szc = szc; + error = (*crfp)(seg, vn_a); + if (error != 0) { + seg_free(seg); + return (error); + } + *segcreated = 1; + vn_a->offset += segsize; + addr = a; + } + szc = nszc; + szcvec >>= 1; + } + + ASSERT(addr < eaddr); + szcvec = save_szcvec | 1; /* add 8K pages */ + while (szcvec) { + a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz); + ASSERT(a >= addr); + if (a != addr) { + segsize = a - addr; + seg = seg_alloc(as, addr, segsize); + if (seg == NULL) { + return (ENOMEM); + } + vn_a->szc = szc; + error = (*crfp)(seg, vn_a); + if (error != 0) { + seg_free(seg); + return (error); + } + *segcreated = 1; + vn_a->offset += segsize; + addr = a; + } + szcvec &= ~(1 << szc); + if (szcvec) { + szc = highbit(szcvec) - 1; + pgsz = page_get_pagesize(szc); + } + } + ASSERT(addr == eaddr); + + if (save_size) { + size = save_size - size; + goto again; + } + + return (0); +} + +int +as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp) +{ + struct seg *seg = NULL; + caddr_t raddr; /* rounded down addr */ + size_t rsize; /* rounded up size */ + int error; + struct proc *p = curproc; + + raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); + rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - + (size_t)raddr; + + AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); + + /* + * check for wrap around + */ + if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) { + AS_LOCK_EXIT(as, &as->a_lock); + return (ENOMEM); + } + + as->a_updatedir = 1; /* inform /proc */ + gethrestime(&as->a_updatetime); + + if (as != &kas && as->a_size + rsize > (size_t)p->p_vmem_ctl) { + AS_LOCK_EXIT(as, &as->a_lock); + + (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p, + RCA_UNSAFE_ALL); + + return (ENOMEM); + } + + if (AS_MAP_VNSEGS_USELPGS(crfp, argsp)) { + int unmap = 0; + error = as_map_vnsegs(as, raddr, rsize, crfp, + (struct segvn_crargs *)argsp, &unmap); + if (error != 0) { + AS_LOCK_EXIT(as, &as->a_lock); + if (unmap) { + (void) as_unmap(as, addr, size); + } + return (error); + } + } else { + seg = seg_alloc(as, addr, size); + if (seg == NULL) { + AS_LOCK_EXIT(as, &as->a_lock); + return (ENOMEM); + } + + error = (*crfp)(seg, argsp); + if (error != 0) { + seg_free(seg); + AS_LOCK_EXIT(as, &as->a_lock); + return (error); + } + } + + /* + * Add size now so as_unmap will work if as_ctl fails. + */ + as->a_size += rsize; + + as_setwatch(as); + + /* + * If the address space is locked, + * establish memory locks for the new segment. + */ + mutex_enter(&as->a_contents); + if (AS_ISPGLCK(as)) { + mutex_exit(&as->a_contents); + AS_LOCK_EXIT(as, &as->a_lock); + error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0); + if (error != 0) + (void) as_unmap(as, addr, size); + } else { + mutex_exit(&as->a_contents); + AS_LOCK_EXIT(as, &as->a_lock); + } + return (error); +} + + +/* + * Delete all segments in the address space marked with S_PURGE. + * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c). 
+ * These segments are deleted as a first step before calls to as_gap(), so + * that they don't affect mmap() or shmat(). + */ +void +as_purge(struct as *as) +{ + struct seg *seg; + struct seg *next_seg; + + /* + * the setting of NEEDSPURGE is protect by as_rangelock(), so + * no need to grab a_contents mutex for this check + */ + if ((as->a_flags & AS_NEEDSPURGE) == 0) + return; + + AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); + next_seg = NULL; + seg = AS_SEGFIRST(as); + while (seg != NULL) { + next_seg = AS_SEGNEXT(as, seg); + if (seg->s_flags & S_PURGE) + SEGOP_UNMAP(seg, seg->s_base, seg->s_size); + seg = next_seg; + } + AS_LOCK_EXIT(as, &as->a_lock); + + mutex_enter(&as->a_contents); + as->a_flags &= ~AS_NEEDSPURGE; + mutex_exit(&as->a_contents); +} + +/* + * Find a hole of at least size minlen within [base, base + len). + * + * If flags specifies AH_HI, the hole will have the highest possible address + * in the range. We use the as->a_lastgap field to figure out where to + * start looking for a gap. + * + * Otherwise, the gap will have the lowest possible address. + * + * If flags specifies AH_CONTAIN, the hole will contain the address addr. + * + * If an adequate hole is found, base and len are set to reflect the part of + * the hole that is within range, and 0 is returned, otherwise, + * -1 is returned. + * + * NOTE: This routine is not correct when base+len overflows caddr_t. + */ +int +as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags, + caddr_t addr) +{ + caddr_t lobound = *basep; + caddr_t hibound = lobound + *lenp; + struct seg *lseg, *hseg; + caddr_t lo, hi; + int forward; + caddr_t save_base; + size_t save_len; + + save_base = *basep; + save_len = *lenp; + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + if (AS_SEGFIRST(as) == NULL) { + if (valid_va_range(basep, lenp, minlen, flags & AH_DIR)) { + AS_LOCK_EXIT(as, &as->a_lock); + return (0); + } else { + AS_LOCK_EXIT(as, &as->a_lock); + *basep = save_base; + *lenp = save_len; + return (-1); + } + } + + /* + * Set up to iterate over all the inter-segment holes in the given + * direction. lseg is NULL for the lowest-addressed hole and hseg is + * NULL for the highest-addressed hole. If moving backwards, we reset + * sseg to denote the highest-addressed segment. + */ + forward = (flags & AH_DIR) == AH_LO; + if (forward) { + hseg = as_findseg(as, lobound, 1); + lseg = AS_SEGPREV(as, hseg); + } else { + + /* + * If allocating at least as much as the last allocation, + * use a_lastgap's base as a better estimate of hibound. + */ + if (as->a_lastgap && + minlen >= as->a_lastgap->s_size && + hibound >= as->a_lastgap->s_base) + hibound = as->a_lastgap->s_base; + + hseg = as_findseg(as, hibound, 1); + if (hseg->s_base + hseg->s_size < hibound) { + lseg = hseg; + hseg = NULL; + } else { + lseg = AS_SEGPREV(as, hseg); + } + } + + for (;;) { + /* + * Set lo and hi to the hole's boundaries. (We should really + * use MAXADDR in place of hibound in the expression below, + * but can't express it easily; using hibound in its place is + * harmless.) + */ + lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size; + hi = (hseg == NULL) ? hibound : hseg->s_base; + /* + * If the iteration has moved past the interval from lobound + * to hibound it's pointless to continue. + */ + if ((forward && lo > hibound) || (!forward && hi < lobound)) + break; + else if (lo > hibound || hi < lobound) + goto cont; + /* + * Candidate hole lies at least partially within the allowable + * range. 
Restrict it to fall completely within that range, + * i.e., to [max(lo, lobound), min(hi, hibound)]. + */ + if (lo < lobound) + lo = lobound; + if (hi > hibound) + hi = hibound; + /* + * Verify that the candidate hole is big enough and meets + * hardware constraints. + */ + *basep = lo; + *lenp = hi - lo; + if (valid_va_range(basep, lenp, minlen, + forward ? AH_LO : AH_HI) && + ((flags & AH_CONTAIN) == 0 || + (*basep <= addr && *basep + *lenp > addr))) { + if (!forward) + as->a_lastgap = hseg; + if (hseg != NULL) + as->a_lastgaphl = hseg; + else + as->a_lastgaphl = lseg; + AS_LOCK_EXIT(as, &as->a_lock); + return (0); + } + cont: + /* + * Move to the next hole. + */ + if (forward) { + lseg = hseg; + if (lseg == NULL) + break; + hseg = AS_SEGNEXT(as, hseg); + } else { + hseg = lseg; + if (hseg == NULL) + break; + lseg = AS_SEGPREV(as, lseg); + } + } + *basep = save_base; + *lenp = save_len; + AS_LOCK_EXIT(as, &as->a_lock); + return (-1); +} + +/* + * Return the next range within [base, base + len) that is backed + * with "real memory". Skip holes and non-seg_vn segments. + * We're lazy and only return one segment at a time. + */ +int +as_memory(struct as *as, caddr_t *basep, size_t *lenp) +{ + extern struct seg_ops segspt_shmops; /* needs a header file */ + struct seg *seg; + caddr_t addr, eaddr; + caddr_t segend; + + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + + addr = *basep; + eaddr = addr + *lenp; + + seg = as_findseg(as, addr, 0); + if (seg != NULL) + addr = MAX(seg->s_base, addr); + + for (;;) { + if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) { + AS_LOCK_EXIT(as, &as->a_lock); + return (EINVAL); + } + + if (seg->s_ops == &segvn_ops) { + segend = seg->s_base + seg->s_size; + break; + } + + /* + * We do ISM by looking into the private data + * to determine the real size of the segment. + */ + if (seg->s_ops == &segspt_shmops) { + segend = seg->s_base + spt_realsize(seg); + if (addr < segend) + break; + } + + seg = AS_SEGNEXT(as, seg); + + if (seg != NULL) + addr = seg->s_base; + } + + *basep = addr; + + if (segend > eaddr) + *lenp = eaddr - addr; + else + *lenp = segend - addr; + + AS_LOCK_EXIT(as, &as->a_lock); + return (0); +} + +/* + * Swap the pages associated with the address space as out to + * secondary storage, returning the number of bytes actually + * swapped. + * + * The value returned is intended to correlate well with the process's + * memory requirements. Its usefulness for this purpose depends on + * how well the segment-level routines do at returning accurate + * information. + */ +size_t +as_swapout(struct as *as) +{ + struct seg *seg; + size_t swpcnt = 0; + + /* + * Kernel-only processes have given up their address + * spaces. Of course, we shouldn't be attempting to + * swap out such processes in the first place... + */ + if (as == NULL) + return (0); + + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + + /* Prevent XHATs from attaching */ + mutex_enter(&as->a_contents); + AS_SETBUSY(as); + mutex_exit(&as->a_contents); + + + /* + * Free all mapping resources associated with the address + * space. The segment-level swapout routines capitalize + * on this unmapping by scavanging pages that have become + * unmapped here. + */ + hat_swapout(as->a_hat); + if (as->a_xhat != NULL) + xhat_swapout_all(as); + + mutex_enter(&as->a_contents); + AS_CLRBUSY(as); + mutex_exit(&as->a_contents); + + /* + * Call the swapout routines of all segments in the address + * space to do the actual work, accumulating the amount of + * space reclaimed. 
+ */ + for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { + struct seg_ops *ov = seg->s_ops; + + /* + * We have to check to see if the seg has + * an ops vector because the seg may have + * been in the middle of being set up when + * the process was picked for swapout. + */ + if ((ov != NULL) && (ov->swapout != NULL)) + swpcnt += SEGOP_SWAPOUT(seg); + } + AS_LOCK_EXIT(as, &as->a_lock); + return (swpcnt); +} + +/* + * Determine whether data from the mappings in interval [addr, addr + size) + * are in the primary memory (core) cache. + */ +int +as_incore(struct as *as, caddr_t addr, + size_t size, char *vec, size_t *sizep) +{ + struct seg *seg; + size_t ssize; + caddr_t raddr; /* rounded down addr */ + size_t rsize; /* rounded up size */ + size_t isize; /* iteration size */ + int error = 0; /* result, assume success */ + + *sizep = 0; + raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); + rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) - + (size_t)raddr; + + if (raddr + rsize < raddr) /* check for wraparound */ + return (ENOMEM); + + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + seg = as_segat(as, raddr); + if (seg == NULL) { + AS_LOCK_EXIT(as, &as->a_lock); + return (-1); + } + + for (; rsize != 0; rsize -= ssize, raddr += ssize) { + if (raddr >= seg->s_base + seg->s_size) { + seg = AS_SEGNEXT(as, seg); + if (seg == NULL || raddr != seg->s_base) { + error = -1; + break; + } + } + if ((raddr + rsize) > (seg->s_base + seg->s_size)) + ssize = seg->s_base + seg->s_size - raddr; + else + ssize = rsize; + *sizep += isize = SEGOP_INCORE(seg, raddr, ssize, vec); + if (isize != ssize) { + error = -1; + break; + } + vec += btopr(ssize); + } + AS_LOCK_EXIT(as, &as->a_lock); + return (error); +} + +static void +as_segunlock(struct seg *seg, caddr_t addr, int attr, + ulong_t *bitmap, size_t position, size_t npages) +{ + caddr_t range_start; + size_t pos1 = position; + size_t pos2; + size_t size; + size_t end_pos = npages + position; + + while (bt_range(bitmap, &pos1, &pos2, end_pos)) { + size = ptob((pos2 - pos1)); + range_start = (caddr_t)((uintptr_t)addr + + ptob(pos1 - position)); + + (void) SEGOP_LOCKOP(seg, range_start, size, attr, MC_UNLOCK, + (ulong_t *)NULL, (size_t)NULL); + pos1 = pos2; + } +} + +static void +as_unlockerr(struct as *as, int attr, ulong_t *mlock_map, + caddr_t raddr, size_t rsize) +{ + struct seg *seg = as_segat(as, raddr); + size_t ssize; + + while (rsize != 0) { + if (raddr >= seg->s_base + seg->s_size) + seg = AS_SEGNEXT(as, seg); + + if ((raddr + rsize) > (seg->s_base + seg->s_size)) + ssize = seg->s_base + seg->s_size - raddr; + else + ssize = rsize; + + as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize)); + + rsize -= ssize; + raddr += ssize; + } +} + +/* + * Cache control operations over the interval [addr, addr + size) in + * address space "as". + */ +/*ARGSUSED*/ +int +as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr, + uintptr_t arg, ulong_t *lock_map, size_t pos) +{ + struct seg *seg; /* working segment */ + caddr_t raddr; /* rounded down addr */ + caddr_t initraddr; /* saved initial rounded down addr */ + size_t rsize; /* rounded up size */ + size_t initrsize; /* saved initial rounded up size */ + size_t ssize; /* size of seg */ + int error = 0; /* result */ + size_t mlock_size; /* size of bitmap */ + ulong_t *mlock_map; /* pointer to bitmap used */ + /* to represent the locked */ + /* pages. 
*/ +retry: + if (error == IE_RETRY) + AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); + else + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + + /* + * If these are address space lock/unlock operations, loop over + * all segments in the address space, as appropriate. + */ + if (func == MC_LOCKAS) { + size_t npages, idx; + size_t rlen = 0; /* rounded as length */ + + idx = pos; + + if (arg & MCL_FUTURE) { + mutex_enter(&as->a_contents); + AS_SETPGLCK(as); + mutex_exit(&as->a_contents); + } + if ((arg & MCL_CURRENT) == 0) { + AS_LOCK_EXIT(as, &as->a_lock); + return (0); + } + + seg = AS_SEGFIRST(as); + if (seg == NULL) { + AS_LOCK_EXIT(as, &as->a_lock); + return (0); + } + + do { + raddr = (caddr_t)((uintptr_t)seg->s_base & + (uintptr_t)PAGEMASK); + rlen += (((uintptr_t)(seg->s_base + seg->s_size) + + PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr; + } while ((seg = AS_SEGNEXT(as, seg)) != NULL); + + mlock_size = BT_BITOUL(btopr(rlen)); + if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size * + sizeof (ulong_t), KM_NOSLEEP)) == NULL) { + AS_LOCK_EXIT(as, &as->a_lock); + return (EAGAIN); + } + + for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) { + error = SEGOP_LOCKOP(seg, seg->s_base, + seg->s_size, attr, MC_LOCK, mlock_map, pos); + if (error != 0) + break; + pos += seg_pages(seg); + } + + if (error) { + for (seg = AS_SEGFIRST(as); seg != NULL; + seg = AS_SEGNEXT(as, seg)) { + + raddr = (caddr_t)((uintptr_t)seg->s_base & + (uintptr_t)PAGEMASK); + npages = seg_pages(seg); + as_segunlock(seg, raddr, attr, mlock_map, + idx, npages); + idx += npages; + } + } + + kmem_free(mlock_map, mlock_size * sizeof (ulong_t)); + AS_LOCK_EXIT(as, &as->a_lock); + goto lockerr; + } else if (func == MC_UNLOCKAS) { + mutex_enter(&as->a_contents); + AS_CLRPGLCK(as); + mutex_exit(&as->a_contents); + + for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) { + error = SEGOP_LOCKOP(seg, seg->s_base, + seg->s_size, attr, MC_UNLOCK, NULL, 0); + if (error != 0) + break; + } + + AS_LOCK_EXIT(as, &as->a_lock); + goto lockerr; + } + + /* + * Normalize addresses and sizes. + */ + initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); + initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - + (size_t)raddr; + + if (raddr + rsize < raddr) { /* check for wraparound */ + AS_LOCK_EXIT(as, &as->a_lock); + return (ENOMEM); + } + + /* + * Get initial segment. + */ + if ((seg = as_segat(as, raddr)) == NULL) { + AS_LOCK_EXIT(as, &as->a_lock); + return (ENOMEM); + } + + if (func == MC_LOCK) { + mlock_size = BT_BITOUL(btopr(rsize)); + if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size * + sizeof (ulong_t), KM_NOSLEEP)) == NULL) { + AS_LOCK_EXIT(as, &as->a_lock); + return (EAGAIN); + } + } + + /* + * Loop over all segments. If a hole in the address range is + * discovered, then fail. For each segment, perform the appropriate + * control operation. + */ + while (rsize != 0) { + + /* + * Make sure there's no hole, calculate the portion + * of the next segment to be operated over. + */ + if (raddr >= seg->s_base + seg->s_size) { + seg = AS_SEGNEXT(as, seg); + if (seg == NULL || raddr != seg->s_base) { + if (func == MC_LOCK) { + as_unlockerr(as, attr, mlock_map, + initraddr, initrsize - rsize); + kmem_free(mlock_map, + mlock_size * sizeof (ulong_t)); + } + AS_LOCK_EXIT(as, &as->a_lock); + return (ENOMEM); + } + } + if ((raddr + rsize) > (seg->s_base + seg->s_size)) + ssize = seg->s_base + seg->s_size - raddr; + else + ssize = rsize; + + /* + * Dispatch on specific function. 
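+ * Only MC_SYNC, MC_LOCK, MC_UNLOCK and MC_ADVISE are expected at
+ * this point; MC_LOCKAS and MC_UNLOCKAS were handled above, and any
+ * other value is a programming error that panics below.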
+ */ + switch (func) { + + /* + * Synchronize cached data from mappings with backing + * objects. + */ + case MC_SYNC: + if (error = SEGOP_SYNC(seg, raddr, ssize, + attr, (uint_t)arg)) { + AS_LOCK_EXIT(as, &as->a_lock); + return (error); + } + break; + + /* + * Lock pages in memory. + */ + case MC_LOCK: + if (error = SEGOP_LOCKOP(seg, raddr, ssize, + attr, func, mlock_map, pos)) { + as_unlockerr(as, attr, mlock_map, initraddr, + initrsize - rsize + ssize); + kmem_free(mlock_map, mlock_size * + sizeof (ulong_t)); + AS_LOCK_EXIT(as, &as->a_lock); + goto lockerr; + } + break; + + /* + * Unlock mapped pages. + */ + case MC_UNLOCK: + (void) SEGOP_LOCKOP(seg, raddr, ssize, attr, func, + (ulong_t *)NULL, (size_t)NULL); + break; + + /* + * Store VM advise for mapped pages in segment layer. + */ + case MC_ADVISE: + error = SEGOP_ADVISE(seg, raddr, ssize, (uint_t)arg); + + /* + * Check for regular errors and special retry error + */ + if (error) { + if (error == IE_RETRY) { + /* + * Need to acquire writers lock, so + * have to drop readers lock and start + * all over again + */ + AS_LOCK_EXIT(as, &as->a_lock); + goto retry; + } else if (error == IE_REATTACH) { + /* + * Find segment for current address + * because current segment just got + * split or concatenated + */ + seg = as_segat(as, raddr); + if (seg == NULL) { + AS_LOCK_EXIT(as, &as->a_lock); + return (ENOMEM); + } + } else { + /* + * Regular error + */ + AS_LOCK_EXIT(as, &as->a_lock); + return (error); + } + } + break; + + /* + * Can't happen. + */ + default: + panic("as_ctl: bad operation %d", func); + /*NOTREACHED*/ + } + + rsize -= ssize; + raddr += ssize; + } + + if (func == MC_LOCK) + kmem_free(mlock_map, mlock_size * sizeof (ulong_t)); + AS_LOCK_EXIT(as, &as->a_lock); + return (0); +lockerr: + + /* + * If the lower levels returned EDEADLK for a segment lockop, + * it means that we should retry the operation. Let's wait + * a bit also to let the deadlock causing condition clear. + * This is part of a gross hack to work around a design flaw + * in the ufs/sds logging code and should go away when the + * logging code is re-designed to fix the problem. See bug + * 4125102 for details of the problem. + */ + if (error == EDEADLK) { + delay(deadlk_wait); + error = 0; + goto retry; + } + return (error); +} + +/* + * Special code for exec to move the stack segment from its interim + * place in the old address to the right place in the new address space. + */ +/*ARGSUSED*/ +int +as_exec(struct as *oas, caddr_t ostka, size_t stksz, + struct as *nas, caddr_t nstka, uint_t hatflag) +{ + struct seg *stkseg; + + AS_LOCK_ENTER(oas, &oas->a_lock, RW_WRITER); + stkseg = as_segat(oas, ostka); + stkseg = as_removeseg(oas, stkseg); + ASSERT(stkseg != NULL); + ASSERT(stkseg->s_base == ostka && stkseg->s_size == stksz); + stkseg->s_as = nas; + stkseg->s_base = nstka; + + /* + * It's ok to lock the address space we are about to exec to. + */ + AS_LOCK_ENTER(nas, &nas->a_lock, RW_WRITER); + ASSERT(avl_numnodes(&nas->a_wpage) == 0); + nas->a_size += stkseg->s_size; + oas->a_size -= stkseg->s_size; + (void) as_addseg(nas, stkseg); + AS_LOCK_EXIT(nas, &nas->a_lock); + AS_LOCK_EXIT(oas, &oas->a_lock); + return (0); +} + +static int +f_decode(faultcode_t fault_err) +{ + int error = 0; + + switch (FC_CODE(fault_err)) { + case FC_OBJERR: + error = FC_ERRNO(fault_err); + break; + case FC_PROT: + error = EACCES; + break; + default: + error = EFAULT; + break; + } + return (error); +} + +/* + * lock pages in a given address space. Return shadow list. 
If + * the list is NULL, the MMU mapping is also locked. + */ +int +as_pagelock(struct as *as, struct page ***ppp, caddr_t addr, + size_t size, enum seg_rw rw) +{ + size_t rsize; + caddr_t base; + caddr_t raddr; + faultcode_t fault_err; + struct seg *seg; + int res; + int prefaulted = 0; + + TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START, + "as_pagelock_start: addr %p size %ld", addr, size); + + raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); + rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - + (size_t)raddr; +top: + /* + * if the request crosses two segments let + * as_fault handle it. + */ + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + seg = as_findseg(as, addr, 0); + if ((seg == NULL) || ((base = seg->s_base) > addr) || + (addr + size) > base + seg->s_size) { + AS_LOCK_EXIT(as, &as->a_lock); + goto slow; + } + + TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START, + "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize); + + /* + * try to lock pages and pass back shadow list + */ + res = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw); + + TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end"); + AS_LOCK_EXIT(as, &as->a_lock); + if (res == 0) { + return (0); + } else if (res == ENOTSUP || prefaulted) { + /* + * (1) segment driver doesn't support PAGELOCK fastpath, or + * (2) we've already tried fast path unsuccessfully after + * faulting in the addr range below; system might be + * thrashing or there may not be enough availrmem. + */ + goto slow; + } + + TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_FAULT_START, + "as_fault_start: addr %p size %ld", addr, size); + + /* + * we might get here because of some COW fault or non + * existing page. Let as_fault deal with it. Just load + * the page, don't lock the MMU mapping. + */ + fault_err = as_fault(as->a_hat, as, addr, size, F_INVAL, rw); + if (fault_err != 0) { + return (f_decode(fault_err)); + } + + prefaulted = 1; + + /* + * try fast path again; since we've dropped a_lock, + * we need to try the dance from the start to see if + * the addr range is still valid. + */ + goto top; +slow: + /* + * load the page and lock the MMU mapping. 
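+ * The caller is expected to undo this with as_pageunlock(); when the
+ * returned shadow list is NULL, that routine falls back to an
+ * F_SOFTUNLOCK as_fault() over the same range.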
+ */ + fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw); + if (fault_err != 0) { + return (f_decode(fault_err)); + } + *ppp = NULL; + + TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end"); + return (0); +} + +/* + * unlock pages in a given address range + */ +void +as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size, + enum seg_rw rw) +{ + struct seg *seg; + size_t rsize; + caddr_t raddr; + + TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START, + "as_pageunlock_start: addr %p size %ld", addr, size); + + /* + * if the shadow list is NULL, as_pagelock was + * falling back to as_fault + */ + if (pp == NULL) { + (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw); + return; + } + raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); + rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - + (size_t)raddr; + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + seg = as_findseg(as, addr, 0); + ASSERT(seg); + TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START, + "seg_unlock_start: raddr %p rsize %ld", raddr, rsize); + SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw); + AS_LOCK_EXIT(as, &as->a_lock); + TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end"); +} + +/* + * reclaim cached pages in a given address range + */ +void +as_pagereclaim(struct as *as, struct page **pp, caddr_t addr, + size_t size, enum seg_rw rw) +{ + struct seg *seg; + size_t rsize; + caddr_t raddr; + + ASSERT(AS_READ_HELD(as, &as->a_lock)); + ASSERT(pp != NULL); + + raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); + rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - + (size_t)raddr; + seg = as_findseg(as, addr, 0); + ASSERT(seg); + SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGERECLAIM, rw); +} + +#define MAXPAGEFLIP 4 +#define MAXPAGEFLIPSIZ MAXPAGEFLIP*PAGESIZE + +int +as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc, + boolean_t wait) +{ + struct seg *seg; + size_t ssize; + caddr_t raddr; /* rounded down addr */ + size_t rsize; /* rounded up size */ + int error = 0; + size_t pgsz = page_get_pagesize(szc); + +setpgsz_top: + if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) { + return (EINVAL); + } + + raddr = addr; + rsize = size; + + if (raddr + rsize < raddr) /* check for wraparound */ + return (ENOMEM); + + AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); + as_clearwatchprot(as, raddr, rsize); + seg = as_segat(as, raddr); + if (seg == NULL) { + as_setwatch(as); + AS_LOCK_EXIT(as, &as->a_lock); + return (ENOMEM); + } + + for (; rsize != 0; rsize -= ssize, raddr += ssize) { + if (raddr >= seg->s_base + seg->s_size) { + seg = AS_SEGNEXT(as, seg); + if (seg == NULL || raddr != seg->s_base) { + error = ENOMEM; + break; + } + } + if ((raddr + rsize) > (seg->s_base + seg->s_size)) { + ssize = seg->s_base + seg->s_size - raddr; + } else { + ssize = rsize; + } + + error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc); + + if (error == IE_NOMEM) { + error = EAGAIN; + break; + } + + if (error == IE_RETRY) { + AS_LOCK_EXIT(as, &as->a_lock); + goto setpgsz_top; + } + + if (error == ENOTSUP) { + error = EINVAL; + break; + } + + if (wait && (error == EAGAIN)) { + /* + * Memory is currently locked. It must be unlocked + * before this operation can succeed through a retry. + * The possible reasons for locked memory and + * corresponding strategies for unlocking are: + * (1) Normal I/O + * wait for a signal that the I/O operation + * has completed and the memory is unlocked. 
+ * (2) Asynchronous I/O + * The aio subsystem does not unlock pages when + * the I/O is completed. Those pages are unlocked + * when the application calls aiowait/aioerror. + * So, to prevent blocking forever, cv_broadcast() + * is done to wake up aio_cleanup_thread. + * Subsequently, segvn_reclaim will be called, and + * that will do AS_CLRUNMAPWAIT() and wake us up. + * (3) Long term page locking: + * This is not relevant for as_setpagesize() + * because we cannot change the page size for + * driver memory. The attempt to do so will + * fail with a different error than EAGAIN so + * there's no need to trigger as callbacks like + * as_unmap, as_setprot or as_free would do. + */ + mutex_enter(&as->a_contents); + if (AS_ISUNMAPWAIT(as) == 0) { + cv_broadcast(&as->a_cv); + } + AS_SETUNMAPWAIT(as); + AS_LOCK_EXIT(as, &as->a_lock); + while (AS_ISUNMAPWAIT(as)) { + cv_wait(&as->a_cv, &as->a_contents); + } + mutex_exit(&as->a_contents); + goto setpgsz_top; + } else if (error != 0) { + break; + } + } + as_setwatch(as); + AS_LOCK_EXIT(as, &as->a_lock); + return (error); +} + +/* + * Setup all of the uninitialized watched pages that we can. + */ +void +as_setwatch(struct as *as) +{ + struct watched_page *pwp; + struct seg *seg; + caddr_t vaddr; + uint_t prot; + int err, retrycnt; + + if (avl_numnodes(&as->a_wpage) == 0) + return; + + ASSERT(AS_WRITE_HELD(as, &as->a_lock)); + + for (pwp = avl_first(&as->a_wpage); pwp != NULL; + pwp = AVL_NEXT(&as->a_wpage, pwp)) { + retrycnt = 0; + retry: + vaddr = pwp->wp_vaddr; + if (pwp->wp_oprot != 0 || /* already set up */ + (seg = as_segat(as, vaddr)) == NULL || + SEGOP_GETPROT(seg, vaddr, 0, &prot) != 0) + continue; + + pwp->wp_oprot = prot; + if (pwp->wp_read) + prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); + if (pwp->wp_write) + prot &= ~PROT_WRITE; + if (pwp->wp_exec) + prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); + if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) { + err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot); + if (err == IE_RETRY) { + pwp->wp_oprot = 0; + ASSERT(retrycnt == 0); + retrycnt++; + goto retry; + } + } + pwp->wp_prot = prot; + } +} + +/* + * Clear all of the watched pages in the address space. + */ +void +as_clearwatch(struct as *as) +{ + struct watched_page *pwp; + struct seg *seg; + caddr_t vaddr; + uint_t prot; + int err, retrycnt; + + if (avl_numnodes(&as->a_wpage) == 0) + return; + + ASSERT(AS_WRITE_HELD(as, &as->a_lock)); + + for (pwp = avl_first(&as->a_wpage); pwp != NULL; + pwp = AVL_NEXT(&as->a_wpage, pwp)) { + retrycnt = 0; + retry: + vaddr = pwp->wp_vaddr; + if (pwp->wp_oprot == 0 || /* not set up */ + (seg = as_segat(as, vaddr)) == NULL) + continue; + + if ((prot = pwp->wp_oprot) != pwp->wp_prot) { + err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot); + if (err == IE_RETRY) { + ASSERT(retrycnt == 0); + retrycnt++; + goto retry; + } + } + pwp->wp_oprot = 0; + pwp->wp_prot = 0; + } +} + +/* + * Force a new setup for all the watched pages in the range. 
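+ * That is, recompute each page's effective protection (wp_prot) from
+ * the supplied base protection and the page's read/write/exec watch
+ * flags, and push it to the segment layer via SEGOP_SETPROT().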
+ */ +static void +as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot) +{ + struct watched_page *pwp; + struct watched_page tpw; + caddr_t eaddr = addr + size; + caddr_t vaddr; + struct seg *seg; + int err, retrycnt; + uint_t wprot; + avl_index_t where; + + if (avl_numnodes(&as->a_wpage) == 0) + return; + + ASSERT(AS_WRITE_HELD(as, &as->a_lock)); + + tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); + if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL) + pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER); + + while (pwp != NULL && pwp->wp_vaddr < eaddr) { + retrycnt = 0; + vaddr = pwp->wp_vaddr; + + wprot = prot; + if (pwp->wp_read) + wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); + if (pwp->wp_write) + wprot &= ~PROT_WRITE; + if (pwp->wp_exec) + wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); + if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) { + retry: + seg = as_segat(as, vaddr); + if (seg == NULL) { + panic("as_setwatchprot: no seg"); + /*NOTREACHED*/ + } + err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, wprot); + if (err == IE_RETRY) { + ASSERT(retrycnt == 0); + retrycnt++; + goto retry; + } + } + pwp->wp_oprot = prot; + pwp->wp_prot = wprot; + + pwp = AVL_NEXT(&as->a_wpage, pwp); + } +} + +/* + * Clear all of the watched pages in the range. + */ +static void +as_clearwatchprot(struct as *as, caddr_t addr, size_t size) +{ + caddr_t eaddr = addr + size; + struct watched_page *pwp; + struct watched_page tpw; + uint_t prot; + struct seg *seg; + int err, retrycnt; + avl_index_t where; + + if (avl_numnodes(&as->a_wpage) == 0) + return; + + tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); + if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL) + pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER); + + ASSERT(AS_WRITE_HELD(as, &as->a_lock)); + + while (pwp != NULL && pwp->wp_vaddr < eaddr) { + ASSERT(addr >= pwp->wp_vaddr); + + if ((prot = pwp->wp_oprot) != 0) { + retrycnt = 0; + + if (prot != pwp->wp_prot) { + retry: + seg = as_segat(as, pwp->wp_vaddr); + if (seg == NULL) + continue; + err = SEGOP_SETPROT(seg, pwp->wp_vaddr, + PAGESIZE, prot); + if (err == IE_RETRY) { + ASSERT(retrycnt == 0); + retrycnt++; + goto retry; + + } + } + pwp->wp_oprot = 0; + pwp->wp_prot = 0; + } + + pwp = AVL_NEXT(&as->a_wpage, pwp); + } +} + +void +as_signal_proc(struct as *as, k_siginfo_t *siginfo) +{ + struct proc *p; + + mutex_enter(&pidlock); + for (p = practive; p; p = p->p_next) { + if (p->p_as == as) { + mutex_enter(&p->p_lock); + if (p->p_as == as) + sigaddq(p, NULL, siginfo, KM_NOSLEEP); + mutex_exit(&p->p_lock); + } + } + mutex_exit(&pidlock); +} + +/* + * return memory object ID + */ +int +as_getmemid(struct as *as, caddr_t addr, memid_t *memidp) +{ + struct seg *seg; + int sts; + + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + seg = as_segat(as, addr); + if (seg == NULL) { + AS_LOCK_EXIT(as, &as->a_lock); + return (EFAULT); + } + /* + * catch old drivers which may not support getmemid + */ + if (seg->s_ops->getmemid == NULL) { + AS_LOCK_EXIT(as, &as->a_lock); + return (ENODEV); + } + + sts = SEGOP_GETMEMID(seg, addr, memidp); + + AS_LOCK_EXIT(as, &as->a_lock); + return (sts); +} diff --git a/usr/src/uts/common/vm/vm_page.c b/usr/src/uts/common/vm/vm_page.c new file mode 100644 index 0000000000..67b4e58f0f --- /dev/null +++ b/usr/src/uts/common/vm/vm_page.c @@ -0,0 +1,6708 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the 
"License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * VM - physical page management. + */ + +#include <sys/types.h> +#include <sys/t_lock.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/time.h> +#include <sys/vnode.h> +#include <sys/vm.h> +#include <sys/vtrace.h> +#include <sys/swap.h> +#include <sys/cmn_err.h> +#include <sys/tuneable.h> +#include <sys/sysmacros.h> +#include <sys/cpuvar.h> +#include <sys/callb.h> +#include <sys/debug.h> +#include <sys/tnf_probe.h> +#include <sys/condvar_impl.h> +#include <sys/mem_config.h> +#include <sys/mem_cage.h> +#include <sys/kmem.h> +#include <sys/atomic.h> +#include <sys/strlog.h> +#include <sys/mman.h> +#include <sys/ontrap.h> +#include <sys/lgrp.h> +#include <sys/vfs.h> + +#include <vm/hat.h> +#include <vm/anon.h> +#include <vm/page.h> +#include <vm/seg.h> +#include <vm/pvn.h> +#include <vm/seg_kmem.h> +#include <vm/vm_dep.h> + +#include <fs/fs_subr.h> + +static int nopageage = 0; + +static pgcnt_t max_page_get; /* max page_get request size in pages */ +pgcnt_t total_pages = 0; /* total number of pages (used by /proc) */ + +/* + * vnode for all pages which are retired from the VM system; + * such as pages with Uncorrectable Errors. + */ +struct vnode retired_ppages; + +static void page_retired_init(void); +static void retired_dispose(vnode_t *vp, page_t *pp, int flag, + int dn, cred_t *cr); +static void retired_inactive(vnode_t *vp, cred_t *cr); +static void page_retired(page_t *pp); +static void retired_page_removed(page_t *pp); +void page_unretire_pages(void); + +/* + * The maximum number of pages that will be unretired in one iteration. + * This number is totally arbitrary. + */ +#define UNRETIRE_PAGES 256 + +/* + * We limit the number of pages that may be retired to + * a percentage of the total physical memory. Note that + * the percentage values are stored as 'basis points', + * ie, 100 basis points is 1%. + */ +#define MAX_PAGES_RETIRED_BPS_DEFAULT 10 /* .1% */ + +uint64_t max_pages_retired_bps = MAX_PAGES_RETIRED_BPS_DEFAULT; + +static int pages_retired_limit_exceeded(void); + +/* + * operations vector for vnode with retired pages. Only VOP_DISPOSE + * and VOP_INACTIVE are intercepted. 
+ */ +struct vnodeops retired_vnodeops = { + "retired_vnodeops", + fs_nosys, /* open */ + fs_nosys, /* close */ + fs_nosys, /* read */ + fs_nosys, /* write */ + fs_nosys, /* ioctl */ + fs_nosys, /* setfl */ + fs_nosys, /* getattr */ + fs_nosys, /* setattr */ + fs_nosys, /* access */ + fs_nosys, /* lookup */ + fs_nosys, /* create */ + fs_nosys, /* remove */ + fs_nosys, /* link */ + fs_nosys, /* rename */ + fs_nosys, /* mkdir */ + fs_nosys, /* rmdir */ + fs_nosys, /* readdir */ + fs_nosys, /* symlink */ + fs_nosys, /* readlink */ + fs_nosys, /* fsync */ + retired_inactive, + fs_nosys, /* fid */ + fs_rwlock, /* rwlock */ + fs_rwunlock, /* rwunlock */ + fs_nosys, /* seek */ + fs_nosys, /* cmp */ + fs_nosys, /* frlock */ + fs_nosys, /* space */ + fs_nosys, /* realvp */ + fs_nosys, /* getpage */ + fs_nosys, /* putpage */ + fs_nosys_map, + fs_nosys_addmap, + fs_nosys, /* delmap */ + fs_nosys_poll, + fs_nosys, /* dump */ + fs_nosys, /* l_pathconf */ + fs_nosys, /* pageio */ + fs_nosys, /* dumpctl */ + retired_dispose, + fs_nosys, /* setsecattr */ + fs_nosys, /* getsecatt */ + fs_nosys, /* shrlock */ + fs_vnevent_nosupport /* vnevent */ +}; + +/* + * freemem_lock protects all freemem variables: + * availrmem. Also this lock protects the globals which track the + * availrmem changes for accurate kernel footprint calculation. + * See below for an explanation of these + * globals. + */ +kmutex_t freemem_lock; +pgcnt_t availrmem; +pgcnt_t availrmem_initial; + +/* + * These globals track availrmem changes to get a more accurate + * estimate of tke kernel size. Historically pp_kernel is used for + * kernel size and is based on availrmem. But availrmem is adjusted for + * locked pages in the system not just for kernel locked pages. + * These new counters will track the pages locked through segvn and + * by explicit user locking. + * + * segvn_pages_locked : This keeps track on a global basis how many pages + * are currently locked because of I/O. + * + * pages_locked : How many pages are locked becuase of user specified + * locking through mlock or plock. + * + * pages_useclaim,pages_claimed : These two variables track the + * cliam adjustments because of the protection changes on a segvn segment. + * + * All these globals are protected by the same lock which protects availrmem. + */ +pgcnt_t segvn_pages_locked; +pgcnt_t pages_locked; +pgcnt_t pages_useclaim; +pgcnt_t pages_claimed; + + +/* + * new_freemem_lock protects freemem, freemem_wait & freemem_cv. + */ +static kmutex_t new_freemem_lock; +static uint_t freemem_wait; /* someone waiting for freemem */ +static kcondvar_t freemem_cv; + +/* + * The logical page free list is maintained as two lists, the 'free' + * and the 'cache' lists. + * The free list contains those pages that should be reused first. + * + * The implementation of the lists is machine dependent. + * page_get_freelist(), page_get_cachelist(), + * page_list_sub(), and page_list_add() + * form the interface to the machine dependent implementation. + * + * Pages with p_free set are on the cache list. + * Pages with p_free and p_age set are on the free list, + * + * A page may be locked while on either list. + */ + +/* + * free list accounting stuff. + * + * + * Spread out the value for the number of pages on the + * page free and page cache lists. If there is just one + * value, then it must be under just one lock. + * The lock contention and cache traffic are a real bother. 
+ *
+ * When we acquire and then drop a single pcf lock
+ * we can start in the middle of the array of pcf structures.
+ * If we acquire more than one pcf lock at a time, we need to
+ * start at the front to avoid deadlocking.
+ *
+ * pcf_count holds the number of pages in each pool.
+ *
+ * pcf_block is set when page_create_get_something() has asked the
+ * PSM page freelist and page cachelist routines without specifying
+ * a color and nothing came back. This is used to block anything
+ * else from moving pages from one list to the other while the
+ * lists are searched again. If a page is freed while pcf_block is
+ * set, then pcf_reserve is incremented. pcgs_unblock() takes care
+ * of clearing pcf_block, doing the wakeups, etc.
+ */
+
+#if NCPU <= 4
+#define PAD 1
+#define PCF_FANOUT 4
+static uint_t pcf_mask = PCF_FANOUT - 1;
+#else
+#define PAD 9
+#ifdef sun4v
+#define PCF_FANOUT 32
+#else
+#define PCF_FANOUT 128
+#endif
+static uint_t pcf_mask = PCF_FANOUT - 1;
+#endif
+
+struct pcf {
+ uint_t pcf_touch; /* just to help the cache */
+ uint_t pcf_count; /* page count */
+ kmutex_t pcf_lock; /* protects the structure */
+ uint_t pcf_wait; /* number of waiters */
+ uint_t pcf_block; /* pcgs flag to page_free() */
+ uint_t pcf_reserve; /* pages freed after pcf_block set */
+ uint_t pcf_fill[PAD]; /* to line up on the caches */
+};
+
+static struct pcf pcf[PCF_FANOUT];
+#define PCF_INDEX() ((CPU->cpu_id) & (pcf_mask))
+
+kmutex_t pcgs_lock; /* serializes page_create_get_ */
+kmutex_t pcgs_cagelock; /* serializes NOSLEEP cage allocs */
+kmutex_t pcgs_wait_lock; /* used for delay in pcgs */
+static kcondvar_t pcgs_cv; /* cv for delay in pcgs */
+
+#define PAGE_LOCK_MAXIMUM \
+ ((1 << (sizeof (((page_t *)0)->p_lckcnt) * NBBY)) - 1)
+
+/*
+ * Control over the verbosity of page retirement. When set to zero, no messages
+ * will be printed. A value of one will trigger messages for retirement
+ * operations, and is intended for processors which don't yet support FMA
+ * (spitfire). Two will cause verbose messages to be printed when retirements
+ * complete, and is intended only for debugging purposes.
+ */
+int page_retire_messages = 0;
+
+#ifdef VM_STATS
+
+/*
+ * No locks, but so what, they are only statistics.
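The fanout described above exists purely to spread lock traffic: a CPU normally touches only the bucket selected by PCF_INDEX(), and an approximate freemem is recovered by summing the buckets (see set_freemem() and get_freemem() later in this file). Below is a stand-alone model of that pattern, using pthread mutexes in place of kmutex_t and a caller-supplied id in place of CPU->cpu_id; the names and the fanout value of 4 are illustrative.

/* Illustrative user-level model of the pcf[] fanout; not kernel code. */
#include <pthread.h>
#include <stdio.h>

#define FANOUT	4			/* kernel uses 4, 32 or 128 */

struct bucket {
	pthread_mutex_t	lock;
	unsigned int	count;		/* pages accounted in this bucket */
};

static struct bucket buckets[FANOUT];

/* Equivalent of PCF_INDEX(): hash the caller onto one bucket. */
#define BUCKET_INDEX(id)	((id) & (FANOUT - 1))

static void
bucket_free_pages(unsigned int id, unsigned int npages)
{
	struct bucket *b = &buckets[BUCKET_INDEX(id)];

	pthread_mutex_lock(&b->lock);	/* only this bucket is contended */
	b->count += npages;
	pthread_mutex_unlock(&b->lock);
}

/* Like get_freemem(): sum the buckets for an approximate total. */
static unsigned long
bucket_total(void)
{
	unsigned long t = 0;
	int i;

	for (i = 0; i < FANOUT; i++)
		t += buckets[i].count;
	return (t);
}

int
main(void)
{
	int i;

	for (i = 0; i < FANOUT; i++)
		pthread_mutex_init(&buckets[i].lock, NULL);
	bucket_free_pages(0, 10);
	bucket_free_pages(3, 5);
	printf("total = %lu\n", bucket_total());	/* prints 15 */
	return (0);
}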
+ */ + +static struct page_tcnt { + int pc_free_cache; /* free's into cache list */ + int pc_free_dontneed; /* free's with dontneed */ + int pc_free_pageout; /* free's from pageout */ + int pc_free_free; /* free's into free list */ + int pc_free_pages; /* free's into large page free list */ + int pc_destroy_pages; /* large page destroy's */ + int pc_get_cache; /* get's from cache list */ + int pc_get_free; /* get's from free list */ + int pc_reclaim; /* reclaim's */ + int pc_abortfree; /* abort's of free pages */ + int pc_find_hit; /* find's that find page */ + int pc_find_miss; /* find's that don't find page */ + int pc_destroy_free; /* # of free pages destroyed */ +#define PC_HASH_CNT (4*PAGE_HASHAVELEN) + int pc_find_hashlen[PC_HASH_CNT+1]; + int pc_addclaim_pages; + int pc_subclaim_pages; + int pc_free_replacement_page[2]; + int pc_try_demote_pages[6]; + int pc_demote_pages[2]; +} pagecnt; + +uint_t hashin_count; +uint_t hashin_not_held; +uint_t hashin_already; + +uint_t hashout_count; +uint_t hashout_not_held; + +uint_t page_create_count; +uint_t page_create_not_enough; +uint_t page_create_not_enough_again; +uint_t page_create_zero; +uint_t page_create_hashout; +uint_t page_create_page_lock_failed; +uint_t page_create_trylock_failed; +uint_t page_create_found_one; +uint_t page_create_hashin_failed; +uint_t page_create_dropped_phm; + +uint_t page_create_new; +uint_t page_create_exists; +uint_t page_create_putbacks; +uint_t page_create_overshoot; + +uint_t page_reclaim_zero; +uint_t page_reclaim_zero_locked; + +uint_t page_rename_exists; +uint_t page_rename_count; + +uint_t page_lookup_cnt[20]; +uint_t page_lookup_nowait_cnt[10]; +uint_t page_find_cnt; +uint_t page_exists_cnt; +uint_t page_exists_forreal_cnt; +uint_t page_lookup_dev_cnt; +uint_t get_cachelist_cnt; +uint_t page_create_cnt[10]; +uint_t alloc_pages[8]; +uint_t page_exphcontg[19]; +uint_t page_create_large_cnt[10]; + +/* + * Collects statistics. 
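The PAGE_HASH_SEARCH macro defined just below walks one hash chain looking for a matching (vnode, offset) identity and, when VM_STATS is enabled, feeds the pc_find_hit, pc_find_miss and pc_find_hashlen counters above. A simplified stand-alone version of the same walk is sketched here over a toy node type; the struct and counter names are invented for the example.

/* Illustrative chain walk modeled on PAGE_HASH_SEARCH; not kernel code. */
#include <stdio.h>

struct node {
	const void		*vp;	/* identity: "vnode" */
	unsigned long long	off;	/* identity: offset */
	struct node		*next;	/* hash chain link */
};

static int hits, misses, longest;

static struct node *
chain_search(struct node *head, const void *vp, unsigned long long off)
{
	struct node *n;
	int len = 0;

	for (n = head; n != NULL; n = n->next, len++) {
		if (n->vp == vp && n->off == off)
			break;
	}
	if (n != NULL)
		hits++;
	else
		misses++;
	if (len > longest)
		longest = len;
	return (n);
}

int
main(void)
{
	int dummy_vnode;
	struct node b = { &dummy_vnode, 8192, NULL };
	struct node a = { &dummy_vnode, 0, &b };

	printf("found: %s, hits %d, misses %d, longest chain walk %d\n",
	    chain_search(&a, &dummy_vnode, 8192) != NULL ? "yes" : "no",
	    hits, misses, longest);
	return (0);
}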
+ */ +#define PAGE_HASH_SEARCH(index, pp, vp, off) { \ + uint_t mylen = 0; \ + \ + for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash, mylen++) { \ + if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \ + break; \ + } \ + if ((pp) != NULL) \ + pagecnt.pc_find_hit++; \ + else \ + pagecnt.pc_find_miss++; \ + if (mylen > PC_HASH_CNT) \ + mylen = PC_HASH_CNT; \ + pagecnt.pc_find_hashlen[mylen]++; \ +} + +#else /* VM_STATS */ + +/* + * Don't collect statistics + */ +#define PAGE_HASH_SEARCH(index, pp, vp, off) { \ + for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \ + if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \ + break; \ + } \ +} + +#endif /* VM_STATS */ + + + +#ifdef DEBUG +#define MEMSEG_SEARCH_STATS +#endif + +#ifdef MEMSEG_SEARCH_STATS +struct memseg_stats { + uint_t nsearch; + uint_t nlastwon; + uint_t nhashwon; + uint_t nnotfound; +} memseg_stats; + +#define MEMSEG_STAT_INCR(v) \ + atomic_add_32(&memseg_stats.v, 1) +#else +#define MEMSEG_STAT_INCR(x) +#endif + +struct memseg *memsegs; /* list of memory segments */ + + +static void page_init_mem_config(void); +static int page_do_hashin(page_t *, vnode_t *, u_offset_t); +static void page_do_hashout(page_t *); + +static void page_demote_vp_pages(page_t *); + +/* + * vm subsystem related initialization + */ +void +vm_init(void) +{ + boolean_t callb_vm_cpr(void *, int); + + (void) callb_add(callb_vm_cpr, 0, CB_CL_CPR_VM, "vm"); + page_init_mem_config(); + + /* + * initialise the vnode for retired pages + */ + page_retired_init(); +} + +/* + * This function is called at startup and when memory is added or deleted. + */ +void +init_pages_pp_maximum() +{ + static pgcnt_t p_min; + static pgcnt_t pages_pp_maximum_startup; + static pgcnt_t avrmem_delta; + static int init_done; + static int user_set; /* true if set in /etc/system */ + + if (init_done == 0) { + + /* If the user specified a value, save it */ + if (pages_pp_maximum != 0) { + user_set = 1; + pages_pp_maximum_startup = pages_pp_maximum; + } + + /* + * Setting of pages_pp_maximum is based first time + * on the value of availrmem just after the start-up + * allocations. To preserve this relationship at run + * time, use a delta from availrmem_initial. + */ + ASSERT(availrmem_initial >= availrmem); + avrmem_delta = availrmem_initial - availrmem; + + /* The allowable floor of pages_pp_maximum */ + p_min = tune.t_minarmem + 100; + + /* Make sure we don't come through here again. */ + init_done = 1; + } + /* + * Determine pages_pp_maximum, the number of currently available + * pages (availrmem) that can't be `locked'. If not set by + * the user, we set it to 4% of the currently available memory + * plus 4MB. + * But we also insist that it be greater than tune.t_minarmem; + * otherwise a process could lock down a lot of memory, get swapped + * out, and never have enough to get swapped back in. 
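The computation that follows implements the rule just described: the divisor of 25 is exactly 1/25 = 4%, and btop(4 * 1024 * 1024) adds the 4MB worth of pages. A worked example with assumed numbers (8K pages, roughly 2GB available after startup):

/*
 * Worked example of the 4% + 4MB floor described above.  The page size and
 * available-memory figure are assumptions for illustration only.
 */
#include <stdio.h>

int
main(void)
{
	unsigned long pagesize = 8192;			/* assumed */
	unsigned long avail_pages = 262144;		/* ~2GB / 8K, assumed */
	unsigned long four_mb_pages = (4UL * 1024 * 1024) / pagesize;	/* 512 */
	unsigned long pp_max = avail_pages / 25 + four_mb_pages;

	/* 262144 / 25 = 10485, plus 512 = 10997 pages (~86MB) kept unlockable. */
	printf("pages_pp_maximum ~= %lu pages\n", pp_max);
	return (0);
}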
+ */ + if (user_set) + pages_pp_maximum = pages_pp_maximum_startup; + else + pages_pp_maximum = ((availrmem_initial - avrmem_delta) / 25) + + btop(4 * 1024 * 1024); + + if (pages_pp_maximum <= p_min) { + pages_pp_maximum = p_min; + } +} + +void +set_max_page_get(pgcnt_t target_total_pages) +{ + max_page_get = target_total_pages / 2; +} + +static pgcnt_t pending_delete; + +/*ARGSUSED*/ +static void +page_mem_config_post_add( + void *arg, + pgcnt_t delta_pages) +{ + set_max_page_get(total_pages - pending_delete); + init_pages_pp_maximum(); +} + +/*ARGSUSED*/ +static int +page_mem_config_pre_del( + void *arg, + pgcnt_t delta_pages) +{ + pgcnt_t nv; + + nv = atomic_add_long_nv(&pending_delete, (spgcnt_t)delta_pages); + set_max_page_get(total_pages - nv); + return (0); +} + +/*ARGSUSED*/ +static void +page_mem_config_post_del( + void *arg, + pgcnt_t delta_pages, + int cancelled) +{ + pgcnt_t nv; + + nv = atomic_add_long_nv(&pending_delete, -(spgcnt_t)delta_pages); + set_max_page_get(total_pages - nv); + if (!cancelled) + init_pages_pp_maximum(); +} + +static kphysm_setup_vector_t page_mem_config_vec = { + KPHYSM_SETUP_VECTOR_VERSION, + page_mem_config_post_add, + page_mem_config_pre_del, + page_mem_config_post_del, +}; + +static void +page_init_mem_config(void) +{ + int ret; + + ret = kphysm_setup_func_register(&page_mem_config_vec, (void *)NULL); + ASSERT(ret == 0); +} + +/* + * Evenly spread out the PCF counters for large free pages + */ +static void +page_free_large_ctr(pgcnt_t npages) +{ + static struct pcf *p = pcf; + pgcnt_t lump; + + freemem += npages; + + lump = roundup(npages, PCF_FANOUT) / PCF_FANOUT; + + while (npages > 0) { + + ASSERT(!p->pcf_block); + + if (lump < npages) { + p->pcf_count += (uint_t)lump; + npages -= lump; + } else { + p->pcf_count += (uint_t)npages; + npages = 0; + } + + ASSERT(!p->pcf_wait); + + if (++p > &pcf[PCF_FANOUT - 1]) + p = pcf; + } + + ASSERT(npages == 0); +} + +/* + * Add a physical chunk of memory to the system freee lists during startup. + * Platform specific startup() allocates the memory for the page structs. + * + * num - number of page structures + * base - page number (pfn) to be associated with the first page. + * + * Since we are doing this during startup (ie. single threaded), we will + * use shortcut routines to avoid any locking overhead while putting all + * these pages on the freelists. + * + * NOTE: Any changes performed to page_free(), must also be performed to + * add_physmem() since this is how we initialize all page_t's at + * boot time. + */ +void +add_physmem( + page_t *pp, + pgcnt_t num, + pfn_t pnum) +{ + page_t *root = NULL; + uint_t szc = page_num_pagesizes() - 1; + pgcnt_t large = page_get_pagecnt(szc); + pgcnt_t cnt = 0; + + TRACE_2(TR_FAC_VM, TR_PAGE_INIT, + "add_physmem:pp %p num %lu", pp, num); + + /* + * Arbitrarily limit the max page_get request + * to 1/2 of the page structs we have. + */ + total_pages += num; + set_max_page_get(total_pages); + + /* + * The physical space for the pages array + * representing ram pages has already been + * allocated. Here we initialize each lock + * in the page structure, and put each on + * the free list + */ + for (; num; pp = page_next_raw(pp), pnum++, num--) { + + /* + * this needs to fill in the page number + * and do any other arch specific initialization + */ + add_physmem_cb(pp, pnum); + + /* + * Initialize the page lock as unlocked, since nobody + * can see or access this page yet. 
+ */ + pp->p_selock = 0; + + /* + * Initialize IO lock + */ + page_iolock_init(pp); + + /* + * initialize other fields in the page_t + */ + PP_SETFREE(pp); + page_clr_all_props(pp); + PP_SETAGED(pp); + pp->p_offset = (u_offset_t)-1; + pp->p_next = pp; + pp->p_prev = pp; + + /* + * Simple case: System doesn't support large pages. + */ + if (szc == 0) { + pp->p_szc = 0; + page_free_at_startup(pp); + continue; + } + + /* + * Handle unaligned pages, we collect them up onto + * the root page until we have a full large page. + */ + if (!IS_P2ALIGNED(pnum, large)) { + + /* + * If not in a large page, + * just free as small page. + */ + if (root == NULL) { + pp->p_szc = 0; + page_free_at_startup(pp); + continue; + } + + /* + * Link a constituent page into the large page. + */ + pp->p_szc = szc; + page_list_concat(&root, &pp); + + /* + * When large page is fully formed, free it. + */ + if (++cnt == large) { + page_free_large_ctr(cnt); + page_list_add_pages(root, PG_LIST_ISINIT); + root = NULL; + cnt = 0; + } + continue; + } + + /* + * At this point we have a page number which + * is aligned. We assert that we aren't already + * in a different large page. + */ + ASSERT(IS_P2ALIGNED(pnum, large)); + ASSERT(root == NULL && cnt == 0); + + /* + * If insufficient number of pages left to form + * a large page, just free the small page. + */ + if (num < large) { + pp->p_szc = 0; + page_free_at_startup(pp); + continue; + } + + /* + * Otherwise start a new large page. + */ + pp->p_szc = szc; + cnt++; + root = pp; + } + ASSERT(root == NULL && cnt == 0); +} + +/* + * Find a page representing the specified [vp, offset]. + * If we find the page but it is intransit coming in, + * it will have an "exclusive" lock and we wait for + * the i/o to complete. A page found on the free list + * is always reclaimed and then locked. On success, the page + * is locked, its data is valid and it isn't on the free + * list, while a NULL is returned if the page doesn't exist. + */ +page_t * +page_lookup(vnode_t *vp, u_offset_t off, se_t se) +{ + return (page_lookup_create(vp, off, se, NULL, NULL, 0)); +} + +/* + * Find a page representing the specified [vp, offset]. + * We either return the one we found or, if passed in, + * create one with identity of [vp, offset] of the + * pre-allocated page. If we find exsisting page but it is + * intransit coming in, it will have an "exclusive" lock + * and we wait for the i/o to complete. A page found on + * the free list is always reclaimed and then locked. + * On success, the page is locked, its data is valid and + * it isn't on the free list, while a NULL is returned + * if the page doesn't exist and newpp is NULL; + */ +page_t * +page_lookup_create( + vnode_t *vp, + u_offset_t off, + se_t se, + page_t *newpp, + spgcnt_t *nrelocp, + int flags) +{ + page_t *pp; + kmutex_t *phm; + ulong_t index; + uint_t hash_locked; + uint_t es; + + ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); + VM_STAT_ADD(page_lookup_cnt[0]); + ASSERT(newpp ? PAGE_EXCL(newpp) : 1); + + /* + * Acquire the appropriate page hash lock since + * we have to search the hash list. Pages that + * hash to this list can't change identity while + * this lock is held. + */ + hash_locked = 0; + index = PAGE_HASH_FUNC(vp, off); + phm = NULL; +top: + PAGE_HASH_SEARCH(index, pp, vp, off); + if (pp != NULL) { + VM_STAT_ADD(page_lookup_cnt[1]); + es = (newpp != NULL) ? 1 : 0; + es |= flags; + if (!hash_locked) { + VM_STAT_ADD(page_lookup_cnt[2]); + if (!page_try_reclaim_lock(pp, se, es)) { + /* + * On a miss, acquire the phm. 
Then + * next time, page_lock() will be called, + * causing a wait if the page is busy. + * just looping with page_trylock() would + * get pretty boring. + */ + VM_STAT_ADD(page_lookup_cnt[3]); + phm = PAGE_HASH_MUTEX(index); + mutex_enter(phm); + hash_locked = 1; + goto top; + } + } else { + VM_STAT_ADD(page_lookup_cnt[4]); + if (!page_lock_es(pp, se, phm, P_RECLAIM, es)) { + VM_STAT_ADD(page_lookup_cnt[5]); + goto top; + } + } + + /* + * Since `pp' is locked it can not change identity now. + * Reconfirm we locked the correct page. + * + * Both the p_vnode and p_offset *must* be cast volatile + * to force a reload of their values: The PAGE_HASH_SEARCH + * macro will have stuffed p_vnode and p_offset into + * registers before calling page_trylock(); another thread, + * actually holding the hash lock, could have changed the + * page's identity in memory, but our registers would not + * be changed, fooling the reconfirmation. If the hash + * lock was held during the search, the casting would + * not be needed. + */ + VM_STAT_ADD(page_lookup_cnt[6]); + if (((volatile struct vnode *)(pp->p_vnode) != vp) || + ((volatile u_offset_t)(pp->p_offset) != off)) { + VM_STAT_ADD(page_lookup_cnt[7]); + if (hash_locked) { + panic("page_lookup_create: lost page %p", + (void *)pp); + /*NOTREACHED*/ + } + page_unlock(pp); + phm = PAGE_HASH_MUTEX(index); + mutex_enter(phm); + hash_locked = 1; + goto top; + } + + /* + * If page_trylock() was called, then pp may still be on + * the cachelist (can't be on the free list, it would not + * have been found in the search). If it is on the + * cachelist it must be pulled now. To pull the page from + * the cachelist, it must be exclusively locked. + * + * The other big difference between page_trylock() and + * page_lock(), is that page_lock() will pull the + * page from whatever free list (the cache list in this + * case) the page is on. If page_trylock() was used + * above, then we have to do the reclaim ourselves. + */ + if ((!hash_locked) && (PP_ISFREE(pp))) { + ASSERT(PP_ISAGED(pp) == 0); + VM_STAT_ADD(page_lookup_cnt[8]); + + /* + * page_relcaim will insure that we + * have this page exclusively + */ + + if (!page_reclaim(pp, NULL)) { + /* + * Page_reclaim dropped whatever lock + * we held. 
+ */ + VM_STAT_ADD(page_lookup_cnt[9]); + phm = PAGE_HASH_MUTEX(index); + mutex_enter(phm); + hash_locked = 1; + goto top; + } else if (se == SE_SHARED && newpp == NULL) { + VM_STAT_ADD(page_lookup_cnt[10]); + page_downgrade(pp); + } + } + + if (hash_locked) { + mutex_exit(phm); + } + + if (newpp != NULL && pp->p_szc < newpp->p_szc && + PAGE_EXCL(pp) && nrelocp != NULL) { + ASSERT(nrelocp != NULL); + (void) page_relocate(&pp, &newpp, 1, 1, nrelocp, + NULL); + if (*nrelocp > 0) { + VM_STAT_COND_ADD(*nrelocp == 1, + page_lookup_cnt[11]); + VM_STAT_COND_ADD(*nrelocp > 1, + page_lookup_cnt[12]); + pp = newpp; + se = SE_EXCL; + } else { + if (se == SE_SHARED) { + page_downgrade(pp); + } + VM_STAT_ADD(page_lookup_cnt[13]); + } + } else if (newpp != NULL && nrelocp != NULL) { + if (PAGE_EXCL(pp) && se == SE_SHARED) { + page_downgrade(pp); + } + VM_STAT_COND_ADD(pp->p_szc < newpp->p_szc, + page_lookup_cnt[14]); + VM_STAT_COND_ADD(pp->p_szc == newpp->p_szc, + page_lookup_cnt[15]); + VM_STAT_COND_ADD(pp->p_szc > newpp->p_szc, + page_lookup_cnt[16]); + } else if (newpp != NULL && PAGE_EXCL(pp)) { + se = SE_EXCL; + } + } else if (!hash_locked) { + VM_STAT_ADD(page_lookup_cnt[17]); + phm = PAGE_HASH_MUTEX(index); + mutex_enter(phm); + hash_locked = 1; + goto top; + } else if (newpp != NULL) { + /* + * If we have a preallocated page then + * insert it now and basically behave like + * page_create. + */ + VM_STAT_ADD(page_lookup_cnt[18]); + /* + * Since we hold the page hash mutex and + * just searched for this page, page_hashin + * had better not fail. If it does, that + * means some thread did not follow the + * page hash mutex rules. Panic now and + * get it over with. As usual, go down + * holding all the locks. + */ + ASSERT(MUTEX_HELD(phm)); + if (!page_hashin(newpp, vp, off, phm)) { + ASSERT(MUTEX_HELD(phm)); + panic("page_lookup_create: hashin failed %p %p %llx %p", + (void *)newpp, (void *)vp, off, (void *)phm); + /*NOTREACHED*/ + } + ASSERT(MUTEX_HELD(phm)); + mutex_exit(phm); + phm = NULL; + page_set_props(newpp, P_REF); + page_io_lock(newpp); + pp = newpp; + se = SE_EXCL; + } else { + VM_STAT_ADD(page_lookup_cnt[19]); + mutex_exit(phm); + } + + ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1); + + ASSERT(pp ? ((PP_ISFREE(pp) == 0) && (PP_ISAGED(pp) == 0)) : 1); + + return (pp); +} + +/* + * Search the hash list for the page representing the + * specified [vp, offset] and return it locked. Skip + * free pages and pages that cannot be locked as requested. + * Used while attempting to kluster pages. 
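Both page_lookup_create() above and page_lookup_nowait() below rely on the same optimistic pattern: find a candidate without the hash lock, lock the candidate, then re-verify its identity, falling back to a search under the hash mutex on a mismatch. The stand-alone sketch below keeps only that skeleton; the types and the single hash_lock are illustrative and ignore the reclaim and shared/exclusive-lock details handled by the real code.

/* Illustrative lookup-then-reverify pattern; not kernel code. */
#include <pthread.h>
#include <stdio.h>

struct obj {
	pthread_mutex_t		lock;
	const void		*vp;
	unsigned long long	off;
};

static pthread_mutex_t hash_lock = PTHREAD_MUTEX_INITIALIZER;

static struct obj *
lookup(struct obj *candidate, const void *vp, unsigned long long off)
{
	pthread_mutex_lock(&candidate->lock);
	/* Re-verify identity now that the object cannot change under us. */
	if (candidate->vp != vp || candidate->off != off) {
		pthread_mutex_unlock(&candidate->lock);
		/* Mismatch: redo the search with the hash lock held. */
		pthread_mutex_lock(&hash_lock);
		/* ... a real caller would search the chain again here ... */
		pthread_mutex_unlock(&hash_lock);
		return (NULL);
	}
	return (candidate);		/* returned locked */
}

int
main(void)
{
	int vnode;
	struct obj o = { PTHREAD_MUTEX_INITIALIZER, &vnode, 4096 };

	if (lookup(&o, &vnode, 4096) != NULL) {
		printf("identity confirmed under the object lock\n");
		pthread_mutex_unlock(&o.lock);
	}
	return (0);
}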
+ */ +page_t * +page_lookup_nowait(vnode_t *vp, u_offset_t off, se_t se) +{ + page_t *pp; + kmutex_t *phm; + ulong_t index; + uint_t locked; + + ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); + VM_STAT_ADD(page_lookup_nowait_cnt[0]); + + index = PAGE_HASH_FUNC(vp, off); + PAGE_HASH_SEARCH(index, pp, vp, off); + locked = 0; + if (pp == NULL) { +top: + VM_STAT_ADD(page_lookup_nowait_cnt[1]); + locked = 1; + phm = PAGE_HASH_MUTEX(index); + mutex_enter(phm); + PAGE_HASH_SEARCH(index, pp, vp, off); + } + + if (pp == NULL || PP_ISFREE(pp)) { + VM_STAT_ADD(page_lookup_nowait_cnt[2]); + pp = NULL; + } else { + if (!page_trylock(pp, se)) { + VM_STAT_ADD(page_lookup_nowait_cnt[3]); + pp = NULL; + } else { + VM_STAT_ADD(page_lookup_nowait_cnt[4]); + /* + * See the comment in page_lookup() + */ + if (((volatile struct vnode *)(pp->p_vnode) != vp) || + ((u_offset_t)(pp->p_offset) != off)) { + VM_STAT_ADD(page_lookup_nowait_cnt[5]); + if (locked) { + panic("page_lookup_nowait %p", + (void *)pp); + /*NOTREACHED*/ + } + page_unlock(pp); + goto top; + } + if (PP_ISFREE(pp)) { + VM_STAT_ADD(page_lookup_nowait_cnt[6]); + page_unlock(pp); + pp = NULL; + } + } + } + if (locked) { + VM_STAT_ADD(page_lookup_nowait_cnt[7]); + mutex_exit(phm); + } + + ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1); + + return (pp); +} + +/* + * Search the hash list for a page with the specified [vp, off] + * that is known to exist and is already locked. This routine + * is typically used by segment SOFTUNLOCK routines. + */ +page_t * +page_find(vnode_t *vp, u_offset_t off) +{ + page_t *pp; + kmutex_t *phm; + ulong_t index; + + ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); + VM_STAT_ADD(page_find_cnt); + + index = PAGE_HASH_FUNC(vp, off); + phm = PAGE_HASH_MUTEX(index); + + mutex_enter(phm); + PAGE_HASH_SEARCH(index, pp, vp, off); + mutex_exit(phm); + + ASSERT(pp != NULL); + ASSERT(PAGE_LOCKED(pp) || panicstr); + return (pp); +} + +/* + * Determine whether a page with the specified [vp, off] + * currently exists in the system. Obviously this should + * only be considered as a hint since nothing prevents the + * page from disappearing or appearing immediately after + * the return from this routine. Subsequently, we don't + * even bother to lock the list. + */ +page_t * +page_exists(vnode_t *vp, u_offset_t off) +{ + page_t *pp; + ulong_t index; + + ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); + VM_STAT_ADD(page_exists_cnt); + + index = PAGE_HASH_FUNC(vp, off); + PAGE_HASH_SEARCH(index, pp, vp, off); + + return (pp); +} + +/* + * Determine if physically contiguous pages exist for [vp, off] - [vp, off + + * page_size(szc)) range. if they exist and ppa is not NULL fill ppa array + * with these pages locked SHARED. If necessary reclaim pages from + * freelist. Return 1 if contiguous pages exist and 0 otherwise. + * + * If we fail to lock pages still return 1 if pages exist and contiguous. + * But in this case return value is just a hint. ppa array won't be filled. + * Caller should initialize ppa[0] as NULL to distinguish return value. + * + * Returns 0 if pages don't exist or not physically contiguous. + * + * This routine doesn't work for anonymous(swapfs) pages. 
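Several checks in page_exists_physcontig() below hinge on whether the starting page frame number is aligned to the large-page size; IS_P2ALIGNED(pfn, pages) is effectively the usual power-of-two test (x & (align - 1)) == 0. A tiny illustration, assuming 8K base pages and a 4MB large page (512 constituent pages):

/* Illustrative power-of-two alignment test; only valid for power-of-two sizes. */
#include <stdio.h>

#define ALIGNED(x, align)	(((x) & ((align) - 1)) == 0)

int
main(void)
{
	unsigned long pages = 512;	/* 4MB large page of 8K pages, assumed */

	/* pfn 1024 can start a large page; pfn 1025 cannot: prints "1 0". */
	printf("%d %d\n", ALIGNED(1024UL, pages), ALIGNED(1025UL, pages));
	return (0);
}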
+ */ +int +page_exists_physcontig(vnode_t *vp, u_offset_t off, uint_t szc, page_t *ppa[]) +{ + pgcnt_t pages; + pfn_t pfn; + page_t *rootpp; + pgcnt_t i; + pgcnt_t j; + u_offset_t save_off = off; + ulong_t index; + kmutex_t *phm; + page_t *pp; + uint_t pszc; + int loopcnt = 0; + + ASSERT(szc != 0); + ASSERT(vp != NULL); + ASSERT(!IS_SWAPFSVP(vp)); + ASSERT(vp != &kvp); + +again: + if (++loopcnt > 3) { + VM_STAT_ADD(page_exphcontg[0]); + return (0); + } + + index = PAGE_HASH_FUNC(vp, off); + phm = PAGE_HASH_MUTEX(index); + + mutex_enter(phm); + PAGE_HASH_SEARCH(index, pp, vp, off); + mutex_exit(phm); + + VM_STAT_ADD(page_exphcontg[1]); + + if (pp == NULL) { + VM_STAT_ADD(page_exphcontg[2]); + return (0); + } + + pages = page_get_pagecnt(szc); + rootpp = pp; + pfn = rootpp->p_pagenum; + + if ((pszc = pp->p_szc) >= szc && ppa != NULL) { + VM_STAT_ADD(page_exphcontg[3]); + if (!page_trylock(pp, SE_SHARED)) { + VM_STAT_ADD(page_exphcontg[4]); + return (1); + } + if (pp->p_szc != pszc || pp->p_vnode != vp || + pp->p_offset != off) { + VM_STAT_ADD(page_exphcontg[5]); + page_unlock(pp); + off = save_off; + goto again; + } + /* + * szc was non zero and vnode and offset matched after we + * locked the page it means it can't become free on us. + */ + ASSERT(!PP_ISFREE(pp)); + if (!IS_P2ALIGNED(pfn, pages)) { + page_unlock(pp); + return (0); + } + ppa[0] = pp; + pp++; + off += PAGESIZE; + pfn++; + for (i = 1; i < pages; i++, pp++, off += PAGESIZE, pfn++) { + if (!page_trylock(pp, SE_SHARED)) { + VM_STAT_ADD(page_exphcontg[6]); + pp--; + while (i-- > 0) { + page_unlock(pp); + pp--; + } + ppa[0] = NULL; + return (1); + } + if (pp->p_szc != pszc) { + VM_STAT_ADD(page_exphcontg[7]); + page_unlock(pp); + pp--; + while (i-- > 0) { + page_unlock(pp); + pp--; + } + ppa[0] = NULL; + off = save_off; + goto again; + } + /* + * szc the same as for previous already locked pages + * with right identity. Since this page had correct + * szc after we locked it can't get freed or destroyed + * and therefore must have the expected identity. + */ + ASSERT(!PP_ISFREE(pp)); + if (pp->p_vnode != vp || + pp->p_offset != off) { + panic("page_exists_physcontig: " + "large page identity doesn't match"); + } + ppa[i] = pp; + ASSERT(pp->p_pagenum == pfn); + } + VM_STAT_ADD(page_exphcontg[8]); + ppa[pages] = NULL; + return (1); + } else if (pszc >= szc) { + VM_STAT_ADD(page_exphcontg[9]); + if (!IS_P2ALIGNED(pfn, pages)) { + return (0); + } + return (1); + } + + if (!IS_P2ALIGNED(pfn, pages)) { + VM_STAT_ADD(page_exphcontg[10]); + return (0); + } + + if (page_numtomemseg_nolock(pfn) != + page_numtomemseg_nolock(pfn + pages - 1)) { + VM_STAT_ADD(page_exphcontg[11]); + return (0); + } + + /* + * We loop up 4 times across pages to promote page size. + * We're extra cautious to promote page size atomically with respect + * to everybody else. But we can probably optimize into 1 loop if + * this becomes an issue. 
+ */ + + for (i = 0; i < pages; i++, pp++, off += PAGESIZE, pfn++) { + ASSERT(pp->p_pagenum == pfn); + if (!page_trylock(pp, SE_EXCL)) { + VM_STAT_ADD(page_exphcontg[12]); + break; + } + if (pp->p_vnode != vp || + pp->p_offset != off) { + VM_STAT_ADD(page_exphcontg[13]); + page_unlock(pp); + break; + } + if (pp->p_szc >= szc) { + ASSERT(i == 0); + page_unlock(pp); + off = save_off; + goto again; + } + } + + if (i != pages) { + VM_STAT_ADD(page_exphcontg[14]); + --pp; + while (i-- > 0) { + page_unlock(pp); + --pp; + } + return (0); + } + + pp = rootpp; + for (i = 0; i < pages; i++, pp++) { + if (PP_ISFREE(pp)) { + VM_STAT_ADD(page_exphcontg[15]); + ASSERT(!PP_ISAGED(pp)); + ASSERT(pp->p_szc == 0); + if (!page_reclaim(pp, NULL)) { + break; + } + } else { + ASSERT(pp->p_szc < szc); + VM_STAT_ADD(page_exphcontg[16]); + (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); + } + } + if (i < pages) { + VM_STAT_ADD(page_exphcontg[17]); + /* + * page_reclaim failed because we were out of memory. + * drop the rest of the locks and return because this page + * must be already reallocated anyway. + */ + pp = rootpp; + for (j = 0; j < pages; j++, pp++) { + if (j != i) { + page_unlock(pp); + } + } + return (0); + } + + off = save_off; + pp = rootpp; + for (i = 0; i < pages; i++, pp++, off += PAGESIZE) { + ASSERT(PAGE_EXCL(pp)); + ASSERT(!PP_ISFREE(pp)); + ASSERT(!hat_page_is_mapped(pp)); + ASSERT(pp->p_vnode == vp); + ASSERT(pp->p_offset == off); + pp->p_szc = szc; + } + pp = rootpp; + for (i = 0; i < pages; i++, pp++) { + if (ppa == NULL) { + page_unlock(pp); + } else { + ppa[i] = pp; + page_downgrade(ppa[i]); + } + } + if (ppa != NULL) { + ppa[pages] = NULL; + } + VM_STAT_ADD(page_exphcontg[18]); + ASSERT(vp->v_pages != NULL); + return (1); +} + +/* + * Determine whether a page with the specified [vp, off] + * currently exists in the system and if so return its + * size code. Obviously this should only be considered as + * a hint since nothing prevents the page from disappearing + * or appearing immediately after the return from this routine. + */ +int +page_exists_forreal(vnode_t *vp, u_offset_t off, uint_t *szc) +{ + page_t *pp; + kmutex_t *phm; + ulong_t index; + int rc = 0; + + ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); + ASSERT(szc != NULL); + VM_STAT_ADD(page_exists_forreal_cnt); + + index = PAGE_HASH_FUNC(vp, off); + phm = PAGE_HASH_MUTEX(index); + + mutex_enter(phm); + PAGE_HASH_SEARCH(index, pp, vp, off); + if (pp != NULL) { + *szc = pp->p_szc; + rc = 1; + } + mutex_exit(phm); + return (rc); +} + +/* wakeup threads waiting for pages in page_create_get_something() */ +void +wakeup_pcgs(void) +{ + if (!CV_HAS_WAITERS(&pcgs_cv)) + return; + cv_broadcast(&pcgs_cv); +} + +/* + * 'freemem' is used all over the kernel as an indication of how many + * pages are free (either on the cache list or on the free page list) + * in the system. In very few places is a really accurate 'freemem' + * needed. To avoid contention of the lock protecting a the + * single freemem, it was spread out into NCPU buckets. Set_freemem + * sets freemem to the total of all NCPU buckets. It is called from + * clock() on each TICK. + */ +void +set_freemem() +{ + struct pcf *p; + ulong_t t; + uint_t i; + + t = 0; + p = pcf; + for (i = 0; i < PCF_FANOUT; i++) { + t += p->pcf_count; + p++; + } + freemem = t; + + /* + * Don't worry about grabbing mutex. It's not that + * critical if we miss a tick or two. This is + * where we wakeup possible delayers in + * page_create_get_something(). 
+ */ + wakeup_pcgs(); +} + +ulong_t +get_freemem() +{ + struct pcf *p; + ulong_t t; + uint_t i; + + t = 0; + p = pcf; + for (i = 0; i < PCF_FANOUT; i++) { + t += p->pcf_count; + p++; + } + /* + * We just calculated it, might as well set it. + */ + freemem = t; + return (t); +} + +/* + * Acquire all of the page cache & free (pcf) locks. + */ +void +pcf_acquire_all() +{ + struct pcf *p; + uint_t i; + + p = pcf; + for (i = 0; i < PCF_FANOUT; i++) { + p->pcf_touch = 1; + mutex_enter(&p->pcf_lock); + p++; + } +} + +/* + * Release all the pcf_locks. + */ +void +pcf_release_all() +{ + struct pcf *p; + uint_t i; + + p = pcf; + for (i = 0; i < PCF_FANOUT; i++) { + mutex_exit(&p->pcf_lock); + p++; + } +} + +/* + * Inform the VM system that we need some pages freed up. + * Calls must be symmetric, e.g.: + * + * page_needfree(100); + * wait a bit; + * page_needfree(-100); + */ +void +page_needfree(spgcnt_t npages) +{ + mutex_enter(&new_freemem_lock); + needfree += npages; + mutex_exit(&new_freemem_lock); +} + +/* + * Throttle for page_create(): try to prevent freemem from dropping + * below throttlefree. We can't provide a 100% guarantee because + * KM_NOSLEEP allocations, page_reclaim(), and various other things + * nibble away at the freelist. However, we can block all PG_WAIT + * allocations until memory becomes available. The motivation is + * that several things can fall apart when there's no free memory: + * + * (1) If pageout() needs memory to push a page, the system deadlocks. + * + * (2) By (broken) specification, timeout(9F) can neither fail nor + * block, so it has no choice but to panic the system if it + * cannot allocate a callout structure. + * + * (3) Like timeout(), ddi_set_callback() cannot fail and cannot block; + * it panics if it cannot allocate a callback structure. + * + * (4) Untold numbers of third-party drivers have not yet been hardened + * against KM_NOSLEEP and/or allocb() failures; they simply assume + * success and panic the system with a data fault on failure. + * (The long-term solution to this particular problem is to ship + * hostile fault-injecting DEBUG kernels with the DDK.) + * + * It is theoretically impossible to guarantee success of non-blocking + * allocations, but in practice, this throttle is very hard to break. + */ +static int +page_create_throttle(pgcnt_t npages, int flags) +{ + ulong_t fm; + uint_t i; + pgcnt_t tf; /* effective value of throttlefree */ + + /* + * Never deny pages when: + * - it's a thread that cannot block [NOMEMWAIT()] + * - the allocation cannot block and must not fail + * - the allocation cannot block and is pageout dispensated + */ + if (NOMEMWAIT() || + ((flags & (PG_WAIT | PG_PANIC)) == PG_PANIC) || + ((flags & (PG_WAIT | PG_PUSHPAGE)) == PG_PUSHPAGE)) + return (1); + + /* + * If the allocation can't block, we look favorably upon it + * unless we're below pageout_reserve. In that case we fail + * the allocation because we want to make sure there are a few + * pages available for pageout. + */ + if ((flags & PG_WAIT) == 0) + return (freemem >= npages + pageout_reserve); + + /* Calculate the effective throttlefree value */ + tf = throttlefree - + ((flags & PG_PUSHPAGE) ? 
pageout_reserve : 0); + + cv_signal(&proc_pageout->p_cv); + + while (freemem < npages + tf) { + pcf_acquire_all(); + mutex_enter(&new_freemem_lock); + fm = 0; + for (i = 0; i < PCF_FANOUT; i++) { + fm += pcf[i].pcf_count; + pcf[i].pcf_wait++; + mutex_exit(&pcf[i].pcf_lock); + } + freemem = fm; + needfree += npages; + freemem_wait++; + cv_wait(&freemem_cv, &new_freemem_lock); + freemem_wait--; + needfree -= npages; + mutex_exit(&new_freemem_lock); + } + return (1); +} + +/* + * page_create_wait() is called to either coalecse pages from the + * different pcf buckets or to wait because there simply are not + * enough pages to satisfy the caller's request. + * + * Sadly, this is called from platform/vm/vm_machdep.c + */ +int +page_create_wait(size_t npages, uint_t flags) +{ + pgcnt_t total; + uint_t i; + struct pcf *p; + + /* + * Wait until there are enough free pages to satisfy our + * entire request. + * We set needfree += npages before prodding pageout, to make sure + * it does real work when npages > lotsfree > freemem. + */ + VM_STAT_ADD(page_create_not_enough); + + ASSERT(!kcage_on ? !(flags & PG_NORELOC) : 1); +checkagain: + if ((flags & PG_NORELOC) && + kcage_freemem < kcage_throttlefree + npages) + (void) kcage_create_throttle(npages, flags); + + if (freemem < npages + throttlefree) + if (!page_create_throttle(npages, flags)) + return (0); + + /* + * Since page_create_va() looked at every + * bucket, assume we are going to have to wait. + * Get all of the pcf locks. + */ + total = 0; + p = pcf; + for (i = 0; i < PCF_FANOUT; i++) { + p->pcf_touch = 1; + mutex_enter(&p->pcf_lock); + total += p->pcf_count; + if (total >= npages) { + /* + * Wow! There are enough pages laying around + * to satisfy the request. Do the accounting, + * drop the locks we acquired, and go back. + * + * freemem is not protected by any lock. So, + * we cannot have any assertion containing + * freemem. + */ + freemem -= npages; + + while (p >= pcf) { + if (p->pcf_count <= npages) { + npages -= p->pcf_count; + p->pcf_count = 0; + } else { + p->pcf_count -= (uint_t)npages; + npages = 0; + } + mutex_exit(&p->pcf_lock); + p--; + } + ASSERT(npages == 0); + return (1); + } + p++; + } + + /* + * All of the pcf locks are held, there are not enough pages + * to satisfy the request (npages < total). + * Be sure to acquire the new_freemem_lock before dropping + * the pcf locks. This prevents dropping wakeups in page_free(). + * The order is always pcf_lock then new_freemem_lock. + * + * Since we hold all the pcf locks, it is a good time to set freemem. + * + * If the caller does not want to wait, return now. + * Else turn the pageout daemon loose to find something + * and wait till it does. + * + */ + freemem = total; + + if ((flags & PG_WAIT) == 0) { + pcf_release_all(); + + TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_NOMEM, + "page_create_nomem:npages %ld freemem %ld", npages, freemem); + return (0); + } + + ASSERT(proc_pageout != NULL); + cv_signal(&proc_pageout->p_cv); + + TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_START, + "page_create_sleep_start: freemem %ld needfree %ld", + freemem, needfree); + + /* + * We are going to wait. + * We currently hold all of the pcf_locks, + * get the new_freemem_lock (it protects freemem_wait), + * before dropping the pcf_locks. 
+ */ + mutex_enter(&new_freemem_lock); + + p = pcf; + for (i = 0; i < PCF_FANOUT; i++) { + p->pcf_wait++; + mutex_exit(&p->pcf_lock); + p++; + } + + needfree += npages; + freemem_wait++; + + cv_wait(&freemem_cv, &new_freemem_lock); + + freemem_wait--; + needfree -= npages; + + mutex_exit(&new_freemem_lock); + + TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_END, + "page_create_sleep_end: freemem %ld needfree %ld", + freemem, needfree); + + VM_STAT_ADD(page_create_not_enough_again); + goto checkagain; +} + +/* + * A routine to do the opposite of page_create_wait(). + */ +void +page_create_putback(spgcnt_t npages) +{ + struct pcf *p; + pgcnt_t lump; + uint_t *which; + + /* + * When a contiguous lump is broken up, we have to + * deal with lots of pages (min 64) so lets spread + * the wealth around. + */ + lump = roundup(npages, PCF_FANOUT) / PCF_FANOUT; + freemem += npages; + + for (p = pcf; (npages > 0) && (p < &pcf[PCF_FANOUT]); p++) { + which = &p->pcf_count; + + mutex_enter(&p->pcf_lock); + + if (p->pcf_block) { + which = &p->pcf_reserve; + } + + if (lump < npages) { + *which += (uint_t)lump; + npages -= lump; + } else { + *which += (uint_t)npages; + npages = 0; + } + + if (p->pcf_wait) { + mutex_enter(&new_freemem_lock); + /* + * Check to see if some other thread + * is actually waiting. Another bucket + * may have woken it up by now. If there + * are no waiters, then set our pcf_wait + * count to zero to avoid coming in here + * next time. + */ + if (freemem_wait) { + if (npages > 1) { + cv_broadcast(&freemem_cv); + } else { + cv_signal(&freemem_cv); + } + p->pcf_wait--; + } else { + p->pcf_wait = 0; + } + mutex_exit(&new_freemem_lock); + } + mutex_exit(&p->pcf_lock); + } + ASSERT(npages == 0); +} + +/* + * A helper routine for page_create_get_something. + * The indenting got to deep down there. + * Unblock the pcf counters. Any pages freed after + * pcf_block got set are moved to pcf_count and + * wakeups (cv_broadcast() or cv_signal()) are done as needed. + */ +static void +pcgs_unblock(void) +{ + int i; + struct pcf *p; + + /* Update freemem while we're here. */ + freemem = 0; + p = pcf; + for (i = 0; i < PCF_FANOUT; i++) { + mutex_enter(&p->pcf_lock); + ASSERT(p->pcf_count == 0); + p->pcf_count = p->pcf_reserve; + p->pcf_block = 0; + freemem += p->pcf_count; + if (p->pcf_wait) { + mutex_enter(&new_freemem_lock); + if (freemem_wait) { + if (p->pcf_reserve > 1) { + cv_broadcast(&freemem_cv); + p->pcf_wait = 0; + } else { + cv_signal(&freemem_cv); + p->pcf_wait--; + } + } else { + p->pcf_wait = 0; + } + mutex_exit(&new_freemem_lock); + } + p->pcf_reserve = 0; + mutex_exit(&p->pcf_lock); + p++; + } +} + +/* + * Called from page_create_va() when both the cache and free lists + * have been checked once. + * + * Either returns a page or panics since the accounting was done + * way before we got here. + * + * We don't come here often, so leave the accounting on permanently. 
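The admission policy of page_create_throttle() earlier in this file reduces to three cases: critical requests are never denied, non-blocking requests succeed only while pageout's reserve is intact, and blocking requests are held until freemem covers the request plus an effective throttlefree threshold (reduced by pageout_reserve for PG_PUSHPAGE callers). The restatement below mirrors only that decision structure; it omits the NOMEMWAIT() check and the pageout wakeup/sleep loop, and every flag value and constant in main() is made up for the example.

/* Stand-alone restatement of the page_create_throttle() policy; not kernel code. */
#include <stdio.h>

#define F_WAIT		0x1	/* caller may block   (PG_WAIT)     */
#define F_PANIC		0x2	/* must not fail      (PG_PANIC)    */
#define F_PUSHPAGE	0x4	/* pageout push       (PG_PUSHPAGE) */

static int
throttle(unsigned long freemem, unsigned long npages, int flags,
    unsigned long throttlefree, unsigned long pageout_reserve)
{
	unsigned long tf;

	/* Critical allocations are never throttled. */
	if ((flags & (F_WAIT | F_PANIC)) == F_PANIC ||
	    (flags & (F_WAIT | F_PUSHPAGE)) == F_PUSHPAGE)
		return (1);

	/* Non-blocking: succeed only while pageout's reserve is intact. */
	if ((flags & F_WAIT) == 0)
		return (freemem >= npages + pageout_reserve);

	/* Blocking: pushers may dip below throttlefree by the reserve. */
	tf = throttlefree - ((flags & F_PUSHPAGE) ? pageout_reserve : 0);
	return (freemem >= npages + tf);	/* else the kernel would sleep */
}

int
main(void)
{
	/* 1000 free pages, throttlefree 800, reserve 200 (all assumed). */
	printf("nowait, 900 pages: %d\n", throttle(1000, 900, 0, 800, 200));
	printf("wait,   100 pages: %d\n", throttle(1000, 100, F_WAIT, 800, 200));
	return (0);
}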
+ */ + +#define MAX_PCGS 100 + +#ifdef DEBUG +#define PCGS_TRIES 100 +#else /* DEBUG */ +#define PCGS_TRIES 10 +#endif /* DEBUG */ + +#ifdef VM_STATS +uint_t pcgs_counts[PCGS_TRIES]; +uint_t pcgs_too_many; +uint_t pcgs_entered; +uint_t pcgs_entered_noreloc; +uint_t pcgs_locked; +uint_t pcgs_cagelocked; +#endif /* VM_STATS */ + +static page_t * +page_create_get_something(vnode_t *vp, u_offset_t off, struct seg *seg, + caddr_t vaddr, uint_t flags) +{ + uint_t count; + page_t *pp; + uint_t locked, i; + struct pcf *p; + lgrp_t *lgrp; + int cagelocked = 0; + + VM_STAT_ADD(pcgs_entered); + + /* + * Tap any reserve freelists: if we fail now, we'll die + * since the page(s) we're looking for have already been + * accounted for. + */ + flags |= PG_PANIC; + + if ((flags & PG_NORELOC) != 0) { + VM_STAT_ADD(pcgs_entered_noreloc); + /* + * Requests for free pages from critical threads + * such as pageout still won't throttle here, but + * we must try again, to give the cageout thread + * another chance to catch up. Since we already + * accounted for the pages, we had better get them + * this time. + * + * N.B. All non-critical threads acquire the pcgs_cagelock + * to serialize access to the freelists. This implements a + * turnstile-type synchornization to avoid starvation of + * critical requests for PG_NORELOC memory by non-critical + * threads: all non-critical threads must acquire a 'ticket' + * before passing through, which entails making sure + * kcage_freemem won't fall below minfree prior to grabbing + * pages from the freelists. + */ + if (kcage_create_throttle(1, flags) == KCT_NONCRIT) { + mutex_enter(&pcgs_cagelock); + cagelocked = 1; + VM_STAT_ADD(pcgs_cagelocked); + } + } + + /* + * Time to get serious. + * We failed to get a `correctly colored' page from both the + * free and cache lists. + * We escalate in stage. + * + * First try both lists without worring about color. + * + * Then, grab all page accounting locks (ie. pcf[]) and + * steal any pages that they have and set the pcf_block flag to + * stop deletions from the lists. This will help because + * a page can get added to the free list while we are looking + * at the cache list, then another page could be added to the cache + * list allowing the page on the free list to be removed as we + * move from looking at the cache list to the free list. This + * could happen over and over. We would never find the page + * we have accounted for. + * + * Noreloc pages are a subset of the global (relocatable) page pool. + * They are not tracked separately in the pcf bins, so it is + * impossible to know when doing pcf accounting if the available + * page(s) are noreloc pages or not. When looking for a noreloc page + * it is quite easy to end up here even if the global (relocatable) + * page pool has plenty of free pages but the noreloc pool is empty. + * + * When the noreloc pool is empty (or low), additional noreloc pages + * are created by converting pages from the global page pool. This + * process will stall during pcf accounting if the pcf bins are + * already locked. Such is the case when a noreloc allocation is + * looping here in page_create_get_something waiting for more noreloc + * pages to appear. + * + * Short of adding a new field to the pcf bins to accurately track + * the number of free noreloc pages, we instead do not grab the + * pcgs_lock, do not set the pcf blocks and do not timeout when + * allocating a noreloc page. This allows noreloc allocations to + * loop without blocking global page pool allocations. 
+ * + * NOTE: the behaviour of page_create_get_something has not changed + * for the case of global page pool allocations. + */ + + flags &= ~PG_MATCH_COLOR; + locked = 0; +#ifndef __sparc + /* + * page_create_get_something may be called because 4g memory may be + * depleted. Set flags to allow for relocation of base page below + * 4g if necessary. + */ + if (physmax4g) + flags |= (PGI_PGCPSZC0 | PGI_PGCPHIPRI); +#endif + + lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE); + + for (count = 0; kcage_on || count < MAX_PCGS; count++) { + pp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE, + flags, lgrp); + if (pp == NULL) { + pp = page_get_cachelist(vp, off, seg, vaddr, + flags, lgrp); + } + if (pp == NULL) { + /* + * Serialize. Don't fight with other pcgs(). + */ + if (!locked && (!kcage_on || !(flags & PG_NORELOC))) { + mutex_enter(&pcgs_lock); + VM_STAT_ADD(pcgs_locked); + locked = 1; + p = pcf; + for (i = 0; i < PCF_FANOUT; i++) { + mutex_enter(&p->pcf_lock); + ASSERT(p->pcf_block == 0); + p->pcf_block = 1; + p->pcf_reserve = p->pcf_count; + p->pcf_count = 0; + mutex_exit(&p->pcf_lock); + p++; + } + freemem = 0; + } + + if (count) { + /* + * Since page_free() puts pages on + * a list then accounts for it, we + * just have to wait for page_free() + * to unlock any page it was working + * with. The page_lock()-page_reclaim() + * path falls in the same boat. + * + * We don't need to check on the + * PG_WAIT flag, we have already + * accounted for the page we are + * looking for in page_create_va(). + * + * We just wait a moment to let any + * locked pages on the lists free up, + * then continue around and try again. + * + * Will be awakened by set_freemem(). + */ + mutex_enter(&pcgs_wait_lock); + cv_wait(&pcgs_cv, &pcgs_wait_lock); + mutex_exit(&pcgs_wait_lock); + } + } else { +#ifdef VM_STATS + if (count >= PCGS_TRIES) { + VM_STAT_ADD(pcgs_too_many); + } else { + VM_STAT_ADD(pcgs_counts[count]); + } +#endif + if (locked) { + pcgs_unblock(); + mutex_exit(&pcgs_lock); + } + if (cagelocked) + mutex_exit(&pcgs_cagelock); + return (pp); + } + } + /* + * we go down holding the pcf locks. + */ + panic("no %spage found %d", + ((flags & PG_NORELOC) ? "non-reloc " : ""), count); + /*NOTREACHED*/ +} + +/* + * Create enough pages for "bytes" worth of data starting at + * "off" in "vp". + * + * Where flag must be one of: + * + * PG_EXCL: Exclusive create (fail if any page already + * exists in the page cache) which does not + * wait for memory to become available. + * + * PG_WAIT: Non-exclusive create which can wait for + * memory to become available. + * + * PG_PHYSCONTIG: Allocate physically contiguous pages. + * (Not Supported) + * + * A doubly linked list of pages is returned to the caller. Each page + * on the list has the "exclusive" (p_selock) lock and "iolock" (p_iolock) + * lock. + * + * Unable to change the parameters to page_create() in a minor release, + * we renamed page_create() to page_create_va(), changed all known calls + * from page_create() to page_create_va(), and created this wrapper. + * + * Upon a major release, we should break compatibility by deleting this + * wrapper, and replacing all the strings "page_create_va", with "page_create". + * + * NOTE: There is a copy of this interface as page_create_io() in + * i86/vm/vm_machdep.c. Any bugs fixed here should be applied + * there. 
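When the list searches above come up empty, page_create_get_something() freezes every bucket (pcf_block), stealing the current counts into pcf_reserve so that pages freed meanwhile accumulate there, and pcgs_unblock() later folds the reserve back into pcf_count and redoes the wakeups. A single-threaded toy model of that freeze-and-thaw accounting, with invented names:

/* Toy model of the pcf_block/pcf_reserve freeze-and-thaw accounting. */
#include <stdio.h>

#define FANOUT	4

struct bucket {
	unsigned int	count;		/* pages visible to normal allocation */
	unsigned int	reserve;	/* pages parked while the bucket is frozen */
	int		blocked;
};

static struct bucket b[FANOUT];

static void
freeze_all(void)		/* like setting pcf_block in each bin */
{
	int i;

	for (i = 0; i < FANOUT; i++) {
		b[i].blocked = 1;
		b[i].reserve = b[i].count;	/* steal what is there */
		b[i].count = 0;
	}
}

static void
free_page(int idx)		/* a free during the frozen window feeds the reserve */
{
	if (b[idx].blocked)
		b[idx].reserve++;
	else
		b[idx].count++;
}

static unsigned int
thaw_all(void)			/* like pcgs_unblock(): reserve -> count */
{
	unsigned int total = 0;
	int i;

	for (i = 0; i < FANOUT; i++) {
		b[i].count = b[i].reserve;
		b[i].reserve = 0;
		b[i].blocked = 0;
		total += b[i].count;
	}
	return (total);
}

int
main(void)
{
	b[0].count = 3;
	freeze_all();
	free_page(1);		/* freed while everything was frozen */
	printf("after thaw: %u pages\n", thaw_all());	/* 3 + 1 = 4 */
	return (0);
}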
+ */ +page_t * +page_create(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags) +{ + caddr_t random_vaddr; + struct seg kseg; + +#ifdef DEBUG + cmn_err(CE_WARN, "Using deprecated interface page_create: caller %p", + (void *)caller()); +#endif + + random_vaddr = (caddr_t)(((uintptr_t)vp >> 7) ^ + (uintptr_t)(off >> PAGESHIFT)); + kseg.s_as = &kas; + + return (page_create_va(vp, off, bytes, flags, &kseg, random_vaddr)); +} + +#ifdef DEBUG +uint32_t pg_alloc_pgs_mtbf = 0; +#endif + +/* + * Used for large page support. It will attempt to allocate + * a large page(s) off the freelist. + * + * Returns non zero on failure. + */ +int +page_alloc_pages(struct seg *seg, caddr_t addr, page_t **basepp, + page_t *ppa[], uint_t szc, int anypgsz) +{ + pgcnt_t npgs, curnpgs, totpgs; + size_t pgsz; + page_t *pplist = NULL, *pp; + int err = 0; + lgrp_t *lgrp; + + ASSERT(szc != 0 && szc <= (page_num_pagesizes() - 1)); + + VM_STAT_ADD(alloc_pages[0]); + +#ifdef DEBUG + if (pg_alloc_pgs_mtbf && !(gethrtime() % pg_alloc_pgs_mtbf)) { + return (ENOMEM); + } +#endif + + pgsz = page_get_pagesize(szc); + totpgs = curnpgs = npgs = pgsz >> PAGESHIFT; + + ASSERT(((uintptr_t)addr & (pgsz - 1)) == 0); + /* + * One must be NULL but not both. + * And one must be non NULL but not both. + */ + ASSERT(basepp != NULL || ppa != NULL); + ASSERT(basepp == NULL || ppa == NULL); + + (void) page_create_wait(npgs, PG_WAIT); + + while (npgs && szc) { + lgrp = lgrp_mem_choose(seg, addr, pgsz); + pp = page_get_freelist(NULL, 0, seg, addr, pgsz, 0, lgrp); + if (pp != NULL) { + VM_STAT_ADD(alloc_pages[1]); + page_list_concat(&pplist, &pp); + ASSERT(npgs >= curnpgs); + npgs -= curnpgs; + } else if (anypgsz) { + VM_STAT_ADD(alloc_pages[2]); + szc--; + pgsz = page_get_pagesize(szc); + curnpgs = pgsz >> PAGESHIFT; + } else { + VM_STAT_ADD(alloc_pages[3]); + ASSERT(npgs == totpgs); + page_create_putback(npgs); + return (ENOMEM); + } + } + if (szc == 0) { + VM_STAT_ADD(alloc_pages[4]); + ASSERT(npgs != 0); + page_create_putback(npgs); + err = ENOMEM; + } else if (basepp != NULL) { + ASSERT(npgs == 0); + ASSERT(ppa == NULL); + *basepp = pplist; + } + + npgs = totpgs - npgs; + pp = pplist; + + /* + * Clear the free and age bits. Also if we were passed in a ppa then + * fill it in with all the constituent pages from the large page. But + * if we failed to allocate all the pages just free what we got. + */ + while (npgs != 0) { + ASSERT(PP_ISFREE(pp)); + ASSERT(PP_ISAGED(pp)); + if (ppa != NULL || err != 0) { + if (err == 0) { + VM_STAT_ADD(alloc_pages[5]); + PP_CLRFREE(pp); + PP_CLRAGED(pp); + page_sub(&pplist, pp); + *ppa++ = pp; + npgs--; + } else { + VM_STAT_ADD(alloc_pages[6]); + ASSERT(pp->p_szc != 0); + curnpgs = page_get_pagecnt(pp->p_szc); + page_list_break(&pp, &pplist, curnpgs); + page_list_add_pages(pp, 0); + page_create_putback(curnpgs); + ASSERT(npgs >= curnpgs); + npgs -= curnpgs; + } + pp = pplist; + } else { + VM_STAT_ADD(alloc_pages[7]); + PP_CLRFREE(pp); + PP_CLRAGED(pp); + pp = pp->p_next; + npgs--; + } + } + return (err); +} + +/* + * Get a single large page off of the freelists, and set it up for use. + * Number of bytes requested must be a supported page size. + * + * Note that this call may fail even if there is sufficient + * memory available or PG_WAIT is set, so the caller must + * be willing to fallback on page_create_va(), block and retry, + * or fail the requester. 
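The allocation loop in page_alloc_pages() above falls back through successively smaller supported page sizes (when anypgsz allows it) until the remaining demand is met or it runs out at size-code zero. The skeleton below keeps only that control flow; the page sizes, the 512K availability cutoff in the fake allocator, and the 8K base-page shift are all invented for the example.

/* Skeleton of the page_alloc_pages() size-fallback loop; not kernel code. */
#include <stdio.h>

static const unsigned long pgsz_by_szc[] = { 8192, 65536, 524288, 4194304 };
#define MAX_SZC		3
#define BASE_PGSHIFT	13	/* 8K base pages, assumed */

/* Pretend the freelists only have 512K and smaller chunks available. */
static int
fake_get_freelist(unsigned int szc)
{
	return (pgsz_by_szc[szc] <= 524288);
}

int
main(void)
{
	unsigned int szc = MAX_SZC;
	unsigned long npgs = pgsz_by_szc[MAX_SZC] >> BASE_PGSHIFT;	/* 512 */
	int anypgsz = 1;

	while (npgs != 0 && szc != 0) {
		unsigned long chunk = pgsz_by_szc[szc] >> BASE_PGSHIFT;

		if (fake_get_freelist(szc)) {
			npgs -= chunk;
			printf("got a %lu-page chunk at szc %u, %lu left\n",
			    chunk, szc, npgs);
		} else if (anypgsz) {
			szc--;		/* fall back to the next smaller size */
		} else {
			printf("would putback and return ENOMEM\n");
			return (1);
		}
	}
	printf(npgs == 0 ? "request satisfied\n" : "ran down to szc 0: ENOMEM\n");
	return (0);
}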
+ */ +page_t * +page_create_va_large(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags, + struct seg *seg, caddr_t vaddr, void *arg) +{ + pgcnt_t npages, pcftotal; + page_t *pp; + page_t *rootpp; + lgrp_t *lgrp; + uint_t enough; + uint_t pcf_index; + uint_t i; + struct pcf *p; + struct pcf *q; + lgrp_id_t *lgrpid = (lgrp_id_t *)arg; + + ASSERT(vp != NULL); + + ASSERT((flags & ~(PG_EXCL | PG_WAIT | + PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == 0); + /* but no others */ + + ASSERT((flags & PG_EXCL) == PG_EXCL); + + npages = btop(bytes); + + if (!kcage_on || panicstr) { + /* + * Cage is OFF, or we are single threaded in + * panic, so make everything a RELOC request. + */ + flags &= ~PG_NORELOC; + } + + /* + * Make sure there's adequate physical memory available. + * Note: PG_WAIT is ignored here. + */ + if (freemem <= throttlefree + npages) { + VM_STAT_ADD(page_create_large_cnt[1]); + return (NULL); + } + + /* + * If cage is on, dampen draw from cage when available + * cage space is low. + */ + if ((flags & (PG_NORELOC | PG_WAIT)) == (PG_NORELOC | PG_WAIT) && + kcage_freemem < kcage_throttlefree + npages) { + + /* + * The cage is on, the caller wants PG_NORELOC + * pages and available cage memory is very low. + * Call kcage_create_throttle() to attempt to + * control demand on the cage. + */ + if (kcage_create_throttle(npages, flags) == KCT_FAILURE) { + VM_STAT_ADD(page_create_large_cnt[2]); + return (NULL); + } + } + + enough = 0; + pcf_index = PCF_INDEX(); + p = &pcf[pcf_index]; + p->pcf_touch = 1; + q = &pcf[PCF_FANOUT]; + for (pcftotal = 0, i = 0; i < PCF_FANOUT; i++) { + if (p->pcf_count > npages) { + /* + * a good one to try. + */ + mutex_enter(&p->pcf_lock); + if (p->pcf_count > npages) { + p->pcf_count -= (uint_t)npages; + /* + * freemem is not protected by any lock. + * Thus, we cannot have any assertion + * containing freemem here. + */ + freemem -= npages; + enough = 1; + mutex_exit(&p->pcf_lock); + break; + } + mutex_exit(&p->pcf_lock); + } + pcftotal += p->pcf_count; + p++; + if (p >= q) { + p = pcf; + } + p->pcf_touch = 1; + } + + if (!enough) { + /* If there isn't enough memory available, give up. */ + if (pcftotal < npages) { + VM_STAT_ADD(page_create_large_cnt[3]); + return (NULL); + } + + /* try to collect pages from several pcf bins */ + for (p = pcf, pcftotal = 0, i = 0; i < PCF_FANOUT; i++) { + p->pcf_touch = 1; + mutex_enter(&p->pcf_lock); + pcftotal += p->pcf_count; + if (pcftotal >= npages) { + /* + * Wow! There are enough pages laying around + * to satisfy the request. Do the accounting, + * drop the locks we acquired, and go back. + * + * freemem is not protected by any lock. So, + * we cannot have any assertion containing + * freemem. + */ + pgcnt_t tpages = npages; + freemem -= npages; + while (p >= pcf) { + if (p->pcf_count <= tpages) { + tpages -= p->pcf_count; + p->pcf_count = 0; + } else { + p->pcf_count -= (uint_t)tpages; + tpages = 0; + } + mutex_exit(&p->pcf_lock); + p--; + } + ASSERT(tpages == 0); + break; + } + p++; + } + if (i == PCF_FANOUT) { + /* failed to collect pages - release the locks */ + while (--p >= pcf) { + mutex_exit(&p->pcf_lock); + } + VM_STAT_ADD(page_create_large_cnt[4]); + return (NULL); + } + } + + /* + * This is where this function behaves fundamentally differently + * than page_create_va(); since we're intending to map the page + * with a single TTE, we have to get it as a physically contiguous + * hardware pagesize chunk. If we can't, we fail. 
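+ *
+ * (Illustrative aside added by the editor: for a 4M request, for
+ * example, the page_get_freelist() call below must hand back one
+ * naturally aligned, physically contiguous 4M chunk; the request is
+ * never assembled from individual PAGESIZE pages the way
+ * page_create_va() builds its list.)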
+ */ + if (lgrpid != NULL && *lgrpid >= 0 && *lgrpid <= lgrp_alloc_max && + LGRP_EXISTS(lgrp_table[*lgrpid])) + lgrp = lgrp_table[*lgrpid]; + else + lgrp = lgrp_mem_choose(seg, vaddr, bytes); + + if ((rootpp = page_get_freelist(&kvp, off, seg, vaddr, + bytes, flags & ~PG_MATCH_COLOR, lgrp)) == NULL) { + page_create_putback(npages); + VM_STAT_ADD(page_create_large_cnt[5]); + return (NULL); + } + + /* + * if we got the page with the wrong mtype give it back this is a + * workaround for CR 6249718. When CR 6249718 is fixed we never get + * inside "if" and the workaround becomes just a nop + */ + if (kcage_on && (flags & PG_NORELOC) && !PP_ISNORELOC(rootpp)) { + page_list_add_pages(rootpp, 0); + page_create_putback(npages); + VM_STAT_ADD(page_create_large_cnt[6]); + return (NULL); + } + + /* + * If satisfying this request has left us with too little + * memory, start the wheels turning to get some back. The + * first clause of the test prevents waking up the pageout + * daemon in situations where it would decide that there's + * nothing to do. + */ + if (nscan < desscan && freemem < minfree) { + TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, + "pageout_cv_signal:freemem %ld", freemem); + cv_signal(&proc_pageout->p_cv); + } + + pp = rootpp; + while (npages--) { + ASSERT(PAGE_EXCL(pp)); + ASSERT(pp->p_vnode == NULL); + ASSERT(!hat_page_is_mapped(pp)); + PP_CLRFREE(pp); + PP_CLRAGED(pp); + if (!page_hashin(pp, vp, off, NULL)) + panic("page_create_large: hashin failed: page %p", + (void *)pp); + page_io_lock(pp); + off += PAGESIZE; + pp = pp->p_next; + } + + VM_STAT_ADD(page_create_large_cnt[0]); + return (rootpp); +} + +page_t * +page_create_va(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags, + struct seg *seg, caddr_t vaddr) +{ + page_t *plist = NULL; + pgcnt_t npages; + pgcnt_t found_on_free = 0; + pgcnt_t pages_req; + page_t *npp = NULL; + uint_t enough; + uint_t i; + uint_t pcf_index; + struct pcf *p; + struct pcf *q; + lgrp_t *lgrp; + + TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START, + "page_create_start:vp %p off %llx bytes %lu flags %x", + vp, off, bytes, flags); + + ASSERT(bytes != 0 && vp != NULL); + + if ((flags & PG_EXCL) == 0 && (flags & PG_WAIT) == 0) { + panic("page_create: invalid flags"); + /*NOTREACHED*/ + } + ASSERT((flags & ~(PG_EXCL | PG_WAIT | + PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == 0); + /* but no others */ + + pages_req = npages = btopr(bytes); + /* + * Try to see whether request is too large to *ever* be + * satisfied, in order to prevent deadlock. We arbitrarily + * decide to limit maximum size requests to max_page_get. + */ + if (npages >= max_page_get) { + if ((flags & PG_WAIT) == 0) { + TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_TOOBIG, + "page_create_toobig:vp %p off %llx npages " + "%lu max_page_get %lu", + vp, off, npages, max_page_get); + return (NULL); + } else { + cmn_err(CE_WARN, + "Request for too much kernel memory " + "(%lu bytes), will hang forever", bytes); + for (;;) + delay(1000000000); + } + } + + if (!kcage_on || panicstr) { + /* + * Cage is OFF, or we are single threaded in + * panic, so make everything a RELOC request. + */ + flags &= ~PG_NORELOC; + } + + if (freemem <= throttlefree + npages) + if (!page_create_throttle(npages, flags)) + return (NULL); + + /* + * If cage is on, dampen draw from cage when available + * cage space is low. + */ + if ((flags & PG_NORELOC) && + kcage_freemem < kcage_throttlefree + npages) { + + /* + * The cage is on, the caller wants PG_NORELOC + * pages and available cage memory is very low. 
+ * Call kcage_create_throttle() to attempt to + * control demand on the cage. + */ + if (kcage_create_throttle(npages, flags) == KCT_FAILURE) + return (NULL); + } + + VM_STAT_ADD(page_create_cnt[0]); + + enough = 0; + pcf_index = PCF_INDEX(); + + p = &pcf[pcf_index]; + p->pcf_touch = 1; + q = &pcf[PCF_FANOUT]; + for (i = 0; i < PCF_FANOUT; i++) { + if (p->pcf_count > npages) { + /* + * a good one to try. + */ + mutex_enter(&p->pcf_lock); + if (p->pcf_count > npages) { + p->pcf_count -= (uint_t)npages; + /* + * freemem is not protected by any lock. + * Thus, we cannot have any assertion + * containing freemem here. + */ + freemem -= npages; + enough = 1; + mutex_exit(&p->pcf_lock); + break; + } + mutex_exit(&p->pcf_lock); + } + p++; + if (p >= q) { + p = pcf; + } + p->pcf_touch = 1; + } + + if (!enough) { + /* + * Have to look harder. If npages is greater than + * one, then we might have to coalecse the counters. + * + * Go wait. We come back having accounted + * for the memory. + */ + VM_STAT_ADD(page_create_cnt[1]); + if (!page_create_wait(npages, flags)) { + VM_STAT_ADD(page_create_cnt[2]); + return (NULL); + } + } + + TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS, + "page_create_success:vp %p off %llx", vp, off); + + /* + * If satisfying this request has left us with too little + * memory, start the wheels turning to get some back. The + * first clause of the test prevents waking up the pageout + * daemon in situations where it would decide that there's + * nothing to do. + */ + if (nscan < desscan && freemem < minfree) { + TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, + "pageout_cv_signal:freemem %ld", freemem); + cv_signal(&proc_pageout->p_cv); + } + + /* + * Loop around collecting the requested number of pages. + * Most of the time, we have to `create' a new page. With + * this in mind, pull the page off the free list before + * getting the hash lock. This will minimize the hash + * lock hold time, nesting, and the like. If it turns + * out we don't need the page, we put it back at the end. + */ + while (npages--) { + page_t *pp; + kmutex_t *phm = NULL; + ulong_t index; + + index = PAGE_HASH_FUNC(vp, off); +top: + ASSERT(phm == NULL); + ASSERT(index == PAGE_HASH_FUNC(vp, off)); + ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); + + if (npp == NULL) { + /* + * Try to get a page from the freelist (ie, + * a page with no [vp, off] tag). If that + * fails, use the cachelist. + * + * During the first attempt at both the free + * and cache lists we try for the correct color. + */ + /* + * XXXX-how do we deal with virtual indexed + * caches and and colors? + */ + VM_STAT_ADD(page_create_cnt[4]); + /* + * Get lgroup to allocate next page of shared memory + * from and use it to specify where to allocate + * the physical memory + */ + lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE); + npp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE, + flags | PG_MATCH_COLOR, lgrp); + if (npp == NULL) { + npp = page_get_cachelist(vp, off, seg, + vaddr, flags | PG_MATCH_COLOR, lgrp); + if (npp == NULL) { + npp = page_create_get_something(vp, + off, seg, vaddr, + flags & ~PG_MATCH_COLOR); + } + + if (PP_ISAGED(npp) == 0) { + /* + * Since this page came from the + * cachelist, we must destroy the + * old vnode association. + */ + page_hashout(npp, NULL); + } + } + } + + /* + * We own this page! 
+ */ + ASSERT(PAGE_EXCL(npp)); + ASSERT(npp->p_vnode == NULL); + ASSERT(!hat_page_is_mapped(npp)); + PP_CLRFREE(npp); + PP_CLRAGED(npp); + + /* + * Here we have a page in our hot little mits and are + * just waiting to stuff it on the appropriate lists. + * Get the mutex and check to see if it really does + * not exist. + */ + phm = PAGE_HASH_MUTEX(index); + mutex_enter(phm); + PAGE_HASH_SEARCH(index, pp, vp, off); + if (pp == NULL) { + VM_STAT_ADD(page_create_new); + pp = npp; + npp = NULL; + if (!page_hashin(pp, vp, off, phm)) { + /* + * Since we hold the page hash mutex and + * just searched for this page, page_hashin + * had better not fail. If it does, that + * means somethread did not follow the + * page hash mutex rules. Panic now and + * get it over with. As usual, go down + * holding all the locks. + */ + ASSERT(MUTEX_HELD(phm)); + panic("page_create: " + "hashin failed %p %p %llx %p", + (void *)pp, (void *)vp, off, (void *)phm); + /*NOTREACHED*/ + } + ASSERT(MUTEX_HELD(phm)); + mutex_exit(phm); + phm = NULL; + + /* + * Hat layer locking need not be done to set + * the following bits since the page is not hashed + * and was on the free list (i.e., had no mappings). + * + * Set the reference bit to protect + * against immediate pageout + * + * XXXmh modify freelist code to set reference + * bit so we don't have to do it here. + */ + page_set_props(pp, P_REF); + found_on_free++; + } else { + VM_STAT_ADD(page_create_exists); + if (flags & PG_EXCL) { + /* + * Found an existing page, and the caller + * wanted all new pages. Undo all of the work + * we have done. + */ + mutex_exit(phm); + phm = NULL; + while (plist != NULL) { + pp = plist; + page_sub(&plist, pp); + page_io_unlock(pp); + /* large pages should not end up here */ + ASSERT(pp->p_szc == 0); + /*LINTED: constant in conditional ctx*/ + VN_DISPOSE(pp, B_INVAL, 0, kcred); + } + VM_STAT_ADD(page_create_found_one); + goto fail; + } + ASSERT(flags & PG_WAIT); + if (!page_lock(pp, SE_EXCL, phm, P_NO_RECLAIM)) { + /* + * Start all over again if we blocked trying + * to lock the page. + */ + mutex_exit(phm); + VM_STAT_ADD(page_create_page_lock_failed); + phm = NULL; + goto top; + } + mutex_exit(phm); + phm = NULL; + + if (PP_ISFREE(pp)) { + ASSERT(PP_ISAGED(pp) == 0); + VM_STAT_ADD(pagecnt.pc_get_cache); + page_list_sub(pp, PG_CACHE_LIST); + PP_CLRFREE(pp); + found_on_free++; + } + } + + /* + * Got a page! It is locked. Acquire the i/o + * lock since we are going to use the p_next and + * p_prev fields to link the requested pages together. + */ + page_io_lock(pp); + page_add(&plist, pp); + plist = plist->p_next; + off += PAGESIZE; + vaddr += PAGESIZE; + } + + ASSERT((flags & PG_EXCL) ? (found_on_free == pages_req) : 1); +fail: + if (npp != NULL) { + /* + * Did not need this page after all. + * Put it back on the free list. 
+ */ + VM_STAT_ADD(page_create_putbacks); + PP_SETFREE(npp); + PP_SETAGED(npp); + npp->p_offset = (u_offset_t)-1; + page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL); + page_unlock(npp); + + } + + ASSERT(pages_req >= found_on_free); + + { + uint_t overshoot = (uint_t)(pages_req - found_on_free); + + if (overshoot) { + VM_STAT_ADD(page_create_overshoot); + p = &pcf[pcf_index]; + p->pcf_touch = 1; + mutex_enter(&p->pcf_lock); + if (p->pcf_block) { + p->pcf_reserve += overshoot; + } else { + p->pcf_count += overshoot; + if (p->pcf_wait) { + mutex_enter(&new_freemem_lock); + if (freemem_wait) { + cv_signal(&freemem_cv); + p->pcf_wait--; + } else { + p->pcf_wait = 0; + } + mutex_exit(&new_freemem_lock); + } + } + mutex_exit(&p->pcf_lock); + /* freemem is approximate, so this test OK */ + if (!p->pcf_block) + freemem += overshoot; + } + } + + return (plist); +} + +/* + * One or more constituent pages of this large page has been marked + * toxic. Simply demote the large page to PAGESIZE pages and let + * page_free() handle it. This routine should only be called by + * large page free routines (page_free_pages() and page_destroy_pages(). + * All pages are locked SE_EXCL and have already been marked free. + */ +static void +page_free_toxic_pages(page_t *rootpp) +{ + page_t *tpp; + pgcnt_t i, pgcnt = page_get_pagecnt(rootpp->p_szc); + uint_t szc = rootpp->p_szc; + + for (i = 0, tpp = rootpp; i < pgcnt; i++, tpp = tpp->p_next) { + ASSERT(tpp->p_szc == szc); + ASSERT((PAGE_EXCL(tpp) && + !page_iolock_assert(tpp)) || panicstr); + tpp->p_szc = 0; + } + + while (rootpp != NULL) { + tpp = rootpp; + page_sub(&rootpp, tpp); + ASSERT(PP_ISFREE(tpp)); + PP_CLRFREE(tpp); + page_free(tpp, 1); + } +} + +/* + * Put page on the "free" list. + * The free list is really two lists maintained by + * the PSM of whatever machine we happen to be on. + */ +void +page_free(page_t *pp, int dontneed) +{ + struct pcf *p; + uint_t pcf_index; + + ASSERT((PAGE_EXCL(pp) && + !page_iolock_assert(pp)) || panicstr); + + if (page_deteriorating(pp)) { + volatile int i = 0; + char *kaddr; + volatile int rb, wb; + uint64_t pa; + volatile int ue = 0; + on_trap_data_t otd; + + if (pp->p_vnode != NULL) { + /* + * Let page_destroy() do its bean counting and + * hash out the page; it will then call back + * into page_free() with pp->p_vnode == NULL. + */ + page_destroy(pp, 0); + return; + } + + if (page_isfailing(pp)) { + /* + * If we have already exceeded the limit for + * pages retired, we will treat this page as + * 'toxic' rather than failing. That will ensure + * that the page is at least cleaned, and if + * a UE is detected, the page will be retired + * anyway. + */ + if (pages_retired_limit_exceeded()) { + /* + * clear the flag and reset to toxic + */ + page_clrtoxic(pp); + page_settoxic(pp, PAGE_IS_TOXIC); + } else { + pa = ptob((uint64_t)page_pptonum(pp)); + if (page_retire_messages) { + cmn_err(CE_NOTE, "Page 0x%08x.%08x " + "removed from service", + (uint32_t)(pa >> 32), (uint32_t)pa); + } + goto page_failed; + } + } + + pagescrub(pp, 0, PAGESIZE); + + /* + * We want to determine whether the error that occurred on + * this page is transient or persistent, so we get a mapping + * to the page and try every possible bit pattern to compare + * what we write with what we read back. A smaller number + * of bit patterns might suffice, but there's no point in + * getting fancy. If this is the hot path on your system, + * you've got bigger problems. 
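+ *
+ * (Editor's summary of the loop below, for readability: for each test
+ * pattern wb from 0xff down to 0x00, fill the page with wb, push it to
+ * memory with sync_data_memory(), and read it back under on_trap()
+ * protection; a UE trap or a read-back mismatch marks the page
+ * PAGE_IS_FAILING, otherwise the earlier error is treated as transient
+ * and the page is cleared and returned to service.)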
+ */ + kaddr = ppmapin(pp, PROT_READ | PROT_WRITE, (caddr_t)-1); + for (wb = 0xff; wb >= 0; wb--) { + if (on_trap(&otd, OT_DATA_EC)) { + pa = ptob((uint64_t)page_pptonum(pp)) + i; + page_settoxic(pp, PAGE_IS_FAILING); + + if (page_retire_messages) { + cmn_err(CE_WARN, "Uncorrectable Error " + "occurred at PA 0x%08x.%08x while " + "attempting to clear previously " + "reported error; page removed from " + "service", (uint32_t)(pa >> 32), + (uint32_t)pa); + } + + ue++; + break; + } + + /* + * Write out the bit pattern, flush it to memory, and + * read it back while under on_trap() protection. + */ + for (i = 0; i < PAGESIZE; i++) + kaddr[i] = wb; + + sync_data_memory(kaddr, PAGESIZE); + + for (i = 0; i < PAGESIZE; i++) { + if ((rb = (uchar_t)kaddr[i]) != wb) { + page_settoxic(pp, PAGE_IS_FAILING); + goto out; + } + } + } +out: + no_trap(); + ppmapout(kaddr); + + if (wb >= 0 && !ue) { + pa = ptob((uint64_t)page_pptonum(pp)) + i; + if (page_retire_messages) { + cmn_err(CE_WARN, "Data Mismatch occurred at PA " + "0x%08x.%08x [ 0x%x != 0x%x ] while " + "attempting to clear previously reported " + "error; page removed from service", + (uint32_t)(pa >> 32), (uint32_t)pa, rb, wb); + } + } +page_failed: + /* + * DR operations change the association between a page_t + * and the physical page it represents. Check if the + * page is still bad. If it is, then retire it. + */ + if (page_isfaulty(pp) && page_isfailing(pp)) { + /* + * In the future, it might be useful to have a platform + * callback here to tell the hardware to fence off this + * page during the next reboot. + * + * We move the page to the retired_vnode here + */ + (void) page_hashin(pp, &retired_ppages, + (u_offset_t)ptob((uint64_t)page_pptonum(pp)), NULL); + mutex_enter(&freemem_lock); + availrmem--; + mutex_exit(&freemem_lock); + page_retired(pp); + page_downgrade(pp); + + /* + * If DR raced with the above page retirement code, + * we might have retired a good page. If so, unretire + * the page. + */ + if (!page_isfaulty(pp)) + page_unretire_pages(); + return; + } + + pa = ptob((uint64_t)page_pptonum(pp)); + + if (page_retire_messages) { + cmn_err(CE_NOTE, "Previously reported error on page " + "0x%08x.%08x cleared", (uint32_t)(pa >> 32), + (uint32_t)pa); + } + + page_clrtoxic(pp); + } + + if (PP_ISFREE(pp)) { + panic("page_free: page %p is free", (void *)pp); + } + + if (pp->p_szc != 0) { + if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) || + pp->p_vnode == &kvp) { + panic("page_free: anon or kernel " + "or no vnode large page %p", (void *)pp); + } + page_demote_vp_pages(pp); + ASSERT(pp->p_szc == 0); + } + + /* + * The page_struct_lock need not be acquired to examine these + * fields since the page has an "exclusive" lock. + */ + if (hat_page_is_mapped(pp) || pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { + panic("page_free pp=%p, pfn=%lx, lckcnt=%d, cowcnt=%d", + pp, page_pptonum(pp), pp->p_lckcnt, pp->p_cowcnt); + /*NOTREACHED*/ + } + + ASSERT(!hat_page_getshare(pp)); + + PP_SETFREE(pp); + ASSERT(pp->p_vnode == NULL || !IS_VMODSORT(pp->p_vnode) || + !hat_ismod(pp)); + page_clr_all_props(pp); + ASSERT(!hat_page_getshare(pp)); + + /* + * Now we add the page to the head of the free list. + * But if this page is associated with a paged vnode + * then we adjust the head forward so that the page is + * effectively at the end of the list. + */ + if (pp->p_vnode == NULL) { + /* + * Page has no identity, put it on the free list. 
+ */ + PP_SETAGED(pp); + pp->p_offset = (u_offset_t)-1; + page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); + VM_STAT_ADD(pagecnt.pc_free_free); + TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE, + "page_free_free:pp %p", pp); + } else { + PP_CLRAGED(pp); + + if (!dontneed || nopageage) { + /* move it to the tail of the list */ + page_list_add(pp, PG_CACHE_LIST | PG_LIST_TAIL); + + VM_STAT_ADD(pagecnt.pc_free_cache); + TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_TAIL, + "page_free_cache_tail:pp %p", pp); + } else { + page_list_add(pp, PG_CACHE_LIST | PG_LIST_HEAD); + + VM_STAT_ADD(pagecnt.pc_free_dontneed); + TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_HEAD, + "page_free_cache_head:pp %p", pp); + } + } + page_unlock(pp); + + /* + * Now do the `freemem' accounting. + */ + pcf_index = PCF_INDEX(); + p = &pcf[pcf_index]; + p->pcf_touch = 1; + + mutex_enter(&p->pcf_lock); + if (p->pcf_block) { + p->pcf_reserve += 1; + } else { + p->pcf_count += 1; + if (p->pcf_wait) { + mutex_enter(&new_freemem_lock); + /* + * Check to see if some other thread + * is actually waiting. Another bucket + * may have woken it up by now. If there + * are no waiters, then set our pcf_wait + * count to zero to avoid coming in here + * next time. Also, since only one page + * was put on the free list, just wake + * up one waiter. + */ + if (freemem_wait) { + cv_signal(&freemem_cv); + p->pcf_wait--; + } else { + p->pcf_wait = 0; + } + mutex_exit(&new_freemem_lock); + } + } + mutex_exit(&p->pcf_lock); + + /* freemem is approximate, so this test OK */ + if (!p->pcf_block) + freemem += 1; +} + +/* + * Put page on the "free" list during intial startup. + * This happens during initial single threaded execution. + */ +void +page_free_at_startup(page_t *pp) +{ + struct pcf *p; + uint_t pcf_index; + + page_list_add(pp, PG_FREE_LIST | PG_LIST_HEAD | PG_LIST_ISINIT); + VM_STAT_ADD(pagecnt.pc_free_free); + + /* + * Now do the `freemem' accounting. 
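+ *
+ * (Editor's note, illustrative: `freemem' is shadowed by the pcf[]
+ * fanout of small counters so that single page frees and reclaims do
+ * not all contend on one lock.  PCF_INDEX() picks a bucket, typically
+ * derived from the calling CPU, and the freed page is credited to that
+ * bucket's pcf_count; `freemem' itself is kept only approximately in
+ * step with the sum of the buckets, which is why the code nearby calls
+ * it approximate.)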
+ */ + pcf_index = PCF_INDEX(); + p = &pcf[pcf_index]; + p->pcf_touch = 1; + + ASSERT(p->pcf_block == 0); + ASSERT(p->pcf_wait == 0); + p->pcf_count += 1; + + /* freemem is approximate, so this is OK */ + freemem += 1; +} + +void +page_free_pages(page_t *pp) +{ + page_t *tpp, *rootpp = NULL; + pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc); + pgcnt_t i; + uint_t szc = pp->p_szc; + int toxic = 0; + + VM_STAT_ADD(pagecnt.pc_free_pages); + TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE, + "page_free_free:pp %p", pp); + + ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes()); + if ((page_pptonum(pp) & (pgcnt - 1)) != 0) { + panic("page_free_pages: not root page %p", (void *)pp); + /*NOTREACHED*/ + } + + for (i = 0, tpp = pp; i < pgcnt; i++, tpp = page_next(tpp)) { + ASSERT((PAGE_EXCL(tpp) && + !page_iolock_assert(tpp)) || panicstr); + if (PP_ISFREE(tpp)) { + panic("page_free_pages: page %p is free", (void *)tpp); + /*NOTREACHED*/ + } + if (hat_page_is_mapped(tpp) || tpp->p_lckcnt != 0 || + tpp->p_cowcnt != 0) { + panic("page_free_pages %p", (void *)tpp); + /*NOTREACHED*/ + } + + ASSERT(!hat_page_getshare(tpp)); + ASSERT(tpp->p_vnode == NULL); + ASSERT(tpp->p_szc == szc); + + if (page_deteriorating(tpp)) + toxic = 1; + + PP_SETFREE(tpp); + page_clr_all_props(tpp); + PP_SETAGED(tpp); + tpp->p_offset = (u_offset_t)-1; + ASSERT(tpp->p_next == tpp); + ASSERT(tpp->p_prev == tpp); + page_list_concat(&rootpp, &tpp); + } + ASSERT(rootpp == pp); + + if (toxic) { + page_free_toxic_pages(rootpp); + return; + } + page_list_add_pages(rootpp, 0); + page_create_putback(pgcnt); +} + +int free_pages = 1; + +/* + * This routine attempts to return pages to the cachelist via page_release(). + * It does not *have* to be successful in all cases, since the pageout scanner + * will catch any pages it misses. It does need to be fast and not introduce + * too much overhead. + * + * If a page isn't found on the unlocked sweep of the page_hash bucket, we + * don't lock and retry. This is ok, since the page scanner will eventually + * find any page we miss in free_vp_pages(). + */ +void +free_vp_pages(vnode_t *vp, u_offset_t off, size_t len) +{ + page_t *pp; + u_offset_t eoff; + extern int swap_in_range(vnode_t *, u_offset_t, size_t); + + eoff = off + len; + + if (free_pages == 0) + return; + if (swap_in_range(vp, off, len)) + return; + + for (; off < eoff; off += PAGESIZE) { + + /* + * find the page using a fast, but inexact search. It'll be OK + * if a few pages slip through the cracks here. + */ + pp = page_exists(vp, off); + + /* + * If we didn't find the page (it may not exist), the page + * is free, looks still in use (shared), or we can't lock it, + * just give up. + */ + if (pp == NULL || + PP_ISFREE(pp) || + page_share_cnt(pp) > 0 || + !page_trylock(pp, SE_EXCL)) + continue; + + /* + * Once we have locked pp, verify that it's still the + * correct page and not already free + */ + ASSERT(PAGE_LOCKED_SE(pp, SE_EXCL)); + if (pp->p_vnode != vp || pp->p_offset != off || PP_ISFREE(pp)) { + page_unlock(pp); + continue; + } + + /* + * try to release the page... + */ + (void) page_release(pp, 1); + } +} + +/* + * Reclaim the given page from the free list. + * Returns 1 on success or 0 on failure. + * + * The page is unlocked if it can't be reclaimed (when freemem == 0). + * If `lock' is non-null, it will be dropped and re-acquired if + * the routine must wait while freemem is 0. + * + * As it turns out, boot_getpages() does this. 
It picks a page, + * based on where OBP mapped in some address, gets its pfn, searches + * the memsegs, locks the page, then pulls it off the free list! + */ +int +page_reclaim(page_t *pp, kmutex_t *lock) +{ + struct pcf *p; + uint_t pcf_index; + struct cpu *cpup; + int enough; + uint_t i; + + ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1); + ASSERT(PAGE_EXCL(pp) && PP_ISFREE(pp)); + ASSERT(pp->p_szc == 0); + + /* + * If `freemem' is 0, we cannot reclaim this page from the + * freelist, so release every lock we might hold: the page, + * and the `lock' before blocking. + * + * The only way `freemem' can become 0 while there are pages + * marked free (have their p->p_free bit set) is when the + * system is low on memory and doing a page_create(). In + * order to guarantee that once page_create() starts acquiring + * pages it will be able to get all that it needs since `freemem' + * was decreased by the requested amount. So, we need to release + * this page, and let page_create() have it. + * + * Since `freemem' being zero is not supposed to happen, just + * use the usual hash stuff as a starting point. If that bucket + * is empty, then assume the worst, and start at the beginning + * of the pcf array. If we always start at the beginning + * when acquiring more than one pcf lock, there won't be any + * deadlock problems. + */ + + /* TODO: Do we need to test kcage_freemem if PG_NORELOC(pp)? */ + + if (freemem <= throttlefree && !page_create_throttle(1l, 0)) { + pcf_acquire_all(); + goto page_reclaim_nomem; + } + + enough = 0; + pcf_index = PCF_INDEX(); + p = &pcf[pcf_index]; + p->pcf_touch = 1; + mutex_enter(&p->pcf_lock); + if (p->pcf_count >= 1) { + enough = 1; + p->pcf_count--; + } + mutex_exit(&p->pcf_lock); + + if (!enough) { + VM_STAT_ADD(page_reclaim_zero); + /* + * Check again. Its possible that some other thread + * could have been right behind us, and added one + * to a list somewhere. Acquire each of the pcf locks + * until we find a page. + */ + p = pcf; + for (i = 0; i < PCF_FANOUT; i++) { + p->pcf_touch = 1; + mutex_enter(&p->pcf_lock); + if (p->pcf_count >= 1) { + p->pcf_count -= 1; + enough = 1; + break; + } + p++; + } + + if (!enough) { +page_reclaim_nomem: + /* + * We really can't have page `pp'. + * Time for the no-memory dance with + * page_free(). This is just like + * page_create_wait(). Plus the added + * attraction of releasing whatever mutex + * we held when we were called with in `lock'. + * Page_unlock() will wakeup any thread + * waiting around for this page. + */ + if (lock) { + VM_STAT_ADD(page_reclaim_zero_locked); + mutex_exit(lock); + } + page_unlock(pp); + + /* + * get this before we drop all the pcf locks. + */ + mutex_enter(&new_freemem_lock); + + p = pcf; + for (i = 0; i < PCF_FANOUT; i++) { + p->pcf_wait++; + mutex_exit(&p->pcf_lock); + p++; + } + + freemem_wait++; + cv_wait(&freemem_cv, &new_freemem_lock); + freemem_wait--; + + mutex_exit(&new_freemem_lock); + + if (lock) { + mutex_enter(lock); + } + return (0); + } + + /* + * There was a page to be found. + * The pcf accounting has been done, + * though none of the pcf_wait flags have been set, + * drop the locks and continue on. + */ + while (p >= pcf) { + mutex_exit(&p->pcf_lock); + p--; + } + } + + /* + * freemem is not protected by any lock. Thus, we cannot + * have any assertion containing freemem here. 
+ */ + freemem -= 1; + + VM_STAT_ADD(pagecnt.pc_reclaim); + if (PP_ISAGED(pp)) { + page_list_sub(pp, PG_FREE_LIST); + TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_FREE, + "page_reclaim_free:pp %p", pp); + } else { + page_list_sub(pp, PG_CACHE_LIST); + TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_CACHE, + "page_reclaim_cache:pp %p", pp); + } + + /* + * clear the p_free & p_age bits since this page is no longer + * on the free list. Notice that there was a brief time where + * a page is marked as free, but is not on the list. + * + * Set the reference bit to protect against immediate pageout. + */ + PP_CLRFREE(pp); + PP_CLRAGED(pp); + page_set_props(pp, P_REF); + + CPU_STATS_ENTER_K(); + cpup = CPU; /* get cpup now that CPU cannot change */ + CPU_STATS_ADDQ(cpup, vm, pgrec, 1); + CPU_STATS_ADDQ(cpup, vm, pgfrec, 1); + CPU_STATS_EXIT_K(); + + return (1); +} + + + +/* + * Destroy identity of the page and put it back on + * the page free list. Assumes that the caller has + * acquired the "exclusive" lock on the page. + */ +void +page_destroy(page_t *pp, int dontfree) +{ + ASSERT((PAGE_EXCL(pp) && + !page_iolock_assert(pp)) || panicstr); + + if (pp->p_szc != 0) { + if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) || + pp->p_vnode == &kvp) { + panic("page_destroy: anon or kernel or no vnode " + "large page %p", (void *)pp); + } + page_demote_vp_pages(pp); + ASSERT(pp->p_szc == 0); + } + + TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy:pp %p", pp); + + /* + * Unload translations, if any, then hash out the + * page to erase its identity. + */ + (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); + page_hashout(pp, NULL); + + if (!dontfree) { + /* + * Acquire the "freemem_lock" for availrmem. + * The page_struct_lock need not be acquired for lckcnt + * and cowcnt since the page has an "exclusive" lock. + */ + if ((pp->p_lckcnt != 0) || (pp->p_cowcnt != 0)) { + mutex_enter(&freemem_lock); + if (pp->p_lckcnt != 0) { + availrmem++; + pp->p_lckcnt = 0; + } + if (pp->p_cowcnt != 0) { + availrmem += pp->p_cowcnt; + pp->p_cowcnt = 0; + } + mutex_exit(&freemem_lock); + } + /* + * Put the page on the "free" list. 
+ */ + page_free(pp, 0); + } +} + +void +page_destroy_pages(page_t *pp) +{ + + page_t *tpp, *rootpp = NULL; + pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc); + pgcnt_t i, pglcks = 0; + uint_t szc = pp->p_szc; + int toxic = 0; + + ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes()); + + VM_STAT_ADD(pagecnt.pc_destroy_pages); + + TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy_pages:pp %p", pp); + + if ((page_pptonum(pp) & (pgcnt - 1)) != 0) { + panic("page_destroy_pages: not root page %p", (void *)pp); + /*NOTREACHED*/ + } + + for (i = 0, tpp = pp; i < pgcnt; i++, tpp = page_next(tpp)) { + ASSERT((PAGE_EXCL(tpp) && + !page_iolock_assert(tpp)) || panicstr); + (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD); + page_hashout(tpp, NULL); + ASSERT(tpp->p_offset == (u_offset_t)-1); + if (tpp->p_lckcnt != 0) { + pglcks++; + tpp->p_lckcnt = 0; + } else if (tpp->p_cowcnt != 0) { + pglcks += tpp->p_cowcnt; + tpp->p_cowcnt = 0; + } + ASSERT(!hat_page_getshare(tpp)); + ASSERT(tpp->p_vnode == NULL); + ASSERT(tpp->p_szc == szc); + + if (page_deteriorating(tpp)) + toxic = 1; + + PP_SETFREE(tpp); + page_clr_all_props(tpp); + PP_SETAGED(tpp); + ASSERT(tpp->p_next == tpp); + ASSERT(tpp->p_prev == tpp); + page_list_concat(&rootpp, &tpp); + } + + ASSERT(rootpp == pp); + if (pglcks != 0) { + mutex_enter(&freemem_lock); + availrmem += pglcks; + mutex_exit(&freemem_lock); + } + + if (toxic) { + page_free_toxic_pages(rootpp); + return; + } + page_list_add_pages(rootpp, 0); + page_create_putback(pgcnt); +} + +/* + * Similar to page_destroy(), but destroys pages which are + * locked and known to be on the page free list. Since + * the page is known to be free and locked, no one can access + * it. + * + * Also, the number of free pages does not change. + */ +void +page_destroy_free(page_t *pp) +{ + ASSERT(PAGE_EXCL(pp)); + ASSERT(PP_ISFREE(pp)); + ASSERT(pp->p_vnode); + ASSERT(hat_page_getattr(pp, P_MOD | P_REF | P_RO) == 0); + ASSERT(!hat_page_is_mapped(pp)); + ASSERT(PP_ISAGED(pp) == 0); + ASSERT(pp->p_szc == 0); + + VM_STAT_ADD(pagecnt.pc_destroy_free); + page_list_sub(pp, PG_CACHE_LIST); + + page_hashout(pp, NULL); + ASSERT(pp->p_vnode == NULL); + ASSERT(pp->p_offset == (u_offset_t)-1); + ASSERT(pp->p_hash == NULL); + + PP_SETAGED(pp); + page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); + page_unlock(pp); + + mutex_enter(&new_freemem_lock); + if (freemem_wait) { + cv_signal(&freemem_cv); + } + mutex_exit(&new_freemem_lock); +} + +/* + * Rename the page "opp" to have an identity specified + * by [vp, off]. If a page already exists with this name + * it is locked and destroyed. Note that the page's + * translations are not unloaded during the rename. + * + * This routine is used by the anon layer to "steal" the + * original page and is not unlike destroying a page and + * creating a new page using the same page frame. + * + * XXX -- Could deadlock if caller 1 tries to rename A to B while + * caller 2 tries to rename B to A. + */ +void +page_rename(page_t *opp, vnode_t *vp, u_offset_t off) +{ + page_t *pp; + int olckcnt = 0; + int ocowcnt = 0; + kmutex_t *phm; + ulong_t index; + + ASSERT(PAGE_EXCL(opp) && !page_iolock_assert(opp)); + ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); + ASSERT(PP_ISFREE(opp) == 0); + + VM_STAT_ADD(page_rename_count); + + TRACE_3(TR_FAC_VM, TR_PAGE_RENAME, + "page rename:pp %p vp %p off %llx", opp, vp, off); + + page_hashout(opp, NULL); + PP_CLRAGED(opp); + + /* + * Acquire the appropriate page hash lock, since + * we're going to rename the page. 
+ */ + index = PAGE_HASH_FUNC(vp, off); + phm = PAGE_HASH_MUTEX(index); + mutex_enter(phm); +top: + /* + * Look for an existing page with this name and destroy it if found. + * By holding the page hash lock all the way to the page_hashin() + * call, we are assured that no page can be created with this + * identity. In the case when the phm lock is dropped to undo any + * hat layer mappings, the existing page is held with an "exclusive" + * lock, again preventing another page from being created with + * this identity. + */ + PAGE_HASH_SEARCH(index, pp, vp, off); + if (pp != NULL) { + VM_STAT_ADD(page_rename_exists); + + /* + * As it turns out, this is one of only two places where + * page_lock() needs to hold the passed in lock in the + * successful case. In all of the others, the lock could + * be dropped as soon as the attempt is made to lock + * the page. It is tempting to add yet another arguement, + * PL_KEEP or PL_DROP, to let page_lock know what to do. + */ + if (!page_lock(pp, SE_EXCL, phm, P_RECLAIM)) { + /* + * Went to sleep because the page could not + * be locked. We were woken up when the page + * was unlocked, or when the page was destroyed. + * In either case, `phm' was dropped while we + * slept. Hence we should not just roar through + * this loop. + */ + goto top; + } + + if (hat_page_is_mapped(pp)) { + /* + * Unload translations. Since we hold the + * exclusive lock on this page, the page + * can not be changed while we drop phm. + * This is also not a lock protocol violation, + * but rather the proper way to do things. + */ + mutex_exit(phm); + (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); + mutex_enter(phm); + } + page_hashout(pp, phm); + } + /* + * Hash in the page with the new identity. + */ + if (!page_hashin(opp, vp, off, phm)) { + /* + * We were holding phm while we searched for [vp, off] + * and only dropped phm if we found and locked a page. + * If we can't create this page now, then some thing + * is really broken. + */ + panic("page_rename: Can't hash in page: %p", (void *)pp); + /*NOTREACHED*/ + } + + ASSERT(MUTEX_HELD(phm)); + mutex_exit(phm); + + /* + * Now that we have dropped phm, lets get around to finishing up + * with pp. + */ + if (pp != NULL) { + ASSERT(!hat_page_is_mapped(pp)); + /* for now large pages should not end up here */ + ASSERT(pp->p_szc == 0); + /* + * Save the locks for transfer to the new page and then + * clear them so page_free doesn't think they're important. + * The page_struct_lock need not be acquired for lckcnt and + * cowcnt since the page has an "exclusive" lock. + */ + olckcnt = pp->p_lckcnt; + ocowcnt = pp->p_cowcnt; + pp->p_lckcnt = pp->p_cowcnt = 0; + + /* + * Put the page on the "free" list after we drop + * the lock. The less work under the lock the better. + */ + /*LINTED: constant in conditional context*/ + VN_DISPOSE(pp, B_FREE, 0, kcred); + } + + /* + * Transfer the lock count from the old page (if any). + * The page_struct_lock need not be acquired for lckcnt and + * cowcnt since the page has an "exclusive" lock. + */ + opp->p_lckcnt += olckcnt; + opp->p_cowcnt += ocowcnt; +} + +/* + * low level routine to add page `pp' to the hash and vp chains for [vp, offset] + * + * Pages are normally inserted at the start of a vnode's v_pages list. + * If the vnode is VMODSORT and the page is modified, it goes at the end. + * This can happen when a modified page is relocated for DR. + * + * Returns 1 on success and 0 on failure. 
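+ *
+ * (Editor's note, illustrative: v_pages is a circular, doubly linked
+ * list threaded through p_vpnext/p_vpprev, so the "end" of the list is
+ * reached as vp->v_pages->p_vpprev; that is why the VMODSORT case below
+ * hands &vp->v_pages->p_vpprev->p_vpnext to page_vpadd() instead of
+ * &vp->v_pages.)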
+ */ +static int +page_do_hashin(page_t *pp, vnode_t *vp, u_offset_t offset) +{ + page_t **listp; + page_t *tp; + ulong_t index; + + ASSERT(PAGE_EXCL(pp)); + ASSERT(vp != NULL); + ASSERT(MUTEX_HELD(page_vnode_mutex(vp))); + + /* + * Be sure to set these up before the page is inserted on the hash + * list. As soon as the page is placed on the list some other + * thread might get confused and wonder how this page could + * possibly hash to this list. + */ + pp->p_vnode = vp; + pp->p_offset = offset; + + /* + * record if this page is on a swap vnode + */ + if ((vp->v_flag & VISSWAP) != 0) + PP_SETSWAP(pp); + + index = PAGE_HASH_FUNC(vp, offset); + ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(index))); + listp = &page_hash[index]; + + /* + * If this page is already hashed in, fail this attempt to add it. + */ + for (tp = *listp; tp != NULL; tp = tp->p_hash) { + if (tp->p_vnode == vp && tp->p_offset == offset) { + pp->p_vnode = NULL; + pp->p_offset = (u_offset_t)(-1); + return (0); + } + } + pp->p_hash = *listp; + *listp = pp; + + /* + * Add the page to the vnode's list of pages + */ + if (vp->v_pages != NULL && IS_VMODSORT(vp) && hat_ismod(pp)) + listp = &vp->v_pages->p_vpprev->p_vpnext; + else + listp = &vp->v_pages; + + page_vpadd(listp, pp); + + return (1); +} + +/* + * Add page `pp' to both the hash and vp chains for [vp, offset]. + * + * Returns 1 on success and 0 on failure. + * If hold is passed in, it is not dropped. + */ +int +page_hashin(page_t *pp, vnode_t *vp, u_offset_t offset, kmutex_t *hold) +{ + kmutex_t *phm = NULL; + kmutex_t *vphm; + int rc; + + ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); + + TRACE_3(TR_FAC_VM, TR_PAGE_HASHIN, + "page_hashin:pp %p vp %p offset %llx", + pp, vp, offset); + + VM_STAT_ADD(hashin_count); + + if (hold != NULL) + phm = hold; + else { + VM_STAT_ADD(hashin_not_held); + phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, offset)); + mutex_enter(phm); + } + + vphm = page_vnode_mutex(vp); + mutex_enter(vphm); + rc = page_do_hashin(pp, vp, offset); + mutex_exit(vphm); + if (hold == NULL) + mutex_exit(phm); + if (rc == 0) + VM_STAT_ADD(hashin_already); + return (rc); +} + +/* + * Remove page ``pp'' from the hash and vp chains and remove vp association. + * All mutexes must be held + */ +static void +page_do_hashout(page_t *pp) +{ + page_t **hpp; + page_t *hp; + vnode_t *vp = pp->p_vnode; + + ASSERT(vp != NULL); + ASSERT(MUTEX_HELD(page_vnode_mutex(vp))); + + /* + * First, take pp off of its hash chain. + */ + hpp = &page_hash[PAGE_HASH_FUNC(vp, pp->p_offset)]; + + for (;;) { + hp = *hpp; + if (hp == pp) + break; + if (hp == NULL) { + panic("page_do_hashout"); + /*NOTREACHED*/ + } + hpp = &hp->p_hash; + } + *hpp = pp->p_hash; + + /* + * Now remove it from its associated vnode. + */ + if (vp->v_pages) + page_vpsub(&vp->v_pages, pp); + + pp->p_hash = NULL; + page_clr_all_props(pp); + PP_CLRSWAP(pp); + pp->p_vnode = NULL; + pp->p_offset = (u_offset_t)-1; +} + +/* + * Remove page ``pp'' from the hash and vp chains and remove vp association. + * + * When `phm' is non-NULL it contains the address of the mutex protecting the + * hash list pp is on. It is not dropped. + */ +void +page_hashout(page_t *pp, kmutex_t *phm) +{ + vnode_t *vp; + ulong_t index; + kmutex_t *nphm; + kmutex_t *vphm; + kmutex_t *sep; + + ASSERT(phm != NULL ? 
MUTEX_HELD(phm) : 1); + ASSERT(pp->p_vnode != NULL); + ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr); + ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(pp->p_vnode))); + + vp = pp->p_vnode; + + TRACE_2(TR_FAC_VM, TR_PAGE_HASHOUT, + "page_hashout:pp %p vp %p", pp, vp); + + /* Kernel probe */ + TNF_PROBE_2(page_unmap, "vm pagefault", /* CSTYLED */, + tnf_opaque, vnode, vp, + tnf_offset, offset, pp->p_offset); + + /* + * + */ + VM_STAT_ADD(hashout_count); + index = PAGE_HASH_FUNC(vp, pp->p_offset); + if (phm == NULL) { + VM_STAT_ADD(hashout_not_held); + nphm = PAGE_HASH_MUTEX(index); + mutex_enter(nphm); + } + ASSERT(phm ? phm == PAGE_HASH_MUTEX(index) : 1); + + + /* + * grab page vnode mutex and remove it... + */ + vphm = page_vnode_mutex(vp); + mutex_enter(vphm); + + page_do_hashout(pp); + + mutex_exit(vphm); + if (phm == NULL) + mutex_exit(nphm); + + /* + * If the page was retired, update the pages_retired + * total and clear the page flag + */ + if (page_isretired(pp)) { + retired_page_removed(pp); + } + + /* + * Wake up processes waiting for this page. The page's + * identity has been changed, and is probably not the + * desired page any longer. + */ + sep = page_se_mutex(pp); + mutex_enter(sep); + if (CV_HAS_WAITERS(&pp->p_cv)) + cv_broadcast(&pp->p_cv); + mutex_exit(sep); +} + +/* + * Add the page to the front of a linked list of pages + * using the p_next & p_prev pointers for the list. + * The caller is responsible for protecting the list pointers. + */ +void +page_add(page_t **ppp, page_t *pp) +{ + ASSERT(PAGE_EXCL(pp) || (PAGE_SHARED(pp) && page_iolock_assert(pp))); + + page_add_common(ppp, pp); +} + + + +/* + * Common code for page_add() and mach_page_add() + */ +void +page_add_common(page_t **ppp, page_t *pp) +{ + if (*ppp == NULL) { + pp->p_next = pp->p_prev = pp; + } else { + pp->p_next = *ppp; + pp->p_prev = (*ppp)->p_prev; + (*ppp)->p_prev = pp; + pp->p_prev->p_next = pp; + } + *ppp = pp; +} + + +/* + * Remove this page from a linked list of pages + * using the p_next & p_prev pointers for the list. + * + * The caller is responsible for protecting the list pointers. + */ +void +page_sub(page_t **ppp, page_t *pp) +{ + ASSERT((PP_ISFREE(pp)) ? 1 : + (PAGE_EXCL(pp)) || (PAGE_SHARED(pp) && page_iolock_assert(pp))); + + if (*ppp == NULL || pp == NULL) { + panic("page_sub: bad arg(s): pp %p, *ppp %p", + (void *)pp, (void *)(*ppp)); + /*NOTREACHED*/ + } + + page_sub_common(ppp, pp); +} + + +/* + * Common code for page_sub() and mach_page_sub() + */ +void +page_sub_common(page_t **ppp, page_t *pp) +{ + if (*ppp == pp) + *ppp = pp->p_next; /* go to next page */ + + if (*ppp == pp) + *ppp = NULL; /* page list is gone */ + else { + pp->p_prev->p_next = pp->p_next; + pp->p_next->p_prev = pp->p_prev; + } + pp->p_prev = pp->p_next = pp; /* make pp a list of one */ +} + + +/* + * Break page list cppp into two lists with npages in the first list. + * The tail is returned in nppp. 
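+ *
+ * (Editor's worked example, not in the original comment: given the
+ * five page circular list A-B-C-D-E in *oppp and npages == 2, the
+ * routine leaves the two page list A-B in *oppp and returns C-D-E in
+ * *nppp; with npages equal to the full length, nothing moves and *nppp
+ * comes back NULL.)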
+ */ +void +page_list_break(page_t **oppp, page_t **nppp, pgcnt_t npages) +{ + page_t *s1pp = *oppp; + page_t *s2pp; + page_t *e1pp, *e2pp; + long n = 0; + + if (s1pp == NULL) { + *nppp = NULL; + return; + } + if (npages == 0) { + *nppp = s1pp; + *oppp = NULL; + return; + } + for (n = 0, s2pp = *oppp; n < npages; n++) { + s2pp = s2pp->p_next; + } + /* Fix head and tail of new lists */ + e1pp = s2pp->p_prev; + e2pp = s1pp->p_prev; + s1pp->p_prev = e1pp; + e1pp->p_next = s1pp; + s2pp->p_prev = e2pp; + e2pp->p_next = s2pp; + + /* second list empty */ + if (s2pp == s1pp) { + *oppp = s1pp; + *nppp = NULL; + } else { + *oppp = s1pp; + *nppp = s2pp; + } +} + +/* + * Concatenate page list nppp onto the end of list ppp. + */ +void +page_list_concat(page_t **ppp, page_t **nppp) +{ + page_t *s1pp, *s2pp, *e1pp, *e2pp; + + if (*nppp == NULL) { + return; + } + if (*ppp == NULL) { + *ppp = *nppp; + return; + } + s1pp = *ppp; + e1pp = s1pp->p_prev; + s2pp = *nppp; + e2pp = s2pp->p_prev; + s1pp->p_prev = e2pp; + e2pp->p_next = s1pp; + e1pp->p_next = s2pp; + s2pp->p_prev = e1pp; +} + +/* + * return the next page in the page list + */ +page_t * +page_list_next(page_t *pp) +{ + return (pp->p_next); +} + + +/* + * Add the page to the front of the linked list of pages + * using p_vpnext/p_vpprev pointers for the list. + * + * The caller is responsible for protecting the lists. + */ +void +page_vpadd(page_t **ppp, page_t *pp) +{ + if (*ppp == NULL) { + pp->p_vpnext = pp->p_vpprev = pp; + } else { + pp->p_vpnext = *ppp; + pp->p_vpprev = (*ppp)->p_vpprev; + (*ppp)->p_vpprev = pp; + pp->p_vpprev->p_vpnext = pp; + } + *ppp = pp; +} + +/* + * Remove this page from the linked list of pages + * using p_vpnext/p_vpprev pointers for the list. + * + * The caller is responsible for protecting the lists. + */ +void +page_vpsub(page_t **ppp, page_t *pp) +{ + if (*ppp == NULL || pp == NULL) { + panic("page_vpsub: bad arg(s): pp %p, *ppp %p", + (void *)pp, (void *)(*ppp)); + /*NOTREACHED*/ + } + + if (*ppp == pp) + *ppp = pp->p_vpnext; /* go to next page */ + + if (*ppp == pp) + *ppp = NULL; /* page list is gone */ + else { + pp->p_vpprev->p_vpnext = pp->p_vpnext; + pp->p_vpnext->p_vpprev = pp->p_vpprev; + } + pp->p_vpprev = pp->p_vpnext = pp; /* make pp a list of one */ +} + +/* + * Lock a physical page into memory "long term". Used to support "lock + * in memory" functions. Accepts the page to be locked, and a cow variable + * to indicate whether a the lock will travel to the new page during + * a potential copy-on-write. + */ +int +page_pp_lock( + page_t *pp, /* page to be locked */ + int cow, /* cow lock */ + int kernel) /* must succeed -- ignore checking */ +{ + int r = 0; /* result -- assume failure */ + + ASSERT(PAGE_LOCKED(pp)); + + page_struct_lock(pp); + /* + * Acquire the "freemem_lock" for availrmem. 
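+ *
+ * (Editor's summary, for readability: the cow case charges availrmem
+ * and bumps p_cowcnt only while availrmem stays above pages_pp_maximum;
+ * the plain case bumps p_lckcnt, charging availrmem only for the first
+ * non-kernel lock on the page.  Hitting PAGE_LOCK_MAXIMUM logs a
+ * warning, and later lock attempts on that page simply fail.)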
+ */ + if (cow) { + mutex_enter(&freemem_lock); + if ((availrmem > pages_pp_maximum) && + (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) { + availrmem--; + pages_locked++; + mutex_exit(&freemem_lock); + r = 1; + if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { + cmn_err(CE_WARN, + "COW lock limit reached on pfn 0x%lx", + page_pptonum(pp)); + } + } else + mutex_exit(&freemem_lock); + } else { + if (pp->p_lckcnt) { + if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) { + r = 1; + if (++pp->p_lckcnt == + (ushort_t)PAGE_LOCK_MAXIMUM) { + cmn_err(CE_WARN, "Page lock limit " + "reached on pfn 0x%lx", + page_pptonum(pp)); + } + } + } else { + if (kernel) { + /* availrmem accounting done by caller */ + ++pp->p_lckcnt; + r = 1; + } else { + mutex_enter(&freemem_lock); + if (availrmem > pages_pp_maximum) { + availrmem--; + pages_locked++; + ++pp->p_lckcnt; + r = 1; + } + mutex_exit(&freemem_lock); + } + } + } + page_struct_unlock(pp); + return (r); +} + +/* + * Decommit a lock on a physical page frame. Account for cow locks if + * appropriate. + */ +void +page_pp_unlock( + page_t *pp, /* page to be unlocked */ + int cow, /* expect cow lock */ + int kernel) /* this was a kernel lock */ +{ + ASSERT(PAGE_LOCKED(pp)); + + page_struct_lock(pp); + /* + * Acquire the "freemem_lock" for availrmem. + * If cowcnt or lcknt is already 0 do nothing; i.e., we + * could be called to unlock even if nothing is locked. This could + * happen if locked file pages were truncated (removing the lock) + * and the file was grown again and new pages faulted in; the new + * pages are unlocked but the segment still thinks they're locked. + */ + if (cow) { + if (pp->p_cowcnt) { + mutex_enter(&freemem_lock); + pp->p_cowcnt--; + availrmem++; + pages_locked--; + mutex_exit(&freemem_lock); + } + } else { + if (pp->p_lckcnt && --pp->p_lckcnt == 0) { + if (!kernel) { + mutex_enter(&freemem_lock); + availrmem++; + pages_locked--; + mutex_exit(&freemem_lock); + } + } + } + page_struct_unlock(pp); +} + +/* + * This routine reserves availrmem for npages; + * flags: KM_NOSLEEP or KM_SLEEP + * returns 1 on success or 0 on failure + */ +int +page_resv(pgcnt_t npages, uint_t flags) +{ + mutex_enter(&freemem_lock); + while (availrmem < tune.t_minarmem + npages) { + if (flags & KM_NOSLEEP) { + mutex_exit(&freemem_lock); + return (0); + } + mutex_exit(&freemem_lock); + page_needfree(npages); + kmem_reap(); + delay(hz >> 2); + page_needfree(-(spgcnt_t)npages); + mutex_enter(&freemem_lock); + } + availrmem -= npages; + mutex_exit(&freemem_lock); + return (1); +} + +/* + * This routine unreserves availrmem for npages; + */ +void +page_unresv(pgcnt_t npages) +{ + mutex_enter(&freemem_lock); + availrmem += npages; + mutex_exit(&freemem_lock); +} + +/* + * See Statement at the beginning of segvn_lockop() regarding + * the way we handle cowcnts and lckcnts. + * + * Transfer cowcnt on 'opp' to cowcnt on 'npp' if the vpage + * that breaks COW has PROT_WRITE. + * + * Note that, we may also break COW in case we are softlocking + * on read access during physio; + * in this softlock case, the vpage may not have PROT_WRITE. + * So, we need to transfer lckcnt on 'opp' to lckcnt on 'npp' + * if the vpage doesn't have PROT_WRITE. + * + * This routine is never called if we are stealing a page + * in anon_private. + * + * The caller subtracted from availrmem for read only mapping. + * if lckcnt is 1 increment availrmem. 
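+ *
+ * (Editor's worked example, illustrative: if a write fault breaks COW
+ * while opp->p_cowcnt == 2, the claim moves one count, leaving
+ * npp->p_cowcnt == 1 and opp->p_cowcnt == 1; in the softlock/read case
+ * with opp->p_lckcnt == 1, the single lock count moves to npp and one
+ * page is credited back to availrmem, since the caller already charged
+ * availrmem for the read only mapping.)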
+ */ +void +page_pp_useclaim( + page_t *opp, /* original page frame losing lock */ + page_t *npp, /* new page frame gaining lock */ + uint_t write_perm) /* set if vpage has PROT_WRITE */ +{ + int payback = 0; + + ASSERT(PAGE_LOCKED(opp)); + ASSERT(PAGE_LOCKED(npp)); + + page_struct_lock(opp); + + ASSERT(npp->p_cowcnt == 0); + ASSERT(npp->p_lckcnt == 0); + + /* Don't use claim if nothing is locked (see page_pp_unlock above) */ + if ((write_perm && opp->p_cowcnt != 0) || + (!write_perm && opp->p_lckcnt != 0)) { + + if (write_perm) { + npp->p_cowcnt++; + ASSERT(opp->p_cowcnt != 0); + opp->p_cowcnt--; + } else { + + ASSERT(opp->p_lckcnt != 0); + + /* + * We didn't need availrmem decremented if p_lckcnt on + * original page is 1. Here, we are unlocking + * read-only copy belonging to original page and + * are locking a copy belonging to new page. + */ + if (opp->p_lckcnt == 1) + payback = 1; + + npp->p_lckcnt++; + opp->p_lckcnt--; + } + } + if (payback) { + mutex_enter(&freemem_lock); + availrmem++; + pages_useclaim--; + mutex_exit(&freemem_lock); + } + page_struct_unlock(opp); +} + +/* + * Simple claim adjust functions -- used to support changes in + * claims due to changes in access permissions. Used by segvn_setprot(). + */ +int +page_addclaim(page_t *pp) +{ + int r = 0; /* result */ + + ASSERT(PAGE_LOCKED(pp)); + + page_struct_lock(pp); + ASSERT(pp->p_lckcnt != 0); + + if (pp->p_lckcnt == 1) { + if (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM) { + --pp->p_lckcnt; + r = 1; + if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { + cmn_err(CE_WARN, + "COW lock limit reached on pfn 0x%lx", + page_pptonum(pp)); + } + } + } else { + mutex_enter(&freemem_lock); + if ((availrmem > pages_pp_maximum) && + (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) { + --availrmem; + ++pages_claimed; + mutex_exit(&freemem_lock); + --pp->p_lckcnt; + r = 1; + if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { + cmn_err(CE_WARN, + "COW lock limit reached on pfn 0x%lx", + page_pptonum(pp)); + } + } else + mutex_exit(&freemem_lock); + } + page_struct_unlock(pp); + return (r); +} + +int +page_subclaim(page_t *pp) +{ + int r = 0; + + ASSERT(PAGE_LOCKED(pp)); + + page_struct_lock(pp); + ASSERT(pp->p_cowcnt != 0); + + if (pp->p_lckcnt) { + if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) { + r = 1; + /* + * for availrmem + */ + mutex_enter(&freemem_lock); + availrmem++; + pages_claimed--; + mutex_exit(&freemem_lock); + + pp->p_cowcnt--; + + if (++pp->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { + cmn_err(CE_WARN, + "Page lock limit reached on pfn 0x%lx", + page_pptonum(pp)); + } + } + } else { + r = 1; + pp->p_cowcnt--; + pp->p_lckcnt++; + } + page_struct_unlock(pp); + return (r); +} + +int +page_addclaim_pages(page_t **ppa) +{ + + pgcnt_t lckpgs = 0, pg_idx; + + VM_STAT_ADD(pagecnt.pc_addclaim_pages); + + mutex_enter(&page_llock); + for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) { + + ASSERT(PAGE_LOCKED(ppa[pg_idx])); + ASSERT(ppa[pg_idx]->p_lckcnt != 0); + if (ppa[pg_idx]->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { + mutex_exit(&page_llock); + return (0); + } + if (ppa[pg_idx]->p_lckcnt > 1) + lckpgs++; + } + + if (lckpgs != 0) { + mutex_enter(&freemem_lock); + if (availrmem >= pages_pp_maximum + lckpgs) { + availrmem -= lckpgs; + pages_claimed += lckpgs; + } else { + mutex_exit(&freemem_lock); + mutex_exit(&page_llock); + return (0); + } + mutex_exit(&freemem_lock); + } + + for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) { + ppa[pg_idx]->p_lckcnt--; + ppa[pg_idx]->p_cowcnt++; + } + mutex_exit(&page_llock); + return (1); +} 
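+
+/*
+ * Editor's note (an illustrative sketch, not part of the original
+ * source): page_addclaim()/page_subclaim() and their _pages() array
+ * counterparts are meant for callers such as segvn_setprot() that must
+ * convert an existing lock claim (p_lckcnt) into a copy-on-write claim
+ * (p_cowcnt), or back, when a mapping's protections change.  A
+ * hypothetical caller might look roughly like this, where the EAGAIN
+ * return is only an example of how failure could be reported:
+ *
+ *	if (page_addclaim(pp) == 0)
+ *		return (EAGAIN);	(claim limit or availrmem too low)
+ *	...
+ *	(void) page_subclaim(pp);	(undo, restoring the lock claim)
+ */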
+ +int +page_subclaim_pages(page_t **ppa) +{ + pgcnt_t ulckpgs = 0, pg_idx; + + VM_STAT_ADD(pagecnt.pc_subclaim_pages); + + mutex_enter(&page_llock); + for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) { + + ASSERT(PAGE_LOCKED(ppa[pg_idx])); + ASSERT(ppa[pg_idx]->p_cowcnt != 0); + if (ppa[pg_idx]->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { + mutex_exit(&page_llock); + return (0); + } + if (ppa[pg_idx]->p_lckcnt != 0) + ulckpgs++; + } + + if (ulckpgs != 0) { + mutex_enter(&freemem_lock); + availrmem += ulckpgs; + pages_claimed -= ulckpgs; + mutex_exit(&freemem_lock); + } + + for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) { + ppa[pg_idx]->p_cowcnt--; + ppa[pg_idx]->p_lckcnt++; + + } + mutex_exit(&page_llock); + return (1); +} + +page_t * +page_numtopp(pfn_t pfnum, se_t se) +{ + page_t *pp; + +retry: + pp = page_numtopp_nolock(pfnum); + if (pp == NULL) { + return ((page_t *)NULL); + } + + /* + * Acquire the appropriate lock on the page. + */ + while (!page_lock(pp, se, (kmutex_t *)NULL, P_RECLAIM)) { + if (page_pptonum(pp) != pfnum) + goto retry; + continue; + } + + if (page_pptonum(pp) != pfnum) { + page_unlock(pp); + goto retry; + } + + return (pp); +} + +page_t * +page_numtopp_noreclaim(pfn_t pfnum, se_t se) +{ + page_t *pp; + +retry: + pp = page_numtopp_nolock(pfnum); + if (pp == NULL) { + return ((page_t *)NULL); + } + + /* + * Acquire the appropriate lock on the page. + */ + while (!page_lock(pp, se, (kmutex_t *)NULL, P_NO_RECLAIM)) { + if (page_pptonum(pp) != pfnum) + goto retry; + continue; + } + + if (page_pptonum(pp) != pfnum) { + page_unlock(pp); + goto retry; + } + + return (pp); +} + +/* + * This routine is like page_numtopp, but will only return page structs + * for pages which are ok for loading into hardware using the page struct. + */ +page_t * +page_numtopp_nowait(pfn_t pfnum, se_t se) +{ + page_t *pp; + +retry: + pp = page_numtopp_nolock(pfnum); + if (pp == NULL) { + return ((page_t *)NULL); + } + + /* + * Try to acquire the appropriate lock on the page. + */ + if (PP_ISFREE(pp)) + pp = NULL; + else { + if (!page_trylock(pp, se)) + pp = NULL; + else { + if (page_pptonum(pp) != pfnum) { + page_unlock(pp); + goto retry; + } + if (PP_ISFREE(pp)) { + page_unlock(pp); + pp = NULL; + } + } + } + return (pp); +} + +/* + * Returns a count of dirty pages that are in the process + * of being written out. If 'cleanit' is set, try to push the page. + */ +pgcnt_t +page_busy(int cleanit) +{ + page_t *page0 = page_first(); + page_t *pp = page0; + pgcnt_t nppbusy = 0; + u_offset_t off; + + do { + vnode_t *vp = pp->p_vnode; + + /* + * A page is a candidate for syncing if it is: + * + * (a) On neither the freelist nor the cachelist + * (b) Hashed onto a vnode + * (c) Not a kernel page + * (d) Dirty + * (e) Not part of a swapfile + * (f) a page which belongs to a real vnode; eg has a non-null + * v_vfsp pointer. 
+ * (g) Backed by a filesystem which doesn't have a + * stubbed-out sync operation + */ + if (!PP_ISFREE(pp) && vp != NULL && vp != &kvp && + hat_ismod(pp) && !IS_SWAPVP(vp) && vp->v_vfsp != NULL && + vfs_can_sync(vp->v_vfsp)) { + nppbusy++; + vfs_syncprogress(); + + if (!cleanit) + continue; + if (!page_trylock(pp, SE_EXCL)) + continue; + + if (PP_ISFREE(pp) || vp == NULL || IS_SWAPVP(vp) || + pp->p_lckcnt != 0 || pp->p_cowcnt != 0 || + !(hat_pagesync(pp, + HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD)) { + page_unlock(pp); + continue; + } + off = pp->p_offset; + VN_HOLD(vp); + page_unlock(pp); + (void) VOP_PUTPAGE(vp, off, PAGESIZE, + B_ASYNC | B_FREE, kcred); + VN_RELE(vp); + } + } while ((pp = page_next(pp)) != page0); + + return (nppbusy); +} + +void page_invalidate_pages(void); + +/* + * callback handler to vm sub-system + * + * callers make sure no recursive entries to this func. + */ +/*ARGSUSED*/ +boolean_t +callb_vm_cpr(void *arg, int code) +{ + if (code == CB_CODE_CPR_CHKPT) + page_invalidate_pages(); + return (B_TRUE); +} + +/* + * Invalidate all pages of the system. + * It shouldn't be called until all user page activities are all stopped. + */ +void +page_invalidate_pages() +{ + page_t *pp; + page_t *page0; + pgcnt_t nbusypages; + int retry = 0; + const int MAXRETRIES = 4; +#if defined(__sparc) + extern struct vnode prom_ppages; +#endif /* __sparc */ + +top: + /* + * Flush dirty pages and destory the clean ones. + */ + nbusypages = 0; + + pp = page0 = page_first(); + do { + struct vnode *vp; + u_offset_t offset; + int mod; + + /* + * skip the page if it has no vnode or the page associated + * with the kernel vnode or prom allocated kernel mem. + */ +#if defined(__sparc) + if ((vp = pp->p_vnode) == NULL || vp == &kvp || + vp == &prom_ppages) +#else /* x86 doesn't have prom or prom_ppage */ + if ((vp = pp->p_vnode) == NULL || vp == &kvp) +#endif /* __sparc */ + continue; + + /* + * skip the page which is already free invalidated. + */ + if (PP_ISFREE(pp) && PP_ISAGED(pp)) + continue; + + /* + * skip pages that are already locked or can't be "exclusively" + * locked or are already free. After we lock the page, check + * the free and age bits again to be sure it's not destroied + * yet. + * To achieve max. parallelization, we use page_trylock instead + * of page_lock so that we don't get block on individual pages + * while we have thousands of other pages to process. + */ + if (!page_trylock(pp, SE_EXCL)) { + nbusypages++; + continue; + } else if (PP_ISFREE(pp)) { + if (!PP_ISAGED(pp)) { + page_destroy_free(pp); + } else { + page_unlock(pp); + } + continue; + } + /* + * Is this page involved in some I/O? shared? + * + * The page_struct_lock need not be acquired to + * examine these fields since the page has an + * "exclusive" lock. + */ + if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { + page_unlock(pp); + continue; + } + + if (vp->v_type == VCHR) { + panic("vp->v_type == VCHR"); + /*NOTREACHED*/ + } + + if (!page_try_demote_pages(pp)) { + page_unlock(pp); + continue; + } + + /* + * Check the modified bit. Leave the bits alone in hardware + * (they will be modified if we do the putpage). + */ + mod = (hat_pagesync(pp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) + & P_MOD); + if (mod) { + offset = pp->p_offset; + /* + * Hold the vnode before releasing the page lock + * to prevent it from being freed and re-used by + * some other thread. + */ + VN_HOLD(vp); + page_unlock(pp); + /* + * No error return is checked here. 
Callers such as + * cpr deals with the dirty pages at the dump time + * if this putpage fails. + */ + (void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_INVAL, + kcred); + VN_RELE(vp); + } else { + page_destroy(pp, 0); + } + } while ((pp = page_next(pp)) != page0); + if (nbusypages && retry++ < MAXRETRIES) { + delay(1); + goto top; + } +} + +/* + * Replace the page "old" with the page "new" on the page hash and vnode lists + * + * the replacemnt must be done in place, ie the equivalent sequence: + * + * vp = old->p_vnode; + * off = old->p_offset; + * page_do_hashout(old) + * page_do_hashin(new, vp, off) + * + * doesn't work, since + * 1) if old is the only page on the vnode, the v_pages list has a window + * where it looks empty. This will break file system assumptions. + * and + * 2) pvn_vplist_dirty() can't deal with pages moving on the v_pages list. + */ +static void +page_do_relocate_hash(page_t *new, page_t *old) +{ + page_t **hash_list; + vnode_t *vp = old->p_vnode; + kmutex_t *sep; + + ASSERT(PAGE_EXCL(old)); + ASSERT(PAGE_EXCL(new)); + ASSERT(vp != NULL); + ASSERT(MUTEX_HELD(page_vnode_mutex(vp))); + ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, old->p_offset)))); + + /* + * First find old page on the page hash list + */ + hash_list = &page_hash[PAGE_HASH_FUNC(vp, old->p_offset)]; + + for (;;) { + if (*hash_list == old) + break; + if (*hash_list == NULL) { + panic("page_do_hashout"); + /*NOTREACHED*/ + } + hash_list = &(*hash_list)->p_hash; + } + + /* + * update new and replace old with new on the page hash list + */ + new->p_vnode = old->p_vnode; + new->p_offset = old->p_offset; + new->p_hash = old->p_hash; + *hash_list = new; + + if ((new->p_vnode->v_flag & VISSWAP) != 0) + PP_SETSWAP(new); + + /* + * replace old with new on the vnode's page list + */ + if (old->p_vpnext == old) { + new->p_vpnext = new; + new->p_vpprev = new; + } else { + new->p_vpnext = old->p_vpnext; + new->p_vpprev = old->p_vpprev; + new->p_vpnext->p_vpprev = new; + new->p_vpprev->p_vpnext = new; + } + if (vp->v_pages == old) + vp->v_pages = new; + + /* + * clear out the old page + */ + old->p_hash = NULL; + old->p_vpnext = NULL; + old->p_vpprev = NULL; + old->p_vnode = NULL; + PP_CLRSWAP(old); + old->p_offset = (u_offset_t)-1; + page_clr_all_props(old); + + /* + * Wake up processes waiting for this page. The page's + * identity has been changed, and is probably not the + * desired page any longer. + */ + sep = page_se_mutex(old); + mutex_enter(sep); + if (CV_HAS_WAITERS(&old->p_cv)) + cv_broadcast(&old->p_cv); + mutex_exit(sep); +} + +/* + * This function moves the identity of page "pp_old" to page "pp_new". + * Both pages must be locked on entry. "pp_new" is free, has no identity, + * and need not be hashed out from anywhere. + */ +void +page_relocate_hash(page_t *pp_new, page_t *pp_old) +{ + vnode_t *vp = pp_old->p_vnode; + u_offset_t off = pp_old->p_offset; + kmutex_t *phm, *vphm; + + /* + * Rehash two pages + */ + ASSERT(PAGE_EXCL(pp_old)); + ASSERT(PAGE_EXCL(pp_new)); + ASSERT(vp != NULL); + ASSERT(pp_new->p_vnode == NULL); + + /* + * hashout then hashin while holding the mutexes + */ + phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, off)); + mutex_enter(phm); + vphm = page_vnode_mutex(vp); + mutex_enter(vphm); + + page_do_relocate_hash(pp_new, pp_old); + + mutex_exit(vphm); + mutex_exit(phm); + + /* + * The page_struct_lock need not be acquired for lckcnt and + * cowcnt since the page has an "exclusive" lock. 
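+ * Illustrative note (not additional code in this routine): once the
+ * counts below have been copied, the caller-visible effect is,
+ * roughly,
+ *
+ * pp_new->p_vnode == vp && pp_new->p_offset == off
+ * pp_old->p_vnode == NULL
+ *
+ * i.e. pp_new has taken over pp_old's identity completely.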
+ */ + ASSERT(pp_new->p_lckcnt == 0); + ASSERT(pp_new->p_cowcnt == 0); + pp_new->p_lckcnt = pp_old->p_lckcnt; + pp_new->p_cowcnt = pp_old->p_cowcnt; + pp_old->p_lckcnt = pp_old->p_cowcnt = 0; + + /* The following comment preserved from page_flip(). */ + /* XXX - Do we need to protect fsdata? */ + pp_new->p_fsdata = pp_old->p_fsdata; +} + +/* + * Helper routine used to lock all remaining members of a + * large page. The caller is responsible for passing in a locked + * pp. If pp is a large page, then it succeeds in locking all the + * remaining constituent pages or it returns with only the + * original page locked. + * + * Returns 1 on success, 0 on failure. + * + * If success is returned this routine gurantees p_szc for all constituent + * pages of a large page pp belongs to can't change. To achieve this we + * recheck szc of pp after locking all constituent pages and retry if szc + * changed (it could only decrease). Since hat_page_demote() needs an EXCL + * lock on one of constituent pages it can't be running after all constituent + * pages are locked. hat_page_demote() with a lock on a constituent page + * outside of this large page (i.e. pp belonged to a larger large page) is + * already done with all constituent pages of pp since the root's p_szc is + * changed last. Thefore no need to synchronize with hat_page_demote() that + * locked a constituent page outside of pp's current large page. + */ +#ifdef DEBUG +uint32_t gpg_trylock_mtbf = 0; +#endif + +int +group_page_trylock(page_t *pp, se_t se) +{ + page_t *tpp; + pgcnt_t npgs, i, j; + uint_t pszc = pp->p_szc; + +#ifdef DEBUG + if (gpg_trylock_mtbf && !(gethrtime() % gpg_trylock_mtbf)) { + return (0); + } +#endif + + if (pp != PP_GROUPLEADER(pp, pszc)) { + return (0); + } + +retry: + ASSERT(PAGE_LOCKED_SE(pp, se)); + ASSERT(!PP_ISFREE(pp)); + if (pszc == 0) { + return (1); + } + npgs = page_get_pagecnt(pszc); + tpp = pp + 1; + for (i = 1; i < npgs; i++, tpp++) { + if (!page_trylock(tpp, se)) { + tpp = pp + 1; + for (j = 1; j < i; j++, tpp++) { + page_unlock(tpp); + } + return (0); + } + } + if (pp->p_szc != pszc) { + ASSERT(pp->p_szc < pszc); + ASSERT(pp->p_vnode != NULL && pp->p_vnode != &kvp && + !IS_SWAPFSVP(pp->p_vnode)); + tpp = pp + 1; + for (i = 1; i < npgs; i++, tpp++) { + page_unlock(tpp); + } + pszc = pp->p_szc; + goto retry; + } + return (1); +} + +void +group_page_unlock(page_t *pp) +{ + page_t *tpp; + pgcnt_t npgs, i; + + ASSERT(PAGE_LOCKED(pp)); + ASSERT(!PP_ISFREE(pp)); + ASSERT(pp == PP_PAGEROOT(pp)); + npgs = page_get_pagecnt(pp->p_szc); + for (i = 1, tpp = pp + 1; i < npgs; i++, tpp++) { + page_unlock(tpp); + } +} + +/* + * returns + * 0 : on success and *nrelocp is number of relocated PAGESIZE pages + * ERANGE : this is not a base page + * EBUSY : failure to get locks on the page/pages + * ENOMEM : failure to obtain replacement pages + * EAGAIN : OBP has not yet completed its boot-time handoff to the kernel + * + * Return with all constituent members of target and replacement + * SE_EXCL locked. It is the callers responsibility to drop the + * locks. 
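+ *
+ * Rough usage sketch, for illustration only (the local names are
+ * made up; page_relocate() below is the usual wrapper and can also
+ * free the old page for the caller):
+ *
+ * spgcnt_t nreloc;
+ * page_t *repl = NULL;
+ * if (do_page_relocate(&targ, &repl, 1, &nreloc, NULL) == 0) {
+ * ... targ now lists the old constituent pages, repl the
+ * ... new ones; all are SE_EXCL locked and the caller must
+ * ... drop those locks (or free the old pages) itself.
+ * }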
+ */ +int +do_page_relocate( + page_t **target, + page_t **replacement, + int grouplock, + spgcnt_t *nrelocp, + lgrp_t *lgrp) +{ +#ifdef DEBUG + page_t *first_repl; +#endif /* DEBUG */ + page_t *repl; + page_t *targ; + page_t *pl = NULL; + uint_t ppattr; + pfn_t pfn, repl_pfn; + uint_t szc; + spgcnt_t npgs, i; + int repl_contig = 0; + uint_t flags = 0; + spgcnt_t dofree = 0; + + *nrelocp = 0; + +#if defined(__sparc) + /* + * We need to wait till OBP has completed + * its boot-time handoff of its resources to the kernel + * before we allow page relocation + */ + if (page_relocate_ready == 0) { + return (EAGAIN); + } +#endif + + /* + * If this is not a base page, + * just return with 0x0 pages relocated. + */ + targ = *target; + ASSERT(PAGE_EXCL(targ)); + ASSERT(!PP_ISFREE(targ)); + szc = targ->p_szc; + ASSERT(szc < mmu_page_sizes); + VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]); + pfn = targ->p_pagenum; + if (pfn != PFN_BASE(pfn, szc)) { + VM_STAT_ADD(vmm_vmstats.ppr_relocnoroot[szc]); + return (ERANGE); + } + + if ((repl = *replacement) != NULL && repl->p_szc >= szc) { + repl_pfn = repl->p_pagenum; + if (repl_pfn != PFN_BASE(repl_pfn, szc)) { + VM_STAT_ADD(vmm_vmstats.ppr_reloc_replnoroot[szc]); + return (ERANGE); + } + repl_contig = 1; + } + + /* + * We must lock all members of this large page or we cannot + * relocate any part of it. + */ + if (grouplock != 0 && !group_page_trylock(targ, SE_EXCL)) { + VM_STAT_ADD(vmm_vmstats.ppr_relocnolock[targ->p_szc]); + return (EBUSY); + } + + /* + * reread szc it could have been decreased before + * group_page_trylock() was done. + */ + szc = targ->p_szc; + ASSERT(szc < mmu_page_sizes); + VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]); + ASSERT(pfn == PFN_BASE(pfn, szc)); + + npgs = page_get_pagecnt(targ->p_szc); + + if (repl == NULL) { + dofree = npgs; /* Size of target page in MMU pages */ + if (!page_create_wait(dofree, 0)) { + if (grouplock != 0) { + group_page_unlock(targ); + } + VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]); + return (ENOMEM); + } + + /* + * seg kmem pages require that the target and replacement + * page be the same pagesize. + */ + flags = (targ->p_vnode == &kvp) ? PGR_SAMESZC : 0; + repl = page_get_replacement_page(targ, lgrp, flags); + if (repl == NULL) { + if (grouplock != 0) { + group_page_unlock(targ); + } + page_create_putback(dofree); + VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]); + return (ENOMEM); + } + } +#ifdef DEBUG + else { + ASSERT(PAGE_LOCKED(repl)); + } +#endif /* DEBUG */ + +#if defined(__sparc) + /* + * Let hat_page_relocate() complete the relocation if it's kernel page + */ + if (targ->p_vnode == &kvp) { + *replacement = repl; + if (hat_page_relocate(target, replacement, nrelocp) != 0) { + if (grouplock != 0) { + group_page_unlock(targ); + } + if (dofree) { + *replacement = NULL; + page_free_replacement_page(repl); + page_create_putback(dofree); + } + VM_STAT_ADD(vmm_vmstats.ppr_krelocfail[szc]); + return (EAGAIN); + } + VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]); + return (0); + } +#else +#if defined(lint) + dofree = dofree; +#endif +#endif + +#ifdef DEBUG + first_repl = repl; +#endif /* DEBUG */ + + for (i = 0; i < npgs; i++) { + ASSERT(PAGE_EXCL(targ)); + + (void) hat_pageunload(targ, HAT_FORCE_PGUNLOAD); + + ASSERT(hat_page_getshare(targ) == 0); + ASSERT(!PP_ISFREE(targ)); + ASSERT(targ->p_pagenum == (pfn + i)); + ASSERT(repl_contig == 0 || + repl->p_pagenum == (repl_pfn + i)); + + /* + * Copy the page contents and attributes then + * relocate the page in the page hash. 
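+ * In outline: ppcopy() moves the data, the P_MOD/P_REF/P_RO bits are
+ * carried across by hand, page_relocate_hash() transfers the
+ * <vnode, offset> identity, and the old constituent page is then
+ * collected onto the 'pl' list handed back to the caller.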
+ */ + ppcopy(targ, repl); + ppattr = hat_page_getattr(targ, (P_MOD | P_REF | P_RO)); + page_clr_all_props(repl); + page_set_props(repl, ppattr); + page_relocate_hash(repl, targ); + + ASSERT(hat_page_getshare(targ) == 0); + ASSERT(hat_page_getshare(repl) == 0); + /* + * Now clear the props on targ, after the + * page_relocate_hash(), they no longer + * have any meaning. + */ + page_clr_all_props(targ); + ASSERT(targ->p_next == targ); + ASSERT(targ->p_prev == targ); + page_list_concat(&pl, &targ); + + targ++; + if (repl_contig != 0) { + repl++; + } else { + repl = repl->p_next; + } + } + /* assert that we have come full circle with repl */ + ASSERT(repl_contig == 1 || first_repl == repl); + + *target = pl; + if (*replacement == NULL) { + ASSERT(first_repl == repl); + *replacement = repl; + } + VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]); + *nrelocp = npgs; + return (0); +} +/* + * On success returns 0 and *nrelocp the number of PAGESIZE pages relocated. + */ +int +page_relocate( + page_t **target, + page_t **replacement, + int grouplock, + int freetarget, + spgcnt_t *nrelocp, + lgrp_t *lgrp) +{ + spgcnt_t ret; + + /* do_page_relocate returns 0 on success or errno value */ + ret = do_page_relocate(target, replacement, grouplock, nrelocp, lgrp); + + if (ret != 0 || freetarget == 0) { + return (ret); + } + if (*nrelocp == 1) { + ASSERT(*target != NULL); + page_free(*target, 1); + } else { + page_t *tpp = *target; + uint_t szc = tpp->p_szc; + pgcnt_t npgs = page_get_pagecnt(szc); + ASSERT(npgs > 1); + ASSERT(szc != 0); + do { + ASSERT(PAGE_EXCL(tpp)); + ASSERT(!hat_page_is_mapped(tpp)); + ASSERT(tpp->p_szc == szc); + PP_SETFREE(tpp); + PP_SETAGED(tpp); + npgs--; + } while ((tpp = tpp->p_next) != *target); + ASSERT(npgs == 0); + page_list_add_pages(*target, 0); + npgs = page_get_pagecnt(szc); + page_create_putback(npgs); + } + return (ret); +} + +/* + * it is up to the caller to deal with pcf accounting. + */ +void +page_free_replacement_page(page_t *pplist) +{ + page_t *pp; + + while (pplist != NULL) { + /* + * pp_targ is a linked list. + */ + pp = pplist; + if (pp->p_szc == 0) { + page_sub(&pplist, pp); + page_clr_all_props(pp); + PP_SETFREE(pp); + PP_SETAGED(pp); + page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); + page_unlock(pp); + VM_STAT_ADD(pagecnt.pc_free_replacement_page[0]); + } else { + spgcnt_t curnpgs = page_get_pagecnt(pp->p_szc); + page_t *tpp; + page_list_break(&pp, &pplist, curnpgs); + tpp = pp; + do { + ASSERT(PAGE_EXCL(tpp)); + ASSERT(!hat_page_is_mapped(tpp)); + page_clr_all_props(pp); + PP_SETFREE(tpp); + PP_SETAGED(tpp); + } while ((tpp = tpp->p_next) != pp); + page_list_add_pages(pp, 0); + VM_STAT_ADD(pagecnt.pc_free_replacement_page[1]); + } + } +} + +/* + * Relocate target to non-relocatable replacement page. 
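+ *
+ * The loop below simply retries until the cage can supply a
+ * PG_NORELOC replacement, waking the cageout thread between
+ * attempts.  Sketch of a call site (names are illustrative only):
+ *
+ * page_t *repl;
+ * if (page_relocate_cage(&targ, &repl) == 0)
+ * ... repl is the new, non-relocatable page; the old
+ * ... page has already been freed by page_relocate().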
+ */ +int +page_relocate_cage(page_t **target, page_t **replacement) +{ + page_t *tpp, *rpp; + spgcnt_t pgcnt, npgs; + int result; + + tpp = *target; + + ASSERT(PAGE_EXCL(tpp)); + ASSERT(tpp->p_szc == 0); + + pgcnt = btop(page_get_pagesize(tpp->p_szc)); + + do { + (void) page_create_wait(pgcnt, PG_WAIT | PG_NORELOC); + rpp = page_get_replacement_page(tpp, NULL, PGR_NORELOC); + if (rpp == NULL) { + page_create_putback(pgcnt); + kcage_cageout_wakeup(); + } + } while (rpp == NULL); + + ASSERT(PP_ISNORELOC(rpp)); + + result = page_relocate(&tpp, &rpp, 0, 1, &npgs, NULL); + + if (result == 0) { + *replacement = rpp; + if (pgcnt != npgs) + panic("page_relocate_cage: partial relocation"); + } + + return (result); +} + +/* + * Release the page lock on a page, place on cachelist + * tail if no longer mapped. Caller can let us know if + * the page is known to be clean. + */ +int +page_release(page_t *pp, int checkmod) +{ + int status; + + ASSERT(PAGE_LOCKED(pp) && !PP_ISFREE(pp) && + (pp->p_vnode != NULL)); + + if (!hat_page_is_mapped(pp) && !IS_SWAPVP(pp->p_vnode) && + ((PAGE_SHARED(pp) && page_tryupgrade(pp)) || PAGE_EXCL(pp)) && + pp->p_lckcnt == 0 && pp->p_cowcnt == 0 && + !hat_page_is_mapped(pp)) { + + /* + * If page is modified, unlock it + * + * (p_nrm & P_MOD) bit has the latest stuff because: + * (1) We found that this page doesn't have any mappings + * _after_ holding SE_EXCL and + * (2) We didn't drop SE_EXCL lock after the check in (1) + */ + if (checkmod && hat_ismod(pp)) { + page_unlock(pp); + status = PGREL_MOD; + } else { + /*LINTED: constant in conditional context*/ + VN_DISPOSE(pp, B_FREE, 0, kcred); + status = PGREL_CLEAN; + } + } else { + page_unlock(pp); + status = PGREL_NOTREL; + } + return (status); +} + +int +page_try_demote_pages(page_t *pp) +{ + page_t *tpp, *rootpp = pp; + pfn_t pfn = page_pptonum(pp); + spgcnt_t i, npgs; + uint_t szc = pp->p_szc; + vnode_t *vp = pp->p_vnode; + + ASSERT(PAGE_EXCL(rootpp)); + + VM_STAT_ADD(pagecnt.pc_try_demote_pages[0]); + + if (rootpp->p_szc == 0) { + VM_STAT_ADD(pagecnt.pc_try_demote_pages[1]); + return (1); + } + + if (vp != NULL && !IS_SWAPFSVP(vp) && vp != &kvp) { + VM_STAT_ADD(pagecnt.pc_try_demote_pages[2]); + page_demote_vp_pages(rootpp); + ASSERT(pp->p_szc == 0); + return (1); + } + + /* + * Adjust rootpp if passed in is not the base + * constituent page. + */ + npgs = page_get_pagecnt(rootpp->p_szc); + ASSERT(npgs > 1); + if (!IS_P2ALIGNED(pfn, npgs)) { + pfn = P2ALIGN(pfn, npgs); + rootpp = page_numtopp_nolock(pfn); + VM_STAT_ADD(pagecnt.pc_try_demote_pages[3]); + ASSERT(rootpp->p_vnode != NULL); + ASSERT(rootpp->p_szc == szc); + } + + /* + * We can't demote kernel pages since we can't hat_unload() + * the mappings. + */ + if (rootpp->p_vnode == &kvp) + return (0); + + /* + * Attempt to lock all constituent pages except the page passed + * in since it's already locked. + */ + for (tpp = rootpp, i = 0; i < npgs; i++, tpp = page_next(tpp)) { + ASSERT(!PP_ISFREE(tpp)); + ASSERT(tpp->p_vnode != NULL); + + if (tpp != pp && !page_trylock(tpp, SE_EXCL)) + break; + ASSERT(tpp->p_szc == rootpp->p_szc); + ASSERT(page_pptonum(tpp) == page_pptonum(rootpp) + i); + (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD); + } + + /* + * If we failed to lock them all then unlock what we have locked + * so far and bail. 
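+ * (The walk below revisits the first 'i' constituent pages in the
+ * same order and drops every lock taken above, leaving only the
+ * caller's page 'pp' locked.)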
+ */ + if (i < npgs) { + tpp = rootpp; + while (i-- > 0) { + if (tpp != pp) + page_unlock(tpp); + tpp = page_next(tpp); + } + VM_STAT_ADD(pagecnt.pc_try_demote_pages[4]); + return (0); + } + + /* + * XXX probably p_szc clearing and page unlocking can be done within + * one loop but since this is rare code we can play very safe. + */ + for (tpp = rootpp, i = 0; i < npgs; i++, tpp = page_next(tpp)) { + ASSERT(PAGE_EXCL(tpp)); + tpp->p_szc = 0; + } + + /* + * Unlock all pages except the page passed in. + */ + for (tpp = rootpp, i = 0; i < npgs; i++, tpp = page_next(tpp)) { + ASSERT(!hat_page_is_mapped(tpp)); + if (tpp != pp) + page_unlock(tpp); + } + VM_STAT_ADD(pagecnt.pc_try_demote_pages[5]); + return (1); +} + +/* + * Called by page_free() and page_destroy() to demote the page size code + * (p_szc) to 0 (since we can't just put a single PAGESIZE page with non zero + * p_szc on free list, neither can we just clear p_szc of a single page_t + * within a large page since it will break other code that relies on p_szc + * being the same for all page_t's of a large page). Anonymous pages should + * never end up here because anon_map_getpages() cannot deal with p_szc + * changes after a single constituent page is locked. While anonymous or + * kernel large pages are demoted or freed the entire large page at a time + * with all constituent pages locked EXCL for the file system pages we + * have to be able to demote a large page (i.e. decrease all constituent pages + * p_szc) with only just an EXCL lock on one of constituent pages. The reason + * we can easily deal with anonymous page demotion the entire large page at a + * time is that those operation originate at address space level and concern + * the entire large page region with actual demotion only done when pages are + * not shared with any other processes (therefore we can always get EXCL lock + * on all anonymous constituent pages after clearing segment page + * cache). However file system pages can be truncated or invalidated at a + * PAGESIZE level from the file system side and end up in page_free() or + * page_destroy() (we also allow only part of the large page to be SOFTLOCKed + * and therfore pageout should be able to demote a large page by EXCL locking + * any constituent page that is not under SOFTLOCK). In those cases we cannot + * rely on being able to lock EXCL all constituent pages. + * + * To prevent szc changes on file system pages one has to lock all constituent + * pages at least SHARED (or call page_szc_lock()). The only subsystem that + * doesn't rely on locking all constituent pages (or using page_szc_lock()) to + * prevent szc changes is hat layer that uses its own page level mlist + * locks. hat assumes that szc doesn't change after mlist lock for a page is + * taken. Therefore we need to change szc under hat level locks if we only + * have an EXCL lock on a single constituent page and hat still references any + * of constituent pages. (Note we can't "ignore" hat layer by simply + * hat_pageunload() all constituent pages without having EXCL locks on all of + * constituent pages). We use hat_page_demote() call to safely demote szc of + * all constituent pages under hat locks when we only have an EXCL lock on one + * of constituent pages. + * + * This routine calls page_szc_lock() before calling hat_page_demote() to + * allow segvn in one special case not to lock all constituent pages SHARED + * before calling hat_memload_array() that relies on p_szc not changeing even + * before hat level mlist lock is taken. 
In that case segvn uses + * page_szc_lock() to prevent hat_page_demote() changeing p_szc values. + * + * Anonymous or kernel page demotion still has to lock all pages exclusively + * and do hat_pageunload() on all constituent pages before demoting the page + * therefore there's no need for anonymous or kernel page demotion to use + * hat_page_demote() mechanism. + * + * hat_page_demote() removes all large mappings that map pp and then decreases + * p_szc starting from the last constituent page of the large page. By working + * from the tail of a large page in pfn decreasing order allows one looking at + * the root page to know that hat_page_demote() is done for root's szc area. + * e.g. if a root page has szc 1 one knows it only has to lock all constituent + * pages within szc 1 area to prevent szc changes because hat_page_demote() + * that started on this page when it had szc > 1 is done for this szc 1 area. + * + * We are guranteed that all constituent pages of pp's large page belong to + * the same vnode with the consecutive offsets increasing in the direction of + * the pfn i.e. the identity of constituent pages can't change until their + * p_szc is decreased. Therefore it's safe for hat_page_demote() to remove + * large mappings to pp even though we don't lock any constituent page except + * pp (i.e. we won't unload e.g. kernel locked page). + */ +static void +page_demote_vp_pages(page_t *pp) +{ + kmutex_t *mtx; + + ASSERT(PAGE_EXCL(pp)); + ASSERT(!PP_ISFREE(pp)); + ASSERT(pp->p_vnode != NULL); + ASSERT(!IS_SWAPFSVP(pp->p_vnode)); + ASSERT(pp->p_vnode != &kvp); + + VM_STAT_ADD(pagecnt.pc_demote_pages[0]); + + mtx = page_szc_lock(pp); + if (mtx != NULL) { + hat_page_demote(pp); + mutex_exit(mtx); + } + ASSERT(pp->p_szc == 0); +} + +/* + * Page retire operation. + * + * page_retire() + * Attempt to retire (throw away) page pp. We cannot do this if + * the page is dirty; if the page is clean, we can try. We return 0 on + * success, -1 on failure. This routine should be invoked by the platform's + * memory error detection code. + * + * pages_retired_limit_exceeded() + * We set a limit on the number of pages which may be retired. This + * is set to a percentage of total physical memory. This limit is + * enforced here. + */ + +static pgcnt_t retired_pgcnt = 0; + +/* + * routines to update the count of retired pages + */ +static void +page_retired(page_t *pp) +{ + ASSERT(pp); + + page_settoxic(pp, PAGE_IS_RETIRED); + atomic_add_long(&retired_pgcnt, 1); +} + +static void +retired_page_removed(page_t *pp) +{ + ASSERT(pp); + ASSERT(page_isretired(pp)); + ASSERT(retired_pgcnt > 0); + + page_clrtoxic(pp); + atomic_add_long(&retired_pgcnt, -1); +} + + +static int +pages_retired_limit_exceeded() +{ + pgcnt_t retired_max; + + /* + * If the percentage is zero or is not set correctly, + * return TRUE so that pages are not retired. + */ + if (max_pages_retired_bps <= 0 || + max_pages_retired_bps >= 10000) + return (1); + + /* + * Calculate the maximum number of pages allowed to + * be retired as a percentage of total physical memory + * (Remember that we are using basis points, hence the 10000.) 
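+ * For example (illustrative numbers only): with 8K pages and
+ * 4 Gbytes of memory physmem is 524288 pages, so a setting of
+ * max_pages_retired_bps = 10 (i.e. 0.1%) yields
+ *
+ * retired_max = (524288 * 10) / 10000 = 524 pages.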
+ */ + retired_max = (physmem * max_pages_retired_bps) / 10000; + + /* + * return 'TRUE' if we have already retired more + * than the legal limit + */ + return (retired_pgcnt >= retired_max); +} + +#define PAGE_RETIRE_SELOCK 0 +#define PAGE_RETIRE_NORECLAIM 1 +#define PAGE_RETIRE_LOCKED 2 +#define PAGE_RETIRE_COW 3 +#define PAGE_RETIRE_DIRTY 4 +#define PAGE_RETIRE_LPAGE 5 +#define PAGE_RETIRE_SUCCESS 6 +#define PAGE_RETIRE_LIMIT 7 +#define PAGE_RETIRE_NCODES 8 + +typedef struct page_retire_op { + int pr_count; + short pr_unlock; + short pr_retval; + char *pr_message; +} page_retire_op_t; + +page_retire_op_t page_retire_ops[PAGE_RETIRE_NCODES] = { + { 0, 0, -1, "cannot lock page" }, + { 0, 0, -1, "cannot reclaim cached page" }, + { 0, 1, -1, "page is locked" }, + { 0, 1, -1, "copy-on-write page" }, + { 0, 1, -1, "page is dirty" }, + { 0, 1, -1, "cannot demote large page" }, + { 0, 0, 0, "page successfully retired" }, + { 0, 0, -1, "excess pages retired already" }, +}; + +static int +page_retire_done(page_t *pp, int code) +{ + page_retire_op_t *prop = &page_retire_ops[code]; + + prop->pr_count++; + + if (prop->pr_unlock) + page_unlock(pp); + + if (page_retire_messages > 1) { + printf("page_retire(%p) pfn 0x%lx %s: %s\n", + (void *)pp, page_pptonum(pp), + prop->pr_retval == -1 ? "failed" : "succeeded", + prop->pr_message); + } + + return (prop->pr_retval); +} + +int +page_retire(page_t *pp, uchar_t flag) +{ + uint64_t pa = ptob((uint64_t)page_pptonum(pp)); + + ASSERT(flag == PAGE_IS_FAILING || flag == PAGE_IS_TOXIC); + + /* + * DR operations change the association between a page_t + * and the physical page it represents. Check if the + * page is still bad. + */ + if (!page_isfaulty(pp)) { + page_clrtoxic(pp); + return (page_retire_done(pp, PAGE_RETIRE_SUCCESS)); + } + + /* + * We set the flag here so that even if we fail due + * to exceeding the limit for retired pages, the + * page will still be checked and either cleared + * or retired in page_free(). + */ + page_settoxic(pp, flag); + + if (flag == PAGE_IS_TOXIC) { + if (page_retire_messages) { + cmn_err(CE_NOTE, "Scheduling clearing of error on" + " page 0x%08x.%08x", + (uint32_t)(pa >> 32), (uint32_t)pa); + } + + } else { /* PAGE_IS_FAILING */ + if (pages_retired_limit_exceeded()) { + /* + * Return as we have already exceeded the + * maximum number of pages allowed to be + * retired + */ + return (page_retire_done(pp, PAGE_RETIRE_LIMIT)); + } + + if (page_retire_messages) { + cmn_err(CE_NOTE, "Scheduling removal of " + "page 0x%08x.%08x", + (uint32_t)(pa >> 32), (uint32_t)pa); + } + } + + if (PAGE_LOCKED(pp) || !page_trylock(pp, SE_EXCL)) + return (page_retire_done(pp, PAGE_RETIRE_SELOCK)); + + /* + * If this is a large page we first try and demote it + * to PAGESIZE pages and then dispose of the toxic page. + * On failure we will let the page free/destroy + * code handle it later since this is a mapped page. + * Note that free large pages can always be demoted. + * + */ + if (pp->p_szc != 0) { + if (PP_ISFREE(pp)) + (void) page_demote_free_pages(pp); + else + (void) page_try_demote_pages(pp); + + if (pp->p_szc != 0) + return (page_retire_done(pp, PAGE_RETIRE_LPAGE)); + } + + if (PP_ISFREE(pp)) { + if (!page_reclaim(pp, NULL)) + return (page_retire_done(pp, PAGE_RETIRE_NORECLAIM)); + /*LINTED: constant in conditional context*/ + VN_DISPOSE(pp, pp->p_vnode ? 
B_INVAL : B_FREE, 0, kcred) + return (page_retire_done(pp, PAGE_RETIRE_SUCCESS)); + } + + if (pp->p_lckcnt != 0) + return (page_retire_done(pp, PAGE_RETIRE_LOCKED)); + + if (pp->p_cowcnt != 0) + return (page_retire_done(pp, PAGE_RETIRE_COW)); + + /* + * Unload all translations to this page. No new translations + * can be created while we hold the exclusive lock on the page. + */ + (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); + + if (hat_ismod(pp)) + return (page_retire_done(pp, PAGE_RETIRE_DIRTY)); + + /*LINTED: constant in conditional context*/ + VN_DISPOSE(pp, B_INVAL, 0, kcred); + + return (page_retire_done(pp, PAGE_RETIRE_SUCCESS)); +} + +/* + * Mark any existing pages for migration in the given range + */ +void +page_mark_migrate(struct seg *seg, caddr_t addr, size_t len, + struct anon_map *amp, ulong_t anon_index, vnode_t *vp, + u_offset_t vnoff, int rflag) +{ + struct anon *ap; + vnode_t *curvp; + lgrp_t *from; + pgcnt_t i; + pgcnt_t nlocked; + u_offset_t off; + pfn_t pfn; + size_t pgsz; + size_t segpgsz; + pgcnt_t pages; + uint_t pszc; + page_t **ppa; + pgcnt_t ppa_nentries; + page_t *pp; + caddr_t va; + ulong_t an_idx; + anon_sync_obj_t cookie; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + /* + * Don't do anything if don't need to do lgroup optimizations + * on this system + */ + if (!lgrp_optimizations()) + return; + + /* + * Align address and length to (potentially large) page boundary + */ + segpgsz = page_get_pagesize(seg->s_szc); + addr = (caddr_t)P2ALIGN((uintptr_t)addr, segpgsz); + if (rflag) + len = P2ROUNDUP(len, segpgsz); + + /* + * Allocate page array to accomodate largest page size + */ + pgsz = page_get_pagesize(page_num_pagesizes() - 1); + ppa_nentries = btop(pgsz); + ppa = kmem_zalloc(ppa_nentries * sizeof (page_t *), KM_SLEEP); + + /* + * Do one (large) page at a time + */ + va = addr; + while (va < addr + len) { + /* + * Lookup (root) page for vnode and offset corresponding to + * this virtual address + * Try anonmap first since there may be copy-on-write + * pages, but initialize vnode pointer and offset using + * vnode arguments just in case there isn't an amp. 
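+ * In rough pseudo-code the lookup below is:
+ *
+ * if (amp != NULL and it has an anon slot for this address)
+ * <curvp, off> = swap_xlate(anon slot)
+ * else
+ * <curvp, off> = <vp, vnoff + va - seg->s_base>
+ * pp = page_lookup(curvp, off, SE_SHARED)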
+ */
+ curvp = vp;
+ off = vnoff + va - seg->s_base;
+ if (amp) {
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
+ an_idx = anon_index + seg_page(seg, va);
+ anon_array_enter(amp, an_idx, &cookie);
+ ap = anon_get_ptr(amp->ahp, an_idx);
+ if (ap)
+ swap_xlate(ap, &curvp, &off);
+ anon_array_exit(&cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ }
+
+ pp = NULL;
+ if (curvp)
+ pp = page_lookup(curvp, off, SE_SHARED);
+
+ /*
+ * If there isn't a page at this virtual address,
+ * skip to next page
+ */
+ if (pp == NULL) {
+ va += PAGESIZE;
+ continue;
+ }
+
+ /*
+ * Figure out which lgroup this page is in for kstats
+ */
+ pfn = page_pptonum(pp);
+ from = lgrp_pfn_to_lgrp(pfn);
+
+ /*
+ * Get page size, and round up and skip to next page boundary
+ * if unaligned address
+ */
+ pszc = pp->p_szc;
+ pgsz = page_get_pagesize(pszc);
+ pages = btop(pgsz);
+ if (!IS_P2ALIGNED(va, pgsz) ||
+ !IS_P2ALIGNED(pfn, pages) ||
+ pgsz > segpgsz) {
+ pgsz = MIN(pgsz, segpgsz);
+ page_unlock(pp);
+ i = btop(P2END((uintptr_t)va, pgsz) -
+ (uintptr_t)va);
+ va = (caddr_t)P2END((uintptr_t)va, pgsz);
+ lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS, i);
+ continue;
+ }
+
+ /*
+ * Upgrade to exclusive lock on page
+ */
+ if (!page_tryupgrade(pp)) {
+ page_unlock(pp);
+ va += pgsz;
+ lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS,
+ btop(pgsz));
+ continue;
+ }
+
+ /*
+ * Remember pages locked exclusively and how many
+ */
+ ppa[0] = pp;
+ nlocked = 1;
+
+ /*
+ * Lock constituent pages if this is large page
+ */
+ if (pages > 1) {
+ /*
+ * Lock all constituents except root page, since it
+ * should be locked already.
+ */
+ for (i = 1; i < pages; i++) {
+ pp = page_next(pp);
+ if (!page_trylock(pp, SE_EXCL)) {
+ break;
+ }
+ if (PP_ISFREE(pp) ||
+ pp->p_szc != pszc) {
+ /*
+ * hat_page_demote() raced in with us.
+ */
+ ASSERT(!IS_SWAPFSVP(curvp));
+ page_unlock(pp);
+ break;
+ }
+ ppa[nlocked] = pp;
+ nlocked++;
+ }
+ }
+
+ /*
+ * If all constituent pages couldn't be locked,
+ * unlock pages locked so far and skip to next page.
+ */
+ if (nlocked != pages) {
+ for (i = 0; i < nlocked; i++)
+ page_unlock(ppa[i]);
+ va += pgsz;
+ lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS,
+ btop(pgsz));
+ continue;
+ }
+
+ /*
+ * hat_page_demote() can no longer happen
+ * since last cons page had the right p_szc after
+ * all cons pages were locked. all cons pages
+ * should now have the same p_szc. 
+ */ + + /* + * All constituent pages locked successfully, so mark + * large page for migration and unload the mappings of + * constituent pages, so a fault will occur on any part of the + * large page + */ + PP_SETMIGRATE(ppa[0]); + for (i = 0; i < nlocked; i++) { + pp = ppa[i]; + (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); + ASSERT(hat_page_getshare(pp) == 0); + page_unlock(pp); + } + lgrp_stat_add(from->lgrp_id, LGRP_PMM_PGS, nlocked); + + va += pgsz; + } + kmem_free(ppa, ppa_nentries * sizeof (page_t *)); +} + +/* + * Migrate any pages that have been marked for migration in the given range + */ +void +page_migrate( + struct seg *seg, + caddr_t addr, + page_t **ppa, + pgcnt_t npages) +{ + lgrp_t *from; + lgrp_t *to; + page_t *newpp; + page_t *pp; + pfn_t pfn; + size_t pgsz; + spgcnt_t page_cnt; + spgcnt_t i; + uint_t pszc; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); + + while (npages > 0) { + pp = *ppa; + pszc = pp->p_szc; + pgsz = page_get_pagesize(pszc); + page_cnt = btop(pgsz); + + /* + * Check to see whether this page is marked for migration + * + * Assume that root page of large page is marked for + * migration and none of the other constituent pages + * are marked. This really simplifies clearing the + * migrate bit by not having to clear it from each + * constituent page. + * + * note we don't want to relocate an entire large page if + * someone is only using one subpage. + */ + if (npages < page_cnt) + break; + + /* + * Is it marked for migration? + */ + if (!PP_ISMIGRATE(pp)) + goto next; + + /* + * Determine lgroups that page is being migrated between + */ + pfn = page_pptonum(pp); + if (!IS_P2ALIGNED(pfn, page_cnt)) { + break; + } + from = lgrp_pfn_to_lgrp(pfn); + to = lgrp_mem_choose(seg, addr, pgsz); + + /* + * Check to see whether we are trying to migrate page to lgroup + * where it is allocated already + */ + if (to == from) { + PP_CLRMIGRATE(pp); + goto next; + } + + /* + * Need to get exclusive lock's to migrate + */ + for (i = 0; i < page_cnt; i++) { + ASSERT(PAGE_LOCKED(ppa[i])); + if (page_pptonum(ppa[i]) != pfn + i || + ppa[i]->p_szc != pszc) { + break; + } + if (!page_tryupgrade(ppa[i])) { + lgrp_stat_add(from->lgrp_id, + LGRP_PM_FAIL_LOCK_PGS, + page_cnt); + break; + } + } + if (i != page_cnt) { + while (--i != -1) { + page_downgrade(ppa[i]); + } + goto next; + } + + (void) page_create_wait(page_cnt, PG_WAIT); + newpp = page_get_replacement_page(pp, to, PGR_SAMESZC); + if (newpp == NULL) { + page_create_putback(page_cnt); + for (i = 0; i < page_cnt; i++) { + page_downgrade(ppa[i]); + } + lgrp_stat_add(to->lgrp_id, LGRP_PM_FAIL_ALLOC_PGS, + page_cnt); + goto next; + } + ASSERT(newpp->p_szc == pszc); + /* + * Clear migrate bit and relocate page + */ + PP_CLRMIGRATE(pp); + if (page_relocate(&pp, &newpp, 0, 1, &page_cnt, to)) { + panic("page_migrate: page_relocate failed"); + } + ASSERT(page_cnt * PAGESIZE == pgsz); + + /* + * Keep stats for number of pages migrated from and to + * each lgroup + */ + lgrp_stat_add(from->lgrp_id, LGRP_PM_SRC_PGS, page_cnt); + lgrp_stat_add(to->lgrp_id, LGRP_PM_DEST_PGS, page_cnt); + /* + * update the page_t array we were passed in and + * unlink constituent pages of a large page. 
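+ * (Each pass of the loop below takes the current head of the newpp
+ * list, stores it in ppa[i], unlinks it with page_sub() and then
+ * downgrades its lock from SE_EXCL to SE_SHARED.)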
+ */ + for (i = 0; i < page_cnt; ++i, ++pp) { + ASSERT(PAGE_EXCL(newpp)); + ASSERT(newpp->p_szc == pszc); + ppa[i] = newpp; + pp = newpp; + page_sub(&newpp, pp); + page_downgrade(pp); + } + ASSERT(newpp == NULL); +next: + addr += pgsz; + ppa += page_cnt; + npages -= page_cnt; + } +} + +/* + * initialize the vnode for retired pages + */ +static void +page_retired_init(void) +{ + vn_setops(&retired_ppages, &retired_vnodeops); +} + +/* ARGSUSED */ +static void +retired_dispose(vnode_t *vp, page_t *pp, int flag, int dn, cred_t *cr) +{ + panic("retired_dispose invoked"); +} + +/* ARGSUSED */ +static void +retired_inactive(vnode_t *vp, cred_t *cr) +{} + +void +page_unretire_pages(void) +{ + page_t *pp; + kmutex_t *vphm; + vnode_t *vp; + page_t *rpages[UNRETIRE_PAGES]; + pgcnt_t i, npages, rmem; + uint64_t pa; + + rmem = 0; + + for (;;) { + /* + * We do this in 2 steps: + * + * 1. We walk the retired pages list and collect a list of + * pages that have the toxic field cleared. + * + * 2. We iterate through the page list and unretire each one. + * + * We have to do it in two steps on account of the mutexes that + * we need to acquire. + */ + + vp = &retired_ppages; + vphm = page_vnode_mutex(vp); + mutex_enter(vphm); + + if ((pp = vp->v_pages) == NULL) { + mutex_exit(vphm); + break; + } + + i = 0; + do { + ASSERT(pp != NULL); + ASSERT(pp->p_vnode == vp); + + /* + * DR operations change the association between a page_t + * and the physical page it represents. Check if the + * page is still bad. If not, unretire it. + */ + if (!page_isfaulty(pp)) + rpages[i++] = pp; + + pp = pp->p_vpnext; + } while ((pp != vp->v_pages) && (i < UNRETIRE_PAGES)); + + mutex_exit(vphm); + + npages = i; + for (i = 0; i < npages; i++) { + pp = rpages[i]; + pa = ptob((uint64_t)page_pptonum(pp)); + + /* + * Need to upgrade the shared lock to an exclusive + * lock in order to hash out the page. + * + * The page could have been retired but the page lock + * may not have been downgraded yet. If so, skip this + * page. page_free() will call this function after the + * lock is downgraded. + */ + + if (!PAGE_SHARED(pp) || !page_tryupgrade(pp)) + continue; + + /* + * Both page_free() and DR call this function. They + * can potentially call this function at the same + * time and race with each other. + */ + if (!page_isretired(pp) || page_isfaulty(pp)) { + page_downgrade(pp); + continue; + } + + cmn_err(CE_NOTE, + "unretiring retired page 0x%08x.%08x", + (uint32_t)(pa >> 32), (uint32_t)pa); + + /* + * When a page is removed from the retired pages vnode, + * its toxic field is also cleared. So, we do not have + * to do that seperately here. + */ + page_hashout(pp, (kmutex_t *)NULL); + + /* + * This is a good page. So, free it. + */ + pp->p_vnode = NULL; + page_free(pp, 1); + rmem++; + } + + /* + * If the rpages array was filled up, then there could be more + * retired pages that are not faulty. We need to iterate + * again and unretire them. Otherwise, we are done. + */ + if (npages < UNRETIRE_PAGES) + break; + } + + mutex_enter(&freemem_lock); + availrmem += rmem; + mutex_exit(&freemem_lock); +} + +ulong_t mem_waiters = 0; +ulong_t max_count = 20; +#define MAX_DELAY 0x1ff + +/* + * Check if enough memory is available to proceed. + * Depending on system configuration and how much memory is + * reserved for swap we need to check against two variables. + * e.g. on systems with little physical swap availrmem can be + * more reliable indicator of how much memory is available. 
+ * On systems with large phys swap freemem can be better indicator. + * If freemem drops below threshold level don't return an error + * immediately but wake up pageout to free memory and block. + * This is done number of times. If pageout is not able to free + * memory within certain time return an error. + * The same applies for availrmem but kmem_reap is used to + * free memory. + */ +int +page_mem_avail(pgcnt_t npages) +{ + ulong_t count; + +#if defined(__i386) + if (freemem > desfree + npages && + availrmem > swapfs_reserve + npages && + btop(vmem_size(heap_arena, VMEM_FREE)) > tune.t_minarmem + + npages) + return (1); +#else + if (freemem > desfree + npages && + availrmem > swapfs_reserve + npages) + return (1); +#endif + + count = max_count; + atomic_add_long(&mem_waiters, 1); + + while (freemem < desfree + npages && --count) { + cv_signal(&proc_pageout->p_cv); + if (delay_sig(hz + (mem_waiters & MAX_DELAY))) { + atomic_add_long(&mem_waiters, -1); + return (0); + } + } + if (count == 0) { + atomic_add_long(&mem_waiters, -1); + return (0); + } + + count = max_count; + while (availrmem < swapfs_reserve + npages && --count) { + kmem_reap(); + if (delay_sig(hz + (mem_waiters & MAX_DELAY))) { + atomic_add_long(&mem_waiters, -1); + return (0); + } + } + atomic_add_long(&mem_waiters, -1); + if (count == 0) + return (0); + +#if defined(__i386) + if (btop(vmem_size(heap_arena, VMEM_FREE)) < + tune.t_minarmem + npages) + return (0); +#endif + return (1); +} + + +/* + * Search the memory segments to locate the desired page. Within a + * segment, pages increase linearly with one page structure per + * physical page frame (size PAGESIZE). The search begins + * with the segment that was accessed last, to take advantage of locality. + * If the hint misses, we start from the beginning of the sorted memseg list + */ + + +/* + * Some data structures for pfn to pp lookup. + */ +ulong_t mhash_per_slot; +struct memseg *memseg_hash[N_MEM_SLOTS]; + +page_t * +page_numtopp_nolock(pfn_t pfnum) +{ + static struct memseg *last_memseg_by_pfnum = NULL; + struct memseg *seg; + page_t *pp; + + /* + * XXX - Since page_numtopp_nolock is called in many places where + * the search fails more than it succeeds. It maybe worthwhile + * to put a check for pf_is_memory or a pfnum <= max_pfn (set at + * boot time). 
+ * + * if (!pf_is_memory(pfnum) || (pfnum > max_pfn)) + * return (NULL); + */ + + MEMSEG_STAT_INCR(nsearch); + + /* Try last winner first */ + if (((seg = last_memseg_by_pfnum) != NULL) && + (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) { + MEMSEG_STAT_INCR(nlastwon); + pp = seg->pages + (pfnum - seg->pages_base); + if (pp->p_pagenum == pfnum) + return ((page_t *)pp); + } + + /* Else Try hash */ + if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) && + (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) { + MEMSEG_STAT_INCR(nhashwon); + last_memseg_by_pfnum = seg; + pp = seg->pages + (pfnum - seg->pages_base); + if (pp->p_pagenum == pfnum) + return ((page_t *)pp); + } + + /* Else Brute force */ + for (seg = memsegs; seg != NULL; seg = seg->next) { + if (pfnum >= seg->pages_base && pfnum < seg->pages_end) { + last_memseg_by_pfnum = seg; + pp = seg->pages + (pfnum - seg->pages_base); + return ((page_t *)pp); + } + } + last_memseg_by_pfnum = NULL; + MEMSEG_STAT_INCR(nnotfound); + return ((page_t *)NULL); + +} + +struct memseg * +page_numtomemseg_nolock(pfn_t pfnum) +{ + struct memseg *seg; + page_t *pp; + + /* Try hash */ + if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) && + (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) { + pp = seg->pages + (pfnum - seg->pages_base); + if (pp->p_pagenum == pfnum) + return (seg); + } + + /* Else Brute force */ + for (seg = memsegs; seg != NULL; seg = seg->next) { + if (pfnum >= seg->pages_base && pfnum < seg->pages_end) { + return (seg); + } + } + return ((struct memseg *)NULL); +} + +/* + * Given a page and a count return the page struct that is + * n structs away from the current one in the global page + * list. + * + * This function wraps to the first page upon + * reaching the end of the memseg list. + */ +page_t * +page_nextn(page_t *pp, ulong_t n) +{ + static struct memseg *last_page_next_memseg = NULL; + struct memseg *seg; + page_t *ppn; + + if (((seg = last_page_next_memseg) == NULL) || + (seg->pages_base == seg->pages_end) || + !(pp >= seg->pages && pp < seg->epages)) { + + for (seg = memsegs; seg; seg = seg->next) { + if (pp >= seg->pages && pp < seg->epages) + break; + } + + if (seg == NULL) { + /* Memory delete got in, return something valid. */ + /* TODO: fix me. */ + seg = memsegs; + pp = seg->pages; + } + } + + /* check for wraparound - possible if n is large */ + while ((ppn = (pp + n)) >= seg->epages || ppn < pp) { + n -= seg->epages - pp; + seg = seg->next; + if (seg == NULL) + seg = memsegs; + pp = seg->pages; + } + last_page_next_memseg = seg; + return (ppn); +} + +/* + * Initialize for a loop using page_next_scan_large(). + */ +page_t * +page_next_scan_init(void **cookie) +{ + ASSERT(cookie != NULL); + *cookie = (void *)memsegs; + return ((page_t *)memsegs->pages); +} + +/* + * Return the next page in a scan of page_t's, assuming we want + * to skip over sub-pages within larger page sizes. + * + * The cookie is used to keep track of the current memseg. + */ +page_t * +page_next_scan_large( + page_t *pp, + ulong_t *n, + void **cookie) +{ + struct memseg *seg = (struct memseg *)*cookie; + page_t *new_pp; + ulong_t cnt; + pfn_t pfn; + + + /* + * get the count of page_t's to skip based on the page size + */ + ASSERT(pp != NULL); + if (pp->p_szc == 0) { + cnt = 1; + } else { + pfn = page_pptonum(pp); + cnt = page_get_pagecnt(pp->p_szc); + cnt -= pfn & (cnt - 1); + } + *n += cnt; + new_pp = pp + cnt; + + /* + * Catch if we went past the end of the current memory segment. 
If so, + * just move to the next segment with pages. + */ + if (new_pp >= seg->epages) { + do { + seg = seg->next; + if (seg == NULL) + seg = memsegs; + } while (seg->pages == seg->epages); + new_pp = seg->pages; + *cookie = (void *)seg; + } + + return (new_pp); +} + + +/* + * Returns next page in list. Note: this function wraps + * to the first page in the list upon reaching the end + * of the list. Callers should be aware of this fact. + */ + +/* We should change this be a #define */ + +page_t * +page_next(page_t *pp) +{ + return (page_nextn(pp, 1)); +} + +/* + * Special for routines processing an array of page_t. + */ +page_t * +page_nextn_raw(page_t *pp, ulong_t n) +{ + return (pp+n); +} + +page_t * +page_first() +{ + return ((page_t *)memsegs->pages); +} + + +/* + * This routine is called at boot with the initial memory configuration + * and when memory is added or removed. + */ +void +build_pfn_hash() +{ + pfn_t cur; + pgcnt_t index; + struct memseg *pseg; + int i; + + /* + * Clear memseg_hash array. + * Since memory add/delete is designed to operate concurrently + * with normal operation, the hash rebuild must be able to run + * concurrently with page_numtopp_nolock(). To support this + * functionality, assignments to memseg_hash array members must + * be done atomically. + * + * NOTE: bzero() does not currently guarantee this for kernel + * threads, and cannot be used here. + */ + for (i = 0; i < N_MEM_SLOTS; i++) + memseg_hash[i] = NULL; + + hat_kpm_mseghash_clear(N_MEM_SLOTS); + + /* + * Physmax is the last valid pfn. + */ + mhash_per_slot = (physmax + 1) >> MEM_HASH_SHIFT; + for (pseg = memsegs; pseg != NULL; pseg = pseg->next) { + index = MEMSEG_PFN_HASH(pseg->pages_base); + cur = pseg->pages_base; + do { + if (index >= N_MEM_SLOTS) + index = MEMSEG_PFN_HASH(cur); + + if (memseg_hash[index] == NULL || + memseg_hash[index]->pages_base > pseg->pages_base) { + memseg_hash[index] = pseg; + hat_kpm_mseghash_update(index, pseg); + } + cur += mhash_per_slot; + index++; + } while (cur < pseg->pages_end); + } +} + +/* + * Return the pagenum for the pp + */ +pfn_t +page_pptonum(page_t *pp) +{ + return (pp->p_pagenum); +} + +/* + * interface to the referenced and modified etc bits + * in the PSM part of the page struct + * when no locking is desired. + */ +void +page_set_props(page_t *pp, uint_t flags) +{ + ASSERT((flags & ~(P_MOD | P_REF | P_RO)) == 0); + pp->p_nrm |= (uchar_t)flags; +} + +void +page_clr_all_props(page_t *pp) +{ + pp->p_nrm = 0; +} + +/* + * The following functions is called from free_vp_pages() + * for an inexact estimate of a newly free'd page... + */ +ulong_t +page_share_cnt(page_t *pp) +{ + return (hat_page_getshare(pp)); +} + +/* + * The following functions are used in handling memory + * errors. 
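+ * p_toxic is a byte-sized flag field that is updated without the
+ * page lock held; page_settoxic() and page_clrtoxic_flag() below
+ * therefore retry a cas8() until a re-read of p_toxic shows that the
+ * requested bit change has stuck.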
+ */ + +int +page_istoxic(page_t *pp) +{ + return ((pp->p_toxic & PAGE_IS_TOXIC) == PAGE_IS_TOXIC); +} + +int +page_isfailing(page_t *pp) +{ + return ((pp->p_toxic & PAGE_IS_FAILING) == PAGE_IS_FAILING); +} + +int +page_isretired(page_t *pp) +{ + return ((pp->p_toxic & PAGE_IS_RETIRED) == PAGE_IS_RETIRED); +} + +int +page_deteriorating(page_t *pp) +{ + return ((pp->p_toxic & (PAGE_IS_TOXIC | PAGE_IS_FAILING)) != 0); +} + +void +page_settoxic(page_t *pp, uchar_t flag) +{ + uchar_t new_flag = 0; + while ((new_flag & flag) != flag) { + uchar_t old_flag = pp->p_toxic; + new_flag = old_flag | flag; + (void) cas8(&pp->p_toxic, old_flag, new_flag); + new_flag = ((volatile page_t *)pp)->p_toxic; + } +} + +void +page_clrtoxic(page_t *pp) +{ + /* + * We don't need to worry about atomicity on the + * p_toxic flag here as this is only called from + * page_free() while holding an exclusive lock on + * the page + */ + pp->p_toxic = PAGE_IS_OK; +} + +void +page_clrtoxic_flag(page_t *pp, uchar_t flag) +{ + uchar_t new_flag = ((volatile page_t *)pp)->p_toxic; + while ((new_flag & flag) == flag) { + uchar_t old_flag = new_flag; + new_flag = old_flag & ~flag; + (void) cas8(&pp->p_toxic, old_flag, new_flag); + new_flag = ((volatile page_t *)pp)->p_toxic; + } +} + +int +page_isfaulty(page_t *pp) +{ + return ((pp->p_toxic & PAGE_IS_FAULTY) == PAGE_IS_FAULTY); +} + +/* + * The following four functions are called from /proc code + * for the /proc/<pid>/xmap interface. + */ +int +page_isshared(page_t *pp) +{ + return (hat_page_getshare(pp) > 1); +} + +int +page_isfree(page_t *pp) +{ + return (PP_ISFREE(pp)); +} + +int +page_isref(page_t *pp) +{ + return (hat_page_getattr(pp, P_REF)); +} + +int +page_ismod(page_t *pp) +{ + return (hat_page_getattr(pp, P_MOD)); +} diff --git a/usr/src/uts/common/vm/vm_pagelist.c b/usr/src/uts/common/vm/vm_pagelist.c new file mode 100644 index 0000000000..3d1d773321 --- /dev/null +++ b/usr/src/uts/common/vm/vm_pagelist.c @@ -0,0 +1,3726 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * This file contains common functions to access and manage the page lists. + * Many of these routines originated from platform dependent modules + * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and modified to function in + * a platform independent manner. + * + * vm/vm_dep.h provides for platform specific support. 
+ */ + +#include <sys/types.h> +#include <sys/debug.h> +#include <sys/cmn_err.h> +#include <sys/systm.h> +#include <sys/atomic.h> +#include <sys/sysmacros.h> +#include <vm/as.h> +#include <vm/page.h> +#include <vm/seg_kmem.h> +#include <vm/seg_vn.h> +#include <sys/memnode.h> +#include <vm/vm_dep.h> +#include <sys/lgrp.h> +#include <sys/mem_config.h> +#include <sys/callb.h> +#include <sys/mem_cage.h> +#include <sys/sdt.h> + +extern uint_t vac_colors; + +/* + * number of page colors equivalent to reqested color in page_get routines. + * If set, keeps large pages intact longer and keeps MPO allocation + * from the local mnode in favor of acquiring the 'correct' page color from + * a demoted large page or from a remote mnode. + */ +int colorequiv; + +/* + * if set, specifies the percentage of large pages that are free from within + * a large page region before attempting to lock those pages for + * page_get_contig_pages processing. + * + * Should be turned on when kpr is available when page_trylock_contig_pages + * can be more selective. + */ + +int ptcpthreshold; + +/* + * Limit page get contig page search based on failure cnts in pgcpfailcnt[]. + * use slot 0 (base page size unused) to enable or disable limiting search. + * Enabled by default. + */ +int pgcpfailcnt[MMU_PAGE_SIZES]; +int pgcplimitsearch = 1; + +#ifdef VM_STATS +struct vmm_vmstats_str vmm_vmstats; + +#endif /* VM_STATS */ + +#if defined(__sparc) +#define LPGCREATE 0 +#else +/* enable page_get_contig_pages */ +#define LPGCREATE 1 +#endif + +int pg_contig_disable; +int pg_lpgcreate_nocage = LPGCREATE; + +/* + * page_freelist_fill pfn flag to signify no hi pfn requirement. + */ +#define PFNNULL 0 + +/* Flags involved in promotion and demotion routines */ +#define PC_FREE 0x1 /* put page on freelist */ +#define PC_ALLOC 0x2 /* return page for allocation */ + +/* + * Flag for page_demote to be used with PC_FREE to denote that we don't care + * what the color is as the color parameter to the function is ignored. + */ +#define PC_NO_COLOR (-1) + +/* + * page counters candidates info + * See page_ctrs_cands comment below for more details. + * fields are as follows: + * pcc_pages_free: # pages which freelist coalesce can create + * pcc_color_free_len: number of elements in pcc_color_free array + * pcc_color_free: pointer to page free counts per color + */ +typedef struct pcc_info { + pgcnt_t pcc_pages_free; + int pcc_color_free_len; + pgcnt_t *pcc_color_free; +} pcc_info_t; + +/* + * On big machines it can take a long time to check page_counters + * arrays. page_ctrs_cands is a summary array whose elements are a dynamically + * updated sum of all elements of the corresponding page_counters arrays. + * page_freelist_coalesce() searches page_counters only if an appropriate + * element of page_ctrs_cands array is greater than 0. + * + * An extra dimension is used for page_ctrs_cands to spread the elements + * over a few e$ cache lines to avoid serialization during the array + * updates. 
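+ *
+ * So, to ask "how many size 'r' pages could currently be built in
+ * mnode 'm'", a reader sums the per-lock-slot counts; this is what
+ * the PGCTRS_CANDS_GETVALUE() macro below amounts to:
+ *
+ * val = 0;
+ * for (i = 0; i < NPC_MUTEX; i++)
+ * val += page_ctrs_cands[i][r][m].pcc_pages_free;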
+ */ +#pragma align 64(page_ctrs_cands) + +static pcc_info_t *page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES]; + +/* + * Return in val the total number of free pages which can be created + * for the given mnode (m) and region size (r) + */ +#define PGCTRS_CANDS_GETVALUE(m, r, val) { \ + int i; \ + val = 0; \ + for (i = 0; i < NPC_MUTEX; i++) { \ + val += page_ctrs_cands[i][(r)][(m)].pcc_pages_free; \ + } \ +} + +/* + * Return in val the total number of free pages which can be created + * for the given mnode (m), region size (r), and color (c) + */ +#define PGCTRS_CANDS_GETVALUECOLOR(m, r, c, val) { \ + int i; \ + val = 0; \ + ASSERT((c) < page_ctrs_cands[0][(r)][(m)].pcc_color_free_len); \ + for (i = 0; i < NPC_MUTEX; i++) { \ + val += page_ctrs_cands[i][(r)][(m)].pcc_color_free[(c)]; \ + } \ +} + +/* + * We can only allow a single thread to update a counter within the physical + * range of the largest supported page size. That is the finest granularity + * possible since the counter values are dependent on each other + * as you move accross region sizes. PP_CTR_LOCK_INDX is used to determine the + * ctr_mutex lock index for a particular physical range. + */ +static kmutex_t *ctr_mutex[NPC_MUTEX]; + +#define PP_CTR_LOCK_INDX(pp) \ + (((pp)->p_pagenum >> \ + (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1)) + +/* + * Local functions prototypes. + */ + +void page_ctr_add(page_t *, int); +void page_ctr_add_internal(int, page_t *, int); +void page_ctr_sub(page_t *, int); +uint_t page_convert_color(uchar_t, uchar_t, uint_t); +void page_freelist_lock(int); +void page_freelist_unlock(int); +page_t *page_promote(int, pfn_t, uchar_t, int); +page_t *page_demote(int, pfn_t, uchar_t, uchar_t, int, int); +page_t *page_freelist_fill(uchar_t, int, int, int, pfn_t); +page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int); +static int page_trylock_cons(page_t *pp, se_t se); + +#define PNUM_SIZE(szc) \ + (hw_page_array[(szc)].hp_size >> hw_page_array[0].hp_shift) +#define PNUM_SHIFT(szc) \ + (hw_page_array[(szc)].hp_shift - hw_page_array[0].hp_shift) + +/* + * The page_counters array below is used to keep track of free contiguous + * physical memory. A hw_page_map_t will be allocated per mnode per szc. + * This contains an array of counters, the size of the array, a shift value + * used to convert a pagenum into a counter array index or vice versa, as + * well as a cache of the last successful index to be promoted to a larger + * page size. As an optimization, we keep track of the last successful index + * to be promoted per page color for the given size region, and this is + * allocated dynamically based upon the number of colors for a given + * region size. + * + * Conceptually, the page counters are represented as: + * + * page_counters[region_size][mnode] + * + * region_size: size code of a candidate larger page made up + * of contiguous free smaller pages. + * + * page_counters[region_size][mnode].hpm_counters[index]: + * represents how many (region_size - 1) pages either + * exist or can be created within the given index range. + * + * Let's look at a sparc example: + * If we want to create a free 512k page, we look at region_size 2 + * for the mnode we want. We calculate the index and look at a specific + * hpm_counters location. If we see 8 (FULL_REGION_CNT on sparc) at + * this location, it means that 8 64k pages either exist or can be created + * from 8K pages in order to make a single free 512k page at the given + * index. 
Note that when a region is full, it will contribute to the + * counts in the region above it. Thus we will not know what page + * size the free pages will be which can be promoted to this new free + * page unless we look at all regions below the current region. + */ + +/* + * Note: hpmctr_t is defined in platform vm_dep.h + * hw_page_map_t contains all the information needed for the page_counters + * logic. The fields are as follows: + * + * hpm_counters: dynamically allocated array to hold counter data + * hpm_entries: entries in hpm_counters + * hpm_shift: shift for pnum/array index conv + * hpm_base: PFN mapped to counter index 0 + * hpm_color_current_len: # of elements in hpm_color_current "array" below + * hpm_color_current: last index in counter array for this color at + * which we successfully created a large page + */ +typedef struct hw_page_map { + hpmctr_t *hpm_counters; + size_t hpm_entries; + int hpm_shift; + pfn_t hpm_base; + size_t hpm_color_current_len; + size_t *hpm_color_current; +} hw_page_map_t; + +/* + * Element zero is not used, but is allocated for convenience. + */ +static hw_page_map_t *page_counters[MMU_PAGE_SIZES]; + +/* + * The following macros are convenient ways to get access to the individual + * elements of the page_counters arrays. They can be used on both + * the left side and right side of equations. + */ +#define PAGE_COUNTERS(mnode, rg_szc, idx) \ + (page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)]) + +#define PAGE_COUNTERS_COUNTERS(mnode, rg_szc) \ + (page_counters[(rg_szc)][(mnode)].hpm_counters) + +#define PAGE_COUNTERS_SHIFT(mnode, rg_szc) \ + (page_counters[(rg_szc)][(mnode)].hpm_shift) + +#define PAGE_COUNTERS_ENTRIES(mnode, rg_szc) \ + (page_counters[(rg_szc)][(mnode)].hpm_entries) + +#define PAGE_COUNTERS_BASE(mnode, rg_szc) \ + (page_counters[(rg_szc)][(mnode)].hpm_base) + +#define PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, rg_szc) \ + (page_counters[(rg_szc)][(mnode)].hpm_color_current_len) + +#define PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc) \ + (page_counters[(rg_szc)][(mnode)].hpm_color_current) + +#define PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color) \ + (page_counters[(rg_szc)][(mnode)].hpm_color_current[(color)]) + +#define PNUM_TO_IDX(mnode, rg_szc, pnum) \ + (((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >> \ + PAGE_COUNTERS_SHIFT((mnode), (rg_szc))) + +#define IDX_TO_PNUM(mnode, rg_szc, index) \ + (PAGE_COUNTERS_BASE((mnode), (rg_szc)) + \ + ((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))) + +/* + * Protects the hpm_counters and hpm_color_current memory from changing while + * looking at page counters information. + * Grab the write lock to modify what these fields point at. + * Grab the read lock to prevent any pointers from changing. + * The write lock can not be held during memory allocation due to a possible + * recursion deadlock with trying to grab the read lock while the + * write lock is already held. + */ +krwlock_t page_ctrs_rwlock[MAX_MEM_NODES]; + +/* + * page size to page size code + */ +int +page_szc(size_t pagesize) +{ + int i = 0; + + while (hw_page_array[i].hp_size) { + if (pagesize == hw_page_array[i].hp_size) + return (i); + i++; + } + return (-1); +} + +/* + * page size to page size code for user supported page sizes + */ +int +page_user_szc(size_t pagesize) +{ + int szc = page_szc(pagesize); + if (szc != -1) + return (SZC_2_USERSZC(szc)); + return (-1); +} + +/* + * Return how many page sizes are available for the user to use. 
This is + * what the hardware supports and not based upon how the OS implements the + * support of different page sizes. + */ +uint_t +page_num_user_pagesizes(void) +{ + return (mmu_exported_page_sizes); +} + +uint_t +page_num_pagesizes(void) +{ + return (mmu_page_sizes); +} + +/* + * returns the count of the number of base pagesize pages associated with szc + */ +pgcnt_t +page_get_pagecnt(uint_t szc) +{ + if (szc >= mmu_page_sizes) + panic("page_get_pagecnt: out of range %d", szc); + return (hw_page_array[szc].hp_pgcnt); +} + +size_t +page_get_pagesize(uint_t szc) +{ + if (szc >= mmu_page_sizes) + panic("page_get_pagesize: out of range %d", szc); + return (hw_page_array[szc].hp_size); +} + +/* + * Return the size of a page based upon the index passed in. An index of + * zero refers to the smallest page size in the system, and as index increases + * it refers to the next larger supported page size in the system. + * Note that szc and userszc may not be the same due to unsupported szc's on + * some systems. + */ +size_t +page_get_user_pagesize(uint_t userszc) +{ + uint_t szc = USERSZC_2_SZC(userszc); + + if (szc >= mmu_page_sizes) + panic("page_get_user_pagesize: out of range %d", szc); + return (hw_page_array[szc].hp_size); +} + +uint_t +page_get_shift(uint_t szc) +{ + if (szc >= mmu_page_sizes) + panic("page_get_shift: out of range %d", szc); + return (hw_page_array[szc].hp_shift); +} + +uint_t +page_get_pagecolors(uint_t szc) +{ + ASSERT(page_colors != 0); + return (MAX(page_colors >> PAGE_BSZS_SHIFT(szc), 1)); +} + +/* + * Called by startup(). + * Size up the per page size free list counters based on physmax + * of each node and max_mem_nodes. + */ +size_t +page_ctrs_sz(void) +{ + int r; /* region size */ + int mnode; + uint_t ctrs_sz = 0; + int i; + pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; + + /* + * We need to determine how many page colors there are for each + * page size in order to allocate memory for any color specific + * arrays. + */ + colors_per_szc[0] = page_colors; + for (i = 1; i < mmu_page_sizes; i++) { + colors_per_szc[i] = + page_convert_color(0, i, page_colors - 1) + 1; + } + + for (mnode = 0; mnode < max_mem_nodes; mnode++) { + + pgcnt_t r_pgcnt; + pfn_t r_base; + pgcnt_t r_align; + + if (mem_node_config[mnode].exists == 0) + continue; + + /* + * determine size needed for page counter arrays with + * base aligned to large page size. + */ + for (r = 1; r < mmu_page_sizes; r++) { + /* add in space for hpm_counters */ + r_align = page_get_pagecnt(r); + r_base = mem_node_config[mnode].physbase; + r_base &= ~(r_align - 1); + r_pgcnt = howmany(mem_node_config[mnode].physmax - + r_base, r_align); + /* + * Round up to always allocate on pointer sized + * boundaries. + */ + ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)), + sizeof (hpmctr_t *)); + + /* add in space for hpm_color_current */ + ctrs_sz += (colors_per_szc[r] * + sizeof (size_t)); + } + } + + for (r = 1; r < mmu_page_sizes; r++) { + ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t)); + + /* add in space for page_ctrs_cands */ + ctrs_sz += NPC_MUTEX * max_mem_nodes * (sizeof (pcc_info_t)); + ctrs_sz += NPC_MUTEX * max_mem_nodes * colors_per_szc[r] * + sizeof (pgcnt_t); + } + + /* ctr_mutex */ + ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t)); + + /* size for page list counts */ + PLCNT_SZ(ctrs_sz); + + /* + * add some slop for roundups. page_ctrs_alloc will roundup the start + * address of the counters to ecache_alignsize boundary for every + * memory node. 
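+ *
+ * The size computed here is meant to be paired with page_ctrs_alloc();
+ * a startup caller does, roughly (the bootstrap allocation itself is
+ * only sketched here, not an actual interface):
+ *
+ *	ctrs_sz = page_ctrs_sz();
+ *	alloc_base = <carve ctrs_sz bytes out of boot memory>;
+ *	alloc_base = page_ctrs_alloc(alloc_base);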
+ */ + return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN); +} + +caddr_t +page_ctrs_alloc(caddr_t alloc_base) +{ + int mnode; + int r; /* region size */ + int i; + pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; + + /* + * We need to determine how many page colors there are for each + * page size in order to allocate memory for any color specific + * arrays. + */ + colors_per_szc[0] = page_colors; + for (i = 1; i < mmu_page_sizes; i++) { + colors_per_szc[i] = + page_convert_color(0, i, page_colors - 1) + 1; + } + + for (r = 1; r < mmu_page_sizes; r++) { + page_counters[r] = (hw_page_map_t *)alloc_base; + alloc_base += (max_mem_nodes * sizeof (hw_page_map_t)); + } + + /* page_ctrs_cands */ + for (r = 1; r < mmu_page_sizes; r++) { + for (i = 0; i < NPC_MUTEX; i++) { + page_ctrs_cands[i][r] = (pcc_info_t *)alloc_base; + alloc_base += max_mem_nodes * (sizeof (pcc_info_t)); + + } + } + + /* page_ctrs_cands pcc_color_free array */ + for (r = 1; r < mmu_page_sizes; r++) { + for (i = 0; i < NPC_MUTEX; i++) { + for (mnode = 0; mnode < max_mem_nodes; mnode++) { + page_ctrs_cands[i][r][mnode].pcc_color_free_len + = colors_per_szc[r]; + page_ctrs_cands[i][r][mnode].pcc_color_free = + (pgcnt_t *)alloc_base; + alloc_base += colors_per_szc[r] * + sizeof (pgcnt_t); + } + } + } + + /* ctr_mutex */ + for (i = 0; i < NPC_MUTEX; i++) { + ctr_mutex[i] = (kmutex_t *)alloc_base; + alloc_base += (max_mem_nodes * sizeof (kmutex_t)); + } + + /* initialize page list counts */ + PLCNT_INIT(alloc_base); + + for (mnode = 0; mnode < max_mem_nodes; mnode++) { + + pgcnt_t r_pgcnt; + pfn_t r_base; + pgcnt_t r_align; + int r_shift; + + if (mem_node_config[mnode].exists == 0) + continue; + + for (r = 1; r < mmu_page_sizes; r++) { + /* + * the page_counters base has to be aligned to the + * page count of page size code r otherwise the counts + * will cross large page boundaries. + */ + r_align = page_get_pagecnt(r); + r_base = mem_node_config[mnode].physbase; + /* base needs to be aligned - lower to aligned value */ + r_base &= ~(r_align - 1); + r_pgcnt = howmany(mem_node_config[mnode].physmax - + r_base, r_align); + r_shift = PAGE_BSZS_SHIFT(r); + + PAGE_COUNTERS_SHIFT(mnode, r) = r_shift; + PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt; + PAGE_COUNTERS_BASE(mnode, r) = r_base; + PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, r) = + colors_per_szc[r]; + PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r) = + (size_t *)alloc_base; + alloc_base += (sizeof (size_t) * colors_per_szc[r]); + for (i = 0; i < colors_per_szc[r]; i++) { + PAGE_COUNTERS_CURRENT_COLOR(mnode, r, i) = i; + } + PAGE_COUNTERS_COUNTERS(mnode, r) = + (hpmctr_t *)alloc_base; + /* + * Round up to make alloc_base always be aligned on + * a pointer boundary. + */ + alloc_base += P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt), + sizeof (hpmctr_t *)); + + /* + * Verify that PNUM_TO_IDX and IDX_TO_PNUM + * satisfy the identity requirement. + * We should be able to go from one to the other + * and get consistent values. + */ + ASSERT(PNUM_TO_IDX(mnode, r, + (IDX_TO_PNUM(mnode, r, 0))) == 0); + ASSERT(IDX_TO_PNUM(mnode, r, + (PNUM_TO_IDX(mnode, r, r_base))) == r_base); + } + /* + * Roundup the start address of the page_counters to + * cache aligned boundary for every memory node. + * page_ctrs_sz() has added some slop for these roundups. + */ + alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base, + L2CACHE_ALIGN); + } + + /* Initialize other page counter specific data structures. 
*/ + for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) { + rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL); + } + + return (alloc_base); +} + +/* + * Functions to adjust region counters for each size free list. + * Caller is responsible to acquire the ctr_mutex lock if necessary and + * thus can be called during startup without locks. + */ +/* ARGSUSED */ +void +page_ctr_add_internal(int mnode, page_t *pp, int flags) +{ + ssize_t r; /* region size */ + ssize_t idx; + pfn_t pfnum; + int lckidx; + + ASSERT(pp->p_szc < mmu_page_sizes); + + PLCNT_INCR(pp, mnode, pp->p_szc, flags); + + /* no counter update needed for largest page size */ + if (pp->p_szc >= mmu_page_sizes - 1) { + return; + } + + r = pp->p_szc + 1; + pfnum = pp->p_pagenum; + lckidx = PP_CTR_LOCK_INDX(pp); + + /* + * Increment the count of free pages for the current + * region. Continue looping up in region size incrementing + * count if the preceeding region is full. + */ + while (r < mmu_page_sizes) { + idx = PNUM_TO_IDX(mnode, r, pfnum); + + ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r)); + ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r)); + + if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) + break; + + page_ctrs_cands[lckidx][r][mnode].pcc_pages_free++; + page_ctrs_cands[lckidx][r][mnode]. + pcc_color_free[PP_2_BIN_SZC(pp, r)]++; + r++; + } +} + +void +page_ctr_add(page_t *pp, int flags) +{ + int lckidx = PP_CTR_LOCK_INDX(pp); + int mnode = PP_2_MEM_NODE(pp); + kmutex_t *lock = &ctr_mutex[lckidx][mnode]; + + mutex_enter(lock); + page_ctr_add_internal(mnode, pp, flags); + mutex_exit(lock); +} + +void +page_ctr_sub(page_t *pp, int flags) +{ + int lckidx; + int mnode = PP_2_MEM_NODE(pp); + kmutex_t *lock; + ssize_t r; /* region size */ + ssize_t idx; + pfn_t pfnum; + + ASSERT(pp->p_szc < mmu_page_sizes); + + PLCNT_DECR(pp, mnode, pp->p_szc, flags); + + /* no counter update needed for largest page size */ + if (pp->p_szc >= mmu_page_sizes - 1) { + return; + } + + r = pp->p_szc + 1; + pfnum = pp->p_pagenum; + lckidx = PP_CTR_LOCK_INDX(pp); + lock = &ctr_mutex[lckidx][mnode]; + + /* + * Decrement the count of free pages for the current + * region. Continue looping up in region size decrementing + * count if the preceeding region was full. + */ + mutex_enter(lock); + while (r < mmu_page_sizes) { + idx = PNUM_TO_IDX(mnode, r, pfnum); + + ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r)); + ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0); + + if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) { + break; + } + ASSERT(page_ctrs_cands[lckidx][r][mnode].pcc_pages_free != 0); + ASSERT(page_ctrs_cands[lckidx][r][mnode]. + pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0); + + page_ctrs_cands[lckidx][r][mnode].pcc_pages_free--; + page_ctrs_cands[lckidx][r][mnode]. + pcc_color_free[PP_2_BIN_SZC(pp, r)]--; + r++; + } + mutex_exit(lock); +} + +/* + * Adjust page counters following a memory attach, since typically the + * size of the array needs to change, and the PFN to counter index + * mapping needs to change. 
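+ *
+ * Returns 0 on success and ENOMEM if the replacement arrays cannot be
+ * allocated, in which case the existing counters are left untouched.
+ * A caller in the memory attach path would typically treat a nonzero
+ * return as grounds to back out the attach, e.g. (illustrative caller
+ * only):
+ *
+ *	if (page_ctrs_adjust(mnode) != 0)
+ *		fail the attach;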
+ */ +uint_t +page_ctrs_adjust(int mnode) +{ + pgcnt_t npgs; + int r; /* region size */ + int i; + size_t pcsz, old_csz; + hpmctr_t *new_ctr, *old_ctr; + pfn_t oldbase, newbase; + size_t old_npgs; + hpmctr_t *ctr_cache[MMU_PAGE_SIZES]; + size_t size_cache[MMU_PAGE_SIZES]; + size_t *color_cache[MMU_PAGE_SIZES]; + size_t *old_color_array; + pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; + + newbase = mem_node_config[mnode].physbase & ~PC_BASE_ALIGN_MASK; + npgs = roundup(mem_node_config[mnode].physmax, + PC_BASE_ALIGN) - newbase; + + /* + * We need to determine how many page colors there are for each + * page size in order to allocate memory for any color specific + * arrays. + */ + colors_per_szc[0] = page_colors; + for (r = 1; r < mmu_page_sizes; r++) { + colors_per_szc[r] = + page_convert_color(0, r, page_colors - 1) + 1; + } + + /* + * Preallocate all of the new hpm_counters arrays as we can't + * hold the page_ctrs_rwlock as a writer and allocate memory. + * If we can't allocate all of the arrays, undo our work so far + * and return failure. + */ + for (r = 1; r < mmu_page_sizes; r++) { + pcsz = npgs >> PAGE_BSZS_SHIFT(r); + + ctr_cache[r] = kmem_zalloc(pcsz * + sizeof (hpmctr_t), KM_NOSLEEP); + if (ctr_cache[r] == NULL) { + while (--r >= 1) { + kmem_free(ctr_cache[r], + size_cache[r] * sizeof (hpmctr_t)); + } + return (ENOMEM); + } + size_cache[r] = pcsz; + } + /* + * Preallocate all of the new color current arrays as we can't + * hold the page_ctrs_rwlock as a writer and allocate memory. + * If we can't allocate all of the arrays, undo our work so far + * and return failure. + */ + for (r = 1; r < mmu_page_sizes; r++) { + color_cache[r] = kmem_zalloc(sizeof (size_t) * + colors_per_szc[r], KM_NOSLEEP); + if (color_cache[r] == NULL) { + while (--r >= 1) { + kmem_free(color_cache[r], + colors_per_szc[r] * sizeof (size_t)); + } + for (r = 1; r < mmu_page_sizes; r++) { + kmem_free(ctr_cache[r], + size_cache[r] * sizeof (hpmctr_t)); + } + return (ENOMEM); + } + } + + /* + * Grab the write lock to prevent others from walking these arrays + * while we are modifying them. + */ + rw_enter(&page_ctrs_rwlock[mnode], RW_WRITER); + page_freelist_lock(mnode); + for (r = 1; r < mmu_page_sizes; r++) { + PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r); + old_ctr = PAGE_COUNTERS_COUNTERS(mnode, r); + old_csz = PAGE_COUNTERS_ENTRIES(mnode, r); + oldbase = PAGE_COUNTERS_BASE(mnode, r); + old_npgs = old_csz << PAGE_COUNTERS_SHIFT(mnode, r); + old_color_array = PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r); + + pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r); + new_ctr = ctr_cache[r]; + ctr_cache[r] = NULL; + if (old_ctr != NULL && + (oldbase + old_npgs > newbase) && + (newbase + npgs > oldbase)) { + /* + * Map the intersection of the old and new + * counters into the new array. 
+ */ + size_t offset; + if (newbase > oldbase) { + offset = (newbase - oldbase) >> + PAGE_COUNTERS_SHIFT(mnode, r); + bcopy(old_ctr + offset, new_ctr, + MIN(pcsz, (old_csz - offset)) * + sizeof (hpmctr_t)); + } else { + offset = (oldbase - newbase) >> + PAGE_COUNTERS_SHIFT(mnode, r); + bcopy(old_ctr, new_ctr + offset, + MIN(pcsz - offset, old_csz) * + sizeof (hpmctr_t)); + } + } + + PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr; + PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz; + PAGE_COUNTERS_BASE(mnode, r) = newbase; + PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, r) = colors_per_szc[r]; + PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r) = color_cache[r]; + color_cache[r] = NULL; + /* + * for now, just reset on these events as it's probably + * not worthwhile to try and optimize this. + */ + for (i = 0; i < colors_per_szc[r]; i++) { + PAGE_COUNTERS_CURRENT_COLOR(mnode, r, i) = i; + } + + /* cache info for freeing out of the critical path */ + if ((caddr_t)old_ctr >= kernelheap && + (caddr_t)old_ctr < ekernelheap) { + ctr_cache[r] = old_ctr; + size_cache[r] = old_csz; + } + if ((caddr_t)old_color_array >= kernelheap && + (caddr_t)old_color_array < ekernelheap) { + color_cache[r] = old_color_array; + } + /* + * Verify that PNUM_TO_IDX and IDX_TO_PNUM + * satisfy the identity requirement. + * We should be able to go from one to the other + * and get consistent values. + */ + ASSERT(PNUM_TO_IDX(mnode, r, + (IDX_TO_PNUM(mnode, r, 0))) == 0); + ASSERT(IDX_TO_PNUM(mnode, r, + (PNUM_TO_IDX(mnode, r, newbase))) == newbase); + } + page_freelist_unlock(mnode); + rw_exit(&page_ctrs_rwlock[mnode]); + + /* + * Now that we have dropped the write lock, it is safe to free all + * of the memory we have cached above. + */ + for (r = 1; r < mmu_page_sizes; r++) { + if (ctr_cache[r] != NULL) { + kmem_free(ctr_cache[r], + size_cache[r] * sizeof (hpmctr_t)); + } + if (color_cache[r] != NULL) { + kmem_free(color_cache[r], + colors_per_szc[r] * sizeof (size_t)); + } + } + return (0); +} + +/* + * color contains a valid color index or bin for cur_szc + */ +uint_t +page_convert_color(uchar_t cur_szc, uchar_t new_szc, uint_t color) +{ + uint_t shift; + + if (cur_szc > new_szc) { + shift = page_get_shift(cur_szc) - page_get_shift(new_szc); + return (color << shift); + } else if (cur_szc < new_szc) { + shift = page_get_shift(new_szc) - page_get_shift(cur_szc); + return (color >> shift); + } + return (color); +} + +#ifdef DEBUG + +/* + * confirm pp is a large page corresponding to szc + */ +void +chk_lpg(page_t *pp, uchar_t szc) +{ + spgcnt_t npgs = page_get_pagecnt(pp->p_szc); + uint_t noreloc; + + if (npgs == 1) { + ASSERT(pp->p_szc == 0); + ASSERT(pp->p_next == pp); + ASSERT(pp->p_prev == pp); + return; + } + + ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL); + ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL); + + ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs)); + ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1)); + ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1))); + ASSERT(pp->p_prev == (pp + (npgs - 1))); + + /* + * Check list of pages. 
+ */ + noreloc = PP_ISNORELOC(pp); + while (npgs--) { + if (npgs != 0) { + ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1); + ASSERT(pp->p_next == (pp + 1)); + } + ASSERT(pp->p_szc == szc); + ASSERT(PP_ISFREE(pp)); + ASSERT(PP_ISAGED(pp)); + ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL); + ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL); + ASSERT(pp->p_vnode == NULL); + ASSERT(PP_ISNORELOC(pp) == noreloc); + + pp = pp->p_next; + } +} +#endif /* DEBUG */ + +void +page_freelist_lock(int mnode) +{ + int i; + for (i = 0; i < NPC_MUTEX; i++) { + mutex_enter(FPC_MUTEX(mnode, i)); + mutex_enter(CPC_MUTEX(mnode, i)); + } +} + +void +page_freelist_unlock(int mnode) +{ + int i; + for (i = 0; i < NPC_MUTEX; i++) { + mutex_exit(FPC_MUTEX(mnode, i)); + mutex_exit(CPC_MUTEX(mnode, i)); + } +} + +/* + * add pp to the specified page list. Defaults to head of the page list + * unless PG_LIST_TAIL is specified. + */ +void +page_list_add(page_t *pp, int flags) +{ + page_t **ppp; + kmutex_t *pcm; + uint_t bin, mtype; + int mnode; + + ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT)); + ASSERT(PP_ISFREE(pp)); + ASSERT(!hat_page_is_mapped(pp)); + ASSERT(hat_page_getshare(pp) == 0); + + /* + * Large pages should be freed via page_list_add_pages(). + */ + ASSERT(pp->p_szc == 0); + + /* + * Don't need to lock the freelist first here + * because the page isn't on the freelist yet. + * This means p_szc can't change on us. + */ + + bin = PP_2_BIN(pp); + mnode = PP_2_MEM_NODE(pp); + mtype = PP_2_MTYPE(pp); + + if (flags & PG_LIST_ISINIT) { + /* + * PG_LIST_ISINIT is set during system startup (ie. single + * threaded), add a page to the free list and add to the + * the free region counters w/o any locking + */ + ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); + + /* inline version of page_add() */ + if (*ppp != NULL) { + pp->p_next = *ppp; + pp->p_prev = (*ppp)->p_prev; + (*ppp)->p_prev = pp; + pp->p_prev->p_next = pp; + } else + *ppp = pp; + + page_ctr_add_internal(mnode, pp, flags); + } else { + pcm = PC_BIN_MUTEX(mnode, bin, flags); + + if (flags & PG_FREE_LIST) { + ASSERT(PP_ISAGED(pp)); + ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); + + } else { + ASSERT(pp->p_vnode); + ASSERT((pp->p_offset & PAGEOFFSET) == 0); + ppp = &PAGE_CACHELISTS(mnode, bin, mtype); + } + mutex_enter(pcm); + page_add(ppp, pp); + + if (flags & PG_LIST_TAIL) + *ppp = (*ppp)->p_next; + /* + * Add counters before releasing pcm mutex to avoid a race with + * page_freelist_coalesce and page_freelist_fill. + */ + page_ctr_add(pp, flags); + mutex_exit(pcm); + } + + +#if defined(__sparc) + if (PP_ISNORELOC(pp)) { + kcage_freemem_add(1); + } +#endif + /* + * It is up to the caller to unlock the page! + */ + ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT)); +} + + +#ifdef __sparc +/* + * This routine is only used by kcage_init during system startup. + * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add + * without the overhead of taking locks and updating counters. + */ +void +page_list_noreloc_startup(page_t *pp) +{ + page_t **ppp; + uint_t bin; + int mnode; + int mtype; + int flags = PG_LIST_ISCAGE; + + /* + * If this is a large page on the freelist then + * break it up into smaller pages. + */ + if (pp->p_szc != 0) + page_boot_demote(pp); + + /* + * Get list page is currently on. 
+ */ + bin = PP_2_BIN(pp); + mnode = PP_2_MEM_NODE(pp); + mtype = PP_2_MTYPE(pp); + ASSERT(mtype == MTYPE_RELOC); + ASSERT(pp->p_szc == 0); + + if (PP_ISAGED(pp)) { + ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); + flags |= PG_FREE_LIST; + } else { + ppp = &PAGE_CACHELISTS(mnode, bin, mtype); + flags |= PG_CACHE_LIST; + } + + ASSERT(*ppp != NULL); + + /* + * Delete page from current list. + */ + if (*ppp == pp) + *ppp = pp->p_next; /* go to next page */ + if (*ppp == pp) { + *ppp = NULL; /* page list is gone */ + } else { + pp->p_prev->p_next = pp->p_next; + pp->p_next->p_prev = pp->p_prev; + } + + /* LINTED */ + PLCNT_DECR(pp, mnode, 0, flags); + + /* + * Set no reloc for cage initted pages. + */ + PP_SETNORELOC(pp); + + mtype = PP_2_MTYPE(pp); + ASSERT(mtype == MTYPE_NORELOC); + + /* + * Get new list for page. + */ + if (PP_ISAGED(pp)) { + ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); + } else { + ppp = &PAGE_CACHELISTS(mnode, bin, mtype); + } + + /* + * Insert page on new list. + */ + if (*ppp == NULL) { + *ppp = pp; + pp->p_next = pp->p_prev = pp; + } else { + pp->p_next = *ppp; + pp->p_prev = (*ppp)->p_prev; + (*ppp)->p_prev = pp; + pp->p_prev->p_next = pp; + } + + /* LINTED */ + PLCNT_INCR(pp, mnode, 0, flags); + + /* + * Update cage freemem counter + */ + atomic_add_long(&kcage_freemem, 1); +} +#else /* __sparc */ + +/* ARGSUSED */ +void +page_list_noreloc_startup(page_t *pp) +{ + panic("page_list_noreloc_startup: should be here only for sparc"); +} +#endif + +void +page_list_add_pages(page_t *pp, int flags) +{ + kmutex_t *pcm; + pgcnt_t pgcnt; + uint_t bin, mtype, i; + int mnode; + + /* default to freelist/head */ + ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0); + + CHK_LPG(pp, pp->p_szc); + VM_STAT_ADD(vmm_vmstats.pc_list_add_pages[pp->p_szc]); + + bin = PP_2_BIN(pp); + mnode = PP_2_MEM_NODE(pp); + mtype = PP_2_MTYPE(pp); + + if (flags & PG_LIST_ISINIT) { + ASSERT(pp->p_szc == mmu_page_sizes - 1); + page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); + ASSERT(!PP_ISNORELOC(pp)); + PLCNT_INCR(pp, mnode, pp->p_szc, flags); + } else { + + ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes); + + pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); + + mutex_enter(pcm); + page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); + page_ctr_add(pp, PG_FREE_LIST); + mutex_exit(pcm); + + pgcnt = page_get_pagecnt(pp->p_szc); +#if defined(__sparc) + if (PP_ISNORELOC(pp)) + kcage_freemem_add(pgcnt); +#endif + for (i = 0; i < pgcnt; i++, pp++) + page_unlock(pp); + } +} + +/* + * During boot, need to demote a large page to base + * pagesize pages for seg_kmem for use in boot_alloc() + */ +void +page_boot_demote(page_t *pp) +{ + ASSERT(pp->p_szc != 0); + ASSERT(PP_ISFREE(pp)); + ASSERT(PP_ISAGED(pp)); + + (void) page_demote(PP_2_MEM_NODE(pp), + PFN_BASE(pp->p_pagenum, pp->p_szc), pp->p_szc, 0, PC_NO_COLOR, + PC_FREE); + + ASSERT(PP_ISFREE(pp)); + ASSERT(PP_ISAGED(pp)); + ASSERT(pp->p_szc == 0); +} + +/* + * Take a particular page off of whatever freelist the page + * is claimed to be on. + * + * NOTE: Only used for PAGESIZE pages. + */ +void +page_list_sub(page_t *pp, int flags) +{ + int bin; + uint_t mtype; + int mnode; + kmutex_t *pcm; + page_t **ppp; + + ASSERT(PAGE_EXCL(pp)); + ASSERT(PP_ISFREE(pp)); + + /* + * The p_szc field can only be changed by page_promote() + * and page_demote(). Only free pages can be promoted and + * demoted and the free list MUST be locked during these + * operations. 
So to prevent a race in page_list_sub() + * between computing which bin of the freelist lock to + * grab and actually grabing the lock we check again that + * the bin we locked is still the correct one. Notice that + * the p_szc field could have actually changed on us but + * if the bin happens to still be the same we are safe. + */ +try_again: + bin = PP_2_BIN(pp); + mnode = PP_2_MEM_NODE(pp); + pcm = PC_BIN_MUTEX(mnode, bin, flags); + mutex_enter(pcm); + if (PP_2_BIN(pp) != bin) { + mutex_exit(pcm); + goto try_again; + } + mtype = PP_2_MTYPE(pp); + + if (flags & PG_FREE_LIST) { + ASSERT(PP_ISAGED(pp)); + ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype); + } else { + ASSERT(!PP_ISAGED(pp)); + ppp = &PAGE_CACHELISTS(mnode, bin, mtype); + } + + /* + * Common PAGESIZE case. + * + * Note that we locked the freelist. This prevents + * any page promotion/demotion operations. Therefore + * the p_szc will not change until we drop pcm mutex. + */ + if (pp->p_szc == 0) { + page_sub(ppp, pp); + /* + * Subtract counters before releasing pcm mutex + * to avoid race with page_freelist_coalesce. + */ + page_ctr_sub(pp, flags); + mutex_exit(pcm); + +#if defined(__sparc) + if (PP_ISNORELOC(pp)) { + kcage_freemem_sub(1); + } +#endif + return; + } + + /* + * Large pages on the cache list are not supported. + */ + if (flags & PG_CACHE_LIST) + panic("page_list_sub: large page on cachelist"); + + /* + * Slow but rare. + * + * Somebody wants this particular page which is part + * of a large page. In this case we just demote the page + * if it's on the freelist. + * + * We have to drop pcm before locking the entire freelist. + * Once we have re-locked the freelist check to make sure + * the page hasn't already been demoted or completely + * freed. + */ + mutex_exit(pcm); + page_freelist_lock(mnode); + if (pp->p_szc != 0) { + /* + * Large page is on freelist. + */ + (void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc), + pp->p_szc, 0, PC_NO_COLOR, PC_FREE); + } + ASSERT(PP_ISFREE(pp)); + ASSERT(PP_ISAGED(pp)); + ASSERT(pp->p_szc == 0); + + /* + * Subtract counters before releasing pcm mutex + * to avoid race with page_freelist_coalesce. + */ + bin = PP_2_BIN(pp); + mtype = PP_2_MTYPE(pp); + ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype); + + page_sub(ppp, pp); + page_ctr_sub(pp, flags); + page_freelist_unlock(mnode); + +#if defined(__sparc) + if (PP_ISNORELOC(pp)) { + kcage_freemem_sub(1); + } +#endif +} + +void +page_list_sub_pages(page_t *pp, uint_t szc) +{ + kmutex_t *pcm; + uint_t bin, mtype; + int mnode; + + ASSERT(PAGE_EXCL(pp)); + ASSERT(PP_ISFREE(pp)); + ASSERT(PP_ISAGED(pp)); + + /* + * See comment in page_list_sub(). + */ +try_again: + bin = PP_2_BIN(pp); + mnode = PP_2_MEM_NODE(pp); + pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); + mutex_enter(pcm); + if (PP_2_BIN(pp) != bin) { + mutex_exit(pcm); + goto try_again; + } + + VM_STAT_ADD(vmm_vmstats.pc_list_sub_pages1[pp->p_szc]); + + /* + * If we're called with a page larger than szc or it got + * promoted above szc before we locked the freelist then + * drop pcm and re-lock entire freelist. If page still larger + * than szc then demote it. 
+ */ + if (pp->p_szc > szc) { + VM_STAT_ADD(vmm_vmstats.pc_list_sub_pages2[pp->p_szc]); + mutex_exit(pcm); + pcm = NULL; + page_freelist_lock(mnode); + if (pp->p_szc > szc) { + VM_STAT_ADD(vmm_vmstats.pc_list_sub_pages3[pp->p_szc]); + (void) page_demote(mnode, + PFN_BASE(pp->p_pagenum, pp->p_szc), + pp->p_szc, szc, PC_NO_COLOR, PC_FREE); + } + bin = PP_2_BIN(pp); + } + ASSERT(PP_ISFREE(pp)); + ASSERT(PP_ISAGED(pp)); + ASSERT(pp->p_szc <= szc); + ASSERT(pp == PP_PAGEROOT(pp)); + + mtype = PP_2_MTYPE(pp); + if (pp->p_szc != 0) { + page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); + CHK_LPG(pp, pp->p_szc); + } else { + page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); + } + page_ctr_sub(pp, PG_FREE_LIST); + + if (pcm != NULL) { + mutex_exit(pcm); + } else { + page_freelist_unlock(mnode); + } + +#if defined(__sparc) + if (PP_ISNORELOC(pp)) { + pgcnt_t pgcnt; + + pgcnt = page_get_pagecnt(pp->p_szc); + kcage_freemem_sub(pgcnt); + } +#endif +} + +/* + * Add the page to the front of a linked list of pages + * using the p_next & p_prev pointers for the list. + * The caller is responsible for protecting the list pointers. + */ +void +mach_page_add(page_t **ppp, page_t *pp) +{ + if (*ppp == NULL) { + pp->p_next = pp->p_prev = pp; + } else { + pp->p_next = *ppp; + pp->p_prev = (*ppp)->p_prev; + (*ppp)->p_prev = pp; + pp->p_prev->p_next = pp; + } + *ppp = pp; +} + +/* + * Remove this page from a linked list of pages + * using the p_next & p_prev pointers for the list. + * + * The caller is responsible for protecting the list pointers. + */ +void +mach_page_sub(page_t **ppp, page_t *pp) +{ + ASSERT(PP_ISFREE(pp)); + + if (*ppp == NULL || pp == NULL) + panic("mach_page_sub"); + + if (*ppp == pp) + *ppp = pp->p_next; /* go to next page */ + + if (*ppp == pp) + *ppp = NULL; /* page list is gone */ + else { + pp->p_prev->p_next = pp->p_next; + pp->p_next->p_prev = pp->p_prev; + } + pp->p_prev = pp->p_next = pp; /* make pp a list of one */ +} + +/* + * Routine fsflush uses to gradually coalesce the free list into larger pages. + */ +void +page_promote_size(page_t *pp, uint_t cur_szc) +{ + pfn_t pfn; + int mnode; + int idx; + int new_szc = cur_szc + 1; + int full = FULL_REGION_CNT(new_szc); + + pfn = page_pptonum(pp); + mnode = PFN_2_MEM_NODE(pfn); + + page_freelist_lock(mnode); + + idx = PNUM_TO_IDX(mnode, new_szc, pfn); + if (PAGE_COUNTERS(mnode, new_szc, idx) == full) + (void) page_promote(mnode, pfn, new_szc, PC_FREE); + + page_freelist_unlock(mnode); +} + +static uint_t page_promote_err; +static uint_t page_promote_noreloc_err; + +/* + * Create a single larger page (of szc new_szc) from smaller contiguous pages + * for the given mnode starting at pfnum. Pages involved are on the freelist + * before the call and may be returned to the caller if requested, otherwise + * they will be placed back on the freelist. + * If flags is PC_ALLOC, then the large page will be returned to the user in + * a state which is consistent with a page being taken off the freelist. If + * we failed to lock the new large page, then we will return NULL to the + * caller and put the large page on the freelist instead. + * If flags is PC_FREE, then the large page will be placed on the freelist, + * and NULL will be returned. + * The caller is responsible for locking the freelist as well as any other + * accounting which needs to be done for a returned page. + * + * RFE: For performance pass in pp instead of pfnum so + * we can avoid excessive calls to page_numtopp_nolock(). 
+ * This would depend on an assumption that all contiguous + * pages are in the same memseg so we can just add/dec + * our pp. + * + * Lock ordering: + * + * There is a potential but rare deadlock situation + * for page promotion and demotion operations. The problem + * is there are two paths into the freelist manager and + * they have different lock orders: + * + * page_create() + * lock freelist + * page_lock(EXCL) + * unlock freelist + * return + * caller drops page_lock + * + * page_free() and page_reclaim() + * caller grabs page_lock(EXCL) + * + * lock freelist + * unlock freelist + * drop page_lock + * + * What prevents a thread in page_create() from deadlocking + * with a thread freeing or reclaiming the same page is the + * page_trylock() in page_get_freelist(). If the trylock fails + * it skips the page. + * + * The lock ordering for promotion and demotion is the same as + * for page_create(). Since the same deadlock could occur during + * page promotion and freeing or reclaiming of a page on the + * cache list we might have to fail the operation and undo what + * have done so far. Again this is rare. + */ +page_t * +page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags) +{ + page_t *pp, *pplist, *tpp, *start_pp; + pgcnt_t new_npgs, npgs; + uint_t bin; + pgcnt_t tmpnpgs, pages_left; + uint_t mtype; + uint_t noreloc; + uint_t i; + int which_list; + ulong_t index; + kmutex_t *phm; + + /* + * General algorithm: + * Find the starting page + * Walk each page struct removing it from the freelist, + * and linking it to all the other pages removed. + * Once all pages are off the freelist, + * walk the list, modifying p_szc to new_szc and what + * ever other info needs to be done to create a large free page. + * According to the flags, either return the page or put it + * on the freelist. + */ + + start_pp = page_numtopp_nolock(pfnum); + ASSERT(start_pp && (start_pp->p_pagenum == pfnum)); + new_npgs = page_get_pagecnt(new_szc); + ASSERT(IS_P2ALIGNED(pfnum, new_npgs)); + + /* + * Loop through smaller pages to confirm that all pages + * give the same result for PP_ISNORELOC(). + * We can check this reliably here as the protocol for setting + * P_NORELOC requires pages to be taken off the free list first. + */ + for (i = 0, pp = start_pp; i < new_npgs; i++, pp++) { + if (pp == start_pp) { + /* First page, set requirement. */ + noreloc = PP_ISNORELOC(pp); + } else if (noreloc != PP_ISNORELOC(pp)) { + page_promote_noreloc_err++; + page_promote_err++; + return (NULL); + } + } + + pages_left = new_npgs; + pplist = NULL; + pp = start_pp; + + /* Loop around coalescing the smaller pages into a big page. */ + while (pages_left) { + /* + * Remove from the freelist. + */ + ASSERT(PP_ISFREE(pp)); + bin = PP_2_BIN(pp); + ASSERT(mnode == PP_2_MEM_NODE(pp)); + mtype = PP_2_MTYPE(pp); + if (PP_ISAGED(pp)) { + + /* + * PG_FREE_LIST + */ + if (pp->p_szc) { + page_vpsub(&PAGE_FREELISTS(mnode, + pp->p_szc, bin, mtype), pp); + } else { + mach_page_sub(&PAGE_FREELISTS(mnode, 0, + bin, mtype), pp); + } + which_list = PG_FREE_LIST; + } else { + ASSERT(pp->p_szc == 0); + + /* + * PG_CACHE_LIST + * + * Since this page comes from the + * cachelist, we must destroy the + * vnode association. + */ + if (!page_trylock(pp, SE_EXCL)) { + goto fail_promote; + } + + /* + * We need to be careful not to deadlock + * with another thread in page_lookup(). + * The page_lookup() thread could be holding + * the same phm that we need if the two + * pages happen to hash to the same phm lock. 
+ * At this point we have locked the entire + * freelist and page_lookup() could be trying + * to grab a freelist lock. + */ + index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset); + phm = PAGE_HASH_MUTEX(index); + if (!mutex_tryenter(phm)) { + page_unlock(pp); + goto fail_promote; + } + + mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp); + page_hashout(pp, phm); + mutex_exit(phm); + PP_SETAGED(pp); + page_unlock(pp); + which_list = PG_CACHE_LIST; + } + page_ctr_sub(pp, which_list); + + /* + * Concatenate the smaller page(s) onto + * the large page list. + */ + tmpnpgs = npgs = page_get_pagecnt(pp->p_szc); + pages_left -= npgs; + tpp = pp; + while (npgs--) { + tpp->p_szc = new_szc; + tpp = tpp->p_next; + } + page_list_concat(&pplist, &pp); + pp += tmpnpgs; + } + CHK_LPG(pplist, new_szc); + + /* + * return the page to the user if requested + * in the properly locked state. + */ + if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) { + return (pplist); + } + + /* + * Otherwise place the new large page on the freelist + */ + bin = PP_2_BIN(pplist); + mnode = PP_2_MEM_NODE(pplist); + mtype = PP_2_MTYPE(pplist); + page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist); + + page_ctr_add(pplist, PG_FREE_LIST); + return (NULL); + +fail_promote: + /* + * A thread must have still been freeing or + * reclaiming the page on the cachelist. + * To prevent a deadlock undo what we have + * done sofar and return failure. This + * situation can only happen while promoting + * PAGESIZE pages. + */ + page_promote_err++; + while (pplist) { + pp = pplist; + mach_page_sub(&pplist, pp); + pp->p_szc = 0; + bin = PP_2_BIN(pp); + mtype = PP_2_MTYPE(pp); + mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp); + page_ctr_add(pp, PG_FREE_LIST); + } + return (NULL); + +} + +/* + * Break up a large page into smaller size pages. + * Pages involved are on the freelist before the call and may + * be returned to the caller if requested, otherwise they will + * be placed back on the freelist. + * The caller is responsible for locking the freelist as well as any other + * accounting which needs to be done for a returned page. + * If flags is not PC_ALLOC, the color argument is ignored, and thus + * technically, any value may be passed in but PC_NO_COLOR is the standard + * which should be followed for clarity's sake. + */ +page_t * +page_demote(int mnode, pfn_t pfnum, uchar_t cur_szc, uchar_t new_szc, + int color, int flags) +{ + page_t *pp, *pplist, *npplist; + pgcnt_t npgs, n; + uint_t bin; + uint_t mtype; + page_t *ret_pp = NULL; + + ASSERT(cur_szc != 0); + ASSERT(new_szc < cur_szc); + + pplist = page_numtopp_nolock(pfnum); + ASSERT(pplist != NULL); + + ASSERT(pplist->p_szc == cur_szc); + + bin = PP_2_BIN(pplist); + ASSERT(mnode == PP_2_MEM_NODE(pplist)); + mtype = PP_2_MTYPE(pplist); + page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist); + + CHK_LPG(pplist, cur_szc); + page_ctr_sub(pplist, PG_FREE_LIST); + + /* + * Number of PAGESIZE pages for smaller new_szc + * page. + */ + npgs = page_get_pagecnt(new_szc); + + while (pplist) { + pp = pplist; + + ASSERT(pp->p_szc == cur_szc); + + /* + * We either break it up into PAGESIZE pages or larger. 
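+		 * For example, with 8K base pages and a 512K cur_szc page
+		 * (sizes illustrative, as on some sparc configurations),
+		 * demoting to new_szc 0 frees 64 PAGESIZE pages one at a
+		 * time, while demoting to the 64K size uses
+		 * page_list_break() to peel off eight 8-page sublists.
+		 *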
+ */ + if (npgs == 1) { /* PAGESIZE case */ + mach_page_sub(&pplist, pp); + ASSERT(pp->p_szc == cur_szc); + ASSERT(new_szc == 0); + ASSERT(mnode == PP_2_MEM_NODE(pp)); + pp->p_szc = new_szc; + bin = PP_2_BIN(pp); + if ((bin == color) && (flags == PC_ALLOC) && + (ret_pp == NULL) && + page_trylock_cons(pp, SE_EXCL)) { + ret_pp = pp; + } else { + mtype = PP_2_MTYPE(pp); + mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, + mtype), pp); + page_ctr_add(pp, PG_FREE_LIST); + } + } else { + + /* + * Break down into smaller lists of pages. + */ + page_list_break(&pplist, &npplist, npgs); + + pp = pplist; + n = npgs; + while (n--) { + ASSERT(pp->p_szc == cur_szc); + pp->p_szc = new_szc; + pp = pp->p_next; + } + + CHK_LPG(pplist, new_szc); + + bin = PP_2_BIN(pplist); + ASSERT(mnode == PP_2_MEM_NODE(pp)); + if ((bin == color) && (flags == PC_ALLOC) && + (ret_pp == NULL) && + page_trylock_cons(pp, SE_EXCL)) { + ret_pp = pp; + } else { + mtype = PP_2_MTYPE(pp); + page_vpadd(&PAGE_FREELISTS(mnode, new_szc, + bin, mtype), pplist); + + page_ctr_add(pplist, PG_FREE_LIST); + } + pplist = npplist; + } + } + return (ret_pp); +} + +int mpss_coalesce_disable = 0; + +/* + * Coalesce free pages into a page of the given szc and color if possible. + * Return the pointer to the page created, otherwise, return NULL. + */ +static page_t * +page_freelist_coalesce(int mnode, uchar_t szc, int color) +{ + int r; /* region size */ + int idx, full, i; + pfn_t pfnum; + size_t len; + size_t buckets_to_check; + pgcnt_t cands; + page_t *ret_pp; + int color_stride; + + VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce); + + if (mpss_coalesce_disable) { + return (NULL); + } + + r = szc; + PGCTRS_CANDS_GETVALUECOLOR(mnode, r, color, cands); + if (cands == 0) { + VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip); + return (NULL); + } + full = FULL_REGION_CNT(r); + color_stride = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 : + page_colors; + + /* Prevent page_counters dynamic memory from being freed */ + rw_enter(&page_ctrs_rwlock[mnode], RW_READER); + len = PAGE_COUNTERS_ENTRIES(mnode, r); + buckets_to_check = len / color_stride; + idx = PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color); + ASSERT((idx % color_stride) == color); + idx += color_stride; + if (idx >= len) + idx = color; + for (i = 0; i < buckets_to_check; i++) { + if (PAGE_COUNTERS(mnode, r, idx) == full) { + pfnum = IDX_TO_PNUM(mnode, r, idx); + ASSERT(pfnum >= mem_node_config[mnode].physbase && + pfnum < mem_node_config[mnode].physmax); + /* + * RFE: For performance maybe we can do something less + * brutal than locking the entire freelist. So far + * this doesn't seem to be a performance problem? + */ + page_freelist_lock(mnode); + if (PAGE_COUNTERS(mnode, r, idx) != full) { + VM_STAT_ADD(vmm_vmstats.page_ctrs_changed); + goto skip_this_one; + } + ret_pp = page_promote(mnode, pfnum, r, PC_ALLOC); + if (ret_pp != NULL) { + PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color) = + idx; + page_freelist_unlock(mnode); + rw_exit(&page_ctrs_rwlock[mnode]); +#if defined(__sparc) + if (PP_ISNORELOC(ret_pp)) { + pgcnt_t npgs; + + npgs = page_get_pagecnt(ret_pp->p_szc); + kcage_freemem_sub(npgs); + } +#endif + return (ret_pp); + } +skip_this_one: + page_freelist_unlock(mnode); + /* + * No point looking for another page if we've + * already tried all of the ones that + * page_ctr_cands indicated. Stash off where we left + * off. 
+ * Note: this is not exact since we don't hold the + * page_freelist_locks before we initially get the + * value of cands for performance reasons, but should + * be a decent approximation. + */ + if (--cands == 0) { + PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color) = + idx; + break; + } + } + idx += color_stride; + if (idx >= len) + idx = color; + } + rw_exit(&page_ctrs_rwlock[mnode]); + VM_STAT_ADD(vmm_vmstats.page_ctrs_failed); + return (NULL); +} + +/* + * For the given mnode, promote as many small pages to large pages as possible. + */ +void +page_freelist_coalesce_all(int mnode) +{ + int r; /* region size */ + int idx, full; + pfn_t pfnum; + size_t len; + + VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all); + + if (mpss_coalesce_disable) { + return; + } + + /* + * Lock the entire freelist and coalesce what we can. + * + * Always promote to the largest page possible + * first to reduce the number of page promotions. + */ + rw_enter(&page_ctrs_rwlock[mnode], RW_READER); + page_freelist_lock(mnode); + for (r = mmu_page_sizes - 1; r > 0; r--) { + pgcnt_t cands; + + PGCTRS_CANDS_GETVALUE(mnode, r, cands); + if (cands == 0) { + VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip_all); + continue; + } + + full = FULL_REGION_CNT(r); + len = PAGE_COUNTERS_ENTRIES(mnode, r); + + for (idx = 0; idx < len; idx++) { + if (PAGE_COUNTERS(mnode, r, idx) == full) { + pfnum = IDX_TO_PNUM(mnode, r, idx); + ASSERT(pfnum >= + mem_node_config[mnode].physbase && + pfnum < + mem_node_config[mnode].physmax); + (void) page_promote(mnode, pfnum, r, PC_FREE); + } + } + } + page_freelist_unlock(mnode); + rw_exit(&page_ctrs_rwlock[mnode]); +} + +/* + * This is where all polices for moving pages around + * to different page size free lists is implemented. + * Returns 1 on success, 0 on failure. + * + * So far these are the priorities for this algorithm in descending + * order: + * + * 1) When servicing a request try to do so with a free page + * from next size up. Helps defer fragmentation as long + * as possible. + * + * 2) Page coalesce on demand. Only when a freelist + * larger than PAGESIZE is empty and step 1 + * will not work since all larger size lists are + * also empty. + * + * If pfnhi is non-zero, search for large page with pfn range less than pfnhi. + */ +page_t * +page_freelist_fill(uchar_t szc, int color, int mnode, int mtype, pfn_t pfnhi) +{ + uchar_t nszc = szc + 1; + int bin; + page_t *pp, *firstpp; + page_t *ret_pp = NULL; + + ASSERT(szc < mmu_page_sizes); + + /* + * First try to break up a larger page to fill + * current size freelist. + */ + while (nszc < mmu_page_sizes) { + /* + * If page found then demote it. + */ + bin = page_convert_color(szc, nszc, color); + if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) { + page_freelist_lock(mnode); + firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype); + + /* + * If pfnhi is not PFNNULL, look for large page below + * pfnhi. PFNNULL signifies no pfn requirement. + */ + if (pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) { + do { + pp = pp->p_vpnext; + if (pp == firstpp) { + pp = NULL; + break; + } + } while (pp->p_pagenum >= pfnhi); + } + if (pp) { + ASSERT(pp->p_szc == nszc); + ret_pp = page_demote(mnode, pp->p_pagenum, + pp->p_szc, szc, color, PC_ALLOC); + if (ret_pp) { + page_freelist_unlock(mnode); +#if defined(__sparc) + if (PP_ISNORELOC(ret_pp)) { + pgcnt_t npgs; + + npgs = page_get_pagecnt( + ret_pp->p_szc); + kcage_freemem_sub(npgs); + } +#endif + return (ret_pp); + } + } + page_freelist_unlock(mnode); + } + nszc++; + } + + /* + * Ok that didn't work. 
Time to coalesce. + */ + if (szc != 0) { + ret_pp = page_freelist_coalesce(mnode, szc, color); + } + + return (ret_pp); +} + +/* + * Helper routine used only by the freelist code to lock + * a page. If the page is a large page then it succeeds in + * locking all the constituent pages or none at all. + * Returns 1 on sucess, 0 on failure. + */ +static int +page_trylock_cons(page_t *pp, se_t se) +{ + page_t *tpp, *first_pp = pp; + + /* + * Fail if can't lock first or only page. + */ + if (!page_trylock(pp, se)) { + return (0); + } + + /* + * PAGESIZE: common case. + */ + if (pp->p_szc == 0) { + return (1); + } + + /* + * Large page case. + */ + tpp = pp->p_next; + while (tpp != pp) { + if (!page_trylock(tpp, se)) { + /* + * On failure unlock what we + * have locked so far. + */ + while (first_pp != tpp) { + page_unlock(first_pp); + first_pp = first_pp->p_next; + } + return (0); + } + tpp = tpp->p_next; + } + return (1); +} + +page_t * +page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc, + uint_t flags) +{ + kmutex_t *pcm; + int i, fill_tried, fill_marker; + page_t *pp, *first_pp; + uint_t bin_marker; + int colors, cpucolors; + uchar_t nszc; + uint_t nszc_color_shift; + int nwaybins = 0, nwaycnt; + + ASSERT(szc < mmu_page_sizes); + + VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]); + + /* LINTED */ + MTYPE_START(mnode, mtype, flags); + if (mtype < 0) { /* mnode foes not have memory in mtype range */ + VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]); + return (NULL); + } + + /* + * Set how many physical colors for this page size. + */ + colors = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 : + page_colors; + + nszc = MIN(szc + 1, mmu_page_sizes - 1); + nszc_color_shift = page_get_shift(nszc) - page_get_shift(szc); + + /* cpu_page_colors is non-zero if a page color may be in > 1 bin */ + cpucolors = cpu_page_colors; + + /* + * adjust cpucolors to possibly check additional 'equivalent' bins + * to try to minimize fragmentation of large pages by delaying calls + * to page_freelist_fill. + */ + if (colorequiv > 1) { + int equivcolors = colors / colorequiv; + + if (equivcolors && (cpucolors == 0 || equivcolors < cpucolors)) + cpucolors = equivcolors; + } + + ASSERT(colors <= page_colors); + ASSERT(colors); + ASSERT((colors & (colors - 1)) == 0); + + ASSERT(bin < colors); + + /* + * Only hold one freelist lock at a time, that way we + * can start anywhere and not have to worry about lock + * ordering. + */ +big_try_again: + fill_tried = 0; + nwaycnt = 0; + for (i = 0; i <= colors; i++) { +try_again: + ASSERT(bin < colors); + if (PAGE_FREELISTS(mnode, szc, bin, mtype)) { + pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); + mutex_enter(pcm); + pp = PAGE_FREELISTS(mnode, szc, bin, mtype); + if (pp != NULL) { + /* + * These were set before the page + * was put on the free list, + * they must still be set. + */ + ASSERT(PP_ISFREE(pp)); + ASSERT(PP_ISAGED(pp)); + ASSERT(pp->p_vnode == NULL); + ASSERT(pp->p_hash == NULL); + ASSERT(pp->p_offset == (u_offset_t)-1); + ASSERT(pp->p_szc == szc); + ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); + + /* + * Walk down the hash chain. + * 8k pages are linked on p_next + * and p_prev fields. Large pages + * are a contiguous group of + * constituent pages linked together + * on their p_next and p_prev fields. + * The large pages are linked together + * on the hash chain using p_vpnext + * p_vpprev of the base constituent + * page of each large page. 
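+				 * The retry loop below therefore steps
+				 * through the bin with p_next for PAGESIZE
+				 * pages and with p_vpnext for large pages,
+				 * and gives up on this bin once it wraps
+				 * back around to first_pp.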
+ */ + first_pp = pp; + while (!page_trylock_cons(pp, SE_EXCL)) { + if (szc == 0) { + pp = pp->p_next; + } else { + pp = pp->p_vpnext; + } + + ASSERT(PP_ISFREE(pp)); + ASSERT(PP_ISAGED(pp)); + ASSERT(pp->p_vnode == NULL); + ASSERT(pp->p_hash == NULL); + ASSERT(pp->p_offset == (u_offset_t)-1); + ASSERT(pp->p_szc == szc); + ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == + mnode); + + if (pp == first_pp) { + pp = NULL; + break; + } + } + + if (pp) { + ASSERT(mtype == PP_2_MTYPE(pp)); + ASSERT(pp->p_szc == szc); + if (szc == 0) { + page_sub(&PAGE_FREELISTS(mnode, + szc, bin, mtype), pp); + } else { + page_vpsub(&PAGE_FREELISTS( + mnode, szc, bin, mtype), + pp); + CHK_LPG(pp, szc); + } + page_ctr_sub(pp, PG_FREE_LIST); + + if ((PP_ISFREE(pp) == 0) || + (PP_ISAGED(pp) == 0)) + panic("free page is not. pp %p", + (void *)pp); + mutex_exit(pcm); + +#if defined(__sparc) + ASSERT(!kcage_on || PP_ISNORELOC(pp) || + (flags & PG_NORELOC) == 0); + + if (PP_ISNORELOC(pp)) { + pgcnt_t npgs; + + npgs = page_get_pagecnt(szc); + kcage_freemem_sub(npgs); + } +#endif + VM_STAT_ADD(vmm_vmstats. + pgmf_allocok[szc]); + return (pp); + } + } + mutex_exit(pcm); + } + + /* + * Wow! The initial bin is empty. + * If specific color is needed, check if page color may be + * in other bins. cpucolors is: + * 0 if the colors for this cpu is equal to page_colors. + * This means that pages with a particular color are in a + * single bin. + * -1 if colors of cpus (cheetah+) are heterogenous. Need to + * first determine the colors for the current cpu. + * >0 colors of all cpus are homogenous and < page_colors + */ + + if ((flags & PG_MATCH_COLOR) && (cpucolors != 0)) { + if (!nwaybins) { + /* + * cpucolors is negative if ecache setsizes + * are heterogenous. determine colors for this + * particular cpu. + */ + if (cpucolors < 0) { + cpucolors = CPUSETSIZE() / MMU_PAGESIZE; + ASSERT(cpucolors > 0); + nwaybins = colors / cpucolors; + } else { + nwaybins = colors / cpucolors; + ASSERT(szc > 0 || nwaybins > 1); + } + if (nwaybins < 2) + cpucolors = 0; + } + + if (cpucolors && (nwaycnt + 1 <= nwaybins)) { + nwaycnt++; + bin = (bin + (colors / nwaybins)) & + (colors - 1); + if (nwaycnt < nwaybins) { + goto try_again; + } + } + /* back to initial color if fall-thru */ + } + + /* + * color bins are all empty if color match. Try and satisfy + * the request by breaking up or coalescing pages from + * a different size freelist of the correct color that + * satisfies the ORIGINAL color requested. If that + * fails then try pages of the same size but different + * colors assuming we are not called with + * PG_MATCH_COLOR. + */ + if (!fill_tried) { + fill_tried = 1; + fill_marker = bin >> nszc_color_shift; + pp = page_freelist_fill(szc, bin, mnode, mtype, + PFNNULL); + if (pp != NULL) { + return (pp); + } + } + + if (flags & PG_MATCH_COLOR) + break; + + /* + * Select next color bin to try. + */ + if (szc == 0) { + /* + * PAGESIZE page case. + */ + if (i == 0) { + bin = (bin + BIN_STEP) & page_colors_mask; + bin_marker = bin; + } else { + bin = (bin + vac_colors) & page_colors_mask; + if (bin == bin_marker) { + bin = (bin + 1) & page_colors_mask; + bin_marker = bin; + } + } + } else { + /* + * Large page case. + */ + bin = (bin + 1) & (colors - 1); + } + /* + * If bin advanced to the next color bin of the + * next larger pagesize, there is a chance the fill + * could succeed. 
+ */ + if (fill_marker != (bin >> nszc_color_shift)) + fill_tried = 0; + } + +#if defined(__sparc) + if (!(flags & (PG_NORELOC | PGI_NOCAGE | PGI_RELOCONLY)) && + (kcage_freemem >= kcage_lotsfree)) { + /* + * The Cage is ON and with plenty of free mem, and + * we're willing to check for a NORELOC page if we + * couldn't find a RELOC page, so spin again. + */ + flags |= PG_NORELOC; + mtype = MTYPE_NORELOC; + goto big_try_again; + } +#else + if (flags & PGI_MT_RANGE) { + /* cycle through range of mtypes */ + MTYPE_NEXT(mnode, mtype, flags); + if (mtype >= 0) + goto big_try_again; + } +#endif + VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]); + + return (NULL); +} + + +/* + * Returns the count of free pages for 'pp' with size code 'szc'. + * Note: This function does not return an exact value as the page freelist + * locks are not held and thus the values in the page_counters may be + * changing as we walk through the data. + */ +static int +page_freecnt(int mnode, page_t *pp, uchar_t szc) +{ + pgcnt_t pgfree; + pgcnt_t cnt; + ssize_t r = szc; /* region size */ + ssize_t idx; + int i; + int full, range; + + /* Make sure pagenum passed in is aligned properly */ + ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0); + ASSERT(szc > 0); + + /* Prevent page_counters dynamic memory from being freed */ + rw_enter(&page_ctrs_rwlock[mnode], RW_READER); + idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum); + cnt = PAGE_COUNTERS(mnode, r, idx); + pgfree = cnt << PNUM_SHIFT(r - 1); + range = FULL_REGION_CNT(szc); + + /* Check for completely full region */ + if (cnt == range) { + rw_exit(&page_ctrs_rwlock[mnode]); + return (pgfree); + } + + while (--r > 0) { + idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum); + full = FULL_REGION_CNT(r); + for (i = 0; i < range; i++, idx++) { + cnt = PAGE_COUNTERS(mnode, r, idx); + /* + * If cnt here is full, that means we have already + * accounted for these pages earlier. + */ + if (cnt != full) { + pgfree += (cnt << PNUM_SHIFT(r - 1)); + } + } + range *= full; + } + rw_exit(&page_ctrs_rwlock[mnode]); + return (pgfree); +} + +/* + * Called from page_geti_contig_pages to exclusively lock constituent pages + * starting from 'spp' for page size code 'szc'. + * + * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc' + * region needs to be greater than or equal to the threshold. + */ +static int +page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags) +{ + pgcnt_t pgcnt = PNUM_SIZE(szc); + pgcnt_t pgfree, i; + page_t *pp; + + VM_STAT_ADD(vmm_vmstats.ptcp[szc]); + + + if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI)) + goto skipptcpcheck; + /* + * check if there are sufficient free pages available before attempting + * to trylock. Count is approximate as page counters can change. 
+ */ + pgfree = page_freecnt(mnode, spp, szc); + + /* attempt to trylock if there are sufficient already free pages */ + if (pgfree < pgcnt/ptcpthreshold) { + VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]); + return (0); + } + +skipptcpcheck: + + for (i = 0; i < pgcnt; i++) { + pp = &spp[i]; + if (!page_trylock(pp, SE_EXCL)) { + VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]); + while (--i != (pgcnt_t)-1) { + pp = &spp[i]; + ASSERT(PAGE_EXCL(pp)); + page_unlock(pp); + } + return (0); + } + ASSERT(spp[i].p_pagenum == spp->p_pagenum + i); + if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) && + !PP_ISFREE(pp)) { + VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]); + ASSERT(i == 0); + page_unlock(pp); + return (0); + } + if (PP_ISNORELOC(pp)) { + VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]); + while (i != (pgcnt_t)-1) { + pp = &spp[i]; + ASSERT(PAGE_EXCL(pp)); + page_unlock(pp); + i--; + } + return (0); + } + } + VM_STAT_ADD(vmm_vmstats.ptcpok[szc]); + return (1); +} + +/* + * Claim large page pointed to by 'pp'. 'pp' is the starting set + * of 'szc' constituent pages that had been locked exclusively previously. + * Will attempt to relocate constituent pages in use. + */ +static page_t * +page_claim_contig_pages(page_t *pp, uchar_t szc, int flags) +{ + spgcnt_t pgcnt, npgs, i; + page_t *targpp, *rpp, *hpp; + page_t *replpp = NULL; + page_t *pplist = NULL; + + ASSERT(pp != NULL); + + pgcnt = page_get_pagecnt(szc); + while (pgcnt) { + ASSERT(PAGE_EXCL(pp)); + ASSERT(!PP_ISNORELOC(pp)); + if (PP_ISFREE(pp)) { + /* + * If this is a PG_FREE_LIST page then its + * size code can change underneath us due to + * page promotion or demotion. As an optimzation + * use page_list_sub_pages() instead of + * page_list_sub(). + */ + if (PP_ISAGED(pp)) { + page_list_sub_pages(pp, szc); + if (pp->p_szc == szc) { + return (pp); + } + ASSERT(pp->p_szc < szc); + npgs = page_get_pagecnt(pp->p_szc); + hpp = pp; + for (i = 0; i < npgs; i++, pp++) { + pp->p_szc = szc; + } + page_list_concat(&pplist, &hpp); + pgcnt -= npgs; + continue; + } + ASSERT(!PP_ISAGED(pp)); + ASSERT(pp->p_szc == 0); + page_list_sub(pp, PG_CACHE_LIST); + page_hashout(pp, NULL); + PP_SETAGED(pp); + pp->p_szc = szc; + page_list_concat(&pplist, &pp); + pp++; + pgcnt--; + continue; + } + npgs = page_get_pagecnt(pp->p_szc); + + /* + * page_create_wait freemem accounting done by caller of + * page_get_freelist and not necessary to call it prior to + * calling page_get_replacement_page. + * + * page_get_replacement_page can call page_get_contig_pages + * to acquire a large page (szc > 0); the replacement must be + * smaller than the contig page size to avoid looping or + * szc == 0 and PGI_PGCPSZC0 is set. + */ + if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) { + replpp = page_get_replacement_page(pp, NULL, 0); + if (replpp) { + npgs = page_get_pagecnt(pp->p_szc); + ASSERT(npgs <= pgcnt); + targpp = pp; + } + } + + /* + * If replacement is NULL or do_page_relocate fails, fail + * coalescing of pages. + */ + if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0, + &npgs, NULL) != 0)) { + /* + * Unlock un-processed target list + */ + while (pgcnt--) { + ASSERT(PAGE_EXCL(pp)); + page_unlock(pp); + pp++; + } + /* + * Free the processed target list. 
+ */ + while (pplist) { + pp = pplist; + page_sub(&pplist, pp); + ASSERT(PAGE_EXCL(pp)); + ASSERT(pp->p_szc == szc); + ASSERT(PP_ISFREE(pp)); + ASSERT(PP_ISAGED(pp)); + pp->p_szc = 0; + page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); + page_unlock(pp); + } + + if (replpp != NULL) + page_free_replacement_page(replpp); + + return (NULL); + } + ASSERT(pp == targpp); + + /* LINTED */ + ASSERT(hpp = pp); /* That's right, it's an assignment */ + + pp += npgs; + pgcnt -= npgs; + + while (npgs--) { + ASSERT(PAGE_EXCL(targpp)); + ASSERT(!PP_ISFREE(targpp)); + ASSERT(!PP_ISNORELOC(targpp)); + PP_SETFREE(targpp); + ASSERT(PP_ISAGED(targpp)); + ASSERT(targpp->p_szc < szc || (szc == 0 && + (flags & PGI_PGCPSZC0))); + targpp->p_szc = szc; + targpp = targpp->p_next; + + rpp = replpp; + ASSERT(rpp != NULL); + page_sub(&replpp, rpp); + ASSERT(PAGE_EXCL(rpp)); + ASSERT(!PP_ISFREE(rpp)); + page_unlock(rpp); + } + ASSERT(targpp == hpp); + ASSERT(replpp == NULL); + page_list_concat(&pplist, &targpp); + } + CHK_LPG(pplist, szc); + return (pplist); +} + +/* + * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code + * of 0 means nothing left after trim. + */ + +int +trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi) +{ + pfn_t kcagepfn; + int decr; + int rc = 0; + + if (PP_ISNORELOC(mseg->pages)) { + if (PP_ISNORELOC(mseg->epages - 1) == 0) { + + /* lower part of this mseg inside kernel cage */ + decr = kcage_current_pfn(&kcagepfn); + + /* kernel cage may have transitioned past mseg */ + if (kcagepfn >= mseg->pages_base && + kcagepfn < mseg->pages_end) { + ASSERT(decr == 0); + *lo = kcagepfn; + *hi = MIN(pfnhi, + (mseg->pages_end - 1)); + rc = 1; + } + } + /* else entire mseg in the cage */ + } else { + if (PP_ISNORELOC(mseg->epages - 1)) { + + /* upper part of this mseg inside kernel cage */ + decr = kcage_current_pfn(&kcagepfn); + + /* kernel cage may have transitioned past mseg */ + if (kcagepfn >= mseg->pages_base && + kcagepfn < mseg->pages_end) { + ASSERT(decr); + *hi = kcagepfn; + *lo = MAX(pfnlo, mseg->pages_base); + rc = 1; + } + } else { + /* entire mseg outside of kernel cage */ + *lo = MAX(pfnlo, mseg->pages_base); + *hi = MIN(pfnhi, (mseg->pages_end - 1)); + rc = 1; + } + } + return (rc); +} + +/* + * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to "claim" a + * page with size code 'szc'. Claiming such a page requires acquiring + * exclusive locks on all constituent pages (page_trylock_contig_pages), + * relocating pages in use and concatenating these constituent pages into a + * large page. + * + * The page lists do not have such a large page and page_freelist_fill has + * already failed to demote larger pages and/or coalesce smaller free pages. + * + * 'flags' may specify PG_COLOR_MATCH which would limit the search of large + * pages with the same color as 'bin'. + * + * 'pfnflag' specifies the subset of the pfn range to search. + */ + + +static page_t * +page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags, + pfn_t pfnlo, pfn_t pfnhi, int pfnflag) +{ + struct memseg *mseg; + pgcnt_t szcpgcnt = page_get_pagecnt(szc); + pgcnt_t szcpgmask = szcpgcnt - 1; + pfn_t randpfn; + page_t *pp, *randpp, *endpp; + uint_t colors; + pfn_t hi, lo; + uint_t skip; + + ASSERT(szc != 0 || (flags & PGI_PGCPSZC0)); + + if ((pfnhi - pfnlo) + 1 < szcpgcnt) + return (NULL); + + ASSERT(szc < mmu_page_sizes); + + colors = (szc) ? 
page_convert_color(0, szc, page_colors - 1) + 1 : + page_colors; + + ASSERT(bin < colors); + + /* + * trim the pfn range to search based on pfnflag. pfnflag is set + * when there have been previous page_get_contig_page failures to + * limit the search. + * + * The high bit in pfnflag specifies the number of 'slots' in the + * pfn range and the remainder of pfnflag specifies which slot. + * For example, a value of 1010b would mean the second slot of + * the pfn range that has been divided into 8 slots. + */ + if (pfnflag > 1) { + int slots = 1 << (highbit(pfnflag) - 1); + int slotid = pfnflag & (slots - 1); + pgcnt_t szcpages; + int slotlen; + + pfnlo = P2ROUNDUP(pfnlo, szcpgcnt); + pfnhi = pfnhi & ~(szcpgcnt - 1); + + szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt; + slotlen = howmany(szcpages, slots); + pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt); + ASSERT(pfnlo < pfnhi); + if (pfnhi > pfnlo + (slotlen * szcpgcnt)) + pfnhi = pfnlo + (slotlen * szcpgcnt); + } + + memsegs_lock(0); + + /* + * loop through memsegs to look for contig page candidates + */ + + for (mseg = memsegs; mseg != NULL; mseg = mseg->next) { + if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) { + /* no overlap */ + continue; + } + + if (mseg->pages_end - mseg->pages_base < szcpgcnt) + /* mseg too small */ + continue; + + /* trim off kernel cage pages from pfn range */ + if (kcage_on) { + if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0) + continue; + } else { + lo = MAX(pfnlo, mseg->pages_base); + hi = MIN(pfnhi, (mseg->pages_end - 1)); + } + + /* round to szcpgcnt boundaries */ + lo = P2ROUNDUP(lo, szcpgcnt); + hi = hi & ~(szcpgcnt - 1); + + if (hi <= lo) + continue; + + /* + * set lo to point to the pfn for the desired bin. Large + * page sizes may only have a single page color + */ + if ((colors > 1) && (flags & PG_MATCH_COLOR)) { + uint_t lobin; + + /* + * factor in colorequiv to check additional + * 'equivalent' bins. + */ + if (colorequiv > 1 && colors > colorequiv) + colors = colors / colorequiv; + + /* determine bin that lo currently points to */ + lobin = (lo & ((szcpgcnt * colors) - 1)) / szcpgcnt; + + /* + * set lo to point at appropriate color and set skip + * to arrive at the next szc page of the same color. 
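+			 * The adjustment advances lo by the color distance
+			 * between the requested bin and lobin (modulo the
+			 * number of colors); stepping by colors * szcpgcnt
+			 * thereafter keeps every candidate on that color.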
+ */ + lo += ((bin - lobin) & (colors - 1)) * szcpgcnt; + + skip = colors * szcpgcnt; + } else { + /* check all pages starting from lo */ + skip = szcpgcnt; + } + if (hi <= lo) + /* mseg cannot satisfy color request */ + continue; + + /* randomly choose a point between lo and hi to begin search */ + + randpfn = (pfn_t)GETTICK(); + randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1); + randpp = mseg->pages + (randpfn - mseg->pages_base); + + ASSERT(randpp->p_pagenum == randpfn); + + pp = randpp; + endpp = mseg->pages + (hi - mseg->pages_base); + + ASSERT(randpp + szcpgcnt <= endpp); + + do { + ASSERT(!(pp->p_pagenum & szcpgmask)); + ASSERT((flags & PG_MATCH_COLOR) == 0 || + colorequiv > 1 || + PP_2_BIN(pp) == bin); + if (page_trylock_contig_pages(mnode, pp, szc, flags)) { + /* pages unlocked by page_claim on failure */ + if (page_claim_contig_pages(pp, szc, flags)) { + memsegs_unlock(0); + return (pp); + } + } + + pp += skip; + if (pp >= endpp) { + /* start from the beginning */ + pp = mseg->pages + (lo - mseg->pages_base); + ASSERT(pp->p_pagenum == lo); + ASSERT(pp + szcpgcnt <= endpp); + } + } while (pp != randpp); + } + memsegs_unlock(0); + return (NULL); +} + + +/* + * controlling routine that searches through physical memory in an attempt to + * claim a large page based on the input parameters. + * on the page free lists. + * + * calls page_geti_contig_pages with an initial pfn range from the mnode + * and mtype. page_geti_contig_pages will trim off the parts of the pfn range + * that overlaps with the kernel cage or does not match the requested page + * color if PG_MATCH_COLOR is set. Since this search is very expensive, + * page_geti_contig_pages may further limit the search range based on + * previous failure counts (pgcpfailcnt[]). + * + * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base + * pagesize page that satisfies mtype. + */ +page_t * +page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc, + uint_t flags) +{ + pfn_t pfnlo, pfnhi; /* contig pages pfn range */ + page_t *pp; + int pfnflag = 0; /* no limit on search if 0 */ + + VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]); + + /* LINTED */ + MTYPE_START(mnode, mtype, flags); + if (mtype < 0) { /* mnode does not have memory in mtype range */ + VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]); + return (NULL); + } + + ASSERT(szc > 0 || (flags & PGI_PGCPSZC0)); + + /* do not limit search and ignore color if hi pri */ + + if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0)) + pfnflag = pgcpfailcnt[szc]; + + /* remove color match to improve chances */ + + if (flags & PGI_PGCPHIPRI || pfnflag) + flags &= ~PG_MATCH_COLOR; + + do { + /* get pfn range based on mnode and mtype */ + MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi); + + ASSERT(pfnhi >= pfnlo); + + pp = page_geti_contig_pages(mnode, bin, szc, flags, + pfnlo, pfnhi, pfnflag); + + if (pp != NULL) { + pfnflag = pgcpfailcnt[szc]; + if (pfnflag) { + /* double the search size */ + pgcpfailcnt[szc] = pfnflag >> 1; + } + VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]); + return (pp); + } + /* LINTED */ + } while ((flags & PGI_MT_RANGE) && + (MTYPE_NEXT(mnode, mtype, flags) >= 0)); + + VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]); + return (NULL); +} + + +/* + * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair. + * + * Does its own locking and accounting. + * If PG_MATCH_COLOR is set, then NULL will be returned if there are no + * pages of the proper color even if there are pages of a different color. 
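+ *
+ * The local lgroup's freelists are searched first.  For base pagesize
+ * requests that fail locally, the caller is expected to fall back to
+ * page_get_cachelist before remote freelists are tried here.  Requests
+ * that still fail may be retried with page_get_contig_pages, which claims
+ * and, if necessary, relocates in-use constituent pages.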
+ * + * Finds a page, removes it, THEN locks it. + */ + +/*ARGSUSED*/ +page_t * +page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg, + caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp) +{ + struct as *as = seg->s_as; + page_t *pp = NULL; + ulong_t bin; + uchar_t szc; + int mnode; + int mtype; + page_t *(*page_get_func)(int, uint_t, int, uchar_t, uint_t); + lgrp_mnode_cookie_t lgrp_cookie; + + page_get_func = page_get_mnode_freelist; + + /* + * If we aren't passed a specific lgroup, or passed a freed lgrp + * assume we wish to allocate near to the current thread's home. + */ + if (!LGRP_EXISTS(lgrp)) + lgrp = lgrp_home_lgrp(); + + if (kcage_on) { + if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC && + kcage_freemem < kcage_throttlefree + btop(size) && + curthread != kcage_cageout_thread) { + /* + * Set a "reserve" of kcage_throttlefree pages for + * PG_PANIC and cageout thread allocations. + * + * Everybody else has to serialize in + * page_create_get_something() to get a cage page, so + * that we don't deadlock cageout! + */ + return (NULL); + } + } else { + flags &= ~PG_NORELOC; + flags |= PGI_NOCAGE; + } + + /* LINTED */ + MTYPE_INIT(mtype, vp, vaddr, flags); + + /* + * Convert size to page size code. + */ + if ((szc = page_szc(size)) == (uchar_t)-1) + panic("page_get_freelist: illegal page size request"); + ASSERT(szc < mmu_page_sizes); + + VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]); + + /* LINTED */ + AS_2_BIN(as, seg, vp, vaddr, bin); + + /* bin is for base pagesize color - convert if larger pagesize. */ + if (szc) + bin = page_convert_color(0, szc, bin); + + /* + * Try to get a local page first, but try remote if we can't + * get a page of the right color. + */ +pgretry: + LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL); + while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { + pp = page_get_func(mnode, bin, mtype, szc, flags); + if (pp != NULL) { + VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]); + DTRACE_PROBE4(page__get, + lgrp_t *, lgrp, + int, mnode, + ulong_t, bin, + uint_t, flags); + return (pp); + } + } + ASSERT(pp == NULL); + + /* + * for non-SZC0 PAGESIZE requests, check cachelist before checking + * remote free lists. Caller expected to call page_get_cachelist which + * will check local cache lists and remote free lists. + */ + if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) { + VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred); + return (NULL); + } + + ASSERT(szc > 0 || (flags & PGI_PGCPSZC0)); + + lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); + + /* + * Try to get a non-local freelist page. + */ + LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie); + while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { + pp = page_get_func(mnode, bin, mtype, szc, flags); + if (pp != NULL) { + DTRACE_PROBE4(page__get, + lgrp_t *, lgrp, + int, mnode, + ulong_t, bin, + uint_t, flags); + VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]); + return (pp); + } + } + + ASSERT(pp == NULL); + + /* + * when the cage is off chances are page_get_contig_pages() will fail + * to lock a large page chunk therefore when the cage is off it's not + * called by default. this can be changed via /etc/system. + * + * page_get_contig_pages() also called to acquire a base pagesize page + * for page_create_get_something(). 
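+	 * The retry below simply switches page_get_func to
+	 * page_get_contig_pages and jumps back to pgretry, so both the
+	 * local and remote lgroup passes are repeated with the contig
+	 * allocator.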
+ */ + if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) && + (kcage_on || pg_lpgcreate_nocage || szc == 0) && + (page_get_func != page_get_contig_pages)) { + + VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]); + page_get_func = page_get_contig_pages; + goto pgretry; + } + + if (pgcplimitsearch && page_get_func == page_get_contig_pages) + pgcpfailcnt[szc]++; + + VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]); + return (NULL); +} + +/* + * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair. + * + * Does its own locking. + * If PG_MATCH_COLOR is set, then NULL will be returned if there are no + * pages of the proper color even if there are pages of a different color. + * Otherwise, scan the bins for ones with pages. For each bin with pages, + * try to lock one of them. If no page can be locked, try the + * next bin. Return NULL if a page can not be found and locked. + * + * Finds a pages, trys to lock it, then removes it. + */ + +/*ARGSUSED*/ +page_t * +page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg, + caddr_t vaddr, uint_t flags, struct lgrp *lgrp) +{ + page_t *pp; + struct as *as = seg->s_as; + ulong_t bin; + /*LINTED*/ + int mnode; + int mtype; + lgrp_mnode_cookie_t lgrp_cookie; + + /* + * If we aren't passed a specific lgroup, or pasased a freed lgrp + * assume we wish to allocate near to the current thread's home. + */ + if (!LGRP_EXISTS(lgrp)) + lgrp = lgrp_home_lgrp(); + + if (!kcage_on) { + flags &= ~PG_NORELOC; + flags |= PGI_NOCAGE; + } + + if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC && + kcage_freemem <= kcage_throttlefree) { + /* + * Reserve kcage_throttlefree pages for critical kernel + * threads. + * + * Everybody else has to go to page_create_get_something() + * to get a cage page, so we don't deadlock cageout. + */ + return (NULL); + } + + /* LINTED */ + AS_2_BIN(as, seg, vp, vaddr, bin); + + ASSERT(bin <= page_colors_mask); + + /* LINTED */ + MTYPE_INIT(mtype, vp, vaddr, flags); + + VM_STAT_ADD(vmm_vmstats.pgc_alloc); + + /* + * Try local cachelists first + */ + LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL); + while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { + pp = page_get_mnode_cachelist(bin, flags, mnode, mtype); + if (pp != NULL) { + VM_STAT_ADD(vmm_vmstats.pgc_allocok); + DTRACE_PROBE4(page__get, + lgrp_t *, lgrp, + int, mnode, + ulong_t, bin, + uint_t, flags); + return (pp); + } + } + + lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); + + /* + * Try freelists/cachelists that are farther away + * This is our only chance to allocate remote pages for PAGESIZE + * requests. 
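+	 * Each remote memory node is tried for a freelist page first and
+	 * then for a cachelist page before moving on to the next node.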
+ */ + LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie); + while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { + pp = page_get_mnode_freelist(mnode, bin, mtype, + 0, flags); + if (pp != NULL) { + VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred); + DTRACE_PROBE4(page__get, + lgrp_t *, lgrp, + int, mnode, + ulong_t, bin, + uint_t, flags); + return (pp); + } + pp = page_get_mnode_cachelist(bin, flags, mnode, mtype); + if (pp != NULL) { + VM_STAT_ADD(vmm_vmstats.pgc_allocokrem); + DTRACE_PROBE4(page__get, + lgrp_t *, lgrp, + int, mnode, + ulong_t, bin, + uint_t, flags); + return (pp); + } + } + + VM_STAT_ADD(vmm_vmstats.pgc_allocfailed); + return (NULL); +} + +page_t * +page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype) +{ + kmutex_t *pcm; + int i; + page_t *pp; + page_t *first_pp; + uint_t bin_marker; + int nwaybins, nwaycnt; + int cpucolors; + + VM_STAT_ADD(vmm_vmstats.pgmc_alloc); + + /* LINTED */ + MTYPE_START(mnode, mtype, flags); + if (mtype < 0) { /* mnode does not have memory in mtype range */ + VM_STAT_ADD(vmm_vmstats.pgmc_allocempty); + return (NULL); + } + + nwaybins = 0; + cpucolors = cpu_page_colors; + /* + * adjust cpucolors to possibly check additional 'equivalent' bins + * to try to minimize fragmentation of large pages by delaying calls + * to page_freelist_fill. + */ + if (colorequiv > 1) { + int equivcolors = page_colors / colorequiv; + + if (equivcolors && (cpucolors == 0 || equivcolors < cpucolors)) + cpucolors = equivcolors; + } + + /* + * Only hold one cachelist lock at a time, that way we + * can start anywhere and not have to worry about lock + * ordering. + */ + +big_try_again: + nwaycnt = 0; + for (i = 0; i <= page_colors; i++) { + if (PAGE_CACHELISTS(mnode, bin, mtype)) { + pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST); + mutex_enter(pcm); + pp = PAGE_CACHELISTS(mnode, bin, mtype); + if (pp != NULL) { + first_pp = pp; + ASSERT(pp->p_vnode); + ASSERT(PP_ISAGED(pp) == 0); + ASSERT(pp->p_szc == 0); + ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); + while (!page_trylock(pp, SE_EXCL)) { + pp = pp->p_next; + ASSERT(pp->p_szc == 0); + if (pp == first_pp) { + /* + * We have searched the + * complete list! + * And all of them (might + * only be one) are locked. + * This can happen since + * these pages can also be + * found via the hash list. + * When found via the hash + * list, they are locked + * first, then removed. + * We give up to let the + * other thread run. + */ + pp = NULL; + break; + } + ASSERT(pp->p_vnode); + ASSERT(PP_ISFREE(pp)); + ASSERT(PP_ISAGED(pp) == 0); + ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == + mnode); + } + + if (pp) { + page_t **ppp; + /* + * Found and locked a page. + * Pull it off the list. + */ + ASSERT(mtype == PP_2_MTYPE(pp)); + ppp = &PAGE_CACHELISTS(mnode, bin, + mtype); + page_sub(ppp, pp); + /* + * Subtract counters before releasing + * pcm mutex to avoid a race with + * page_freelist_coalesce and + * page_freelist_fill. + */ + page_ctr_sub(pp, PG_CACHE_LIST); + mutex_exit(pcm); + ASSERT(pp->p_vnode); + ASSERT(PP_ISAGED(pp) == 0); +#if defined(__sparc) + ASSERT(!kcage_on || + (flags & PG_NORELOC) == 0 || + PP_ISNORELOC(pp)); + if (PP_ISNORELOC(pp)) { + kcage_freemem_sub(1); + } +#endif + VM_STAT_ADD(vmm_vmstats. + pgmc_allocok); + return (pp); + } + } + mutex_exit(pcm); + } + + /* + * Wow! The initial bin is empty or no page in the bin could + * be locked. + * + * If specific color is needed, check if page color may be in + * other bins. 
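+		 * nwaybins is the number of page color bins that the CPU
+		 * cache treats as equivalent (page_colors / cpucolors);
+		 * nwaycnt counts how many of them have been probed, and the
+		 * color-constrained search gives up once all of them have
+		 * been tried.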
+ */ + if ((flags & PG_MATCH_COLOR) && (cpucolors != 0)) { + if (!nwaybins) { + if (cpucolors < 0) { + cpucolors = CPUSETSIZE() / MMU_PAGESIZE; + ASSERT(cpucolors > 0); + nwaybins = page_colors / cpucolors; + if (nwaybins < 2) + cpucolors = 0; + } else { + nwaybins = page_colors / cpucolors; + ASSERT(nwaybins > 1); + } + } + + if (++nwaycnt >= nwaybins) { + break; + } + bin = (bin + (page_colors / nwaybins)) & + page_colors_mask; + continue; + } + + if (i == 0) { + bin = (bin + BIN_STEP) & page_colors_mask; + bin_marker = bin; + } else { + bin = (bin + vac_colors) & page_colors_mask; + if (bin == bin_marker) { + bin = (bin + 1) & page_colors_mask; + bin_marker = bin; + } + } + } + +#if defined(__sparc) + if (!(flags & (PG_NORELOC | PGI_NOCAGE | PGI_RELOCONLY)) && + (kcage_freemem >= kcage_lotsfree)) { + /* + * The Cage is ON and with plenty of free mem, and + * we're willing to check for a NORELOC page if we + * couldn't find a RELOC page, so spin again. + */ + flags |= PG_NORELOC; + mtype = MTYPE_NORELOC; + goto big_try_again; + } +#else + if (flags & PGI_MT_RANGE) { + MTYPE_NEXT(mnode, mtype, flags); + if (mtype >= 0) + goto big_try_again; + } +#endif + VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed); + return (NULL); +} + +#ifdef DEBUG +#define REPL_PAGE_STATS +#endif /* DEBUG */ + +#ifdef REPL_PAGE_STATS +struct repl_page_stats { + uint_t ngets; + uint_t ngets_noreloc; + uint_t npgr_noreloc; + uint_t nnopage_first; + uint_t nnopage; + uint_t nhashout; + uint_t nnofree; + uint_t nnext_pp; +} repl_page_stats; +#define REPL_STAT_INCR(v) atomic_add_32(&repl_page_stats.v, 1) +#else /* REPL_PAGE_STATS */ +#define REPL_STAT_INCR(v) +#endif /* REPL_PAGE_STATS */ + +int pgrppgcp; + +/* + * The freemem accounting must be done by the caller. + * First we try to get a replacement page of the same size as like_pp, + * if that is not possible, then we just get a set of discontiguous + * PAGESIZE pages. + */ +page_t * +page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp, + uint_t pgrflags) +{ + page_t *like_pp; + page_t *pp, *pplist; + page_t *pl = NULL; + ulong_t bin; + int mnode, page_mnode; + int szc; + spgcnt_t npgs, pg_cnt; + pfn_t pfnum; + int mtype; + int flags = 0; + lgrp_mnode_cookie_t lgrp_cookie; + + + REPL_STAT_INCR(ngets); + like_pp = orig_like_pp; + ASSERT(PAGE_EXCL(like_pp)); + + szc = like_pp->p_szc; + npgs = page_get_pagecnt(szc); + /* + * Now we reset like_pp to the base page_t. + * That way, we won't walk past the end of this 'szc' page. + */ + pfnum = PFN_BASE(like_pp->p_pagenum, szc); + like_pp = page_numtopp_nolock(pfnum); + ASSERT(like_pp->p_szc == szc); + + if (PP_ISNORELOC(like_pp)) { + ASSERT(kcage_on); + REPL_STAT_INCR(ngets_noreloc); + flags = PGI_RELOCONLY; + } else if (pgrflags & PGR_NORELOC) { + ASSERT(kcage_on); + REPL_STAT_INCR(npgr_noreloc); + flags = PG_NORELOC; + } + + /* + * Kernel pages must always be replaced with the same size + * pages, since we cannot properly handle demotion of kernel + * pages. + */ + if (like_pp->p_vnode == &kvp) + pgrflags |= PGR_SAMESZC; + + /* LINTED */ + MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode); + + while (npgs) { + pplist = NULL; + for (;;) { + pg_cnt = page_get_pagecnt(szc); + bin = PP_2_BIN(like_pp); + ASSERT(like_pp->p_szc == orig_like_pp->p_szc); + ASSERT(pg_cnt <= npgs); + + /* + * If an lgroup was specified, try to get the + * page from that lgroup. 
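+			 * Only that lgroup's memory nodes are searched:
+			 * its freelists first and, for base pagesize
+			 * requests, its cachelists as well.  If nothing
+			 * is found there we bail out rather than fall
+			 * through to the mnode-based search below.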
+ */ + if (LGRP_EXISTS(lgrp)) { + /* Try the lgroup's freelists first */ + LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, + LGRP_SRCH_LOCAL); + while ((pplist == NULL) && + (mnode = lgrp_memnode_choose(&lgrp_cookie)) + != -1) { + pplist = page_get_mnode_freelist( + mnode, bin, mtype, szc, + flags); + } + + /* + * Now try it's cachelists if this is a + * small page. Don't need to do it for + * larger ones since page_freelist_coalesce() + * already failed. + */ + if (pplist != NULL || szc != 0) + break; + + /* Now try it's cachelists */ + LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, + LGRP_SRCH_LOCAL); + + while ((pplist == NULL) && + (mnode = lgrp_memnode_choose(&lgrp_cookie)) + != -1) { + pplist = page_get_mnode_cachelist( + bin, flags, mnode, mtype); + } + if (pplist != NULL) { + page_hashout(pplist, NULL); + PP_SETAGED(pplist); + REPL_STAT_INCR(nhashout); + break; + } + /* Done looking in this lgroup. Bail out. */ + break; + } + + ASSERT(!LGRP_EXISTS(lgrp)); + /* + * No lgroup was specified, so just try to get the + * page as close to like_pp's mnode as possible. + * First try the local freelist... + */ + mnode = PP_2_MEM_NODE(like_pp); + pplist = page_get_mnode_freelist(mnode, bin, + mtype, szc, flags); + if (pplist != NULL) + break; + + REPL_STAT_INCR(nnofree); + + /* + * ...then the local cachelist. Don't need to do it for + * larger pages cause page_freelist_coalesce() already + * failed there anyway. + */ + if (szc == 0) { + pplist = page_get_mnode_cachelist(bin, flags, + mnode, mtype); + if (pplist != NULL) { + page_hashout(pplist, NULL); + PP_SETAGED(pplist); + REPL_STAT_INCR(nhashout); + break; + } + } + + /* Now try remote freelists */ + page_mnode = mnode; + lgrp = + lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode)); + LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, + LGRP_SRCH_HIER); + while (pplist == NULL && + (mnode = lgrp_memnode_choose(&lgrp_cookie)) + != -1) { + /* + * Skip local mnode. + */ + if ((mnode == page_mnode) || + (mem_node_config[mnode].exists == 0)) + continue; + + pplist = page_get_mnode_freelist(mnode, + bin, mtype, szc, flags); + } + + if (pplist != NULL) + break; + + + /* Now try remote cachelists */ + LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, + LGRP_SRCH_HIER); + while (pplist == NULL && szc == 0) { + mnode = lgrp_memnode_choose(&lgrp_cookie); + if (mnode == -1) + break; + /* + * Skip local mnode. + */ + if ((mnode == page_mnode) || + (mem_node_config[mnode].exists == 0)) + continue; + + pplist = page_get_mnode_cachelist(bin, + flags, mnode, mtype); + + if (pplist != NULL) { + page_hashout(pplist, NULL); + PP_SETAGED(pplist); + REPL_STAT_INCR(nhashout); + break; + } + } + + /* + * Break out of while loop under the following cases: + * - If we successfully got a page. + * - If pgrflags specified only returning a specific + * page size and we could not find that page size. + * - If we could not satisfy the request with PAGESIZE + * or larger pages. + */ + if (pplist != NULL || szc == 0) + break; + + if ((pgrflags & PGR_SAMESZC) || pgrppgcp) { + /* try to find contig page */ + + LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, + LGRP_SRCH_HIER); + + while ((pplist == NULL) && + (mnode = + lgrp_memnode_choose(&lgrp_cookie)) + != -1) { + pplist = page_get_contig_pages( + mnode, bin, mtype, szc, + flags | PGI_PGCPHIPRI); + } + break; + } + + /* + * The correct thing to do here is try the next + * page size down using szc--. 
Due to a bug + * with the processing of HAT_RELOAD_SHARE + * where the sfmmu_ttecnt arrays of all + * hats sharing an ISM segment don't get updated, + * using intermediate size pages for relocation + * can lead to continuous page faults. + */ + szc = 0; + } + + if (pplist != NULL) { + DTRACE_PROBE4(page__get, + lgrp_t *, lgrp, + int, mnode, + ulong_t, bin, + uint_t, flags); + + while (pplist != NULL && pg_cnt--) { + ASSERT(pplist != NULL); + pp = pplist; + page_sub(&pplist, pp); + PP_CLRFREE(pp); + PP_CLRAGED(pp); + page_list_concat(&pl, &pp); + npgs--; + like_pp = like_pp + 1; + REPL_STAT_INCR(nnext_pp); + } + ASSERT(pg_cnt == 0); + } else { + break; + } + } + + if (npgs) { + /* + * We were unable to allocate the necessary number + * of pages. + * We need to free up any pl. + */ + REPL_STAT_INCR(nnopage); + page_free_replacement_page(pl); + return (NULL); + } else { + return (pl); + } +} + +/* + * demote a free large page to it's constituent pages + */ +void +page_demote_free_pages(page_t *pp) +{ + + int mnode; + + ASSERT(pp != NULL); + ASSERT(PAGE_LOCKED(pp)); + ASSERT(PP_ISFREE(pp)); + ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes); + + mnode = PP_2_MEM_NODE(pp); + page_freelist_lock(mnode); + if (pp->p_szc != 0) { + (void) page_demote(mnode, PFN_BASE(pp->p_pagenum, + pp->p_szc), pp->p_szc, 0, PC_NO_COLOR, PC_FREE); + } + page_freelist_unlock(mnode); + ASSERT(pp->p_szc == 0); +} diff --git a/usr/src/uts/common/vm/vm_pvn.c b/usr/src/uts/common/vm/vm_pvn.c new file mode 100644 index 0000000000..fcafb5f803 --- /dev/null +++ b/usr/src/uts/common/vm/vm_pvn.c @@ -0,0 +1,1147 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * VM - paged vnode. + * + * This file supplies vm support for the vnode operations that deal with pages. 
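+ * It provides klustering of pages for read and write (pvn_read_kluster,
+ * pvn_write_kluster), i/o completion handling (pvn_read_done,
+ * pvn_write_done), dirty page identification (pvn_getdirty), walking of a
+ * vnode's page list (pvn_vplist_dirty) and helpers for file system getpage
+ * routines (pvn_getpages, pvn_plist_init).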
+ */ +#include <sys/types.h> +#include <sys/t_lock.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/time.h> +#include <sys/buf.h> +#include <sys/vnode.h> +#include <sys/uio.h> +#include <sys/vmmeter.h> +#include <sys/vmsystm.h> +#include <sys/mman.h> +#include <sys/vfs.h> +#include <sys/cred.h> +#include <sys/user.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/cpuvar.h> +#include <sys/vtrace.h> +#include <sys/tnf_probe.h> + +#include <vm/hat.h> +#include <vm/as.h> +#include <vm/seg.h> +#include <vm/rm.h> +#include <vm/pvn.h> +#include <vm/page.h> +#include <vm/seg_map.h> +#include <vm/seg_kmem.h> +#include <sys/fs/swapnode.h> + +int pvn_nofodklust = 0; +int pvn_write_noklust = 0; + +uint_t pvn_vmodsort_supported = 0; /* set if HAT supports VMODSORT */ +uint_t pvn_vmodsort_disable = 0; /* set in /etc/system to disable HAT */ + /* support for vmodsort for testing */ + +static struct kmem_cache *marker_cache = NULL; + +/* + * Find the largest contiguous block which contains `addr' for file offset + * `offset' in it while living within the file system block sizes (`vp_off' + * and `vp_len') and the address space limits for which no pages currently + * exist and which map to consecutive file offsets. + */ +page_t * +pvn_read_kluster( + struct vnode *vp, + u_offset_t off, + struct seg *seg, + caddr_t addr, + u_offset_t *offp, /* return values */ + size_t *lenp, /* return values */ + u_offset_t vp_off, + size_t vp_len, + int isra) +{ + ssize_t deltaf, deltab; + page_t *pp; + page_t *plist = NULL; + spgcnt_t pagesavail; + u_offset_t vp_end; + + ASSERT(off >= vp_off && off < vp_off + vp_len); + + /* + * We only want to do klustering/read ahead if there + * is more than minfree pages currently available. + */ + pagesavail = freemem - minfree; + + if (pagesavail <= 0) + if (isra) + return ((page_t *)NULL); /* ra case - give up */ + else + pagesavail = 1; /* must return a page */ + + /* We calculate in pages instead of bytes due to 32-bit overflows */ + if (pagesavail < (spgcnt_t)btopr(vp_len)) { + /* + * Don't have enough free memory for the + * max request, try sizing down vp request. + */ + deltab = (ssize_t)(off - vp_off); + vp_len -= deltab; + vp_off += deltab; + if (pagesavail < btopr(vp_len)) { + /* + * Still not enough memory, just settle for + * pagesavail which is at least 1. + */ + vp_len = ptob(pagesavail); + } + } + + vp_end = vp_off + vp_len; + ASSERT(off >= vp_off && off < vp_end); + + if (isra && SEGOP_KLUSTER(seg, addr, 0)) + return ((page_t *)NULL); /* segment driver says no */ + + if ((plist = page_create_va(vp, off, + PAGESIZE, PG_EXCL | PG_WAIT, seg, addr)) == NULL) + return ((page_t *)NULL); + + if (vp_len <= PAGESIZE || pvn_nofodklust) { + *offp = off; + *lenp = MIN(vp_len, PAGESIZE); + } else { + /* + * Scan back from front by incrementing "deltab" and + * comparing "off" with "vp_off + deltab" to avoid + * "signed" versus "unsigned" conversion problems. + */ + for (deltab = PAGESIZE; off >= vp_off + deltab; + deltab += PAGESIZE) { + /* + * Call back to the segment driver to verify that + * the klustering/read ahead operation makes sense. + */ + if (SEGOP_KLUSTER(seg, addr, -deltab)) + break; /* page not eligible */ + if ((pp = page_create_va(vp, off - deltab, + PAGESIZE, PG_EXCL, seg, addr - deltab)) + == NULL) + break; /* already have the page */ + /* + * Add page to front of page list. 
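+			 * Since we are scanning backwards, each page created
+			 * here has a lower offset than those already on the
+			 * list, so adding it at the front keeps the list
+			 * ordered by ascending file offset.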
+ */ + page_add(&plist, pp); + } + deltab -= PAGESIZE; + + /* scan forward from front */ + for (deltaf = PAGESIZE; off + deltaf < vp_end; + deltaf += PAGESIZE) { + /* + * Call back to the segment driver to verify that + * the klustering/read ahead operation makes sense. + */ + if (SEGOP_KLUSTER(seg, addr, deltaf)) + break; /* page not file extension */ + if ((pp = page_create_va(vp, off + deltaf, + PAGESIZE, PG_EXCL, seg, addr + deltaf)) + == NULL) + break; /* already have page */ + + /* + * Add page to end of page list. + */ + page_add(&plist, pp); + plist = plist->p_next; + } + *offp = off = off - deltab; + *lenp = deltab + deltaf; + ASSERT(off >= vp_off); + + /* + * If we ended up getting more than was actually + * requested, retract the returned length to only + * reflect what was requested. This might happen + * if we were allowed to kluster pages across a + * span of (say) 5 frags, and frag size is less + * than PAGESIZE. We need a whole number of + * pages to contain those frags, but the returned + * size should only allow the returned range to + * extend as far as the end of the frags. + */ + if ((vp_off + vp_len) < (off + *lenp)) { + ASSERT(vp_end > off); + *lenp = vp_end - off; + } + } + TRACE_3(TR_FAC_VM, TR_PVN_READ_KLUSTER, + "pvn_read_kluster:seg %p addr %x isra %x", + seg, addr, isra); + return (plist); +} + +/* + * Handle pages for this vnode on either side of the page "pp" + * which has been locked by the caller. This routine will also + * do klustering in the range [vp_off, vp_off + vp_len] up + * until a page which is not found. The offset and length + * of pages included is returned in "*offp" and "*lenp". + * + * Returns a list of dirty locked pages all ready to be + * written back. + */ +page_t * +pvn_write_kluster( + struct vnode *vp, + page_t *pp, + u_offset_t *offp, /* return values */ + size_t *lenp, /* return values */ + u_offset_t vp_off, + size_t vp_len, + int flags) +{ + u_offset_t off; + page_t *dirty; + size_t deltab, deltaf; + se_t se; + u_offset_t vp_end; + + off = pp->p_offset; + + /* + * Kustering should not be done if we are invalidating + * pages since we could destroy pages that belong to + * some other process if this is a swap vnode. + */ + if (pvn_write_noklust || ((flags & B_INVAL) && IS_SWAPVP(vp))) { + *offp = off; + *lenp = PAGESIZE; + return (pp); + } + + if (flags & (B_FREE | B_INVAL)) + se = SE_EXCL; + else + se = SE_SHARED; + + dirty = pp; + /* + * Scan backwards looking for pages to kluster by incrementing + * "deltab" and comparing "off" with "vp_off + deltab" to + * avoid "signed" versus "unsigned" conversion problems. + */ + for (deltab = PAGESIZE; off >= vp_off + deltab; deltab += PAGESIZE) { + pp = page_lookup_nowait(vp, off - deltab, se); + if (pp == NULL) + break; /* page not found */ + if (pvn_getdirty(pp, flags | B_DELWRI) == 0) + break; + page_add(&dirty, pp); + } + deltab -= PAGESIZE; + + vp_end = vp_off + vp_len; + /* now scan forwards looking for pages to kluster */ + for (deltaf = PAGESIZE; off + deltaf < vp_end; deltaf += PAGESIZE) { + pp = page_lookup_nowait(vp, off + deltaf, se); + if (pp == NULL) + break; /* page not found */ + if (pvn_getdirty(pp, flags | B_DELWRI) == 0) + break; + page_add(&dirty, pp); + dirty = dirty->p_next; + } + + *offp = off - deltab; + *lenp = deltab + deltaf; + return (dirty); +} + +/* + * Generic entry point used to release the "shared/exclusive" lock + * and the "p_iolock" on pages after i/o is complete. 
+ */ +void +pvn_io_done(page_t *plist) +{ + page_t *pp; + + while (plist != NULL) { + pp = plist; + page_sub(&plist, pp); + page_io_unlock(pp); + page_unlock(pp); + } +} + +/* + * Entry point to be used by file system getpage subr's and + * other such routines which either want to unlock pages (B_ASYNC + * request) or destroy a list of pages if an error occurred. + */ +void +pvn_read_done(page_t *plist, int flags) +{ + page_t *pp; + + while (plist != NULL) { + pp = plist; + page_sub(&plist, pp); + page_io_unlock(pp); + if (flags & B_ERROR) { + /*LINTED: constant in conditional context*/ + VN_DISPOSE(pp, B_INVAL, 0, kcred); + } else { + (void) page_release(pp, 0); + } + } +} + +/* + * Automagic pageout. + * When memory gets tight, start freeing pages popping out of the + * write queue. + */ +int write_free = 1; +pgcnt_t pages_before_pager = 200; /* LMXXX */ + +/* + * Routine to be called when page-out's complete. + * The caller, typically VOP_PUTPAGE, has to explicity call this routine + * after waiting for i/o to complete (biowait) to free the list of + * pages associated with the buffer. These pages must be locked + * before i/o is initiated. + * + * If a write error occurs, the pages are marked as modified + * so the write will be re-tried later. + */ + +void +pvn_write_done(page_t *plist, int flags) +{ + int dfree = 0; + int pgrec = 0; + int pgout = 0; + int pgpgout = 0; + int anonpgout = 0; + int anonfree = 0; + int fspgout = 0; + int fsfree = 0; + int execpgout = 0; + int execfree = 0; + page_t *pp; + struct cpu *cpup; + struct vnode *vp = NULL; /* for probe */ + uint_t ppattr; + + ASSERT((flags & B_READ) == 0); + + /* + * If we are about to start paging anyway, start freeing pages. + */ + if (write_free && freemem < lotsfree + pages_before_pager && + (flags & B_ERROR) == 0) { + flags |= B_FREE; + } + + /* + * Handle each page involved in the i/o operation. + */ + while (plist != NULL) { + pp = plist; + ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp)); + page_sub(&plist, pp); + + /* Kernel probe support */ + if (vp == NULL) + vp = pp->p_vnode; + + if (flags & B_ERROR) { + /* + * Write operation failed. We don't want + * to destroy (or free) the page unless B_FORCE + * is set. We set the mod bit again and release + * all locks on the page so that it will get written + * back again later when things are hopefully + * better again. + * If B_INVAL and B_FORCE is set we really have + * to destroy the page. + */ + if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) { + page_io_unlock(pp); + /*LINTED: constant in conditional context*/ + VN_DISPOSE(pp, B_INVAL, 0, kcred); + } else { + hat_setmod(pp); + page_io_unlock(pp); + page_unlock(pp); + } + } else if (flags & B_INVAL) { + /* + * XXX - Failed writes with B_INVAL set are + * not handled appropriately. + */ + page_io_unlock(pp); + /*LINTED: constant in conditional context*/ + VN_DISPOSE(pp, B_INVAL, 0, kcred); + } else if (flags & B_FREE ||!hat_page_is_mapped(pp)) { + /* + * Update statistics for pages being paged out + */ + if (pp->p_vnode) { + if (IS_SWAPFSVP(pp->p_vnode)) { + anonpgout++; + } else { + if (pp->p_vnode->v_flag & VVMEXEC) { + execpgout++; + } else { + fspgout++; + } + } + } + page_io_unlock(pp); + pgout = 1; + pgpgout++; + TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT, + "page_ws_out:pp %p", pp); + + /* + * The page_struct_lock need not be acquired to + * examine "p_lckcnt" and "p_cowcnt" since we'll + * have an "exclusive" lock if the upgrade succeeds. 
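+			 * If the upgrade fails, or the page is locked or
+			 * COW-locked, the else clause below simply drops
+			 * the lock and leaves the page in place rather
+			 * than freeing it.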
+ */ + if (page_tryupgrade(pp) && + pp->p_lckcnt == 0 && pp->p_cowcnt == 0) { + /* + * Check if someone has reclaimed the + * page. If ref and mod are not set, no + * one is using it so we can free it. + * The rest of the system is careful + * to use the NOSYNC flag to unload + * translations set up for i/o w/o + * affecting ref and mod bits. + * + * Obtain a copy of the real hardware + * mod bit using hat_pagesync(pp, HAT_DONTZERO) + * to avoid having to flush the cache. + */ + ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO | + HAT_SYNC_STOPON_MOD); + ck_refmod: + if (!(ppattr & (P_REF | P_MOD))) { + if (hat_page_is_mapped(pp)) { + /* + * Doesn't look like the page + * was modified so now we + * really have to unload the + * translations. Meanwhile + * another CPU could've + * modified it so we have to + * check again. We don't loop + * forever here because now + * the translations are gone + * and no one can get a new one + * since we have the "exclusive" + * lock on the page. + */ + (void) hat_pageunload(pp, + HAT_FORCE_PGUNLOAD); + ppattr = hat_page_getattr(pp, + P_REF | P_MOD); + goto ck_refmod; + } + /* + * Update statistics for pages being + * freed + */ + if (pp->p_vnode) { + if (IS_SWAPFSVP(pp->p_vnode)) { + anonfree++; + } else { + if (pp->p_vnode->v_flag + & VVMEXEC) { + execfree++; + } else { + fsfree++; + } + } + } + /*LINTED: constant in conditional ctx*/ + VN_DISPOSE(pp, B_FREE, + (flags & B_DONTNEED), kcred); + dfree++; + } else { + page_unlock(pp); + pgrec++; + TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE, + "page_ws_free:pp %p", pp); + } + } else { + /* + * Page is either `locked' in memory + * or was reclaimed and now has a + * "shared" lock, so release it. + */ + page_unlock(pp); + } + } else { + /* + * Neither B_FREE nor B_INVAL nor B_ERROR. + * Just release locks. + */ + page_io_unlock(pp); + page_unlock(pp); + } + } + + CPU_STATS_ENTER_K(); + cpup = CPU; /* get cpup now that CPU cannot change */ + CPU_STATS_ADDQ(cpup, vm, dfree, dfree); + CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec); + CPU_STATS_ADDQ(cpup, vm, pgout, pgout); + CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout); + CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout); + CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree); + CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout); + CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree); + CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout); + CPU_STATS_ADDQ(cpup, vm, execfree, execfree); + CPU_STATS_EXIT_K(); + + /* Kernel probe */ + TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */, + tnf_opaque, vnode, vp, + tnf_ulong, pages_pageout, pgpgout, + tnf_ulong, pages_freed, dfree, + tnf_ulong, pages_reclaimed, pgrec); +} + +/* + * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI, + * B_TRUNC, B_FORCE}. B_DELWRI indicates that this page is part of a kluster + * operation and is only to be considered if it doesn't involve any + * waiting here. B_TRUNC indicates that the file is being truncated + * and so no i/o needs to be done. B_FORCE indicates that the page + * must be destroyed so don't try wrting it out. + * + * The caller must ensure that the page is locked. Returns 1, if + * the page should be written back (the "iolock" is held in this + * case), or 0 if the page has been dealt with or has been + * unlocked. + */ +int +pvn_getdirty(page_t *pp, int flags) +{ + ASSERT((flags & (B_INVAL | B_FREE)) ? + PAGE_EXCL(pp) : PAGE_SHARED(pp)); + ASSERT(PP_ISFREE(pp) == 0); + + /* + * If trying to invalidate or free a logically `locked' page, + * forget it. 
Don't need page_struct_lock to check p_lckcnt and + * p_cowcnt as the page is exclusively locked. + */ + if ((flags & (B_INVAL | B_FREE)) && !(flags & (B_TRUNC|B_FORCE)) && + (pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) { + page_unlock(pp); + return (0); + } + + /* + * Now acquire the i/o lock so we can add it to the dirty + * list (if necessary). We avoid blocking on the i/o lock + * in the following cases: + * + * If B_DELWRI is set, which implies that this request is + * due to a klustering operartion. + * + * If this is an async (B_ASYNC) operation and we are not doing + * invalidation (B_INVAL) [The current i/o or fsflush will ensure + * that the the page is written out]. + */ + if ((flags & B_DELWRI) || ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC)) { + if (!page_io_trylock(pp)) { + page_unlock(pp); + return (0); + } + } else { + page_io_lock(pp); + } + + /* + * If we want to free or invalidate the page then + * we need to unload it so that anyone who wants + * it will have to take a minor fault to get it. + * Otherwise, we're just writing the page back so we + * need to sync up the hardwre and software mod bit to + * detect any future modifications. We clear the + * software mod bit when we put the page on the dirty + * list. + */ + if (flags & (B_INVAL | B_FREE)) { + (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); + } else { + (void) hat_pagesync(pp, HAT_SYNC_ZERORM); + } + + if (!hat_ismod(pp) || (flags & B_TRUNC)) { + /* + * Don't need to add it to the + * list after all. + */ + page_io_unlock(pp); + if (flags & B_INVAL) { + /*LINTED: constant in conditional context*/ + VN_DISPOSE(pp, B_INVAL, 0, kcred); + } else if (flags & B_FREE) { + /*LINTED: constant in conditional context*/ + VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred); + } else { + /* + * This is advisory path for the callers + * of VOP_PUTPAGE() who prefer freeing the + * page _only_ if no one else is accessing it. + * E.g. segmap_release() + * + * The above hat_ismod() check is useless because: + * (1) we may not be holding SE_EXCL lock; + * (2) we've not unloaded _all_ translations + * + * Let page_release() do the heavy-lifting. + */ + (void) page_release(pp, 1); + } + return (0); + } + + /* + * Page is dirty, get it ready for the write back + * and add page to the dirty list. + */ + hat_clrrefmod(pp); + + /* + * If we're going to free the page when we're done + * then we can let others try to use it starting now. + * We'll detect the fact that they used it when the + * i/o is done and avoid freeing the page. + */ + if (flags & B_FREE) + page_downgrade(pp); + + + TRACE_1(TR_FAC_VM, TR_PVN_GETDIRTY, "pvn_getdirty:pp %p", pp); + + return (1); +} + + +/*ARGSUSED*/ +static int +marker_constructor(void *buf, void *cdrarg, int kmflags) +{ + page_t *mark = buf; + bzero(mark, sizeof (page_t)); + return (0); +} + +void +pvn_init() +{ + if (pvn_vmodsort_disable == 0) + pvn_vmodsort_supported = hat_supported(HAT_VMODSORT, NULL); + marker_cache = kmem_cache_create("marker_cache", + sizeof (page_t), 0, marker_constructor, + NULL, NULL, NULL, NULL, 0); +} + + +/* + * Process a vnode's page list for all pages whose offset is >= off. + * Pages are to either be free'd, invalidated, or written back to disk. + * + * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE + * is specified, otherwise they are "shared" locked. + * + * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC} + * + * Special marker page_t's are inserted in the list in order + * to keep track of where we are in the list when locks are dropped. 
+ * + * Note the list is circular and insertions can happen only at the + * head and tail of the list. The algorithm ensures visiting all pages + * on the list in the following way: + * + * Drop two marker pages at the end of the list. + * + * Move one marker page backwards towards the start of the list until + * it is at the list head, processing the pages passed along the way. + * + * Due to race conditions when the vphm mutex is dropped, additional pages + * can be added to either end of the list, so we'll continue to move + * the marker and process pages until it is up against the end marker. + * + * There is one special exit condition. If we are processing a VMODSORT + * vnode and only writing back modified pages, we can stop as soon as + * we run into an unmodified page. This makes fsync(3) operations fast. + */ +int +pvn_vplist_dirty( + vnode_t *vp, + u_offset_t off, + int (*putapage)(vnode_t *, page_t *, u_offset_t *, + size_t *, int, cred_t *), + int flags, + cred_t *cred) +{ + page_t *pp; + page_t *mark; /* marker page that moves toward head */ + page_t *end; /* marker page at end of list */ + int err = 0; + int error; + kmutex_t *vphm; + se_t se; + page_t **where_to_move; + + ASSERT(vp->v_type != VCHR); + + if (vp->v_pages == NULL) + return (0); + + + /* + * Serialize vplist_dirty operations on this vnode by setting VVMLOCK. + * + * Don't block on VVMLOCK if B_ASYNC is set. This prevents sync() + * from getting blocked while flushing pages to a dead NFS server. + */ + mutex_enter(&vp->v_lock); + if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) { + mutex_exit(&vp->v_lock); + return (EAGAIN); + } + + while (vp->v_flag & VVMLOCK) + cv_wait(&vp->v_cv, &vp->v_lock); + + if (vp->v_pages == NULL) { + mutex_exit(&vp->v_lock); + return (0); + } + + vp->v_flag |= VVMLOCK; + mutex_exit(&vp->v_lock); + + + /* + * Set up the marker pages used to walk the list + */ + end = kmem_cache_alloc(marker_cache, KM_SLEEP); + end->p_vnode = vp; + end->p_offset = (u_offset_t)-2; + mark = kmem_cache_alloc(marker_cache, KM_SLEEP); + mark->p_vnode = vp; + mark->p_offset = (u_offset_t)-1; + + /* + * Grab the lock protecting the vnode's page list + * note that this lock is dropped at times in the loop. + */ + vphm = page_vnode_mutex(vp); + mutex_enter(vphm); + if (vp->v_pages == NULL) + goto leave; + + /* + * insert the markers and loop through the list of pages + */ + page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark); + page_vpadd(&mark->p_vpnext, end); + for (;;) { + + /* + * If only doing an async write back, then we can + * stop as soon as we get to start of the list. + */ + if (flags == B_ASYNC && vp->v_pages == mark) + break; + + /* + * otherwise stop when we've gone through all the pages + */ + if (mark->p_vpprev == end) + break; + + pp = mark->p_vpprev; + if (vp->v_pages == pp) + where_to_move = &vp->v_pages; + else + where_to_move = &pp->p_vpprev->p_vpnext; + + ASSERT(pp->p_vnode == vp); + + /* + * Skip this page if the offset is out of the desired range. + * Just move the marker and continue. + */ + if (pp->p_offset < off) { + page_vpsub(&vp->v_pages, mark); + page_vpadd(where_to_move, mark); + continue; + } + + /* + * If just flushing dirty pages to disk and this vnode + * is using a sorted list of pages, we can stop processing + * as soon as we find an unmodified page. Since all the + * modified pages are visited first. 
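+		 * On DEBUG kernels the block below re-walks the rest of
+		 * the list and asserts that every remaining page really
+		 * is clean before we stop.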
+ */ + if (IS_VMODSORT(vp) && + !(flags & (B_INVAL | B_FREE | B_TRUNC)) && + !hat_ismod(pp)) { +#ifdef DEBUG + /* + * For debug kernels examine what should be all the + * remaining clean pages, asserting that they are + * not modified. + */ + page_t *chk = pp; + int attr; + + page_vpsub(&vp->v_pages, mark); + page_vpadd(where_to_move, mark); + do { + chk = chk->p_vpprev; + ASSERT(chk != end); + if (chk == mark) + continue; + attr = hat_page_getattr(chk, P_MOD | P_REF); + if ((attr & P_MOD) == 0) + continue; + panic("v_pages list not all clean: " + "page_t*=%p vnode=%p off=%lx " + "attr=0x%x last clean page_t*=%p\n", + (void *)chk, (void *)chk->p_vnode, + (long)chk->p_offset, attr, (void *)pp); + } while (chk != vp->v_pages); +#endif + break; + } + + /* + * If we are supposed to invalidate or free this + * page, then we need an exclusive lock. + */ + se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED; + + /* + * We must acquire the page lock for all synchronous + * operations (invalidate, free and write). + */ + if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) { + /* + * If the page_lock() drops the mutex + * we must retry the loop. + */ + if (!page_lock(pp, se, vphm, P_NO_RECLAIM)) + continue; + + /* + * It's ok to move the marker page now. + */ + page_vpsub(&vp->v_pages, mark); + page_vpadd(where_to_move, mark); + } else { + + /* + * update the marker page for all remaining cases + */ + page_vpsub(&vp->v_pages, mark); + page_vpadd(where_to_move, mark); + + /* + * For write backs, If we can't lock the page, it's + * invalid or in the process of being destroyed. Skip + * it, assuming someone else is writing it. + */ + if (!page_trylock(pp, se)) + continue; + } + + ASSERT(pp->p_vnode == vp); + + /* + * Successfully locked the page, now figure out what to + * do with it. Free pages are easily dealt with, invalidate + * if desired or just go on to the next page. + */ + if (PP_ISFREE(pp)) { + if ((flags & B_INVAL) == 0) { + page_unlock(pp); + continue; + } + + /* + * Invalidate (destroy) the page. + */ + mutex_exit(vphm); + page_destroy_free(pp); + mutex_enter(vphm); + continue; + } + + /* + * pvn_getdirty() figures out what do do with a dirty page. + * If the page is dirty, the putapage() routine will write it + * and will kluster any other adjacent dirty pages it can. + * + * pvn_getdirty() and `(*putapage)' unlock the page. + */ + mutex_exit(vphm); + if (pvn_getdirty(pp, flags)) { + error = (*putapage)(vp, pp, NULL, NULL, flags, cred); + if (!err) + err = error; + } + mutex_enter(vphm); + } + page_vpsub(&vp->v_pages, mark); + page_vpsub(&vp->v_pages, end); + +leave: + /* + * Release v_pages mutex, also VVMLOCK and wakeup blocked thrds + */ + mutex_exit(vphm); + kmem_cache_free(marker_cache, mark); + kmem_cache_free(marker_cache, end); + mutex_enter(&vp->v_lock); + vp->v_flag &= ~VVMLOCK; + cv_broadcast(&vp->v_cv); + mutex_exit(&vp->v_lock); + return (err); +} + +/* + * Zero out zbytes worth of data. Caller should be aware that this + * routine may enter back into the fs layer (xxx_getpage). Locks + * that the xxx_getpage routine may need should not be held while + * calling this. + */ +void +pvn_vpzero(struct vnode *vp, u_offset_t vplen, size_t zbytes) +{ + caddr_t addr; + + ASSERT(vp->v_type != VCHR); + + if (vp->v_pages == NULL) + return; + + /* + * zbytes may be zero but there still may be some portion of + * a page which needs clearing (since zbytes is a function + * of filesystem block size, not pagesize.) 
+ */ + if (zbytes == 0 && (PAGESIZE - (vplen & PAGEOFFSET)) == 0) + return; + + /* + * We get the last page and handle the partial + * zeroing via kernel mappings. This will make the page + * dirty so that we know that when this page is written + * back, the zeroed information will go out with it. If + * the page is not currently in memory, then the kzero + * operation will cause it to be brought it. We use kzero + * instead of bzero so that if the page cannot be read in + * for any reason, the system will not panic. We need + * to zero out a minimum of the fs given zbytes, but we + * might also have to do more to get the entire last page. + */ + + if ((zbytes + (vplen & MAXBOFFSET)) > MAXBSIZE) + panic("pvn_vptrunc zbytes"); + addr = segmap_getmapflt(segkmap, vp, vplen, + MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)), 1, S_WRITE); + (void) kzero(addr + (vplen & MAXBOFFSET), + MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET))); + (void) segmap_release(segkmap, addr, SM_WRITE | SM_ASYNC); +} + +/* + * Handles common work of the VOP_GETPAGE routines when more than + * one page must be returned by calling a file system specific operation + * to do most of the work. Must be called with the vp already locked + * by the VOP_GETPAGE routine. + */ +int +pvn_getpages( + int (*getpage)(vnode_t *, u_offset_t, size_t, uint_t *, page_t *[], + size_t, struct seg *, caddr_t, enum seg_rw, cred_t *), + struct vnode *vp, + u_offset_t off, + size_t len, + uint_t *protp, + page_t *pl[], + size_t plsz, + struct seg *seg, + caddr_t addr, + enum seg_rw rw, + struct cred *cred) +{ + page_t **ppp; + u_offset_t o, eoff; + size_t sz, xlen; + int err; + + ASSERT(plsz >= len); /* insure that we have enough space */ + + /* + * Loop one page at a time and let getapage function fill + * in the next page in array. We only allow one page to be + * returned at a time (except for the last page) so that we + * don't have any problems with duplicates and other such + * painful problems. This is a very simple minded algorithm, + * but it does the job correctly. We hope that the cost of a + * getapage call for a resident page that we might have been + * able to get from an earlier call doesn't cost too much. + */ + ppp = pl; + sz = PAGESIZE; + eoff = off + len; + xlen = len; + for (o = off; o < eoff; o += PAGESIZE, addr += PAGESIZE, + xlen -= PAGESIZE) { + if (o + PAGESIZE >= eoff) { + /* + * Last time through - allow the all of + * what's left of the pl[] array to be used. + */ + sz = plsz - (o - off); + } + err = (*getpage)(vp, o, xlen, protp, ppp, sz, seg, addr, + rw, cred); + if (err) { + /* + * Release any pages we already got. + */ + if (o > off && pl != NULL) { + for (ppp = pl; *ppp != NULL; *ppp++ = NULL) + (void) page_release(*ppp, 1); + } + break; + } + if (pl != NULL) + ppp++; + } + return (err); +} + +/* + * Initialize the page list array. + */ +void +pvn_plist_init(page_t *pp, page_t *pl[], size_t plsz, + u_offset_t off, size_t io_len, enum seg_rw rw) +{ + ssize_t sz; + page_t *ppcur, **ppp; + + if (plsz >= io_len) { + /* + * Everything fits, set up to load + * all the pages. + */ + sz = io_len; + } else { + /* + * Set up to load plsz worth + * starting at the needed page. + */ + while (pp->p_offset != off) { + /* XXX - Do we need this assert? */ + ASSERT(pp->p_next->p_offset != + pp->p_offset); + /* + * Remove page from the i/o list, + * release the i/o and the page lock. 
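+			 * Only plsz worth of pages starting at 'off' is
+			 * wanted in pl[], so the klustered pages that
+			 * precede 'off' are released here.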
+ */ + ppcur = pp; + page_sub(&pp, ppcur); + page_io_unlock(ppcur); + (void) page_release(ppcur, 1); + } + sz = plsz; + } + + /* + * Initialize the page list array. + */ + ppp = pl; + do { + ppcur = pp; + *ppp++ = ppcur; + page_sub(&pp, ppcur); + page_io_unlock(ppcur); + if (rw != S_CREATE) + page_downgrade(ppcur); + sz -= PAGESIZE; + } while (sz > 0 && pp != NULL); + *ppp = NULL; /* terminate list */ + + /* + * Now free the remaining pages that weren't + * loaded in the page list. + */ + while (pp != NULL) { + ppcur = pp; + page_sub(&pp, ppcur); + page_io_unlock(ppcur); + (void) page_release(ppcur, 1); + } +} diff --git a/usr/src/uts/common/vm/vm_rm.c b/usr/src/uts/common/vm/vm_rm.c new file mode 100644 index 0000000000..36cd5f0375 --- /dev/null +++ b/usr/src/uts/common/vm/vm_rm.c @@ -0,0 +1,189 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/t_lock.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/mman.h> +#include <sys/sysmacros.h> +#include <sys/errno.h> +#include <sys/signal.h> +#include <sys/user.h> +#include <sys/proc.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> + +#include <vm/hat.h> +#include <vm/as.h> +#include <vm/seg_vn.h> +#include <vm/rm.h> +#include <vm/seg.h> +#include <vm/page.h> + +/* + * Yield the size of an address space. + * + * The size can only be used as a hint since we cannot guarantee it + * will stay the same size unless the as->a_lock is held by the caller. + */ +size_t +rm_assize(struct as *as) +{ + size_t size = 0; + struct seg *seg; + struct segvn_data *svd; + extern struct seg_ops segdev_ops; /* needs a header file */ + + ASSERT(as != NULL && AS_READ_HELD(as, &as->a_lock)); + + if (as == &kas) + return (0); + + for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { + if (seg->s_ops == &segdev_ops && + ((SEGOP_GETTYPE(seg, seg->s_base) & + (MAP_SHARED | MAP_PRIVATE)) == 0)) { + /* + * Don't include mappings of /dev/null. These just + * reserve address space ranges and have no memory. + * We cheat by knowing that these segments come + * from segdev and have no mapping type. 
+ */ + /* EMPTY */; + } else if (seg->s_ops == &segvn_ops && + (svd = (struct segvn_data *)seg->s_data) != NULL && + (svd->vp == NULL || svd->vp->v_type != VREG) && + (svd->flags & MAP_NORESERVE)) { + /* + * Don't include MAP_NORESERVE pages in the + * address range unless their mappings have + * actually materialized. We cheat by knowing + * that segvn is the only segment driver that + * supports MAP_NORESERVE and that the actual + * number of bytes reserved is in the segment's + * private data structure. + */ + size += svd->swresv; + } else { + caddr_t addr = seg->s_base; + size_t segsize = seg->s_size; + vnode_t *vp; + vattr_t vattr; + + /* + * If the segment is mapped beyond the end of the + * underlying mapped file, if any, then limit the + * segment's size contribution to the file size. + */ + vattr.va_mask = AT_SIZE; + if (seg->s_ops == &segvn_ops && + SEGOP_GETVP(seg, addr, &vp) == 0 && + vp != NULL && vp->v_type == VREG && + VOP_GETATTR(vp, &vattr, ATTR_HINT, CRED()) == 0) { + u_offset_t filesize = vattr.va_size; + u_offset_t offset = SEGOP_GETOFFSET(seg, addr); + + if (filesize < offset) + filesize = 0; + else + filesize -= offset; + filesize = P2ROUNDUP_TYPED(filesize, PAGESIZE, + u_offset_t); + if ((u_offset_t)segsize > filesize) + segsize = filesize; + } + size += segsize; + } + } + + return (size); +} + +/* + * Yield the memory claim requirement for an address space. + * + * This is currently implemented as the number of active hardware + * translations that have page structures. Therefore, it can + * underestimate the traditional resident set size, eg, if the + * physical page is present and the hardware translation is missing; + * and it can overestimate the rss, eg, if there are active + * translations to a frame buffer with page structs. + * Also, it does not take sharing and XHATs into account. + */ +size_t +rm_asrss(as) + register struct as *as; +{ + if (as != (struct as *)NULL && as != &kas) + return ((size_t)btop(hat_get_mapped_size(as->a_hat))); + else + return (0); +} + +/* + * Return a 16-bit binary fraction representing the percent of total memory + * used by this address space. Binary point is to right of high-order bit. + * Defined as the ratio of a_rss for the process to total physical memory. + * This assumes 2s-complement arithmetic and that shorts and longs are + * 16 bits and 32 bits, respectively. + */ +ushort_t +rm_pctmemory(struct as *as) +{ + /* This can't overflow */ + ulong_t num = (ulong_t)rm_asrss(as) << (PAGESHIFT-1); + int shift = 16 - PAGESHIFT; + ulong_t total = total_pages; + + if (shift < 0) { + num >>= (-shift); + shift = 0; + } + while (shift > 0 && (num & 0x80000000) == 0) { + shift--; + num <<= 1; + } + if (shift > 0) + total >>= shift; + + return (num / total); +} diff --git a/usr/src/uts/common/vm/vm_seg.c b/usr/src/uts/common/vm/vm_seg.c new file mode 100644 index 0000000000..50cc21cdf7 --- /dev/null +++ b/usr/src/uts/common/vm/vm_seg.c @@ -0,0 +1,952 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * VM - segment management. + */ + +#include <sys/types.h> +#include <sys/inttypes.h> +#include <sys/t_lock.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kmem.h> +#include <sys/vmsystm.h> +#include <sys/debug.h> +#include <sys/cmn_err.h> +#include <sys/callb.h> +#include <sys/mem_config.h> + +#include <vm/hat.h> +#include <vm/as.h> +#include <vm/seg.h> +#include <vm/seg_kmem.h> + +/* + * kstats for segment advise + */ +segadvstat_t segadvstat = { + { "MADV_FREE_hit", KSTAT_DATA_ULONG }, + { "MADV_FREE_miss", KSTAT_DATA_ULONG }, +}; + +kstat_named_t *segadvstat_ptr = (kstat_named_t *)&segadvstat; +uint_t segadvstat_ndata = sizeof (segadvstat) / sizeof (kstat_named_t); + +/* #define PDEBUG */ +#if defined(PDEBUG) || defined(lint) || defined(__lint) +int pdebug = 0; +#else +#define pdebug 0 +#endif /* PDEBUG */ + +#define PPRINTF if (pdebug) printf +#define PPRINT(x) PPRINTF(x) +#define PPRINT1(x, a) PPRINTF(x, a) +#define PPRINT2(x, a, b) PPRINTF(x, a, b) +#define PPRINT3(x, a, b, c) PPRINTF(x, a, b, c) +#define PPRINT4(x, a, b, c, d) PPRINTF(x, a, b, c, d) +#define PPRINT5(x, a, b, c, d, e) PPRINTF(x, a, b, c, d, e) + +#define P_HASHMASK (p_hashsize - 1) +#define P_BASESHIFT 6 + +/* + * entry in the segment page cache + */ +struct seg_pcache { + struct seg_pcache *p_hnext; /* list for hashed blocks */ + struct seg_pcache *p_hprev; + int p_active; /* active count */ + int p_ref; /* ref bit */ + size_t p_len; /* segment length */ + caddr_t p_addr; /* base address */ + struct seg *p_seg; /* segment */ + struct page **p_pp; /* pp shadow list */ + enum seg_rw p_rw; /* rw */ + uint_t p_flags; /* bit flags */ + int (*p_callback)(struct seg *, caddr_t, size_t, + struct page **, enum seg_rw); +}; + +struct seg_phash { + struct seg_pcache *p_hnext; /* list for hashed blocks */ + struct seg_pcache *p_hprev; + int p_qlen; /* Q length */ + kmutex_t p_hmutex; /* protects hash bucket */ +}; + +static int seg_preap_time = 20; /* reclaim every 20 secs */ +static int seg_pmaxqlen = 5; /* max Q length in hash list */ +static int seg_ppcount = 5; /* max # of purges per reclaim interval */ +static int seg_plazy = 1; /* if 1, pages are cached after pageunlock */ +static pgcnt_t seg_pwindow; /* max # of pages that can be cached */ +static pgcnt_t seg_plocked; /* # of pages which are cached by pagelock */ +static pgcnt_t seg_plocked_window; /* # pages from window */ +int seg_preapahead; + +static uint_t seg_pdisable = 0; /* if not 0, caching temporarily disabled */ + +static int seg_pupdate_active = 1; /* background reclaim thread */ +static clock_t seg_preap_interval; /* reap 
interval in ticks */ + +static kmutex_t seg_pcache; /* protects the whole pagelock cache */ +static kmutex_t seg_pmem; /* protects window counter */ +static ksema_t seg_psaync_sem; /* sema for reclaim thread */ +static struct seg_phash *p_hashtab; +static int p_hashsize = 0; + +#define p_hash(seg) \ + (P_HASHMASK & \ + ((uintptr_t)(seg) >> P_BASESHIFT)) + +#define p_match(pcp, seg, addr, len, rw) \ + (((pcp)->p_seg == (seg) && \ + (pcp)->p_addr == (addr) && \ + (pcp)->p_rw == (rw) && \ + (pcp)->p_len == (len)) ? 1 : 0) + +#define p_match_pp(pcp, seg, addr, len, pp, rw) \ + (((pcp)->p_seg == (seg) && \ + (pcp)->p_addr == (addr) && \ + (pcp)->p_pp == (pp) && \ + (pcp)->p_rw == (rw) && \ + (pcp)->p_len == (len)) ? 1 : 0) + + +/* + * lookup an address range in pagelock cache. Return shadow list + * and bump up active count. + */ +struct page ** +seg_plookup(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw) +{ + struct seg_pcache *pcp; + struct seg_phash *hp; + + /* + * Skip pagelock cache, while DR is in progress or + * seg_pcache is off. + */ + if (seg_pdisable || seg_plazy == 0) { + return (NULL); + } + + hp = &p_hashtab[p_hash(seg)]; + mutex_enter(&hp->p_hmutex); + for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp; + pcp = pcp->p_hnext) { + if (p_match(pcp, seg, addr, len, rw)) { + pcp->p_active++; + mutex_exit(&hp->p_hmutex); + + PPRINT5("seg_plookup hit: seg %p, addr %p, " + "len %lx, count %d, pplist %p \n", + (void *)seg, (void *)addr, len, pcp->p_active, + (void *)pcp->p_pp); + + return (pcp->p_pp); + } + } + mutex_exit(&hp->p_hmutex); + + PPRINT("seg_plookup miss:\n"); + + return (NULL); +} + +/* + * mark address range inactive. If the cache is off or the address + * range is not in the cache we call the segment driver to reclaim + * the pages. Otherwise just decrement active count and set ref bit. + */ +void +seg_pinactive(struct seg *seg, caddr_t addr, size_t len, struct page **pp, + enum seg_rw rw, int (*callback)(struct seg *, caddr_t, size_t, + struct page **, enum seg_rw)) +{ + struct seg_pcache *pcp; + struct seg_phash *hp; + + if (seg_plazy == 0) { + (void) (*callback)(seg, addr, len, pp, rw); + return; + } + hp = &p_hashtab[p_hash(seg)]; + mutex_enter(&hp->p_hmutex); + for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp; + pcp = pcp->p_hnext) { + if (p_match_pp(pcp, seg, addr, len, pp, rw)) { + pcp->p_active--; + ASSERT(pcp->p_active >= 0); + if (pcp->p_active == 0 && seg_pdisable) { + int npages; + + ASSERT(callback == pcp->p_callback); + /* free the entry */ + hp->p_qlen--; + pcp->p_hprev->p_hnext = pcp->p_hnext; + pcp->p_hnext->p_hprev = pcp->p_hprev; + mutex_exit(&hp->p_hmutex); + npages = pcp->p_len >> PAGESHIFT; + mutex_enter(&seg_pmem); + seg_plocked -= npages; + if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) { + seg_plocked_window -= npages; + } + mutex_exit(&seg_pmem); + kmem_free(pcp, sizeof (struct seg_pcache)); + goto out; + } + pcp->p_ref = 1; + mutex_exit(&hp->p_hmutex); + return; + } + } + mutex_exit(&hp->p_hmutex); +out: + (void) (*callback)(seg, addr, len, pp, rw); +} + +/* + * The seg_pinsert_check() is used by segment drivers to predict whether + * a call to seg_pinsert will fail and thereby avoid wasteful pre-processing. 
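+ *
+ * Illustrative sketch only (no particular driver is quoted): a pagelock
+ * routine would typically screen with this before building the shadow
+ * page list it intends to hand to seg_pinsert(); the fallback label is
+ * hypothetical:
+ *
+ *	if (seg_pinsert_check(seg, len, 0) != SEGP_SUCCESS)
+ *		goto uncached_pagelock;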
+ */ + +int +seg_pinsert_check(struct seg *seg, size_t len, uint_t flags) +{ + struct seg_phash *hp; + + if (seg_plazy == 0) { + return (SEGP_FAIL); + } + if (seg_pdisable != 0) { + return (SEGP_FAIL); + } + ASSERT((len & PAGEOFFSET) == 0); + hp = &p_hashtab[p_hash(seg)]; + if (hp->p_qlen > seg_pmaxqlen && (flags & SEGP_FORCE_WIRED) == 0) { + return (SEGP_FAIL); + } + /* + * If the SEGP_FORCE_WIRED flag is set, + * we skip the check for seg_pwindow. + */ + if ((flags & SEGP_FORCE_WIRED) == 0) { + pgcnt_t npages; + + npages = len >> PAGESHIFT; + if ((seg_plocked_window + npages) > seg_pwindow) { + return (SEGP_FAIL); + } + } + return (SEGP_SUCCESS); +} + + +/* + * insert address range with shadow list into pagelock cache. If + * the cache is off or caching is temporarily disabled or the allowed + * 'window' is exceeded - return SEGP_FAIL. Otherwise return + * SEGP_SUCCESS. + */ +int +seg_pinsert(struct seg *seg, caddr_t addr, size_t len, struct page **pp, + enum seg_rw rw, uint_t flags, int (*callback)(struct seg *, caddr_t, + size_t, struct page **, enum seg_rw)) +{ + struct seg_pcache *pcp; + struct seg_phash *hp; + pgcnt_t npages; + + if (seg_plazy == 0) { + return (SEGP_FAIL); + } + if (seg_pdisable != 0) { + return (SEGP_FAIL); + } + ASSERT((len & PAGEOFFSET) == 0); + hp = &p_hashtab[p_hash(seg)]; + if (hp->p_qlen > seg_pmaxqlen && (flags & SEGP_FORCE_WIRED) == 0) { + return (SEGP_FAIL); + } + npages = len >> PAGESHIFT; + mutex_enter(&seg_pmem); + /* + * If the SEGP_FORCE_WIRED flag is set, + * we skip the check for seg_pwindow. + */ + if ((flags & SEGP_FORCE_WIRED) == 0) { + seg_plocked_window += npages; + if (seg_plocked_window > seg_pwindow) { + seg_plocked_window -= npages; + mutex_exit(&seg_pmem); + return (SEGP_FAIL); + } + } + seg_plocked += npages; + mutex_exit(&seg_pmem); + + pcp = kmem_alloc(sizeof (struct seg_pcache), KM_SLEEP); + pcp->p_seg = seg; + pcp->p_addr = addr; + pcp->p_len = len; + pcp->p_pp = pp; + pcp->p_rw = rw; + pcp->p_callback = callback; + pcp->p_active = 1; + pcp->p_flags = flags; + + PPRINT4("seg_pinsert: seg %p, addr %p, len %lx, pplist %p\n", + (void *)seg, (void *)addr, len, (void *)pp); + + hp = &p_hashtab[p_hash(seg)]; + mutex_enter(&hp->p_hmutex); + hp->p_qlen++; + pcp->p_hnext = hp->p_hnext; + pcp->p_hprev = (struct seg_pcache *)hp; + hp->p_hnext->p_hprev = pcp; + hp->p_hnext = pcp; + mutex_exit(&hp->p_hmutex); + return (SEGP_SUCCESS); +} + +/* + * purge all entries from the pagelock cache if not active + * and not recently used. Drop all locks and call through + * the address space into the segment driver to reclaim + * the pages. This makes sure we get the address space + * and segment driver locking right. + */ +static void +seg_ppurge_all(int force) +{ + struct seg_pcache *delcallb_list = NULL; + struct seg_pcache *pcp; + struct seg_phash *hp; + int purge_count = 0; + pgcnt_t npages = 0; + pgcnt_t npages_window = 0; + + /* + * if the cache if off or empty, return + */ + if (seg_plazy == 0 || seg_plocked == 0) { + return; + } + for (hp = p_hashtab; hp < &p_hashtab[p_hashsize]; hp++) { + mutex_enter(&hp->p_hmutex); + pcp = hp->p_hnext; + + /* + * While 'force' is set, seg_pasync_thread is not + * throttled. This is to speedup flushing of seg_pcache + * in preparation for DR. + * + * In normal case, when 'force' is not set, we throttle + * seg_pasync_thread so that we don't spend all the time + * time in purging the cache. 
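+ *
+ * With the defaults above (seg_ppcount = 5, seg_preap_time = 20
+ * seconds), an unforced pass therefore reclaims only a handful of
+ * inactive entries per wakeup; wakeups come from the seg_pupdate()
+ * timeout or from seg_preap() when pageout is short of memory.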
+ */ + while ((pcp != (struct seg_pcache *)hp) && + (force || (purge_count <= seg_ppcount))) { + + /* + * purge entries which are not active and + * have not been used recently and + * have the SEGP_ASYNC_FLUSH flag. + * + * In the 'force' case, we ignore the + * SEGP_ASYNC_FLUSH flag. + */ + if (!(pcp->p_flags & SEGP_ASYNC_FLUSH)) + pcp->p_ref = 1; + if (force) + pcp->p_ref = 0; + if (!pcp->p_ref && !pcp->p_active) { + struct as *as = pcp->p_seg->s_as; + + /* + * try to get the readers lock on the address + * space before taking out the cache element. + * This ensures as_pagereclaim() can actually + * call through the address space and free + * the pages. If we don't get the lock, just + * skip this entry. The pages will be reclaimed + * by the segment driver at unmap time. + */ + if (AS_LOCK_TRYENTER(as, &as->a_lock, + RW_READER)) { + hp->p_qlen--; + pcp->p_hprev->p_hnext = pcp->p_hnext; + pcp->p_hnext->p_hprev = pcp->p_hprev; + pcp->p_hprev = delcallb_list; + delcallb_list = pcp; + purge_count++; + } + } else { + pcp->p_ref = 0; + } + pcp = pcp->p_hnext; + } + mutex_exit(&hp->p_hmutex); + if (!force && purge_count > seg_ppcount) + break; + } + + /* + * run the delayed callback list. We don't want to hold the + * cache lock during a call through the address space. + */ + while (delcallb_list != NULL) { + struct as *as; + + pcp = delcallb_list; + delcallb_list = pcp->p_hprev; + as = pcp->p_seg->s_as; + + PPRINT4("seg_ppurge_all: purge seg %p, addr %p, len %lx, " + "pplist %p\n", (void *)pcp->p_seg, (void *)pcp->p_addr, + pcp->p_len, (void *)pcp->p_pp); + + as_pagereclaim(as, pcp->p_pp, pcp->p_addr, + pcp->p_len, pcp->p_rw); + AS_LOCK_EXIT(as, &as->a_lock); + npages += pcp->p_len >> PAGESHIFT; + if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) { + npages_window += pcp->p_len >> PAGESHIFT; + } + kmem_free(pcp, sizeof (struct seg_pcache)); + } + mutex_enter(&seg_pmem); + seg_plocked -= npages; + seg_plocked_window -= npages_window; + mutex_exit(&seg_pmem); +} + +/* + * Remove cached pages for segment(s) entries from hashtable. + * The segments are identified by a given clients callback + * function. + * This is useful for multiple seg's cached on behalf of + * dummy segment (ISM/DISM) with common callback function. + * The clients callback function may return status indicating + * that the last seg's entry has been purged. In such a case + * the seg_ppurge_seg() stops searching hashtable and exits. + * Otherwise all hashtable entries are scanned. 
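+ *
+ * Illustrative call only (the callback name is a placeholder): an
+ * ISM/DISM segment driver passes the same reclaim function it
+ * registered with seg_pinsert() for its entries:
+ *
+ *	seg_ppurge_seg(my_spt_reclaim);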
+ */ +void +seg_ppurge_seg(int (*callback)(struct seg *, caddr_t, size_t, + struct page **, enum seg_rw)) +{ + struct seg_pcache *pcp, *npcp; + struct seg_phash *hp; + pgcnt_t npages = 0; + pgcnt_t npages_window = 0; + int done = 0; + + /* + * if the cache if off or empty, return + */ + if (seg_plazy == 0 || seg_plocked == 0) { + return; + } + mutex_enter(&seg_pcache); + seg_pdisable++; + mutex_exit(&seg_pcache); + + for (hp = p_hashtab; hp < &p_hashtab[p_hashsize]; hp++) { + + mutex_enter(&hp->p_hmutex); + pcp = hp->p_hnext; + while (pcp != (struct seg_pcache *)hp) { + + /* + * purge entries which are not active + */ + npcp = pcp->p_hnext; + if (!pcp->p_active && pcp->p_callback == callback) { + hp->p_qlen--; + pcp->p_hprev->p_hnext = pcp->p_hnext; + pcp->p_hnext->p_hprev = pcp->p_hprev; + + if ((*pcp->p_callback)(pcp->p_seg, pcp->p_addr, + pcp->p_len, pcp->p_pp, pcp->p_rw)) { + done = 1; + } + + npages += pcp->p_len >> PAGESHIFT; + if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) { + npages_window += + pcp->p_len >> PAGESHIFT; + } + kmem_free(pcp, sizeof (struct seg_pcache)); + } + pcp = npcp; + if (done) + break; + } + mutex_exit(&hp->p_hmutex); + if (done) + break; + } + + mutex_enter(&seg_pcache); + seg_pdisable--; + mutex_exit(&seg_pcache); + + mutex_enter(&seg_pmem); + seg_plocked -= npages; + seg_plocked_window -= npages_window; + mutex_exit(&seg_pmem); +} + +/* + * purge all entries for a given segment. Since we + * callback into the segment driver directly for page + * reclaim the caller needs to hold the right locks. + */ +void +seg_ppurge(struct seg *seg) +{ + struct seg_pcache *delcallb_list = NULL; + struct seg_pcache *pcp; + struct seg_phash *hp; + pgcnt_t npages = 0; + pgcnt_t npages_window = 0; + + if (seg_plazy == 0) { + return; + } + hp = &p_hashtab[p_hash(seg)]; + mutex_enter(&hp->p_hmutex); + pcp = hp->p_hnext; + while (pcp != (struct seg_pcache *)hp) { + if (pcp->p_seg == seg) { + if (pcp->p_active) { + break; + } + hp->p_qlen--; + pcp->p_hprev->p_hnext = pcp->p_hnext; + pcp->p_hnext->p_hprev = pcp->p_hprev; + pcp->p_hprev = delcallb_list; + delcallb_list = pcp; + } + pcp = pcp->p_hnext; + } + mutex_exit(&hp->p_hmutex); + while (delcallb_list != NULL) { + pcp = delcallb_list; + delcallb_list = pcp->p_hprev; + + PPRINT4("seg_ppurge: purge seg %p, addr %p, len %lx, " + "pplist %p\n", (void *)seg, (void *)pcp->p_addr, + pcp->p_len, (void *)pcp->p_pp); + + ASSERT(seg == pcp->p_seg); + (void) (*pcp->p_callback)(seg, pcp->p_addr, + pcp->p_len, pcp->p_pp, pcp->p_rw); + npages += pcp->p_len >> PAGESHIFT; + if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) { + npages_window += pcp->p_len >> PAGESHIFT; + } + kmem_free(pcp, sizeof (struct seg_pcache)); + } + mutex_enter(&seg_pmem); + seg_plocked -= npages; + seg_plocked_window -= npages_window; + mutex_exit(&seg_pmem); +} + +static void seg_pinit_mem_config(void); + +/* + * setup the pagelock cache + */ +static void +seg_pinit(void) +{ + struct seg_phash *hp; + int i; + uint_t physmegs; + + sema_init(&seg_psaync_sem, 0, NULL, SEMA_DEFAULT, NULL); + + mutex_enter(&seg_pcache); + if (p_hashtab == NULL) { + physmegs = physmem >> (20 - PAGESHIFT); + + /* If p_hashsize was not set in /etc/system ... */ + if (p_hashsize == 0) { + /* + * Choose p_hashsize based on physmem. 
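+ *
+ * For example, a machine with 512 MB of physical memory (physmegs ==
+ * 512) falls into the second range below and gets 1024 hash buckets.
+ * P_HASHMASK assumes p_hashsize is a power of two, so any /etc/system
+ * override should preserve that.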
+ */ + if (physmegs < 64) { + p_hashsize = 64; + } else if (physmegs < 1024) { + p_hashsize = 1024; + } else if (physmegs < 10 * 1024) { + p_hashsize = 8192; + } else if (physmegs < 20 * 1024) { + p_hashsize = 2 * 8192; + seg_pmaxqlen = 16; + } else { + p_hashsize = 128 * 1024; + seg_pmaxqlen = 128; + } + } + + p_hashtab = kmem_zalloc( + p_hashsize * sizeof (struct seg_phash), KM_SLEEP); + for (i = 0; i < p_hashsize; i++) { + hp = (struct seg_phash *)&p_hashtab[i]; + hp->p_hnext = (struct seg_pcache *)hp; + hp->p_hprev = (struct seg_pcache *)hp; + mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL); + } + if (seg_pwindow == 0) { + if (physmegs < 24) { + /* don't use cache */ + seg_plazy = 0; + } else if (physmegs < 64) { + seg_pwindow = physmem >> 5; /* 3% of memory */ + } else if (physmegs < 10 * 1024) { + seg_pwindow = physmem >> 3; /* 12% of memory */ + } else { + seg_pwindow = physmem >> 1; + } + } + } + mutex_exit(&seg_pcache); + + seg_pinit_mem_config(); +} + +/* + * called by pageout if memory is low + */ +void +seg_preap(void) +{ + /* + * if the cache if off or empty, return + */ + if (seg_plocked == 0 || seg_plazy == 0) { + return; + } + sema_v(&seg_psaync_sem); +} + +static void seg_pupdate(void *); + +/* + * run as a backgroud thread and reclaim pagelock + * pages which have not been used recently + */ +void +seg_pasync_thread(void) +{ + callb_cpr_t cpr_info; + kmutex_t pasync_lock; /* just for CPR stuff */ + + mutex_init(&pasync_lock, NULL, MUTEX_DEFAULT, NULL); + + CALLB_CPR_INIT(&cpr_info, &pasync_lock, + callb_generic_cpr, "seg_pasync"); + + if (seg_preap_interval == 0) { + seg_preap_interval = seg_preap_time * hz; + } else { + seg_preap_interval *= hz; + } + if (seg_plazy && seg_pupdate_active) { + (void) timeout(seg_pupdate, NULL, seg_preap_interval); + } + + for (;;) { + mutex_enter(&pasync_lock); + CALLB_CPR_SAFE_BEGIN(&cpr_info); + mutex_exit(&pasync_lock); + sema_p(&seg_psaync_sem); + mutex_enter(&pasync_lock); + CALLB_CPR_SAFE_END(&cpr_info, &pasync_lock); + mutex_exit(&pasync_lock); + + seg_ppurge_all(0); + } +} + +static void +seg_pupdate(void *dummy) +{ + sema_v(&seg_psaync_sem); + + if (seg_plazy && seg_pupdate_active) { + (void) timeout(seg_pupdate, dummy, seg_preap_interval); + } +} + +static struct kmem_cache *seg_cache; + +/* + * Initialize segment management data structures. + */ +void +seg_init(void) +{ + kstat_t *ksp; + + seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg), + 0, NULL, NULL, NULL, NULL, NULL, 0); + + ksp = kstat_create("unix", 0, "segadvstat", "vm", KSTAT_TYPE_NAMED, + segadvstat_ndata, KSTAT_FLAG_VIRTUAL); + if (ksp) { + ksp->ks_data = (void *)segadvstat_ptr; + kstat_install(ksp); + } + + seg_pinit(); +} + +/* + * Allocate a segment to cover [base, base+size] + * and attach it to the specified address space. 
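+ *
+ * Illustrative caller pattern only (driver names are placeholders);
+ * the returned segment is unusable until the caller supplies its ops
+ * vector and private data:
+ *
+ *	if ((seg = seg_alloc(as, addr, len)) == NULL)
+ *		return (ENOMEM);
+ *	seg->s_ops = &mydrv_segops;
+ *	seg->s_data = mydrv_data;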
+ */ +struct seg * +seg_alloc(struct as *as, caddr_t base, size_t size) +{ + struct seg *new; + caddr_t segbase; + size_t segsize; + + segbase = (caddr_t)((uintptr_t)base & (uintptr_t)PAGEMASK); + segsize = (((uintptr_t)(base + size) + PAGEOFFSET) & PAGEMASK) - + (uintptr_t)segbase; + + if (!valid_va_range(&segbase, &segsize, segsize, AH_LO)) + return ((struct seg *)NULL); /* bad virtual addr range */ + + if (as != &kas && + valid_usr_range(segbase, segsize, 0, as, + as->a_userlimit) != RANGE_OKAY) + return ((struct seg *)NULL); /* bad virtual addr range */ + + new = kmem_cache_alloc(seg_cache, KM_SLEEP); + new->s_ops = NULL; + new->s_data = NULL; + new->s_szc = 0; + new->s_flags = 0; + if (seg_attach(as, segbase, segsize, new) < 0) { + kmem_cache_free(seg_cache, new); + return ((struct seg *)NULL); + } + /* caller must fill in ops, data */ + return (new); +} + +/* + * Attach a segment to the address space. Used by seg_alloc() + * and for kernel startup to attach to static segments. + */ +int +seg_attach(struct as *as, caddr_t base, size_t size, struct seg *seg) +{ + seg->s_as = as; + seg->s_base = base; + seg->s_size = size; + + /* + * as_addseg() will add the segment at the appropraite point + * in the list. It will return -1 if there is overlap with + * an already existing segment. + */ + return (as_addseg(as, seg)); +} + +/* + * Unmap a segment and free it from its associated address space. + * This should be called by anybody who's finished with a whole segment's + * mapping. Just calls SEGOP_UNMAP() on the whole mapping . It is the + * responsibility of the segment driver to unlink the the segment + * from the address space, and to free public and private data structures + * associated with the segment. (This is typically done by a call to + * seg_free()). + */ +void +seg_unmap(struct seg *seg) +{ +#ifdef DEBUG + int ret; +#endif /* DEBUG */ + + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); + + /* Shouldn't have called seg_unmap if mapping isn't yet established */ + ASSERT(seg->s_data != NULL); + + /* Unmap the whole mapping */ +#ifdef DEBUG + ret = SEGOP_UNMAP(seg, seg->s_base, seg->s_size); + ASSERT(ret == 0); +#else + SEGOP_UNMAP(seg, seg->s_base, seg->s_size); +#endif /* DEBUG */ +} + +/* + * Free the segment from its associated as. This should only be called + * if a mapping to the segment has not yet been established (e.g., if + * an error occurs in the middle of doing an as_map when the segment + * has already been partially set up) or if it has already been deleted + * (e.g., from a segment driver unmap routine if the unmap applies to the + * entire segment). If the mapping is currently set up then seg_unmap() should + * be called instead. + */ +void +seg_free(struct seg *seg) +{ + register struct as *as = seg->s_as; + struct seg *tseg = as_removeseg(as, seg); + + ASSERT(tseg == seg); + + /* + * If the segment private data field is NULL, + * then segment driver is not attached yet. + */ + if (seg->s_data != NULL) + SEGOP_FREE(seg); + + kmem_cache_free(seg_cache, seg); +} + +/*ARGSUSED*/ +static void +seg_p_mem_config_post_add( + void *arg, + pgcnt_t delta_pages) +{ + /* Nothing to do. */ +} + +/* + * Attempt to purge seg_pcache. May need to return before this has + * completed to allow other pre_del callbacks to unlock pages. This is + * ok because: + * 1) The seg_pdisable flag has been set so at least we won't + * cache anymore locks and the locks we couldn't purge + * will not be held if they do get released by a subsequent + * pre-delete callback. 
+ * + * 2) The rest of the memory delete thread processing does not + * depend on the changes made in this pre-delete callback. No + * panics will result, the worst that will happen is that the + * DR code will timeout and cancel the delete. + */ +/*ARGSUSED*/ +static int +seg_p_mem_config_pre_del( + void *arg, + pgcnt_t delta_pages) +{ + pgcnt_t old_plocked; + int stall_count = 0; + + mutex_enter(&seg_pcache); + seg_pdisable++; + ASSERT(seg_pdisable != 0); + mutex_exit(&seg_pcache); + + /* + * Attempt to empty the cache. Terminate if seg_plocked does not + * diminish with SEGP_STALL_THRESHOLD consecutive attempts. + */ + while (seg_plocked != 0) { + old_plocked = seg_plocked; + seg_ppurge_all(1); + if (seg_plocked == old_plocked) { + if (stall_count++ > SEGP_STALL_THRESHOLD) { + cmn_err(CE_NOTE, "!Pre-delete couldn't purge" + " pagelock cache - continuing"); + break; + } + } else + stall_count = 0; + if (seg_plocked != 0) + delay(hz/SEGP_PREDEL_DELAY_FACTOR); + } + return (0); +} + +/*ARGSUSED*/ +static void +seg_p_mem_config_post_del( + void *arg, + pgcnt_t delta_pages, + int cancelled) +{ + mutex_enter(&seg_pcache); + ASSERT(seg_pdisable != 0); + seg_pdisable--; + mutex_exit(&seg_pcache); +} + +static kphysm_setup_vector_t seg_p_mem_config_vec = { + KPHYSM_SETUP_VECTOR_VERSION, + seg_p_mem_config_post_add, + seg_p_mem_config_pre_del, + seg_p_mem_config_post_del, +}; + +static void +seg_pinit_mem_config(void) +{ + int ret; + + ret = kphysm_setup_func_register(&seg_p_mem_config_vec, (void *)NULL); + /* + * Want to catch this in the debug kernel. At run time, if the + * callbacks don't get run all will be OK as the disable just makes + * it more likely that the pages can be collected. + */ + ASSERT(ret == 0); +} diff --git a/usr/src/uts/common/vm/vm_swap.c b/usr/src/uts/common/vm/vm_swap.c new file mode 100644 index 0000000000..d7028b6f29 --- /dev/null +++ b/usr/src/uts/common/vm/vm_swap.c @@ -0,0 +1,1590 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Each physical swap area has an associated bitmap representing + * its physical storage. The bitmap records which swap slots are + * currently allocated or freed. 
Allocation is done by searching + * through the bitmap for the first free slot. Thus, there's + * no linear relation between offset within the swap device and the + * address (within its segment(s)) of the page that the slot backs; + * instead, it's an arbitrary one-to-one mapping. + * + * Associated with each swap area is a swapinfo structure. These + * structures are linked into a linear list that determines the + * ordering of swap areas in the logical swap device. Each contains a + * pointer to the corresponding bitmap, the area's size, and its + * associated vnode. + */ + +#include <sys/types.h> +#include <sys/inttypes.h> +#include <sys/param.h> +#include <sys/t_lock.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/kmem.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/pathname.h> +#include <sys/cmn_err.h> +#include <sys/vtrace.h> +#include <sys/swap.h> +#include <sys/dumphdr.h> +#include <sys/debug.h> +#include <sys/fs/snode.h> +#include <sys/fs/swapnode.h> +#include <sys/policy.h> +#include <sys/zone.h> + +#include <vm/as.h> +#include <vm/seg.h> +#include <vm/page.h> +#include <vm/seg_vn.h> +#include <vm/hat.h> +#include <vm/anon.h> +#include <vm/seg_map.h> + +/* + * To balance the load among multiple swap areas, we don't allow + * more than swap_maxcontig allocations to be satisfied from a + * single swap area before moving on to the next swap area. This + * effectively "interleaves" allocations among the many swap areas. + */ +int swap_maxcontig; /* set by anon_init() to 1 Mb */ + +#define MINIROOTSIZE 12000 /* ~6 Meg XXX */ + +/* + * XXX - this lock is a kludge. It serializes some aspects of swapadd() and + * swapdel() (namely VOP_OPEN, VOP_CLOSE, VN_RELE). It protects against + * somebody swapadd'ing and getting swap slots from a vnode, while someone + * else is in the process of closing or rele'ing it. + */ +static kmutex_t swap_lock; + +kmutex_t swapinfo_lock; + +/* + * protected by the swapinfo_lock + */ +struct swapinfo *swapinfo; + +static struct swapinfo *silast; +static int nswapfiles; + +static u_offset_t swap_getoff(struct swapinfo *); +static int swapadd(struct vnode *, ulong_t, ulong_t, char *); +static int swapdel(struct vnode *, ulong_t); +static int swapslot_free(struct vnode *, u_offset_t, struct swapinfo *); + +/* + * swap device bitmap allocation macros + */ +#define MAPSHIFT 5 +#define NBBW (NBPW * NBBY) /* number of bits per word */ +#define TESTBIT(map, i) (((map)[(i) >> MAPSHIFT] & (1 << (i) % NBBW))) +#define SETBIT(map, i) (((map)[(i) >> MAPSHIFT] |= (1 << (i) % NBBW))) +#define CLEARBIT(map, i) (((map)[(i) >> MAPSHIFT] &= ~(1 << (i) % NBBW))) + +int swap_debug = 0; /* set for debug printf's */ +int swap_verify = 0; /* set to verify slots when freeing and allocating */ + +uint_t swapalloc_maxcontig; + +/* + * Allocate a range of up to *lenp contiguous slots (page) from a physical + * swap device. Flags are one of: + * SA_NOT Must have a slot from a physical swap device other than the + * the one containing input (*vpp, *offp). + * Less slots than requested may be returned. *lenp allocated slots are + * returned starting at *offp on *vpp. + * Returns 1 for a successful allocation, 0 for couldn't allocate any slots. + */ +int +swap_phys_alloc( + struct vnode **vpp, + u_offset_t *offp, + size_t *lenp, + uint_t flags) +{ + struct swapinfo *sip; + offset_t soff, noff; + size_t len; + + mutex_enter(&swapinfo_lock); + sip = silast; + + /* Find a desirable physical device and allocate from it. 
*/ + do { + if (sip == NULL) + break; + if (!(sip->si_flags & ST_INDEL) && + (spgcnt_t)sip->si_nfpgs > 0) { + /* Caller wants other than specified swap device */ + if (flags & SA_NOT) { + if (*vpp != sip->si_vp || + *offp < sip->si_soff || + *offp >= sip->si_eoff) + goto found; + /* Caller is loose, will take anything */ + } else + goto found; + } else if (sip->si_nfpgs == 0) + sip->si_allocs = 0; + if ((sip = sip->si_next) == NULL) + sip = swapinfo; + } while (sip != silast); + mutex_exit(&swapinfo_lock); + return (0); +found: + soff = swap_getoff(sip); + sip->si_nfpgs--; + if (soff == -1) + panic("swap_alloc: swap_getoff failed!"); + + for (len = PAGESIZE; len < *lenp; len += PAGESIZE) { + if (sip->si_nfpgs == 0) + break; + if (swapalloc_maxcontig && len >= swapalloc_maxcontig) + break; + noff = swap_getoff(sip); + if (noff == -1) { + break; + } else if (noff != soff + len) { + CLEARBIT(sip->si_swapslots, btop(noff - sip->si_soff)); + break; + } + sip->si_nfpgs--; + } + *vpp = sip->si_vp; + *offp = soff; + *lenp = len; + ASSERT((spgcnt_t)sip->si_nfpgs >= 0); + sip->si_allocs += btop(len); + if (sip->si_allocs >= swap_maxcontig) { + sip->si_allocs = 0; + if ((silast = sip->si_next) == NULL) + silast = swapinfo; + } + TRACE_2(TR_FAC_VM, TR_SWAP_ALLOC, + "swap_alloc:sip %p offset %lx", sip, soff); + mutex_exit(&swapinfo_lock); + return (1); +} + +int swap_backsearch = 0; + +/* + * Get a free offset on swap device sip. + * Return >=0 offset if succeeded, -1 for failure. + */ +static u_offset_t +swap_getoff(struct swapinfo *sip) +{ + uint_t *sp, *ep; + size_t aoff, boff, poff, slotnumber; + + ASSERT(MUTEX_HELD(&swapinfo_lock)); + + sip->si_alloccnt++; + for (sp = &sip->si_swapslots[sip->si_hint >> MAPSHIFT], + ep = &sip->si_swapslots[sip->si_mapsize / NBPW]; sp < ep; sp++) { + if (*sp != (uint_t)0xffffffff) + goto foundentry; + else + sip->si_checkcnt++; + } + SWAP_PRINT(SW_ALLOC, + "swap_getoff: couldn't find slot from hint %ld to end\n", + sip->si_hint, 0, 0, 0, 0); + /* + * Go backwards? Check for faster method XXX + */ + if (swap_backsearch) { + for (sp = &sip->si_swapslots[sip->si_hint >> MAPSHIFT], + ep = sip->si_swapslots; sp > ep; sp--) { + if (*sp != (uint_t)0xffffffff) + goto foundentry; + else + sip->si_checkcnt++; + } + } else { + for (sp = sip->si_swapslots, + ep = &sip->si_swapslots[sip->si_hint >> MAPSHIFT]; + sp < ep; sp++) { + if (*sp != (uint_t)0xffffffff) + goto foundentry; + else + sip->si_checkcnt++; + } + } + if (*sp == 0xffffffff) { + cmn_err(CE_WARN, "No free swap slots!"); + return ((u_offset_t)-1); + } + +foundentry: + /* + * aoff is the page number offset (in bytes) of the si_swapslots + * array element containing a free page + * + * boff is the page number offset of the free page + * (i.e. cleared bit) in si_swapslots[aoff]. + */ + aoff = ((char *)sp - (char *)sip->si_swapslots) * NBBY; + + for (boff = (sip->si_hint % NBBW); boff < NBBW; boff++) { + if (!TESTBIT(sip->si_swapslots, aoff + boff)) + goto foundslot; + else + sip->si_checkcnt++; + } + for (boff = 0; boff < (sip->si_hint % NBBW); boff++) { + if (!TESTBIT(sip->si_swapslots, aoff + boff)) + goto foundslot; + else + sip->si_checkcnt++; + } + panic("swap_getoff: didn't find slot in word hint %ld", sip->si_hint); + +foundslot: + /* + * Return the offset of the free page in swap device. + * Convert page number of byte offset and add starting + * offset of swap device. 
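+ *
+ * For example, if the free slot found is bit 3 of the map on a device
+ * whose si_soff is 0x20000, the value returned is ptob(3) + 0x20000.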
+ */ + slotnumber = aoff + boff; + SWAP_PRINT(SW_ALLOC, "swap_getoff: allocating slot %ld\n", + slotnumber, 0, 0, 0, 0); + poff = ptob(slotnumber); + if (poff + sip->si_soff >= sip->si_eoff) + printf("ptob(aoff(%ld) + boff(%ld))(%ld) >= eoff(%ld)\n", + aoff, boff, ptob(slotnumber), (long)sip->si_eoff); + ASSERT(poff < sip->si_eoff); + /* + * We could verify here that the slot isn't already allocated + * by looking through all the anon slots. + */ + SETBIT(sip->si_swapslots, slotnumber); + sip->si_hint = slotnumber + 1; /* hint = next slot */ + return (poff + sip->si_soff); +} + +/* + * Free a swap page. + */ +void +swap_phys_free(struct vnode *vp, u_offset_t off, size_t len) +{ + struct swapinfo *sip; + ssize_t pagenumber, npage; + + mutex_enter(&swapinfo_lock); + sip = swapinfo; + + do { + if (sip->si_vp == vp && + sip->si_soff <= off && off < sip->si_eoff) { + for (pagenumber = btop(off - sip->si_soff), + npage = btop(len) + pagenumber; + pagenumber < npage; pagenumber++) { + SWAP_PRINT(SW_ALLOC, + "swap_phys_free: freeing slot %ld on " + "sip %p\n", + pagenumber, sip, 0, 0, 0); + if (!TESTBIT(sip->si_swapslots, pagenumber)) { + panic( + "swap_phys_free: freeing free slot " + "%p,%lx\n", (void *)vp, + ptob(pagenumber) + sip->si_soff); + } + CLEARBIT(sip->si_swapslots, pagenumber); + sip->si_nfpgs++; + } + ASSERT(sip->si_nfpgs <= sip->si_npgs); + mutex_exit(&swapinfo_lock); + return; + } + } while ((sip = sip->si_next) != NULL); + panic("swap_phys_free"); + /*NOTREACHED*/ +} + +/* + * Return the anon struct corresponding for the given + * <vnode, off> if it is part of the virtual swap device. + * Return the anon struct if found, otherwise NULL. + */ +struct anon * +swap_anon(struct vnode *vp, u_offset_t off) +{ + struct anon *ap; + + ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(vp, off)])); + + for (ap = anon_hash[ANON_HASH(vp, off)]; ap != NULL; ap = ap->an_hash) { + if (ap->an_vp == vp && ap->an_off == off) + return (ap); + } + return (NULL); +} + + +/* + * Determine if the vp offset range overlap a swap device. + */ +int +swap_in_range(struct vnode *vp, u_offset_t offset, size_t len) +{ + struct swapinfo *sip; + u_offset_t eoff; + + eoff = offset + len; + ASSERT(eoff > offset); + + mutex_enter(&swapinfo_lock); + sip = swapinfo; + if (vp && sip) { + do { + if (vp != sip->si_vp || eoff <= sip->si_soff || + offset >= sip->si_eoff) + continue; + mutex_exit(&swapinfo_lock); + return (1); + } while ((sip = sip->si_next) != NULL); + } + mutex_exit(&swapinfo_lock); + return (0); +} + +/* + * See if name is one of our swap files + * even though lookupname failed. + * This can be used by swapdel to delete + * swap resources on remote machines + * where the link has gone down. + */ +static struct vnode * +swapdel_byname( + char *name, /* pathname to delete */ + ulong_t lowblk) /* Low block number of area to delete */ +{ + struct swapinfo **sipp, *osip; + u_offset_t soff; + + /* + * Find the swap file entry for the file to + * be deleted. Skip any entries that are in + * transition. + */ + + soff = ptob(btopr(lowblk << SCTRSHFT)); /* must be page aligned */ + + mutex_enter(&swapinfo_lock); + for (sipp = &swapinfo; (osip = *sipp) != NULL; sipp = &osip->si_next) { + if ((strcmp(osip->si_pname, name) == 0) && + (osip->si_soff == soff) && (osip->si_flags == 0)) { + struct vnode *vp = osip->si_vp; + + VN_HOLD(vp); + mutex_exit(&swapinfo_lock); + return (vp); + } + } + mutex_exit(&swapinfo_lock); + return (NULL); +} + + +/* + * New system call to manipulate swap files. 
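+ *
+ * Illustrative user-level usage only (see swapctl(2)); the kernel entry
+ * point below takes the same command and argument plus rv, which
+ * carries the call's return value back to the syscall layer:
+ *
+ *	int nswapdevs = swapctl(SC_GETNSWP, NULL);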
+ */ +int +swapctl(int sc_cmd, void *sc_arg, int *rv) +{ + struct swapinfo *sip, *csip, *tsip; + int error = 0; + struct swapent st, *ust; + struct swapres sr; + struct vnode *vp; + int cnt = 0; + int tmp_nswapfiles; + int nswap; + int length, nlen; + int gplen = 0, plen; + char *swapname; + char *pname; + char *tpname; + struct anoninfo ai; + spgcnt_t avail; + int global = INGLOBALZONE(curproc); + + /* + * When running in a zone we want to hide the details of the swap + * devices: we report there only being one swap device named "swap" + * having a size equal to the sum of the sizes of all real swap devices + * on the system. + */ + switch (sc_cmd) { + case SC_GETNSWP: + if (global) + *rv = nswapfiles; + else + *rv = 1; + return (0); + + case SC_AINFO: + /* + * Return anoninfo information with these changes: + * ani_max = maximum amount of swap space + * (including potentially available physical memory) + * ani_free = amount of unallocated anonymous memory + * (some of which might be reserved and including + * potentially available physical memory) + * ani_resv = amount of claimed (reserved) anonymous memory + */ + avail = MAX((spgcnt_t)(availrmem - swapfs_minfree), 0); + ai.ani_max = (k_anoninfo.ani_max + + k_anoninfo.ani_mem_resv) +avail; + + ai.ani_free = k_anoninfo.ani_free + avail; + + ai.ani_resv = k_anoninfo.ani_phys_resv + + k_anoninfo.ani_mem_resv; + + if (copyout(&ai, sc_arg, sizeof (struct anoninfo)) != 0) + return (EFAULT); + return (0); + + case SC_LIST: + if (copyin(sc_arg, &length, sizeof (int)) != 0) + return (EFAULT); + if (!global) { + struct swapent st; + char *swappath = "swap"; + + if (length < 1) + return (ENOMEM); + ust = (swapent_t *)((swaptbl_t *)sc_arg)->swt_ent; + if (copyin(ust, &st, sizeof (swapent_t)) != 0) + return (EFAULT); + st.ste_start = PAGESIZE >> SCTRSHFT; + st.ste_length = (off_t)0; + st.ste_pages = 0; + st.ste_free = 0; + st.ste_flags = 0; + mutex_enter(&swapinfo_lock); + for (sip = swapinfo, nswap = 0; + sip != NULL && nswap < nswapfiles; + sip = sip->si_next, nswap++) { + st.ste_length += + (sip->si_eoff - sip->si_soff) >> SCTRSHFT; + st.ste_pages += sip->si_npgs; + st.ste_free += sip->si_nfpgs; + } + mutex_exit(&swapinfo_lock); + if (copyout(&st, ust, sizeof (swapent_t)) != 0 || + copyout(swappath, st.ste_path, + strlen(swappath) + 1) != 0) { + return (EFAULT); + } + *rv = 1; + return (0); + } +beginning: + tmp_nswapfiles = nswapfiles; + /* Return an error if not enough space for the whole table. */ + if (length < tmp_nswapfiles) + return (ENOMEM); + /* + * Get memory to hold the swap entries and their names. We'll + * copy the real entries into these and then copy these out. + * Allocating the pathname memory is only a guess so we may + * find that we need more and have to do it again. + * All this is because we have to hold the anon lock while + * traversing the swapinfo list, and we can't be doing copyouts + * and/or kmem_alloc()s during this. 
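+ *
+ * (The pathname space is guessed at 100 bytes per entry; if the guess
+ * proves too small, the code below frees pname and retries with gplen
+ * grown by another 100, and if nswapfiles itself changed it restarts
+ * the whole snapshot.)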
+ */ + csip = kmem_zalloc(tmp_nswapfiles * sizeof (struct swapinfo), + KM_SLEEP); +retry: + nlen = tmp_nswapfiles * (gplen += 100); + pname = kmem_zalloc(nlen, KM_SLEEP); + + mutex_enter(&swapinfo_lock); + + if (tmp_nswapfiles != nswapfiles) { + mutex_exit(&swapinfo_lock); + kmem_free(pname, nlen); + kmem_free(csip, + tmp_nswapfiles * sizeof (struct swapinfo)); + gplen = 0; + goto beginning; + } + for (sip = swapinfo, tsip = csip, tpname = pname, nswap = 0; + sip && nswap < tmp_nswapfiles; + sip = sip->si_next, tsip++, tpname += plen, nswap++) { + plen = sip->si_pnamelen; + if (tpname + plen - pname > nlen) { + mutex_exit(&swapinfo_lock); + kmem_free(pname, nlen); + goto retry; + } + *tsip = *sip; + tsip->si_pname = tpname; + (void) strcpy(tsip->si_pname, sip->si_pname); + } + mutex_exit(&swapinfo_lock); + + if (sip) { + error = ENOMEM; + goto lout; + } + ust = (swapent_t *)((swaptbl_t *)sc_arg)->swt_ent; + for (tsip = csip, cnt = 0; cnt < nswap; tsip++, ust++, cnt++) { + if (copyin(ust, &st, sizeof (swapent_t)) != 0) { + error = EFAULT; + goto lout; + } + st.ste_flags = tsip->si_flags; + st.ste_length = + (tsip->si_eoff - tsip->si_soff) >> SCTRSHFT; + st.ste_start = tsip->si_soff >> SCTRSHFT; + st.ste_pages = tsip->si_npgs; + st.ste_free = tsip->si_nfpgs; + if (copyout(&st, ust, sizeof (swapent_t)) != 0) { + error = EFAULT; + goto lout; + } + if (!tsip->si_pnamelen) + continue; + if (copyout(tsip->si_pname, st.ste_path, + tsip->si_pnamelen) != 0) { + error = EFAULT; + goto lout; + } + } + *rv = nswap; +lout: + kmem_free(csip, tmp_nswapfiles * sizeof (struct swapinfo)); + kmem_free(pname, nlen); + return (error); + + case SC_ADD: + case SC_REMOVE: + break; + default: + return (EINVAL); + } + if ((error = secpolicy_swapctl(CRED())) != 0) + return (error); + + if (copyin(sc_arg, &sr, sizeof (swapres_t))) + return (EFAULT); + + /* Allocate the space to read in pathname */ + if ((swapname = kmem_alloc(MAXPATHLEN, KM_NOSLEEP)) == NULL) + return (ENOMEM); + + error = copyinstr(sr.sr_name, swapname, MAXPATHLEN, 0); + if (error) + goto out; + + error = lookupname(swapname, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); + if (error) { + if (sc_cmd == SC_ADD) + goto out; + /* see if we match by name */ + vp = swapdel_byname(swapname, (size_t)sr.sr_start); + if (vp == NULL) + goto out; + } + + if (vp->v_flag & (VNOMAP | VNOSWAP)) { + VN_RELE(vp); + error = ENOSYS; + goto out; + } + switch (vp->v_type) { + case VBLK: + break; + + case VREG: + if (vp->v_vfsp && vn_is_readonly(vp)) + error = EROFS; + else + error = VOP_ACCESS(vp, VREAD|VWRITE, 0, CRED()); + break; + + case VDIR: + error = EISDIR; + break; + default: + error = ENOSYS; + break; + } + if (error == 0) { + if (sc_cmd == SC_REMOVE) + error = swapdel(vp, sr.sr_start); + else + error = swapadd(vp, sr.sr_start, + sr.sr_length, swapname); + } + VN_RELE(vp); +out: + kmem_free(swapname, MAXPATHLEN); + return (error); +} + +#if defined(_LP64) && defined(_SYSCALL32) + +int +swapctl32(int sc_cmd, void *sc_arg, int *rv) +{ + struct swapinfo *sip, *csip, *tsip; + int error = 0; + struct swapent32 st, *ust; + struct swapres32 sr; + struct vnode *vp; + int cnt = 0; + int tmp_nswapfiles; + int nswap; + int length, nlen; + int gplen = 0, plen; + char *swapname; + char *pname; + char *tpname; + struct anoninfo32 ai; + size_t s; + spgcnt_t avail; + + switch (sc_cmd) { + case SC_GETNSWP: + *rv = nswapfiles; + return (0); + + case SC_AINFO: + /* + * Return anoninfo information with these changes: + * ani_max = maximum amount of swap space + * (including potentially 
available physical memory) + * ani_free = amount of unallocated anonymous memory + * (some of which might be reserved and including + * potentially available physical memory) + * ani_resv = amount of claimed (reserved) anonymous memory + */ + avail = MAX((spgcnt_t)(availrmem - swapfs_minfree), 0); + s = (k_anoninfo.ani_max + k_anoninfo.ani_mem_resv) + avail; + if (s > UINT32_MAX) + return (EOVERFLOW); + ai.ani_max = s; + + s = k_anoninfo.ani_free + avail; + if (s > UINT32_MAX) + return (EOVERFLOW); + ai.ani_free = s; + + s = k_anoninfo.ani_phys_resv + k_anoninfo.ani_mem_resv; + if (s > UINT32_MAX) + return (EOVERFLOW); + ai.ani_resv = s; + + if (copyout(&ai, sc_arg, sizeof (ai)) != 0) + return (EFAULT); + return (0); + + case SC_LIST: + if (copyin(sc_arg, &length, sizeof (int32_t)) != 0) + return (EFAULT); +beginning: + tmp_nswapfiles = nswapfiles; + /* Return an error if not enough space for the whole table. */ + if (length < tmp_nswapfiles) + return (ENOMEM); + /* + * Get memory to hold the swap entries and their names. We'll + * copy the real entries into these and then copy these out. + * Allocating the pathname memory is only a guess so we may + * find that we need more and have to do it again. + * All this is because we have to hold the anon lock while + * traversing the swapinfo list, and we can't be doing copyouts + * and/or kmem_alloc()s during this. + */ + csip = kmem_zalloc(tmp_nswapfiles * sizeof (*csip), KM_SLEEP); +retry: + nlen = tmp_nswapfiles * (gplen += 100); + pname = kmem_zalloc(nlen, KM_SLEEP); + + mutex_enter(&swapinfo_lock); + + if (tmp_nswapfiles != nswapfiles) { + mutex_exit(&swapinfo_lock); + kmem_free(pname, nlen); + kmem_free(csip, tmp_nswapfiles * sizeof (*csip)); + gplen = 0; + goto beginning; + } + for (sip = swapinfo, tsip = csip, tpname = pname, nswap = 0; + (sip != NULL) && (nswap < tmp_nswapfiles); + sip = sip->si_next, tsip++, tpname += plen, nswap++) { + plen = sip->si_pnamelen; + if (tpname + plen - pname > nlen) { + mutex_exit(&swapinfo_lock); + kmem_free(pname, nlen); + goto retry; + } + *tsip = *sip; + tsip->si_pname = tpname; + (void) strcpy(tsip->si_pname, sip->si_pname); + } + mutex_exit(&swapinfo_lock); + + if (sip != NULL) { + error = ENOMEM; + goto lout; + } + ust = (swapent32_t *)((swaptbl32_t *)sc_arg)->swt_ent; + for (tsip = csip, cnt = 0; cnt < nswap; tsip++, ust++, cnt++) { + if (copyin(ust, &st, sizeof (*ust)) != 0) { + error = EFAULT; + goto lout; + } + st.ste_flags = tsip->si_flags; + st.ste_length = + (tsip->si_eoff - tsip->si_soff) >> SCTRSHFT; + st.ste_start = tsip->si_soff >> SCTRSHFT; + st.ste_pages = tsip->si_npgs; + st.ste_free = tsip->si_nfpgs; + if (copyout(&st, ust, sizeof (st)) != 0) { + error = EFAULT; + goto lout; + } + if (!tsip->si_pnamelen) + continue; + if (copyout(tsip->si_pname, + (caddr_t)(uintptr_t)st.ste_path, + tsip->si_pnamelen) != 0) { + error = EFAULT; + goto lout; + } + } + *rv = nswap; +lout: + kmem_free(csip, tmp_nswapfiles * sizeof (*csip)); + kmem_free(pname, nlen); + return (error); + + case SC_ADD: + case SC_REMOVE: + break; + default: + return (EINVAL); + } + if ((error = secpolicy_swapctl(CRED())) != 0) + return (error); + + if (copyin(sc_arg, &sr, sizeof (sr))) + return (EFAULT); + + /* Allocate the space to read in pathname */ + if ((swapname = kmem_alloc(MAXPATHLEN, KM_NOSLEEP)) == NULL) + return (ENOMEM); + + error = copyinstr((caddr_t)(uintptr_t)sr.sr_name, + swapname, MAXPATHLEN, NULL); + if (error) + goto out; + + error = lookupname(swapname, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); + if (error) { 
+ if (sc_cmd == SC_ADD) + goto out; + /* see if we match by name */ + vp = swapdel_byname(swapname, (uint_t)sr.sr_start); + if (vp == NULL) + goto out; + } + + if (vp->v_flag & (VNOMAP | VNOSWAP)) { + VN_RELE(vp); + error = ENOSYS; + goto out; + } + switch (vp->v_type) { + case VBLK: + break; + + case VREG: + if (vp->v_vfsp && vn_is_readonly(vp)) + error = EROFS; + else + error = VOP_ACCESS(vp, VREAD|VWRITE, 0, CRED()); + break; + + case VDIR: + error = EISDIR; + break; + default: + error = ENOSYS; + break; + } + if (error == 0) { + if (sc_cmd == SC_REMOVE) + error = swapdel(vp, sr.sr_start); + else + error = swapadd(vp, sr.sr_start, sr.sr_length, + swapname); + } + VN_RELE(vp); +out: + kmem_free(swapname, MAXPATHLEN); + return (error); +} + +#endif /* _LP64 && _SYSCALL32 */ + +/* + * Add a new swap file. + */ +int +swapadd(struct vnode *vp, ulong_t lowblk, ulong_t nblks, char *swapname) +{ + struct swapinfo **sipp, *nsip = NULL, *esip = NULL; + struct vnode *cvp; + struct vattr vattr; + pgcnt_t pages; + u_offset_t soff, eoff; + int error; + ssize_t i, start, end; + ushort_t wasswap; + ulong_t startblk; + size_t returned_mem; + + SWAP_PRINT(SW_CTL, "swapadd: vp %p lowblk %ld nblks %ld swapname %s\n", + vp, lowblk, nblks, swapname, 0); + /* + * Get the real vnode. (If vp is not a specnode it just returns vp, so + * it does the right thing, but having this code know about specnodes + * violates the spirit of having it be indepedent of vnode type.) + */ + cvp = common_specvp(vp); + + /* + * Or in VISSWAP so file system has chance to deny swap-ons during open. + */ + mutex_enter(&cvp->v_lock); + wasswap = cvp->v_flag & VISSWAP; + cvp->v_flag |= VISSWAP; + mutex_exit(&cvp->v_lock); + + mutex_enter(&swap_lock); + if (error = VOP_OPEN(&cvp, FREAD|FWRITE, CRED())) { + mutex_exit(&swap_lock); + /* restore state of v_flag */ + if (!wasswap) { + mutex_enter(&cvp->v_lock); + cvp->v_flag &= ~VISSWAP; + mutex_exit(&cvp->v_lock); + } + return (error); + } + mutex_exit(&swap_lock); + + /* + * Get partition size. Return error if empty partition, + * or if request does not fit within the partition. + * If this is the first swap device, we can reduce + * the size of the swap area to match what is + * available. This can happen if the system was built + * on a machine with a different size swap partition. + */ + vattr.va_mask = AT_SIZE; + if (error = VOP_GETATTR(cvp, &vattr, ATTR_COMM, CRED())) + goto out; + + /* + * Specfs returns a va_size of MAXOFFSET_T (UNKNOWN_SIZE) when the + * size of the device can't be determined. + */ + if ((vattr.va_size == 0) || (vattr.va_size == MAXOFFSET_T)) { + error = EINVAL; + goto out; + } + +#ifdef _ILP32 + /* + * No support for large swap in 32-bit OS, if the size of the swap is + * bigger than MAXOFF32_T then the size used by swapfs must be limited. + * This limitation is imposed by the swap subsystem itself, a D_64BIT + * driver as the target of swap operation should be able to field + * the IO. 
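+ *
+ * For example, a 4 GB swap slice added to a 32-bit kernel is limited
+ * here to MAXOFF32_T (2 GB - 1) bytes; the cmn_err() below notes the
+ * truncation.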
+ */ + if (vattr.va_size > MAXOFF32_T) { + cmn_err(CE_NOTE, + "!swap device %s truncated from 0x%llx to 0x%x bytes", + swapname, vattr.va_size, MAXOFF32_T); + vattr.va_size = MAXOFF32_T; + } +#endif /* _ILP32 */ + + /* Fail if file not writeable (try to set size to current size) */ + vattr.va_mask = AT_SIZE; + if (error = VOP_SETATTR(cvp, &vattr, 0, CRED(), NULL)) + goto out; + + /* Fail if fs does not support VOP_PAGEIO */ + error = VOP_PAGEIO(cvp, (page_t *)NULL, (u_offset_t)0, 0, 0, CRED()); + + if (error == ENOSYS) + goto out; + else + error = 0; + /* + * If swapping on the root filesystem don't put swap blocks that + * correspond to the miniroot filesystem on the swap free list. + */ + if (cvp == rootdir) + startblk = roundup(MINIROOTSIZE<<SCTRSHFT, klustsize)>>SCTRSHFT; + else /* Skip 1st page (disk label) */ + startblk = (ulong_t)(lowblk ? lowblk : 1); + + soff = startblk << SCTRSHFT; + if (soff >= vattr.va_size) { + error = EINVAL; + goto out; + } + + /* + * If user specified 0 blks, use the size of the device + */ + eoff = nblks ? soff + (nblks - (startblk - lowblk) << SCTRSHFT) : + vattr.va_size; + + SWAP_PRINT(SW_CTL, "swapadd: va_size %ld soff %ld eoff %ld\n", + vattr.va_size, soff, eoff, 0, 0); + + if (eoff > vattr.va_size) { + error = EINVAL; + goto out; + } + + /* + * The starting and ending offsets must be page aligned. + * Round soff up to next page boundary, round eoff + * down to previous page boundary. + */ + soff = ptob(btopr(soff)); + eoff = ptob(btop(eoff)); + if (soff >= eoff) { + SWAP_PRINT(SW_CTL, "swapadd: soff %ld >= eoff %ld\n", + soff, eoff, 0, 0, 0); + error = EINVAL; + goto out; + } + + pages = btop(eoff - soff); + + /* Allocate and partially set up the new swapinfo */ + nsip = kmem_zalloc(sizeof (struct swapinfo), KM_SLEEP); + nsip->si_vp = cvp; + + nsip->si_soff = soff; + nsip->si_eoff = eoff; + nsip->si_hint = 0; + nsip->si_checkcnt = nsip->si_alloccnt = 0; + + nsip->si_pnamelen = (int)strlen(swapname) + 1; + nsip->si_pname = (char *)kmem_zalloc(nsip->si_pnamelen, KM_SLEEP); + bcopy(swapname, nsip->si_pname, nsip->si_pnamelen - 1); + SWAP_PRINT(SW_CTL, "swapadd: allocating swapinfo for %s, %ld pages\n", + swapname, pages, 0, 0, 0); + /* + * Size of swapslots map in bytes + */ + nsip->si_mapsize = P2ROUNDUP(pages, NBBW) / NBBY; + nsip->si_swapslots = kmem_zalloc(nsip->si_mapsize, KM_SLEEP); + + /* + * Permanently set the bits that can't ever be allocated, + * i.e. those from the ending offset to the round up slot for the + * swapslots bit map. + */ + start = pages; + end = P2ROUNDUP(pages, NBBW); + for (i = start; i < end; i++) { + SWAP_PRINT(SW_CTL, "swapadd: set bit for page %ld\n", i, + 0, 0, 0, 0); + SETBIT(nsip->si_swapslots, i); + } + nsip->si_npgs = nsip->si_nfpgs = pages; + /* + * Now check to see if we can add it. We wait til now to check because + * we need the swapinfo_lock and we don't want sleep with it (e.g., + * during kmem_alloc()) while we're setting up the swapinfo. + */ + mutex_enter(&swapinfo_lock); + for (sipp = &swapinfo; (esip = *sipp) != NULL; sipp = &esip->si_next) { + if (esip->si_vp == cvp) { + if (esip->si_soff == soff && esip->si_npgs == pages && + (esip->si_flags & ST_DOINGDEL)) { + /* + * We are adding a device that we are in the + * middle of deleting. Just clear the + * ST_DOINGDEL flag to signal this and + * the deletion routine will eventually notice + * it and add it back. 
+ */ + esip->si_flags &= ~ST_DOINGDEL; + mutex_exit(&swapinfo_lock); + goto out; + } + /* disallow overlapping swap files */ + if ((soff < esip->si_eoff) && (eoff > esip->si_soff)) { + error = EEXIST; + mutex_exit(&swapinfo_lock); + goto out; + } + } + } + + nswapfiles++; + + /* + * add new swap device to list and shift allocations to it + * before updating the anoninfo counters + */ + *sipp = nsip; + silast = nsip; + + /* + * Update the total amount of reservable swap space + * accounting properly for swap space from physical memory + */ + /* New swap device soaks up currently reserved memory swap */ + mutex_enter(&anoninfo_lock); + + ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); + ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); + + k_anoninfo.ani_max += pages; + ANI_ADD(pages); + if (k_anoninfo.ani_mem_resv > k_anoninfo.ani_locked_swap) { + returned_mem = MIN(k_anoninfo.ani_mem_resv - + k_anoninfo.ani_locked_swap, + k_anoninfo.ani_max - k_anoninfo.ani_phys_resv); + + ANI_ADD(-returned_mem); + k_anoninfo.ani_free -= returned_mem; + k_anoninfo.ani_mem_resv -= returned_mem; + k_anoninfo.ani_phys_resv += returned_mem; + + mutex_enter(&freemem_lock); + availrmem += returned_mem; + mutex_exit(&freemem_lock); + } + /* + * At boot time, to permit booting small memory machines using + * only physical memory as swap space, we allowed a dangerously + * large amount of memory to be used as swap space; now that + * more physical backing store is available bump down the amount + * we can get from memory to a safer size. + */ + if (swapfs_minfree < swapfs_desfree) { + mutex_enter(&freemem_lock); + if (availrmem > swapfs_desfree || !k_anoninfo.ani_mem_resv) + swapfs_minfree = swapfs_desfree; + mutex_exit(&freemem_lock); + } + + SWAP_PRINT(SW_CTL, "swapadd: ani_max %ld ani_free %ld\n", + k_anoninfo.ani_free, k_anoninfo.ani_free, 0, 0, 0); + + mutex_exit(&anoninfo_lock); + + mutex_exit(&swapinfo_lock); + + /* Initialize the dump device */ + mutex_enter(&dump_lock); + if (dumpvp == NULL) + (void) dumpinit(vp, swapname, 0); + mutex_exit(&dump_lock); + + VN_HOLD(cvp); +out: + if (error || esip) { + SWAP_PRINT(SW_CTL, "swapadd: error (%d)\n", error, 0, 0, 0, 0); + + if (!wasswap) { + mutex_enter(&cvp->v_lock); + cvp->v_flag &= ~VISSWAP; + mutex_exit(&cvp->v_lock); + } + if (nsip) { + kmem_free(nsip->si_swapslots, (size_t)nsip->si_mapsize); + kmem_free(nsip->si_pname, nsip->si_pnamelen); + kmem_free(nsip, sizeof (*nsip)); + } + mutex_enter(&swap_lock); + (void) VOP_CLOSE(cvp, FREAD|FWRITE, 1, (offset_t)0, CRED()); + mutex_exit(&swap_lock); + } + return (error); +} + +/* + * Delete a swap file. + */ +static int +swapdel( + struct vnode *vp, + ulong_t lowblk) /* Low block number of area to delete. */ +{ + struct swapinfo **sipp, *osip = NULL; + struct vnode *cvp; + u_offset_t soff; + int error = 0; + u_offset_t toff = 0; + struct vnode *tvp = NULL; + spgcnt_t pages; + struct anon **app, *ap; + kmutex_t *ahm; + pgcnt_t adjust_swap = 0; + + /* Find the swap file entry for the file to be deleted */ + cvp = common_specvp(vp); + + + lowblk = lowblk ? lowblk : 1; /* Skip first page (disk label) */ + soff = ptob(btopr(lowblk << SCTRSHFT)); /* must be page aligned */ + + mutex_enter(&swapinfo_lock); + for (sipp = &swapinfo; (osip = *sipp) != NULL; sipp = &osip->si_next) { + if ((osip->si_vp == cvp) && + (osip->si_soff == soff) && (osip->si_flags == 0)) + break; + } + + /* If the file was not found, error. 
*/ + if (osip == NULL) { + error = EINVAL; + mutex_exit(&swapinfo_lock); + goto out; + } + + pages = osip->si_npgs; + + /* + * Do not delete if we will be low on swap pages. + */ + mutex_enter(&anoninfo_lock); + + ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); + ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); + + mutex_enter(&freemem_lock); + if (((k_anoninfo.ani_max - k_anoninfo.ani_phys_resv) + + MAX((spgcnt_t)(availrmem - swapfs_minfree), 0)) < pages) { + mutex_exit(&freemem_lock); + mutex_exit(&anoninfo_lock); + error = ENOMEM; + cmn_err(CE_WARN, "swapdel - too few free pages"); + mutex_exit(&swapinfo_lock); + goto out; + } + mutex_exit(&freemem_lock); + + k_anoninfo.ani_max -= pages; + + /* If needed, reserve memory swap to replace old device */ + if (k_anoninfo.ani_phys_resv > k_anoninfo.ani_max) { + adjust_swap = k_anoninfo.ani_phys_resv - k_anoninfo.ani_max; + k_anoninfo.ani_phys_resv -= adjust_swap; + k_anoninfo.ani_mem_resv += adjust_swap; + mutex_enter(&freemem_lock); + availrmem -= adjust_swap; + mutex_exit(&freemem_lock); + ANI_ADD(adjust_swap); + } + ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); + ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); + mutex_exit(&anoninfo_lock); + + ANI_ADD(-pages); + + /* + * Set the delete flag. This prevents anyone from allocating more + * pages from this file. Also set ST_DOINGDEL. Someone who wants to + * add the file back while we're deleting it will signify by clearing + * this flag. + */ + osip->si_flags |= ST_INDEL|ST_DOINGDEL; + mutex_exit(&swapinfo_lock); + + /* + * Free all the allocated physical slots for this file. We do this + * by walking through the entire anon hash array, because we need + * to update all the anon slots that have physical swap slots on + * this file, and this is the only way to find them all. We go back + * to the beginning of a bucket after each slot is freed because the + * anonhash_lock is not held during the free and thus the hash table + * may change under us. + */ + for (app = anon_hash; app < &anon_hash[ANON_HASH_SIZE]; app++) { + ahm = &anonhash_lock[(app-anon_hash) & (AH_LOCK_SIZE - 1)]; + mutex_enter(ahm); +top: + for (ap = *app; ap != NULL; ap = ap->an_hash) { + if (ap->an_pvp == cvp && + ap->an_poff >= osip->si_soff && + ap->an_poff < osip->si_eoff) { + ASSERT(TESTBIT(osip->si_swapslots, + btop((size_t)(ap->an_poff - + osip->si_soff)))); + tvp = ap->an_vp; + toff = ap->an_off; + VN_HOLD(tvp); + mutex_exit(ahm); + + error = swapslot_free(tvp, toff, osip); + + VN_RELE(tvp); + mutex_enter(ahm); + if (!error && (osip->si_flags & ST_DOINGDEL)) { + goto top; + } else { + if (error) { + cmn_err(CE_WARN, + "swapslot_free failed %d", + error); + } + + /* + * Add device back before making it + * visible. + */ + mutex_enter(&swapinfo_lock); + osip->si_flags &= + ~(ST_INDEL | ST_DOINGDEL); + mutex_exit(&swapinfo_lock); + + /* + * Update the anon space available + */ + mutex_enter(&anoninfo_lock); + + k_anoninfo.ani_phys_resv += adjust_swap; + k_anoninfo.ani_mem_resv -= adjust_swap; + k_anoninfo.ani_max += pages; + + mutex_enter(&freemem_lock); + availrmem += adjust_swap; + mutex_exit(&freemem_lock); + + mutex_exit(&anoninfo_lock); + + ANI_ADD(pages); + + mutex_exit(ahm); + goto out; + } + } + } + mutex_exit(ahm); + } + + /* All done, they'd better all be free! 
*/ + mutex_enter(&swapinfo_lock); + ASSERT(osip->si_nfpgs == osip->si_npgs); + + /* Now remove it from the swapinfo list */ + for (sipp = &swapinfo; *sipp != NULL; sipp = &(*sipp)->si_next) { + if (*sipp == osip) + break; + } + ASSERT(*sipp); + *sipp = osip->si_next; + if (silast == osip) + if ((silast = osip->si_next) == NULL) + silast = swapinfo; + nswapfiles--; + mutex_exit(&swapinfo_lock); + + kmem_free(osip->si_swapslots, osip->si_mapsize); + kmem_free(osip->si_pname, osip->si_pnamelen); + kmem_free(osip, sizeof (*osip)); + + mutex_enter(&dump_lock); + if (cvp == dumpvp) + dumpfini(); + mutex_exit(&dump_lock); + + /* Release the vnode */ + + mutex_enter(&swap_lock); + (void) VOP_CLOSE(cvp, FREAD|FWRITE, 1, (offset_t)0, CRED()); + mutex_enter(&cvp->v_lock); + cvp->v_flag &= ~VISSWAP; + mutex_exit(&cvp->v_lock); + VN_RELE(cvp); + mutex_exit(&swap_lock); +out: + return (error); +} + +/* + * Free up a physical swap slot on swapinfo sip, currently in use by the + * anonymous page whose name is (vp, off). + */ +static int +swapslot_free( + struct vnode *vp, + u_offset_t off, + struct swapinfo *sip) +{ + struct page *pl[2], *pp; + struct anon *ap = NULL; + int error = 0; + kmutex_t *ahm; + + /* + * Get the page for the old swap slot and i/o lock it. + * Users of the physical slot will synchronize on the i/o lock. + */ + if (error = VOP_GETPAGE(vp, (offset_t)off, ptob(1), NULL, + pl, ptob(1), segkmap, NULL, S_READ, CRED())) { + /* + * Anon slot went away (EIDRM) or vp was truncated (EFAULT) + * while we got the page. Thus the physical slot must be + * free, so we have succeeded. + */ + if (error == EIDRM || error == EFAULT) + error = 0; + return (error); + } + pp = pl[0]; + page_io_lock(pp); + + ahm = &anonhash_lock[AH_LOCK(vp, off)]; + mutex_enter(ahm); + /* + * Get the anon slot; anon struct cannot vanish while we hold + * SE_SHARED lock on the physical page since anon_decref() blocks + * in page_lookup() before it can proceed further to remove + * anon struct from anon_hash table. + */ + if ((ap = swap_anon(vp, off)) == NULL) { + panic("swapslot_free(%p, %llx, %p), page: %p, null anon", + vp, off, sip, pp); + } + /* + * Free the physical slot. It may have been freed up and replaced with + * another one while we were getting the page so we have to re-verify + * that this is really one we want. If we do free the slot we have + * to mark the page modified, as its backing store is now gone. + */ + if (ap->an_pvp == sip->si_vp && ap->an_poff >= sip->si_soff && + ap->an_poff < sip->si_eoff) { + swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE); + ap->an_pvp = NULL; + ap->an_poff = NULL; + mutex_exit(ahm); + hat_setmod(pp); + } else { + mutex_exit(ahm); + } +out: + /* Release the page locks */ + page_unlock(pp); + page_io_unlock(pp); + return (error); +} + +/* + * Get contig physical backing store for vp, in the range + * [*offp, *offp + *lenp), May back a subrange of this, but must + * always include the requested offset or fail. Returns the offsets + * backed as [*offp, *offp + *lenp) and the physical offsets used to + * back them from *pvpp in the range [*pstartp, *pstartp + *lenp). 
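
The swapinfo removal in swapdel() above also shows the pointer-to-pointer unlink idiom: sipp walks the chain of si_next pointers rather than the nodes themselves, so deleting the head entry needs no special case. A generic restatement of the idiom (node type and key are illustrative, not from the kernel):

    #include <stddef.h>

    struct node {
        struct node *next;
        int key;
    };

    /* Remove the first node carrying key; the head needs no special case. */
    static void
    unlink_node(struct node **headp, int key)
    {
        struct node **npp;

        for (npp = headp; *npp != NULL; npp = &(*npp)->next) {
            if ((*npp)->key == key) {
                *npp = (*npp)->next;    /* splice the node out */
                break;
            }
        }
    }
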
+ * Returns 0 for success + * SE_NOANON -- no anon slot for requested paged + * SE_NOSWAP -- no physical swap space available + */ +int +swap_newphysname( + struct vnode *vp, + u_offset_t offset, + u_offset_t *offp, + size_t *lenp, + struct vnode **pvpp, + u_offset_t *poffp) +{ + struct anon *ap = NULL; /* anon slot for vp, off */ + int error = 0; + struct vnode *pvp; + u_offset_t poff, pstart, prem; + size_t plen; + u_offset_t off, start; + kmutex_t *ahm; + + ASSERT(*offp <= offset && offset < *offp + *lenp); + + /* Get new physical swap slots. */ + plen = *lenp; + if (!swap_phys_alloc(&pvp, &pstart, &plen, 0)) { + /* + * No swap available so return error unless requested + * offset is already backed in which case return that. + */ + ahm = &anonhash_lock[AH_LOCK(vp, offset)]; + mutex_enter(ahm); + if ((ap = swap_anon(vp, offset)) == NULL) { + error = SE_NOANON; + mutex_exit(ahm); + return (error); + } + error = (ap->an_pvp ? 0 : SE_NOSWAP); + *offp = offset; + *lenp = PAGESIZE; + *pvpp = ap->an_pvp; + *poffp = ap->an_poff; + mutex_exit(ahm); + return (error); + } + + /* + * We got plen (<= *lenp) contig slots. Use these to back a + * subrange of [*offp, *offp + *lenp) which includes offset. + * For now we just put offset at the end of the kluster. + * Clearly there are other possible choices - which is best? + */ + start = MAX(*offp, + (offset + PAGESIZE > plen) ? (offset + PAGESIZE - plen) : 0); + ASSERT(start + plen <= *offp + *lenp); + + for (off = start, poff = pstart; poff < pstart + plen; + off += PAGESIZE, poff += PAGESIZE) { + ahm = &anonhash_lock[AH_LOCK(vp, off)]; + mutex_enter(ahm); + if ((ap = swap_anon(vp, off)) != NULL) { + /* Free old slot if any, and assign new one */ + if (ap->an_pvp) + swap_phys_free(ap->an_pvp, ap->an_poff, + PAGESIZE); + ap->an_pvp = pvp; + ap->an_poff = poff; + } else { /* No anon slot for a klustered page, quit. */ + prem = (pstart + plen) - poff; + /* Already did requested page, do partial kluster */ + if (off > offset) { + plen = poff - pstart; + error = 0; + /* Fail on requested page, error */ + } else if (off == offset) { + error = SE_NOANON; + /* Fail on prior page, fail on requested page, error */ + } else if ((ap = swap_anon(vp, offset)) == NULL) { + error = SE_NOANON; + /* Fail on prior page, got requested page, do only it */ + } else { + /* Free old slot if any, and assign new one */ + if (ap->an_pvp) + swap_phys_free(ap->an_pvp, ap->an_poff, + PAGESIZE); + ap->an_pvp = pvp; + ap->an_poff = poff; + /* One page kluster */ + start = offset; + plen = PAGESIZE; + pstart = poff; + poff += PAGESIZE; + prem -= PAGESIZE; + } + /* Free unassigned slots */ + swap_phys_free(pvp, poff, prem); + mutex_exit(ahm); + break; + } + mutex_exit(ahm); + } + ASSERT(*offp <= start && start + plen <= *offp + *lenp); + ASSERT(start <= offset && offset < start + plen); + *offp = start; + *lenp = plen; + *pvpp = pvp; + *poffp = pstart; + return (error); +} + + +/* + * Get the physical swap backing store location for a given anonymous page + * named (vp, off). The backing store name is returned in (*pvpp, *poffp). 
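
The kluster placement in swap_newphysname() above is compact enough to warrant a worked example: given the caller's window [*offp, *offp + *lenp) and plen bytes of contiguous slots, the requested offset is placed in the last page of the backed subrange. A standalone restatement follows; the 4K page size and the MAX macro are assumptions of this sketch, not quotes from the kernel.

    #include <assert.h>
    #include <stdint.h>

    #define PAGESIZE    4096ULL
    #define MAX(a, b)   ((a) > (b) ? (a) : (b))

    /* Start of the backed subrange, chosen so offset lands in its last page. */
    static uint64_t
    kluster_start(uint64_t window_start, uint64_t offset, uint64_t plen)
    {
        return (MAX(window_start,
            (offset + PAGESIZE > plen) ? (offset + PAGESIZE - plen) : 0));
    }

    int
    main(void)
    {
        /* Window starts at 0, request at page 8, 4 pages of slots. */
        uint64_t start = kluster_start(0, 8 * PAGESIZE, 4 * PAGESIZE);

        /* Backed range is pages 5..8, so the requested page is last. */
        assert(start == 5 * PAGESIZE);
        return (0);
    }

The original comment itself notes that this placement is an arbitrary choice among several possibilities.
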
+ * Returns 0 success + * EIDRM -- no anon slot (page is not allocated) + */ +int +swap_getphysname( + struct vnode *vp, + u_offset_t off, + struct vnode **pvpp, + u_offset_t *poffp) +{ + struct anon *ap; + int error = 0; + kmutex_t *ahm; + + ahm = &anonhash_lock[AH_LOCK(vp, off)]; + mutex_enter(ahm); + + /* Get anon slot for vp, off */ + ap = swap_anon(vp, off); + if (ap == NULL) { + error = EIDRM; + goto out; + } + *pvpp = ap->an_pvp; + *poffp = ap->an_poff; +out: + mutex_exit(ahm); + return (error); +} diff --git a/usr/src/uts/common/vm/vpage.h b/usr/src/uts/common/vm/vpage.h new file mode 100644 index 0000000000..68dfb1adb0 --- /dev/null +++ b/usr/src/uts/common/vm/vpage.h @@ -0,0 +1,86 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 1998 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#ifndef _VM_VPAGE_H +#define _VM_VPAGE_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * VM - Information per virtual page. + */ +struct vpage { + uchar_t nvp_prot; /* see <sys/mman.h> prot flags */ + uchar_t nvp_advice; /* pplock & <sys/mman.h> madvise flags */ +}; + +/* + * This was changed from a bitfield to flags/macros in order + * to conserve space (uchar_t bitfields are not ANSI). This could + * have been condensed to a uchar_t, but at the expense of complexity. + * We've stolen a bit from the top of nvp_advice to store pplock in. + * + * WARNING: VPP_SETADVICE(vpp, x) evaluates vpp twice, and VPP_PLOCK(vpp) + * returns a positive integer when the lock is held, not necessarily (1). 
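
The macros defined just below pack three pieces of per-page state into two bytes: nvp_prot holds the protections, the low three bits of nvp_advice hold the madvise hint, and the top bit of nvp_advice holds pplock. That is why the lock test is documented as returning "a positive integer" rather than 1; it yields the raw 0x80 bit. A quick standalone check of that layout, reusing the same mask values (the test program itself is only an illustration):

    #include <assert.h>

    #define VP_ADVICE_MASK  (0x07)
    #define VP_PPLOCK_MASK  (0x80)

    int
    main(void)
    {
        unsigned char advice = 0;

        /* set an advice value and the pplock bit independently */
        advice = (unsigned char)((advice & ~VP_ADVICE_MASK) |
            (3 & VP_ADVICE_MASK));
        advice |= VP_PPLOCK_MASK;

        assert((advice & VP_ADVICE_MASK) == 3);
        assert((advice & VP_PPLOCK_MASK) == 0x80);  /* truthy, but not 1 */
        return (0);
    }
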
+ */ +#define VP_ADVICE_MASK (0x07) +#define VP_PPLOCK_MASK (0x80) /* physical page locked by me */ +#define VP_PPLOCK_SHIFT (0x07) /* offset of lock hiding inside nvp_advice */ + +#define VPP_PROT(vpp) ((vpp)->nvp_prot) +#define VPP_ADVICE(vpp) ((vpp)->nvp_advice & VP_ADVICE_MASK) +#define VPP_ISPPLOCK(vpp) \ + ((uchar_t)((vpp)->nvp_advice & VP_PPLOCK_MASK)) + +#define VPP_SETPROT(vpp, x) ((vpp)->nvp_prot = (x)) +#define VPP_SETADVICE(vpp, x) \ + ((vpp)->nvp_advice = ((vpp)->nvp_advice & ~VP_ADVICE_MASK) | \ + ((x) & VP_ADVICE_MASK)) +#define VPP_SETPPLOCK(vpp) ((vpp)->nvp_advice |= VP_PPLOCK_MASK) +#define VPP_CLRPPLOCK(vpp) ((vpp)->nvp_advice &= ~VP_PPLOCK_MASK) + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_VPAGE_H */ diff --git a/usr/src/uts/common/vm/xhat.c b/usr/src/uts/common/vm/xhat.c new file mode 100644 index 0000000000..255ca1eb67 --- /dev/null +++ b/usr/src/uts/common/vm/xhat.c @@ -0,0 +1,555 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + + +#include <sys/types.h> +#include <sys/cmn_err.h> +#include <sys/mman.h> +#include <sys/systm.h> +#include <vm/xhat.h> +#include <vm/page.h> +#include <vm/as.h> + +int xhat_debug = 0; + +krwlock_t xhat_provider_rwlock; +xhat_provider_t *xhat_provider = NULL; + +void +xhat_init() +{ + rw_init(&xhat_provider_rwlock, NULL, RW_DEFAULT, NULL); +} + + + +int +xhat_provider_register(xhat_provider_t *provider) +{ + /* strlen("_cache") = 7 */ + char cache_name[XHAT_CACHE_NAMELEN + 7]; + + + if (provider->xhat_provider_version != XHAT_PROVIDER_VERSION) { + cmn_err(CE_WARN, "XHAT provider version mismatch"); + return (-1); + } + + if ((XHAT_POPS(provider)->xhat_alloc == NULL) || + (XHAT_POPS(provider)->xhat_free == NULL)) { + cmn_err(CE_WARN, "Malformed XHAT provider"); + return (-1); + } + + /* Allocate kmem_cache which will manage xhat blocks */ + provider->xblkcache->free_blks = NULL; + (void) strncpy(cache_name, provider->xhat_provider_name, + XHAT_CACHE_NAMELEN); + (void) strcat(cache_name, "_cache"); + provider->xblkcache->cache = kmem_cache_create(cache_name, + provider->xhat_provider_blk_size, 0, NULL, NULL, + provider->xblkcache->reclaim, + (void *)provider, NULL, 0); + if (provider->xblkcache->cache == NULL) { + cmn_err(CE_WARN, "Failed to allocate cache for %s", + provider->xhat_provider_name); + return (-1); + } + + mutex_init(&provider->xblkcache->lock, NULL, MUTEX_DEFAULT, NULL); + + + /* Insert provider in the global list */ + rw_enter(&xhat_provider_rwlock, RW_WRITER); + provider->next = xhat_provider; + provider->prev = NULL; + if (xhat_provider) + xhat_provider->prev = provider; + xhat_provider = provider; + xhat_provider->xhat_provider_refcnt = 0; + rw_exit(&xhat_provider_rwlock); + return (0); +} + + + +int +xhat_provider_unregister(xhat_provider_t *provider) +{ + if (provider->xhat_provider_version != XHAT_PROVIDER_VERSION) + return (-1); + + rw_enter(&xhat_provider_rwlock, RW_WRITER); + + if (provider->xhat_provider_refcnt) { + rw_exit(&xhat_provider_rwlock); + return (-1); + } + + if (provider->next) + provider->next->prev = provider->prev; + if (provider->prev) + provider->prev->next = provider->next; + else + xhat_provider = provider->next; + provider->prev = NULL; + provider->next = NULL; + rw_exit(&xhat_provider_rwlock); + + /* Free all xblks that are sitting on free_blks list */ + provider->xblkcache->reclaim(provider); + + kmem_cache_destroy(provider->xblkcache->cache); + + return (0); +} + + + +/* Attaches an XHAT to the address space */ +int +xhat_attach_xhat(xhat_provider_t *provider, struct as *as, + struct xhat **xhatp, void *arg) +{ + struct xhat *xh; + + + + xh = XHAT_POPS(provider)->xhat_alloc(arg); + if (xh == NULL) { + *xhatp = NULL; + return (XH_PRVDR); + } + + mutex_init(&xh->xhat_lock, NULL, MUTEX_DEFAULT, NULL); + xh->xhat_provider = provider; + + rw_enter(&xhat_provider_rwlock, RW_WRITER); + provider->xhat_provider_refcnt++; + rw_exit(&xhat_provider_rwlock); + + mutex_enter(&as->a_contents); + + /* Is address space busy (being freed, dup'd or swapped)? 
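
Stepping back to the provider list management above: the global list is only modified with xhat_provider_rwlock held as writer, and xhat_provider_unregister() refuses to proceed while the provider's reference count (bumped by every successful attach) is non-zero. Below is a user-level model of that guard, with a pthread rwlock standing in for the kernel lock and hypothetical structure names:

    #include <pthread.h>
    #include <stdbool.h>

    struct provider {
        struct provider *next, *prev;
        int refcnt;
    };

    static pthread_rwlock_t plock = PTHREAD_RWLOCK_INITIALIZER;
    static struct provider *providers;

    static void
    provider_register(struct provider *p)
    {
        pthread_rwlock_wrlock(&plock);
        p->prev = NULL;
        p->next = providers;
        if (providers != NULL)
            providers->prev = p;
        providers = p;
        p->refcnt = 0;
        pthread_rwlock_unlock(&plock);
    }

    /* Fails while any attached consumer still references the provider. */
    static bool
    provider_unregister(struct provider *p)
    {
        pthread_rwlock_wrlock(&plock);
        if (p->refcnt != 0) {
            pthread_rwlock_unlock(&plock);
            return (false);
        }
        if (p->next != NULL)
            p->next->prev = p->prev;
        if (p->prev != NULL)
            p->prev->next = p->next;
        else
            providers = p->next;
        p->next = p->prev = NULL;
        pthread_rwlock_unlock(&plock);
        return (true);
    }
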
*/ + if (AS_ISBUSY(as)) { + mutex_exit(&as->a_contents); + XHAT_POPS(provider)->xhat_free(xh); + + rw_enter(&xhat_provider_rwlock, RW_WRITER); + provider->xhat_provider_refcnt--; + rw_exit(&xhat_provider_rwlock); + + *xhatp = NULL; + return (XH_ASBUSY); + } + + xh->xhat_as = as; + xh->xhat_refcnt = 0; + xh->holder = NULL; + xh->arg = arg; + xh->next = (struct xhat *)as->a_xhat; + if (xh->next) + xh->next->prev = xh; + as->a_xhat = xh; + mutex_exit(&as->a_contents); + *xhatp = xh; + return (0); +} + + +int +xhat_detach_xhat(xhat_provider_t *provider, struct as *as) +{ + struct xhat *xh; + + + mutex_enter(&as->a_contents); + + for (xh = (struct xhat *)as->a_xhat; xh != NULL; xh = xh->next) + if (xh->xhat_provider == provider) { + + + if (xh->holder != NULL) { + /* + * The address space is being freed, + * dup'd or swapped out. + * If we are the thread which doing one + * of those operations, we can go ahead + * and free up the XHAT. + * Otherwise, return. + */ + if (xh->holder != curthread) { + mutex_exit(&as->a_contents); + return (XH_ASBUSY); + } else + xhat_hat_rele(xh); + } + + if (xh->xhat_refcnt > 0) { + /* + * There are still "users" of the XHAT. + * This may be either because the caller + * forgot to free something up (which is a bug) + * or because xhat_op_all() is in progress. + * Since we are not allowing any of + * xhat_op_all's ops to call xhat_detach_xhat(), + * This can only be some other thread. It + * may want to wait a bit and retry. + */ + + + /* Restore the hold on the XHAT */ + if (xh->holder == curthread) + xhat_hat_hold(xh); + + mutex_exit(&as->a_contents); + return (XH_XHHELD); + } + + rw_enter(&xhat_provider_rwlock, RW_WRITER); + provider->xhat_provider_refcnt--; + rw_exit(&xhat_provider_rwlock); + + if (xh->next) + xh->next->prev = xh->prev; + if (xh->prev) + xh->prev->next = xh->next; + else + as->a_xhat = (void *) xh->next; + mutex_exit(&as->a_contents); + + XHAT_POPS(provider)->xhat_free(xh); + + return (0); + } + mutex_exit(&as->a_contents); + return (XH_NOTATTCHD); +} + +void +xhat_hat_hold(struct xhat *xhat) +{ + mutex_enter(&xhat->xhat_lock); + xhat->xhat_refcnt++; + mutex_exit(&xhat->xhat_lock); +} + +void +xhat_hat_rele(struct xhat *xhat) +{ + mutex_enter(&xhat->xhat_lock); + xhat->xhat_refcnt--; + ASSERT(xhat->xhat_refcnt >= 0); + mutex_exit(&xhat->xhat_lock); +} + + +int +xhat_hat_holders(struct xhat *xhat) +{ + return (xhat->xhat_refcnt); +} + + +/* + * Assumes that address space is already locked + * and that AS_FREE is set for as->a_flags. + */ +void +xhat_free_start_all(struct as *as) +{ + struct xhat *xh, *xh_nxt; + + + ASSERT(AS_ISBUSY(as)); + + mutex_enter(&as->a_contents); + xh = (struct xhat *)as->a_xhat; + + /* + * Simply calling xhat_hat_hold() won't work because we will + * not be able to succeed in xhat_detach_xhat(), which may + * get called from here. We need to know _who_ the holder is. + */ + if (xh != NULL) { + xhat_hat_hold(xh); + ASSERT(xh->holder == NULL); + xh->holder = curthread; + } + + while (xh != NULL) { + + xh_nxt = xh->next; + if (xh_nxt != NULL) { + ASSERT(xh_nxt->holder == NULL); + xhat_hat_hold(xh_nxt); + xh_nxt->holder = curthread; + } + + mutex_exit(&as->a_contents); + + XHAT_FREE_START(xh); + + mutex_enter(&as->a_contents); + + xh = xh_nxt; + } + + mutex_exit(&as->a_contents); +} + + + +/* + * Assumes that address space is already locked. + * Since xhat_free_start_all() must have been called + * earlier, for all XHATs holder is set to curthread. 
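
The holder field above is the key to teardown: xhat_free_start_all() stamps each XHAT with the thread doing the teardown, so a later xhat_detach_xhat() can tell "the address space is being torn down by me" (allowed to proceed, after dropping the hold placed on its behalf) from "being torn down by someone else" (XH_ASBUSY). A small model of that decision, with pthread_self() standing in for curthread and hypothetical names:

    #include <pthread.h>

    enum { XH_OK = 0, XH_ASBUSY, XH_XHHELD };

    struct xhat_model {
        int has_holder;         /* teardown in progress */
        pthread_t holder;       /* thread doing the teardown */
        int refcnt;             /* holds, including the teardown hold */
    };

    static int
    detach_check(const struct xhat_model *xh)
    {
        if (xh->has_holder && !pthread_equal(xh->holder, pthread_self()))
            return (XH_ASBUSY);     /* another thread owns the teardown */
        /* The teardown thread's own hold does not count against it. */
        if (xh->refcnt > (xh->has_holder ? 1 : 0))
            return (XH_XHHELD);     /* other callers still hold the XHAT */
        return (XH_OK);
    }
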
+ * Also, since AS_BUSY is set for as->a_flags, no new + * XHATs could have been added. + */ +void +xhat_free_end_all(struct as *as) +{ + + struct xhat *xh, *xh_nxt; + + ASSERT(AS_ISBUSY(as)); + + mutex_enter(&as->a_contents); + xh = (struct xhat *)as->a_xhat; + + + while (xh != NULL) { + + ASSERT(xh->holder == curthread); + + xh_nxt = xh->next; + + mutex_exit(&as->a_contents); + + XHAT_FREE_END(xh); + + mutex_enter(&as->a_contents); + + xh = xh_nxt; + } + + mutex_exit(&as->a_contents); +} + + +/* Assumes that address space is already locked */ + +/* ARGSUSED */ +int +xhat_dup_all(struct as *as, struct as *newas, caddr_t addr, size_t len, + uint_t flag) +{ + /* This is not supported. Should we return some sort of error? */ + + ASSERT(AS_ISBUSY(as)); + + return (0); +} + + +/* Assumes that address space is already locked */ +void +xhat_swapout_all(struct as *as) +{ + struct xhat *xh, *xh_nxt; + + + ASSERT(AS_ISBUSY(as)); + + mutex_enter(&as->a_contents); + xh = (struct xhat *)as->a_xhat; + + if (xh != NULL) { + xhat_hat_hold(xh); + ASSERT(xh->holder == NULL); + xh->holder = curthread; + } + + + while (xh != NULL) { + + xh_nxt = xh->next; + if (xh_nxt != NULL) { + ASSERT(xh_nxt->holder == NULL); + xhat_hat_hold(xh_nxt); + xh_nxt->holder = curthread; + } + + mutex_exit(&as->a_contents); + + XHAT_SWAPOUT(xh); + + mutex_enter(&as->a_contents); + + /* + * If the xh is still there (i.e. swapout did not + * destroy it), clear the holder field. + * xh_nxt->prev couldn't have been changed in xhat_attach_xhat() + * because AS_BUSY is set. xhat_detach_xhat() also couldn't + * have modified it because (holder != NULL). + * If there is only one XHAT, just see if a_xhat still + * points to us. + */ + if (((xh_nxt != NULL) && (xh_nxt->prev == xh)) || + ((as->a_xhat != NULL) && (as->a_xhat == xh))) { + xhat_hat_rele(xh); + xh->holder = NULL; + } + + xh = xh_nxt; + } + + mutex_exit(&as->a_contents); +} + + + + +/* + * In the following routines, the appropriate xhat_op + * should never attempt to call xhat_detach_xhat(): it will + * never succeed since the XHAT is held. + */ + + +#define XHAT_UNLOAD_CALLBACK_OP (0) +#define XHAT_SETATTR_OP (1) +#define XHAT_CLRATTR_OP (2) +#define XHAT_CHGATTR_OP (3) +#define XHAT_CHGPROT_OP (4) +#define XHAT_UNSHARE_OP (5) + + +static void +xhat_op_all(int op, struct as *as, caddr_t addr, + size_t len, uint_t flags, void *ptr) +{ + struct xhat *xh, *xh_nxt; + + mutex_enter(&as->a_contents); + xh = (struct xhat *)as->a_xhat; + + while (xh != NULL) { + + xhat_hat_hold(xh); + + xh_nxt = xh->next; + if (xh_nxt != NULL) + xhat_hat_hold(xh_nxt); + + mutex_exit(&as->a_contents); + + switch (op) { + case XHAT_UNLOAD_CALLBACK_OP: + XHAT_UNLOAD_CALLBACK(xh, addr, + len, flags, (hat_callback_t *)ptr); + break; + case XHAT_SETATTR_OP: + XHAT_SETATTR(xh, addr, len, flags); + break; + case XHAT_CLRATTR_OP: + XHAT_CLRATTR(xh, addr, len, flags); + break; + case XHAT_CHGATTR_OP: + XHAT_CHGATTR(xh, addr, len, flags); + break; + case XHAT_CHGPROT_OP: + XHAT_CHGPROT(xh, addr, len, flags); + break; + case XHAT_UNSHARE_OP: + XHAT_UNSHARE(xh, addr, len); + break; + default: + panic("Unknown op %d in xhat_op_all", op); + } + + mutex_enter(&as->a_contents); + + /* + * Both pointers are still valid because both + * XHATs are held. 
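
That is the hand-over-hand hold at the heart of xhat_op_all(): both the current XHAT and its successor are reference-held before a_contents is dropped, so the callback can block and the walk can still resume safely at the successor. A generic user-level sketch of the traversal (the refcounted node type and helper names are hypothetical):

    #include <pthread.h>

    struct rnode {
        struct rnode *next;
        int refcnt;             /* protected by list_lock */
    };

    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

    static void hold(struct rnode *n) { n->refcnt++; }
    static void rele(struct rnode *n) { n->refcnt--; }

    /* Callback that must run without list_lock held. */
    static void
    visit(struct rnode *n)
    {
        (void) n;
    }

    static void
    walk(struct rnode *head)
    {
        struct rnode *n, *nxt;

        pthread_mutex_lock(&list_lock);
        for (n = head; n != NULL; n = nxt) {
            hold(n);
            nxt = n->next;
            if (nxt != NULL)
                hold(nxt);      /* keep the resume point alive too */
            pthread_mutex_unlock(&list_lock);

            visit(n);           /* may block; the list may change */

            pthread_mutex_lock(&list_lock);
            rele(n);
            if (nxt != NULL)
                rele(nxt);
        }
        pthread_mutex_unlock(&list_lock);
    }
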
+ */ + xhat_hat_rele(xh); + if (xh_nxt != NULL) + xhat_hat_rele(xh_nxt); + xh = xh_nxt; + } + + mutex_exit(&as->a_contents); +} + + + +void +xhat_unload_callback_all(struct as *as, caddr_t addr, size_t len, uint_t flags, + hat_callback_t *callback) +{ + xhat_op_all(XHAT_UNLOAD_CALLBACK_OP, as, addr, len, flags, callback); +} + + +void +xhat_setattr_all(struct as *as, caddr_t addr, size_t len, uint_t attr) +{ + xhat_op_all(XHAT_SETATTR_OP, as, addr, len, attr, NULL); +} + + + +void +xhat_clrattr_all(struct as *as, caddr_t addr, size_t len, uint_t attr) +{ + xhat_op_all(XHAT_CLRATTR_OP, as, addr, len, attr, NULL); +} + + +void +xhat_chgattr_all(struct as *as, caddr_t addr, size_t len, uint_t attr) +{ + xhat_op_all(XHAT_CHGATTR_OP, as, addr, len, attr, NULL); +} + + +void +xhat_chgprot_all(struct as *as, caddr_t addr, size_t len, uint_t prot) +{ + xhat_op_all(XHAT_CHGPROT_OP, as, addr, len, prot, NULL); +} + + +void +xhat_unshare_all(struct as *as, caddr_t addr, size_t len) +{ + xhat_op_all(XHAT_UNSHARE_OP, as, addr, len, 0, NULL); +} diff --git a/usr/src/uts/common/vm/xhat.h b/usr/src/uts/common/vm/xhat.h new file mode 100644 index 0000000000..808262f2c9 --- /dev/null +++ b/usr/src/uts/common/vm/xhat.h @@ -0,0 +1,208 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _VM_XHAT_H +#define _VM_XHAT_H + +#pragma ident "%Z%%M% %I% %E% SMI" + + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef _ASM + +#include <sys/types.h> +#include <vm/page.h> +#include <sys/kmem.h> + +struct xhat; +struct xhat_hme_blk; + +struct xhat_ops { + struct xhat *(*xhat_alloc)(void *); + void (*xhat_free)(struct xhat *); + void (*xhat_free_start)(struct xhat *); + void (*xhat_free_end)(struct xhat *); + int (*xhat_dup)(struct xhat *, struct xhat *, caddr_t, + size_t, uint_t); + void (*xhat_swapin)(struct xhat *); + void (*xhat_swapout)(struct xhat *); + void (*xhat_memload)(struct xhat *, caddr_t, struct page *, + uint_t, uint_t); + void (*xhat_memload_array)(struct xhat *, caddr_t, size_t, + struct page **, uint_t, uint_t); + void (*xhat_devload)(struct xhat *, caddr_t, size_t, pfn_t, + uint_t, int); + void (*xhat_unload)(struct xhat *, caddr_t, size_t, uint_t); + void (*xhat_unload_callback)(struct xhat *, caddr_t, size_t, + uint_t, hat_callback_t *); + void (*xhat_setattr)(struct xhat *, caddr_t, size_t, uint_t); + void (*xhat_clrattr)(struct xhat *, caddr_t, size_t, uint_t); + void (*xhat_chgattr)(struct xhat *, caddr_t, size_t, uint_t); + void (*xhat_unshare)(struct xhat *, caddr_t, size_t); + void (*xhat_chgprot)(struct xhat *, caddr_t, size_t, uint_t); + int (*xhat_pageunload)(struct xhat *, struct page *, uint_t, + void *); +}; + + +#define XHAT_POPS(_p) (_p)->xhat_provider_ops +#define XHAT_PROPS(_h) XHAT_POPS(((struct xhat *)(_h))->xhat_provider) +#define XHAT_HOPS(hat, func, args) \ + { \ + if (XHAT_PROPS(hat)-> /* */ func) \ + XHAT_PROPS(hat)-> /* */ func /* */ args; \ + } + +#define XHAT_FREE_START(a) \ + XHAT_HOPS(a, xhat_free_start, ((struct xhat *)(a))) +#define XHAT_FREE_END(a) \ + XHAT_HOPS(a, xhat_free_end, ((struct xhat *)(a))) +#define XHAT_DUP(a, b, c, d, e) \ + ((XHAT_PROPS(a)->xhat_dup == NULL) ? (0) : \ + XHAT_PROPS(a)->xhat_dup((struct xhat *)(a), \ + (struct xhat *)(b), c, d, e)) +#define XHAT_SWAPIN(a) \ + XHAT_HOPS(a, xhat_swapin, ((struct xhat *)(a))) +#define XHAT_SWAPOUT(a) \ + XHAT_HOPS(a, xhat_swapout, ((struct xhat *)(a))) +#define XHAT_MEMLOAD(a, b, c, d, e) \ + XHAT_HOPS(a, xhat_memload, ((struct xhat *)(a), b, c, d, e)) +#define XHAT_MEMLOAD_ARRAY(a, b, c, d, e, f) \ + XHAT_HOPS(a, xhat_memload_array, ((struct xhat *)(a), b, c, d, e, f)) +#define XHAT_DEVLOAD(a, b, c, d, e, f) \ + XHAT_HOPS(a, xhat_devload, ((struct xhat *)(a), b, c, d, e, f)) +#define XHAT_UNLOAD(a, b, c, d) \ + XHAT_HOPS(a, xhat_unload, ((struct xhat *)(a), b, c, d)) +#define XHAT_UNLOAD_CALLBACK(a, b, c, d, e) \ + XHAT_HOPS(a, xhat_unload_callback, ((struct xhat *)(a), b, c, d, e)) +#define XHAT_SETATTR(a, b, c, d) \ + XHAT_HOPS(a, xhat_setattr, ((struct xhat *)(a), b, c, d)) +#define XHAT_CLRATTR(a, b, c, d) \ + XHAT_HOPS(a, xhat_clrattr, ((struct xhat *)(a), b, c, d)) +#define XHAT_CHGATTR(a, b, c, d) \ + XHAT_HOPS(a, xhat_chgattr, ((struct xhat *)(a), b, c, d)) +#define XHAT_UNSHARE(a, b, c) \ + XHAT_HOPS(a, xhat_unshare, ((struct xhat *)(a), b, c)) +#define XHAT_CHGPROT(a, b, c, d) \ + XHAT_HOPS(a, xhat_chgprot, ((struct xhat *)(a), b, c, d)) +#define XHAT_PAGEUNLOAD(a, b, c, d) \ + ((XHAT_PROPS(a)->xhat_pageunload == NULL) ? (0) : \ + XHAT_PROPS(a)->xhat_pageunload((struct xhat *)(a), b, c, d)) + + + +#define XHAT_PROVIDER_VERSION 1 + +/* + * Provider name will be appended with "_cache" + * when initializing kmem cache. 
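
The XHAT_HOPS()-style wrappers above implement optional-operation dispatch: apart from xhat_alloc and xhat_free (checked at registration time), a provider may leave any entry in xhat_ops NULL and the corresponding wrapper quietly does nothing. A condensed model of that convention (the table and macro below are illustrative, not the kernel's):

    #include <stddef.h>

    struct ops {
        void (*op_setattr)(void *hdl, unsigned attr);   /* optional */
        void *(*op_alloc)(void *arg);                   /* mandatory */
    };

    /* Invoke an optional op only if the provider supplied one. */
    #define CALL_OPTIONAL(opsp, fn, ...)            \
        do {                                        \
            if ((opsp)->fn != NULL)                 \
                (opsp)->fn(__VA_ARGS__);            \
        } while (0)

    static void
    set_attr(const struct ops *o, void *hdl, unsigned attr)
    {
        CALL_OPTIONAL(o, op_setattr, hdl, attr);
    }
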
+ * The resulting sring must be less than + * KMEM_CACHE_NAMELEN + */ +#define XHAT_CACHE_NAMELEN 24 + +typedef struct xblk_cache { + kmutex_t lock; + kmem_cache_t *cache; + void *free_blks; + void (*reclaim)(void *); +} xblk_cache_t; + +typedef struct xhat_provider { + int xhat_provider_version; + int xhat_provider_refcnt; + struct xhat_provider *next; + struct xhat_provider *prev; + char xhat_provider_name[XHAT_CACHE_NAMELEN]; + xblk_cache_t *xblkcache; + struct xhat_ops *xhat_provider_ops; + int xhat_provider_blk_size; +} xhat_provider_t; + +/* + * The xhat structure is protected by xhat_lock. + * A particular xhat implementation is a extension of the + * xhat structure and may contain its own lock(s) to + * protect those additional fields. + * The xhat structure is never allocated directly. + * Instead its allocation is provided by the hat implementation. + * The xhat provider ops xhat_alloc/xhat_free are used to + * alloc/free a implementation dependant xhat structure. + */ +struct xhat { + xhat_provider_t *xhat_provider; + struct as *xhat_as; + void *arg; + struct xhat *prev; + struct xhat *next; + kmutex_t xhat_lock; + int xhat_refcnt; + kthread_t *holder; +}; + + +/* Error codes */ +#define XH_PRVDR (1) /* Provider-specific error */ +#define XH_ASBUSY (2) /* Address space is busy */ +#define XH_XHHELD (3) /* XHAT is being held */ +#define XH_NOTATTCHD (4) /* Provider is not attached to as */ + + +int xhat_provider_register(xhat_provider_t *); +int xhat_provider_unregister(xhat_provider_t *); +void xhat_init(void); +int xhat_attach_xhat(xhat_provider_t *, struct as *, struct xhat **, + void *); +int xhat_detach_xhat(xhat_provider_t *, struct as *); +pfn_t xhat_insert_xhatblk(page_t *, struct xhat *, void **); +int xhat_delete_xhatblk(void *, int); +void xhat_hat_hold(struct xhat *); +void xhat_hat_rele(struct xhat *); +int xhat_hat_holders(struct xhat *); + +void xhat_free_start_all(struct as *); +void xhat_free_end_all(struct as *); +int xhat_dup_all(struct as *, struct as *, caddr_t, size_t, uint_t); +void xhat_swapout_all(struct as *); +void xhat_unload_callback_all(struct as *, caddr_t, size_t, uint_t, + hat_callback_t *); +void xhat_setattr_all(struct as *, caddr_t, size_t, uint_t); +void xhat_clrattr_all(struct as *, caddr_t, size_t, uint_t); +void xhat_chgattr_all(struct as *, caddr_t, size_t, uint_t); +void xhat_chgprot_all(struct as *, caddr_t, size_t, uint_t); +void xhat_unshare_all(struct as *, caddr_t, size_t); + + +#endif /* _ASM */ + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_XHAT_H */ |
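
Taken together, the header above defines everything a provider must supply. The sketch below shows roughly how the smallest possible provider could be wired up against these declarations; it is illustrative only. The embedding structure my_xhat, the no-op reclaim callback, and the my_provider_init() hook are assumptions rather than code from this commit, and a real provider would also populate the remaining xhat_ops entries it cares about.

    #include <sys/types.h>
    #include <sys/kmem.h>
    #include <vm/xhat.h>

    /* Provider-private XHAT; struct xhat is embedded first so casts work. */
    struct my_xhat {
        struct xhat mx_xhat;
        void *mx_private;
    };

    static struct xhat *
    my_alloc(void *arg)
    {
        return (kmem_zalloc(sizeof (struct my_xhat), KM_SLEEP));
    }

    static void
    my_free(struct xhat *xh)
    {
        kmem_free(xh, sizeof (struct my_xhat));
    }

    static void
    my_reclaim(void *arg)
    {
        /* nothing cached outside the kmem cache in this sketch */
    }

    static struct xhat_ops my_ops = {
        .xhat_alloc = my_alloc,         /* mandatory */
        .xhat_free = my_free,           /* mandatory */
    };

    static xblk_cache_t my_xblkcache = {
        .reclaim = my_reclaim,
    };

    static xhat_provider_t my_provider = {
        .xhat_provider_version = XHAT_PROVIDER_VERSION,
        .xhat_provider_name = "my_xhat",
        .xblkcache = &my_xblkcache,
        .xhat_provider_ops = &my_ops,
        .xhat_provider_blk_size = sizeof (struct my_xhat),
    };

    int
    my_provider_init(void)
    {
        /* 0 on success, -1 on version mismatch or malformed ops */
        return (xhat_provider_register(&my_provider));
    }

Once registered, a consumer would attach an instance to an address space with xhat_attach_xhat(&my_provider, as, &xh, arg), where xh is a struct xhat pointer filled in on success, and eventually tear it down with xhat_detach_xhat(&my_provider, as), honoring the XH_ASBUSY and XH_XHHELD return codes declared above.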