commit    7c478bd95313f5f23a4c958a745db2134aa03244 (patch)
tree      c871e58545497667cbb4b0a4f2daf204743e1fe7 /usr/src/uts/common/vm
author    stevel@tonic-gate <none@none>  2005-06-14 00:00:00 -0700
committer stevel@tonic-gate <none@none>  2005-06-14 00:00:00 -0700
OpenSolaris Launch
Diffstat (limited to 'usr/src/uts/common/vm')
-rw-r--r--  usr/src/uts/common/vm/Makefile         55
-rw-r--r--  usr/src/uts/common/vm/anon.h          461
-rw-r--r--  usr/src/uts/common/vm/as.h            290
-rw-r--r--  usr/src/uts/common/vm/faultcode.h      76
-rw-r--r--  usr/src/uts/common/vm/hat.c           149
-rw-r--r--  usr/src/uts/common/vm/hat.h           598
-rw-r--r--  usr/src/uts/common/vm/hat_refmod.c    544
-rw-r--r--  usr/src/uts/common/vm/kpm.h            57
-rw-r--r--  usr/src/uts/common/vm/page.h         1006
-rw-r--r--  usr/src/uts/common/vm/page_lock.c     861
-rw-r--r--  usr/src/uts/common/vm/pvn.h           117
-rw-r--r--  usr/src/uts/common/vm/rm.h             61
-rw-r--r--  usr/src/uts/common/vm/seg.h           252
-rw-r--r--  usr/src/uts/common/vm/seg_dev.c      4073
-rw-r--r--  usr/src/uts/common/vm/seg_dev.h       131
-rw-r--r--  usr/src/uts/common/vm/seg_enum.h       85
-rw-r--r--  usr/src/uts/common/vm/seg_kmem.c     1516
-rw-r--r--  usr/src/uts/common/vm/seg_kmem.h      129
-rw-r--r--  usr/src/uts/common/vm/seg_kp.c       1444
-rw-r--r--  usr/src/uts/common/vm/seg_kp.h        165
-rw-r--r--  usr/src/uts/common/vm/seg_kpm.c       323
-rw-r--r--  usr/src/uts/common/vm/seg_kpm.h       118
-rw-r--r--  usr/src/uts/common/vm/seg_map.c      2345
-rw-r--r--  usr/src/uts/common/vm/seg_map.h       294
-rw-r--r--  usr/src/uts/common/vm/seg_spt.c      2701
-rw-r--r--  usr/src/uts/common/vm/seg_spt.h       155
-rw-r--r--  usr/src/uts/common/vm/seg_vn.c       7745
-rw-r--r--  usr/src/uts/common/vm/seg_vn.h        168
-rw-r--r--  usr/src/uts/common/vm/vm_anon.c      3197
-rw-r--r--  usr/src/uts/common/vm/vm_as.c        2898
-rw-r--r--  usr/src/uts/common/vm/vm_page.c      6708
-rw-r--r--  usr/src/uts/common/vm/vm_pagelist.c  3726
-rw-r--r--  usr/src/uts/common/vm/vm_pvn.c       1147
-rw-r--r--  usr/src/uts/common/vm/vm_rm.c         189
-rw-r--r--  usr/src/uts/common/vm/vm_seg.c        952
-rw-r--r--  usr/src/uts/common/vm/vm_swap.c      1590
-rw-r--r--  usr/src/uts/common/vm/vpage.h          86
-rw-r--r--  usr/src/uts/common/vm/xhat.c          555
-rw-r--r--  usr/src/uts/common/vm/xhat.h          208
39 files changed, 47175 insertions, 0 deletions
diff --git a/usr/src/uts/common/vm/Makefile b/usr/src/uts/common/vm/Makefile
new file mode 100644
index 0000000000..fcd6582985
--- /dev/null
+++ b/usr/src/uts/common/vm/Makefile
@@ -0,0 +1,55 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License"). You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2003 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+#ident "%Z%%M% %I% %E% SMI"
+#
+
+# include global definitions
+include ../../../Makefile.master
+
+HDRS= anon.h as.h faultcode.h hat.h kpm.h page.h pvn.h rm.h seg.h vpage.h \
+ seg_dev.h seg_enum.h seg_kmem.h seg_kp.h seg_kpm.h seg_map.h \
+ seg_vn.h seg_spt.h
+
+ROOTDIRS= $(ROOT)/usr/include/vm
+
+ROOTHDRS= $(HDRS:%=$(ROOTDIRS)/%)
+
+CHECKHDRS= $(HDRS:%.h=%.check)
+
+# install rule
+$(ROOTDIRS)/%: %
+ $(INS.file)
+
+.KEEP_STATE:
+
+.PARALLEL: $(CHECKHDRS)
+
+install_h: $(ROOTDIRS) $(ROOTHDRS)
+
+$(ROOTDIRS):
+ $(INS.dir)
+
+check: $(CHECKHDRS)
diff --git a/usr/src/uts/common/vm/anon.h b/usr/src/uts/common/vm/anon.h
new file mode 100644
index 0000000000..466b939a75
--- /dev/null
+++ b/usr/src/uts/common/vm/anon.h
@@ -0,0 +1,461 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+#ifndef _VM_ANON_H
+#define _VM_ANON_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/cred.h>
+#include <vm/seg.h>
+#include <vm/vpage.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * VM - Anonymous pages.
+ */
+
+typedef unsigned long anoff_t; /* anon offsets */
+
+/*
+ * Each anonymous page, either in memory or in swap, has an anon structure.
+ * The structure (slot) provides a level of indirection between anonymous pages
+ * and their backing store.
+ *
+ * (an_vp, an_off) names the vnode of the anonymous page for this slot.
+ *
+ * (an_pvp, an_poff) names the location of the physical backing store
+ * for the page this slot represents. If the name is null there is no
+ * associated physical store. The physical backing store location can
+ * change while the slot is in use.
+ *
+ * an_hash is a hash list of anon slots. The list is hashed by
+ * (an_vp, an_off) of the associated anonymous page and provides a
+ * method of going from the name of an anonymous page to its
+ * associated anon slot.
+ *
+ * an_refcnt holds a reference count which is the number of separate
+ * copies that will need to be created in case of copy-on-write.
+ * A refcnt > 0 protects the existence of the slot. The refcnt is
+ * initialized to 1 when the anon slot is created in anon_alloc().
+ * If a client obtains an anon slot and allows multiple threads to
+ * share it, then it is the client's responsibility to ensure that
+ * it does not allow one thread to try to reference the slot at the
+ * same time as another is trying to decrement the last count and
+ * destroy the anon slot. E.g., the seg_vn segment type protects
+ * against this with higher level locks.
+ */
+
+struct anon {
+ struct vnode *an_vp; /* vnode of anon page */
+ struct vnode *an_pvp; /* vnode of physical backing store */
+ anoff_t an_off; /* offset of anon page */
+ anoff_t an_poff; /* offset in vnode */
+ struct anon *an_hash; /* hash table of anon slots */
+ int an_refcnt; /* # of people sharing slot */
+};
+
+#ifdef _KERNEL
+/*
+ * The swapinfo_lock protects:
+ * swapinfo list
+ * individual swapinfo structures
+ *
+ * The anoninfo_lock protects:
+ * anoninfo counters
+ *
+ * The anonhash_lock protects:
+ * anon hash lists
+ * anon slot fields
+ *
+ * Fields in the anon slot which are read-only for the life of the slot
+ * (an_vp, an_off) do not require the anonhash_lock be held to access them.
+ * If you access a field without the anonhash_lock held you must be holding
+ * the slot with an_refcnt to make sure it isn't destroyed.
+ * To write (an_pvp, an_poff) in a given slot you must also hold the
+ * p_iolock of the anonymous page for the slot.
+ */
+extern kmutex_t anoninfo_lock;
+extern kmutex_t swapinfo_lock;
+extern kmutex_t anonhash_lock[];
+extern pad_mutex_t anon_array_lock[];
+extern kcondvar_t anon_array_cv[];
+
+/*
+ * Global hash table to provide a function from (vp, off) -> ap
+ */
+extern size_t anon_hash_size;
+extern struct anon **anon_hash;
+#define ANON_HASH_SIZE anon_hash_size
+#define ANON_HASHAVELEN 4
+#define ANON_HASH(VP, OFF) \
+((((uintptr_t)(VP) >> 7) ^ ((OFF) >> PAGESHIFT)) & (ANON_HASH_SIZE - 1))
+
+#define AH_LOCK_SIZE 64
+#define AH_LOCK(vp, off) (ANON_HASH((vp), (off)) & (AH_LOCK_SIZE -1))
+
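
A minimal sketch of how ANON_HASH() and AH_LOCK() compose when resolving a (vp, off) name to its anon slot, assuming kernel context and a hypothetical helper name; the anon/swap layer performs the equivalent lookup internally:

/* Hypothetical helper showing the hash/lock composition. */
static struct anon *
anon_hash_lookup_sketch(struct vnode *vp, anoff_t off)
{
	struct anon *ap;
	kmutex_t *ahm = &anonhash_lock[AH_LOCK(vp, off)];

	mutex_enter(ahm);		/* protects this hash chain */
	for (ap = anon_hash[ANON_HASH(vp, off)]; ap != NULL;
	    ap = ap->an_hash) {
		if (ap->an_vp == vp && ap->an_off == off)
			break;		/* caller still needs an an_refcnt hold */
	}
	mutex_exit(ahm);
	return (ap);
}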
+#endif /* _KERNEL */
+
+/*
+ * Declarations of the global counters used to accurately
+ * track the kernel footprint in memory.
+ */
+extern pgcnt_t segvn_pages_locked;
+extern pgcnt_t pages_locked;
+extern pgcnt_t pages_claimed;
+extern pgcnt_t pages_useclaim;
+extern pgcnt_t obp_pages;
+
+/*
+ * Anonymous backing store accounting structure for swapctl.
+ *
+ * ani_max = maximum amount of swap space
+ * (including potentially available physical memory)
+ * ani_free = amount of unallocated anonymous memory
+ * (some of which might be reserved and including
+ * potentially available physical memory)
+ * ani_resv = amount of claimed (reserved) anonymous memory
+ *
+ * The swap data can be acquired more efficiently through the
+ * kstats interface.
+ * Total slots currently available for reservation =
+ * MAX(ani_max - ani_resv, 0) + (availrmem - swapfs_minfree)
+ */
+struct anoninfo {
+ pgcnt_t ani_max;
+ pgcnt_t ani_free;
+ pgcnt_t ani_resv;
+};
+
+#ifdef _SYSCALL32
+struct anoninfo32 {
+ size32_t ani_max;
+ size32_t ani_free;
+ size32_t ani_resv;
+};
+#endif /* _SYSCALL32 */
+
+/*
+ * Define the NCPU pool of the ani_free counters. Update the counter
+ * of the cpu on which the thread is running and in every clock intr
+ * sync anoninfo.ani_free with the current total of all the NCPU entries.
+ */
+
+typedef struct ani_free {
+ kmutex_t ani_lock;
+ pgcnt_t ani_count;
+ uchar_t pad[64 - sizeof (kmutex_t) - sizeof (pgcnt_t)];
+ /* XXX 64 = cacheline size */
+} ani_free_t;
+
+#define ANI_MAX_POOL 128
+extern ani_free_t ani_free_pool[];
+
+#define ANI_ADD(inc) { \
+ ani_free_t *anifp; \
+ int index; \
+ index = (CPU->cpu_id & (ANI_MAX_POOL - 1)); \
+ anifp = &ani_free_pool[index]; \
+ mutex_enter(&anifp->ani_lock); \
+ anifp->ani_count += inc; \
+ mutex_exit(&anifp->ani_lock); \
+}
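
A sketch of the clock-side folding described above, assuming a global struct anoninfo anoninfo defined in the anon layer's C code (an assumption, it is not declared in this header) and treating the unlocked reads as an acceptable approximate snapshot:

extern struct anoninfo anoninfo;	/* assumed global, defined elsewhere */

/* Hypothetical sync: fold ani_free_pool[] deltas into anoninfo.ani_free. */
static void
ani_free_sync_sketch(void)
{
	pgcnt_t total = 0;
	int i;

	for (i = 0; i < ANI_MAX_POOL; i++)
		total += ani_free_pool[i].ani_count;	/* approximate read */

	anoninfo.ani_free = total;
}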
+
+/*
+ * Anon array pointers are allocated in chunks. Each chunk
+ * has PAGESIZE/sizeof(u_long *) of anon pointers.
+ * There are two levels of arrays for anon array pointers larger
+ * than a chunk. The first level points to anon array chunks.
+ * The second level consists of chunks of anon pointers.
+ *
+ * If the anon array is smaller than a chunk, the whole anon array
+ * is created (memory is allocated for the whole array).
+ * If the anon array is larger than a chunk, only the first level array
+ * is allocated. The other arrays (chunks) are then allocated only when
+ * they are initialized with anon pointers.
+ */
+struct anon_hdr {
+ kmutex_t serial_lock; /* serialize array chunk allocation */
+ pgcnt_t size; /* number of pointers to (anon) pages */
+ void **array_chunk; /* pointers to anon pointers or chunks of */
+ /* anon pointers */
+ int flags; /* ANON_ALLOC_FORCE force preallocation of */
+ /* whole anon array */
+};
+
+#ifdef _LP64
+#define ANON_PTRSHIFT 3
+#define ANON_PTRMASK ~7
+#else
+#define ANON_PTRSHIFT 2
+#define ANON_PTRMASK ~3
+#endif
+
+#define ANON_CHUNK_SIZE (PAGESIZE >> ANON_PTRSHIFT)
+#define ANON_CHUNK_SHIFT (PAGESHIFT - ANON_PTRSHIFT)
+#define ANON_CHUNK_OFF (ANON_CHUNK_SIZE - 1)
+
+/*
+ * Anon flags.
+ */
+#define ANON_SLEEP 0x0 /* ok to block */
+#define ANON_NOSLEEP 0x1 /* non-blocking call */
+#define ANON_ALLOC_FORCE 0x2 /* force single level anon array */
+#define ANON_GROWDOWN 0x4 /* anon array should grow downward */
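
A sketch of how an anon index resolves against the one- or two-level layout just described, using a hypothetical helper name; anon_get_ptr(), declared further below, is the real interface, and the ANON_PTRMASK masking reflects flag bits kept in the low pointer bits:

/* Hypothetical index resolution against struct anon_hdr. */
static struct anon *
anon_index_sketch(struct anon_hdr *ahp, ulong_t an_idx)
{
	void **chunk;

	if (ahp->size <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) {
		/* single level: array_chunk is the anon pointer array */
		return ((struct anon *)
		    ((uintptr_t)ahp->array_chunk[an_idx] & ANON_PTRMASK));
	}

	/* two level: first level entries point to chunks of anon pointers */
	chunk = ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
	if (chunk == NULL)
		return (NULL);
	return ((struct anon *)
	    ((uintptr_t)chunk[an_idx & ANON_CHUNK_OFF] & ANON_PTRMASK));
}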
+
+/*
+ * The anon_map structure is used by various clients of the anon layer to
+ * manage anonymous memory. When anonymous memory is shared,
+ * then the different clients sharing it will point to the
+ * same anon_map structure. Also, if a segment is unmapped
+ * in the middle where an anon_map structure exists, the
+ * newly created segment will also share the anon_map structure,
+ * although the two segments will use different ranges of the
+ * anon array. When mappings are private (or shared with
+ * a reference count of 1), an unmap operation will free up
+ * a range of anon slots in the array given by the anon_map
+ * structure. Because of fragmentation due to this unmapping,
+ * we have to store the size of the anon array in the anon_map
+ * structure so that we can free everything when the reference
+ * count goes to zero.
+ *
+ * A new rangelock scheme is introduced to make the anon layer scale.
+ * A reader/writer lock per anon_amp and an array of system-wide hash
+ * locks, anon_array_lock[] are introduced to replace serial_lock and
+ * anonmap lock. The writer lock is held when we want to single-thread
+ * references to the anon array pointers or to the anon_map's members,
+ * whereas the reader lock and anon_array_lock are held to allow
+ * multiple threads to reference different parts of the anon array.
+ * A global set of condition variables, anon_array_cv,
+ * are used with anon_array_lock[] to make the hold time of the locks
+ * short.
+ *
+ * szc is used to calculate the index of hash locks and cv's. We
+ * could've just used seg->s_szc if not for the possible sharing of
+ * anon_amp between SYSV shared memory and ISM, so now we introduce
+ * szc in the anon_map structure. For MAP_SHARED, the amp->szc is either
+ * 0 (base page size) or page_num_pagesizes() - 1, while for MAP_PRIVATE
+ * the amp->szc could be anything in [0, page_num_pagesizes() - 1].
+ */
+struct anon_map {
+ krwlock_t a_rwlock; /* protect anon_map and anon array */
+ size_t size; /* size in bytes mapped by the anon array */
+ struct anon_hdr *ahp; /* anon array header pointer, containing */
+ /* anon pointer array(s) */
+ size_t swresv; /* swap space reserved for this anon_map */
+ uint_t refcnt; /* reference count on this structure */
+ ushort_t a_szc; /* max szc among shared processes */
+ void *locality; /* lgroup locality info */
+};
+
+#ifdef _KERNEL
+
+#define ANON_BUSY 0x1
+#define ANON_ISBUSY(slot) (*(slot) & ANON_BUSY)
+#define ANON_SETBUSY(slot) (*(slot) |= ANON_BUSY)
+#define ANON_CLRBUSY(slot) (*(slot) &= ~ANON_BUSY)
+
+#define ANON_MAP_SHIFT 6 /* log2(sizeof (struct anon_map)) */
+#define ANON_ARRAY_SHIFT 7 /* log2(ANON_LOCKSIZE) */
+#define ANON_LOCKSIZE 128
+
+#define ANON_LOCK_ENTER(lock, type) rw_enter((lock), (type))
+#define ANON_LOCK_EXIT(lock) rw_exit((lock))
+
+#define ANON_ARRAY_HASH(amp, idx)\
+ ((((idx) + ((idx) >> ANON_ARRAY_SHIFT) +\
+ ((idx) >> (ANON_ARRAY_SHIFT << 1)) +\
+ ((idx) >> (ANON_ARRAY_SHIFT + (ANON_ARRAY_SHIFT << 1)))) ^\
+ ((uintptr_t)(amp) >> ANON_MAP_SHIFT)) & (ANON_LOCKSIZE - 1))
+
+typedef struct anon_sync_obj {
+ kmutex_t *sync_mutex;
+ kcondvar_t *sync_cv;
+ ulong_t *sync_data;
+} anon_sync_obj_t;
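
A sketch of the rangelock pattern described above for a client examining one slot of a (possibly shared) anon_map, assuming a hypothetical caller and the interfaces declared later in this header:

/* Hypothetical lookup under the reader lock plus per-slot hash lock. */
static struct anon *
anon_slot_lookup_sketch(struct anon_map *amp, ulong_t an_idx)
{
	anon_sync_obj_t cookie;
	struct anon *ap;

	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
	anon_array_enter(amp, an_idx, &cookie);	/* hashed anon_array_lock[] */
	ap = anon_get_ptr(amp->ahp, an_idx);
	anon_array_exit(&cookie);
	ANON_LOCK_EXIT(&amp->a_rwlock);

	return (ap);
}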
+
+/*
+ * Anonymous backing store accounting structure for kernel.
+ * ani_max = total reservable slots on physical (disk-backed) swap
+ * ani_phys_resv = total phys slots reserved for use by clients
+ * ani_mem_resv = total mem slots reserved for use by clients
+ * ani_free = # unallocated physical slots + # of reserved unallocated
+ * memory slots
+ */
+
+/*
+ * Initial total swap slots available for reservation
+ */
+#define TOTAL_AVAILABLE_SWAP \
+ (k_anoninfo.ani_max + MAX((spgcnt_t)(availrmem - swapfs_minfree), 0))
+
+/*
+ * Swap slots currently available for reservation
+ */
+#define CURRENT_TOTAL_AVAILABLE_SWAP \
+ ((k_anoninfo.ani_max - k_anoninfo.ani_phys_resv) + \
+ MAX((spgcnt_t)(availrmem - swapfs_minfree), 0))
+
+struct k_anoninfo {
+ pgcnt_t ani_max; /* total reservable slots on phys */
+ /* (disk) swap */
+ pgcnt_t ani_free; /* # of unallocated phys and mem slots */
+ pgcnt_t ani_phys_resv; /* # of reserved phys (disk) slots */
+ pgcnt_t ani_mem_resv; /* # of reserved mem slots */
+ pgcnt_t ani_locked_swap; /* # of swap slots locked in reserved */
+ /* mem swap */
+};
+
+extern struct k_anoninfo k_anoninfo;
+
+extern void anon_init(void);
+extern struct anon *anon_alloc(struct vnode *, anoff_t);
+extern void anon_dup(struct anon_hdr *, ulong_t,
+ struct anon_hdr *, ulong_t, size_t);
+extern void anon_dup_fill_holes(struct anon_hdr *, ulong_t,
+ struct anon_hdr *, ulong_t, size_t, uint_t, int);
+extern int anon_fill_cow_holes(struct seg *, caddr_t, struct anon_hdr *,
+ ulong_t, struct vnode *, u_offset_t, size_t, uint_t,
+ uint_t, struct vpage [], struct cred *);
+extern void anon_free(struct anon_hdr *, ulong_t, size_t);
+extern void anon_free_pages(struct anon_hdr *, ulong_t, size_t, uint_t);
+extern void anon_disclaim(struct anon_map *, ulong_t, size_t, int);
+extern int anon_getpage(struct anon **, uint_t *, struct page **,
+ size_t, struct seg *, caddr_t, enum seg_rw, struct cred *);
+extern int swap_getconpage(struct vnode *, u_offset_t, size_t,
+ uint_t *, page_t *[], size_t, page_t *,
+ spgcnt_t *, struct seg *, caddr_t,
+ enum seg_rw, struct cred *);
+extern int anon_map_getpages(struct anon_map *, ulong_t,
+ uint_t, struct seg *, caddr_t, uint_t,
+ uint_t *, page_t *[], uint_t *,
+ struct vpage [], enum seg_rw, int, int, struct cred *);
+extern int anon_map_privatepages(struct anon_map *, ulong_t,
+ uint_t, struct seg *, caddr_t, uint_t,
+ page_t *[], struct vpage [], int, struct cred *);
+extern struct page *anon_private(struct anon **, struct seg *,
+ caddr_t, uint_t, struct page *,
+ int, struct cred *);
+extern struct page *anon_zero(struct seg *, caddr_t,
+ struct anon **, struct cred *);
+extern int anon_map_createpages(struct anon_map *, ulong_t,
+ size_t, struct page **,
+ struct seg *, caddr_t,
+ enum seg_rw, struct cred *);
+extern int anon_map_demotepages(struct anon_map *, ulong_t,
+ struct seg *, caddr_t, uint_t,
+ struct vpage [], struct cred *);
+extern int anon_resvmem(size_t, uint_t);
+extern void anon_unresv(size_t);
+extern struct anon_map *anonmap_alloc(size_t, size_t);
+extern void anonmap_free(struct anon_map *);
+extern void anon_decref(struct anon *);
+extern int non_anon(struct anon_hdr *, ulong_t, u_offset_t *, size_t *);
+extern pgcnt_t anon_pages(struct anon_hdr *, ulong_t, pgcnt_t);
+extern int anon_swap_adjust(pgcnt_t);
+extern void anon_swap_restore(pgcnt_t);
+extern struct anon_hdr *anon_create(pgcnt_t, int);
+extern void anon_release(struct anon_hdr *, pgcnt_t);
+extern struct anon *anon_get_ptr(struct anon_hdr *, ulong_t);
+extern ulong_t *anon_get_slot(struct anon_hdr *, ulong_t);
+extern struct anon *anon_get_next_ptr(struct anon_hdr *, ulong_t *);
+extern int anon_set_ptr(struct anon_hdr *, ulong_t, struct anon *, int);
+extern int anon_copy_ptr(struct anon_hdr *, ulong_t,
+ struct anon_hdr *, ulong_t, pgcnt_t, int);
+extern pgcnt_t anon_grow(struct anon_hdr *, ulong_t *, pgcnt_t, pgcnt_t, int);
+extern void anon_array_enter(struct anon_map *, ulong_t,
+ anon_sync_obj_t *);
+extern void anon_array_exit(anon_sync_obj_t *);
+
+/*
+ * anon_resv checks to see if there is enough swap space to fulfill a
+ * request and if so, reserves the appropriate anonymous memory resources.
+ * anon_checkspace just checks to see if there is space to fulfill the request,
+ * without taking any resources. Both return 1 if successful and 0 if not.
+ */
+#define anon_resv(size) anon_resvmem((size), 1)
+#define anon_checkspace(size) anon_resvmem((size), 0)
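
A usage sketch of the reservation interfaces just described, assuming a hypothetical caller that reserves swap for len bytes of private anonymous memory and releases it at teardown:

/* Hypothetical reserve/release pairing around an anonymous mapping. */
static int
anon_reserve_sketch(size_t len)
{
	if (anon_resv(len) == 0)
		return (ENOMEM);	/* not enough swap + reservable memory */

	/* ... create and use the mapping ... */

	anon_unresv(len);		/* release at unmap/teardown */
	return (0);
}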
+
+/*
+ * Flags to anon_private
+ */
+#define STEAL_PAGE 0x1 /* page can be stolen */
+#define LOCK_PAGE 0x2 /* page must be ``logically'' locked */
+
+/*
+ * Flags to anon_disclaim
+ */
+#define ANON_PGLOOKUP_BLK 0x1 /* block on locked pages */
+
+/*
+ * SEGKP ANON pages that are locked are assumed to be LWP stack pages
+ * and thus count towards the user pages locked count.
+ * This value is protected by the same lock as availrmem.
+ */
+extern pgcnt_t anon_segkp_pages_locked;
+
+extern int anon_debug;
+
+#ifdef ANON_DEBUG
+
+#define A_ANON 0x01
+#define A_RESV 0x02
+#define A_MRESV 0x04
+
+/* vararg-like debugging macro. */
+#define ANON_PRINT(f, printf_args) \
+ if (anon_debug & f) \
+ printf printf_args
+
+#else /* ANON_DEBUG */
+
+#define ANON_PRINT(f, printf_args)
+
+#endif /* ANON_DEBUG */
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _VM_ANON_H */
diff --git a/usr/src/uts/common/vm/as.h b/usr/src/uts/common/vm/as.h
new file mode 100644
index 0000000000..c7afefc23c
--- /dev/null
+++ b/usr/src/uts/common/vm/as.h
@@ -0,0 +1,290 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+#ifndef _VM_AS_H
+#define _VM_AS_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/watchpoint.h>
+#include <vm/seg.h>
+#include <vm/faultcode.h>
+#include <vm/hat.h>
+#include <sys/avl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * VM - Address spaces.
+ */
+
+/*
+ * Each address space consists of a sorted list of segments
+ * and machine dependent address translation information.
+ *
+ * All the hard work is in the segment drivers and the
+ * hardware address translation code.
+ *
+ * The segment list is represented as an AVL tree.
+ *
+ * The address space lock (a_lock) is a long term lock which serializes
+ * access to certain operations (as_map, as_unmap) and protects the
+ * underlying generic segment data (seg.h) along with some fields in the
+ * address space structure as shown below:
+ *
+ * address space structure segment structure
+ *
+ * a_segtree s_base
+ * a_size s_size
+ * a_lastgap s_link
+ * a_seglast s_ops
+ * s_as
+ * s_data
+ *
+ * The address space contents lock (a_contents) is a short term
+ * lock that protects most of the data in the address space structure.
+ * This lock is always acquired after the "a_lock" in all situations
+ * except while dealing with AS_CLAIMGAP to avoid deadlocks.
+ *
+ * The following fields are protected by this lock:
+ *
+ * a_flags (AS_PAGLCK, AS_CLAIMGAP, etc.)
+ * a_unmapwait
+ * a_seglast
+ *
+ * The address space lock (a_lock) is always held prior to any segment
+ * operation. Some segment drivers use the address space lock to protect
+ * some or all of their segment private data, provided the version of
+ * "a_lock" (read vs. write) is consistent with the use of the data.
+ *
+ * The following fields are protected by the hat layer lock:
+ *
+ * a_vbits
+ * a_hat
+ * a_hrm
+ */
+
+struct as {
+ kmutex_t a_contents; /* protect certain fields in the structure */
+ uchar_t a_flags; /* as attributes */
+ uchar_t a_vbits; /* used for collecting statistics */
+ kcondvar_t a_cv; /* used by as_rangelock */
+ struct hat *a_hat; /* hat structure */
+ struct hrmstat *a_hrm; /* ref and mod bits */
+ caddr_t a_userlimit; /* highest allowable address in this as */
+ struct seg *a_seglast; /* last segment hit on the addr space */
+ krwlock_t a_lock; /* protects segment related fields */
+ size_t a_size; /* size of address space */
+ struct seg *a_lastgap; /* last seg found by as_gap() w/ AS_HI (mmap) */
+ struct seg *a_lastgaphl; /* last seg saved in as_gap() either for */
+ /* AS_HI or AS_LO used in as_addseg() */
+ avl_tree_t a_segtree; /* segments in this address space. (AVL tree) */
+ avl_tree_t a_wpage; /* watched pages (procfs) */
+ uchar_t a_updatedir; /* mappings changed, rebuild a_objectdir */
+ timespec_t a_updatetime; /* time when mappings last changed */
+ vnode_t **a_objectdir; /* object directory (procfs) */
+ size_t a_sizedir; /* size of object directory */
+ struct as_callback *a_callbacks; /* callback list */
+ void *a_xhat; /* list of xhat providers */
+};
+
+#define AS_PAGLCK 0x80
+#define AS_CLAIMGAP 0x40
+#define AS_UNMAPWAIT 0x20
+#define AS_NEEDSPURGE 0x10 /* mostly for seg_nf, see as_purge() */
+#define AS_BUSY 0x01 /* needed by XHAT framework */
+
+#define AS_ISPGLCK(as) ((as)->a_flags & AS_PAGLCK)
+#define AS_ISCLAIMGAP(as) ((as)->a_flags & AS_CLAIMGAP)
+#define AS_ISUNMAPWAIT(as) ((as)->a_flags & AS_UNMAPWAIT)
+#define AS_ISBUSY(as) ((as)->a_flags & AS_BUSY)
+
+
+#define AS_SETPGLCK(as) ((as)->a_flags |= AS_PAGLCK)
+#define AS_SETCLAIMGAP(as) ((as)->a_flags |= AS_CLAIMGAP)
+#define AS_SETUNMAPWAIT(as) ((as)->a_flags |= AS_UNMAPWAIT)
+#define AS_SETBUSY(as) ((as)->a_flags |= AS_BUSY)
+
+#define AS_CLRPGLCK(as) ((as)->a_flags &= ~AS_PAGLCK)
+#define AS_CLRCLAIMGAP(as) ((as)->a_flags &= ~AS_CLAIMGAP)
+#define AS_CLRUNMAPWAIT(as) ((as)->a_flags &= ~AS_UNMAPWAIT)
+#define AS_CLRBUSY(as) ((as)->a_flags &= ~AS_BUSY)
+
+#define AS_TYPE_64BIT(as) \
+ (((as)->a_userlimit > (caddr_t)UINT32_MAX) ? 1 : 0)
+
+/*
+ * The as_callback is the basic structure which supports the ability to
+ * inform clients of specific events pertaining to address space management.
+ * A user calls as_add_callback to register an address space callback
+ * for a range of pages, specifying the events that need to occur.
+ * When as_do_callbacks is called and finds a 'matching' entry, the
+ * callback is called once, and the callback function MUST call
+ * as_delete_callback when all callback activities are complete.
+ * The thread calling as_do_callbacks blocks until the as_delete_callback
+ * is called. This allows asynchronous events to subside before the
+ * as_do_callbacks thread continues.
+ *
+ * An example of the need for this is a driver which has done long-term
+ * locking of memory. Address space management operations (events) such
+ * as as_free, as_unmap, and as_setprot will block indefinitely until the
+ * pertinent memory is unlocked. The callback mechanism provides the
+ * way to inform the driver of the event so that the driver may do the
+ * necessary unlocking.
+ *
+ * The contents of this structure are protected by the a_contents lock
+ */
+typedef void (*callback_func_t)(struct as *, void *, uint_t);
+struct as_callback {
+ struct as_callback *ascb_next; /* list link */
+ uint_t ascb_events; /* event types */
+ callback_func_t ascb_func; /* callback function */
+ void *ascb_arg; /* callback argument */
+ caddr_t ascb_saddr; /* start address */
+ size_t ascb_len; /* address range */
+};
+/*
+ * Callback events
+ */
+#define AS_FREE_EVENT 0x1
+#define AS_SETPROT_EVENT 0x2
+#define AS_UNMAP_EVENT 0x4
+#define AS_CALLBACK_CALLED ((uint_t)(1U << (8 * sizeof (uint_t) - 1U)))
+#define AS_UNMAPWAIT_EVENT \
+ (AS_FREE_EVENT | AS_SETPROT_EVENT | AS_UNMAP_EVENT)
+#define AS_ALL_EVENT \
+ (AS_FREE_EVENT | AS_SETPROT_EVENT | AS_UNMAP_EVENT)
+
+
+/* Return code values for as_callback_delete */
+enum as_cbdelete_rc {
+ AS_CALLBACK_DELETED,
+ AS_CALLBACK_NOTFOUND,
+ AS_CALLBACK_DELETE_DEFERRED
+};
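
A sketch of the callback protocol described above, assuming a hypothetical driver (xx_ names are placeholders) that long-term locks [addr, addr + len) and registers for the unmap-wait events; treating the last as_add_callback() argument as a kmem sleep flag is an assumption of this sketch:

extern void xx_unlock_memory(void *);	/* hypothetical driver routine */

/* Hypothetical callback: unpin the range, then satisfy the protocol. */
static void
xx_as_event(struct as *as, void *arg, uint_t event)
{
	xx_unlock_memory(arg);			/* release the long-term locks */
	(void) as_delete_callback(as, arg);	/* lets as_do_callbacks continue */
}

static int
xx_register_callback(struct as *as, caddr_t addr, size_t len, void *arg)
{
	return (as_add_callback(as, xx_as_event, arg, AS_UNMAPWAIT_EVENT,
	    addr, len, KM_SLEEP));
}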
+
+#ifdef _KERNEL
+
+/*
+ * Flags for as_gap.
+ */
+#define AH_DIR 0x1 /* direction flag mask */
+#define AH_LO 0x0 /* find lowest hole */
+#define AH_HI 0x1 /* find highest hole */
+#define AH_CONTAIN 0x2 /* hole must contain `addr' */
+
+extern struct as kas; /* kernel's address space */
+
+/*
+ * Macros for address space locking.
+ */
+#define AS_LOCK_ENTER(as, lock, type) rw_enter((lock), (type))
+#define AS_LOCK_EXIT(as, lock) rw_exit((lock))
+#define AS_LOCK_DESTROY(as, lock) rw_destroy((lock))
+#define AS_LOCK_TRYENTER(as, lock, type) rw_tryenter((lock), (type))
+
+/*
+ * Macros to test lock states.
+ */
+#define AS_LOCK_HELD(as, lock) RW_LOCK_HELD((lock))
+#define AS_READ_HELD(as, lock) RW_READ_HELD((lock))
+#define AS_WRITE_HELD(as, lock) RW_WRITE_HELD((lock))
+
+/*
+ * macros to walk thru segment lists
+ */
+#define AS_SEGFIRST(as) avl_first(&(as)->a_segtree)
+#define AS_SEGNEXT(as, seg) AVL_NEXT(&(as)->a_segtree, (seg))
+#define AS_SEGPREV(as, seg) AVL_PREV(&(as)->a_segtree, (seg))
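
A sketch of a read-locked walk over the segment AVL tree using the locking and traversal macros above; the helper name and purpose are hypothetical, but the address space code does comparable walks internally:

/* Hypothetical helper: sum s_size over all segments under the reader lock. */
static size_t
as_mapped_bytes_sketch(struct as *as)
{
	struct seg *seg;
	size_t total = 0;

	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
		total += seg->s_size;
	AS_LOCK_EXIT(as, &as->a_lock);

	return (total);
}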
+
+void as_init(void);
+void as_avlinit(struct as *);
+struct seg *as_segat(struct as *as, caddr_t addr);
+void as_rangelock(struct as *as);
+void as_rangeunlock(struct as *as);
+struct as *as_alloc(void);
+void as_free(struct as *as);
+int as_dup(struct as *as, struct as **outas);
+struct seg *as_findseg(struct as *as, caddr_t addr, int tail);
+int as_addseg(struct as *as, struct seg *newseg);
+struct seg *as_removeseg(struct as *as, struct seg *seg);
+faultcode_t as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
+ enum fault_type type, enum seg_rw rw);
+faultcode_t as_faulta(struct as *as, caddr_t addr, size_t size);
+int as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot);
+int as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot);
+int as_unmap(struct as *as, caddr_t addr, size_t size);
+int as_map(struct as *as, caddr_t addr, size_t size, int ((*crfp)()),
+ void *argsp);
+void as_purge(struct as *as);
+int as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
+ uint_t flags, caddr_t addr);
+int as_memory(struct as *as, caddr_t *basep, size_t *lenp);
+size_t as_swapout(struct as *as);
+int as_incore(struct as *as, caddr_t addr, size_t size, char *vec,
+ size_t *sizep);
+int as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr,
+ uintptr_t arg, ulong_t *lock_map, size_t pos);
+int as_exec(struct as *oas, caddr_t ostka, size_t stksz,
+ struct as *nas, caddr_t nstka, uint_t hatflag);
+int as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
+ size_t size, enum seg_rw rw);
+void as_pageunlock(struct as *as, struct page **pp, caddr_t addr,
+ size_t size, enum seg_rw rw);
+void as_pagereclaim(struct as *as, struct page **pp, caddr_t addr,
+ size_t size, enum seg_rw rw);
+int as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
+ boolean_t wait);
+void as_setwatch(struct as *as);
+void as_clearwatch(struct as *as);
+int as_getmemid(struct as *, caddr_t, memid_t *);
+
+int as_add_callback(struct as *, void (*)(), void *, uint_t,
+ caddr_t, size_t, int);
+uint_t as_delete_callback(struct as *, void *);
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _VM_AS_H */
diff --git a/usr/src/uts/common/vm/faultcode.h b/usr/src/uts/common/vm/faultcode.h
new file mode 100644
index 0000000000..82f886e00f
--- /dev/null
+++ b/usr/src/uts/common/vm/faultcode.h
@@ -0,0 +1,76 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 1992 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+#ifndef _VM_FAULTCODE_H
+#define _VM_FAULTCODE_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * This file describes the data type returned by vm routines
+ * which handle faults.
+ *
+ * If FC_CODE(fc) == FC_OBJERR, then FC_ERRNO(fc) contains the errno value
+ * returned by the underlying object mapped at the fault address.
+ */
+#define FC_HWERR 0x1 /* misc hardware error (e.g. bus timeout) */
+#define FC_ALIGN 0x2 /* hardware alignment error */
+#define FC_OBJERR 0x3 /* underlying object returned errno value */
+#define FC_PROT 0x4 /* access exceeded current protections */
+#define FC_NOMAP 0x5 /* no mapping at the fault address */
+#define FC_NOSUPPORT 0x6 /* operation not supported by driver */
+
+#define FC_MAKE_ERR(e) (((e) << 8) | FC_OBJERR)
+
+#define FC_CODE(fc) ((fc) & 0xff)
+#define FC_ERRNO(fc) ((unsigned)(fc) >> 8)
+
+#ifndef _ASM
+typedef int faultcode_t; /* type returned by vm fault routines */
+#endif /* _ASM */
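
A sketch of decoding a faultcode_t along the lines described above; the errno choices for the non-FC_OBJERR cases are assumptions of this sketch, not mandated by the header:

/* Hypothetical decode of a fault code into an errno-style value. */
static int
fc_decode_sketch(faultcode_t fc)
{
	switch (FC_CODE(fc)) {
	case 0:
		return (0);		/* no fault */
	case FC_OBJERR:
		return (FC_ERRNO(fc));	/* errno from the underlying object */
	case FC_PROT:
		return (EACCES);	/* assumed mapping */
	default:
		return (EFAULT);	/* assumed mapping */
	}
}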
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _VM_FAULTCODE_H */
diff --git a/usr/src/uts/common/vm/hat.c b/usr/src/uts/common/vm/hat.c
new file mode 100644
index 0000000000..24d6e50b1a
--- /dev/null
+++ b/usr/src/uts/common/vm/hat.c
@@ -0,0 +1,149 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/kmem.h>
+#include <sys/sysmacros.h>
+#include <sys/cmn_err.h>
+#include <sys/systm.h>
+#include <sys/modctl.h>
+#include <sys/kobj.h>
+#include <vm/hat.h>
+
+/*
+ * PSARC 2004/405 made hat_getkpfnum(9F) obsolete. As part of the
+ * obsolescence, the original documented behavior will begin to be
+ * enforced in the future; namely, hat_getkpfnum(9F) may _only_
+ * be called with device-mapped memory virtual addresses. Since
+ * changing hat_getkpfnum(9F) to return PFN_INVALID on kernel memory
+ * would break a lot of modules without any warning, we've implemented
+ * the following mechanism as a stop-gap. In a future release, this
+ * can all be ripped out and hat_getkpfnum(9F) changed to return
+ * PFN_INVALID if it isn't called with a device-mapped memory address.
+ *
+ * We keep track of each module that has used hat_getkpfnum(9F)
+ * incorrectly. This allows us to avoid flooding the console/logs
+ * with too many warnings about a bad module that has already been
+ * flagged.
+ *
+ * On amd64 hat_getkpfnum() is never supported.
+ */
+
+#if !defined(__amd64)
+
+#define HAT_STACK_MAXDEPTH 15
+
+struct badcall_node {
+ char *bc_modname;
+ int bc_stackdepth;
+ pc_t bc_callstack[HAT_STACK_MAXDEPTH];
+ struct badcall_node *bc_linkage;
+};
+
+static struct badcall_node *bad_getkpfnum_callers;
+
+/*
+ * Common VM HAT routines.
+ */
+
+static void
+printwarn(struct badcall_node *bc)
+{
+ int sf;
+ char *ksym;
+ ulong_t off;
+
+ cmn_err(CE_WARN, "Module %s is using the obsolete hat_getkpfnum(9F)",
+ bc->bc_modname);
+ cmn_err(CE_CONT, "interface in a way that will not be supported in\n");
+ cmn_err(CE_CONT, "a future release of Solaris. Please contact the\n");
+ cmn_err(CE_CONT, "vendor that supplied the module for assistance,\n");
+ cmn_err(CE_CONT, "or consult the Writing Device Drivers guide,\n");
+ cmn_err(CE_CONT, "available from http://www.sun.com for migration\n");
+ cmn_err(CE_CONT, "advice.\n");
+ cmn_err(CE_CONT, "---\n");
+ cmn_err(CE_CONT, "Callstack of bad caller:\n");
+
+ for (sf = 0; sf < bc->bc_stackdepth; sf++) {
+ ksym = kobj_getsymname(bc->bc_callstack[sf], &off);
+ cmn_err(CE_CONT, "\t%s+%lx\n", ksym? ksym : "?", off);
+ }
+}
+
+
+void
+hat_getkpfnum_badcall(void *caller)
+{
+ struct badcall_node bcs;
+ char *modname = mod_containing_pc((caddr_t)caller);
+ struct badcall_node *bc;
+
+#ifdef __sparc
+ /*
+ * This is a hack until the ifb and jfb framebuffer drivers
+ * are fixed. Right now they use hat_getkpfnum() in a way that
+ * is really safe but will be incorrectly flagged as being
+ * buggy.
+ */
+ if (strcmp(modname, "ifb") == 0 || strcmp(modname, "jfb") == 0)
+ return;
+#elif defined(__i386)
+ /*
+ * This is a hack until these ethernet drivers can be fixed
+ * or EOL'd. hat_getkpfnum() will continue to work correctly
+ * until this list can be removed.
+ */
+ if (strcmp(modname, "dnet") == 0 || strcmp(modname, "pcn") == 0 ||
+ strcmp(modname, "adp") == 0 || strcmp(modname, "chs") == 0)
+ return;
+#endif /* __sparc / __i386 */
+
+ for (bc = bad_getkpfnum_callers; bc != NULL; bc = bc->bc_linkage)
+ if (strcmp(bc->bc_modname, modname) == 0)
+ return;
+
+ /*
+ * We haven't seen this caller before, so create a log of
+ * the callstack and module name, and emit a warning to the
+ * user.
+ */
+ bc = kmem_zalloc(sizeof (struct badcall_node), KM_NOSLEEP);
+ if (bc != NULL) {
+ bc->bc_linkage = bad_getkpfnum_callers;
+ bc->bc_modname = modname;
+ bad_getkpfnum_callers = bc;
+ } else {
+ bc = &bcs;
+ bc->bc_modname = modname;
+ }
+
+ bc->bc_stackdepth = getpcstack(bc->bc_callstack, HAT_STACK_MAXDEPTH);
+
+ printwarn(bc);
+}
+#endif /* __amd64 */
diff --git a/usr/src/uts/common/vm/hat.h b/usr/src/uts/common/vm/hat.h
new file mode 100644
index 0000000000..b873f4e06e
--- /dev/null
+++ b/usr/src/uts/common/vm/hat.h
@@ -0,0 +1,598 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+#ifndef _VM_HAT_H
+#define _VM_HAT_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/t_lock.h>
+#include <vm/faultcode.h>
+#include <sys/kstat.h>
+#include <sys/siginfo.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * VM - Hardware Address Translation management.
+ *
+ * This file describes the machine independent interfaces to
+ * the hardware address translation management routines. Other
+ * machine specific interfaces and structures are defined
+ * in <vm/hat_xxx.h>. The hat layer manages the address
+ * translation hardware as a cache driven by calls from the
+ * higher levels of the VM system.
+ */
+
+struct hat;
+struct kpme;
+struct memseg;
+
+#include <vm/page.h>
+
+/*
+ * a callback used with hat_unload_callback()
+ * start and end mark are set to a range of unloaded addresses
+ * and the function is invoked with a pointer to this data structure
+ */
+typedef struct hat_callback {
+ caddr_t hcb_start_addr;
+ caddr_t hcb_end_addr;
+ void (*hcb_function)(struct hat_callback *);
+ void *hcb_data;
+} hat_callback_t;
+
+#ifdef _KERNEL
+
+/*
+ * One time hat initialization
+ */
+void hat_init(void);
+
+/*
+ * Notify hat of a system dump
+ */
+void hat_dump(void);
+
+/*
+ * Operations on an address space:
+ *
+ * struct hat *hat_alloc(as)
+ * allocates a hat structure for as.
+ *
+ * void hat_free_start(hat)
+ * informs hat layer process has finished executing but as has not
+ * been cleaned up yet.
+ *
+ * void hat_free_end(hat)
+ * informs hat layer as is being destroyed. hat layer cannot use as
+ * pointer after this call.
+ *
+ * void hat_swapin(hat)
+ * allocate any hat resources required for process being swapped in.
+ *
+ * void hat_swapout(hat)
+ * deallocate hat resources for process being swapped out.
+ *
+ * size_t hat_get_mapped_size(hat)
+ * returns number of bytes that have valid mappings in hat.
+ *
+ * void hat_stats_enable(hat)
+ * void hat_stats_disable(hat)
+ * enables/disables collection of stats for hat.
+ *
+ * int hat_dup(parenthat, childhat, addr, len, flags)
+ * Duplicate address translations of the parent to the child. Supports
+ * the entire address range or a range depending on flag,
+ * zero returned on success, non-zero on error
+ *
+ * void hat_thread_exit(thread)
+ * Notifies the HAT that a thread is exiting, called after it has been
+ * reassigned to the kernel AS.
+ */
+
+struct hat *hat_alloc(struct as *);
+void hat_free_start(struct hat *);
+void hat_free_end(struct hat *);
+int hat_dup(struct hat *, struct hat *, caddr_t, size_t, uint_t);
+void hat_swapin(struct hat *);
+void hat_swapout(struct hat *);
+size_t hat_get_mapped_size(struct hat *);
+int hat_stats_enable(struct hat *);
+void hat_stats_disable(struct hat *);
+void hat_thread_exit(kthread_t *);
+
+/*
+ * Operations on a named address within a segment:
+ *
+ * void hat_memload(hat, addr, pp, attr, flags)
+ * load/lock the given page struct
+ *
+ * void hat_memload_array(hat, addr, len, ppa, attr, flags)
+ * load/lock the given array of page structs
+ *
+ * void hat_devload(hat, addr, len, pf, attr, flags)
+ * load/lock the given page frame number
+ *
+ * void hat_unlock(hat, addr, len)
+ * unlock a given range of addresses
+ *
+ * void hat_unload(hat, addr, len, flags)
+ * void hat_unload_callback(hat, addr, len, flags, callback)
+ * unload a given range of addresses (has optional callback)
+ *
+ * void hat_sync(hat, addr, len, flags)
+ * synchronize mapping with software data structures
+ *
+ * void hat_map(hat, addr, len, flags)
+ *
+ * void hat_setattr(hat, addr, len, attr)
+ * void hat_clrattr(hat, addr, len, attr)
+ * void hat_chgattr(hat, addr, len, attr)
+ * modify attributes for a range of addresses. skips any invalid mappings
+ *
+ * uint_t hat_getattr(hat, addr, *attr)
+ * returns attr for <hat,addr> in *attr. returns 0 if there was a
+ * mapping and *attr is valid, nonzero if there was no mapping and
+ * *attr is not valid.
+ *
+ * size_t hat_getpagesize(hat, addr)
+ * returns pagesize in bytes for <hat, addr>. returns -1 if there is
+ * no mapping. This is an advisory call.
+ *
+ * pfn_t hat_getpfnum(hat, addr)
+ * returns pfn for <hat, addr> or PFN_INVALID if mapping is invalid.
+ *
+ * pfn_t hat_getkpfnum(addr)
+ * returns pfn for non-memory mapped addr in kernel address space
+ * or PFN_INVALID if mapping is invalid or is kernel memory.
+ *
+ * int hat_probe(hat, addr)
+ * return 0 if no valid mapping is present. Faster version
+ * of hat_getattr in certain architectures.
+ *
+ * int hat_share(dhat, daddr, shat, saddr, len, szc)
+ *
+ * void hat_unshare(hat, addr, len, szc)
+ *
+ * void hat_chgprot(hat, addr, len, vprot)
+ * This is a deprecated call. New segment drivers should store
+ * all attributes and use hat_*attr calls.
+ * Change the protections in the virtual address range
+ * given to the specified virtual protection. If vprot is ~PROT_WRITE,
+ * then remove write permission, leaving the other permissions
+ * unchanged. If vprot is ~PROT_USER, remove user permissions.
+ */
+
+void hat_memload(struct hat *, caddr_t, struct page *, uint_t, uint_t);
+void hat_memload_array(struct hat *, caddr_t, size_t, struct page **,
+ uint_t, uint_t);
+
+void hat_devload(struct hat *, caddr_t, size_t, pfn_t, uint_t, int);
+void hat_unlock(struct hat *, caddr_t, size_t);
+void hat_unload(struct hat *, caddr_t, size_t, uint_t);
+void hat_unload_callback(struct hat *, caddr_t, size_t, uint_t,
+ hat_callback_t *);
+void hat_sync(struct hat *, caddr_t, size_t, uint_t);
+void hat_map(struct hat *, caddr_t, size_t, uint_t);
+void hat_setattr(struct hat *, caddr_t, size_t, uint_t);
+void hat_clrattr(struct hat *, caddr_t, size_t, uint_t);
+void hat_chgattr(struct hat *, caddr_t, size_t, uint_t);
+uint_t hat_getattr(struct hat *, caddr_t, uint_t *);
+ssize_t hat_getpagesize(struct hat *, caddr_t);
+pfn_t hat_getpfnum(struct hat *, caddr_t);
+int hat_probe(struct hat *, caddr_t);
+int hat_share(struct hat *, caddr_t, struct hat *, caddr_t, size_t, uint_t);
+void hat_unshare(struct hat *, caddr_t, size_t, uint_t);
+void hat_chgprot(struct hat *, caddr_t, size_t, uint_t);
+void hat_reserve(struct as *, caddr_t, size_t);
+pfn_t va_to_pfn(void *);
+uint64_t va_to_pa(void *);
+
+/*
+ * hat_getkpfnum() is never supported on amd64 and will be
+ * removed from other platforms in future release
+ */
+#if !defined(__amd64)
+pfn_t hat_getkpfnum(caddr_t);
+#endif
+
+
+/*
+ * Kernel Physical Mapping (segkpm) hat interface routines.
+ */
+caddr_t hat_kpm_mapin(struct page *, struct kpme *);
+void hat_kpm_mapout(struct page *, struct kpme *, caddr_t);
+caddr_t hat_kpm_page2va(struct page *, int);
+struct page *hat_kpm_vaddr2page(caddr_t);
+int hat_kpm_fault(struct hat *, caddr_t);
+void hat_kpm_mseghash_clear(int);
+void hat_kpm_mseghash_update(pgcnt_t, struct memseg *);
+void hat_kpm_addmem_mseg_update(struct memseg *, pgcnt_t, offset_t);
+void hat_kpm_addmem_mseg_insert(struct memseg *);
+void hat_kpm_addmem_memsegs_update(struct memseg *);
+caddr_t hat_kpm_mseg_reuse(struct memseg *);
+void hat_kpm_delmem_mseg_update(struct memseg *, struct memseg **);
+void hat_kpm_split_mseg_update(struct memseg *, struct memseg **,
+ struct memseg *, struct memseg *, struct memseg *);
+void hat_kpm_walk(void (*)(void *, void *, size_t), void *);
+
+/*
+ * Operations on all translations for a given page(s)
+ *
+ * void hat_page_setattr(pp, flag)
+ * void hat_page_clrattr(pp, flag)
+ * used to set/clr red/mod bits.
+ *
+ * uint hat_page_getattr(pp, flag)
+ * If flag is specified, returns 0 if attribute is disabled
+ * and non zero if enabled. If flag specifies multiple attributes
+ * then returns 0 if ALL attributes are disabled. This is an advisory
+ * call.
+ *
+ * int hat_pageunload(pp, forceflag)
+ * unload all translations attached to pp.
+ *
+ * uint_t hat_pagesync(pp, flags)
+ * get hw stats from hardware into page struct and reset hw stats
+ * returns attributes of page
+ *
+ * ulong_t hat_page_getshare(pp)
+ * returns approx number of mappings to this pp. A return of 0 implies
+ * there are no mappings to the page.
+ *
+ * faultcode_t hat_softlock(hat, addr, lenp, ppp, flags);
+ * called to softlock pages for zero copy tcp
+ *
+ * void hat_page_demote(pp);
+ * unload all large mappings to pp and decrease p_szc of all
+ * constituent pages according to the remaining mappings.
+ */
+
+void hat_page_setattr(struct page *, uint_t);
+void hat_page_clrattr(struct page *, uint_t);
+uint_t hat_page_getattr(struct page *, uint_t);
+int hat_pageunload(struct page *, uint_t);
+uint_t hat_pagesync(struct page *, uint_t);
+ulong_t hat_page_getshare(struct page *);
+faultcode_t hat_softlock(struct hat *, caddr_t, size_t *,
+ struct page **, uint_t);
+void hat_page_demote(struct page *);
+
+/*
+ * Routine to expose supported HAT features to PIM.
+ */
+enum hat_features {
+ HAT_SHARED_PT, /* Shared page tables */
+ HAT_DYNAMIC_ISM_UNMAP, /* hat_pageunload() handles ISM pages */
+ HAT_VMODSORT /* support for VMODSORT flag of vnode */
+};
+
+int hat_supported(enum hat_features, void *);
+
+/*
+ * Services provided to the hat:
+ *
+ * void as_signal_proc(as, siginfo)
+ * deliver signal to all processes that have this as.
+ *
+ * int hat_setstat(as, addr, len, rmbits)
+ * informs hatstat layer that ref/mod bits need to be updated for
+ * address range. Returns 0 on success, 1 for failure.
+ */
+void as_signal_proc(struct as *, k_siginfo_t *siginfo);
+void hat_setstat(struct as *, caddr_t, size_t, uint_t);
+
+/*
+ * Flags to pass to hat routines.
+ *
+ * Certain flags only apply to some interfaces:
+ *
+ * HAT_LOAD Default flags to load a translation to the page.
+ * HAT_LOAD_LOCK Lock down mapping resources; hat_map(), hat_memload(),
+ * and hat_devload().
+ * HAT_LOAD_ADV Advisory load - Load translation if and only if
+ * sufficient MMU resources exist (i.e., do not steal).
+ * HAT_LOAD_SHARE A flag to hat_memload() to indicate h/w page tables
+ * that map some user pages (not kas) are shared by more
+ * than one process (eg. ISM).
+ * HAT_LOAD_CONTIG Pages are contiguous
+ * HAT_LOAD_NOCONSIST Do not add mapping to mapping list.
+ * HAT_LOAD_REMAP Reload a valid pte with a different page frame.
+ * HAT_RELOAD_SHARE Reload a shared page table entry. Some platforms
+ * may require different actions than on the first
+ * load of a shared mapping.
+ * HAT_NO_KALLOC Do not kmem_alloc while creating the mapping; at this
+ * point, it's setting up mapping to allocate internal
+ * hat layer data structures. This flag forces hat layer
+ * to tap its reserves in order to prevent infinite
+ * recursion.
+ * HAT_LOAD_AUTOLPG Get MMU specific disable_auto_large_pages
+ */
+
+/*
+ * Flags for hat_memload/hat_devload
+ */
+#define HAT_FLAGS_RESV 0xFF000000 /* resv for hat impl */
+#define HAT_LOAD 0x00
+#define HAT_LOAD_LOCK 0x01
+#define HAT_LOAD_ADV 0x04
+#define HAT_LOAD_CONTIG 0x10
+#define HAT_LOAD_NOCONSIST 0x20
+#define HAT_LOAD_SHARE 0x40
+#define HAT_LOAD_REMAP 0x80
+#define HAT_RELOAD_SHARE 0x100
+#define HAT_NO_KALLOC 0x200
+#define HAT_LOAD_TEXT 0x400
+#define HAT_LOAD_AUTOLPG 0x800
+
+/*
+ * Attributes for hat_memload/hat_devload/hat_*attr
+ * are a superset of prot flags defined in mman.h.
+ */
+#define HAT_PLAT_ATTR_MASK 0xF00000
+#define HAT_PROT_MASK 0x0F
+
+#define HAT_NOFAULT 0x10
+#define HAT_NOSYNC 0x20
+
+/*
+ * Advisory ordering attributes. Apply only to device mappings.
+ *
+ * HAT_STRICTORDER: the CPU must issue the references in order, as the
+ * programmer specified. This is the default.
+ * HAT_UNORDERED_OK: the CPU may reorder the references (this is all kinds
+ * of reordering; store or load with store or load).
+ * HAT_MERGING_OK: merging and batching: the CPU may merge individual stores
+ * to consecutive locations (for example, turn two consecutive byte
+ * stores into one halfword store), and it may batch individual loads
+ * (for example, turn two consecutive byte loads into one halfword load).
+ * This also implies re-ordering.
+ * HAT_LOADCACHING_OK: the CPU may cache the data it fetches and reuse it
+ * until another store occurs. The default is to fetch new data
+ * on every load. This also implies merging.
+ * HAT_STORECACHING_OK: the CPU may keep the data in the cache and push it to
+ * the device (perhaps with other data) at a later time. The default is
+ * to push the data right away. This also implies load caching.
+ */
+#define HAT_STRICTORDER 0x0000
+#define HAT_UNORDERED_OK 0x0100
+#define HAT_MERGING_OK 0x0200
+#define HAT_LOADCACHING_OK 0x0300
+#define HAT_STORECACHING_OK 0x0400
+#define HAT_ORDER_MASK 0x0700
+
+/* endian attributes */
+#define HAT_NEVERSWAP 0x0000
+#define HAT_STRUCTURE_BE 0x1000
+#define HAT_STRUCTURE_LE 0x2000
+#define HAT_ENDIAN_MASK 0x3000
+
+/* flags for hat_softlock */
+#define HAT_COW 0x0001
+
+/*
+ * Flags for hat_unload
+ */
+#define HAT_UNLOAD 0x00
+#define HAT_UNLOAD_NOSYNC 0x02
+#define HAT_UNLOAD_UNLOCK 0x04
+#define HAT_UNLOAD_OTHER 0x08
+#define HAT_UNLOAD_UNMAP 0x10
+
+/*
+ * Flags for hat_pagesync, hat_getstat, hat_sync
+ */
+#define HAT_SYNC_DONTZERO 0x00
+#define HAT_SYNC_ZERORM 0x01
+/* Additional flags for hat_pagesync */
+#define HAT_SYNC_STOPON_REF 0x02
+#define HAT_SYNC_STOPON_MOD 0x04
+#define HAT_SYNC_STOPON_RM (HAT_SYNC_STOPON_REF | HAT_SYNC_STOPON_MOD)
+#define HAT_SYNC_STOPON_SHARED 0x08
+
+/*
+ * Flags for hat_dup
+ *
+ * HAT_DUP_ALL dup entire address space
+ * HAT_DUP_COW dup plus hat_clrattr(..PROT_WRITE) on newas
+ */
+#define HAT_DUP_ALL 1
+#define HAT_DUP_COW 2
+
+
+/*
+ * Flags for hat_map
+ */
+#define HAT_MAP 0x00
+
+/*
+ * Flag for hat_pageunload
+ */
+#define HAT_ADV_PGUNLOAD 0x00
+#define HAT_FORCE_PGUNLOAD 0x01
+
+/*
+ * Attributes for hat_page_*attr, hat_setstats and
+ * returned by hat_pagesync.
+ */
+#define P_MOD 0x1 /* the modified bit */
+#define P_REF 0x2 /* the referenced bit */
+#define P_RO 0x4 /* Read only page */
+
+#define hat_ismod(pp) (hat_page_getattr(pp, P_MOD))
+#define hat_isref(pp) (hat_page_getattr(pp, P_REF))
+#define hat_isro(pp) (hat_page_getattr(pp, P_RO))
+
+#define hat_setmod(pp) (hat_page_setattr(pp, P_MOD))
+#define hat_setref(pp) (hat_page_setattr(pp, P_REF))
+#define hat_setrefmod(pp) (hat_page_setattr(pp, P_REF|P_MOD))
+
+#define hat_clrmod(pp) (hat_page_clrattr(pp, P_MOD))
+#define hat_clrref(pp) (hat_page_clrattr(pp, P_REF))
+#define hat_clrrefmod(pp) (hat_page_clrattr(pp, P_REF|P_MOD))
+
+#define hat_page_is_mapped(pp) (hat_page_getshare(pp))
+
+/*
+ * hat_setup is being used in sparc/os/sundep.c
+ */
+void hat_setup(struct hat *, int);
+
+/*
+ * Flags for hat_setup
+ */
+#define HAT_DONTALLOC 0
+#define HAT_ALLOC 1
+#define HAT_INIT 2
+
+/*
+ * Other routines, for statistics
+ */
+int hat_startstat(struct as *);
+void hat_getstat(struct as *, caddr_t, size_t, uint_t, char *, int);
+void hat_freestat(struct as *, int);
+void hat_resvstat(size_t, struct as *, caddr_t);
+
+/*
+ * Transitional routine while we still allow hat_getkpfnum(caddr_t)
+ * to return a pfn for kernel memory, but want to warn the user that
+ * it isn't supported.
+ */
+void hat_getkpfnum_badcall(void *caller);
+
+/*
+ * Relocation callback routines. Currently only sfmmu HAT supports
+ * these.
+ */
+extern int hat_add_callback(id_t, caddr_t, uint_t, uint_t, void *,
+ pfn_t *);
+extern id_t hat_register_callback(
+ int (*prehandler)(caddr_t, uint_t, uint_t, void *),
+ int (*posthandler)(caddr_t, uint_t, uint_t, void *, pfn_t),
+ int (*errhandler)(caddr_t, uint_t, uint_t, void *), int);
+extern void hat_delete_callback(caddr_t, uint_t, void *, uint_t);
+
+/*
+ * hat_add_callback()/hat_delete_callback() flags.
+ */
+#define HAC_NOSLEEP 0x0
+#define HAC_SLEEP 0x1
+#define HAC_PAGELOCK 0x2
+
+/*
+ * Suspend/unsuspend handler callback arguments.
+ */
+#define HAT_SUSPEND 0x0010
+#define HAT_UNSUSPEND 0x0010
+#define HAT_PRESUSPEND 0x0020
+#define HAT_POSTUNSUSPEND 0x0020
+
+/*
+ * Error handler callback arguments. See the block comments
+ * before the implementation of hat_add_callback() for an
+ * explanation of what these mean.
+ */
+#define HAT_CB_ERR_LEAKED 0x1
+
+#endif /* _KERNEL */
+
+/*
+ * The size of the bit array for ref and mod bit storage must be a power of 2.
+ * 2 bits are collected for each page. Below the power used is 4,
+ * which is 16 8-bit characters = 128 bits, ref and mod bit information
+ * for 64 pages.
+ */
+#define HRM_SHIFT 4
+#define HRM_BYTES (1 << HRM_SHIFT)
+#define HRM_PAGES ((HRM_BYTES * NBBY) / 2)
+#define HRM_PGPERBYTE (NBBY/2)
+#define HRM_PGBYTEMASK (HRM_PGPERBYTE-1)
+
+#define HRM_PGOFFMASK ((HRM_PGPERBYTE-1) << MMU_PAGESHIFT)
+#define HRM_BASEOFFSET (((MMU_PAGESIZE * HRM_PAGES) - 1))
+#define HRM_BASEMASK (~(HRM_BASEOFFSET))
+
+#define HRM_BASESHIFT (MMU_PAGESHIFT + (HRM_SHIFT + 2))
+#define HRM_PAGEMASK (MMU_PAGEMASK ^ HRM_BASEMASK)
+
+#define HRM_HASHSIZE 0x200
+#define HRM_HASHMASK (HRM_HASHSIZE - 1)
+
+#define HRM_BLIST_INCR 0x200
+
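+/*
+ * Worked example (illustrative; assumes 8K pages, i.e. MMU_PAGESHIFT == 13):
+ * HRM_SHIFT == 4 gives HRM_BYTES == 16 and HRM_PAGES == 64, so one
+ * statistics block covers 64 pages == 512K of address space, and
+ * HRM_BASESHIFT == 13 + 4 + 2 == 19 (log2 of that coverage).  For an
+ * address, the block base and the page index within the block are:
+ *
+ *	base  = (uintptr_t)addr & HRM_BASEMASK;
+ *	pgoff = ((uintptr_t)addr & HRM_BASEOFFSET) >> MMU_PAGESHIFT;
+ *
+ * pgoff (0..63) selects the two ref/mod bits kept for that page.
+ */
+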
+/*
+ * The structure for maintaining referenced and modified information
+ */
+struct hrmstat {
+ struct as *hrm_as; /* stat block belongs to this as */
+ uintptr_t hrm_base; /* base of block */
+ ushort_t hrm_id; /* opaque identifier, one of a_vbits */
+ struct hrmstat *hrm_anext; /* as statistics block list */
+ struct hrmstat *hrm_hnext; /* list for hashed blocks */
+ uchar_t hrm_bits[HRM_BYTES]; /* the ref and mod bits */
+};
+
+/*
+ * For global monitoring of the reference and modified bits
+ * of all address spaces we reserve one id bit.
+ */
+#define HRM_SWSMONID 1
+
+
+#ifdef _KERNEL
+
+/*
+ * Hat locking functions
+ * XXX - these two functions are currently being used by hatstats
+ * they can be removed by using a per-as mutex for hatstats.
+ */
+void hat_enter(struct hat *);
+void hat_exit(struct hat *);
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _VM_HAT_H */
diff --git a/usr/src/uts/common/vm/hat_refmod.c b/usr/src/uts/common/vm/hat_refmod.c
new file mode 100644
index 0000000000..1a812bd94f
--- /dev/null
+++ b/usr/src/uts/common/vm/hat_refmod.c
@@ -0,0 +1,544 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * The following routines implement the hat layer's
+ * recording of the referenced and modified bits.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/debug.h>
+#include <sys/kmem.h>
+
+/*
+ * Note: cmn_err() must not be called while holding any hat layer locks.
+ */
+#include <sys/cmn_err.h>
+
+#include <vm/as.h>
+#include <vm/hat.h>
+
+kmutex_t hat_statlock; /* protects all hat statistics data */
+struct hrmstat *hrm_memlist; /* tracks memory alloced for hrm_blist blocks */
+struct hrmstat **hrm_hashtab; /* hash table for finding blocks quickly */
+struct hrmstat *hrm_blist;
+int hrm_blist_incr = HRM_BLIST_INCR;
+int hrm_blist_lowater = HRM_BLIST_INCR/2;
+int hrm_blist_num = 0;
+int hrm_blist_total = 0;
+int hrm_mlockinited = 0;
+int hrm_allocfailmsg = 0; /* print a message when allocations fail */
+int hrm_allocfail = 0;
+
+static struct hrmstat *hrm_balloc(void);
+static int hrm_init(void);
+static void hrm_link(struct hrmstat *);
+static void hrm_setbits(struct hrmstat *, caddr_t, uint_t);
+static void hrm_hashout(struct hrmstat *);
+static void hrm_getblk(int);
+
+#define hrm_hash(as, addr) \
+ (HRM_HASHMASK & \
+ (((uintptr_t)(addr) >> HRM_BASESHIFT) ^ ((uintptr_t)(as) >> 2)))
+
+#define hrm_match(hrm, as, addr) \
+ (((hrm)->hrm_as == (as) && \
+ ((hrm)->hrm_base == ((uintptr_t)(addr) & HRM_BASEMASK))) ? 1 : 0)
+
+/*
+ * Reserve enough statistics blocks for a chunk
+ * of bytes (pages) in a given as.
+ */
+/* ARGSUSED */
+void
+hat_resvstat(size_t chunk, struct as *as, caddr_t addr)
+{
+ int nhrm = btop(chunk)/HRM_PAGES;
+
+ if (nhrm < HRM_BLIST_INCR)
+ nhrm = 0; /* preallocate at least HRM_BLIST_INCR */
+ hrm_getblk(nhrm);
+}
+
+/*
+ * Start the statistics gathering for an address space.
+ * Return -1 if we can't do it, otherwise return an opaque
+ * identifier to be used when querying for the gathered statistics.
+ * The identifier is an unused bit in a_vbits.
+ * Bit 0 is reserved for swsmon.
+ */
+int
+hat_startstat(struct as *as)
+{
+ uint_t nbits; /* number of bits */
+ uint_t bn; /* bit number */
+ uint_t id; /* new vbit, identifier */
+ uint_t vbits; /* used vbits of address space */
+ size_t chunk; /* mapped size for stats */
+ /*
+ * Initialize global data, if needed.
+ */
+ if (hrm_init() == -1)
+ return (-1);
+
+ /*
+	 * If the refmod statistics block allocator has run out of blocks,
+	 * print a warning message about how to fix it; see the comment at
+	 * the beginning of hat_setstat().
+ */
+ if (hrm_allocfailmsg) {
+ cmn_err(CE_WARN,
+		    "hrm_balloc failures occurred, increase hrm_blist_incr");
+ hrm_allocfailmsg = 0;
+ }
+
+ /*
+ * Verify that a buffer of statistics blocks exists
+ * and allocate more, if needed.
+ */
+
+ chunk = hat_get_mapped_size(as->a_hat);
+ chunk = (btop(chunk)/HRM_PAGES);
+ if (chunk < HRM_BLIST_INCR)
+ chunk = 0;
+
+ hrm_getblk((int)chunk);
+
+ /*
+	 * Find an unused id in the given address space.
+ */
+ hat_enter(as->a_hat);
+ vbits = as->a_vbits;
+ nbits = sizeof (as->a_vbits) * NBBY;
+ for (bn = 1, id = 2; bn < (nbits - 1); bn++, id <<= 1)
+ if ((id & vbits) == 0)
+ break;
+ if (bn >= (nbits - 1)) {
+ hat_exit(as->a_hat);
+ return (-1);
+ }
+ as->a_vbits |= id;
+ hat_exit(as->a_hat);
+ (void) hat_stats_enable(as->a_hat);
+ return (id);
+}
+
+/*
+ * Record referenced and modified information for an address space.
+ * Rmbits is a word containing the referenced bit in bit position 1
+ * and the modified bit in bit position 0.
+ *
+ * For current informational uses, any program using this facility
+ * can be rerun after increasing hrm_blist_incr so that a larger
+ * buffer of blocks is maintained.
+ */
+void
+hat_setstat(struct as *as, caddr_t addr, size_t len, uint_t rmbits)
+{
+ struct hrmstat *hrm;
+ uint_t vbits, newbits, nb;
+ int h;
+
+ ASSERT(len == PAGESIZE);
+ ASSERT((rmbits & ~(P_MOD|P_REF)) == 0);
+
+ if (rmbits == 0)
+ return;
+
+ /*
+ * Initialize global data, if needed.
+ */
+ if (hrm_init() == -1)
+ return;
+
+ mutex_enter(&hat_statlock);
+
+ /*
+ * Search the hash list for the as and addr we are looking for
+ * and set the ref and mod bits in every block that matches.
+ */
+ vbits = 0;
+ h = hrm_hash(as, addr);
+ for (hrm = hrm_hashtab[h]; hrm; hrm = hrm->hrm_hnext) {
+ if (hrm_match(hrm, as, addr)) {
+ hrm_setbits(hrm, addr, rmbits);
+ vbits |= hrm->hrm_id;
+ }
+ }
+
+ /*
+ * If we didn't find a block for all of the enabled
+ * vpages bits, then allocate and initialize a block
+ * for each bit that was not found.
+ */
+ if (vbits != as->a_vbits) {
+ newbits = vbits ^ as->a_vbits;
+ while (newbits) {
+ if (ffs(newbits))
+ nb = 1 << (ffs(newbits)-1);
+ hrm = (struct hrmstat *)hrm_balloc();
+ if (hrm == NULL) {
+ hrm_allocfailmsg = 1;
+ hrm_allocfail++;
+ mutex_exit(&hat_statlock);
+ return;
+ }
+ hrm->hrm_as = as;
+ hrm->hrm_base = (uintptr_t)addr & HRM_BASEMASK;
+ hrm->hrm_id = nb;
+ hrm_link(hrm);
+ hrm_setbits(hrm, addr, rmbits);
+ newbits &= ~nb;
+ }
+ }
+ mutex_exit(&hat_statlock);
+}
+
+/*
+ * Free the resources used to maintain the referenced and modified
+ * statistics for the virtual page view of an address space
+ * identified by id.
+ */
+void
+hat_freestat(struct as *as, int id)
+{
+ struct hrmstat *hrm, *prev_ahrm;
+
+ hat_stats_disable(as->a_hat); /* tell the hat layer to stop */
+ hat_enter(as->a_hat);
+ if (id == 0)
+ as->a_vbits = 0;
+ else
+ as->a_vbits &= ~id;
+
+ if ((hrm = as->a_hrm) == NULL) {
+ hat_exit(as->a_hat);
+ return;
+ }
+ hat_exit(as->a_hat);
+
+ mutex_enter(&hat_statlock);
+ if (hrm_hashtab == NULL) {
+ /* can't happen? */
+ mutex_exit(&hat_statlock);
+ return;
+ }
+ for (prev_ahrm = NULL; hrm; hrm = hrm->hrm_anext) {
+		if ((id == hrm->hrm_id) || (id == 0)) {
+
+ hrm_hashout(hrm);
+ hrm->hrm_hnext = hrm_blist;
+ hrm_blist = hrm;
+ hrm_blist_num++;
+
+ if (prev_ahrm == NULL)
+ as->a_hrm = hrm->hrm_anext;
+ else
+ prev_ahrm->hrm_anext = hrm->hrm_anext;
+
+ } else
+ prev_ahrm = hrm;
+ }
+
+ /*
+ * If all statistics blocks are free,
+ * return the memory to the system.
+ */
+ if (hrm_blist_num == hrm_blist_total) {
+ /* zero the block list since we are giving back its memory */
+ hrm_blist = NULL;
+ hrm_blist_num = 0;
+ hrm_blist_total = 0;
+ while (hrm_memlist) {
+ hrm = hrm_memlist;
+ hrm_memlist = hrm->hrm_hnext;
+ kmem_free(hrm, hrm->hrm_base);
+ }
+ ASSERT(hrm_memlist == NULL);
+ kmem_free(hrm_hashtab, HRM_HASHSIZE * sizeof (char *));
+ hrm_hashtab = NULL;
+ }
+ mutex_exit(&hat_statlock);
+}
+
+/*
+ * Initialize any global state for the statistics handling.
+ * hat_statlock protects the globally allocated memory:
+ * hrm_memlist and hrm_hashtab.
+ */
+static int
+hrm_init(void)
+{
+ /*
+	 * Allocate the hash table if it doesn't exist yet.
+ */
+ mutex_enter(&hat_statlock);
+ if (hrm_hashtab == NULL)
+ hrm_hashtab =
+ kmem_zalloc(HRM_HASHSIZE * sizeof (char *), KM_SLEEP);
+ mutex_exit(&hat_statlock);
+ return (0);
+}
+
+/*
+ * Grab memory for statistics gathering of the hat layer.
+ */
+static void
+hrm_getblk(int chunk)
+{
+ struct hrmstat *hrm, *l;
+ int i;
+ int hrm_incr;
+
+ mutex_enter(&hat_statlock);
+ if ((hrm_blist == NULL) ||
+ (hrm_blist_num <= hrm_blist_lowater) ||
+ chunk) {
+
+ mutex_exit(&hat_statlock);
+
+ hrm_incr = chunk? chunk : hrm_blist_incr;
+ hrm = kmem_zalloc(sizeof (struct hrmstat) * hrm_incr, KM_SLEEP);
+ hrm->hrm_base = sizeof (struct hrmstat) * hrm_incr;
+
+ /*
+ * thread the allocated blocks onto a freelist
+ * using the first block to hold information for
+ * freeing them all later
+ */
+ mutex_enter(&hat_statlock);
+ hrm->hrm_hnext = hrm_memlist;
+ hrm_memlist = hrm;
+
+ hrm_blist_total += (hrm_incr - 1);
+ for (i = 1; i < hrm_incr; i++) {
+ l = &hrm[i];
+ l->hrm_hnext = hrm_blist;
+ hrm_blist = l;
+ hrm_blist_num++;
+ }
+ }
+ mutex_exit(&hat_statlock);
+}
+
+static void
+hrm_hashin(struct hrmstat *hrm)
+{
+ int h;
+
+ ASSERT(MUTEX_HELD(&hat_statlock));
+ h = hrm_hash(hrm->hrm_as, hrm->hrm_base);
+
+ hrm->hrm_hnext = hrm_hashtab[h];
+ hrm_hashtab[h] = hrm;
+}
+
+static void
+hrm_hashout(struct hrmstat *hrm)
+{
+ struct hrmstat *list, **prev_hrm;
+ int h;
+
+ ASSERT(MUTEX_HELD(&hat_statlock));
+ h = hrm_hash(hrm->hrm_as, hrm->hrm_base);
+ list = hrm_hashtab[h];
+ prev_hrm = &hrm_hashtab[h];
+
+ while (list) {
+ if (list == hrm) {
+ *prev_hrm = list->hrm_hnext;
+ return;
+ }
+ prev_hrm = &list->hrm_hnext;
+ list = list->hrm_hnext;
+ }
+}
+
+
+/*
+ * Link a statistic block into an address space and also put it
+ * on the hash list for future references.
+ */
+static void
+hrm_link(struct hrmstat *hrm)
+{
+ struct as *as = hrm->hrm_as;
+
+ ASSERT(MUTEX_HELD(&hat_statlock));
+ hrm->hrm_anext = as->a_hrm;
+ as->a_hrm = hrm;
+ hrm_hashin(hrm);
+}
+
+/*
+ * Allocate a block for statistics keeping.
+ * Returns NULL if blocks are unavailable.
+ */
+static struct hrmstat *
+hrm_balloc(void)
+{
+ struct hrmstat *hrm;
+
+ ASSERT(MUTEX_HELD(&hat_statlock));
+
+ hrm = hrm_blist;
+ if (hrm != NULL) {
+ hrm_blist = hrm->hrm_hnext;
+ hrm_blist_num--;
+ hrm->hrm_hnext = NULL;
+ }
+ return (hrm);
+}
+
+/*
+ * Set the ref and mod bits for addr within statistics block hrm.
+ */
+static void
+hrm_setbits(struct hrmstat *hrm, caddr_t addr, uint_t bits)
+{
+ uint_t po, bo, spb;
+ uint_t nbits;
+
+ po = ((uintptr_t)addr & HRM_BASEOFFSET) >> MMU_PAGESHIFT; /* pg off */
+ bo = po / (NBBY / 2); /* which byte in bit array */
+ spb = (3 - (po & 3)) * 2; /* shift position within byte */
+ nbits = bits << spb; /* bit mask */
+ hrm->hrm_bits[bo] |= nbits;
+}
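+
+/*
+ * Worked example (illustrative): for the sixth page of a block (po == 5)
+ * with bits == (P_REF|P_MOD) == 3:
+ *
+ *	bo  == 5 / 4 == 1		(second byte of hrm_bits[])
+ *	spb == (3 - (5 & 3)) * 2 == 4	(field occupies bits 5..4)
+ *
+ * so hrm_bits[1] |= (3 << 4), recording both the ref and mod bits
+ * for that page.
+ */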
+
+/*
+ * Return collected statistics about an address space.
+ * If clearflag is set, atomically read and zero the bits.
+ *
+ * Fill in the data array supplied with the referenced and
+ * modified bits collected for address range [addr ... addr + len]
+ * in address space, as, uniquely identified by id.
+ * The destination is a byte array; two bits are filled in per byte:
+ * the referenced and modified bits.
+ * Kernel only interface, can't fault on destination data array.
+ *
+ */
+void
+hat_getstat(struct as *as, caddr_t addr, size_t len, uint_t id,
+ caddr_t datap, int clearflag)
+{
+ size_t np; /* number of pages */
+ caddr_t a;
+ char *dp;
+
+ np = btop(len);
+ bzero(datap, np);
+
+ hat_sync(as->a_hat, addr, len, clearflag);
+
+ /* allocate more statistics blocks if needed */
+ hrm_getblk(0);
+
+ mutex_enter(&hat_statlock);
+ if (hrm_hashtab == NULL) {
+ /* can happen when victim process exits */
+ mutex_exit(&hat_statlock);
+ return;
+ }
+ dp = datap;
+ a = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
+ while (a < addr + len) {
+ struct hrmstat *hrm;
+ size_t n; /* number of pages, temp */
+ int h; /* hash index */
+ uint_t po;
+
+ h = hrm_hash(as, a);
+ n = (HRM_PAGES -
+ (((uintptr_t)a & HRM_PAGEMASK) >> MMU_PAGESHIFT));
+ if (n > np)
+ n = np;
+ po = ((uintptr_t)a & HRM_BASEOFFSET) >> MMU_PAGESHIFT;
+
+ for (hrm = hrm_hashtab[h]; hrm; hrm = hrm->hrm_hnext) {
+ if (hrm->hrm_as == as &&
+ hrm->hrm_base == ((uintptr_t)a & HRM_BASEMASK) &&
+ id == hrm->hrm_id) {
+ int i, nr;
+ uint_t bo, spb;
+
+ /*
+ * Extract leading unaligned bits.
+ */
+ i = 0;
+ while (i < n && (po & 3)) {
+ bo = po / (NBBY / 2);
+ spb = (3 - (po & 3)) * 2;
+ *dp++ |= (hrm->hrm_bits[bo] >> spb) & 3;
+ if (clearflag)
+ hrm->hrm_bits[bo] &= ~(3<<spb);
+ po++;
+ i++;
+ }
+ /*
+ * Extract aligned bits.
+ */
+ nr = n/4*4;
+ bo = po / (NBBY / 2);
+ while (i < nr) {
+ int bits = hrm->hrm_bits[bo];
+ *dp++ |= (bits >> 6) & 3;
+ *dp++ |= (bits >> 4) & 3;
+ *dp++ |= (bits >> 2) & 3;
+ *dp++ |= (bits >> 0) & 3;
+ if (clearflag)
+ hrm->hrm_bits[bo] = 0;
+ bo++;
+ po += 4;
+ i += 4;
+ }
+ /*
+ * Extract trailing unaligned bits.
+ */
+ while (i < n) {
+ bo = po / (NBBY / 2);
+ spb = (3 - (po & 3)) * 2;
+ *dp++ |= (hrm->hrm_bits[bo] >> spb) & 3;
+ if (clearflag)
+ hrm->hrm_bits[bo] &= ~(3<<spb);
+ po++;
+ i++;
+ }
+
+ break;
+ }
+ }
+ if (hrm == NULL)
+ dp += n;
+ np -= n;
+ a += n * MMU_PAGESIZE;
+ }
+ mutex_exit(&hat_statlock);
+}
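+
+/*
+ * Illustrative consumer sketch (hypothetical; NPAGES names an arbitrary
+ * page count and is not defined here).  A kernel caller samples ref/mod
+ * data into a byte array, one byte per page, with the referenced bit in
+ * bit position 1 and the modified bit in bit position 0:
+ *
+ *	char rm[NPAGES];
+ *	int id = hat_startstat(as);
+ *
+ *	if (id != -1) {
+ *		hat_getstat(as, addr, NPAGES * PAGESIZE, (uint_t)id,
+ *		    rm, HAT_SYNC_ZERORM);
+ *		(rm[i] now holds the bits for page i; they were cleared
+ *		in the hat because a nonzero clearflag was passed)
+ *		hat_freestat(as, id);
+ *	}
+ */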
diff --git a/usr/src/uts/common/vm/kpm.h b/usr/src/uts/common/vm/kpm.h
new file mode 100644
index 0000000000..edc213b8f8
--- /dev/null
+++ b/usr/src/uts/common/vm/kpm.h
@@ -0,0 +1,57 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _VM_KPM_H
+#define _VM_KPM_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _LP64
+#define SEGKPM_SUPPORT
+#endif
+
+#ifndef _ASM
+
+/*
+ * Machine independent per instance kpm mapping structure
+ */
+struct kpme {
+ struct kpme *kpe_next;
+ struct kpme *kpe_prev;
+ struct page *kpe_page; /* back pointer to (start) page */
+};
+
+#endif /* _ASM */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _VM_KPM_H */
diff --git a/usr/src/uts/common/vm/page.h b/usr/src/uts/common/vm/page.h
new file mode 100644
index 0000000000..9cd32e0ae3
--- /dev/null
+++ b/usr/src/uts/common/vm/page.h
@@ -0,0 +1,1006 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+#ifndef _VM_PAGE_H
+#define _VM_PAGE_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <vm/seg.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(_KERNEL) || defined(_KMEMUSER)
+
+/*
+ * Shared/Exclusive lock.
+ */
+
+/*
+ * Types of page locking supported by page_lock & friends.
+ */
+typedef enum {
+ SE_SHARED,
+ SE_EXCL /* exclusive lock (value == -1) */
+} se_t;
+
+/*
+ * For requesting that page_lock reclaim the page from the free list.
+ */
+typedef enum {
+ P_RECLAIM, /* reclaim page from free list */
+ P_NO_RECLAIM /* DON`T reclaim the page */
+} reclaim_t;
+
+/*
+ * Callers of page_try_reclaim_lock and page_lock_es can use this flag
+ * to get SE_EXCL access before reader/writers are given access.
+ */
+#define SE_EXCL_WANTED 0x02
+
+#endif /* _KERNEL | _KMEMUSER */
+
+typedef int selock_t;
+
+/*
+ * Define VM_STATS to turn on all sorts of statistic gathering about
+ * the VM layer. By default, it is only turned on when DEBUG is
+ * also defined.
+ */
+#ifdef DEBUG
+#define VM_STATS
+#endif /* DEBUG */
+
+#ifdef VM_STATS
+#define VM_STAT_ADD(stat) (stat)++
+#define VM_STAT_COND_ADD(cond, stat) ((void) (!(cond) || (stat)++))
+#else
+#define VM_STAT_ADD(stat)
+#define VM_STAT_COND_ADD(cond, stat)
+#endif /* VM_STATS */
+
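+/*
+ * Typical usage (an illustrative sketch; "vmstats" is a hypothetical
+ * counter struct, not one defined in the VM code): on VM_STATS kernels
+ * the counters are incremented, otherwise the macros compile away.
+ *
+ *	static struct { uint_t lookups; uint_t misses; } vmstats;
+ *
+ *	VM_STAT_ADD(vmstats.lookups);
+ *	VM_STAT_COND_ADD(pp == NULL, vmstats.misses);
+ */
+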
+#ifdef _KERNEL
+
+/*
+ * Macros to acquire and release the page logical lock.
+ */
+#define page_struct_lock(pp) mutex_enter(&page_llock)
+#define page_struct_unlock(pp) mutex_exit(&page_llock)
+
+#endif /* _KERNEL */
+
+#include <sys/t_lock.h>
+
+struct as;
+
+/*
+ * Each physical page has a page structure, which is used to maintain
+ * these pages as a cache. A page can be found via a hashed lookup
+ * based on the [vp, offset]. If a page has a [vp, offset] identity,
+ * then it is entered on a doubly linked circular list off the
+ * vnode using the vpnext/vpprev pointers. If the p_free bit
+ * is on, then the page is also on a doubly linked circular free
+ * list using next/prev pointers. If the "p_selock" and "p_iolock"
+ * are held, then the page is currently being read in (exclusive p_selock)
+ * or written back (shared p_selock). In this case, the next/prev pointers
+ * are used to link the pages together for a consecutive i/o request. If
+ * the page is being brought in from its backing store, then other processes
+ * will wait for the i/o to complete before attaching to the page since it
+ * will have an "exclusive" lock.
+ *
+ * Each page structure has the locks described below along with
+ * the fields they protect:
+ *
+ * p_selock This is a per-page shared/exclusive lock that is
+ * used to implement the logical shared/exclusive
+ * lock for each page. The "shared" lock is normally
+ * used in most cases while the "exclusive" lock is
+ * required to destroy or retain exclusive access to
+ * a page (e.g., while reading in pages). The appropriate
+ * lock is always held whenever there is any reference
+ * to a page structure (e.g., during i/o).
+ * (Note that with the addition of the "writer-lock-wanted"
+ * semantics (via SE_EWANTED), threads must not acquire
+ * multiple reader locks or else a deadly embrace will
+ * occur in the following situation: thread 1 obtains a
+ * reader lock; next thread 2 fails to get a writer lock
+ * but specified SE_EWANTED so it will wait by either
+ * blocking (when using page_lock_es) or spinning while
+ * retrying (when using page_try_reclaim_lock) until the
+ * reader lock is released; then thread 1 attempts to
+ * get another reader lock but is denied due to
+ * SE_EWANTED being set, and now both threads are in a
+ * deadly embrace.)
+ *
+ * p_hash
+ * p_vnode
+ * p_offset
+ *
+ * p_free
+ * p_age
+ *
+ * p_iolock This is a binary semaphore lock that provides
+ * exclusive access to the i/o list links in each
+ * page structure. It is always held while the page
+ * is on an i/o list (i.e., involved in i/o). That is,
+ * even though a page may be only `shared' locked
+ * while it is doing a write, the following fields may
+ * change anyway. Normally, the page must be
+ * `exclusively' locked to change anything in it.
+ *
+ * p_next
+ * p_prev
+ *
+ * The following fields are protected by the global page_llock:
+ *
+ * p_lckcnt
+ * p_cowcnt
+ *
+ * The following lists are protected by the global page_freelock:
+ *
+ * page_cachelist
+ * page_freelist
+ *
+ * The following, for our purposes, are protected by
+ * the global freemem_lock:
+ *
+ * freemem
+ * freemem_wait
+ * freemem_cv
+ *
+ * The following fields are protected by hat layer lock(s). When a page
+ * structure is not mapped and is not associated with a vnode (after a call
+ * to page_hashout() for example) the p_nrm field may be modified with out
+ * holding the hat layer lock:
+ *
+ * p_nrm
+ * p_mapping
+ * p_share
+ *
+ * The following field is file system dependent. How it is used and
+ * the locking strategies applied are up to the individual file system
+ * implementation.
+ *
+ * p_fsdata
+ *
+ * The page structure is used to represent and control the system's
+ * physical pages. There is one instance of the structure for each
+ * page that is not permanently allocated. For example, the pages that
+ * hold the page structures are permanently held by the kernel
+ * and hence do not need page structures to track them. The array
+ * of page structures is allocated early on in the kernel's life and
+ * is based on the amount of available physical memory.
+ *
+ * Each page structure may simultaneously appear on several linked lists.
+ * The lists are: hash list, free or in i/o list, and a vnode's page list.
+ * Each type of list is protected by a different group of mutexes as described
+ * below:
+ *
+ * The hash list is used to quickly find a page when the page's vnode and
+ * offset within the vnode are known. Each page that is hashed is
+ * connected via the `p_hash' field. The anchor for each hash is in the
+ * array `page_hash'. An array of mutexes, `ph_mutex', protects the
+ * lists anchored by page_hash[]. To either search or modify a given hash
+ * list, the appropriate mutex in the ph_mutex array must be held.
+ *
+ * The free list contains pages that are `free to be given away'. For
+ * efficiency reasons, pages on this list are placed in two categories:
+ * pages that are still associated with a vnode, and pages that are not
+ * associated with a vnode. Free pages always have their `p_free' bit set,
+ * free pages that are still associated with a vnode also have their
+ * `p_age' bit set. Pages on the free list are connected via their
+ * `p_next' and `p_prev' fields. When a page is involved in some sort
+ * of i/o, it is not free and these fields may be used to link associated
+ * pages together. At the moment, the free list is protected by a
+ * single mutex `page_freelock'. The list of free pages still associated
+ * with a vnode is anchored by `page_cachelist' while other free pages
+ * are anchored in architecture dependent ways (to handle page coloring etc.).
+ *
+ * Pages associated with a given vnode appear on a list anchored in the
+ * vnode by the `v_pages' field. They are linked together with
+ * `p_vpnext' and `p_vpprev'. The field `p_offset' contains a page's
+ * offset within the vnode. The pages on this list are not kept in
+ * offset order. These lists, in a manner similar to the hash lists,
+ * are protected by an array of mutexes called `vph_mutex'. Before
+ * searching or modifying this chain the appropriate mutex in the
+ * vph_mutex[] array must be held.
+ *
+ * Again, each of the lists that a page can appear on is protected by a
+ * mutex. Before reading or writing any of the fields comprising the
+ * list, the appropriate lock must be held. These list locks should only
+ * be held for very short intervals.
+ *
+ * In addition to the list locks, each page structure contains a
+ * shared/exclusive lock that protects various fields within it.
+ * To modify one of these fields, the `p_selock' must be exclusively held.
+ * To read a field with a degree of certainty, the lock must be at least
+ * held shared.
+ *
+ * Removing a page structure from one of the lists requires holding
+ * the appropriate list lock and the page's p_selock. A page may be
+ * prevented from changing identity, being freed, or otherwise modified
+ * by acquiring p_selock shared.
+ *
+ * To avoid deadlocks, a strict locking protocol must be followed. Basically
+ * there are two cases: In the first case, the page structure in question
+ * is known ahead of time (e.g., when the page is to be added or removed
+ * from a list). In the second case, the page structure is not known and
+ * must be found by searching one of the lists.
+ *
+ * When adding or removing a known page to one of the lists, first the
+ * page must be exclusively locked (since at least one of its fields
+ * will be modified), second the lock protecting the list must be acquired,
+ * third the page inserted or deleted, and finally the list lock dropped.
+ *
+ * The more interesting case occurs when the particular page structure
+ * is not known ahead of time. For example, when a call is made to
+ * page_lookup(), it is not known if a page with the desired (vnode and
+ * offset pair) identity exists. So the appropriate mutex in ph_mutex is
+ * acquired, the hash list searched, and if the desired page is found
+ * an attempt is made to lock it. The attempt to acquire p_selock must
+ * not block while the hash list lock is held. A deadlock could occur
+ * if some other process was trying to remove the page from the list.
+ * The removing process (following the above protocol) would have exclusively
+ * locked the page, and be spinning waiting to acquire the lock protecting
+ * the hash list. Since the searching process holds the hash list lock
+ * and is waiting to acquire the page lock, a deadlock occurs.
+ *
+ * The proper scheme to follow is: first, lock the appropriate list,
+ * search the list, and if the desired page is found either use
+ * page_trylock() (which will not block) or pass the address of the
+ * list lock to page_lock(). If page_lock() can not acquire the page's
+ * lock, it will drop the list lock before going to sleep. page_lock()
+ * returns a value to indicate if the list lock was dropped allowing the
+ * calling program to react appropriately (i.e., retry the operation).
+ *
+ * If the list lock was dropped before the attempt at locking the page
+ * was made, checks would have to be made to ensure that the page had
+ * not changed identity before its lock was obtained. This is because
+ * the interval between dropping the list lock and acquiring the page
+ * lock is indeterminate.
+ *
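+ * A minimal sketch of the non-blocking variant of that scheme
+ * (illustrative only; find_on_chain() is a hypothetical helper, the
+ * real code lives in page_lookup() and friends):
+ *
+ *	mutex_enter(phm);			(hash chain mutex)
+ *	pp = find_on_chain(vp, off);
+ *	if (pp != NULL && !page_trylock(pp, SE_SHARED))
+ *		pp = NULL;			(busy; caller may retry)
+ *	mutex_exit(phm);
+ *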
+ * In addition, when both a hash list lock (ph_mutex[]) and a vnode list
+ * lock (vph_mutex[]) are needed, the hash list lock must be acquired first.
+ * The routine page_hashin() is a good example of this sequence.
+ * This sequence is ASSERTed by checking that the vph_mutex[] is not held
+ * just before each acquisition of one of the mutexes in ph_mutex[].
+ *
+ * So, as a quick summary:
+ *
+ * pse_mutex[]'s protect the p_selock and p_cv fields.
+ *
+ * p_selock protects the p_free, p_age, p_vnode, p_offset and p_hash,
+ *
+ * ph_mutex[]'s protect the page_hash[] array and its chains.
+ *
+ * vph_mutex[]'s protect the v_pages field and the vp page chains.
+ *
+ * First lock the page, then the hash chain, then the vnode chain. When
+ * this is not possible `trylocks' must be used. Sleeping while holding
+ * any of these mutexes (p_selock is not a mutex) is not allowed.
+ *
+ *
+ * field reading writing ordering
+ * ======================================================================
+ * p_vnode p_selock(E,S) p_selock(E)
+ * p_offset
+ * p_free
+ * p_age
+ * =====================================================================
+ * p_hash p_selock(E,S) p_selock(E) && p_selock, ph_mutex
+ * ph_mutex[]
+ * =====================================================================
+ * p_vpnext p_selock(E,S) p_selock(E) && p_selock, vph_mutex
+ * p_vpprev vph_mutex[]
+ * =====================================================================
+ * When the p_free bit is set:
+ *
+ * p_next p_selock(E,S) p_selock(E) && p_selock,
+ * p_prev page_freelock page_freelock
+ *
+ * When the p_free bit is not set:
+ *
+ * p_next p_selock(E,S) p_selock(E) && p_selock, p_iolock
+ * p_prev p_iolock
+ * =====================================================================
+ * p_selock pse_mutex[] pse_mutex[] can`t acquire any
+ * p_cv other mutexes or
+ * sleep while holding
+ * this lock.
+ * =====================================================================
+ * p_lckcnt p_selock(E,S) p_selock(E) &&
+ * p_cowcnt page_llock
+ * =====================================================================
+ * p_nrm hat layer lock hat layer lock
+ * p_mapping
+ * p_pagenum
+ * =====================================================================
+ *
+ * where:
+ * E----> exclusive version of p_selock.
+ * S----> shared version of p_selock.
+ *
+ *
+ * Global data structures and variable:
+ *
+ * field reading writing ordering
+ * =====================================================================
+ * page_hash[] ph_mutex[] ph_mutex[] can hold this lock
+ * before acquiring
+ * a vph_mutex or
+ * pse_mutex.
+ * =====================================================================
+ * vp->v_pages vph_mutex[] vph_mutex[] can only acquire
+ * a pse_mutex while
+ * holding this lock.
+ * =====================================================================
+ *	page_cachelist	page_freelock	page_freelock	can't acquire any
+ *	page_freelist	page_freelock	page_freelock	other mutexes while
+ *							holding this mutex.
+ * =====================================================================
+ * freemem freemem_lock freemem_lock can't acquire any
+ * freemem_wait other mutexes while
+ * freemem_cv holding this mutex.
+ * =====================================================================
+ *
+ * Page relocation, PG_NORELOC and P_NORELOC.
+ *
+ * Pages may be relocated using the page_relocate() interface. Relocation
+ * involves moving the contents and identity of a page to another, free page.
+ * To relocate a page, the SE_EXCL lock must be obtained. The way to prevent
+ * a page from being relocated is to hold the SE_SHARED lock (the SE_EXCL
+ * lock must not be held indefinitely). If the page is going to be held
+ * SE_SHARED indefinitely, then the PG_NORELOC hint should be passed
+ * to page_create_va so that pages that are prevented from being relocated
+ * can be managed differently by the platform specific layer.
+ *
+ * Pages locked in memory using page_pp_lock (p_lckcnt/p_cowcnt != 0)
+ * are guaranteed to be held in memory, but can still be relocated
+ * providing the SE_EXCL lock can be obtained.
+ *
+ * The P_NORELOC bit in the page_t.p_state field is provided for use by
+ * the platform specific code in managing pages when the PG_NORELOC
+ * hint is used.
+ *
+ * Memory delete and page locking.
+ *
+ * The set of all usable pages is managed using the global page list as
+ * implemented by the memseg structure defined below. When memory is added
+ * or deleted this list changes. Additions to this list guarantee that the
+ * list is never corrupt. In order to avoid the necessity of an additional
+ * lock to protect against failed accesses to the memseg being deleted and,
+ * more importantly, the page_ts, the memseg structure is never freed and the
+ * page_t virtual address space is remapped to a page (or pages) of
+ * zeros. If a page_t is manipulated while it is p_selock'd, or if it is
+ * locked indirectly via a hash or freelist lock, it is not possible for
+ * memory delete to collect the page and so that part of the page list is
+ * prevented from being deleted. If the page is referenced outside of one
+ * of these locks, it is possible for the page_t being referenced to be
+ * deleted. Examples of this are page_t pointers returned by
+ * page_numtopp_nolock, page_first and page_next. Providing the page_t
+ * is re-checked after taking the p_selock (for p_vnode != NULL), the
+ * remapping to the zero pages will be detected.
+ *
+ *
+ * Page size (p_szc field) and page locking.
+ *
+ * p_szc field of free pages is changed by free list manager under freelist
+ * locks and is of no concern to the rest of VM subsystem.
+ *
+ * p_szc changes of allocated anonymous (swapfs) pages can only be done after
+ * exclusively locking all constituent pages and calling hat_pageunload() on
+ * each of them. To prevent p_szc changes of non free anonymous (swapfs) large
+ * pages it's enough to either lock SHARED any of constituent pages or prevent
+ * hat_pageunload() by holding hat level lock that protects mapping lists (this
+ * method is for hat code only)
+ *
+ * To increase (promote) p_szc of allocated non anonymous file system pages
+ * one has to first lock exclusively all involved constituent pages and call
+ * hat_pageunload() on each of them. To prevent p_szc promote it's enough to
+ * either lock SHARED any of constituent pages that will be needed to make a
+ * large page or prevent hat_pageunload() by holding hat level lock that
+ * protects mapping lists (this method is for hat code only).
+ *
+ * To decrease (demote) p_szc of an allocated non anonymous file system large
+ * page one can either use the same method as used for changing p_szc of
+ * anonymous large pages or if it's not possible to lock all constituent pages
+ * exclusively a different method can be used. In the second method one only
+ * has to exclusively lock one of constituent pages but then one has to
+ * acquire further locks by calling page_szc_lock() and
+ * hat_page_demote(). hat_page_demote() acquires hat level locks and then
+ * demotes the page. This mechanism relies on the fact that any code that
+ * needs to prevent p_szc of a file system large page from changing either
+ * locks all constituent large pages at least SHARED or locks some pages at
+ * least SHARED and calls page_szc_lock() or uses hat level page locks.
+ * Demotion using this method is implemented by page_demote_vp_pages().
+ * Please see comments in front of page_demote_vp_pages(), hat_page_demote()
+ * and page_szc_lock() for more details.
+ *
+ * Lock order: p_selock, page_szc_lock, ph_mutex/vph_mutex/freelist,
+ * hat level locks.
+ */
+
+typedef struct page {
+ u_offset_t p_offset; /* offset into vnode for this page */
+ struct vnode *p_vnode; /* vnode that this page is named by */
+ selock_t p_selock; /* shared/exclusive lock on the page */
+#if defined(_LP64)
+ int p_selockpad; /* pad for growing selock */
+#endif
+ struct page *p_hash; /* hash by [vnode, offset] */
+ struct page *p_vpnext; /* next page in vnode list */
+ struct page *p_vpprev; /* prev page in vnode list */
+ struct page *p_next; /* next page in free/intrans lists */
+ struct page *p_prev; /* prev page in free/intrans lists */
+ ushort_t p_lckcnt; /* number of locks on page data */
+ ushort_t p_cowcnt; /* number of copy on write lock */
+ kcondvar_t p_cv; /* page struct's condition var */
+ kcondvar_t p_io_cv; /* for iolock */
+ uchar_t p_iolock_state; /* replaces p_iolock */
+ volatile uchar_t p_szc; /* page size code */
+ uchar_t p_fsdata; /* file system dependent byte */
+ uchar_t p_state; /* p_free, p_noreloc */
+ uchar_t p_nrm; /* non-cache, ref, mod readonly bits */
+#if defined(__sparc)
+ uchar_t p_vcolor; /* virtual color */
+#else
+ uchar_t p_embed; /* x86 - changes p_mapping & p_index */
+#endif
+ uchar_t p_index; /* MPSS mapping info. Not used on x86 */
+ uchar_t p_toxic; /* page has an unrecoverable error */
+ void *p_mapping; /* hat specific translation info */
+ pfn_t p_pagenum; /* physical page number */
+
+ uint_t p_share; /* number of translations */
+#if defined(_LP64)
+ uint_t p_sharepad; /* pad for growing p_share */
+#endif
+ uint_t p_msresv_1; /* reserved for future use */
+#if defined(__sparc)
+ uint_t p_kpmref; /* number of kpm mapping sharers */
+ struct kpme *p_kpmelist; /* kpm specific mapping info */
+#else
+ /* index of entry in p_map when p_embed is set */
+ uint_t p_mlentry;
+#endif
+ uint64_t p_msresv_2; /* page allocation debugging */
+} page_t;
+
+
+typedef page_t devpage_t;
+#define devpage page
+
+
+/*
+ * Page hash table is a power-of-two in size, externally chained
+ * through the hash field. PAGE_HASHAVELEN is the average length
+ * desired for this chain, from which the size of the page_hash
+ * table is derived at boot time and stored in the kernel variable
+ * page_hashsz. In the hash function it is given by PAGE_HASHSZ.
+ *
+ * PAGE_HASH_FUNC returns an index into the page_hash[] array. This
+ * index is also used to derive the mutex that protects the chain.
+ *
+ * In constructing the hash function, first we dispose of unimportant bits
+ * (page offset from "off" and the low 3 bits of "vp" which are zero for
+ * struct alignment). Then shift and sum the remaining bits a couple times
+ * in order to get as many source bits from the two source values into the
+ * resulting hashed value. (Note that this will perform quickly, since the
+ * shifting/summing are fast register-to-register operations with no
+ * additional memory references.)
+ */
+#if NCPU < 4
+#define PH_TABLE_SIZE 16
+#define VP_SHIFT 7
+#else
+#define PH_TABLE_SIZE 128
+#define VP_SHIFT 9
+#endif
+
+/*
+ * The amount to use for the successive shifts in the hash function below.
+ * The actual value is LOG2(PH_TABLE_SIZE), so that as many bits as
+ * possible will filter thru PAGE_HASH_FUNC() and PAGE_HASH_MUTEX().
+ */
+#define PH_SHIFT_SIZE (7)
+
+#define PAGE_HASHSZ page_hashsz
+#define PAGE_HASHAVELEN 4
+#define PAGE_HASH_FUNC(vp, off) \
+ ((((uintptr_t)(off) >> PAGESHIFT) + \
+ ((uintptr_t)(off) >> (PAGESHIFT + PH_SHIFT_SIZE)) + \
+ ((uintptr_t)(vp) >> 3) + \
+ ((uintptr_t)(vp) >> (3 + PH_SHIFT_SIZE)) + \
+ ((uintptr_t)(vp) >> (3 + 2 * PH_SHIFT_SIZE))) & \
+ (PAGE_HASHSZ - 1))
+#ifdef _KERNEL
+
+/*
+ * The page hash value is re-hashed to an index for the ph_mutex array.
+ *
+ * For 64 bit kernels, the mutex array is padded out to prevent false
+ * sharing of cache sub-blocks (64 bytes) of adjacent mutexes.
+ *
+ * For 32 bit kernels, we don't want to waste kernel address space with
+ * padding, so instead we rely on the hash function to introduce skew of
+ * adjacent vnode/offset indexes (the left shift part of the hash function).
+ * Since sizeof (kmutex_t) is 8, we shift an additional 3 to skew to a different
+ * 64 byte sub-block.
+ */
+typedef struct pad_mutex {
+ kmutex_t pad_mutex;
+#ifdef _LP64
+ char pad_pad[64 - sizeof (kmutex_t)];
+#endif
+} pad_mutex_t;
+extern pad_mutex_t ph_mutex[];
+
+#define PAGE_HASH_MUTEX(x) \
+ &(ph_mutex[((x) + ((x) >> VP_SHIFT) + ((x) << 3)) & \
+ (PH_TABLE_SIZE - 1)].pad_mutex)
+
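+/*
+ * Illustrative use of the two macros above (a sketch, not the
+ * page_lookup() implementation): the same hash index selects both the
+ * chain and, after re-hashing, the mutex that protects it.
+ *
+ *	ulong_t index = PAGE_HASH_FUNC(vp, off);
+ *	kmutex_t *phm = PAGE_HASH_MUTEX(index);
+ *
+ *	mutex_enter(phm);
+ *	(walk page_hash[index] via the p_hash links, comparing
+ *	p_vnode and p_offset)
+ *	mutex_exit(phm);
+ */
+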
+/*
+ * Flags used while creating pages.
+ */
+#define PG_EXCL 0x0001
+#define PG_WAIT 0x0002
+#define PG_PHYSCONTIG 0x0004 /* NOT SUPPORTED */
+#define PG_MATCH_COLOR 0x0008 /* SUPPORTED by free list routines */
+#define PG_NORELOC 0x0010 /* Non-relocatable alloc hint. */
+ /* Page must be PP_ISNORELOC */
+#define PG_PANIC 0x0020 /* system will panic if alloc fails */
+#define PG_PUSHPAGE 0x0040 /* alloc may use reserve */
+
+/*
+ * When p_selock has the SE_EWANTED bit set, threads waiting for SE_EXCL
+ * access are given priority over all other waiting threads.
+ */
+#define SE_EWANTED 0x40000000
+#define PAGE_LOCKED(pp) (((pp)->p_selock & ~SE_EWANTED) != 0)
+#define PAGE_SHARED(pp) (((pp)->p_selock & ~SE_EWANTED) > 0)
+#define PAGE_EXCL(pp) ((pp)->p_selock < 0)
+#define PAGE_LOCKED_SE(pp, se) \
+ ((se) == SE_EXCL ? PAGE_EXCL(pp) : PAGE_SHARED(pp))
+
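+/*
+ * Reading the encoding (informal summary of the macros above): shared
+ * holders keep a positive count in p_selock, an exclusive holder makes
+ * it negative, and a waiting writer sets the SE_EWANTED bit, which is
+ * why that bit is masked off before the sign/zero tests.
+ */
+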
+extern long page_hashsz;
+extern page_t **page_hash;
+
+extern kmutex_t page_llock; /* page logical lock mutex */
+extern kmutex_t freemem_lock; /* freemem lock */
+
+extern pgcnt_t total_pages; /* total pages in the system */
+
+/*
+ * Variables controlling locking of physical memory.
+ */
+extern pgcnt_t pages_pp_maximum; /* tuning: lock + claim <= max */
+extern void init_pages_pp_maximum(void);
+
+struct lgrp;
+
+/* page_list_{add,sub} flags */
+
+/* which list */
+#define PG_FREE_LIST 0x0001
+#define PG_CACHE_LIST 0x0002
+
+/* where on list */
+#define PG_LIST_TAIL 0x0010
+#define PG_LIST_HEAD 0x0020
+
+/* called from */
+#define PG_LIST_ISINIT 0x1000
+#define PG_LIST_ISCAGE 0x2000
+
+/*
+ * Flags for setting the p_toxic flag when a page has errors
+ * These flags may be OR'ed into the p_toxic page flag to
+ * indicate that error(s) have occurred on a page,
+ * (see page_settoxic()). If both PAGE_IS_TOXIC and
+ * PAGE_IS_FAILING are set, PAGE_IS_FAILING takes precedence.
+ *
+ * When an error happens on a page, the trap handler sets
+ * PAGE_IS_FAULTY on the page to indicate that an error has been
+ * seen on the page. The error could be really a memory error or
+ * something else (like a datapath error). When it is determined
+ * that it is a memory error, the page is marked as PAGE_IS_TOXIC
+ * or PAGE_IS_FAILING depending on the type of error and then
+ * retired.
+ *
+ * We use the page's 'toxic' flag to determine whether the page
+ * has just got a single error - PAGE_IS_TOXIC - or is being
+ * retired due to multiple soft errors - PAGE_IS_FAILING. In
+ * page_free(), a page that has been marked PAGE_IS_FAILING will
+ * not be cleaned, it will always be retired. A page marked
+ * PAGE_IS_TOXIC is cleaned and is retired only if this attempt at
+ * cleaning fails.
+ *
+ * When a page has been successfully retired, we set PAGE_IS_RETIRED.
+ */
+#define PAGE_IS_OK 0x0
+#define PAGE_IS_TOXIC 0x1
+#define PAGE_IS_FAILING 0x2
+#define PAGE_IS_RETIRED 0x4
+#define PAGE_IS_FAULTY 0x8
+
+/*
+ * Page frame operations.
+ */
+page_t *page_lookup(struct vnode *, u_offset_t, se_t);
+page_t *page_lookup_create(struct vnode *, u_offset_t, se_t, page_t *,
+ spgcnt_t *, int);
+page_t *page_lookup_nowait(struct vnode *, u_offset_t, se_t);
+page_t *page_find(struct vnode *, u_offset_t);
+page_t *page_exists(struct vnode *, u_offset_t);
+int page_exists_physcontig(vnode_t *, u_offset_t, uint_t, page_t *[]);
+int page_exists_forreal(struct vnode *, u_offset_t, uint_t *);
+void page_needfree(spgcnt_t);
+page_t *page_create(struct vnode *, u_offset_t, size_t, uint_t);
+int page_alloc_pages(struct seg *, caddr_t, page_t **, page_t **,
+ uint_t, int);
+page_t *page_create_va_large(vnode_t *vp, u_offset_t off, size_t bytes,
+ uint_t flags, struct seg *seg, caddr_t vaddr, void *arg);
+page_t *page_create_va(struct vnode *, u_offset_t, size_t, uint_t,
+ struct seg *, caddr_t);
+int page_create_wait(size_t npages, uint_t flags);
+void page_create_putback(ssize_t npages);
+void page_free(page_t *, int);
+void page_free_at_startup(page_t *);
+void page_free_pages(page_t *);
+void free_vp_pages(struct vnode *, u_offset_t, size_t);
+int page_reclaim(page_t *, kmutex_t *);
+void page_destroy(page_t *, int);
+void page_destroy_pages(page_t *);
+void page_destroy_free(page_t *);
+void page_rename(page_t *, struct vnode *, u_offset_t);
+int page_hashin(page_t *, struct vnode *, u_offset_t, kmutex_t *);
+void page_hashout(page_t *, kmutex_t *);
+int page_num_hashin(pfn_t, struct vnode *, u_offset_t);
+void page_add(page_t **, page_t *);
+void page_add_common(page_t **, page_t *);
+void page_sub(page_t **, page_t *);
+void page_sub_common(page_t **, page_t *);
+page_t *page_get_freelist(struct vnode *, u_offset_t, struct seg *,
+ caddr_t, size_t, uint_t, struct lgrp *);
+
+page_t *page_get_cachelist(struct vnode *, u_offset_t, struct seg *,
+ caddr_t, uint_t, struct lgrp *);
+void page_list_add(page_t *, int);
+void page_boot_demote(page_t *);
+void page_promote_size(page_t *, uint_t);
+void page_list_add_pages(page_t *, int);
+void page_list_sub(page_t *, int);
+void page_list_break(page_t **, page_t **, size_t);
+void page_list_concat(page_t **, page_t **);
+void page_vpadd(page_t **, page_t *);
+void page_vpsub(page_t **, page_t *);
+int page_lock(page_t *, se_t, kmutex_t *, reclaim_t);
+int page_lock_es(page_t *, se_t, kmutex_t *, reclaim_t, int);
+void page_lock_clr_exclwanted(page_t *);
+int page_trylock(page_t *, se_t);
+int page_try_reclaim_lock(page_t *, se_t, int);
+int page_tryupgrade(page_t *);
+void page_downgrade(page_t *);
+void page_unlock(page_t *);
+void page_lock_delete(page_t *);
+int page_pp_lock(page_t *, int, int);
+void page_pp_unlock(page_t *, int, int);
+int page_resv(pgcnt_t, uint_t);
+void page_unresv(pgcnt_t);
+void page_pp_useclaim(page_t *, page_t *, uint_t);
+int page_addclaim(page_t *);
+int page_subclaim(page_t *);
+int page_addclaim_pages(page_t **);
+int page_subclaim_pages(page_t **);
+pfn_t page_pptonum(page_t *);
+page_t *page_numtopp(pfn_t, se_t);
+page_t *page_numtopp_noreclaim(pfn_t, se_t);
+page_t *page_numtopp_nolock(pfn_t);
+page_t *page_numtopp_nowait(pfn_t, se_t);
+page_t *page_first();
+page_t *page_next(page_t *);
+page_t *page_nextn_raw(page_t *, ulong_t); /* pp += n */
+#define page_next_raw(PP) page_nextn_raw((PP), 1)
+page_t *page_list_next(page_t *);
+page_t *page_nextn(page_t *, ulong_t);
+page_t *page_next_scan_init(void **);
+page_t *page_next_scan_large(page_t *, ulong_t *, void **);
+void prefetch_page_r(void *);
+void ppcopy(page_t *, page_t *);
+void page_relocate_hash(page_t *, page_t *);
+void pagezero(page_t *, uint_t, uint_t);
+void pagescrub(page_t *, uint_t, uint_t);
+void page_io_lock(page_t *);
+void page_io_unlock(page_t *);
+int page_io_trylock(page_t *);
+int page_iolock_assert(page_t *);
+void page_iolock_init(page_t *);
+pgcnt_t page_busy(int);
+void page_lock_init(void);
+ulong_t page_share_cnt(page_t *);
+int page_isshared(page_t *);
+int page_isfree(page_t *);
+int page_isref(page_t *);
+int page_ismod(page_t *);
+int page_release(page_t *, int);
+int page_retire(page_t *, uchar_t);
+int page_istoxic(page_t *);
+int page_isfailing(page_t *);
+int page_isretired(page_t *);
+int page_deteriorating(page_t *);
+void page_settoxic(page_t *, uchar_t);
+void page_clrtoxic(page_t *);
+void page_clrtoxic_flag(page_t *, uchar_t);
+int page_isfaulty(page_t *);
+int page_mem_avail(pgcnt_t);
+
+void page_set_props(page_t *, uint_t);
+void page_clr_all_props(page_t *);
+
+kmutex_t *page_vnode_mutex(struct vnode *);
+kmutex_t *page_se_mutex(struct page *);
+kmutex_t *page_szc_lock(struct page *);
+int page_szc_lock_assert(struct page *pp);
+
+/*
+ * Page relocation interfaces. page_relocate() is generic.
+ * page_get_replacement_page() is provided by the PSM.
+ * page_free_replacement_page() is generic.
+ */
+int group_page_trylock(page_t *, se_t);
+void group_page_unlock(page_t *);
+int page_relocate(page_t **, page_t **, int, int, spgcnt_t *, struct lgrp *);
+int do_page_relocate(page_t **, page_t **, int, spgcnt_t *, struct lgrp *);
+page_t *page_get_replacement_page(page_t *, struct lgrp *, uint_t);
+void page_free_replacement_page(page_t *);
+int page_relocate_cage(page_t **, page_t **);
+
+int page_try_demote_pages(page_t *);
+void page_demote_free_pages(page_t *);
+
+struct anon_map;
+
+void page_mark_migrate(struct seg *, caddr_t, size_t, struct anon_map *,
+ ulong_t, vnode_t *, u_offset_t, int);
+void page_migrate(struct seg *, caddr_t, page_t **, pgcnt_t);
+
+/*
+ * Tell the PIM we are adding physical memory
+ */
+void add_physmem(page_t *, size_t, pfn_t);
+void add_physmem_cb(page_t *, pfn_t); /* callback for page_t part */
+
+/*
+ * hw_page_array[] is configured with hardware supported page sizes by
+ * platform specific code.
+ */
+typedef struct {
+ size_t hp_size;
+ uint_t hp_shift;
+ pgcnt_t hp_pgcnt; /* base pagesize cnt */
+} hw_pagesize_t;
+
+extern hw_pagesize_t hw_page_array[];
+extern uint_t page_colors, page_colors_mask;
+extern uint_t page_coloring_shift;
+extern int cpu_page_colors;
+
+uint_t page_num_pagesizes(void);
+uint_t page_num_user_pagesizes(void);
+size_t page_get_pagesize(uint_t);
+size_t page_get_user_pagesize(uint_t n);
+pgcnt_t page_get_pagecnt(uint_t);
+uint_t page_get_shift(uint_t);
+int page_szc(size_t);
+int page_user_szc(size_t);
+
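+/*
+ * Illustrative sketch: walking the hardware-supported page sizes through
+ * the interfaces declared above.
+ *
+ *	uint_t i, nsz = page_num_pagesizes();
+ *
+ *	for (i = 0; i < nsz; i++) {
+ *		size_t sz = page_get_pagesize(i);	(size in bytes)
+ *		pgcnt_t n = page_get_pagecnt(i);	(base pages per page)
+ *		...
+ *	}
+ */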
+
+/* page_get_replacement page flags */
+#define PGR_SAMESZC 0x1 /* only look for page size same as orig */
+#define PGR_NORELOC 0x2 /* allocate a P_NORELOC page */
+
+#endif /* _KERNEL */
+
+/*
+ * Constants used for the p_iolock_state
+ */
+#define PAGE_IO_INUSE 0x1
+#define PAGE_IO_WANTED 0x2
+
+/*
+ * Constants used for page_release status
+ */
+#define PGREL_NOTREL 0x1
+#define PGREL_CLEAN 0x2
+#define PGREL_MOD 0x3
+
+/*
+ * The p_state field holds what used to be the p_age and p_free
+ * bits. These fields are protected by p_selock (see above).
+ */
+#define P_FREE 0x80 /* Page on free list */
+#define P_NORELOC 0x40 /* Page is non-relocatable */
+#define P_MIGRATE 0x20 /* Migrate page on next touch */
+#define P_SWAP 0x10 /* belongs to vnode that is V_ISSWAP */
+
+#define PP_ISFREE(pp) ((pp)->p_state & P_FREE)
+#define PP_ISAGED(pp) (((pp)->p_state & P_FREE) && \
+ ((pp)->p_vnode == NULL))
+#define PP_ISNORELOC(pp) ((pp)->p_state & P_NORELOC)
+#define PP_ISMIGRATE(pp) ((pp)->p_state & P_MIGRATE)
+#define PP_ISSWAP(pp) ((pp)->p_state & P_SWAP)
+
+#define PP_SETFREE(pp) ((pp)->p_state = ((pp)->p_state & ~P_MIGRATE) \
+ | P_FREE)
+#define PP_SETAGED(pp) ASSERT(PP_ISAGED(pp))
+#define PP_SETNORELOC(pp) ((pp)->p_state |= P_NORELOC)
+#define PP_SETMIGRATE(pp) ((pp)->p_state |= P_MIGRATE)
+#define PP_SETSWAP(pp) ((pp)->p_state |= P_SWAP)
+
+#define PP_CLRFREE(pp) ((pp)->p_state &= ~P_FREE)
+#define PP_CLRAGED(pp) ASSERT(!PP_ISAGED(pp))
+#define PP_CLRNORELOC(pp) ((pp)->p_state &= ~P_NORELOC)
+#define PP_CLRMIGRATE(pp) ((pp)->p_state &= ~P_MIGRATE)
+#define PP_CLRSWAP(pp) ((pp)->p_state &= ~P_SWAP)
+
+
+
+/*
+ * kpm large page description.
+ * The virtual address range of segkpm is divided into chunks of
+ * kpm_pgsz. Each chunk is controlled by a kpm_page_t. The ushort
+ * is sufficient for 2^^15 * PAGESIZE, so e.g. the maximum kpm_pgsz
+ * for 8K is 256M and 2G for 64K pages. It it kept as small as
+ * possible to save physical memory space.
+ *
+ * There are 2 segkpm mapping windows within the virtual address
+ * space when we have to prevent VAC alias conflicts. The so called
+ * Alias window (mappings are always by PAGESIZE) is controlled by
+ * kp_refcnta. The regular window is controlled by kp_refcnt for the
+ * normal operation, which is to use the largest available pagesize.
+ * When VAC alias conflicts are present within a chunk in the regular
+ * window the large page mapping is broken up into smaller PAGESIZE
+ * mappings. kp_refcntc is used to control the pages that are involved
+ * in the conflict and kp_refcnts holds the active mappings done
+ * with the small page size. In non vac conflict mode kp_refcntc is
+ * also used as "go" indication (-1) for the trap level tsbmiss
+ * handler.
+ */
+typedef struct kpm_page {
+ short kp_refcnt; /* pages mapped large */
+ short kp_refcnta; /* pages mapped in Alias window */
+ short kp_refcntc; /* TL-tsbmiss flag; #vac alias conflict pages */
+ short kp_refcnts; /* vac alias: pages mapped small */
+} kpm_page_t;
+
+/*
+ * Note: khl_lock offset changes must be reflected in sfmmu_asm.s
+ */
+typedef struct kpm_hlk {
+ kmutex_t khl_mutex; /* kpm_page mutex */
+ uint_t khl_lock; /* trap level tsbmiss handling */
+} kpm_hlk_t;
+
+/*
+ * kpm small page description.
+ * When kpm_pgsz is equal to PAGESIZE a smaller representation is used
+ * to save memory space. Alias range mappings and regular segkpm
+ * mappings are done in units of PAGESIZE and can share the mapping
+ * information and the mappings are always distinguishable by their
+ * virtual address. Other information needed for VAC conflict prevention
+ * is already available on a per page basis. There are basically 3 states
+ * a kpm_spage can have: not mapped (0), mapped in Alias range or virtually
+ * uncached (1) and mapped in the regular segkpm window (-1). The -1 value
+ * is also used as "go" indication for the segkpm trap level tsbmiss
+ * handler for small pages (value is kept the same as it is used for large
+ * mappings).
+ */
+typedef struct kpm_spage {
+ char kp_mapped; /* page mapped small */
+} kpm_spage_t;
+
+/*
+ * Note: kshl_lock offset changes must be reflected in sfmmu_asm.s
+ */
+typedef struct kpm_shlk {
+ uint_t kshl_lock; /* trap level tsbmiss handling */
+} kpm_shlk_t;
+
+/*
+ * Each segment of physical memory is described by a memseg struct.
+ * Within a segment, memory is considered contiguous. The members
+ * can be categorized as follows:
+ * . Platform independent:
+ * pages, epages, pages_base, pages_end, next, lnext.
+ * . 64bit only but platform independent:
+ * kpm_pbase, kpm_nkpmpgs, kpm_pages, kpm_spages.
+ * . Really platform or mmu specific:
+ * pagespa, epagespa, nextpa, kpm_pagespa.
+ * . Mixed:
+ * msegflags.
+ */
+struct memseg {
+ page_t *pages, *epages; /* [from, to] in page array */
+ pfn_t pages_base, pages_end; /* [from, to] in page numbers */
+ struct memseg *next; /* next segment in list */
+#if defined(__sparc)
+ struct memseg *lnext; /* next segment in deleted list */
+ uint64_t pagespa, epagespa; /* [from, to] page array physical */
+ uint64_t nextpa; /* physical next pointer */
+ pfn_t kpm_pbase; /* start of kpm range */
+ pgcnt_t kpm_nkpmpgs; /* # of kpm_pgsz pages */
+ union _mseg_un {
+ kpm_page_t *kpm_lpgs; /* ptr to kpm_page array */
+ kpm_spage_t *kpm_spgs; /* ptr to kpm_spage array */
+ } mseg_un;
+ uint64_t kpm_pagespa; /* physical ptr to kpm (s)pages array */
+ uint_t msegflags; /* memseg flags */
+#endif /* __sparc */
+};
+
+/* memseg union aliases */
+#define kpm_pages mseg_un.kpm_lpgs
+#define kpm_spages mseg_un.kpm_spgs
+
+/* msegflags */
+#define MEMSEG_DYNAMIC 0x1 /* DR: memory was added dynamically */
+
+/* memseg support macros */
+#define MSEG_NPAGES(SEG) ((SEG)->pages_end - (SEG)->pages_base)
+
+/* memseg hash */
+#define MEM_HASH_SHIFT 0x9
+#define N_MEM_SLOTS 0x200 /* must be a power of 2 */
+#define MEMSEG_PFN_HASH(pfn) (((pfn)/mhash_per_slot) & (N_MEM_SLOTS - 1))
+
+/* memseg externals */
+extern struct memseg *memsegs; /* list of memory segments */
+extern ulong_t mhash_per_slot;
+extern uint64_t memsegspa; /* memsegs as physical address */
+
+void build_pfn_hash();
+extern struct memseg *page_numtomemseg_nolock(pfn_t pfnum);
+
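+/*
+ * Illustrative sketch (a simplified form of what page_numtopp_nolock()
+ * has to do): translate a pfn to its page_t by scanning the memseg list.
+ *
+ *	struct memseg *seg;
+ *
+ *	for (seg = memsegs; seg != NULL; seg = seg->next)
+ *		if (pfn >= seg->pages_base && pfn < seg->pages_end)
+ *			return (seg->pages + (pfn - seg->pages_base));
+ *	return (NULL);
+ */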
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _VM_PAGE_H */
diff --git a/usr/src/uts/common/vm/page_lock.c b/usr/src/uts/common/vm/page_lock.c
new file mode 100644
index 0000000000..9a2d12dd8e
--- /dev/null
+++ b/usr/src/uts/common/vm/page_lock.c
@@ -0,0 +1,861 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * VM - page locking primitives
+ */
+#include <sys/param.h>
+#include <sys/t_lock.h>
+#include <sys/vtrace.h>
+#include <sys/debug.h>
+#include <sys/cmn_err.h>
+#include <sys/vnode.h>
+#include <sys/bitmap.h>
+#include <sys/lockstat.h>
+#include <sys/condvar_impl.h>
+#include <vm/page.h>
+#include <vm/seg_enum.h>
+#include <vm/vm_dep.h>
+
+/*
+ * This global mutex is for logical page locking.
+ * The following fields in the page structure are protected
+ * by this lock:
+ *
+ * p_lckcnt
+ * p_cowcnt
+ */
+kmutex_t page_llock;
+
+/*
+ * This is a global lock for the logical page free list. The
+ * logical free list, in this implementation, is maintained as two
+ * separate physical lists - the cache list and the free list.
+ */
+kmutex_t page_freelock;
+
+/*
+ * The hash table, page_hash[], the p_selock fields, and the
+ * list of pages associated with vnodes are protected by arrays of mutexes.
+ *
+ * Unless the hashes are changed radically, the table sizes must be
+ * a power of two. Also, we typically need more mutexes for the
+ * vnodes since these locks are occasionally held for long periods.
+ * And since there seem to be two special vnodes (kvp and swapvp),
+ * we make room for private mutexes for them.
+ *
+ * The pse_mutex[] array holds the mutexes to protect the p_selock
+ * fields of all page_t structures.
+ *
+ * PAGE_SE_MUTEX(pp) returns the address of the appropriate mutex
+ * when given a pointer to a page_t.
+ *
+ * PSE_TABLE_SIZE must be a power of two. One could argue that we
+ * should go to the trouble of setting it up at run time and base it
+ * on memory size rather than the number of compile time CPUs.
+ *
+ * XX64 We should be using physmem size to calculate PSE_TABLE_SIZE,
+ * PSE_SHIFT, PIO_SHIFT.
+ *
+ * These might break in 64 bit world.
+ */
+#define PSE_SHIFT 7 /* log2(PSE_TABLE_SIZE) */
+
+#define PSE_TABLE_SIZE 128 /* number of mutexes to have */
+
+#define PIO_SHIFT PSE_SHIFT /* next power of 2 bigger than page_t */
+#define PIO_TABLE_SIZE PSE_TABLE_SIZE /* number of io mutexes to have */
+
+pad_mutex_t ph_mutex[PH_TABLE_SIZE];
+pad_mutex_t pse_mutex[PSE_TABLE_SIZE];
+kmutex_t pio_mutex[PIO_TABLE_SIZE];
+
+#define PAGE_SE_MUTEX(pp) \
+ &pse_mutex[((((uintptr_t)(pp) >> PSE_SHIFT) ^ \
+ ((uintptr_t)(pp) >> (PSE_SHIFT << 1))) & \
+ (PSE_TABLE_SIZE - 1))].pad_mutex
+
+#define PAGE_IO_MUTEX(pp) \
+ &pio_mutex[(((uintptr_t)pp) >> PIO_SHIFT) & (PIO_TABLE_SIZE - 1)]
+
+#define PSZC_MTX_TABLE_SIZE 128
+#define PSZC_MTX_TABLE_SHIFT 7
+
+static pad_mutex_t pszc_mutex[PSZC_MTX_TABLE_SIZE];
+
+#define PAGE_SZC_MUTEX(_pp) \
+ &pszc_mutex[((((uintptr_t)(_pp) >> PSZC_MTX_TABLE_SHIFT) ^ \
+ ((uintptr_t)(_pp) >> (PSZC_MTX_TABLE_SHIFT << 1)) ^ \
+ ((uintptr_t)(_pp) >> (3 * PSZC_MTX_TABLE_SHIFT))) & \
+ (PSZC_MTX_TABLE_SIZE - 1))].pad_mutex
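+
+/*
+ * Illustrative example (editorial sketch, not part of the original source):
+ * a user-level model of the pointer hash used by PAGE_SE_MUTEX above.  The
+ * page_t addresses below are made up; the point is that nearby page
+ * structures tend to spread across different mutexes.
+ */
+#if 0	/* editorial sketch only */
+#include <stdio.h>
+#include <stdint.h>
+
+#define	M_PSE_SHIFT		7
+#define	M_PSE_TABLE_SIZE	128
+
+static unsigned int
+pse_index(uintptr_t pp)
+{
+	return ((unsigned int)(((pp >> M_PSE_SHIFT) ^
+	    (pp >> (M_PSE_SHIFT << 1))) & (M_PSE_TABLE_SIZE - 1)));
+}
+
+int
+main(void)
+{
+	/* two hypothetical page_t addresses 0x80 bytes apart */
+	printf("%u %u\n", pse_index(0x30001200), pse_index(0x30001280));
+	return (0);
+}
+#endif	/* editorial sketch only */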
+
+/*
+ * The vph_mutex[] array holds the mutexes to protect the vnode chains,
+ * (i.e., the list of pages anchored by v_pages and connected via p_vpprev
+ * and p_vpnext).
+ *
+ * The page_vnode_mutex(vp) function returns the address of the appropriate
+ * mutex from this array given a pointer to a vnode. It is complicated
+ * by the fact that the kernel's vnode and the swapfs vnode are referenced
+ * frequently enough to warrant their own mutexes.
+ *
+ * The VP_HASH_FUNC returns the index into the vph_mutex array given
+ * an address of a vnode.
+ */
+
+/*
+ * XX64 VPH_TABLE_SIZE and VP_HASH_FUNC might break in 64 bit world.
+ * Need to review again.
+ */
+#define VPH_TABLE_SIZE (2 << VP_SHIFT)
+
+#define VP_HASH_FUNC(vp) \
+ ((((uintptr_t)(vp) >> 6) + \
+ ((uintptr_t)(vp) >> 8) + \
+ ((uintptr_t)(vp) >> 10) + \
+ ((uintptr_t)(vp) >> 12)) \
+ & (VPH_TABLE_SIZE - 1))
+
+extern struct vnode kvp;
+
+kmutex_t vph_mutex[VPH_TABLE_SIZE + 2];
+
+/*
+ * Initialize the locks used by the Virtual Memory Management system.
+ */
+void
+page_lock_init()
+{
+}
+
+/*
+ * At present we only use page ownership to aid debugging, so it's
+ * OK if the owner field isn't exact. In the 32-bit world two thread ids
+ * can map to the same owner because we just 'or' in 0x80000000 and
+ * then clear the second highest bit, so that (for example) 0x2faced00
+ * and 0xafaced00 both map to 0xafaced00.
+ * In the 64-bit world, p_selock may not be large enough to hold a full
+ * thread pointer. If we ever need precise ownership (e.g. if we implement
+ * priority inheritance for page locks) then p_selock should become a
+ * uintptr_t and SE_WRITER should be -((uintptr_t)curthread >> 2).
+ */
+#define SE_WRITER (((selock_t)(ulong_t)curthread | INT_MIN) & ~SE_EWANTED)
+#define SE_READER 1
+
+/*
+ * A page that is deleted must be marked as such using the
+ * page_lock_delete() function. The page must be exclusively locked.
+ * The SE_DELETED marker is put in p_selock when this function is called.
+ * SE_DELETED must be distinct from any SE_WRITER value.
+ */
+#define SE_DELETED (1 | INT_MIN)
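+
+/*
+ * Illustrative example (editorial sketch, not part of the original source):
+ * a user-level model of the 32-bit owner encoding described above.
+ * SE_EWANTED is assumed here to be the second-highest bit (its real value
+ * is defined in page.h); 0x80000000 stands in for INT_MIN.
+ */
+#if 0	/* editorial sketch only */
+#include <stdio.h>
+
+#define	M_SE_EWANTED	0x40000000u	/* assumed SE_EWANTED bit */
+
+static unsigned int
+se_writer(unsigned int t)	/* t models a 32-bit curthread value */
+{
+	return ((t | 0x80000000u) & ~M_SE_EWANTED);
+}
+
+int
+main(void)
+{
+	/* both example thread ids from the comment map to 0xafaced00 */
+	printf("0x%x 0x%x\n", se_writer(0x2faced00), se_writer(0xafaced00));
+	return (0);
+}
+#endif	/* editorial sketch only */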
+
+#ifdef VM_STATS
+uint_t vph_kvp_count;
+uint_t vph_swapfsvp_count;
+uint_t vph_other;
+#endif /* VM_STATS */
+
+#ifdef VM_STATS
+uint_t page_lock_count;
+uint_t page_lock_miss;
+uint_t page_lock_miss_lock;
+uint_t page_lock_reclaim;
+uint_t page_lock_bad_reclaim;
+uint_t page_lock_same_page;
+uint_t page_lock_upgrade;
+uint_t page_lock_upgrade_failed;
+uint_t page_lock_deleted;
+
+uint_t page_trylock_locked;
+uint_t page_trylock_missed;
+
+uint_t page_try_reclaim_upgrade;
+#endif /* VM_STATS */
+
+
+/*
+ * Acquire the "shared/exclusive" lock on a page.
+ *
+ * Returns 1 on success and locks the page appropriately.
+ * 0 on failure and does not lock the page.
+ *
+ * If `lock' is non-NULL, it will be dropped and reacquired in the
+ * failure case. This routine can block, and if it does
+ * it will always return a failure since the page identity [vp, off]
+ * or state may have changed.
+ */
+
+int
+page_lock(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim)
+{
+ return (page_lock_es(pp, se, lock, reclaim, 0));
+}
+
+/*
+ * With the addition of reader-writer lock semantics to page_lock_es,
+ * callers wanting an exclusive (writer) lock may prevent shared-lock
+ * (reader) starvation by setting the es parameter to SE_EXCL_WANTED.
+ * In this case, when an exclusive lock cannot be acquired, p_selock's
+ * SE_EWANTED bit is set.
+ * This bit, along with the se and es parameters, are used to decide
+ * if the requested lock should be granted:
+ *
+ * Lock wanted   SE_EXCL_WANTED   p_selock/SE_EWANTED   Action
+ * -----------   --------------   -------------------   ------
+ * SE_EXCL       no               dont-care/1           deny lock
+ * SE_EXCL       any (see note)   unlocked/any          grant lock, clear SE_EWANTED
+ * SE_EXCL       yes              any lock/any          deny, set SE_EWANTED
+ * SE_EXCL       no               any lock/any          deny
+ * SE_SHARED     not applicable   shared/0              grant
+ * SE_SHARED     not applicable   unlocked/0            grant
+ * SE_SHARED     not applicable   shared/1              deny
+ * SE_SHARED     not applicable   unlocked/1            deny
+ * SE_SHARED     not applicable   excl/any              deny
+ *
+ * Note: the code grants an exclusive lock to the caller and clears
+ * SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED
+ * bit's value. This was deemed acceptable as we are not concerned about
+ * exclusive-lock starvation. If this ever becomes an issue, a priority or
+ * fifo mechanism should also be implemented.
+ */
+int
+page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
+{
+ int retval;
+ kmutex_t *pse = PAGE_SE_MUTEX(pp);
+ int upgraded;
+ int reclaim_it;
+
+ ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);
+
+ VM_STAT_ADD(page_lock_count);
+
+ upgraded = 0;
+ reclaim_it = 0;
+
+ mutex_enter(pse);
+
+ /*
+ * Current uses of 'es':
+ * es == 1 page_lookup_create will attempt page relocation
+ * es == SE_EXCL_WANTED caller wants SE_EWANTED set (eg. delete
+ * memory thread); this prevents reader-starvation of waiting
+ * writer thread(s).
+ */
+
+
+ ASSERT(((es & SE_EXCL_WANTED) == 0) ||
+ ((es == SE_EXCL_WANTED) && (se == SE_EXCL)));
+
+ if (se == SE_SHARED && es == 1 && pp->p_selock == 0) {
+ se = SE_EXCL;
+ }
+
+ if ((reclaim == P_RECLAIM) && (PP_ISFREE(pp))) {
+
+ reclaim_it = 1;
+ if (se == SE_SHARED) {
+ /*
+ * This is an interesting situation.
+ *
+ * Remember that p_free can only change if
+ * p_selock < 0.
+ * p_free does not depend on our holding `pse'.
+ * And, since we hold `pse', p_selock can not change.
+ * So, if p_free changes on us, the page is already
+ * exclusively held, and we would fail to get p_selock
+ * regardless.
+ *
+ * We want to avoid getting the share
+ * lock on a free page that needs to be reclaimed.
+ * It is possible that some other thread has the share
+ * lock and has left the free page on the cache list.
+ * pvn_vplist_dirty() does this for brief periods.
+ * If the se_share is currently SE_EXCL, we will fail
+ * to acquire p_selock anyway. Blocking is the
+ * right thing to do.
+ * If we need to reclaim this page, we must get
+ * exclusive access to it, so force the upgrade now.
+ * Again, if the page is not free we will fail to
+ * acquire p_selock and block.
+ */
+ upgraded = 1;
+ se = SE_EXCL;
+ VM_STAT_ADD(page_lock_upgrade);
+ }
+ }
+
+ if (se == SE_EXCL) {
+ if ((es != SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) {
+ /*
+ * if the caller wants a writer lock (but did not
+ * specify exclusive access), and there is a pending
+ * writer that wants exclusive access, return failure
+ */
+ retval = 0;
+ } else if ((pp->p_selock & ~SE_EWANTED) == 0) {
+ /* no reader/writer lock held */
+ THREAD_KPRI_REQUEST();
+ /* this clears our setting of the SE_EWANTED bit */
+ pp->p_selock = SE_WRITER;
+ retval = 1;
+ } else {
+ /* page is locked */
+ if (es == SE_EXCL_WANTED) {
+ /* set the SE_EWANTED bit */
+ pp->p_selock |= SE_EWANTED;
+ }
+ retval = 0;
+ }
+ } else {
+ retval = 0;
+ if (pp->p_selock >= 0) {
+ /* readers are not allowed when excl wanted */
+ if (!(pp->p_selock & SE_EWANTED)) {
+ pp->p_selock += SE_READER;
+ retval = 1;
+ }
+ }
+ }
+
+ if (retval == 0) {
+ if ((pp->p_selock & ~SE_EWANTED) == SE_DELETED) {
+ VM_STAT_ADD(page_lock_deleted);
+ mutex_exit(pse);
+ return (retval);
+ }
+
+#ifdef VM_STATS
+ VM_STAT_ADD(page_lock_miss);
+ if (upgraded) {
+ VM_STAT_ADD(page_lock_upgrade_failed);
+ }
+#endif
+ if (lock) {
+ VM_STAT_ADD(page_lock_miss_lock);
+ mutex_exit(lock);
+ }
+
+ /*
+ * Now, wait for the page to be unlocked and
+ * release the lock protecting p_cv and p_selock.
+ */
+ cv_wait(&pp->p_cv, pse);
+ mutex_exit(pse);
+
+ /*
+ * The page identity may have changed while we were
+ * blocked. If we are willing to depend on "pp"
+ * still pointing to a valid page structure (i.e.,
+ * assuming page structures are not dynamically allocated
+ * or freed), we could try to lock the page if its
+ * identity hasn't changed.
+ *
+ * This needs to be measured; since we come back from
+ * cv_wait holding pse (the expensive part of this
+ * operation), we might as well try the cheap part.
+ * Though we would also have to confirm that dropping
+ * `lock' did not cause any grief to the callers.
+ */
+ if (lock) {
+ mutex_enter(lock);
+ }
+ } else {
+ /*
+ * We have the page lock.
+ * If we needed to reclaim the page, and the page
+ * needed reclaiming (ie, it was free), then we
+ * have the page exclusively locked. We may need
+ * to downgrade the page.
+ */
+ ASSERT((upgraded) ?
+ ((PP_ISFREE(pp)) && PAGE_EXCL(pp)) : 1);
+ mutex_exit(pse);
+
+ /*
+ * We now hold this page's lock, either shared or
+ * exclusive. This will prevent its identity from changing.
+ * The page, however, may or may not be free. If the caller
+ * requested, and it is free, go reclaim it from the
+ * free list. If the page can't be reclaimed, return failure
+ * so that the caller can start all over again.
+ *
+ * NOTE: page_reclaim() releases the page lock (p_selock)
+ * if it can't be reclaimed.
+ */
+ if (reclaim_it) {
+ if (!page_reclaim(pp, lock)) {
+ VM_STAT_ADD(page_lock_bad_reclaim);
+ retval = 0;
+ } else {
+ VM_STAT_ADD(page_lock_reclaim);
+ if (upgraded) {
+ page_downgrade(pp);
+ }
+ }
+ }
+ }
+ return (retval);
+}
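+
+/*
+ * Illustrative example (editorial sketch, not part of the original source):
+ * a small user-level model of the grant/deny table above.  SE_EWANTED is
+ * assumed here to be the second-highest p_selock bit (its real value lives
+ * in page.h); the model ignores SE_DELETED, reclaim and the cv_wait path.
+ */
+#if 0	/* editorial sketch only */
+#include <stdio.h>
+#include <stdint.h>
+
+#define	M_SE_EWANTED	0x40000000	/* assumed SE_EWANTED bit */
+#define	M_SE_READER	1
+#define	M_SE_WRITER	(INT32_MIN | M_SE_READER)	/* writer held: negative */
+
+/* Return 1 if page_lock_es() would grant the request, 0 if it would deny. */
+static int
+would_grant(int32_t selock, int want_excl, int excl_wanted)
+{
+	if (want_excl) {
+		if (!excl_wanted && (selock & M_SE_EWANTED))
+			return (0);	/* defer to the pending writer */
+		return ((selock & ~M_SE_EWANTED) == 0);	/* only if unlocked */
+	}
+	/* shared request: no writer held and no writer waiting */
+	return (selock >= 0 && !(selock & M_SE_EWANTED));
+}
+
+int
+main(void)
+{
+	printf("%d\n", would_grant(0, 1, 0));			/* 1: unlocked */
+	printf("%d\n", would_grant(M_SE_EWANTED, 1, 0));	/* 0: writer pending */
+	printf("%d\n", would_grant(M_SE_EWANTED, 1, 1));	/* 1: caller is that writer */
+	printf("%d\n", would_grant(2 * M_SE_READER, 0, 0));	/* 1: one more reader */
+	printf("%d\n", would_grant(M_SE_WRITER, 0, 0));		/* 0: writer held */
+	return (0);
+}
+#endif	/* editorial sketch only */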
+
+/*
+ * Clear the SE_EWANTED bit from p_selock. This function allows
+ * callers of page_lock_es and page_try_reclaim_lock to clear
+ * their setting of this bit if they decide they no longer wish
+ * to gain exclusive access to the page. Currently only
+ * delete_memory_thread uses this when the delete memory
+ * operation is cancelled.
+ */
+void
+page_lock_clr_exclwanted(page_t *pp)
+{
+ kmutex_t *pse = PAGE_SE_MUTEX(pp);
+
+ mutex_enter(pse);
+ pp->p_selock &= ~SE_EWANTED;
+ if (CV_HAS_WAITERS(&pp->p_cv))
+ cv_broadcast(&pp->p_cv);
+ mutex_exit(pse);
+}
+
+/*
+ * Read the comments inside of page_lock_es() carefully.
+ *
+ * SE_EXCL callers specifying es == SE_EXCL_WANTED will cause the
+ * SE_EWANTED bit of p_selock to be set when the lock cannot be obtained.
+ * This is used by threads subject to reader-starvation (eg. memory delete).
+ *
+ * When a thread using SE_EXCL_WANTED does not obtain the SE_EXCL lock,
+ * it is expected that it will retry at a later time. Threads that will
+ * not retry the lock *must* call page_lock_clr_exclwanted to clear the
+ * SE_EWANTED bit. (When a thread using SE_EXCL_WANTED obtains the lock,
+ * the bit is cleared.)
+ */
+int
+page_try_reclaim_lock(page_t *pp, se_t se, int es)
+{
+ kmutex_t *pse = PAGE_SE_MUTEX(pp);
+ selock_t old;
+
+ mutex_enter(pse);
+
+ old = pp->p_selock;
+
+ ASSERT(((es & SE_EXCL_WANTED) == 0) ||
+ ((es == SE_EXCL_WANTED) && (se == SE_EXCL)));
+
+ if (se == SE_SHARED && es == 1 && old == 0) {
+ se = SE_EXCL;
+ }
+
+ if (se == SE_SHARED) {
+ if (!PP_ISFREE(pp)) {
+ if (old >= 0) {
+ /* readers are not allowed when excl wanted */
+ if (!(old & SE_EWANTED)) {
+ pp->p_selock = old + SE_READER;
+ mutex_exit(pse);
+ return (1);
+ }
+ }
+ mutex_exit(pse);
+ return (0);
+ }
+ /*
+ * The page is free, so we really want SE_EXCL (below)
+ */
+ VM_STAT_ADD(page_try_reclaim_upgrade);
+ }
+
+ /*
+ * The caller wants a writer lock. We try for it only if
+ * SE_EWANTED is not set, or if the caller specified
+ * SE_EXCL_WANTED.
+ */
+ if (!(old & SE_EWANTED) || (es == SE_EXCL_WANTED)) {
+ if ((old & ~SE_EWANTED) == 0) {
+ /* no reader/writer lock held */
+ THREAD_KPRI_REQUEST();
+ /* this clears out our setting of the SE_EWANTED bit */
+ pp->p_selock = SE_WRITER;
+ mutex_exit(pse);
+ return (1);
+ }
+ }
+ if (es == SE_EXCL_WANTED) {
+ /* page is locked, set the SE_EWANTED bit */
+ pp->p_selock |= SE_EWANTED;
+ }
+ mutex_exit(pse);
+ return (0);
+}
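+
+/*
+ * Illustrative example (editorial sketch, not part of the original source).
+ * A thread using SE_EXCL_WANTED, such as a memory-delete thread, is expected
+ * either to keep retrying or to clear the bit when it gives up; "cancelled"
+ * below is a hypothetical condition:
+ *
+ *	while (!page_try_reclaim_lock(pp, SE_EXCL, SE_EXCL_WANTED)) {
+ *		if (cancelled) {
+ *			page_lock_clr_exclwanted(pp);
+ *			return;
+ *		}
+ *		delay(1);
+ *	}
+ *	(pp is now held SE_EXCL and SE_EWANTED has been cleared)
+ */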
+
+/*
+ * Acquire a page's "shared/exclusive" lock, but never block.
+ * Returns 1 on success, 0 on failure.
+ */
+int
+page_trylock(page_t *pp, se_t se)
+{
+ kmutex_t *pse = PAGE_SE_MUTEX(pp);
+
+ mutex_enter(pse);
+ if (pp->p_selock & SE_EWANTED) {
+ /* fail if a thread wants exclusive access */
+ mutex_exit(pse);
+ return (0);
+ }
+
+ if (se == SE_EXCL) {
+ if (pp->p_selock == 0) {
+ THREAD_KPRI_REQUEST();
+ pp->p_selock = SE_WRITER;
+ mutex_exit(pse);
+ return (1);
+ }
+ } else {
+ if (pp->p_selock >= 0) {
+ pp->p_selock += SE_READER;
+ mutex_exit(pse);
+ return (1);
+ }
+ }
+ mutex_exit(pse);
+ return (0);
+}
+
+/*
+ * Release the page's "shared/exclusive" lock and wake up anyone
+ * who might be waiting for it.
+ */
+void
+page_unlock(page_t *pp)
+{
+ kmutex_t *pse = PAGE_SE_MUTEX(pp);
+ selock_t old;
+
+ mutex_enter(pse);
+ old = pp->p_selock;
+ if ((old & ~SE_EWANTED) == SE_READER) {
+ pp->p_selock = old & ~SE_READER;
+ if (CV_HAS_WAITERS(&pp->p_cv))
+ cv_broadcast(&pp->p_cv);
+ } else if ((old & ~SE_EWANTED) == SE_DELETED) {
+ panic("page_unlock: page %p is deleted", pp);
+ } else if (old < 0) {
+ THREAD_KPRI_RELEASE();
+ pp->p_selock &= SE_EWANTED;
+ if (CV_HAS_WAITERS(&pp->p_cv))
+ cv_broadcast(&pp->p_cv);
+ } else if ((old & ~SE_EWANTED) > SE_READER) {
+ pp->p_selock = old - SE_READER;
+ } else {
+ panic("page_unlock: page %p is not locked", pp);
+ }
+ mutex_exit(pse);
+}
+
+/*
+ * Try to upgrade the lock on the page from a "shared" to an
+ * "exclusive" lock. Since this upgrade operation is done while
+ * holding the mutex protecting this page, no one else can acquire this page's
+ * lock and change the page. Thus, it is safe to drop the "shared"
+ * lock and attempt to acquire the "exclusive" lock.
+ *
+ * Returns 1 on success, 0 on failure.
+ */
+int
+page_tryupgrade(page_t *pp)
+{
+ kmutex_t *pse = PAGE_SE_MUTEX(pp);
+
+ mutex_enter(pse);
+ if (!(pp->p_selock & SE_EWANTED)) {
+ /* no threads want exclusive access, try upgrade */
+ if (pp->p_selock == SE_READER) {
+ THREAD_KPRI_REQUEST();
+ /* convert to exclusive lock */
+ pp->p_selock = SE_WRITER;
+ mutex_exit(pse);
+ return (1);
+ }
+ }
+ mutex_exit(pse);
+ return (0);
+}
+
+/*
+ * Downgrade the "exclusive" lock on the page to a "shared" lock
+ * while holding the mutex protecting this page's p_selock field.
+ */
+void
+page_downgrade(page_t *pp)
+{
+ kmutex_t *pse = PAGE_SE_MUTEX(pp);
+ int excl_waiting;
+
+ ASSERT((pp->p_selock & ~SE_EWANTED) != SE_DELETED);
+ ASSERT(PAGE_EXCL(pp));
+
+ mutex_enter(pse);
+ excl_waiting = pp->p_selock & SE_EWANTED;
+ THREAD_KPRI_RELEASE();
+ pp->p_selock = SE_READER | excl_waiting;
+ if (CV_HAS_WAITERS(&pp->p_cv))
+ cv_broadcast(&pp->p_cv);
+ mutex_exit(pse);
+}
+
+void
+page_lock_delete(page_t *pp)
+{
+ kmutex_t *pse = PAGE_SE_MUTEX(pp);
+
+ ASSERT(PAGE_EXCL(pp));
+ ASSERT(pp->p_vnode == NULL);
+ ASSERT(pp->p_offset == (u_offset_t)-1);
+ ASSERT(!PP_ISFREE(pp));
+
+ mutex_enter(pse);
+ THREAD_KPRI_RELEASE();
+ pp->p_selock = SE_DELETED;
+ if (CV_HAS_WAITERS(&pp->p_cv))
+ cv_broadcast(&pp->p_cv);
+ mutex_exit(pse);
+}
+
+/*
+ * Implement the io lock for pages
+ */
+void
+page_iolock_init(page_t *pp)
+{
+ pp->p_iolock_state = 0;
+ cv_init(&pp->p_io_cv, NULL, CV_DEFAULT, NULL);
+}
+
+/*
+ * Acquire the i/o lock on a page.
+ */
+void
+page_io_lock(page_t *pp)
+{
+ kmutex_t *pio;
+
+ pio = PAGE_IO_MUTEX(pp);
+ mutex_enter(pio);
+ while (pp->p_iolock_state & PAGE_IO_INUSE) {
+ cv_wait(&(pp->p_io_cv), pio);
+ }
+ pp->p_iolock_state |= PAGE_IO_INUSE;
+ mutex_exit(pio);
+}
+
+/*
+ * Release the i/o lock on a page.
+ */
+void
+page_io_unlock(page_t *pp)
+{
+ kmutex_t *pio;
+
+ pio = PAGE_IO_MUTEX(pp);
+ mutex_enter(pio);
+ cv_signal(&pp->p_io_cv);
+ pp->p_iolock_state &= ~PAGE_IO_INUSE;
+ mutex_exit(pio);
+}
+
+/*
+ * Try to acquire the i/o lock on a page without blocking.
+ * Returns 1 on success, 0 on failure.
+ */
+int
+page_io_trylock(page_t *pp)
+{
+ kmutex_t *pio;
+
+ if (pp->p_iolock_state & PAGE_IO_INUSE)
+ return (0);
+
+ pio = PAGE_IO_MUTEX(pp);
+ mutex_enter(pio);
+
+ if (pp->p_iolock_state & PAGE_IO_INUSE) {
+ mutex_exit(pio);
+ return (0);
+ }
+ pp->p_iolock_state |= PAGE_IO_INUSE;
+ mutex_exit(pio);
+
+ return (1);
+}
+
+/*
+ * Assert that the i/o lock on a page is held.
+ * Returns 1 on success, 0 on failure.
+ */
+int
+page_iolock_assert(page_t *pp)
+{
+ return (pp->p_iolock_state & PAGE_IO_INUSE);
+}
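+
+/*
+ * Illustrative example (editorial sketch, not part of the original source),
+ * using only the routines defined above:
+ *
+ *	page_io_lock(pp);		(blocks until the i/o lock is free)
+ *	ASSERT(page_iolock_assert(pp));
+ *	... start the i/o on pp and wait for it to complete ...
+ *	page_io_unlock(pp);
+ *
+ * A caller that must not block would use page_io_trylock(pp) instead and
+ * fall back to some other strategy when it returns 0.
+ */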
+
+/*
+ * Wrapper exported to kernel routines that are built
+ * platform-independent (the macro is platform-dependent;
+ * the size of vph_mutex[] is based on NCPU).
+ *
+ * Note that you can do stress testing on this by setting the
+ * variable page_vnode_mutex_stress to something other than
+ * zero in a DEBUG kernel in a debugger after loading the kernel.
+ * Setting it after the kernel is running may not work correctly.
+ */
+#ifdef DEBUG
+static int page_vnode_mutex_stress = 0;
+#endif
+
+kmutex_t *
+page_vnode_mutex(vnode_t *vp)
+{
+ if (vp == &kvp)
+ return (&vph_mutex[VPH_TABLE_SIZE + 0]);
+#ifdef DEBUG
+ if (page_vnode_mutex_stress != 0)
+ return (&vph_mutex[0]);
+#endif
+
+ return (&vph_mutex[VP_HASH_FUNC(vp)]);
+}
+
+kmutex_t *
+page_se_mutex(page_t *pp)
+{
+ return (PAGE_SE_MUTEX(pp));
+}
+
+#ifdef VM_STATS
+uint_t pszclck_stat[4];
+#endif
+/*
+ * Find, take and return a mutex held by hat_page_demote().
+ * Called by page_demote_vp_pages() before hat_page_demote() call and by
+ * routines that want to block hat_page_demote() but can't do it
+ * via locking all constituent pages.
+ *
+ * Return NULL if p_szc is 0.
+ *
+ * It should only be used for pages that can be demoted by hat_page_demote()
+ * i.e. non swapfs file system pages. The logic here is lifted from
+ * sfmmu_mlspl_enter() except there's no need to worry about p_szc increase
+ * since the page is locked and not free.
+ *
+ * Hash of the root page is used to find the lock.
+ * To find the root in the presence of hat_page_demote() changing the location
+ * of the root, this routine relies on the fact that hat_page_demote() changes
+ * the root last.
+ *
+ * If NULL is returned, pp's p_szc is guaranteed to be 0. If non-NULL is
+ * returned, pp's p_szc may be any value.
+ */
+kmutex_t *
+page_szc_lock(page_t *pp)
+{
+ kmutex_t *mtx;
+ page_t *rootpp;
+ uint_t szc;
+ uint_t rszc;
+ uint_t pszc = pp->p_szc;
+
+ ASSERT(pp != NULL);
+ ASSERT(PAGE_LOCKED(pp));
+ ASSERT(!PP_ISFREE(pp));
+ ASSERT(pp->p_vnode != NULL);
+ ASSERT(!IS_SWAPFSVP(pp->p_vnode));
+ ASSERT(pp->p_vnode != &kvp);
+
+again:
+ if (pszc == 0) {
+ VM_STAT_ADD(pszclck_stat[0]);
+ return (NULL);
+ }
+
+ /* The lock lives in the root page */
+
+ rootpp = PP_GROUPLEADER(pp, pszc);
+ mtx = PAGE_SZC_MUTEX(rootpp);
+ mutex_enter(mtx);
+
+ /*
+ * Since p_szc can only decrease, if pp == rootpp then rootpp
+ * will always be the same, i.e. we have the right root
+ * regardless of rootpp->p_szc.
+ * If the location of pp's root didn't change after we took
+ * the lock, we have the right root; return the mutex hashed off it.
+ */
+ if (pp == rootpp || (rszc = rootpp->p_szc) == pszc) {
+ VM_STAT_ADD(pszclck_stat[1]);
+ return (mtx);
+ }
+
+ /*
+ * root location changed because page got demoted.
+ * locate the new root.
+ */
+ if (rszc < pszc) {
+ szc = pp->p_szc;
+ ASSERT(szc < pszc);
+ mutex_exit(mtx);
+ pszc = szc;
+ VM_STAT_ADD(pszclck_stat[2]);
+ goto again;
+ }
+
+ VM_STAT_ADD(pszclck_stat[3]);
+ /*
+ * current hat_page_demote not done yet.
+ * wait for it to finish.
+ */
+ mutex_exit(mtx);
+ rootpp = PP_GROUPLEADER(rootpp, rszc);
+ mtx = PAGE_SZC_MUTEX(rootpp);
+ mutex_enter(mtx);
+ mutex_exit(mtx);
+ ASSERT(rootpp->p_szc < rszc);
+ goto again;
+}
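+
+/*
+ * Illustrative example (editorial sketch, not part of the original source).
+ * A caller that wants to keep p_szc stable without locking every constituent
+ * page might do:
+ *
+ *	kmutex_t *mtx = page_szc_lock(pp);
+ *	if (mtx != NULL) {
+ *		... hat_page_demote() is held off for this large page ...
+ *		mutex_exit(mtx);
+ *	} else {
+ *		... pp->p_szc is guaranteed to be 0 ...
+ *	}
+ */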
+
+int
+page_szc_lock_assert(page_t *pp)
+{
+ page_t *rootpp = PP_PAGEROOT(pp);
+ kmutex_t *mtx = PAGE_SZC_MUTEX(rootpp);
+
+ return (MUTEX_HELD(mtx));
+}
diff --git a/usr/src/uts/common/vm/pvn.h b/usr/src/uts/common/vm/pvn.h
new file mode 100644
index 0000000000..0467589ae6
--- /dev/null
+++ b/usr/src/uts/common/vm/pvn.h
@@ -0,0 +1,117 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2002 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+#ifndef _VM_PVN_H
+#define _VM_PVN_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/buf.h>
+#include <vm/seg.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+/*
+ * VM - paged vnode.
+ *
+ * The VM system manages memory as a cache of paged vnodes.
+ * This file describes the interfaces to common subroutines
+ * used to help implement the VM/file system routines.
+ */
+
+struct page *pvn_read_kluster(struct vnode *vp, u_offset_t off,
+ struct seg *seg, caddr_t addr, u_offset_t *offp,
+ size_t *lenp, u_offset_t vp_off, size_t vp_len,
+ int isra);
+struct page *pvn_write_kluster(struct vnode *vp, struct page *pp,
+ u_offset_t *offp, size_t *lenp, u_offset_t vp_off,
+ size_t vp_len, int flags);
+void pvn_read_done(struct page *plist, int flags);
+void pvn_write_done(struct page *plist, int flags);
+void pvn_io_done(struct page *plist);
+int pvn_vplist_dirty(struct vnode *vp, u_offset_t off,
+ int (*putapage)(vnode_t *, struct page *, u_offset_t *,
+ size_t *, int, cred_t *),
+ int flags, struct cred *cred);
+int pvn_getdirty(struct page *pp, int flags);
+void pvn_vpzero(struct vnode *vp, u_offset_t vplen, size_t zbytes);
+int pvn_getpages(
+ int (*getpage)(vnode_t *, u_offset_t, size_t, uint_t *,
+ struct page *[], size_t, struct seg *,
+ caddr_t, enum seg_rw, cred_t *),
+ struct vnode *vp, u_offset_t off, size_t len,
+ uint_t *protp, struct page **pl, size_t plsz,
+ struct seg *seg, caddr_t addr, enum seg_rw rw,
+ struct cred *cred);
+void pvn_plist_init(struct page *pp, struct page **pl, size_t plsz,
+ u_offset_t off, size_t io_len, enum seg_rw rw);
+void pvn_init(void);
+
+/*
+ * When requesting pages from the getpage routines, pvn_getpages will
+ * allocate space to return PVN_GETPAGE_NUM pages which map PVN_GETPAGE_SZ
+ * worth of bytes. These numbers are chosen to be the smaller of the two
+ * maximums, one given in bytes and one in pages.
+ */
+#define PVN_MAX_GETPAGE_SZ 0x10000 /* getpage size limit */
+#define PVN_MAX_GETPAGE_NUM 0x8 /* getpage page limit */
+
+#if PVN_MAX_GETPAGE_SZ > PVN_MAX_GETPAGE_NUM * PAGESIZE
+
+#define PVN_GETPAGE_SZ ptob(PVN_MAX_GETPAGE_NUM)
+#define PVN_GETPAGE_NUM PVN_MAX_GETPAGE_NUM
+
+#else
+
+#define PVN_GETPAGE_SZ PVN_MAX_GETPAGE_SZ
+#define PVN_GETPAGE_NUM btop(PVN_MAX_GETPAGE_SZ)
+
+#endif
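+
+/*
+ * Worked example (editorial note, not part of the original file): with an
+ * assumed 8K PAGESIZE, PVN_MAX_GETPAGE_NUM * PAGESIZE is 8 * 8K = 0x10000,
+ * the #if above is false, and PVN_GETPAGE_SZ/NUM become 0x10000 and 8.
+ * With an assumed 4K PAGESIZE the #if is true and the page limit binds
+ * instead: PVN_GETPAGE_NUM is 8 and PVN_GETPAGE_SZ is ptob(8) = 0x8000.
+ * Either way neither maximum is exceeded.
+ */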
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _VM_PVN_H */
diff --git a/usr/src/uts/common/vm/rm.h b/usr/src/uts/common/vm/rm.h
new file mode 100644
index 0000000000..9789283993
--- /dev/null
+++ b/usr/src/uts/common/vm/rm.h
@@ -0,0 +1,61 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2001 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1988 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+#ifndef _VM_RM_H
+#define _VM_RM_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+extern size_t rm_asrss(struct as *);
+extern size_t rm_assize(struct as *);
+extern ushort_t rm_pctmemory(struct as *);
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _VM_RM_H */
diff --git a/usr/src/uts/common/vm/seg.h b/usr/src/uts/common/vm/seg.h
new file mode 100644
index 0000000000..2ada345960
--- /dev/null
+++ b/usr/src/uts/common/vm/seg.h
@@ -0,0 +1,252 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+#ifndef _VM_SEG_H
+#define _VM_SEG_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/vnode.h>
+#include <sys/avl.h>
+#include <vm/seg_enum.h>
+#include <vm/faultcode.h>
+#include <vm/hat.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * VM - Segments.
+ */
+
+/*
+ * kstat statistics for segment advise
+ */
+typedef struct {
+ kstat_named_t MADV_FREE_hit;
+ kstat_named_t MADV_FREE_miss;
+} segadvstat_t;
+
+/*
+ * memory object ids
+ */
+typedef struct memid { u_longlong_t val[2]; } memid_t;
+
+/*
+ * An address space contains a set of segments, managed by drivers.
+ * Drivers support mapped devices, sharing, copy-on-write, etc.
+ *
+ * The seg structure contains a lock to prevent races, the base virtual
+ * address and size of the segment, a back pointer to the containing
+ * address space, pointers to maintain an AVL tree of segments in the
+ * same address space, and procedure and data hooks for the driver.
+ * The AVL tree of segments for the address space is sorted by
+ * ascending base addresses and overlapping segments are not allowed.
+ *
+ * After a segment is created, faults may occur on pages of the segment.
+ * When a fault occurs, the fault handling code must get the desired
+ * object and set up the hardware translation to the object. For some
+ * objects, the fault handling code also implements copy-on-write.
+ *
+ * When the hat wants to unload a translation, it can call the unload
+ * routine which is responsible for processing reference and modify bits.
+ *
+ * Each segment is protected by its containing address space lock. To
+ * access any field in the segment structure, the "as" must be locked.
+ * If a segment field is to be modified, the address space lock must be
+ * write locked.
+ */
+
+struct seg {
+ caddr_t s_base; /* base virtual address */
+ size_t s_size; /* size in bytes */
+ uint_t s_szc; /* max page size code */
+ uint_t s_flags; /* flags for segment, see below */
+ struct as *s_as; /* containing address space */
+ avl_node_t s_tree; /* AVL tree links to segs in this as */
+ struct seg_ops *s_ops; /* ops vector: see below */
+ void *s_data; /* private data for instance */
+};
+
+#define S_PURGE (0x01) /* seg should be purged in as_gap() */
+
+struct seg_ops {
+ int (*dup)(struct seg *, struct seg *);
+ int (*unmap)(struct seg *, caddr_t, size_t);
+ void (*free)(struct seg *);
+ faultcode_t (*fault)(struct hat *, struct seg *, caddr_t, size_t,
+ enum fault_type, enum seg_rw);
+ faultcode_t (*faulta)(struct seg *, caddr_t);
+ int (*setprot)(struct seg *, caddr_t, size_t, uint_t);
+ int (*checkprot)(struct seg *, caddr_t, size_t, uint_t);
+ int (*kluster)(struct seg *, caddr_t, ssize_t);
+ size_t (*swapout)(struct seg *);
+ int (*sync)(struct seg *, caddr_t, size_t, int, uint_t);
+ size_t (*incore)(struct seg *, caddr_t, size_t, char *);
+ int (*lockop)(struct seg *, caddr_t, size_t, int, int, ulong_t *,
+ size_t);
+ int (*getprot)(struct seg *, caddr_t, size_t, uint_t *);
+ u_offset_t (*getoffset)(struct seg *, caddr_t);
+ int (*gettype)(struct seg *, caddr_t);
+ int (*getvp)(struct seg *, caddr_t, struct vnode **);
+ int (*advise)(struct seg *, caddr_t, size_t, uint_t);
+ void (*dump)(struct seg *);
+ int (*pagelock)(struct seg *, caddr_t, size_t, struct page ***,
+ enum lock_type, enum seg_rw);
+ int (*setpagesize)(struct seg *, caddr_t, size_t, uint_t);
+ int (*getmemid)(struct seg *, caddr_t, memid_t *);
+ struct lgrp_mem_policy_info *(*getpolicy)(struct seg *, caddr_t);
+};
+
+#ifdef _KERNEL
+/*
+ * Generic segment operations
+ */
+extern void seg_init(void);
+extern struct seg *seg_alloc(struct as *as, caddr_t base, size_t size);
+extern int seg_attach(struct as *as, caddr_t base, size_t size,
+ struct seg *seg);
+extern void seg_unmap(struct seg *seg);
+extern void seg_free(struct seg *seg);
+
+/*
+ * functions for pagelock cache support
+ */
+extern void seg_ppurge(struct seg *seg);
+extern void seg_ppurge_seg(int (*callback)());
+extern void seg_pinactive(struct seg *seg, caddr_t addr, size_t len,
+ struct page **pp, enum seg_rw rw, int (*callback)());
+extern int seg_pinsert_check(struct seg *seg, size_t len, uint_t flags);
+extern int seg_pinsert(struct seg *seg, caddr_t addr, size_t len,
+ struct page **pp, enum seg_rw rw, uint_t flags,
+ int (*callback)());
+extern struct page **seg_plookup(struct seg *seg, caddr_t addr,
+ size_t len, enum seg_rw rw);
+extern void seg_pasync_thread(void);
+extern void seg_preap(void);
+
+extern int seg_preapahead;
+extern segadvstat_t segadvstat;
+/*
+ * Flags for pagelock cache support
+ */
+#define SEGP_ASYNC_FLUSH 0x1 /* flushed by async thread */
+#define SEGP_FORCE_WIRED 0x2 /* skip check against seg_pwindow */
+
+/*
+ * Return values for seg_pinsert and seg_pinsert_check functions.
+ */
+#define SEGP_SUCCESS 0 /* seg_pinsert() succeeded */
+#define SEGP_FAIL 1 /* seg_pinsert() failed */
+
+/* Page status bits for segop_incore */
+#define SEG_PAGE_INCORE 0x01 /* VA has a page backing it */
+#define SEG_PAGE_LOCKED 0x02 /* VA has a page that is locked */
+#define SEG_PAGE_HASCOW 0x04 /* VA has a page with a copy-on-write */
+#define SEG_PAGE_SOFTLOCK 0x08 /* VA has a page with softlock held */
+#define SEG_PAGE_VNODEBACKED 0x10 /* Segment is backed by a vnode */
+#define SEG_PAGE_ANON 0x20 /* VA has an anonymous page */
+#define SEG_PAGE_VNODE 0x40 /* VA has a vnode page backing it */
+
+#define SEGOP_DUP(s, n) (*(s)->s_ops->dup)((s), (n))
+#define SEGOP_UNMAP(s, a, l) (*(s)->s_ops->unmap)((s), (a), (l))
+#define SEGOP_FREE(s) (*(s)->s_ops->free)((s))
+#define SEGOP_FAULT(h, s, a, l, t, rw) \
+ (*(s)->s_ops->fault)((h), (s), (a), (l), (t), (rw))
+#define SEGOP_FAULTA(s, a) (*(s)->s_ops->faulta)((s), (a))
+#define SEGOP_SETPROT(s, a, l, p) (*(s)->s_ops->setprot)((s), (a), (l), (p))
+#define SEGOP_CHECKPROT(s, a, l, p) (*(s)->s_ops->checkprot)((s), (a), (l), (p))
+#define SEGOP_KLUSTER(s, a, d) (*(s)->s_ops->kluster)((s), (a), (d))
+#define SEGOP_SWAPOUT(s) (*(s)->s_ops->swapout)((s))
+#define SEGOP_SYNC(s, a, l, atr, f) \
+ (*(s)->s_ops->sync)((s), (a), (l), (atr), (f))
+#define SEGOP_INCORE(s, a, l, v) (*(s)->s_ops->incore)((s), (a), (l), (v))
+#define SEGOP_LOCKOP(s, a, l, atr, op, b, p) \
+ (*(s)->s_ops->lockop)((s), (a), (l), (atr), (op), (b), (p))
+#define SEGOP_GETPROT(s, a, l, p) (*(s)->s_ops->getprot)((s), (a), (l), (p))
+#define SEGOP_GETOFFSET(s, a) (*(s)->s_ops->getoffset)((s), (a))
+#define SEGOP_GETTYPE(s, a) (*(s)->s_ops->gettype)((s), (a))
+#define SEGOP_GETVP(s, a, vpp) (*(s)->s_ops->getvp)((s), (a), (vpp))
+#define SEGOP_ADVISE(s, a, l, b) (*(s)->s_ops->advise)((s), (a), (l), (b))
+#define SEGOP_DUMP(s) (*(s)->s_ops->dump)((s))
+#define SEGOP_PAGELOCK(s, a, l, p, t, rw) \
+ (*(s)->s_ops->pagelock)((s), (a), (l), (p), (t), (rw))
+#define SEGOP_SETPAGESIZE(s, a, l, szc) \
+ (*(s)->s_ops->setpagesize)((s), (a), (l), (szc))
+#define SEGOP_GETMEMID(s, a, mp) (*(s)->s_ops->getmemid)((s), (a), (mp))
+#define SEGOP_GETPOLICY(s, a) (*(s)->s_ops->getpolicy)((s), (a))
+
+#define seg_page(seg, addr) \
+ (((uintptr_t)((addr) - (seg)->s_base)) >> PAGESHIFT)
+
+#define seg_pages(seg) \
+ (((uintptr_t)((seg)->s_size + PAGEOFFSET)) >> PAGESHIFT)
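+
+/*
+ * Worked example (editorial note, not part of the original file), assuming a
+ * 4K PAGESIZE (PAGESHIFT 12): for a segment with s_base 0x10000000 and
+ * s_size 0x5000, seg_page(seg, (caddr_t)0x10003000) is 0x3000 >> 12 == 3 and
+ * seg_pages(seg) is (0x5000 + 0xfff) >> 12 == 5.
+ */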
+
+#define IE_NOMEM -1 /* internal to seg layer */
+#define IE_RETRY -2 /* internal to seg layer */
+#define IE_REATTACH -3 /* internal to seg layer */
+
+/* Delay/retry factors for seg_p_mem_config_pre_del */
+#define SEGP_PREDEL_DELAY_FACTOR 4
+/*
+ * As a workaround to being unable to purge the pagelock
+ * cache during a DR delete memory operation, we use
+ * a stall threshold that is twice the maximum seen
+ * during testing. This workaround will be removed
+ * when a suitable fix is found.
+ */
+#define SEGP_STALL_SECONDS 25
+#define SEGP_STALL_THRESHOLD \
+ (SEGP_STALL_SECONDS * SEGP_PREDEL_DELAY_FACTOR)
+
+#ifdef VMDEBUG
+
+uint_t seg_page(struct seg *, caddr_t);
+uint_t seg_pages(struct seg *);
+
+#endif /* VMDEBUG */
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _VM_SEG_H */
diff --git a/usr/src/uts/common/vm/seg_dev.c b/usr/src/uts/common/vm/seg_dev.c
new file mode 100644
index 0000000000..9b3733871f
--- /dev/null
+++ b/usr/src/uts/common/vm/seg_dev.c
@@ -0,0 +1,4073 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * VM - segment of a mapped device.
+ *
+ * This segment driver is used when mapping character special devices.
+ */
+
+#include <sys/types.h>
+#include <sys/t_lock.h>
+#include <sys/sysmacros.h>
+#include <sys/vtrace.h>
+#include <sys/systm.h>
+#include <sys/vmsystm.h>
+#include <sys/mman.h>
+#include <sys/errno.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/vnode.h>
+#include <sys/proc.h>
+#include <sys/conf.h>
+#include <sys/debug.h>
+#include <sys/ddidevmap.h>
+#include <sys/lgrp.h>
+
+#include <vm/page.h>
+#include <vm/hat.h>
+#include <vm/as.h>
+#include <vm/seg.h>
+#include <vm/seg_dev.h>
+#include <vm/seg_kp.h>
+#include <vm/seg_kmem.h>
+#include <vm/vpage.h>
+
+#include <sys/sunddi.h>
+#include <sys/esunddi.h>
+#include <sys/fs/snode.h>
+
+#if DEBUG
+int segdev_debug;
+#define DEBUGF(level, args) { if (segdev_debug >= (level)) cmn_err args; }
+#else
+#define DEBUGF(level, args)
+#endif
+
+/* Default timeout for devmap context management */
+#define CTX_TIMEOUT_VALUE 0
+
+#define HOLD_DHP_LOCK(dhp) if (dhp->dh_flags & DEVMAP_ALLOW_REMAP) \
+ { mutex_enter(&dhp->dh_lock); }
+
+#define RELE_DHP_LOCK(dhp) if (dhp->dh_flags & DEVMAP_ALLOW_REMAP) \
+ { mutex_exit(&dhp->dh_lock); }
+
+#define round_down_p2(a, s) ((a) & ~((s) - 1))
+#define round_up_p2(a, s) (((a) + (s) - 1) & ~((s) - 1))
+
+/*
+ * VA_PA_ALIGNED checks to see if both VA and PA are on a pgsize boundary
+ * VA_PA_PGSIZE_ALIGNED checks to see if VA is aligned with PA w.r.t. pgsize
+ */
+#define VA_PA_ALIGNED(uvaddr, paddr, pgsize) \
+ (((uvaddr | paddr) & (pgsize - 1)) == 0)
+#define VA_PA_PGSIZE_ALIGNED(uvaddr, paddr, pgsize) \
+ (((uvaddr ^ paddr) & (pgsize - 1)) == 0)
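+
+/*
+ * Worked example (editorial note, not part of the original file), with an
+ * assumed 4M pgsize (0x400000): for uvaddr 0x00500000 and paddr 0x08500000,
+ * VA_PA_ALIGNED is false (each address has 0x100000 in its low 22 bits) but
+ * VA_PA_PGSIZE_ALIGNED is true (uvaddr ^ paddr == 0x08000000, which has no
+ * bits below the 4M boundary), so VA and PA line up within a 4M page even
+ * though neither starts on one.
+ */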
+
+#define vpgtob(n) ((n) * sizeof (struct vpage)) /* For brevity */
+
+#define VTOCVP(vp) (VTOS(vp)->s_commonvp) /* we "know" it's an snode */
+
+static struct devmap_ctx *devmapctx_list = NULL;
+static struct devmap_softlock *devmap_slist = NULL;
+
+/*
+ * mutex, vnode and page for the page of zeros we use for the trash mappings.
+ * One trash page is allocated on the first ddi_umem_setup call that uses it
+ * XXX Eventually, we may want to combine this with what segnf does when all
+ * hat layers implement HAT_NOFAULT.
+ *
+ * The trash page is used when the backing store for a userland mapping is
+ * removed but the application semantics do not take kindly to a SIGBUS.
+ * In that scenario, the application's pages are mapped to some dummy page
+ * which returns garbage on reads, and writes go into a common place.
+ * (Perfect for NO_FAULT semantics.)
+ * The device driver is responsible for communicating to the app, by some
+ * other mechanism, that such remapping has happened and that the app should
+ * take corrective action.
+ * We could also use an anonymous memory page, as there is no requirement to
+ * keep the page locked; however, this complicates the fault code. RFE.
+ */
+static struct vnode trashvp;
+static struct page *trashpp;
+
+/* Non-pageable kernel memory is allocated from the umem_np_arena. */
+static vmem_t *umem_np_arena;
+
+/* Set the cookie to a value we know will never be a valid umem_cookie */
+#define DEVMAP_DEVMEM_COOKIE ((ddi_umem_cookie_t)0x1)
+
+/*
+ * Macros to check the type of a devmap handle
+ */
+#define cookie_is_devmem(c) \
+ ((c) == (struct ddi_umem_cookie *)DEVMAP_DEVMEM_COOKIE)
+
+#define cookie_is_pmem(c) \
+ ((c) == (struct ddi_umem_cookie *)DEVMAP_PMEM_COOKIE)
+
+#define cookie_is_kpmem(c) (!cookie_is_devmem(c) && !cookie_is_pmem(c) &&\
+ ((c)->type == KMEM_PAGEABLE))
+
+#define dhp_is_devmem(dhp) \
+ (cookie_is_devmem((struct ddi_umem_cookie *)((dhp)->dh_cookie)))
+
+#define dhp_is_pmem(dhp) \
+ (cookie_is_pmem((struct ddi_umem_cookie *)((dhp)->dh_cookie)))
+
+#define dhp_is_kpmem(dhp) \
+ (cookie_is_kpmem((struct ddi_umem_cookie *)((dhp)->dh_cookie)))
+
+/*
+ * Private seg op routines.
+ */
+static int segdev_dup(struct seg *, struct seg *);
+static int segdev_unmap(struct seg *, caddr_t, size_t);
+static void segdev_free(struct seg *);
+static faultcode_t segdev_fault(struct hat *, struct seg *, caddr_t, size_t,
+ enum fault_type, enum seg_rw);
+static faultcode_t segdev_faulta(struct seg *, caddr_t);
+static int segdev_setprot(struct seg *, caddr_t, size_t, uint_t);
+static int segdev_checkprot(struct seg *, caddr_t, size_t, uint_t);
+static void segdev_badop(void);
+static int segdev_sync(struct seg *, caddr_t, size_t, int, uint_t);
+static size_t segdev_incore(struct seg *, caddr_t, size_t, char *);
+static int segdev_lockop(struct seg *, caddr_t, size_t, int, int,
+ ulong_t *, size_t);
+static int segdev_getprot(struct seg *, caddr_t, size_t, uint_t *);
+static u_offset_t segdev_getoffset(struct seg *, caddr_t);
+static int segdev_gettype(struct seg *, caddr_t);
+static int segdev_getvp(struct seg *, caddr_t, struct vnode **);
+static int segdev_advise(struct seg *, caddr_t, size_t, uint_t);
+static void segdev_dump(struct seg *);
+static int segdev_pagelock(struct seg *, caddr_t, size_t,
+ struct page ***, enum lock_type, enum seg_rw);
+static int segdev_setpagesize(struct seg *, caddr_t, size_t, uint_t);
+static int segdev_getmemid(struct seg *, caddr_t, memid_t *);
+static lgrp_mem_policy_info_t *segdev_getpolicy(struct seg *, caddr_t);
+
+/*
+ * XXX this struct is used by rootnex_map_fault to identify
+ * the segment it has been passed. So if you make it
+ * "static" you'll need to fix rootnex_map_fault.
+ */
+struct seg_ops segdev_ops = {
+ segdev_dup,
+ segdev_unmap,
+ segdev_free,
+ segdev_fault,
+ segdev_faulta,
+ segdev_setprot,
+ segdev_checkprot,
+ (int (*)())segdev_badop, /* kluster */
+ (size_t (*)(struct seg *))NULL, /* swapout */
+ segdev_sync, /* sync */
+ segdev_incore,
+ segdev_lockop, /* lockop */
+ segdev_getprot,
+ segdev_getoffset,
+ segdev_gettype,
+ segdev_getvp,
+ segdev_advise,
+ segdev_dump,
+ segdev_pagelock,
+ segdev_setpagesize,
+ segdev_getmemid,
+ segdev_getpolicy,
+};
+
+/*
+ * Private segdev support routines
+ */
+static struct segdev_data *sdp_alloc(void);
+
+static void segdev_softunlock(struct hat *, struct seg *, caddr_t,
+ size_t, enum seg_rw);
+
+static faultcode_t segdev_faultpage(struct hat *, struct seg *, caddr_t,
+ struct vpage *, enum fault_type, enum seg_rw, devmap_handle_t *);
+
+static faultcode_t segdev_faultpages(struct hat *, struct seg *, caddr_t,
+ size_t, enum fault_type, enum seg_rw, devmap_handle_t *);
+
+static struct devmap_ctx *devmap_ctxinit(dev_t, ulong_t);
+static struct devmap_softlock *devmap_softlock_init(dev_t, ulong_t);
+static void devmap_softlock_rele(devmap_handle_t *);
+static void devmap_ctx_rele(devmap_handle_t *);
+
+static void devmap_ctxto(void *);
+
+static devmap_handle_t *devmap_find_handle(devmap_handle_t *dhp_head,
+ caddr_t addr);
+
+static ulong_t devmap_roundup(devmap_handle_t *dhp, ulong_t offset, size_t len,
+ ulong_t *opfn, ulong_t *pagesize);
+
+static void free_devmap_handle(devmap_handle_t *dhp);
+
+static int devmap_handle_dup(devmap_handle_t *dhp, devmap_handle_t **new_dhp,
+ struct seg *newseg);
+
+static devmap_handle_t *devmap_handle_unmap(devmap_handle_t *dhp);
+
+static void devmap_handle_unmap_head(devmap_handle_t *dhp, size_t len);
+
+static void devmap_handle_unmap_tail(devmap_handle_t *dhp, caddr_t addr);
+
+static int devmap_device(devmap_handle_t *dhp, struct as *as, caddr_t *addr,
+ offset_t off, size_t len, uint_t flags);
+
+static void devmap_get_large_pgsize(devmap_handle_t *dhp, size_t len,
+ caddr_t addr, size_t *llen, caddr_t *laddr);
+
+static void devmap_handle_reduce_len(devmap_handle_t *dhp, size_t len);
+
+static void *devmap_alloc_pages(vmem_t *vmp, size_t size, int vmflag);
+static void devmap_free_pages(vmem_t *vmp, void *inaddr, size_t size);
+
+static void *devmap_umem_alloc_np(size_t size, size_t flags);
+static void devmap_umem_free_np(void *addr, size_t size);
+
+/*
+ * routines to lock and unlock underlying segkp segment for
+ * KMEM_PAGEABLE type cookies.
+ */
+static faultcode_t acquire_kpmem_lock(struct ddi_umem_cookie *, size_t);
+static void release_kpmem_lock(struct ddi_umem_cookie *, size_t);
+
+/*
+ * Routines to synchronize F_SOFTLOCK and F_INVAL faults for
+ * drivers with devmap_access callbacks
+ */
+static int devmap_softlock_enter(struct devmap_softlock *, size_t,
+ enum fault_type);
+static void devmap_softlock_exit(struct devmap_softlock *, size_t,
+ enum fault_type);
+
+static kmutex_t devmapctx_lock;
+
+static kmutex_t devmap_slock;
+
+/*
+ * Initialize the thread callbacks and thread private data.
+ */
+static struct devmap_ctx *
+devmap_ctxinit(dev_t dev, ulong_t id)
+{
+ struct devmap_ctx *devctx;
+ struct devmap_ctx *tmp;
+ dev_info_t *dip;
+
+ tmp = kmem_zalloc(sizeof (struct devmap_ctx), KM_SLEEP);
+
+ mutex_enter(&devmapctx_lock);
+
+ dip = e_ddi_hold_devi_by_dev(dev, 0);
+ ASSERT(dip != NULL);
+ ddi_release_devi(dip);
+
+ for (devctx = devmapctx_list; devctx != NULL; devctx = devctx->next)
+ if ((devctx->dip == dip) && (devctx->id == id))
+ break;
+
+ if (devctx == NULL) {
+ devctx = tmp;
+ devctx->dip = dip;
+ devctx->id = id;
+ mutex_init(&devctx->lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&devctx->cv, NULL, CV_DEFAULT, NULL);
+ devctx->next = devmapctx_list;
+ devmapctx_list = devctx;
+ } else
+ kmem_free(tmp, sizeof (struct devmap_ctx));
+
+ mutex_enter(&devctx->lock);
+ devctx->refcnt++;
+ mutex_exit(&devctx->lock);
+ mutex_exit(&devmapctx_lock);
+
+ return (devctx);
+}
+
+/*
+ * Timeout callback called if a CPU has not given up the device context
+ * within dhp->dh_timeout_length ticks
+ */
+static void
+devmap_ctxto(void *data)
+{
+ struct devmap_ctx *devctx = data;
+
+ TRACE_1(TR_FAC_DEVMAP, TR_DEVMAP_CTXTO,
+ "devmap_ctxto:timeout expired, devctx=%p", (void *)devctx);
+ mutex_enter(&devctx->lock);
+ /*
+ * Set oncpu = 0 so that the next mapping trying to get the device
+ * context can do so.
+ */
+ devctx->oncpu = 0;
+ devctx->timeout = 0;
+ cv_signal(&devctx->cv);
+ mutex_exit(&devctx->lock);
+}
+
+/*
+ * Create a device segment.
+ */
+int
+segdev_create(struct seg *seg, void *argsp)
+{
+ struct segdev_data *sdp;
+ struct segdev_crargs *a = (struct segdev_crargs *)argsp;
+ devmap_handle_t *dhp = (devmap_handle_t *)a->devmap_data;
+ int error;
+
+ /*
+ * Since the address space is "write" locked, we
+ * don't need the segment lock to protect "segdev" data.
+ */
+ ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ hat_map(seg->s_as->a_hat, seg->s_base, seg->s_size, HAT_MAP);
+
+ sdp = sdp_alloc();
+
+ sdp->mapfunc = a->mapfunc;
+ sdp->offset = a->offset;
+ sdp->prot = a->prot;
+ sdp->maxprot = a->maxprot;
+ sdp->type = a->type;
+ sdp->pageprot = 0;
+ sdp->softlockcnt = 0;
+ sdp->vpage = NULL;
+
+ if (sdp->mapfunc == NULL)
+ sdp->devmap_data = dhp;
+ else
+ sdp->devmap_data = dhp = NULL;
+
+ sdp->hat_flags = a->hat_flags;
+ sdp->hat_attr = a->hat_attr;
+
+ /*
+ * Currently, hat_flags supports only HAT_LOAD_NOCONSIST
+ */
+ ASSERT(!(sdp->hat_flags & ~HAT_LOAD_NOCONSIST));
+
+ /*
+ * Hold shadow vnode -- segdev only deals with
+ * character (VCHR) devices. We use the common
+ * vp to hang pages on.
+ */
+ sdp->vp = specfind(a->dev, VCHR);
+ ASSERT(sdp->vp != NULL);
+
+ seg->s_ops = &segdev_ops;
+ seg->s_data = sdp;
+
+ while (dhp != NULL) {
+ dhp->dh_seg = seg;
+ dhp = dhp->dh_next;
+ }
+
+ /*
+ * Inform the vnode of the new mapping.
+ */
+ /*
+ * It is ok to pass sdp->maxprot to ADDMAP rather than to use the
+ * dhp-specific maxprot, because spec_addmap does not use maxprot.
+ */
+ error = VOP_ADDMAP(VTOCVP(sdp->vp), sdp->offset,
+ seg->s_as, seg->s_base, seg->s_size,
+ sdp->prot, sdp->maxprot, sdp->type, CRED());
+
+ if (error != 0) {
+ sdp->devmap_data = NULL;
+ hat_unload(seg->s_as->a_hat, seg->s_base, seg->s_size,
+ HAT_UNLOAD_UNMAP);
+ }
+
+ return (error);
+}
+
+static struct segdev_data *
+sdp_alloc(void)
+{
+ struct segdev_data *sdp;
+
+ sdp = kmem_zalloc(sizeof (struct segdev_data), KM_SLEEP);
+ mutex_init(&sdp->lock, NULL, MUTEX_DEFAULT, NULL);
+
+ return (sdp);
+}
+
+/*
+ * Duplicate seg and return new segment in newseg.
+ */
+static int
+segdev_dup(struct seg *seg, struct seg *newseg)
+{
+ struct segdev_data *sdp = (struct segdev_data *)seg->s_data;
+ struct segdev_data *newsdp;
+ devmap_handle_t *dhp = (devmap_handle_t *)sdp->devmap_data;
+ size_t npages;
+ int ret;
+
+ TRACE_2(TR_FAC_DEVMAP, TR_DEVMAP_DUP,
+ "segdev_dup:start dhp=%p, seg=%p", (void *)dhp, (void *)seg);
+
+ DEBUGF(3, (CE_CONT, "segdev_dup: dhp %p seg %p\n",
+ (void *)dhp, (void *)seg));
+
+ /*
+ * Since the address space is "write" locked, we
+ * don't need the segment lock to protect "segdev" data.
+ */
+ ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ newsdp = sdp_alloc();
+
+ newseg->s_ops = seg->s_ops;
+ newseg->s_data = (void *)newsdp;
+
+ VN_HOLD(sdp->vp);
+ newsdp->vp = sdp->vp;
+ newsdp->mapfunc = sdp->mapfunc;
+ newsdp->offset = sdp->offset;
+ newsdp->pageprot = sdp->pageprot;
+ newsdp->prot = sdp->prot;
+ newsdp->maxprot = sdp->maxprot;
+ newsdp->type = sdp->type;
+ newsdp->hat_attr = sdp->hat_attr;
+ newsdp->hat_flags = sdp->hat_flags;
+ newsdp->softlockcnt = 0;
+
+ /*
+ * Initialize per page data if the segment we are
+ * dup'ing has per page information.
+ */
+ npages = seg_pages(newseg);
+
+ if (sdp->vpage != NULL) {
+ size_t nbytes = vpgtob(npages);
+
+ newsdp->vpage = kmem_zalloc(nbytes, KM_SLEEP);
+ bcopy(sdp->vpage, newsdp->vpage, nbytes);
+ } else
+ newsdp->vpage = NULL;
+
+ /*
+ * duplicate devmap handles
+ */
+ if (dhp != NULL) {
+ ret = devmap_handle_dup(dhp,
+ (devmap_handle_t **)&newsdp->devmap_data, newseg);
+ if (ret != 0) {
+ TRACE_3(TR_FAC_DEVMAP, TR_DEVMAP_DUP_CK1,
+ "segdev_dup:ret1 ret=%x, dhp=%p seg=%p",
+ ret, (void *)dhp, (void *)seg);
+ DEBUGF(1, (CE_CONT,
+ "segdev_dup: ret %x dhp %p seg %p\n",
+ ret, (void *)dhp, (void *)seg));
+ return (ret);
+ }
+ }
+
+ /*
+ * Inform the common vnode of the new mapping.
+ */
+ return (VOP_ADDMAP(VTOCVP(newsdp->vp),
+ newsdp->offset, newseg->s_as,
+ newseg->s_base, newseg->s_size, newsdp->prot,
+ newsdp->maxprot, sdp->type, CRED()));
+}
+
+/*
+ * duplicate devmap handles
+ */
+static int
+devmap_handle_dup(devmap_handle_t *dhp, devmap_handle_t **new_dhp,
+ struct seg *newseg)
+{
+ devmap_handle_t *newdhp_save = NULL;
+ devmap_handle_t *newdhp = NULL;
+ struct devmap_callback_ctl *callbackops;
+
+ while (dhp != NULL) {
+ newdhp = kmem_alloc(sizeof (devmap_handle_t), KM_SLEEP);
+
+ /* Need to lock the original dhp while copying if REMAP */
+ HOLD_DHP_LOCK(dhp);
+ bcopy(dhp, newdhp, sizeof (devmap_handle_t));
+ RELE_DHP_LOCK(dhp);
+ newdhp->dh_seg = newseg;
+ newdhp->dh_next = NULL;
+ if (newdhp_save != NULL)
+ newdhp_save->dh_next = newdhp;
+ else
+ *new_dhp = newdhp;
+ newdhp_save = newdhp;
+
+ callbackops = &newdhp->dh_callbackops;
+
+ if (dhp->dh_softlock != NULL)
+ newdhp->dh_softlock = devmap_softlock_init(
+ newdhp->dh_dev,
+ (ulong_t)callbackops->devmap_access);
+ if (dhp->dh_ctx != NULL)
+ newdhp->dh_ctx = devmap_ctxinit(newdhp->dh_dev,
+ (ulong_t)callbackops->devmap_access);
+
+ /*
+ * Initialize dh_lock if we want to do remap.
+ */
+ if (newdhp->dh_flags & DEVMAP_ALLOW_REMAP) {
+ mutex_init(&newdhp->dh_lock, NULL, MUTEX_DEFAULT, NULL);
+ newdhp->dh_flags |= DEVMAP_LOCK_INITED;
+ }
+
+ if (callbackops->devmap_dup != NULL) {
+ int ret;
+
+ /*
+ * Call the dup callback so that the driver can
+ * duplicate its private data.
+ */
+ ret = (*callbackops->devmap_dup)(dhp, dhp->dh_pvtp,
+ (devmap_cookie_t *)newdhp, &newdhp->dh_pvtp);
+
+ if (ret != 0) {
+ /*
+ * We want to free up this segment as the driver
+ * has indicated that we can't dup it. But we
+ * don't want to call the driver's devmap_unmap
+ * callback function, as the driver does not
+ * think this segment exists. The caller of
+ * devmap_dup will call seg_free on newseg
+ * as it was the caller that allocated the
+ * segment.
+ */
+ DEBUGF(1, (CE_CONT, "devmap_handle_dup ERROR: "
+ "newdhp %p dhp %p\n", (void *)newdhp,
+ (void *)dhp));
+ callbackops->devmap_unmap = NULL;
+ return (ret);
+ }
+ }
+
+ dhp = dhp->dh_next;
+ }
+
+ return (0);
+}
+
+/*
+ * Split a segment at addr for length len.
+ */
+/*ARGSUSED*/
+static int
+segdev_unmap(struct seg *seg, caddr_t addr, size_t len)
+{
+ register struct segdev_data *sdp = (struct segdev_data *)seg->s_data;
+ register struct segdev_data *nsdp;
+ register struct seg *nseg;
+ register size_t opages; /* old segment size in pages */
+ register size_t npages; /* new segment size in pages */
+ register size_t dpages; /* pages being deleted (unmapped) */
+ register size_t nbytes;
+ devmap_handle_t *dhp = (devmap_handle_t *)sdp->devmap_data;
+ devmap_handle_t *dhpp;
+ devmap_handle_t *newdhp;
+ struct devmap_callback_ctl *callbackops;
+ caddr_t nbase;
+ offset_t off;
+ ulong_t nsize;
+ size_t mlen, sz;
+
+ TRACE_4(TR_FAC_DEVMAP, TR_DEVMAP_UNMAP,
+ "segdev_unmap:start dhp=%p, seg=%p addr=%p len=%lx",
+ (void *)dhp, (void *)seg, (void *)addr, len);
+
+ DEBUGF(3, (CE_CONT, "segdev_unmap: dhp %p seg %p addr %p len %lx\n",
+ (void *)dhp, (void *)seg, (void *)addr, len));
+
+ /*
+ * Since the address space is "write" locked, we
+ * don't need the segment lock to protect "segdev" data.
+ */
+ ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ if ((sz = sdp->softlockcnt) > 0) {
+ /*
+ * Fail the unmap if pages are SOFTLOCKed through this mapping.
+ * softlockcnt is protected from change by the as write lock.
+ */
+ TRACE_1(TR_FAC_DEVMAP, TR_DEVMAP_UNMAP_CK1,
+ "segdev_unmap:error softlockcnt = %ld", sz);
+ DEBUGF(1, (CE_CONT, "segdev_unmap: softlockcnt %ld\n", sz));
+ return (EAGAIN);
+ }
+
+ /*
+ * Check for bad sizes
+ */
+ if (addr < seg->s_base || addr + len > seg->s_base + seg->s_size ||
+ (len & PAGEOFFSET) || ((uintptr_t)addr & PAGEOFFSET))
+ panic("segdev_unmap");
+
+ if (dhp != NULL) {
+ devmap_handle_t *tdhp;
+ /*
+ * If large page size was used in hat_devload(),
+ * the same page size must be used in hat_unload().
+ */
+ dhpp = tdhp = devmap_find_handle(dhp, addr);
+ while (tdhp != NULL) {
+ if (tdhp->dh_flags & DEVMAP_FLAG_LARGE) {
+ break;
+ }
+ tdhp = tdhp->dh_next;
+ }
+ if (tdhp != NULL) { /* found a dhp using large pages */
+ size_t slen = len;
+ size_t mlen;
+ size_t soff;
+
+ soff = (ulong_t)(addr - dhpp->dh_uvaddr);
+ while (slen != 0) {
+ mlen = MIN(slen, (dhpp->dh_len - soff));
+ hat_unload(seg->s_as->a_hat, dhpp->dh_uvaddr,
+ dhpp->dh_len, HAT_UNLOAD_UNMAP);
+ dhpp = dhpp->dh_next;
+ ASSERT(slen >= mlen);
+ slen -= mlen;
+ soff = 0;
+ }
+ } else
+ hat_unload(seg->s_as->a_hat, addr, len,
+ HAT_UNLOAD_UNMAP);
+ } else {
+ /*
+ * Unload any hardware translations in the range
+ * to be taken out.
+ */
+ hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD_UNMAP);
+ }
+
+ /*
+	 * get the user offset which will be used in the driver callbacks
+ */
+ off = sdp->offset + (offset_t)(addr - seg->s_base);
+
+ /*
+ * Inform the vnode of the unmapping.
+ */
+ ASSERT(sdp->vp != NULL);
+ (void) VOP_DELMAP(VTOCVP(sdp->vp), off, seg->s_as, addr, len,
+ sdp->prot, sdp->maxprot, sdp->type, CRED());
+
+ /*
+ * Check for entire segment
+ */
+ if (addr == seg->s_base && len == seg->s_size) {
+ seg_free(seg);
+ return (0);
+ }
+
+ opages = seg_pages(seg);
+ dpages = btop(len);
+ npages = opages - dpages;
+
+ /*
+ * Check for beginning of segment
+ */
+ if (addr == seg->s_base) {
+ if (sdp->vpage != NULL) {
+ register struct vpage *ovpage;
+
+ ovpage = sdp->vpage; /* keep pointer to vpage */
+
+ nbytes = vpgtob(npages);
+ sdp->vpage = kmem_alloc(nbytes, KM_SLEEP);
+ bcopy(&ovpage[dpages], sdp->vpage, nbytes);
+
+ /* free up old vpage */
+ kmem_free(ovpage, vpgtob(opages));
+ }
+
+ /*
+ * free devmap handles from the beginning of the mapping.
+ */
+ if (dhp != NULL)
+ devmap_handle_unmap_head(dhp, len);
+
+ sdp->offset += (offset_t)len;
+
+ seg->s_base += len;
+ seg->s_size -= len;
+
+ return (0);
+ }
+
+ /*
+ * Check for end of segment
+ */
+ if (addr + len == seg->s_base + seg->s_size) {
+ if (sdp->vpage != NULL) {
+ register struct vpage *ovpage;
+
+ ovpage = sdp->vpage; /* keep pointer to vpage */
+
+ nbytes = vpgtob(npages);
+ sdp->vpage = kmem_alloc(nbytes, KM_SLEEP);
+ bcopy(ovpage, sdp->vpage, nbytes);
+
+ /* free up old vpage */
+ kmem_free(ovpage, vpgtob(opages));
+ }
+ seg->s_size -= len;
+
+ /*
+ * free devmap handles from addr to the end of the mapping.
+ */
+ if (dhp != NULL)
+ devmap_handle_unmap_tail(dhp, addr);
+
+ return (0);
+ }
+
+ /*
+ * The section to go is in the middle of the segment,
+ * have to make it into two segments. nseg is made for
+ * the high end while seg is cut down at the low end.
+ */
+ nbase = addr + len; /* new seg base */
+ nsize = (seg->s_base + seg->s_size) - nbase; /* new seg size */
+ seg->s_size = addr - seg->s_base; /* shrink old seg */
+ nseg = seg_alloc(seg->s_as, nbase, nsize);
+ if (nseg == NULL)
+ panic("segdev_unmap seg_alloc");
+
+ TRACE_2(TR_FAC_DEVMAP, TR_DEVMAP_UNMAP_CK2,
+ "segdev_unmap: seg=%p nseg=%p", (void *)seg, (void *)nseg);
+ DEBUGF(3, (CE_CONT, "segdev_unmap: segdev_dup seg %p nseg %p\n",
+ (void *)seg, (void *)nseg));
+ nsdp = sdp_alloc();
+
+ nseg->s_ops = seg->s_ops;
+ nseg->s_data = (void *)nsdp;
+
+ VN_HOLD(sdp->vp);
+ nsdp->mapfunc = sdp->mapfunc;
+ nsdp->offset = sdp->offset + (offset_t)(nseg->s_base - seg->s_base);
+ nsdp->vp = sdp->vp;
+ nsdp->pageprot = sdp->pageprot;
+ nsdp->prot = sdp->prot;
+ nsdp->maxprot = sdp->maxprot;
+ nsdp->type = sdp->type;
+ nsdp->hat_attr = sdp->hat_attr;
+ nsdp->hat_flags = sdp->hat_flags;
+ nsdp->softlockcnt = 0;
+
+ /*
+ * Initialize per page data if the segment we are
+ * dup'ing has per page information.
+ */
+ if (sdp->vpage != NULL) {
+ /* need to split vpage into two arrays */
+ register size_t nnbytes;
+ register size_t nnpages;
+ register struct vpage *ovpage;
+
+ ovpage = sdp->vpage; /* keep pointer to vpage */
+
+ npages = seg_pages(seg); /* seg has shrunk */
+ nbytes = vpgtob(npages);
+ nnpages = seg_pages(nseg);
+ nnbytes = vpgtob(nnpages);
+
+ sdp->vpage = kmem_alloc(nbytes, KM_SLEEP);
+ bcopy(ovpage, sdp->vpage, nbytes);
+
+ nsdp->vpage = kmem_alloc(nnbytes, KM_SLEEP);
+ bcopy(&ovpage[npages + dpages], nsdp->vpage, nnbytes);
+
+ /* free up old vpage */
+ kmem_free(ovpage, vpgtob(opages));
+ } else
+ nsdp->vpage = NULL;
+
+ /*
+ * unmap dhps.
+ */
+ if (dhp == NULL) {
+ nsdp->devmap_data = NULL;
+ return (0);
+ }
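+	/*
+	 * Walk the handle chain, splitting it between the shrunk low-end
+	 * seg and the new high-end nseg based on how each dhp overlaps the
+	 * unmapped range <addr, addr+len>.
+	 */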
+ while (dhp != NULL) {
+ callbackops = &dhp->dh_callbackops;
+ TRACE_2(TR_FAC_DEVMAP, TR_DEVMAP_UNMAP_CK3,
+ "segdev_unmap: dhp=%p addr=%p", dhp, addr);
+ DEBUGF(3, (CE_CONT, "unmap: dhp %p addr %p uvaddr %p len %lx\n",
+ (void *)dhp, (void *)addr,
+ (void *)dhp->dh_uvaddr, dhp->dh_len));
+
+ if (addr == (dhp->dh_uvaddr + dhp->dh_len)) {
+ dhpp = dhp->dh_next;
+ dhp->dh_next = NULL;
+ dhp = dhpp;
+ } else if (addr > (dhp->dh_uvaddr + dhp->dh_len)) {
+ dhp = dhp->dh_next;
+ } else if (addr > dhp->dh_uvaddr &&
+ (addr + len) < (dhp->dh_uvaddr + dhp->dh_len)) {
+ /*
+ * <addr, addr+len> is enclosed by dhp.
+ * create a newdhp that begins at addr+len and
+ * ends at dhp->dh_uvaddr+dhp->dh_len.
+ */
+ newdhp = kmem_alloc(sizeof (devmap_handle_t), KM_SLEEP);
+ HOLD_DHP_LOCK(dhp);
+ bcopy(dhp, newdhp, sizeof (devmap_handle_t));
+ RELE_DHP_LOCK(dhp);
+ newdhp->dh_seg = nseg;
+ newdhp->dh_next = dhp->dh_next;
+ if (dhp->dh_softlock != NULL)
+ newdhp->dh_softlock = devmap_softlock_init(
+ newdhp->dh_dev,
+ (ulong_t)callbackops->devmap_access);
+ if (dhp->dh_ctx != NULL)
+ newdhp->dh_ctx = devmap_ctxinit(newdhp->dh_dev,
+ (ulong_t)callbackops->devmap_access);
+ if (newdhp->dh_flags & DEVMAP_LOCK_INITED) {
+ mutex_init(&newdhp->dh_lock,
+ NULL, MUTEX_DEFAULT, NULL);
+ }
+ if (callbackops->devmap_unmap != NULL)
+ (*callbackops->devmap_unmap)(dhp, dhp->dh_pvtp,
+ off, len, dhp, &dhp->dh_pvtp,
+ newdhp, &newdhp->dh_pvtp);
+ mlen = len + (addr - dhp->dh_uvaddr);
+ devmap_handle_reduce_len(newdhp, mlen);
+ nsdp->devmap_data = newdhp;
+ /* XX Changing len should recalculate LARGE flag */
+ dhp->dh_len = addr - dhp->dh_uvaddr;
+ dhpp = dhp->dh_next;
+ dhp->dh_next = NULL;
+ dhp = dhpp;
+ } else if ((addr > dhp->dh_uvaddr) &&
+ ((addr + len) >= (dhp->dh_uvaddr + dhp->dh_len))) {
+ mlen = dhp->dh_len + dhp->dh_uvaddr - addr;
+ /*
+ * <addr, addr+len> spans over dhps.
+ */
+ if (callbackops->devmap_unmap != NULL)
+ (*callbackops->devmap_unmap)(dhp, dhp->dh_pvtp,
+ off, mlen, (devmap_cookie_t *)dhp,
+ &dhp->dh_pvtp, NULL, NULL);
+ /* XX Changing len should recalculate LARGE flag */
+ dhp->dh_len = addr - dhp->dh_uvaddr;
+ dhpp = dhp->dh_next;
+ dhp->dh_next = NULL;
+ dhp = dhpp;
+ nsdp->devmap_data = dhp;
+ } else if ((addr + len) >= (dhp->dh_uvaddr + dhp->dh_len)) {
+ /*
+ * dhp is enclosed by <addr, addr+len>.
+ */
+ dhp->dh_seg = nseg;
+ nsdp->devmap_data = dhp;
+ dhp = devmap_handle_unmap(dhp);
+ nsdp->devmap_data = dhp; /* XX redundant? */
+ } else if (((addr + len) > dhp->dh_uvaddr) &&
+ ((addr + len) < (dhp->dh_uvaddr + dhp->dh_len))) {
+ mlen = addr + len - dhp->dh_uvaddr;
+ if (callbackops->devmap_unmap != NULL)
+ (*callbackops->devmap_unmap)(dhp, dhp->dh_pvtp,
+ dhp->dh_uoff, mlen, NULL,
+ NULL, dhp, &dhp->dh_pvtp);
+ devmap_handle_reduce_len(dhp, mlen);
+ nsdp->devmap_data = dhp;
+ dhp->dh_seg = nseg;
+ dhp = dhp->dh_next;
+ } else {
+ dhp->dh_seg = nseg;
+ dhp = dhp->dh_next;
+ }
+ }
+ return (0);
+}
+
+/*
+ * Utility function that reduces the length of a devmap handle during unmap.
+ * Note that it is only used for unmapping the front portion of the handle,
+ * i.e., we bump up the offset/pfn etc. by len.
+ * Do not use it when reducing the length at the tail.
+ */
+static void
+devmap_handle_reduce_len(devmap_handle_t *dhp, size_t len)
+{
+ struct ddi_umem_cookie *cp;
+ struct devmap_pmem_cookie *pcp;
+ /*
+ * adjust devmap handle fields
+ */
+ ASSERT(len < dhp->dh_len);
+
+ /* Make sure only page-aligned changes are done */
+ ASSERT((len & PAGEOFFSET) == 0);
+
+ dhp->dh_len -= len;
+ dhp->dh_uoff += (offset_t)len;
+ dhp->dh_roff += (offset_t)len;
+ dhp->dh_uvaddr += len;
+ /* Need to grab dhp lock if REMAP */
+ HOLD_DHP_LOCK(dhp);
+ cp = dhp->dh_cookie;
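+	/*
+	 * Adjust the cookie-specific base: device memory advances the base
+	 * pfn, pmem is indexed via the updated dh_roff, and kernel/user
+	 * memory advances the cookie virtual address.
+	 */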
+ if (!(dhp->dh_flags & DEVMAP_MAPPING_INVALID)) {
+ if (cookie_is_devmem(cp)) {
+ dhp->dh_pfn += btop(len);
+ } else if (cookie_is_pmem(cp)) {
+ pcp = (struct devmap_pmem_cookie *)dhp->dh_pcookie;
+ ASSERT((dhp->dh_roff & PAGEOFFSET) == 0 &&
+ dhp->dh_roff < ptob(pcp->dp_npages));
+ } else {
+ ASSERT(dhp->dh_roff < cp->size);
+ ASSERT(dhp->dh_cvaddr >= cp->cvaddr &&
+ dhp->dh_cvaddr < (cp->cvaddr + cp->size));
+ ASSERT((dhp->dh_cvaddr + len) <=
+ (cp->cvaddr + cp->size));
+
+ dhp->dh_cvaddr += len;
+ }
+ }
+ /* XXX - Should recalculate the DEVMAP_FLAG_LARGE after changes */
+ RELE_DHP_LOCK(dhp);
+}
+
+/*
+ * Free devmap handle, dhp.
+ * Return the next devmap handle on the linked list.
+ */
+static devmap_handle_t *
+devmap_handle_unmap(devmap_handle_t *dhp)
+{
+ struct devmap_callback_ctl *callbackops = &dhp->dh_callbackops;
+ struct segdev_data *sdp = (struct segdev_data *)dhp->dh_seg->s_data;
+ devmap_handle_t *dhpp = (devmap_handle_t *)sdp->devmap_data;
+
+ ASSERT(dhp != NULL);
+
+ /*
+ * before we free up dhp, call the driver's devmap_unmap entry point
+ * to free resources allocated for this dhp.
+ */
+ if (callbackops->devmap_unmap != NULL) {
+ (*callbackops->devmap_unmap)(dhp, dhp->dh_pvtp, dhp->dh_uoff,
+ dhp->dh_len, NULL, NULL, NULL, NULL);
+ }
+
+ if (dhpp == dhp) { /* releasing first dhp, change sdp data */
+ sdp->devmap_data = dhp->dh_next;
+ } else {
+ while (dhpp->dh_next != dhp) {
+ dhpp = dhpp->dh_next;
+ }
+ dhpp->dh_next = dhp->dh_next;
+ }
+ dhpp = dhp->dh_next; /* return value is next dhp in chain */
+
+ if (dhp->dh_softlock != NULL)
+ devmap_softlock_rele(dhp);
+
+ if (dhp->dh_ctx != NULL)
+ devmap_ctx_rele(dhp);
+
+ if (dhp->dh_flags & DEVMAP_LOCK_INITED) {
+ mutex_destroy(&dhp->dh_lock);
+ }
+ kmem_free(dhp, sizeof (devmap_handle_t));
+
+ return (dhpp);
+}
+
+/*
+ * Free complete devmap handles from dhp for len bytes
+ * dhp can be either the first handle or a subsequent handle
+ */
+static void
+devmap_handle_unmap_head(devmap_handle_t *dhp, size_t len)
+{
+ struct devmap_callback_ctl *callbackops;
+
+ /*
+ * free the devmap handles covered by len.
+ */
+ while (len >= dhp->dh_len) {
+ len -= dhp->dh_len;
+ dhp = devmap_handle_unmap(dhp);
+ }
+ if (len != 0) { /* partial unmap at head of first remaining dhp */
+ callbackops = &dhp->dh_callbackops;
+
+ /*
+		 * Call the unmap callback so the driver can make
+		 * adjustments to its private data.
+ */
+ if (callbackops->devmap_unmap != NULL)
+ (*callbackops->devmap_unmap)(dhp, dhp->dh_pvtp,
+ dhp->dh_uoff, len, NULL, NULL, dhp, &dhp->dh_pvtp);
+ devmap_handle_reduce_len(dhp, len);
+ }
+}
+
+/*
+ * Free devmap handles to truncate the mapping after addr.
+ * RFE: It would be simpler to pass in a dhp already pointing at the correct
+ * handle (avoiding the find again); the routine could then also be used in
+ * the middle-unmap case.
+ */
+static void
+devmap_handle_unmap_tail(devmap_handle_t *dhp, caddr_t addr)
+{
+ register struct seg *seg = dhp->dh_seg;
+ register struct segdev_data *sdp = (struct segdev_data *)seg->s_data;
+ register devmap_handle_t *dhph = (devmap_handle_t *)sdp->devmap_data;
+ struct devmap_callback_ctl *callbackops;
+ register devmap_handle_t *dhpp;
+ size_t maplen;
+ ulong_t off;
+ size_t len;
+
+ maplen = (size_t)(addr - dhp->dh_uvaddr);
+ dhph = devmap_find_handle(dhph, addr);
+
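+	/*
+	 * Truncate the handle containing addr to maplen bytes and free
+	 * every handle after it entirely.
+	 */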
+ while (dhph != NULL) {
+ if (maplen == 0) {
+ dhph = devmap_handle_unmap(dhph);
+ } else {
+ callbackops = &dhph->dh_callbackops;
+ len = dhph->dh_len - maplen;
+ off = (ulong_t)sdp->offset + (addr - seg->s_base);
+ /*
+ * Call the unmap callback so the driver
+ * can make adjustments on its private data.
+ */
+ if (callbackops->devmap_unmap != NULL)
+ (*callbackops->devmap_unmap)(dhph,
+ dhph->dh_pvtp, off, len,
+ (devmap_cookie_t *)dhph,
+ &dhph->dh_pvtp, NULL, NULL);
+ /* XXX Reducing len needs to recalculate LARGE flag */
+ dhph->dh_len = maplen;
+ maplen = 0;
+ dhpp = dhph->dh_next;
+ dhph->dh_next = NULL;
+ dhph = dhpp;
+ }
+ } /* end while */
+}
+
+/*
+ * Free a segment.
+ */
+static void
+segdev_free(struct seg *seg)
+{
+ register struct segdev_data *sdp = (struct segdev_data *)seg->s_data;
+ devmap_handle_t *dhp = (devmap_handle_t *)sdp->devmap_data;
+
+ TRACE_2(TR_FAC_DEVMAP, TR_DEVMAP_FREE,
+ "segdev_free: dhp=%p seg=%p", (void *)dhp, (void *)seg);
+ DEBUGF(3, (CE_CONT, "segdev_free: dhp %p seg %p\n",
+ (void *)dhp, (void *)seg));
+
+ /*
+ * Since the address space is "write" locked, we
+ * don't need the segment lock to protect "segdev" data.
+ */
+ ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ while (dhp != NULL)
+ dhp = devmap_handle_unmap(dhp);
+
+ VN_RELE(sdp->vp);
+ if (sdp->vpage != NULL)
+ kmem_free(sdp->vpage, vpgtob(seg_pages(seg)));
+
+ mutex_destroy(&sdp->lock);
+ kmem_free(sdp, sizeof (*sdp));
+}
+
+static void
+free_devmap_handle(devmap_handle_t *dhp)
+{
+ register devmap_handle_t *dhpp;
+
+ /*
+ * free up devmap handle
+ */
+ while (dhp != NULL) {
+ dhpp = dhp->dh_next;
+ if (dhp->dh_flags & DEVMAP_LOCK_INITED) {
+ mutex_destroy(&dhp->dh_lock);
+ }
+
+ if (dhp->dh_softlock != NULL)
+ devmap_softlock_rele(dhp);
+
+ if (dhp->dh_ctx != NULL)
+ devmap_ctx_rele(dhp);
+
+ kmem_free(dhp, sizeof (devmap_handle_t));
+ dhp = dhpp;
+ }
+}
+
+/*
+ * Routines to lock and unlock the underlying segkp segment for
+ * KMEM_PAGEABLE type cookies.
+ * segkp only allows a single pending F_SOFTLOCK; we keep track of the
+ * number of locks in the cookie so we can have multiple pending faults
+ * and manage the calls to segkp.
+ * RFE: if segkp supports either pagelock or multiple calls to F_SOFTLOCK,
+ * then these routines can go away.
+ *	With pagelock, segdev_faultpage could fault on a page-by-page basis,
+ *		which simplifies the code quite a bit.
+ *	If multiple calls were allowed, but not partial ranges, then the need
+ *	for cookie->lock and the locked count goes away; the code can call
+ *	as_fault directly.
+ */
+static faultcode_t
+acquire_kpmem_lock(struct ddi_umem_cookie *cookie, size_t npages)
+{
+ int err = 0;
+ ASSERT(cookie_is_kpmem(cookie));
+ /*
+ * Fault in pages in segkp with F_SOFTLOCK.
+ * We want to hold the lock until all pages have been loaded.
+	 * segkp only allows a single caller to hold SOFTLOCK, so the cookie
+	 * holds a count so we don't call into segkp multiple times.
+ */
+ mutex_enter(&cookie->lock);
+
+ /*
+ * Check for overflow in locked field
+ */
+ if ((UINT32_MAX - cookie->locked) < npages) {
+ err = FC_MAKE_ERR(ENOMEM);
+ } else if (cookie->locked == 0) {
+ /* First time locking */
+ err = as_fault(kas.a_hat, &kas, cookie->cvaddr,
+ cookie->size, F_SOFTLOCK, PROT_READ|PROT_WRITE);
+ }
+ if (!err) {
+ cookie->locked += npages;
+ }
+ mutex_exit(&cookie->lock);
+ return (err);
+}
+
+static void
+release_kpmem_lock(struct ddi_umem_cookie *cookie, size_t npages)
+{
+ mutex_enter(&cookie->lock);
+ ASSERT(cookie_is_kpmem(cookie));
+ ASSERT(cookie->locked >= npages);
+ cookie->locked -= (uint_t)npages;
+ if (cookie->locked == 0) {
+ /* Last unlock */
+ if (as_fault(kas.a_hat, &kas, cookie->cvaddr,
+ cookie->size, F_SOFTUNLOCK, PROT_READ|PROT_WRITE))
+ panic("segdev releasing kpmem lock %p", (void *)cookie);
+ }
+ mutex_exit(&cookie->lock);
+}
+
+/*
+ * Routines to synchronize F_SOFTLOCK and F_INVAL faults for
+ * drivers with devmap_access callbacks
+ * slock->softlocked basically works like a rw lock
+ * -ve counts => F_SOFTLOCK in progress
+ * +ve counts => F_INVAL/F_PROT in progress
+ * We allow only one F_SOFTLOCK at a time
+ * but can have multiple pending F_INVAL/F_PROT calls
+ *
+ * This routine waits using cv_wait_sig so killing processes is more graceful
+ * Returns EINTR if coming out of this routine due to a signal, 0 otherwise
+ */
+static int devmap_softlock_enter(
+ struct devmap_softlock *slock,
+ size_t npages,
+ enum fault_type type)
+{
+ if (npages == 0)
+ return (0);
+ mutex_enter(&(slock->lock));
+ switch (type) {
+ case F_SOFTLOCK :
+ while (slock->softlocked) {
+ if (cv_wait_sig(&(slock)->cv, &(slock)->lock) == 0) {
+ /* signalled */
+ mutex_exit(&(slock->lock));
+ return (EINTR);
+ }
+ }
+ slock->softlocked -= npages; /* -ve count => locked */
+ break;
+ case F_INVAL :
+ case F_PROT :
+ while (slock->softlocked < 0)
+ if (cv_wait_sig(&(slock)->cv, &(slock)->lock) == 0) {
+ /* signalled */
+ mutex_exit(&(slock->lock));
+ return (EINTR);
+ }
+ slock->softlocked += npages; /* +ve count => f_invals */
+ break;
+ default:
+ ASSERT(0);
+ }
+ mutex_exit(&(slock->lock));
+ return (0);
+}
+
+static void devmap_softlock_exit(
+ struct devmap_softlock *slock,
+ size_t npages,
+ enum fault_type type)
+{
+ if (slock == NULL)
+ return;
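+	/*
+	 * Undo the count adjustment made by devmap_softlock_enter and wake
+	 * up any waiters once the count drains back to zero.
+	 */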
+ mutex_enter(&(slock->lock));
+ switch (type) {
+ case F_SOFTLOCK :
+ ASSERT(-slock->softlocked >= npages);
+ slock->softlocked += npages; /* -ve count is softlocked */
+ if (slock->softlocked == 0)
+ cv_signal(&slock->cv);
+ break;
+ case F_INVAL :
+ case F_PROT:
+ ASSERT(slock->softlocked >= npages);
+ slock->softlocked -= npages;
+ if (slock->softlocked == 0)
+ cv_signal(&slock->cv);
+ break;
+ default:
+ ASSERT(0);
+ }
+ mutex_exit(&(slock->lock));
+}
+
+/*
+ * Do a F_SOFTUNLOCK call over the range requested.
+ * The range must have already been F_SOFTLOCK'ed.
+ * The segment lock should be held (but not the segment private lock?).
+ * The softunlock code below does not adjust for large page sizes;
+ *	it assumes the caller already did any addr/len adjustments for
+ *	pagesize mappings before calling.
+ */
+/*ARGSUSED*/
+static void
+segdev_softunlock(
+ struct hat *hat, /* the hat */
+ struct seg *seg, /* seg_dev of interest */
+ caddr_t addr, /* base address of range */
+ size_t len, /* number of bytes */
+ enum seg_rw rw) /* type of access at fault */
+{
+ struct segdev_data *sdp = (struct segdev_data *)seg->s_data;
+ devmap_handle_t *dhp_head = (devmap_handle_t *)sdp->devmap_data;
+
+ TRACE_4(TR_FAC_DEVMAP, TR_DEVMAP_SOFTUNLOCK,
+ "segdev_softunlock:dhp_head=%p sdp=%p addr=%p len=%lx",
+ dhp_head, sdp, addr, len);
+ DEBUGF(3, (CE_CONT, "segdev_softunlock: dhp %p lockcnt %lx "
+ "addr %p len %lx\n",
+ (void *)dhp_head, sdp->softlockcnt, (void *)addr, len));
+
+ hat_unlock(hat, addr, len);
+
+ if (dhp_head != NULL) {
+ devmap_handle_t *dhp;
+ size_t mlen;
+ ulong_t off;
+
+ dhp = devmap_find_handle(dhp_head, addr);
+ ASSERT(dhp != NULL);
+
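+		/*
+		 * Walk the handles covering [addr, addr + len), releasing
+		 * the kpmem lock and the devmap_access softlock accounting
+		 * for the portion of each handle that was softlocked.
+		 */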
+ off = (ulong_t)(addr - dhp->dh_uvaddr);
+ while (len != 0) {
+ mlen = MIN(len, (dhp->dh_len - off));
+
+ /*
+ * unlock segkp memory, locked during F_SOFTLOCK
+ */
+ if (dhp_is_kpmem(dhp)) {
+ release_kpmem_lock(
+ (struct ddi_umem_cookie *)dhp->dh_cookie,
+ btopr(mlen));
+ }
+
+ /*
+ * Do the softlock accounting for devmap_access
+ */
+ if (dhp->dh_callbackops.devmap_access != NULL) {
+ devmap_softlock_exit(dhp->dh_softlock,
+ btopr(mlen), F_SOFTLOCK);
+ }
+
+ len -= mlen;
+ dhp = dhp->dh_next;
+ off = 0;
+ }
+ }
+
+ mutex_enter(&freemem_lock);
+ ASSERT(sdp->softlockcnt >= btopr(len));
+ sdp->softlockcnt -= btopr(len);
+ mutex_exit(&freemem_lock);
+ if (sdp->softlockcnt == 0) {
+ /*
+ * All SOFTLOCKS are gone. Wakeup any waiting
+ * unmappers so they can try again to unmap.
+ * Check for waiters first without the mutex
+ * held so we don't always grab the mutex on
+ * softunlocks.
+ */
+ if (AS_ISUNMAPWAIT(seg->s_as)) {
+ mutex_enter(&seg->s_as->a_contents);
+ if (AS_ISUNMAPWAIT(seg->s_as)) {
+ AS_CLRUNMAPWAIT(seg->s_as);
+ cv_broadcast(&seg->s_as->a_cv);
+ }
+ mutex_exit(&seg->s_as->a_contents);
+ }
+ }
+
+}
+
+/*
+ * Handle fault for a single page.
+ * Done in a separate routine so we can handle errors more easily.
+ * This routine is called only from segdev_faultpages()
+ * when looping over the range of addresses requested. The segment lock is held.
+ */
+static faultcode_t
+segdev_faultpage(
+ struct hat *hat, /* the hat */
+ struct seg *seg, /* seg_dev of interest */
+ caddr_t addr, /* address in as */
+ struct vpage *vpage, /* pointer to vpage for seg, addr */
+ enum fault_type type, /* type of fault */
+ enum seg_rw rw, /* type of access at fault */
+ devmap_handle_t *dhp) /* devmap handle if any for this page */
+{
+ struct segdev_data *sdp = (struct segdev_data *)seg->s_data;
+ uint_t prot;
+ pfn_t pfnum = PFN_INVALID;
+ u_offset_t offset;
+ uint_t hat_flags;
+ dev_info_t *dip;
+
+ TRACE_3(TR_FAC_DEVMAP, TR_DEVMAP_FAULTPAGE,
+ "segdev_faultpage: dhp=%p seg=%p addr=%p", dhp, seg, addr);
+ DEBUGF(8, (CE_CONT, "segdev_faultpage: dhp %p seg %p addr %p \n",
+ (void *)dhp, (void *)seg, (void *)addr));
+
+ /*
+ * Initialize protection value for this page.
+ * If we have per page protection values check it now.
+ */
+ if (sdp->pageprot) {
+ uint_t protchk;
+
+ switch (rw) {
+ case S_READ:
+ protchk = PROT_READ;
+ break;
+ case S_WRITE:
+ protchk = PROT_WRITE;
+ break;
+ case S_EXEC:
+ protchk = PROT_EXEC;
+ break;
+ case S_OTHER:
+ default:
+ protchk = PROT_READ | PROT_WRITE | PROT_EXEC;
+ break;
+ }
+
+ prot = VPP_PROT(vpage);
+ if ((prot & protchk) == 0)
+ return (FC_PROT); /* illegal access type */
+ } else {
+ prot = sdp->prot;
+ /* caller has already done segment level protection check */
+ }
+
+ if (type == F_SOFTLOCK) {
+ mutex_enter(&freemem_lock);
+ sdp->softlockcnt++;
+ mutex_exit(&freemem_lock);
+ }
+
+ hat_flags = ((type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD);
+ offset = sdp->offset + (u_offset_t)(addr - seg->s_base);
+ /*
+	 * In the devmap framework, sdp->mapfunc is set to NULL. We can get
+ * pfnum from dhp->dh_pfn (at beginning of segment) and offset from
+ * seg->s_base.
+ */
+ if (dhp == NULL) {
+ /* If segment has devmap_data, then dhp should be non-NULL */
+ ASSERT(sdp->devmap_data == NULL);
+ pfnum = (pfn_t)cdev_mmap(sdp->mapfunc, sdp->vp->v_rdev,
+ (off_t)offset, prot);
+ prot |= sdp->hat_attr;
+ } else {
+ ulong_t off;
+ struct ddi_umem_cookie *cp;
+ struct devmap_pmem_cookie *pcp;
+
+ /* ensure the dhp passed in contains addr. */
+ ASSERT(dhp == devmap_find_handle(
+ (devmap_handle_t *)sdp->devmap_data, addr));
+
+ off = addr - dhp->dh_uvaddr;
+
+ /*
+ * This routine assumes that the caller makes sure that the
+ * fields in dhp used below are unchanged due to remap during
+		 * this call. Caller does HOLD_DHP_LOCK if needed.
+ */
+ cp = dhp->dh_cookie;
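+		/*
+		 * Translate the fault offset into a pfn based on the cookie
+		 * type: device memory, pmem, or kernel/user memory.
+		 */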
+ if (dhp->dh_flags & DEVMAP_MAPPING_INVALID) {
+ pfnum = PFN_INVALID;
+ } else if (cookie_is_devmem(cp)) {
+ pfnum = dhp->dh_pfn + btop(off);
+ } else if (cookie_is_pmem(cp)) {
+ pcp = (struct devmap_pmem_cookie *)dhp->dh_pcookie;
+ ASSERT((dhp->dh_roff & PAGEOFFSET) == 0 &&
+ dhp->dh_roff < ptob(pcp->dp_npages));
+ pfnum = page_pptonum(
+ pcp->dp_pparray[btop(off + dhp->dh_roff)]);
+ } else {
+ ASSERT(dhp->dh_roff < cp->size);
+ ASSERT(dhp->dh_cvaddr >= cp->cvaddr &&
+ dhp->dh_cvaddr < (cp->cvaddr + cp->size));
+ ASSERT((dhp->dh_cvaddr + off) <=
+ (cp->cvaddr + cp->size));
+ ASSERT((dhp->dh_cvaddr + off + PAGESIZE) <=
+ (cp->cvaddr + cp->size));
+
+ switch (cp->type) {
+ case UMEM_LOCKED :
+ if (cp->pparray != NULL) {
+ ASSERT((dhp->dh_roff & PAGEOFFSET) == 0);
+ pfnum = page_pptonum(
+ cp->pparray[btop(off + dhp->dh_roff)]);
+ } else {
+ pfnum = hat_getpfnum(
+ ((proc_t *)cp->procp)->p_as->a_hat,
+ cp->cvaddr + off);
+ }
+ break;
+ case UMEM_TRASH :
+ pfnum = page_pptonum(trashpp);
+ /* We should set hat_flags to HAT_NOFAULT also */
+ /* However, not all hat layers implement this */
+ break;
+ case KMEM_PAGEABLE:
+ case KMEM_NON_PAGEABLE:
+ pfnum = hat_getpfnum(kas.a_hat,
+ dhp->dh_cvaddr + off);
+ break;
+ default :
+ pfnum = PFN_INVALID;
+ break;
+ }
+ }
+ prot |= dhp->dh_hat_attr;
+ }
+ if (pfnum == PFN_INVALID) {
+ return (FC_MAKE_ERR(EFAULT));
+ }
+ /* prot should already be OR'ed in with hat_attributes if needed */
+
+ TRACE_4(TR_FAC_DEVMAP, TR_DEVMAP_FAULTPAGE_CK1,
+ "segdev_faultpage: pfnum=%lx memory=%x prot=%x flags=%x",
+ pfnum, pf_is_memory(pfnum), prot, hat_flags);
+ DEBUGF(9, (CE_CONT, "segdev_faultpage: pfnum %lx memory %x "
+ "prot %x flags %x\n", pfnum, pf_is_memory(pfnum), prot, hat_flags));
+
+ if (pf_is_memory(pfnum) || (dhp != NULL)) {
+ /*
+ * It's not _really_ required here to pass sdp->hat_flags
+ * to hat_devload even though we do it.
+		 * This is because the hat figures out that DEVMEM mappings
+		 * are non-consistent anyway.
+ */
+ hat_devload(hat, addr, PAGESIZE, pfnum,
+ prot, hat_flags | sdp->hat_flags);
+ return (0);
+ }
+
+ /*
+ * Fall through to the case where devmap is not used and need to call
+ * up the device tree to set up the mapping
+ */
+
+ dip = VTOS(VTOCVP(sdp->vp))->s_dip;
+ ASSERT(dip);
+
+ /*
+	 * When calling ddi_map_fault, we do not OR in sdp->hat_attr.
+	 * This is because it calls into drivers which may not expect
+	 * prot to have any values other than PROT_ALL.
+	 * The root nexus driver has a hack to peek into the segment
+	 * structure and then OR in sdp->hat_attr.
+	 * XX In case the bus_ops interfaces are ever revisited,
+	 * we need to fix this; prot should include other hat attributes.
+ */
+ if (ddi_map_fault(dip, hat, seg, addr, NULL, pfnum, prot & PROT_ALL,
+ (uint_t)(type == F_SOFTLOCK)) != DDI_SUCCESS) {
+ return (FC_MAKE_ERR(EFAULT));
+ }
+ return (0);
+}
+
+static faultcode_t
+segdev_fault(
+ struct hat *hat, /* the hat */
+ struct seg *seg, /* the seg_dev of interest */
+ caddr_t addr, /* the address of the fault */
+ size_t len, /* the length of the range */
+ enum fault_type type, /* type of fault */
+ enum seg_rw rw) /* type of access at fault */
+{
+ struct segdev_data *sdp = (struct segdev_data *)seg->s_data;
+ devmap_handle_t *dhp_head = (devmap_handle_t *)sdp->devmap_data;
+ devmap_handle_t *dhp;
+ struct devmap_softlock *slock = NULL;
+ ulong_t slpage = 0;
+ ulong_t off;
+ caddr_t maddr = addr;
+ int err;
+ int err_is_faultcode = 0;
+
+ TRACE_5(TR_FAC_DEVMAP, TR_DEVMAP_FAULT,
+ "segdev_fault: dhp_head=%p seg=%p addr=%p len=%lx type=%x",
+ (void *)dhp_head, (void *)seg, (void *)addr, len, type);
+ DEBUGF(7, (CE_CONT, "segdev_fault: dhp_head %p seg %p "
+ "addr %p len %lx type %x\n",
+ (void *)dhp_head, (void *)seg, (void *)addr, len, type));
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ /* Handle non-devmap case */
+ if (dhp_head == NULL)
+ return (segdev_faultpages(hat, seg, addr, len, type, rw, NULL));
+
+ /* Find devmap handle */
+ if ((dhp = devmap_find_handle(dhp_head, addr)) == NULL)
+ return (FC_NOMAP);
+
+ /*
+	 * The seg_dev driver does not implement copy-on-write,
+	 * and always loads translations with maximal allowed permissions,
+	 * yet we got a fault trying to access the device.
+	 * Servicing the fault is not going to produce any better result.
+	 * RFE: If we want devmap_access callbacks to be involved in F_PROT
+	 *	faults, then the code below is written for that.
+	 *	Pending resolution of the following:
+	 *		- determine whether the F_INVAL/F_SOFTLOCK syncing
+	 *		is needed for F_PROT as well. The code below assumes it is.
+	 *		- If the driver sees F_PROT and calls devmap_load with the
+	 *		same type, then segdev_faultpages will fail with FC_PROT
+	 *		anyway; we need to change that so calls from devmap_load to
+	 *		segdev_faultpages for F_PROT type are retagged to F_INVAL.
+	 * RFE: Today we don't have drivers that use devmap and want to handle
+	 *	F_PROT calls. The code in segdev_fault* is written to allow
+	 *	this case but is not tested. A driver that needs this capability
+	 *	should be able to remove the short-circuit case; resolve the
+	 *	above issues and it "should" work.
+ */
+ if (type == F_PROT) {
+ return (FC_PROT);
+ }
+
+ /*
+ * Loop through dhp list calling devmap_access or segdev_faultpages for
+ * each devmap handle.
+ * drivers which implement devmap_access can interpose on faults and do
+ * device-appropriate special actions before calling devmap_load.
+ */
+
+ /*
+	 * Unfortunately, this simple loop has turned out to expose a variety
+	 * of complex problems which result in the following convoluted code.
+	 *
+	 * First, a desire to handle a serialization of F_SOFTLOCK calls
+	 * to the driver within the framework.
+	 *	This results in a dh_softlock structure that is on a per-device
+	 *	(or device instance) basis and serializes devmap_access calls.
+	 *	Ideally we would need to do this for the underlying
+	 *	memory/device regions that are being faulted on,
+	 *	but that is hard to identify, and with REMAP, harder.
+	 * Second, a desire to serialize F_INVAL (and F_PROT) calls w.r.t.
+	 *	F_SOFTLOCK calls to the driver.
+	 * These serializations are to simplify the driver programmer model.
+	 * To support these two features, the code first goes through the
+	 *	devmap handles and counts the pages (slpage) that are covered
+	 *	by devmap_access callbacks.
+	 * This part ends with a devmap_softlock_enter call
+	 *	which allows only one F_SOFTLOCK active on a device instance,
+	 *	but multiple F_INVAL/F_PROTs can be active except when an
+	 *	F_SOFTLOCK is active.
+	 *
+	 * Next, we don't short-circuit the fault code upfront to call
+	 *	segdev_softunlock for F_SOFTUNLOCK, because we must use
+	 *	the same length when we softlock and softunlock.
+	 *
+	 *	-Hat layers may not support softunlocking lengths less than the
+	 *	original length when there is large page support.
+	 *	-kpmem locking is dependent on keeping the lengths the same.
+	 *	-if drivers handled F_SOFTLOCK, they probably also expect to
+	 *		see an F_SOFTUNLOCK of the same length.
+	 *	Hence, if extending lengths during softlock,
+	 *	softunlock has to make the same adjustments and go through
+	 *	the same loop calling segdev_faultpages/segdev_softunlock,
+	 *	but some of the synchronization and error handling is different.
+ */
+
+ if (type != F_SOFTUNLOCK) {
+ devmap_handle_t *dhpp = dhp;
+ size_t slen = len;
+
+ /*
+ * Calculate count of pages that are :
+ * a) within the (potentially extended) fault region
+ * b) AND covered by devmap handle with devmap_access
+ */
+ off = (ulong_t)(addr - dhpp->dh_uvaddr);
+ while (slen != 0) {
+ size_t mlen;
+
+ /*
+ * Softlocking on a region that allows remap is
+ * unsupported due to unresolved locking issues
+ * XXX: unclear what these are?
+ * One potential is that if there is a pending
+ * softlock, then a remap should not be allowed
+			 *	until the unlock is done. This is easily
+			 *	fixed by returning an error in devmap_*_remap
+			 *	after checking the dh->dh_softlock->softlocked value.
+ */
+ if ((type == F_SOFTLOCK) &&
+ (dhpp->dh_flags & DEVMAP_ALLOW_REMAP)) {
+ return (FC_NOSUPPORT);
+ }
+
+ mlen = MIN(slen, (dhpp->dh_len - off));
+ if (dhpp->dh_callbackops.devmap_access) {
+ size_t llen;
+ caddr_t laddr;
+ /*
+ * use extended length for large page mappings
+ */
+ HOLD_DHP_LOCK(dhpp);
+ if ((sdp->pageprot == 0) &&
+ (dhpp->dh_flags & DEVMAP_FLAG_LARGE)) {
+ devmap_get_large_pgsize(dhpp,
+ mlen, maddr, &llen, &laddr);
+ } else {
+ llen = mlen;
+ }
+ RELE_DHP_LOCK(dhpp);
+
+ slpage += btopr(llen);
+ slock = dhpp->dh_softlock;
+ }
+ maddr += mlen;
+ ASSERT(slen >= mlen);
+ slen -= mlen;
+ dhpp = dhpp->dh_next;
+ off = 0;
+ }
+ /*
+		 * Synchronize with other faulting threads and wait until it
+		 * is safe; devmap_softlock_enter might return due to a signal
+		 * in cv_wait.
+		 *
+		 * devmap_softlock_enter has to be called outside of the while
+		 * loop to prevent a deadlock if len spans multiple dhps.
+		 * dh_softlock is based on device instance, and if multiple dhps
+		 * use the same device instance, the second dhp's LOCK call
+		 * will hang waiting on the first to complete.
+		 * devmap_setup verifies that the slocks in a dhp_chain are the
+		 * same.
+		 * RFE: this deadlock only holds true for F_SOFTLOCK. For
+		 * F_INVAL/F_PROT, since we now allow multiple in parallel,
+		 * we could have done the softlock_enter inside the loop
+		 * and supported multi-dhp mappings with dissimilar devices.
+ */
+ if (err = devmap_softlock_enter(slock, slpage, type))
+ return (FC_MAKE_ERR(err));
+ }
+
+ /* reset 'maddr' to the start addr of the range of fault. */
+ maddr = addr;
+
+	/* calculate the offset corresponding to 'addr' in the first dhp. */
+ off = (ulong_t)(addr - dhp->dh_uvaddr);
+
+ /*
+ * The fault length may span over multiple dhps.
+ * Loop until the total length is satisfied.
+ */
+ while (len != 0) {
+ size_t llen;
+ size_t mlen;
+ caddr_t laddr;
+
+ /*
+ * mlen is the smaller of 'len' and the length
+ * from addr to the end of mapping defined by dhp.
+ */
+ mlen = MIN(len, (dhp->dh_len - off));
+
+ HOLD_DHP_LOCK(dhp);
+ /*
+ * Pass the extended length and address to devmap_access
+ * if large pagesize is used for loading address translations.
+ */
+ if ((sdp->pageprot == 0) &&
+ (dhp->dh_flags & DEVMAP_FLAG_LARGE)) {
+ devmap_get_large_pgsize(dhp, mlen, maddr,
+ &llen, &laddr);
+ ASSERT(maddr == addr || laddr == maddr);
+ } else {
+ llen = mlen;
+ laddr = maddr;
+ }
+
+ if (dhp->dh_callbackops.devmap_access != NULL) {
+ offset_t aoff;
+
+ aoff = sdp->offset + (offset_t)(laddr - seg->s_base);
+
+ /*
+ * call driver's devmap_access entry point which will
+ * call devmap_load/contextmgmt to load the translations
+ *
+ * We drop the dhp_lock before calling access so
+ * drivers can call devmap_*_remap within access
+ */
+ RELE_DHP_LOCK(dhp);
+
+ err = (*dhp->dh_callbackops.devmap_access)(
+ dhp, (void *)dhp->dh_pvtp, aoff, llen, type, rw);
+ } else {
+ /*
+			 * If there is no devmap_access entry point, then load
+			 * the mappings; hold the dhp_lock across faultpages if REMAP.
+ */
+ err = segdev_faultpages(hat, seg, laddr, llen,
+ type, rw, dhp);
+ err_is_faultcode = 1;
+ RELE_DHP_LOCK(dhp);
+ }
+
+ if (err) {
+ if ((type == F_SOFTLOCK) && (maddr > addr)) {
+ /*
+				 * If this is not the first dhp, use
+				 * segdev_fault(F_SOFTUNLOCK) for the prior dhps.
+				 * While this is recursion, it is incorrect to
+				 * call just segdev_softunlock
+				 * if we are using either large pages
+				 * or devmap_access. It is more correct
+				 * to go through the same loop as above
+				 * rather than call segdev_softunlock directly.
+				 * It will use the right lengths as well as
+				 * call into the driver devmap_access routines.
+ */
+ size_t done = (size_t)(maddr - addr);
+ (void) segdev_fault(hat, seg, addr, done,
+ F_SOFTUNLOCK, S_OTHER);
+ /*
+ * reduce slpage by number of pages
+ * released by segdev_softunlock
+ */
+ ASSERT(slpage >= btopr(done));
+ devmap_softlock_exit(slock,
+ slpage - btopr(done), type);
+ } else {
+ devmap_softlock_exit(slock, slpage, type);
+ }
+
+
+ /*
+			 * segdev_faultpages() already returns a faultcode;
+			 * hence, the result from segdev_faultpages() should be
+			 * returned directly.
+ */
+ if (err_is_faultcode)
+ return (err);
+ return (FC_MAKE_ERR(err));
+ }
+
+ maddr += mlen;
+ ASSERT(len >= mlen);
+ len -= mlen;
+ dhp = dhp->dh_next;
+ off = 0;
+
+ ASSERT(!dhp || len == 0 || maddr == dhp->dh_uvaddr);
+ }
+ /*
+	 * release the softlock count at the end of the fault.
+	 * For F_SOFTLOCK this is done in the later F_SOFTUNLOCK.
+ */
+ if ((type == F_INVAL) || (type == F_PROT))
+ devmap_softlock_exit(slock, slpage, type);
+ return (0);
+}
+
+/*
+ * segdev_faultpages
+ *
+ * Used to fault in seg_dev segment pages. Called by segdev_fault or
+ * devmap_load.
+ * This routine assumes that the caller makes sure that the fields
+ * in dhp used below are not changed due to remap during this call.
+ * Caller does HOLD_DHP_LOCK if needed.
+ * This routine returns a faultcode_t as a return value for segdev_fault.
+ */
+static faultcode_t
+segdev_faultpages(
+ struct hat *hat, /* the hat */
+ struct seg *seg, /* the seg_dev of interest */
+ caddr_t addr, /* the address of the fault */
+ size_t len, /* the length of the range */
+ enum fault_type type, /* type of fault */
+ enum seg_rw rw, /* type of access at fault */
+ devmap_handle_t *dhp) /* devmap handle */
+{
+ register struct segdev_data *sdp = (struct segdev_data *)seg->s_data;
+ register caddr_t a;
+ struct vpage *vpage;
+ struct ddi_umem_cookie *kpmem_cookie = NULL;
+ int err;
+
+ TRACE_4(TR_FAC_DEVMAP, TR_DEVMAP_FAULTPAGES,
+ "segdev_faultpages: dhp=%p seg=%p addr=%p len=%lx",
+ (void *)dhp, (void *)seg, (void *)addr, len);
+ DEBUGF(5, (CE_CONT, "segdev_faultpages: "
+ "dhp %p seg %p addr %p len %lx\n",
+ (void *)dhp, (void *)seg, (void *)addr, len));
+
+ /*
+	 * The seg_dev driver does not implement copy-on-write,
+	 * and always loads translations with maximal allowed permissions,
+	 * yet we got a fault trying to access the device.
+	 * Servicing the fault is not going to produce any better result.
+	 * XXX: If we want to allow devmap_access to handle F_PROT calls,
+	 *	this code should be removed and the normal fault handling left
+	 *	to take care of finding the error.
+ */
+ if (type == F_PROT) {
+ return (FC_PROT);
+ }
+
+ if (type == F_SOFTUNLOCK) {
+ segdev_softunlock(hat, seg, addr, len, rw);
+ return (0);
+ }
+
+ /*
+ * For kernel pageable memory, fault/lock segkp pages
+ * We hold this until the completion of this
+ * fault (INVAL/PROT) or till unlock (SOFTLOCK).
+ */
+ if ((dhp != NULL) && dhp_is_kpmem(dhp)) {
+ kpmem_cookie = (struct ddi_umem_cookie *)dhp->dh_cookie;
+ if (err = acquire_kpmem_lock(kpmem_cookie, btopr(len)))
+ return (err);
+ }
+
+ /*
+ * If we have the same protections for the entire segment,
+	 * ensure that the access being attempted is legitimate.
+ */
+ mutex_enter(&sdp->lock);
+ if (sdp->pageprot == 0) {
+ uint_t protchk;
+
+ switch (rw) {
+ case S_READ:
+ protchk = PROT_READ;
+ break;
+ case S_WRITE:
+ protchk = PROT_WRITE;
+ break;
+ case S_EXEC:
+ protchk = PROT_EXEC;
+ break;
+ case S_OTHER:
+ default:
+ protchk = PROT_READ | PROT_WRITE | PROT_EXEC;
+ break;
+ }
+
+ if ((sdp->prot & protchk) == 0) {
+ mutex_exit(&sdp->lock);
+ /* undo kpmem locking */
+ if (kpmem_cookie != NULL) {
+ release_kpmem_lock(kpmem_cookie, btopr(len));
+ }
+ return (FC_PROT); /* illegal access type */
+ }
+ }
+
+ /*
+	 * We do a single hat_devload for the range if
+	 *	- devmap framework (dhp is not NULL),
+	 *	- pageprot == 0, i.e., no per-page protection set, and
+	 *	- it is device pages, irrespective of whether we are using large pages.
+ */
+ if ((sdp->pageprot == 0) && (dhp != NULL) && dhp_is_devmem(dhp)) {
+ pfn_t pfnum;
+ uint_t hat_flags;
+
+ if (dhp->dh_flags & DEVMAP_MAPPING_INVALID) {
+ mutex_exit(&sdp->lock);
+ return (FC_NOMAP);
+ }
+
+ if (type == F_SOFTLOCK) {
+ mutex_enter(&freemem_lock);
+ sdp->softlockcnt += btopr(len);
+ mutex_exit(&freemem_lock);
+ }
+
+ hat_flags = ((type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD);
+ pfnum = dhp->dh_pfn + btop((uintptr_t)(addr - dhp->dh_uvaddr));
+ ASSERT(!pf_is_memory(pfnum));
+
+ hat_devload(hat, addr, len, pfnum, sdp->prot | dhp->dh_hat_attr,
+ hat_flags | sdp->hat_flags);
+ mutex_exit(&sdp->lock);
+ return (0);
+ }
+
+ /* Handle cases where we have to loop through fault handling per-page */
+
+ if (sdp->vpage == NULL)
+ vpage = NULL;
+ else
+ vpage = &sdp->vpage[seg_page(seg, addr)];
+
+ /* loop over the address range handling each fault */
+ for (a = addr; a < addr + len; a += PAGESIZE) {
+ if (err = segdev_faultpage(hat, seg, a, vpage, type, rw, dhp)) {
+ break;
+ }
+ if (vpage != NULL)
+ vpage++;
+ }
+ mutex_exit(&sdp->lock);
+ if (err && (type == F_SOFTLOCK)) { /* error handling for F_SOFTLOCK */
+		size_t done = (size_t)(a - addr); /* pages faulted successfully */
+ if (done > 0) {
+ /* use softunlock for those pages */
+ segdev_softunlock(hat, seg, addr, done, S_OTHER);
+ }
+ if (kpmem_cookie != NULL) {
+ /* release kpmem lock for rest of pages */
+ ASSERT(len >= done);
+ release_kpmem_lock(kpmem_cookie, btopr(len - done));
+ }
+ } else if ((kpmem_cookie != NULL) && (type != F_SOFTLOCK)) {
+ /* for non-SOFTLOCK cases, release kpmem */
+ release_kpmem_lock(kpmem_cookie, btopr(len));
+ }
+ return (err);
+}
+
+/*
+ * Asynchronous page fault. We simply do nothing since this
+ * entry point is not supposed to load up the translation.
+ */
+/*ARGSUSED*/
+static faultcode_t
+segdev_faulta(struct seg *seg, caddr_t addr)
+{
+ TRACE_2(TR_FAC_DEVMAP, TR_DEVMAP_FAULTA,
+ "segdev_faulta: seg=%p addr=%p", (void *)seg, (void *)addr);
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ return (0);
+}
+
+static int
+segdev_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
+{
+ register struct segdev_data *sdp = (struct segdev_data *)seg->s_data;
+ register devmap_handle_t *dhp;
+ register struct vpage *vp, *evp;
+ devmap_handle_t *dhp_head = (devmap_handle_t *)sdp->devmap_data;
+ ulong_t off;
+ size_t mlen, sz;
+
+ TRACE_4(TR_FAC_DEVMAP, TR_DEVMAP_SETPROT,
+ "segdev_setprot:start seg=%p addr=%p len=%lx prot=%x",
+ (void *)seg, (void *)addr, len, prot);
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ if ((sz = sdp->softlockcnt) > 0 && dhp_head != NULL) {
+ /*
+ * Fail the setprot if pages are SOFTLOCKed through this
+ * mapping.
+ * Softlockcnt is protected from change by the as read lock.
+ */
+ TRACE_1(TR_FAC_DEVMAP, TR_DEVMAP_SETPROT_CK1,
+ "segdev_setprot:error softlockcnt=%lx", sz);
+ DEBUGF(1, (CE_CONT, "segdev_setprot: softlockcnt %ld\n", sz));
+ return (EAGAIN);
+ }
+
+ if (dhp_head != NULL) {
+ if ((dhp = devmap_find_handle(dhp_head, addr)) == NULL)
+ return (EINVAL);
+
+ /*
+		 * check if prot violates maxprot.
+ */
+ off = (ulong_t)(addr - dhp->dh_uvaddr);
+ mlen = len;
+ while (dhp) {
+ if ((dhp->dh_maxprot & prot) != prot)
+ return (EACCES); /* violated maxprot */
+
+ if (mlen > (dhp->dh_len - off)) {
+ mlen -= dhp->dh_len - off;
+ dhp = dhp->dh_next;
+ off = 0;
+ } else
+ break;
+ }
+ } else {
+ if ((sdp->maxprot & prot) != prot)
+ return (EACCES);
+ }
+
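+	/*
+	 * Update either the segment-wide protection or the per-page vpage
+	 * protections, depending on the range being changed.
+	 */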
+ mutex_enter(&sdp->lock);
+ if (addr == seg->s_base && len == seg->s_size && sdp->pageprot == 0) {
+ if (sdp->prot == prot) {
+ mutex_exit(&sdp->lock);
+ return (0); /* all done */
+ }
+ sdp->prot = (uchar_t)prot;
+ } else {
+ sdp->pageprot = 1;
+ if (sdp->vpage == NULL) {
+ /*
+ * First time through setting per page permissions,
+ * initialize all the vpage structures to prot
+ */
+ sdp->vpage = kmem_zalloc(vpgtob(seg_pages(seg)),
+ KM_SLEEP);
+ evp = &sdp->vpage[seg_pages(seg)];
+ for (vp = sdp->vpage; vp < evp; vp++)
+ VPP_SETPROT(vp, sdp->prot);
+ }
+ /*
+ * Now go change the needed vpages protections.
+ */
+ evp = &sdp->vpage[seg_page(seg, addr + len)];
+ for (vp = &sdp->vpage[seg_page(seg, addr)]; vp < evp; vp++)
+ VPP_SETPROT(vp, prot);
+ }
+ mutex_exit(&sdp->lock);
+
+ if (dhp_head != NULL) {
+ devmap_handle_t *tdhp;
+ /*
+ * If large page size was used in hat_devload(),
+ * the same page size must be used in hat_unload().
+ */
+ dhp = tdhp = devmap_find_handle(dhp_head, addr);
+ while (tdhp != NULL) {
+ if (tdhp->dh_flags & DEVMAP_FLAG_LARGE) {
+ break;
+ }
+ tdhp = tdhp->dh_next;
+ }
+ if (tdhp) {
+ size_t slen = len;
+ size_t mlen;
+ size_t soff;
+
+ soff = (ulong_t)(addr - dhp->dh_uvaddr);
+ while (slen != 0) {
+ mlen = MIN(slen, (dhp->dh_len - soff));
+ hat_unload(seg->s_as->a_hat, dhp->dh_uvaddr,
+ dhp->dh_len, HAT_UNLOAD);
+ dhp = dhp->dh_next;
+ ASSERT(slen >= mlen);
+ slen -= mlen;
+ soff = 0;
+ }
+ return (0);
+ }
+ }
+
+ if ((prot & ~PROT_USER) == PROT_NONE) {
+ hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD);
+ } else {
+ /*
+ * RFE: the segment should keep track of all attributes
+ * allowing us to remove the deprecated hat_chgprot
+ * and use hat_chgattr.
+ */
+ hat_chgprot(seg->s_as->a_hat, addr, len, prot);
+ }
+
+ return (0);
+}
+
+static int
+segdev_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
+{
+ struct segdev_data *sdp = (struct segdev_data *)seg->s_data;
+ struct vpage *vp, *evp;
+
+ TRACE_4(TR_FAC_DEVMAP, TR_DEVMAP_CHECKPROT,
+ "segdev_checkprot:start seg=%p addr=%p len=%lx prot=%x",
+ (void *)seg, (void *)addr, len, prot);
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ /*
+	 * If the segment protections can be used, simply check against them.
+ */
+ mutex_enter(&sdp->lock);
+ if (sdp->pageprot == 0) {
+ register int err;
+
+ err = ((sdp->prot & prot) != prot) ? EACCES : 0;
+ mutex_exit(&sdp->lock);
+ return (err);
+ }
+
+ /*
+ * Have to check down to the vpage level
+ */
+ evp = &sdp->vpage[seg_page(seg, addr + len)];
+ for (vp = &sdp->vpage[seg_page(seg, addr)]; vp < evp; vp++) {
+ if ((VPP_PROT(vp) & prot) != prot) {
+ mutex_exit(&sdp->lock);
+ return (EACCES);
+ }
+ }
+ mutex_exit(&sdp->lock);
+ return (0);
+}
+
+static int
+segdev_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
+{
+ struct segdev_data *sdp = (struct segdev_data *)seg->s_data;
+ size_t pgno;
+
+ TRACE_4(TR_FAC_DEVMAP, TR_DEVMAP_GETPROT,
+ "segdev_getprot:start seg=%p addr=%p len=%lx protv=%p",
+ (void *)seg, (void *)addr, len, (void *)protv);
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1;
+ if (pgno != 0) {
+ mutex_enter(&sdp->lock);
+ if (sdp->pageprot == 0) {
+ do
+ protv[--pgno] = sdp->prot;
+ while (pgno != 0);
+ } else {
+ size_t pgoff = seg_page(seg, addr);
+
+ do {
+ pgno--;
+ protv[pgno] =
+ VPP_PROT(&sdp->vpage[pgno + pgoff]);
+ } while (pgno != 0);
+ }
+ mutex_exit(&sdp->lock);
+ }
+ return (0);
+}
+
+static u_offset_t
+segdev_getoffset(register struct seg *seg, caddr_t addr)
+{
+ register struct segdev_data *sdp = (struct segdev_data *)seg->s_data;
+
+ TRACE_2(TR_FAC_DEVMAP, TR_DEVMAP_GETOFFSET,
+ "segdev_getoffset:start seg=%p addr=%p", (void *)seg, (void *)addr);
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ return ((u_offset_t)sdp->offset + (addr - seg->s_base));
+}
+
+/*ARGSUSED*/
+static int
+segdev_gettype(register struct seg *seg, caddr_t addr)
+{
+ register struct segdev_data *sdp = (struct segdev_data *)seg->s_data;
+
+ TRACE_2(TR_FAC_DEVMAP, TR_DEVMAP_GETTYPE,
+ "segdev_gettype:start seg=%p addr=%p", (void *)seg, (void *)addr);
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ return (sdp->type);
+}
+
+
+/*ARGSUSED*/
+static int
+segdev_getvp(register struct seg *seg, caddr_t addr, struct vnode **vpp)
+{
+ register struct segdev_data *sdp = (struct segdev_data *)seg->s_data;
+
+ TRACE_2(TR_FAC_DEVMAP, TR_DEVMAP_GETVP,
+ "segdev_getvp:start seg=%p addr=%p", (void *)seg, (void *)addr);
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ /*
+ * Note that this vp is the common_vp of the device, where the
+	 * pages are hung.
+ */
+ *vpp = VTOCVP(sdp->vp);
+
+ return (0);
+}
+
+static void
+segdev_badop(void)
+{
+ TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_SEGDEV_BADOP,
+ "segdev_badop:start");
+ panic("segdev_badop");
+ /*NOTREACHED*/
+}
+
+/*
+ * segdev pages are not in the cache, and thus can't really be controlled.
+ * Hence, syncs are simply always successful.
+ */
+/*ARGSUSED*/
+static int
+segdev_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
+{
+ TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_SYNC, "segdev_sync:start");
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ return (0);
+}
+
+/*
+ * segdev pages are always "in core".
+ */
+/*ARGSUSED*/
+static size_t
+segdev_incore(struct seg *seg, caddr_t addr, size_t len, char *vec)
+{
+ size_t v = 0;
+
+ TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_INCORE, "segdev_incore:start");
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ for (len = (len + PAGEOFFSET) & PAGEMASK; len; len -= PAGESIZE,
+ v += PAGESIZE)
+ *vec++ = 1;
+ return (v);
+}
+
+/*
+ * segdev pages are not in the cache, and thus can't really be controlled.
+ * Hence, locks are simply always successful.
+ */
+/*ARGSUSED*/
+static int
+segdev_lockop(struct seg *seg, caddr_t addr,
+ size_t len, int attr, int op, ulong_t *lockmap, size_t pos)
+{
+ TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_LOCKOP, "segdev_lockop:start");
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ return (0);
+}
+
+/*
+ * segdev pages are not in the cache, and thus can't really be controlled.
+ * Hence, advise is simply always successful.
+ */
+/*ARGSUSED*/
+static int
+segdev_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
+{
+ TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_ADVISE, "segdev_advise:start");
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ return (0);
+}
+
+/*
+ * segdev pages are not dumped, so we just return
+ */
+/*ARGSUSED*/
+static void
+segdev_dump(struct seg *seg)
+{}
+
+/*
+ * ddi_segmap_setup:	Used by drivers who wish to specify mapping attributes
+ *			for a segment.	Called from a driver's segmap(9E)
+ *			routine.
+ */
+/*ARGSUSED*/
+int
+ddi_segmap_setup(dev_t dev, off_t offset, struct as *as, caddr_t *addrp,
+ off_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cred,
+ ddi_device_acc_attr_t *accattrp, uint_t rnumber)
+{
+ struct segdev_crargs dev_a;
+ int (*mapfunc)(dev_t dev, off_t off, int prot);
+ uint_t hat_attr;
+ pfn_t pfn;
+ int error, i;
+
+ TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_SEGMAP_SETUP,
+ "ddi_segmap_setup:start");
+
+ if ((mapfunc = devopsp[getmajor(dev)]->devo_cb_ops->cb_mmap) == nodev)
+ return (ENODEV);
+
+ /*
+ * Character devices that support the d_mmap
+ * interface can only be mmap'ed shared.
+ */
+ if ((flags & MAP_TYPE) != MAP_SHARED)
+ return (EINVAL);
+
+ /*
+ * Check that this region is indeed mappable on this platform.
+ * Use the mapping function.
+ */
+ if (ddi_device_mapping_check(dev, accattrp, rnumber, &hat_attr) == -1)
+ return (ENXIO);
+
+ /*
+ * Check to ensure that the entire range is
+ * legal and we are not trying to map in
+ * more than the device will let us.
+ */
+ for (i = 0; i < len; i += PAGESIZE) {
+ if (i == 0) {
+ /*
+ * Save the pfn at offset here. This pfn will be
+ * used later to get user address.
+ */
+ if ((pfn = (pfn_t)cdev_mmap(mapfunc, dev, offset,
+ maxprot)) == PFN_INVALID)
+ return (ENXIO);
+ } else {
+ if (cdev_mmap(mapfunc, dev, offset + i, maxprot) ==
+ PFN_INVALID)
+ return (ENXIO);
+ }
+ }
+
+ as_rangelock(as);
+ if ((flags & MAP_FIXED) == 0) {
+ /*
+ * Pick an address w/o worrying about
+ * any vac alignment constraints.
+ */
+ map_addr(addrp, len, ptob(pfn), 0, flags);
+ if (*addrp == NULL) {
+ as_rangeunlock(as);
+ return (ENOMEM);
+ }
+ } else {
+ /*
+ * User-specified address; blow away any previous mappings.
+ */
+ (void) as_unmap(as, *addrp, len);
+ }
+
+ dev_a.mapfunc = mapfunc;
+ dev_a.dev = dev;
+ dev_a.offset = (offset_t)offset;
+ dev_a.type = flags & MAP_TYPE;
+ dev_a.prot = (uchar_t)prot;
+ dev_a.maxprot = (uchar_t)maxprot;
+ dev_a.hat_attr = hat_attr;
+ dev_a.hat_flags = 0;
+ dev_a.devmap_data = NULL;
+
+ error = as_map(as, *addrp, len, segdev_create, &dev_a);
+ as_rangeunlock(as);
+ return (error);
+
+}
+
+/*ARGSUSED*/
+static int
+segdev_pagelock(struct seg *seg, caddr_t addr, size_t len,
+ struct page ***ppp, enum lock_type type, enum seg_rw rw)
+{
+ TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_PAGELOCK,
+ "segdev_pagelock:start");
+ return (ENOTSUP);
+}
+
+/*ARGSUSED*/
+static int
+segdev_setpagesize(struct seg *seg, caddr_t addr, size_t len,
+ uint_t szc)
+{
+ return (ENOTSUP);
+}
+
+/*
+ * devmap_device: Used by the devmap framework to establish a mapping;
+ *                called by devmap_setup(9F) during map setup time.
+ */
+/*ARGSUSED*/
+static int
+devmap_device(devmap_handle_t *dhp, struct as *as, caddr_t *addr,
+ offset_t off, size_t len, uint_t flags)
+{
+ devmap_handle_t *rdhp, *maxdhp;
+ struct segdev_crargs dev_a;
+ int err;
+ uint_t maxprot = PROT_ALL;
+ offset_t offset = 0;
+ pfn_t pfn;
+ struct devmap_pmem_cookie *pcp;
+
+ TRACE_4(TR_FAC_DEVMAP, TR_DEVMAP_DEVICE,
+ "devmap_device:start dhp=%p addr=%p off=%llx, len=%lx",
+ (void *)dhp, (void *)addr, off, len);
+
+ DEBUGF(2, (CE_CONT, "devmap_device: dhp %p addr %p off %llx len %lx\n",
+ (void *)dhp, (void *)addr, off, len));
+
+ as_rangelock(as);
+ if ((flags & MAP_FIXED) == 0) {
+ offset_t aligned_off;
+
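+		/*
+		 * Scan the handle chain for the handle with the largest
+		 * length; its cookie is used below to compute the alignment
+		 * hint passed to map_addr().
+		 */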
+ rdhp = maxdhp = dhp;
+ while (rdhp != NULL) {
+ maxdhp = (maxdhp->dh_len > rdhp->dh_len) ?
+ maxdhp : rdhp;
+ rdhp = rdhp->dh_next;
+ maxprot |= dhp->dh_maxprot;
+ }
+ offset = maxdhp->dh_uoff - dhp->dh_uoff;
+
+ /*
+ * Use the dhp that has the
+ * largest len to get user address.
+ */
+ /*
+ * If MAPPING_INVALID, cannot use dh_pfn/dh_cvaddr,
+ * use 0 which is as good as any other.
+ */
+ if (maxdhp->dh_flags & DEVMAP_MAPPING_INVALID) {
+ aligned_off = (offset_t)0;
+ } else if (dhp_is_devmem(maxdhp)) {
+ aligned_off = (offset_t)ptob(maxdhp->dh_pfn) - offset;
+ } else if (dhp_is_pmem(maxdhp)) {
+ pcp = (struct devmap_pmem_cookie *)maxdhp->dh_pcookie;
+ pfn = page_pptonum(
+ pcp->dp_pparray[btop(maxdhp->dh_roff)]);
+ aligned_off = (offset_t)ptob(pfn) - offset;
+ } else {
+ aligned_off = (offset_t)(uintptr_t)maxdhp->dh_cvaddr -
+ offset;
+ }
+
+ /*
+ * Pick an address aligned to dh_cookie.
+ * for kernel memory/user memory, cookie is cvaddr.
+ * for device memory, cookie is physical address.
+ */
+ map_addr(addr, len, aligned_off, 1, flags);
+ if (*addr == NULL) {
+ as_rangeunlock(as);
+ return (ENOMEM);
+ }
+ } else {
+ /*
+ * User-specified address; blow away any previous mappings.
+ */
+ (void) as_unmap(as, *addr, len);
+ }
+
+ dev_a.mapfunc = NULL;
+ dev_a.dev = dhp->dh_dev;
+ dev_a.type = flags & MAP_TYPE;
+ dev_a.offset = off;
+ /*
+	 * sdp->maxprot has the least restrictive protection of all dhps.
+ */
+ dev_a.maxprot = maxprot;
+ dev_a.prot = dhp->dh_prot;
+ /*
+ * devmap uses dhp->dh_hat_attr for hat.
+ */
+ dev_a.hat_flags = 0;
+ dev_a.hat_attr = 0;
+ dev_a.devmap_data = (void *)dhp;
+
+ err = as_map(as, *addr, len, segdev_create, &dev_a);
+ as_rangeunlock(as);
+ return (err);
+}
+
+int
+devmap_do_ctxmgt(devmap_cookie_t dhc, void *pvtp, offset_t off, size_t len,
+ uint_t type, uint_t rw, int (*ctxmgt)(devmap_cookie_t, void *, offset_t,
+ size_t, uint_t, uint_t))
+{
+ register devmap_handle_t *dhp = (devmap_handle_t *)dhc;
+ struct devmap_ctx *devctx;
+ int do_timeout = 0;
+ int ret;
+
+#ifdef lint
+ pvtp = pvtp;
+#endif
+
+ TRACE_3(TR_FAC_DEVMAP, TR_DEVMAP_DO_CTXMGT,
+ "devmap_do_ctxmgt:start dhp=%p off=%llx, len=%lx",
+ (void *)dhp, off, len);
+ DEBUGF(7, (CE_CONT, "devmap_do_ctxmgt: dhp %p off %llx len %lx\n",
+ (void *)dhp, off, len));
+
+ if (ctxmgt == NULL)
+ return (FC_HWERR);
+
+ devctx = dhp->dh_ctx;
+
+ /*
+ * If we are on an MP system with more than one cpu running
+ * and if a thread on some CPU already has the context, wait
+ * for it to finish if there is a hysteresis timeout.
+ *
+ * We call cv_wait() instead of cv_wait_sig() because
+ * it does not matter much if it returned due to a signal
+ * or due to a cv_signal() or cv_broadcast(). In either event
+	 * we need to complete the mapping; otherwise the process
+	 * will die with a SEGV.
+ */
+ if ((dhp->dh_timeout_length > 0) && (ncpus > 1)) {
+ TRACE_2(TR_FAC_DEVMAP, TR_DEVMAP_DO_CTXMGT_CK1,
+ "devmap_do_ctxmgt:doing hysteresis, devctl %p dhp %p",
+ devctx, dhp);
+ do_timeout = 1;
+ mutex_enter(&devctx->lock);
+ while (devctx->oncpu)
+ cv_wait(&devctx->cv, &devctx->lock);
+ devctx->oncpu = 1;
+ mutex_exit(&devctx->lock);
+ }
+
+ /*
+ * Call the contextmgt callback so that the driver can handle
+ * the fault.
+ */
+ ret = (*ctxmgt)(dhp, dhp->dh_pvtp, off, len, type, rw);
+
+ /*
+ * If devmap_access() returned -1, then there was a hardware
+ * error so we need to convert the return value to something
+ * that trap() will understand. Otherwise, the return value
+ * is already a fault code generated by devmap_unload()
+ * or devmap_load().
+ */
+ if (ret) {
+ TRACE_3(TR_FAC_DEVMAP, TR_DEVMAP_DO_CTXMGT_CK2,
+ "devmap_do_ctxmgt: ret=%x dhp=%p devctx=%p",
+ ret, dhp, devctx);
+ DEBUGF(1, (CE_CONT, "devmap_do_ctxmgt: ret %x dhp %p\n",
+ ret, (void *)dhp));
+ if (devctx->oncpu) {
+ mutex_enter(&devctx->lock);
+ devctx->oncpu = 0;
+ cv_signal(&devctx->cv);
+ mutex_exit(&devctx->lock);
+ }
+ return (FC_HWERR);
+ }
+
+ /*
+ * Setup the timeout if we need to
+ */
+ if (do_timeout) {
+ mutex_enter(&devctx->lock);
+ if (dhp->dh_timeout_length > 0) {
+ TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_DO_CTXMGT_CK3,
+ "devmap_do_ctxmgt:timeout set");
+ devctx->timeout = timeout(devmap_ctxto,
+ devctx, dhp->dh_timeout_length);
+ } else {
+ /*
+ * We don't want to wait so set oncpu to
+ * 0 and wake up anyone waiting.
+ */
+ TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_DO_CTXMGT_CK4,
+ "devmap_do_ctxmgt:timeout not set");
+ devctx->oncpu = 0;
+ cv_signal(&devctx->cv);
+ }
+ mutex_exit(&devctx->lock);
+ }
+
+ return (DDI_SUCCESS);
+}
+
+/*
+ * end of mapping
+ * poff fault_offset |
+ * base | | |
+ * | | | |
+ * V V V V
+ * +-----------+---------------+-------+---------+-------+
+ * ^ ^ ^ ^
+ * |<--- offset--->|<-len->| |
+ * |<--- dh_len(size of mapping) --->|
+ * |<-- pg -->|
+ * -->|rlen|<--
+ */
+static ulong_t
+devmap_roundup(devmap_handle_t *dhp, ulong_t offset, size_t len,
+ ulong_t *opfn, ulong_t *pagesize)
+{
+ register int level;
+ ulong_t pg;
+ ulong_t poff;
+ ulong_t base;
+ caddr_t uvaddr;
+ long rlen;
+
+ TRACE_3(TR_FAC_DEVMAP, TR_DEVMAP_ROUNDUP,
+ "devmap_roundup:start dhp=%p off=%lx len=%lx",
+ (void *)dhp, offset, len);
+ DEBUGF(2, (CE_CONT, "devmap_roundup: dhp %p off %lx len %lx\n",
+ (void *)dhp, offset, len));
+
+ /*
+ * get the max. pagesize that is aligned within the range
+ * <dh_pfn, dh_pfn+offset>.
+ *
+	 * The calculations below use the physical address to determine
+	 * the page size to use. The same calculations could use the
+	 * virtual address to determine the page size.
+ */
+ base = (ulong_t)ptob(dhp->dh_pfn);
+ for (level = dhp->dh_mmulevel; level >= 0; level--) {
+ pg = page_get_pagesize(level);
+ poff = ((base + offset) & ~(pg - 1));
+ uvaddr = dhp->dh_uvaddr + (poff - base);
+ if ((poff >= base) &&
+ ((poff + pg) <= (base + dhp->dh_len)) &&
+ VA_PA_ALIGNED((uintptr_t)uvaddr, poff, pg))
+ break;
+ }
+
+ TRACE_3(TR_FAC_DEVMAP, TR_DEVMAP_ROUNDUP_CK1,
+ "devmap_roundup: base=%lx poff=%lx dhp=%p",
+ base, poff, dhp);
+ DEBUGF(2, (CE_CONT, "devmap_roundup: base %lx poff %lx pfn %lx\n",
+ base, poff, dhp->dh_pfn));
+
+ ASSERT(VA_PA_ALIGNED((uintptr_t)uvaddr, poff, pg));
+ ASSERT(level >= 0);
+
+ *pagesize = pg;
+ *opfn = dhp->dh_pfn + btop(poff - base);
+
+ rlen = len + offset - (poff - base + pg);
+
+ ASSERT(rlen < (long)len);
+
+ TRACE_5(TR_FAC_DEVMAP, TR_DEVMAP_ROUNDUP_CK2,
+ "devmap_roundup:ret dhp=%p level=%x rlen=%lx psiz=%p opfn=%p",
+ (void *)dhp, level, rlen, pagesize, opfn);
+ DEBUGF(1, (CE_CONT, "devmap_roundup: dhp %p "
+ "level %x rlen %lx psize %lx opfn %lx\n",
+ (void *)dhp, level, rlen, *pagesize, *opfn));
+
+ return ((ulong_t)((rlen > 0) ? rlen : 0));
+}
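+
+/*
+ * Illustrative worked example (not from the original source; the numbers
+ * are hypothetical).  Suppose ptob(dh_pfn) == 0x40000000, dh_len ==
+ * 0x800000 (8MB), the mmu offers a 4MB (0x400000) page at dh_mmulevel,
+ * and a fault arrives with offset == 0x250000 and len == 0x2000.
+ * Then, assuming the VA/PA alignment check passes:
+ *
+ *	pg   = 0x400000
+ *	poff = (0x40000000 + 0x250000) & ~(pg - 1) = 0x40000000
+ *	poff >= base and poff + pg <= base + dh_len, so the 4MB page is used
+ *	*opfn = dh_pfn, *pagesize = 0x400000
+ *	rlen = len + offset - (poff - base + pg)
+ *	     = 0x2000 + 0x250000 - 0x400000 < 0, so 0 is returned
+ *
+ * i.e. a single 4MB page covers the entire fault and there is no residue
+ * left for devmap_get_large_pgsize() to process in its next iteration.
+ */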
+
+/*
+ * find the dhp that contains addr.
+ */
+static devmap_handle_t *
+devmap_find_handle(devmap_handle_t *dhp_head, caddr_t addr)
+{
+ devmap_handle_t *dhp;
+
+ TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_FIND_HANDLE,
+ "devmap_find_handle:start");
+
+ dhp = dhp_head;
+ while (dhp) {
+ if (addr >= dhp->dh_uvaddr &&
+ addr < (dhp->dh_uvaddr + dhp->dh_len))
+ return (dhp);
+ dhp = dhp->dh_next;
+ }
+
+ return ((devmap_handle_t *)NULL);
+}
+
+/*
+ * devmap_unload:
+ * Marks the entire segdev segment, or just the pages in the range
+ * offset -> offset+len when that range is not the whole segment,
+ * as intercept, and unloads the pages in that range.
+ */
+int
+devmap_unload(devmap_cookie_t dhc, offset_t offset, size_t len)
+{
+ register devmap_handle_t *dhp = (devmap_handle_t *)dhc;
+ caddr_t addr;
+ ulong_t size;
+ ssize_t soff;
+
+ TRACE_3(TR_FAC_DEVMAP, TR_DEVMAP_UNLOAD,
+ "devmap_unload:start dhp=%p offset=%llx len=%lx",
+ (void *)dhp, offset, len);
+ DEBUGF(7, (CE_CONT, "devmap_unload: dhp %p offset %llx len %lx\n",
+ (void *)dhp, offset, len));
+
+ soff = (ssize_t)(offset - dhp->dh_uoff);
+ soff = round_down_p2(soff, PAGESIZE);
+ if (soff < 0 || soff >= dhp->dh_len)
+ return (FC_MAKE_ERR(EINVAL));
+
+ /*
+ * Address and size must be page aligned. Len is rounded up to the
+ * number of bytes in the pages required to cover len. Offset is
+ * rounded down to the byte offset of the first byte of the page
+ * that contains offset.
+ */
+ len = round_up_p2(len, PAGESIZE);
+
+ /*
+ * If len == 0, then calculate the size by getting
+ * the number of bytes from offset to the end of the segment.
+ */
+ if (len == 0)
+ size = dhp->dh_len - soff;
+ else {
+ size = len;
+ if ((soff + size) > dhp->dh_len)
+ return (FC_MAKE_ERR(EINVAL));
+ }
+
+ /*
+ * The address is offset bytes from the base address of
+ * the dhp.
+ */
+ addr = (caddr_t)(soff + dhp->dh_uvaddr);
+
+ /*
+ * If large page size was used in hat_devload(),
+ * the same page size must be used in hat_unload().
+ */
+ if (dhp->dh_flags & DEVMAP_FLAG_LARGE) {
+ hat_unload(dhp->dh_seg->s_as->a_hat, dhp->dh_uvaddr,
+ dhp->dh_len, HAT_UNLOAD|HAT_UNLOAD_OTHER);
+ } else {
+ hat_unload(dhp->dh_seg->s_as->a_hat, addr, size,
+ HAT_UNLOAD|HAT_UNLOAD_OTHER);
+ }
+
+ return (0);
+}
+
+/*
+ * calculates the optimal page size that will be used for hat_devload().
+ */
+static void
+devmap_get_large_pgsize(devmap_handle_t *dhp, size_t len, caddr_t addr,
+ size_t *llen, caddr_t *laddr)
+{
+ ulong_t off;
+ ulong_t pfn;
+ ulong_t pgsize;
+ uint_t first = 1;
+
+ TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_GET_LARGE_PGSIZE,
+ "devmap_get_large_pgsize:start");
+
+ /*
+ * RFE - Code only supports large page mappings for devmem.
+ * This code could be changed in the future if we want to support
+ * large page mappings for kernel exported memory.
+ */
+ ASSERT(dhp_is_devmem(dhp));
+ ASSERT(!(dhp->dh_flags & DEVMAP_MAPPING_INVALID));
+
+ *llen = 0;
+ off = (ulong_t)(addr - dhp->dh_uvaddr);
+ while ((long)len > 0) {
+ /*
+ * get the optimal pfn to minimize address translations.
+ * devmap_roundup() returns residue bytes for next round
+ * calculations.
+ */
+ len = devmap_roundup(dhp, off, len, &pfn, &pgsize);
+
+ if (first) {
+ *laddr = dhp->dh_uvaddr + ptob(pfn - dhp->dh_pfn);
+ first = 0;
+ }
+
+ *llen += pgsize;
+ off = ptob(pfn - dhp->dh_pfn) + pgsize;
+ }
+ /* Large page mapping len/addr covers more range than the original fault */
+ ASSERT(*llen >= len && *laddr <= addr);
+ ASSERT((*laddr + *llen) >= (addr + len));
+}
+
+/*
+ * Initialize the devmap_softlock structure.
+ */
+static struct devmap_softlock *
+devmap_softlock_init(dev_t dev, ulong_t id)
+{
+ struct devmap_softlock *slock;
+ struct devmap_softlock *tmp;
+
+ TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_SOFTLOCK_INIT,
+ "devmap_softlock_init:start");
+
+ tmp = kmem_zalloc(sizeof (struct devmap_softlock), KM_SLEEP);
+ mutex_enter(&devmap_slock);
+
+ for (slock = devmap_slist; slock != NULL; slock = slock->next)
+ if ((slock->dev == dev) && (slock->id == id))
+ break;
+
+ if (slock == NULL) {
+ slock = tmp;
+ slock->dev = dev;
+ slock->id = id;
+ mutex_init(&slock->lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&slock->cv, NULL, CV_DEFAULT, NULL);
+ slock->next = devmap_slist;
+ devmap_slist = slock;
+ } else
+ kmem_free(tmp, sizeof (struct devmap_softlock));
+
+ mutex_enter(&slock->lock);
+ slock->refcnt++;
+ mutex_exit(&slock->lock);
+ mutex_exit(&devmap_slock);
+
+ return (slock);
+}
+
+/*
+ * Wake up processes that sleep on softlocked.
+ * Free dh_softlock if refcnt is 0.
+ */
+static void
+devmap_softlock_rele(devmap_handle_t *dhp)
+{
+ struct devmap_softlock *slock = dhp->dh_softlock;
+ struct devmap_softlock *tmp;
+ struct devmap_softlock *parent;
+
+ TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_SOFTLOCK_RELE,
+ "devmap_softlock_rele:start");
+
+ mutex_enter(&devmap_slock);
+ mutex_enter(&slock->lock);
+
+ ASSERT(slock->refcnt > 0);
+
+ slock->refcnt--;
+
+ /*
+ * If no one is using the device, free up the slock data.
+ */
+ if (slock->refcnt == 0) {
+ slock->softlocked = 0;
+ cv_signal(&slock->cv);
+
+ if (devmap_slist == slock)
+ devmap_slist = slock->next;
+ else {
+ parent = devmap_slist;
+ for (tmp = devmap_slist->next; tmp != NULL;
+ tmp = tmp->next) {
+ if (tmp == slock) {
+ parent->next = tmp->next;
+ break;
+ }
+ parent = tmp;
+ }
+ }
+ mutex_exit(&slock->lock);
+ mutex_destroy(&slock->lock);
+ cv_destroy(&slock->cv);
+ kmem_free(slock, sizeof (struct devmap_softlock));
+ } else
+ mutex_exit(&slock->lock);
+
+ mutex_exit(&devmap_slock);
+}
+
+/*
+ * Wake up processes that sleep on dh_ctx->locked.
+ * Free dh_ctx if refcnt is 0.
+ */
+static void
+devmap_ctx_rele(devmap_handle_t *dhp)
+{
+ struct devmap_ctx *devctx = dhp->dh_ctx;
+ struct devmap_ctx *tmp;
+ struct devmap_ctx *parent;
+ timeout_id_t tid;
+
+ TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_CTX_RELE,
+ "devmap_ctx_rele:start");
+
+ mutex_enter(&devmapctx_lock);
+ mutex_enter(&devctx->lock);
+
+ ASSERT(devctx->refcnt > 0);
+
+ devctx->refcnt--;
+
+ /*
+ * If no one is using the device, free up the devctx data.
+ */
+ if (devctx->refcnt == 0) {
+ /*
+ * Untimeout any threads using this mapping as they are about
+ * to go away.
+ */
+ if (devctx->timeout != 0) {
+ TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_CTX_RELE_CK1,
+ "devmap_ctx_rele:untimeout ctx->timeout");
+
+ tid = devctx->timeout;
+ mutex_exit(&devctx->lock);
+ (void) untimeout(tid);
+ mutex_enter(&devctx->lock);
+ }
+
+ devctx->oncpu = 0;
+ cv_signal(&devctx->cv);
+
+ if (devmapctx_list == devctx)
+ devmapctx_list = devctx->next;
+ else {
+ parent = devmapctx_list;
+ for (tmp = devmapctx_list->next; tmp != NULL;
+ tmp = tmp->next) {
+ if (tmp == devctx) {
+ parent->next = tmp->next;
+ break;
+ }
+ parent = tmp;
+ }
+ }
+ mutex_exit(&devctx->lock);
+ mutex_destroy(&devctx->lock);
+ cv_destroy(&devctx->cv);
+ kmem_free(devctx, sizeof (struct devmap_ctx));
+ } else
+ mutex_exit(&devctx->lock);
+
+ mutex_exit(&devmapctx_lock);
+}
+
+/*
+ * devmap_load:
+ * Marks the entire segdev segment, or just the pages in the range
+ * offset -> offset+len when that range is not the whole segment,
+ * as nointercept, and faults in the pages in that range.
+ */
+int
+devmap_load(devmap_cookie_t dhc, offset_t offset, size_t len, uint_t type,
+ uint_t rw)
+{
+ devmap_handle_t *dhp = (devmap_handle_t *)dhc;
+ struct as *asp = dhp->dh_seg->s_as;
+ caddr_t addr;
+ ulong_t size;
+ ssize_t soff; /* offset from the beginning of the segment */
+ int rc;
+
+ TRACE_3(TR_FAC_DEVMAP, TR_DEVMAP_LOAD,
+ "devmap_load:start dhp=%p offset=%llx len=%lx",
+ (void *)dhp, offset, len);
+
+ DEBUGF(7, (CE_CONT, "devmap_load: dhp %p offset %llx len %lx\n",
+ (void *)dhp, offset, len));
+
+ /*
+ * The hat layer only supports devload to a process' context for which
+ * the as lock is held. Verify that here and return an error if drivers
+ * inadvertently call devmap_load on a wrong devmap handle.
+ */
+ if ((asp != &kas) && !AS_LOCK_HELD(asp, &asp->a_lock))
+ return (FC_MAKE_ERR(EINVAL));
+
+ soff = (ssize_t)(offset - dhp->dh_uoff);
+ soff = round_down_p2(soff, PAGESIZE);
+ if (soff < 0 || soff >= dhp->dh_len)
+ return (FC_MAKE_ERR(EINVAL));
+
+ /*
+ * Address and size must be page aligned. Len is rounded up to the
+ * number of bytes in the pages required to cover len. Offset is
+ * rounded down to the byte offset of the first byte of the page
+ * that contains offset.
+ */
+ len = round_up_p2(len, PAGESIZE);
+
+ /*
+ * If len == 0, then calculate the size by getting
+ * the number of bytes from offset to the end of the segment.
+ */
+ if (len == 0)
+ size = dhp->dh_len - soff;
+ else {
+ size = len;
+ if ((soff + size) > dhp->dh_len)
+ return (FC_MAKE_ERR(EINVAL));
+ }
+
+ /*
+ * The address is offset bytes from the base address of
+ * the segment.
+ */
+ addr = (caddr_t)(soff + dhp->dh_uvaddr);
+
+ HOLD_DHP_LOCK(dhp);
+ rc = segdev_faultpages(asp->a_hat,
+ dhp->dh_seg, addr, size, type, rw, dhp);
+ RELE_DHP_LOCK(dhp);
+ return (rc);
+}
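+
+/*
+ * Illustrative sketch, not part of the original file: the context
+ * management pattern that devmap_do_ctxmgt(), devmap_unload() and
+ * devmap_load() are designed for, roughly as a driver might use it.
+ * The xx_* names and the xx_softc bookkeeping are hypothetical, and
+ * locking of the driver soft state is omitted for brevity.
+ */
+struct xx_softc {
+	devmap_cookie_t	xx_cur_dhp;	/* handle that owns the context */
+};
+
+static int
+xx_ctxmgt(devmap_cookie_t dhp, void *pvtp, offset_t off, size_t len,
+    uint_t type, uint_t rw)
+{
+	struct xx_softc *sc = pvtp;
+	int err;
+
+	/* invalidate the handle that last owned the device context */
+	if (sc->xx_cur_dhp != NULL && sc->xx_cur_dhp != dhp) {
+		if ((err = devmap_unload(sc->xx_cur_dhp, off, len)) != 0)
+			return (err);
+	}
+
+	/* validate the faulting range on the current handle */
+	if ((err = devmap_load(dhp, off, len, type, rw)) != 0)
+		return (err);
+
+	sc->xx_cur_dhp = dhp;
+	return (0);
+}
+
+static int
+xx_devmap_access(devmap_cookie_t dhp, void *pvtp, offset_t off, size_t len,
+    uint_t type, uint_t rw)
+{
+	/* let the framework serialize access to the device context */
+	return (devmap_do_ctxmgt(dhp, pvtp, off, len, type, rw, xx_ctxmgt));
+}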
+
+int
+devmap_setup(dev_t dev, offset_t off, struct as *as, caddr_t *addrp,
+ size_t len, uint_t prot, uint_t maxprot, uint_t flags, struct cred *cred)
+{
+ register devmap_handle_t *dhp;
+ int (*devmap)(dev_t, devmap_cookie_t, offset_t, size_t,
+ size_t *, uint_t);
+ int (*mmap)(dev_t, off_t, int);
+ struct devmap_callback_ctl *callbackops;
+ devmap_handle_t *dhp_head = NULL;
+ devmap_handle_t *dhp_prev = NULL;
+ devmap_handle_t *dhp_curr;
+ caddr_t addr;
+ int map_flag;
+ int ret;
+ ulong_t total_len;
+ size_t map_len;
+ size_t resid_len = len;
+ offset_t map_off = off;
+ struct devmap_softlock *slock = NULL;
+
+#ifdef lint
+ cred = cred;
+#endif
+
+ TRACE_2(TR_FAC_DEVMAP, TR_DEVMAP_SETUP,
+ "devmap_setup:start off=%llx len=%lx", off, len);
+ DEBUGF(3, (CE_CONT, "devmap_setup: off %llx len %lx\n",
+ off, len));
+
+ devmap = devopsp[getmajor(dev)]->devo_cb_ops->cb_devmap;
+ mmap = devopsp[getmajor(dev)]->devo_cb_ops->cb_mmap;
+
+ /*
+ * driver must provide devmap(9E) entry point in cb_ops to use the
+ * devmap framework.
+ */
+ if (devmap == NULL || devmap == nulldev || devmap == nodev)
+ return (EINVAL);
+
+ /*
+ * To protect from an inadvertent entry because the devmap entry point
+ * is not NULL, return error if D_DEVMAP bit is not set in cb_flag and
+ * mmap is NULL.
+ */
+ map_flag = devopsp[getmajor(dev)]->devo_cb_ops->cb_flag;
+ if ((map_flag & D_DEVMAP) == 0 && (mmap == NULL || mmap == nulldev))
+ return (EINVAL);
+
+ /*
+ * devmap allows mmap(2) to map multiple registers.
+ * One devmap_handle is created for each register mapped.
+ */
+ for (total_len = 0; total_len < len; total_len += map_len) {
+ dhp = kmem_zalloc(sizeof (devmap_handle_t), KM_SLEEP);
+
+ if (dhp_prev != NULL)
+ dhp_prev->dh_next = dhp;
+ else
+ dhp_head = dhp;
+ dhp_prev = dhp;
+
+ dhp->dh_prot = prot;
+ dhp->dh_orig_maxprot = dhp->dh_maxprot = maxprot;
+ dhp->dh_dev = dev;
+ dhp->dh_timeout_length = CTX_TIMEOUT_VALUE;
+ dhp->dh_uoff = map_off;
+
+ /*
+ * Get mapping specific info from
+ * the driver, such as rnumber, roff, len, callbackops,
+ * accattrp and, if the mapping is for kernel memory,
+ * ddi_umem_cookie.
+ */
+ if ((ret = cdev_devmap(dev, dhp, map_off,
+ resid_len, &map_len, get_udatamodel())) != 0) {
+ free_devmap_handle(dhp_head);
+ return (ENXIO);
+ }
+
+ if (map_len & PAGEOFFSET) {
+ free_devmap_handle(dhp_head);
+ return (EINVAL);
+ }
+
+ callbackops = &dhp->dh_callbackops;
+
+ if ((callbackops->devmap_access == NULL) ||
+ (callbackops->devmap_access == nulldev) ||
+ (callbackops->devmap_access == nodev)) {
+ /*
+ * Normally devmap does not support MAP_PRIVATE unless
+ * the drivers provide a valid devmap_access routine.
+ */
+ if ((flags & MAP_PRIVATE) != 0) {
+ free_devmap_handle(dhp_head);
+ return (EINVAL);
+ }
+ } else {
+ /*
+ * Initialize dhp_softlock and dh_ctx if the drivers
+ * provide devmap_access.
+ */
+ dhp->dh_softlock = devmap_softlock_init(dev,
+ (ulong_t)callbackops->devmap_access);
+ dhp->dh_ctx = devmap_ctxinit(dev,
+ (ulong_t)callbackops->devmap_access);
+
+ /*
+ * segdev_fault can only work when all
+ * dh_softlock in a multi-dhp mapping
+ * are the same; see comments in segdev_fault.
+ * This code keeps track of the first
+ * dh_softlock allocated in slock, compares
+ * all later allocations against it and,
+ * if they differ, returns an error.
+ */
+ if (slock == NULL)
+ slock = dhp->dh_softlock;
+ if (slock != dhp->dh_softlock) {
+ free_devmap_handle(dhp_head);
+ return (ENOTSUP);
+ }
+ }
+
+ map_off += map_len;
+ resid_len -= map_len;
+ }
+
+ /*
+ * get the user virtual address and establish the mapping between
+ * uvaddr and device physical address.
+ */
+ if ((ret = devmap_device(dhp_head, as, addrp, off, len, flags))
+ != 0) {
+ /*
+ * free devmap handles if error during the mapping.
+ */
+ free_devmap_handle(dhp_head);
+
+ return (ret);
+ }
+
+ /*
+ * call the driver's devmap_map callback to do more after the mapping,
+ * such as to allocate driver private data for context management.
+ */
+ dhp = dhp_head;
+ map_off = off;
+ addr = *addrp;
+ while (dhp != NULL) {
+ callbackops = &dhp->dh_callbackops;
+ dhp->dh_uvaddr = addr;
+ dhp_curr = dhp;
+ if (callbackops->devmap_map != NULL) {
+ ret = (*callbackops->devmap_map)((devmap_cookie_t)dhp,
+ dev, flags, map_off,
+ dhp->dh_len, &dhp->dh_pvtp);
+ if (ret != 0) {
+ struct segdev_data *sdp;
+
+ /*
+ * call driver's devmap_unmap entry point
+ * to free driver resources.
+ */
+ dhp = dhp_head;
+ map_off = off;
+ while (dhp != dhp_curr) {
+ callbackops = &dhp->dh_callbackops;
+ if (callbackops->devmap_unmap != NULL) {
+ (*callbackops->devmap_unmap)(
+ dhp, dhp->dh_pvtp,
+ map_off, dhp->dh_len,
+ NULL, NULL, NULL, NULL);
+ }
+ map_off += dhp->dh_len;
+ dhp = dhp->dh_next;
+ }
+ sdp = dhp_head->dh_seg->s_data;
+ sdp->devmap_data = NULL;
+ free_devmap_handle(dhp_head);
+ return (ENXIO);
+ }
+ }
+ map_off += dhp->dh_len;
+ addr += dhp->dh_len;
+ dhp = dhp->dh_next;
+ }
+
+ return (0);
+}
+
+int
+ddi_devmap_segmap(dev_t dev, off_t off, ddi_as_handle_t as, caddr_t *addrp,
+ off_t len, uint_t prot, uint_t maxprot, uint_t flags, struct cred *cred)
+{
+ TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_SEGMAP,
+ "devmap_segmap:start");
+ return (devmap_setup(dev, (offset_t)off, (struct as *)as, addrp,
+ (size_t)len, prot, maxprot, flags, cred));
+}
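+
+/*
+ * Illustrative sketch, not part of the original file: a driver's
+ * segmap(9E) entry point that simply defers to the devmap framework
+ * through ddi_devmap_segmap() above.  The xx_segmap name is
+ * hypothetical.
+ */
+static int
+xx_segmap(dev_t dev, off_t off, struct as *asp, caddr_t *addrp, off_t len,
+    uint_t prot, uint_t maxprot, uint_t flags, struct cred *credp)
+{
+	return (ddi_devmap_segmap(dev, off, (ddi_as_handle_t)asp, addrp,
+	    len, prot, maxprot, flags, credp));
+}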
+
+/*
+ * Called from devmap_devmem_setup/remap to see if large pages can be used
+ * for this device mapping.
+ * Also calculates the maximum page size for this mapping;
+ * this page size will be used in the fault routine for
+ * optimal page size calculations.
+ */
+static void
+devmap_devmem_large_page_setup(devmap_handle_t *dhp)
+{
+ ASSERT(dhp_is_devmem(dhp));
+ dhp->dh_mmulevel = 0;
+
+ /*
+ * use large page size only if:
+ * 1. the mapping is device memory,
+ * 2. the mmu supports multiple page sizes,
+ * 3. the driver did not disallow it,
+ * 4. the dhp length is at least as big as the large pagesize, and
+ * 5. the uvaddr and pfn are large-pagesize aligned
+ */
+ if (page_num_pagesizes() > 1 &&
+ !(dhp->dh_flags & (DEVMAP_USE_PAGESIZE | DEVMAP_MAPPING_INVALID))) {
+ ulong_t base;
+ int level;
+
+ base = (ulong_t)ptob(dhp->dh_pfn);
+ for (level = 1; level < page_num_pagesizes(); level++) {
+ size_t pgsize = page_get_pagesize(level);
+ if ((dhp->dh_len < pgsize) ||
+ (!VA_PA_PGSIZE_ALIGNED((uintptr_t)dhp->dh_uvaddr,
+ base, pgsize))) {
+ break;
+ }
+ }
+ dhp->dh_mmulevel = level - 1;
+ }
+ if (dhp->dh_mmulevel > 0) {
+ dhp->dh_flags |= DEVMAP_FLAG_LARGE;
+ } else {
+ dhp->dh_flags &= ~DEVMAP_FLAG_LARGE;
+ }
+}
+
+/*
+ * Called by the driver's devmap routine to pass device-specific info to
+ * the framework. Used for device memory mappings only.
+ */
+int
+devmap_devmem_setup(devmap_cookie_t dhc, dev_info_t *dip,
+ struct devmap_callback_ctl *callbackops, uint_t rnumber, offset_t roff,
+ size_t len, uint_t maxprot, uint_t flags, ddi_device_acc_attr_t *accattrp)
+{
+ devmap_handle_t *dhp = (devmap_handle_t *)dhc;
+ ddi_acc_handle_t handle;
+ ddi_map_req_t mr;
+ ddi_acc_hdl_t *hp;
+ int err;
+
+ TRACE_4(TR_FAC_DEVMAP, TR_DEVMAP_DEVMEM_SETUP,
+ "devmap_devmem_setup:start dhp=%p offset=%llx rnum=%d len=%lx",
+ (void *)dhp, roff, rnumber, (uint_t)len);
+ DEBUGF(2, (CE_CONT, "devmap_devmem_setup: dhp %p offset %llx "
+ "rnum %d len %lx\n", (void *)dhp, roff, rnumber, len));
+
+ /*
+ * First, check whether this function has already been called for this dhp.
+ */
+ if (dhp->dh_flags & DEVMAP_SETUP_DONE)
+ return (DDI_FAILURE);
+
+ if ((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) != dhp->dh_prot)
+ return (DDI_FAILURE);
+
+ if (flags & DEVMAP_MAPPING_INVALID) {
+ /*
+ * Don't go up the tree to get pfn if the driver specifies
+ * DEVMAP_MAPPING_INVALID in flags.
+ *
+ * If DEVMAP_MAPPING_INVALID is specified, we have to grant
+ * remap permission.
+ */
+ if (!(flags & DEVMAP_ALLOW_REMAP)) {
+ return (DDI_FAILURE);
+ }
+ dhp->dh_pfn = PFN_INVALID;
+ } else {
+ handle = impl_acc_hdl_alloc(KM_SLEEP, NULL);
+ if (handle == NULL)
+ return (DDI_FAILURE);
+
+ hp = impl_acc_hdl_get(handle);
+ hp->ah_vers = VERS_ACCHDL;
+ hp->ah_dip = dip;
+ hp->ah_rnumber = rnumber;
+ hp->ah_offset = roff;
+ hp->ah_len = len;
+ if (accattrp != NULL)
+ hp->ah_acc = *accattrp;
+
+ mr.map_op = DDI_MO_MAP_LOCKED;
+ mr.map_type = DDI_MT_RNUMBER;
+ mr.map_obj.rnumber = rnumber;
+ mr.map_prot = maxprot & dhp->dh_orig_maxprot;
+ mr.map_flags = DDI_MF_DEVICE_MAPPING;
+ mr.map_handlep = hp;
+ mr.map_vers = DDI_MAP_VERSION;
+
+ /*
+ * Go up the device tree to get the pfn.
+ * The rootnex_map_regspec() routine in nexus drivers has been
+ * modified to return pfn if map_flags is DDI_MF_DEVICE_MAPPING.
+ */
+ err = ddi_map(dip, &mr, roff, len, (caddr_t *)&dhp->dh_pfn);
+ dhp->dh_hat_attr = hp->ah_hat_flags;
+ impl_acc_hdl_free(handle);
+
+ if (err)
+ return (DDI_FAILURE);
+ }
+ /* Should not be using devmem setup for memory pages */
+ ASSERT(!pf_is_memory(dhp->dh_pfn));
+
+ /* Only some of the flags bits are settable by the driver */
+ dhp->dh_flags |= (flags & DEVMAP_SETUP_FLAGS);
+ dhp->dh_len = ptob(btopr(len));
+
+ dhp->dh_cookie = DEVMAP_DEVMEM_COOKIE;
+ dhp->dh_roff = ptob(btop(roff));
+
+ /* setup the dh_mmulevel and DEVMAP_FLAG_LARGE */
+ devmap_devmem_large_page_setup(dhp);
+ dhp->dh_maxprot = maxprot & dhp->dh_orig_maxprot;
+ ASSERT((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) == dhp->dh_prot);
+
+
+ if (callbackops != NULL) {
+ bcopy(callbackops, &dhp->dh_callbackops,
+ sizeof (struct devmap_callback_ctl));
+ }
+
+ /*
+ * Initialize dh_lock if we want to do remap.
+ */
+ if (dhp->dh_flags & DEVMAP_ALLOW_REMAP) {
+ mutex_init(&dhp->dh_lock, NULL, MUTEX_DEFAULT, NULL);
+ dhp->dh_flags |= DEVMAP_LOCK_INITED;
+ }
+
+ dhp->dh_flags |= DEVMAP_SETUP_DONE;
+
+ return (DDI_SUCCESS);
+}
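+
+/*
+ * Illustrative sketch, not part of the original file: a minimal
+ * devmap(9E) entry point exporting register set 0 of a device through
+ * devmap_devmem_setup() above, loosely following the usual DDI usage.
+ * xx_devmap and xx_dip are hypothetical; a real driver would look its
+ * dev_info_t up from its soft state and validate off/len.
+ */
+static dev_info_t *xx_dip;
+
+static int
+xx_devmap(dev_t dev, devmap_cookie_t dhp, offset_t off, size_t len,
+    size_t *maplen, uint_t model)
+{
+	ddi_device_acc_attr_t attr;
+	int err;
+
+	attr.devacc_attr_version = DDI_DEVICE_ATTR_V0;
+	attr.devacc_attr_endian_flags = DDI_NEVERSWAP_ACC;
+	attr.devacc_attr_dataorder = DDI_STRICTORDER_ACC;
+
+	/* round the requested length up to a whole number of pages */
+	len = ptob(btopr(len));
+
+	/* no callbacks, register set 0, no remap, default flags */
+	err = devmap_devmem_setup(dhp, xx_dip, NULL, 0, off, len,
+	    PROT_ALL, 0, &attr);
+	if (err != 0)
+		return (err);
+
+	*maplen = len;
+	return (0);
+}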
+
+int
+devmap_devmem_remap(devmap_cookie_t dhc, dev_info_t *dip,
+ uint_t rnumber, offset_t roff, size_t len, uint_t maxprot,
+ uint_t flags, ddi_device_acc_attr_t *accattrp)
+{
+ devmap_handle_t *dhp = (devmap_handle_t *)dhc;
+ ddi_acc_handle_t handle;
+ ddi_map_req_t mr;
+ ddi_acc_hdl_t *hp;
+ pfn_t pfn;
+ uint_t hat_flags;
+ int err;
+
+ TRACE_4(TR_FAC_DEVMAP, TR_DEVMAP_DEVMEM_REMAP,
+ "devmap_devmem_setup:start dhp=%p offset=%llx rnum=%d len=%lx",
+ (void *)dhp, roff, rnumber, (uint_t)len);
+ DEBUGF(2, (CE_CONT, "devmap_devmem_remap: dhp %p offset %llx "
+ "rnum %d len %lx\n", (void *)dhp, roff, rnumber, len));
+
+ /*
+ * Return failure if setup has not been done or no remap permission
+ * has been granted during the setup.
+ */
+ if ((dhp->dh_flags & DEVMAP_SETUP_DONE) == 0 ||
+ (dhp->dh_flags & DEVMAP_ALLOW_REMAP) == 0)
+ return (DDI_FAILURE);
+
+ /* Only DEVMAP_MAPPING_INVALID flag supported for remap */
+ if ((flags != 0) && (flags != DEVMAP_MAPPING_INVALID))
+ return (DDI_FAILURE);
+
+ if ((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) != dhp->dh_prot)
+ return (DDI_FAILURE);
+
+ if (!(flags & DEVMAP_MAPPING_INVALID)) {
+ handle = impl_acc_hdl_alloc(KM_SLEEP, NULL);
+ if (handle == NULL)
+ return (DDI_FAILURE);
+ }
+
+ HOLD_DHP_LOCK(dhp);
+
+ /*
+ * Unload the old mapping, so the next fault will set up the new mappings.
+ * Do this while holding the dhp lock so other faults don't reestablish
+ * the mappings.
+ */
+ hat_unload(dhp->dh_seg->s_as->a_hat, dhp->dh_uvaddr,
+ dhp->dh_len, HAT_UNLOAD|HAT_UNLOAD_OTHER);
+
+ if (flags & DEVMAP_MAPPING_INVALID) {
+ dhp->dh_flags |= DEVMAP_MAPPING_INVALID;
+ dhp->dh_pfn = PFN_INVALID;
+ } else {
+ /* clear any prior DEVMAP_MAPPING_INVALID flag */
+ dhp->dh_flags &= ~DEVMAP_MAPPING_INVALID;
+ hp = impl_acc_hdl_get(handle);
+ hp->ah_vers = VERS_ACCHDL;
+ hp->ah_dip = dip;
+ hp->ah_rnumber = rnumber;
+ hp->ah_offset = roff;
+ hp->ah_len = len;
+ if (accattrp != NULL)
+ hp->ah_acc = *accattrp;
+
+ mr.map_op = DDI_MO_MAP_LOCKED;
+ mr.map_type = DDI_MT_RNUMBER;
+ mr.map_obj.rnumber = rnumber;
+ mr.map_prot = maxprot & dhp->dh_orig_maxprot;
+ mr.map_flags = DDI_MF_DEVICE_MAPPING;
+ mr.map_handlep = hp;
+ mr.map_vers = DDI_MAP_VERSION;
+
+ /*
+ * Go up the device tree to get the pfn.
+ * The rootnex_map_regspec() routine in nexus drivers has been
+ * modified to return pfn if map_flags is DDI_MF_DEVICE_MAPPING.
+ */
+ err = ddi_map(dip, &mr, roff, len, (caddr_t *)&pfn);
+ hat_flags = hp->ah_hat_flags;
+ impl_acc_hdl_free(handle);
+ if (err) {
+ RELE_DHP_LOCK(dhp);
+ return (DDI_FAILURE);
+ }
+ /*
+ * Store result of ddi_map first in local variables, as we do
+ * not want to overwrite the existing dhp with wrong data.
+ */
+ dhp->dh_pfn = pfn;
+ dhp->dh_hat_attr = hat_flags;
+ }
+
+ /* clear the large page size flag */
+ dhp->dh_flags &= ~DEVMAP_FLAG_LARGE;
+
+ dhp->dh_cookie = DEVMAP_DEVMEM_COOKIE;
+ dhp->dh_roff = ptob(btop(roff));
+
+ /* setup the dh_mmulevel and DEVMAP_FLAG_LARGE */
+ devmap_devmem_large_page_setup(dhp);
+ dhp->dh_maxprot = maxprot & dhp->dh_orig_maxprot;
+ ASSERT((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) == dhp->dh_prot);
+
+ RELE_DHP_LOCK(dhp);
+ return (DDI_SUCCESS);
+}
+
+/*
+ * Called by the driver's devmap routine to pass kernel virtual address
+ * mapping info to the framework. Used only for kernel memory
+ * allocated from ddi_umem_alloc().
+ */
+int
+devmap_umem_setup(devmap_cookie_t dhc, dev_info_t *dip,
+ struct devmap_callback_ctl *callbackops, ddi_umem_cookie_t cookie,
+ offset_t off, size_t len, uint_t maxprot, uint_t flags,
+ ddi_device_acc_attr_t *accattrp)
+{
+ devmap_handle_t *dhp = (devmap_handle_t *)dhc;
+ struct ddi_umem_cookie *cp = (struct ddi_umem_cookie *)cookie;
+
+#ifdef lint
+ dip = dip;
+ accattrp = accattrp;
+#endif
+
+ TRACE_4(TR_FAC_DEVMAP, TR_DEVMAP_UMEM_SETUP,
+ "devmap_umem_setup:start dhp=%p offset=%llx cookie=%p len=%lx",
+ (void *)dhp, off, cookie, len);
+ DEBUGF(2, (CE_CONT, "devmap_umem_setup: dhp %p offset %llx "
+ "cookie %p len %lx\n", (void *)dhp, off, (void *)cookie, len));
+
+ if (cookie == NULL)
+ return (DDI_FAILURE);
+
+ /* For UMEM_TRASH, this restriction is not needed */
+ if ((off + len) > cp->size)
+ return (DDI_FAILURE);
+
+ /*
+ * First, check whether this function has already been called for this dhp.
+ */
+ if (dhp->dh_flags & DEVMAP_SETUP_DONE)
+ return (DDI_FAILURE);
+
+ if ((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) != dhp->dh_prot)
+ return (DDI_FAILURE);
+
+ if (flags & DEVMAP_MAPPING_INVALID) {
+ /*
+ * If DEVMAP_MAPPING_INVALID is specified, we have to grant
+ * remap permission.
+ */
+ if (!(flags & DEVMAP_ALLOW_REMAP)) {
+ return (DDI_FAILURE);
+ }
+ } else {
+ dhp->dh_cookie = cookie;
+ dhp->dh_roff = ptob(btop(off));
+ dhp->dh_cvaddr = cp->cvaddr + dhp->dh_roff;
+ }
+
+ /*
+ * The default is _not_ to pass HAT_LOAD_NOCONSIST to hat_devload();
+ * we pass HAT_LOAD_NOCONSIST _only_ in cases where hat tries to
+ * create consistent mappings but our intention was to create
+ * non-consistent mappings.
+ *
+ * DEVMEM: hat figures out that it's DEVMEM and creates non-consistent
+ * mappings.
+ *
+ * kernel exported memory: hat figures out that it's memory and always
+ * creates consistent mappings.
+ *
+ * /dev/mem: non-consistent mappings. See comments in common/io/mem.c
+ *
+ * /dev/kmem: consistent mappings are created unless they are
+ * MAP_FIXED. We _explicitly_ tell hat to create non-consistent
+ * mappings by passing HAT_LOAD_NOCONSIST in case of MAP_FIXED
+ * mappings of /dev/kmem. See common/io/mem.c
+ */
+
+ /* Only some of the flags bits are settable by the driver */
+ dhp->dh_flags |= (flags & DEVMAP_SETUP_FLAGS);
+
+ dhp->dh_len = ptob(btopr(len));
+ dhp->dh_maxprot = maxprot & dhp->dh_orig_maxprot;
+ ASSERT((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) == dhp->dh_prot);
+
+ if (callbackops != NULL) {
+ bcopy(callbackops, &dhp->dh_callbackops,
+ sizeof (struct devmap_callback_ctl));
+ }
+ /*
+ * Initialize dh_lock if we want to do remap.
+ */
+ if (dhp->dh_flags & DEVMAP_ALLOW_REMAP) {
+ mutex_init(&dhp->dh_lock, NULL, MUTEX_DEFAULT, NULL);
+ dhp->dh_flags |= DEVMAP_LOCK_INITED;
+ }
+
+ dhp->dh_flags |= DEVMAP_SETUP_DONE;
+
+ return (DDI_SUCCESS);
+}
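+
+/*
+ * Illustrative sketch, not part of the original file: exporting kernel
+ * memory instead of device registers.  The driver allocates the memory
+ * with ddi_umem_alloc() (typically at attach time) and hands the
+ * resulting cookie to devmap_umem_setup() above from its devmap(9E)
+ * entry point.  xx_umem_devmap, xx_umem_dip, xx_umem_cookie and
+ * XX_UMEM_SIZE are hypothetical; NULL is passed for accattrp since this
+ * version of devmap_umem_setup() ignores it.
+ */
+#define	XX_UMEM_SIZE	ptob(4)
+
+static dev_info_t *xx_umem_dip;
+static ddi_umem_cookie_t xx_umem_cookie;
+static void *xx_umem_kva;
+
+static int
+xx_umem_devmap(dev_t dev, devmap_cookie_t dhp, offset_t off, size_t len,
+    size_t *maplen, uint_t model)
+{
+	int err;
+
+	/* done once, e.g. from attach(9E): allocate exportable memory */
+	if (xx_umem_kva == NULL)
+		xx_umem_kva = ddi_umem_alloc(XX_UMEM_SIZE, DDI_UMEM_SLEEP,
+		    &xx_umem_cookie);
+
+	len = ptob(btopr(len));
+	if (off + len > XX_UMEM_SIZE)
+		return (ENXIO);
+
+	/* no callbacks, no remap permission, default flags */
+	err = devmap_umem_setup(dhp, xx_umem_dip, NULL, xx_umem_cookie,
+	    off, len, PROT_ALL, 0, NULL);
+	if (err != 0)
+		return (err);
+
+	*maplen = len;
+	return (0);
+}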
+
+int
+devmap_umem_remap(devmap_cookie_t dhc, dev_info_t *dip,
+ ddi_umem_cookie_t cookie, offset_t off, size_t len, uint_t maxprot,
+ uint_t flags, ddi_device_acc_attr_t *accattrp)
+{
+ devmap_handle_t *dhp = (devmap_handle_t *)dhc;
+ struct ddi_umem_cookie *cp = (struct ddi_umem_cookie *)cookie;
+
+ TRACE_4(TR_FAC_DEVMAP, TR_DEVMAP_UMEM_REMAP,
+ "devmap_umem_remap:start dhp=%p offset=%llx cookie=%p len=%lx",
+ (void *)dhp, off, cookie, len);
+ DEBUGF(2, (CE_CONT, "devmap_umem_remap: dhp %p offset %llx "
+ "cookie %p len %lx\n", (void *)dhp, off, (void *)cookie, len));
+
+#ifdef lint
+ dip = dip;
+ accattrp = accattrp;
+#endif
+ /*
+ * Return failure if setup has not been done or no remap permission
+ * has been granted during the setup.
+ */
+ if ((dhp->dh_flags & DEVMAP_SETUP_DONE) == 0 ||
+ (dhp->dh_flags & DEVMAP_ALLOW_REMAP) == 0)
+ return (DDI_FAILURE);
+
+ /* No flags supported for remap yet */
+ if (flags != 0)
+ return (DDI_FAILURE);
+
+ if ((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) != dhp->dh_prot)
+ return (DDI_FAILURE);
+
+ /* For UMEM_TRASH, this restriction is not needed */
+ if ((off + len) > cp->size)
+ return (DDI_FAILURE);
+
+ HOLD_DHP_LOCK(dhp);
+ /*
+ * Unload the old mapping, so the next fault will set up the new mappings.
+ * Do this while holding the dhp lock so other faults don't reestablish
+ * the mappings.
+ */
+ hat_unload(dhp->dh_seg->s_as->a_hat, dhp->dh_uvaddr,
+ dhp->dh_len, HAT_UNLOAD|HAT_UNLOAD_OTHER);
+
+ dhp->dh_cookie = cookie;
+ dhp->dh_roff = ptob(btop(off));
+ dhp->dh_cvaddr = cp->cvaddr + dhp->dh_roff;
+
+ /* clear the large page size flag */
+ dhp->dh_flags &= ~DEVMAP_FLAG_LARGE;
+
+ dhp->dh_maxprot = maxprot & dhp->dh_orig_maxprot;
+ ASSERT((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) == dhp->dh_prot);
+ RELE_DHP_LOCK(dhp);
+ return (DDI_SUCCESS);
+}
+
+/*
+ * Set the timeout value for the driver's context management callback, e.g.
+ * devmap_access().
+ */
+void
+devmap_set_ctx_timeout(devmap_cookie_t dhc, clock_t ticks)
+{
+ devmap_handle_t *dhp = (devmap_handle_t *)dhc;
+
+ TRACE_2(TR_FAC_DEVMAP, TR_DEVMAP_SET_CTX_TIMEOUT,
+ "devmap_set_ctx_timeout:start dhp=%p ticks=%x",
+ (void *)dhp, ticks);
+ dhp->dh_timeout_length = ticks;
+}
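+
+/*
+ * Illustrative sketch, not part of the original file: the timeout is
+ * normally set from the driver's devmap_map(9E) callback once the
+ * per-mapping private data has been established.  xx_devmap_map is
+ * hypothetical and the 1ms value is arbitrary.
+ */
+static int
+xx_devmap_map(devmap_cookie_t dhp, dev_t dev, uint_t flags,
+    offset_t off, size_t len, void **pvtpp)
+{
+	/* a real driver would point this at its per-mapping state */
+	*pvtpp = NULL;
+
+	/* keep the device context for ~1ms after an access completes */
+	devmap_set_ctx_timeout(dhp, drv_usectohz(1000));
+	return (0);
+}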
+
+int
+devmap_default_access(devmap_cookie_t dhp, void *pvtp, offset_t off,
+ size_t len, uint_t type, uint_t rw)
+{
+#ifdef lint
+ pvtp = pvtp;
+#endif
+
+ TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_DEFAULT_ACCESS,
+ "devmap_default_access:start");
+ return (devmap_load(dhp, off, len, type, rw));
+}
+
+/*
+ * segkmem_alloc() wrapper to allocate memory which is both
+ * non-relocatable (for DR) and sharelocked, since the rest
+ * of this segment driver requires it.
+ */
+static void *
+devmap_alloc_pages(vmem_t *vmp, size_t size, int vmflag)
+{
+ ASSERT(vmp != NULL);
+ ASSERT(kvseg.s_base != NULL);
+ vmflag |= (VM_NORELOC | SEGKMEM_SHARELOCKED);
+ return (segkmem_alloc(vmp, size, vmflag));
+}
+
+/*
+ * This is where things are a bit incestuous with seg_kmem: unlike
+ * seg_kp, seg_kmem does not keep its pages long-term sharelocked, so
+ * we need to do a bit of a dance around that to prevent duplication of
+ * code until we decide to bite the bullet and implement a new kernel
+ * segment for driver-allocated memory that is exported to user space.
+ */
+static void
+devmap_free_pages(vmem_t *vmp, void *inaddr, size_t size)
+{
+ page_t *pp;
+ caddr_t addr = inaddr;
+ caddr_t eaddr;
+ pgcnt_t npages = btopr(size);
+
+ ASSERT(vmp != NULL);
+ ASSERT(kvseg.s_base != NULL);
+ ASSERT(((uintptr_t)addr & PAGEOFFSET) == 0);
+
+ hat_unload(kas.a_hat, addr, size, HAT_UNLOAD_UNLOCK);
+
+ for (eaddr = addr + size; addr < eaddr; addr += PAGESIZE) {
+ /*
+ * Use page_find() instead of page_lookup() to find the page
+ * since we know that it is hashed and has a shared lock.
+ */
+ pp = page_find(&kvp, (u_offset_t)(uintptr_t)addr);
+
+ if (pp == NULL)
+ panic("devmap_free_pages: page not found");
+ if (!page_tryupgrade(pp)) {
+ page_unlock(pp);
+ pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)addr,
+ SE_EXCL);
+ if (pp == NULL)
+ panic("devmap_free_pages: page already freed");
+ }
+ /* Clear p_lckcnt so page_destroy() doesn't update availrmem */
+ pp->p_lckcnt = 0;
+ page_destroy(pp, 0);
+ }
+ page_unresv(npages);
+
+ if (vmp != NULL)
+ vmem_free(vmp, inaddr, size);
+}
+
+/*
+ * devmap_umem_alloc_np() replaces kmem_zalloc() as the method for
+ * allocating non-pageable kmem in response to a ddi_umem_alloc()
+ * default request. For now we allocate our own pages and we keep
+ * them long-term sharelocked, since: A) the fault routines expect the
+ * memory to already be locked; B) pageable umem is already long-term
+ * locked; C) it's a lot of work to make it otherwise, particularly
+ * since the nexus layer expects the pages to never fault. An RFE is to
+ * not keep the pages long-term locked, but instead to be able to
+ * take faults on them and simply look them up in kvp in case we
+ * fault on them. Even then, we must take care not to let pageout
+ * steal them from us since the data must remain resident; if we
+ * do this we must come up with some way to pin the pages to prevent
+ * faults while a driver is doing DMA to/from them.
+ */
+static void *
+devmap_umem_alloc_np(size_t size, size_t flags)
+{
+ void *buf;
+ int vmflags = (flags & DDI_UMEM_NOSLEEP)? VM_NOSLEEP : VM_SLEEP;
+
+ buf = vmem_alloc(umem_np_arena, size, vmflags);
+ if (buf != NULL)
+ bzero(buf, size);
+ return (buf);
+}
+
+static void
+devmap_umem_free_np(void *addr, size_t size)
+{
+ vmem_free(umem_np_arena, addr, size);
+}
+
+/*
+ * Allocate page-aligned kernel memory for exporting to user land.
+ * The devmap framework will use the cookie allocated by ddi_umem_alloc()
+ * to find a user virtual address that is in the same color as the address
+ * allocated here.
+ */
+void *
+ddi_umem_alloc(size_t size, int flags, ddi_umem_cookie_t *cookie)
+{
+ register size_t len = ptob(btopr(size));
+ void *buf = NULL;
+ struct ddi_umem_cookie *cp;
+ int iflags = 0;
+
+ *cookie = NULL;
+
+ TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_UMEM_ALLOC,
+ "devmap_umem_alloc:start");
+ if (len == 0)
+ return ((void *)NULL);
+
+ /*
+ * allocate cookie
+ */
+ if ((cp = kmem_zalloc(sizeof (struct ddi_umem_cookie),
+ flags & DDI_UMEM_NOSLEEP ? KM_NOSLEEP : KM_SLEEP)) == NULL) {
+ ASSERT(flags & DDI_UMEM_NOSLEEP);
+ return ((void *)NULL);
+ }
+
+ if (flags & DDI_UMEM_PAGEABLE) {
+ /* Only one of the flags is allowed */
+ ASSERT(!(flags & DDI_UMEM_TRASH));
+ /* initialize resource with 0 */
+ iflags = KPD_ZERO;
+
+ /*
+ * To allocate unlocked pageable memory, use segkp_get() to
+ * create a segkp segment. Since segkp can only service kas,
+ * other segment drivers such as segdev have to do
+ * as_fault(segkp, SOFTLOCK) in their fault routines.
+ */
+ if (flags & DDI_UMEM_NOSLEEP)
+ iflags |= KPD_NOWAIT;
+
+ if ((buf = segkp_get(segkp, len, iflags)) == NULL) {
+ kmem_free(cp, sizeof (struct ddi_umem_cookie));
+ return ((void *)NULL);
+ }
+ cp->type = KMEM_PAGEABLE;
+ mutex_init(&cp->lock, NULL, MUTEX_DEFAULT, NULL);
+ cp->locked = 0;
+ } else if (flags & DDI_UMEM_TRASH) {
+ /* Only one of the flags is allowed */
+ ASSERT(!(flags & DDI_UMEM_PAGEABLE));
+ cp->type = UMEM_TRASH;
+ buf = NULL;
+ } else {
+ if ((buf = devmap_umem_alloc_np(len, flags)) == NULL) {
+ kmem_free(cp, sizeof (struct ddi_umem_cookie));
+ return ((void *)NULL);
+ }
+
+ cp->type = KMEM_NON_PAGEABLE;
+ }
+
+ /*
+ * need to save size here. size will be used when
+ * we do kmem_free.
+ */
+ cp->size = len;
+ cp->cvaddr = (caddr_t)buf;
+
+ *cookie = (void *)cp;
+ return (buf);
+}
+
+void
+ddi_umem_free(ddi_umem_cookie_t cookie)
+{
+ struct ddi_umem_cookie *cp;
+
+ TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_UMEM_FREE,
+ "devmap_umem_free:start");
+
+ /*
+ * If cookie is NULL, this call has no effect on the system.
+ */
+ if (cookie == NULL)
+ return;
+
+ cp = (struct ddi_umem_cookie *)cookie;
+
+ switch (cp->type) {
+ case KMEM_PAGEABLE :
+ ASSERT(cp->cvaddr != NULL && cp->size != 0);
+ /*
+ * Check if there are still any pending faults on the cookie
+ * while the driver is deleting it.
+ * XXX - could change to an ASSERT but won't catch errant drivers
+ */
+ mutex_enter(&cp->lock);
+ if (cp->locked) {
+ mutex_exit(&cp->lock);
+ panic("ddi_umem_free for cookie with pending faults %p",
+ (void *)cp);
+ return;
+ }
+
+ segkp_release(segkp, cp->cvaddr);
+
+ /*
+ * release mutex associated with this cookie.
+ */
+ mutex_destroy(&cp->lock);
+ break;
+ case KMEM_NON_PAGEABLE :
+ ASSERT(cp->cvaddr != NULL && cp->size != 0);
+ devmap_umem_free_np(cp->cvaddr, cp->size);
+ break;
+ case UMEM_TRASH :
+ break;
+ case UMEM_LOCKED :
+ /* Callers should use ddi_umem_unlock for this type */
+ ddi_umem_unlock(cookie);
+ /* Frees the cookie too */
+ return;
+ default:
+ /* panic so we can diagnose the underlying cause */
+ panic("ddi_umem_free: illegal cookie type 0x%x\n",
+ cp->type);
+ }
+
+ kmem_free(cookie, sizeof (struct ddi_umem_cookie));
+}
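+
+/*
+ * Illustrative sketch, not part of the original file: allocating and
+ * releasing pageable exportable memory with the two routines above.
+ * The xx_ names and the error handling are hypothetical.
+ */
+static int
+xx_alloc_pageable_umem(size_t size, ddi_umem_cookie_t *cookiep, void **kvap)
+{
+	/* DDI_UMEM_PAGEABLE memory is zeroed and backed by segkp */
+	*kvap = ddi_umem_alloc(ptob(btopr(size)),
+	    DDI_UMEM_PAGEABLE | DDI_UMEM_NOSLEEP, cookiep);
+	if (*kvap == NULL)
+		return (ENOMEM);
+	return (0);
+}
+
+static void
+xx_free_umem(ddi_umem_cookie_t cookie)
+{
+	/* safe even if the allocation failed; a NULL cookie is a no-op */
+	ddi_umem_free(cookie);
+}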
+
+
+static int
+segdev_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
+{
+ struct segdev_data *sdp = (struct segdev_data *)seg->s_data;
+
+ /*
+ * It looks as if it is always mapped shared
+ */
+ TRACE_0(TR_FAC_DEVMAP, TR_DEVMAP_GETMEMID,
+ "segdev_getmemid:start");
+ memidp->val[0] = (uintptr_t)VTOCVP(sdp->vp);
+ memidp->val[1] = sdp->offset + (uintptr_t)(addr - seg->s_base);
+ return (0);
+}
+
+/*ARGSUSED*/
+static lgrp_mem_policy_info_t *
+segdev_getpolicy(struct seg *seg, caddr_t addr)
+{
+ return (NULL);
+}
+
+/*
+ * ddi_umem_alloc() non-pageable quantum cache max size.
+ * This is just a SWAG.
+ */
+#define DEVMAP_UMEM_QUANTUM (8*PAGESIZE)
+
+/*
+ * Initialize seg_dev from boot. This routine sets up the trash page
+ * and creates the umem_np_arena used to back non-pageable memory
+ * requests.
+ */
+void
+segdev_init(void)
+{
+ struct seg kseg;
+
+ umem_np_arena = vmem_create("umem_np", NULL, 0, PAGESIZE,
+ devmap_alloc_pages, devmap_free_pages, heap_arena,
+ DEVMAP_UMEM_QUANTUM, VM_SLEEP);
+
+ kseg.s_as = &kas;
+ trashpp = page_create_va(&trashvp, 0, PAGESIZE,
+ PG_NORELOC | PG_EXCL | PG_WAIT, &kseg, NULL);
+ if (trashpp == NULL)
+ panic("segdev_init: failed to create trash page");
+ pagezero(trashpp, 0, PAGESIZE);
+ page_downgrade(trashpp);
+}
+
+/*
+ * Invoke platform-dependent support routines so that /proc can have
+ * the platform code deal with curious hardware.
+ */
+int
+segdev_copyfrom(struct seg *seg,
+ caddr_t uaddr, const void *devaddr, void *kaddr, size_t len)
+{
+ struct segdev_data *sdp = (struct segdev_data *)seg->s_data;
+ struct snode *sp = VTOS(VTOCVP(sdp->vp));
+
+ return (e_ddi_copyfromdev(sp->s_dip,
+ (off_t)(uaddr - seg->s_base), devaddr, kaddr, len));
+}
+
+int
+segdev_copyto(struct seg *seg,
+ caddr_t uaddr, const void *kaddr, void *devaddr, size_t len)
+{
+ struct segdev_data *sdp = (struct segdev_data *)seg->s_data;
+ struct snode *sp = VTOS(VTOCVP(sdp->vp));
+
+ return (e_ddi_copytodev(sp->s_dip,
+ (off_t)(uaddr - seg->s_base), kaddr, devaddr, len));
+}
diff --git a/usr/src/uts/common/vm/seg_dev.h b/usr/src/uts/common/vm/seg_dev.h
new file mode 100644
index 0000000000..c498c06ecf
--- /dev/null
+++ b/usr/src/uts/common/vm/seg_dev.h
@@ -0,0 +1,131 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+#ifndef _VM_SEG_DEV_H
+#define _VM_SEG_DEV_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/project.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Structure whose pointer is passed to the segdev_create routine
+ */
+struct segdev_crargs {
+ offset_t offset; /* starting offset */
+ int (*mapfunc)(dev_t dev, off_t off, int prot); /* map function */
+ dev_t dev; /* device number */
+ uchar_t type; /* type of sharing done */
+ uchar_t prot; /* protection */
+ uchar_t maxprot; /* maximum protection */
+ uint_t hat_attr; /* hat attr */
+ uint_t hat_flags; /* currently, hat_flags is used ONLY for */
+ /* HAT_LOAD_NOCONSIST; in future, it can be */
+ /* expanded to include any flags that are */
+ /* not already part of hat_attr */
+ void *devmap_data; /* devmap_handle private data */
+};
+
+/*
+ * (Semi) private data maintained by the seg_dev driver per segment mapping
+ *
+ * The segment lock is necessary to protect fields that are modified
+ * when the "read" version of the address space lock is held. This lock
+ * is not needed when the segment operation has the "write" version of
+ * the address space lock (it would be redundant).
+ *
+ * The following fields in segdev_data are read-only when the address
+ * space is "read" locked, and don't require the segment lock:
+ *
+ * vp
+ * offset
+ * mapfunc
+ * maxprot
+ */
+struct segdev_data {
+ offset_t offset; /* device offset for start of mapping */
+ kmutex_t lock; /* protects segdev_data */
+ int (*mapfunc)(dev_t dev, off_t off, int prot);
+ struct vnode *vp; /* vnode associated with device */
+ uchar_t pageprot; /* true if per page protections present */
+ uchar_t prot; /* current segment prot if pageprot == 0 */
+ uchar_t maxprot; /* maximum segment protections */
+ uchar_t type; /* type of sharing done */
+ struct vpage *vpage; /* per-page information, if needed */
+ uint_t hat_attr; /* hat attr - pass to attr in hat_devload */
+ uint_t hat_flags; /* set HAT_LOAD_NOCONSIST flag in hat_devload */
+ /* see comments above in segdev_crargs */
+ size_t softlockcnt; /* # of SOFTLOCKED in seg */
+ void *devmap_data; /* devmap_handle private data */
+};
+
+/* Direct physical-userland mapping, without occupying kernel address space */
+#define DEVMAP_PMEM_COOKIE ((ddi_umem_cookie_t)0x2)
+
+/*
+ * pmem_cookie:
+ * Records physical memory pages to be exported to userland.
+ */
+struct devmap_pmem_cookie {
+ pgcnt_t dp_npages; /* number of allocated mem pages */
+ page_t **dp_pparray; /* pages allocated for this cookie */
+ vnode_t *dp_vnp; /* vnode associated with this cookie */
+ kproject_t *dp_projp; /* project ptr for resource ctl */
+};
+
+#ifdef _KERNEL
+
+extern void segdev_init(void);
+
+extern int segdev_create(struct seg *, void *);
+
+extern int segdev_copyto(struct seg *, caddr_t, const void *, void *, size_t);
+extern int segdev_copyfrom(struct seg *, caddr_t, const void *, void *, size_t);
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _VM_SEG_DEV_H */
diff --git a/usr/src/uts/common/vm/seg_enum.h b/usr/src/uts/common/vm/seg_enum.h
new file mode 100644
index 0000000000..25922e7b40
--- /dev/null
+++ b/usr/src/uts/common/vm/seg_enum.h
@@ -0,0 +1,85 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#ifndef _VM_SEG_ENUM_H
+#define _VM_SEG_ENUM_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * These enumerations are needed in both <vm/seg.h> and
+ * <sys/vnode.h> in order to declare function prototypes.
+ */
+
+/*
+ * Fault information passed to the seg fault handling routine.
+ * The F_SOFTLOCK and F_SOFTUNLOCK are used by software
+ * to lock and unlock pages for physical I/O.
+ */
+enum fault_type {
+ F_INVAL, /* invalid page */
+ F_PROT, /* protection fault */
+ F_SOFTLOCK, /* software requested locking */
+ F_SOFTUNLOCK /* software requested unlocking */
+};
+
+/*
+ * Lock information passed to the seg pagelock handling routine.
+ */
+enum lock_type {
+ L_PAGELOCK, /* lock pages */
+ L_PAGEUNLOCK, /* unlock pages */
+ L_PAGERECLAIM /* reclaim pages */
+};
+
+/*
+ * seg_rw gives the access type for a fault operation
+ */
+enum seg_rw {
+ S_OTHER, /* unknown or not touched */
+ S_READ, /* read access attempted */
+ S_WRITE, /* write access attempted */
+ S_EXEC, /* execution access attempted */
+ S_CREATE, /* create if page doesn't exist */
+ S_READ_NOCOW /* read access, don't do a copy on write */
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _VM_SEG_ENUM_H */
diff --git a/usr/src/uts/common/vm/seg_kmem.c b/usr/src/uts/common/vm/seg_kmem.c
new file mode 100644
index 0000000000..6f0c8f5750
--- /dev/null
+++ b/usr/src/uts/common/vm/seg_kmem.c
@@ -0,0 +1,1516 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/t_lock.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/tuneable.h>
+#include <sys/systm.h>
+#include <sys/vm.h>
+#include <sys/kmem.h>
+#include <sys/vmem.h>
+#include <sys/mman.h>
+#include <sys/cmn_err.h>
+#include <sys/debug.h>
+#include <sys/dumphdr.h>
+#include <sys/bootconf.h>
+#include <sys/lgrp.h>
+#include <vm/seg_kmem.h>
+#include <vm/hat.h>
+#include <vm/page.h>
+#include <vm/vm_dep.h>
+#include <vm/faultcode.h>
+#include <sys/promif.h>
+#include <vm/seg_kp.h>
+#include <sys/bitmap.h>
+#include <sys/mem_cage.h>
+
+/*
+ * seg_kmem is the primary kernel memory segment driver. It
+ * maps the kernel heap [kernelheap, ekernelheap), module text,
+ * and all memory which was allocated before the VM was initialized
+ * into kas.
+ *
+ * Pages which belong to seg_kmem are hashed into &kvp vnode at
+ * an offset equal to (u_offset_t)virt_addr, and have p_lckcnt >= 1.
+ * They must never be paged out since segkmem_fault() is a no-op to
+ * prevent recursive faults.
+ *
+ * Currently, seg_kmem pages are sharelocked (p_sharelock == 1) on
+ * __x86 and are unlocked (p_sharelock == 0) on __sparc. Once __x86
+ * supports relocation the #ifdef kludges can be removed.
+ *
+ * seg_kmem pages may be subject to relocation by page_relocate(),
+ * provided that the HAT supports it; if this is so, segkmem_reloc
+ * will be set to a nonzero value. All boot time allocated memory as
+ * well as static memory is considered off limits to relocation.
+ * Pages are "relocatable" if p_state does not have P_NORELOC set, so
+ * we request P_NORELOC pages for memory that isn't safe to relocate.
+ *
+ * The kernel heap is logically divided up into four pieces:
+ *
+ * heap32_arena is for allocations that require 32-bit absolute
+ * virtual addresses (e.g. code that uses 32-bit pointers/offsets).
+ *
+ * heap_core is for allocations that require 2GB *relative*
+ * offsets; in other words all memory from heap_core is within
+ * 2GB of all other memory from the same arena. This is a requirement
+ * of the addressing modes of some processors in supervisor code.
+ *
+ * heap_arena is the general heap arena.
+ *
+ * static_arena is the static memory arena. Allocations from it
+ * are not subject to relocation so it is safe to use the memory
+ * physical address as well as the virtual address (e.g. the VA to
+ * PA translations are static). Caches may import from static_arena;
+ * all other static memory allocations should use static_alloc_arena.
+ *
+ * On some platforms which have limited virtual address space, seg_kmem
+ * may share [kernelheap, ekernelheap) with seg_kp; if this is so,
+ * segkp_bitmap is non-NULL, and each bit represents a page of virtual
+ * address space which is actually seg_kp mapped.
+ */
+
+extern ulong_t *segkp_bitmap; /* Is set if segkp is from the kernel heap */
+
+char *kernelheap; /* start of primary kernel heap */
+char *ekernelheap; /* end of primary kernel heap */
+struct seg kvseg; /* primary kernel heap segment */
+struct seg kvseg_core; /* "core" kernel heap segment */
+vmem_t *heap_arena; /* primary kernel heap arena */
+vmem_t *heap_core_arena; /* core kernel heap arena */
+char *heap_core_base; /* start of core kernel heap arena */
+char *heap_lp_base; /* start of kernel large page heap arena */
+char *heap_lp_end; /* end of kernel large page heap arena */
+vmem_t *hat_memload_arena; /* HAT translation data */
+struct seg kvseg32; /* 32-bit kernel heap segment */
+vmem_t *heap32_arena; /* 32-bit kernel heap arena */
+vmem_t *heaptext_arena; /* heaptext arena */
+struct as kas; /* kernel address space */
+struct vnode kvp; /* vnode for all segkmem pages */
+int segkmem_reloc; /* enable/disable relocatable segkmem pages */
+vmem_t *static_arena; /* arena for caches to import static memory */
+vmem_t *static_alloc_arena; /* arena for allocating static memory */
+
+/*
+ * seg_kmem driver can map part of the kernel heap with large pages.
+ * Currently this functionality is implemented for sparc platforms only.
+ *
+ * The large page size "segkmem_lpsize" for kernel heap is selected in the
+ * platform specific code. It can also be modified via /etc/system file.
+ * Setting segkmem_lpsize to PAGESIZE in /etc/system disables usage of large
+ * pages for kernel heap. "segkmem_lpshift" is adjusted appropriately to
+ * match segkmem_lpsize.
+ *
+ * At boot time we carve from kernel heap arena a range of virtual addresses
+ * that will be used for large page mappings. This range [heap_lp_base,
+ * heap_lp_end) is set up as a separate vmem arena - "heap_lp_arena". We also
+ * create "kmem_lp_arena" that caches memory already backed up by large
+ * pages. kmem_lp_arena imports virtual segments from heap_lp_arena.
+ */
+
+size_t segkmem_lpsize;
+static uint_t segkmem_lpshift = PAGESHIFT;
+
+size_t segkmem_kmemlp_quantum = 0x400000; /* 4MB */
+size_t segkmem_heaplp_quantum;
+static vmem_t *heap_lp_arena;
+static vmem_t *kmem_lp_arena;
+static vmem_t *segkmem_ppa_arena;
+static segkmem_lpcb_t segkmem_lpcb;
+
+/*
+ * We use "segkmem_kmemlp_max" to limit the total amount of physical memory
+ * consumed by the large page heap. By default this parameter is set to 1/4 of
+ * physmem but can be adjusted through /etc/system either directly or
+ * indirectly by setting "segkmem_kmemlp_pcnt" to the percent of physmem
+ * we allow for large page heap.
+ */
+size_t segkmem_kmemlp_max;
+static uint_t segkmem_kmemlp_pcnt;
+
+/*
+ * Getting large pages for kernel heap could be problematic due to
+ * physical memory fragmentation. That's why we allow preallocating
+ * "segkmem_kmemlp_min" bytes at boot time.
+ */
+static size_t segkmem_kmemlp_min;
+
+/*
+ * Throttling is used to avoid expensive attempts to allocate large pages
+ * for the kernel heap when a lot of successive attempts to do so fail.
+ */
+static ulong_t segkmem_lpthrottle_max = 0x400000;
+static ulong_t segkmem_lpthrottle_start = 0x40;
+static ulong_t segkmem_use_lpthrottle = 1;
+
+/*
+ * Freed pages accumulate on a garbage list until segkmem is ready,
+ * at which point we call segkmem_gc() to free it all.
+ */
+typedef struct segkmem_gc_list {
+ struct segkmem_gc_list *gc_next;
+ vmem_t *gc_arena;
+ size_t gc_size;
+} segkmem_gc_list_t;
+
+static segkmem_gc_list_t *segkmem_gc_list;
+
+/*
+ * Allocations from the hat_memload arena add VM_MEMLOAD to their
+ * vmflags so that segkmem_xalloc() can inform the hat layer that it needs
+ * to take steps to prevent infinite recursion. HAT allocations also
+ * must be non-relocatable to prevent recursive page faults.
+ */
+static void *
+hat_memload_alloc(vmem_t *vmp, size_t size, int flags)
+{
+ flags |= (VM_MEMLOAD | VM_NORELOC);
+ return (segkmem_alloc(vmp, size, flags));
+}
+
+/*
+ * Allocations from static_arena arena (or any other arena that uses
+ * segkmem_alloc_permanent()) require non-relocatable (permanently
+ * wired) memory pages, since these pages are referenced by physical
+ * as well as virtual address.
+ */
+void *
+segkmem_alloc_permanent(vmem_t *vmp, size_t size, int flags)
+{
+ return (segkmem_alloc(vmp, size, flags | VM_NORELOC));
+}
+
+/*
+ * Initialize kernel heap boundaries.
+ */
+void
+kernelheap_init(
+ void *heap_start,
+ void *heap_end,
+ char *first_avail,
+ void *core_start,
+ void *core_end)
+{
+ uintptr_t textbase;
+ size_t core_size;
+ size_t heap_size;
+ vmem_t *heaptext_parent;
+ size_t heap_lp_size = 0;
+
+ kernelheap = heap_start;
+ ekernelheap = heap_end;
+
+#ifdef __sparc
+ heap_lp_size = (((uintptr_t)heap_end - (uintptr_t)heap_start) / 4);
+ heap_lp_base = ekernelheap - heap_lp_size;
+ heap_lp_end = heap_lp_base + heap_lp_size;
+#endif /* __sparc */
+
+ /*
+ * If this platform has a 'core' heap area, then the space for
+ * overflow module text should be carved out of the end of that
+ * heap. Otherwise, it gets carved out of the general purpose
+ * heap.
+ */
+ core_size = (uintptr_t)core_end - (uintptr_t)core_start;
+ if (core_size > 0) {
+ ASSERT(core_size >= HEAPTEXT_SIZE);
+ textbase = (uintptr_t)core_end - HEAPTEXT_SIZE;
+ core_size -= HEAPTEXT_SIZE;
+ }
+#ifndef __sparc
+ else {
+ ekernelheap -= HEAPTEXT_SIZE;
+ textbase = (uintptr_t)ekernelheap;
+ }
+#endif
+
+ heap_size = (uintptr_t)ekernelheap - (uintptr_t)kernelheap;
+ heap_arena = vmem_init("heap", kernelheap, heap_size, PAGESIZE,
+ segkmem_alloc, segkmem_free);
+
+ if (core_size > 0) {
+ heap_core_arena = vmem_create("heap_core", core_start,
+ core_size, PAGESIZE, NULL, NULL, NULL, 0, VM_SLEEP);
+ heap_core_base = core_start;
+ } else {
+ heap_core_arena = heap_arena;
+ heap_core_base = kernelheap;
+ }
+
+ /*
+ * Reserve space for the large page heap. If large pages for the kernel
+ * heap are enabled, the large page heap arena will be created later in
+ * the boot sequence in segkmem_heap_lp_init(). Otherwise the reserved
+ * range will be returned to the heap_arena.
+ */
+ if (heap_lp_size) {
+ (void) vmem_xalloc(heap_arena, heap_lp_size, PAGESIZE, 0, 0,
+ heap_lp_base, heap_lp_end,
+ VM_NOSLEEP | VM_BESTFIT | VM_PANIC);
+ }
+
+ /*
+ * Remove the already-spoken-for memory range [kernelheap, first_avail).
+ */
+ (void) vmem_xalloc(heap_arena, first_avail - kernelheap, PAGESIZE,
+ 0, 0, kernelheap, first_avail, VM_NOSLEEP | VM_BESTFIT | VM_PANIC);
+
+#ifdef __sparc
+ heap32_arena = vmem_create("heap32", (void *)SYSBASE32,
+ SYSLIMIT32 - SYSBASE32 - HEAPTEXT_SIZE, PAGESIZE, NULL,
+ NULL, NULL, 0, VM_SLEEP);
+
+ textbase = SYSLIMIT32 - HEAPTEXT_SIZE;
+ heaptext_parent = NULL;
+#else /* __sparc */
+ heap32_arena = heap_core_arena;
+ heaptext_parent = heap_core_arena;
+#endif /* __sparc */
+
+ heaptext_arena = vmem_create("heaptext", (void *)textbase,
+ HEAPTEXT_SIZE, PAGESIZE, NULL, NULL, heaptext_parent, 0, VM_SLEEP);
+
+ /*
+ * Create a set of arenas for memory with static translations
+ * (e.g. VA -> PA translations cannot change). Since using
+ * kernel pages by physical address implies it isn't safe to
+ * walk across page boundaries, the static_arena quantum must
+ * be PAGESIZE. Any kmem caches that require static memory
+ * should source from static_arena, while direct allocations
+ * should only use static_alloc_arena.
+ */
+ static_arena = vmem_create("static", NULL, 0, PAGESIZE,
+ segkmem_alloc_permanent, segkmem_free, heap_arena, 0, VM_SLEEP);
+ static_alloc_arena = vmem_create("static_alloc", NULL, 0,
+ sizeof (uint64_t), vmem_alloc, vmem_free, static_arena,
+ 0, VM_SLEEP);
+
+ /*
+ * Create an arena for translation data (ptes, hmes, or hblks).
+ * We need an arena for this because hat_memload() is essential
+ * to vmem_populate() (see comments in common/os/vmem.c).
+ *
+ * Note: any kmem cache that allocates from hat_memload_arena
+ * must be created as a KMC_NOHASH cache (i.e. no external slab
+ * and bufctl structures to allocate) so that slab creation doesn't
+ * require anything more than a single vmem_alloc().
+ */
+ hat_memload_arena = vmem_create("hat_memload", NULL, 0, PAGESIZE,
+ hat_memload_alloc, segkmem_free, heap_arena, 0,
+ VM_SLEEP | VMC_POPULATOR);
+}
+
+/*
+ * Grow kernel heap downward.
+ */
+void
+kernelheap_extend(void *range_start, void *range_end)
+{
+ size_t len = (uintptr_t)range_end - (uintptr_t)range_start;
+
+ ASSERT(range_start < range_end && range_end == kernelheap);
+
+ if (vmem_add(heap_arena, range_start, len, VM_NOSLEEP) == NULL) {
+ cmn_err(CE_WARN, "Could not grow kernel heap below 0x%p",
+ (void *)kernelheap);
+ } else {
+ kernelheap = range_start;
+ }
+}
+
+void
+boot_mapin(caddr_t addr, size_t size)
+{
+ caddr_t eaddr;
+ page_t *pp;
+ pfn_t pfnum;
+
+ if (page_resv(btop(size), KM_NOSLEEP) == 0)
+ panic("boot_mapin: page_resv failed");
+
+ for (eaddr = addr + size; addr < eaddr; addr += PAGESIZE) {
+ pfnum = va_to_pfn(addr);
+ if ((pp = page_numtopp_nolock(pfnum)) == NULL)
+ panic("boot_mapin(): No pp for pfnum = %lx", pfnum);
+
+ /*
+		 * Must break up any large pages that may have constituent
+		 * pages being utilized for BOP_ALLOC()'s before calling
+		 * page_numtopp(). The locking code (i.e. page_reclaim())
+		 * can't handle them.
+ */
+ if (pp->p_szc != 0)
+ page_boot_demote(pp);
+
+ pp = page_numtopp(pfnum, SE_EXCL);
+ if (pp == NULL || PP_ISFREE(pp))
+ panic("boot_alloc: pp is NULL or free");
+
+ /*
+ * If the cage is on but doesn't yet contain this page,
+ * mark it as non-relocatable.
+ */
+ if (kcage_on && !PP_ISNORELOC(pp))
+ PP_SETNORELOC(pp);
+
+ (void) page_hashin(pp, &kvp, (u_offset_t)(uintptr_t)addr, NULL);
+ pp->p_lckcnt = 1;
+#if defined(__x86)
+ page_downgrade(pp);
+#else
+ page_unlock(pp);
+#endif
+ }
+}
+
+/*
+ * Get pages from boot and hash them into the kernel's vp.
+ * Used after page structs have been allocated, but before segkmem is ready.
+ */
+void *
+boot_alloc(void *inaddr, size_t size, uint_t align)
+{
+ caddr_t addr = inaddr;
+
+ if (bootops == NULL)
+ prom_panic("boot_alloc: attempt to allocate memory after "
+ "BOP_GONE");
+
+ size = ptob(btopr(size));
+ if (BOP_ALLOC(bootops, addr, size, align) != addr)
+ panic("boot_alloc: BOP_ALLOC failed");
+ boot_mapin((caddr_t)addr, size);
+ return (addr);
+}
+
+static void
+segkmem_badop()
+{
+ panic("segkmem_badop");
+}
+
+#define SEGKMEM_BADOP(t) (t(*)())segkmem_badop
+
+/*ARGSUSED*/
+static faultcode_t
+segkmem_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t size,
+ enum fault_type type, enum seg_rw rw)
+{
+ ASSERT(RW_READ_HELD(&seg->s_as->a_lock));
+
+ if (seg->s_as != &kas || size > seg->s_size ||
+ addr < seg->s_base || addr + size > seg->s_base + seg->s_size)
+ panic("segkmem_fault: bad args");
+
+ if (segkp_bitmap && seg == &kvseg) {
+
+ /*
+ * If it is one of segkp pages, call segkp_fault.
+ */
+ if (BT_TEST(segkp_bitmap,
+ btop((uintptr_t)(addr - seg->s_base))))
+ return (SEGOP_FAULT(hat, segkp, addr, size, type, rw));
+ }
+
+ switch (type) {
+ case F_SOFTLOCK: /* lock down already-loaded translations */
+ if (rw == S_OTHER) {
+ hat_reserve(seg->s_as, addr, size);
+ return (0);
+ }
+ /*FALLTHROUGH*/
+ case F_SOFTUNLOCK:
+ if (rw == S_READ || rw == S_WRITE)
+ return (0);
+ /*FALLTHROUGH*/
+ default:
+ break;
+ }
+ return (FC_NOSUPPORT);
+}
+
+static int
+segkmem_setprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot)
+{
+ ASSERT(RW_LOCK_HELD(&seg->s_as->a_lock));
+
+ if (seg->s_as != &kas || size > seg->s_size ||
+ addr < seg->s_base || addr + size > seg->s_base + seg->s_size)
+ panic("segkmem_setprot: bad args");
+
+ if (segkp_bitmap && seg == &kvseg) {
+
+ /*
+ * If it is one of segkp pages, call segkp.
+ */
+ if (BT_TEST(segkp_bitmap,
+ btop((uintptr_t)(addr - seg->s_base))))
+ return (SEGOP_SETPROT(segkp, addr, size, prot));
+ }
+
+ if (prot == 0)
+ hat_unload(kas.a_hat, addr, size, HAT_UNLOAD);
+ else
+ hat_chgprot(kas.a_hat, addr, size, prot);
+ return (0);
+}
+
+/*
+ * This is a dummy segkmem function overloaded to call segkp
+ * when segkp is under the heap.
+ */
+/* ARGSUSED */
+static int
+segkmem_checkprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot)
+{
+ ASSERT(RW_LOCK_HELD(&seg->s_as->a_lock));
+
+ if (seg->s_as != &kas)
+ segkmem_badop();
+
+ if (segkp_bitmap && seg == &kvseg) {
+
+ /*
+ * If it is one of segkp pages, call into segkp.
+ */
+ if (BT_TEST(segkp_bitmap,
+ btop((uintptr_t)(addr - seg->s_base))))
+ return (SEGOP_CHECKPROT(segkp, addr, size, prot));
+ }
+ segkmem_badop();
+ return (0);
+}
+
+/*
+ * This is a dummy segkmem function overloaded to call segkp
+ * when segkp is under the heap.
+ */
+/* ARGSUSED */
+static int
+segkmem_kluster(struct seg *seg, caddr_t addr, ssize_t delta)
+{
+ ASSERT(RW_LOCK_HELD(&seg->s_as->a_lock));
+
+ if (seg->s_as != &kas)
+ segkmem_badop();
+
+ if (segkp_bitmap && seg == &kvseg) {
+
+ /*
+ * If it is one of segkp pages, call into segkp.
+ */
+ if (BT_TEST(segkp_bitmap,
+ btop((uintptr_t)(addr - seg->s_base))))
+ return (SEGOP_KLUSTER(segkp, addr, delta));
+ }
+ segkmem_badop();
+ return (0);
+}
+
+static void
+segkmem_xdump_range(void *arg, void *start, size_t size)
+{
+ struct as *as = arg;
+ caddr_t addr = start;
+ caddr_t addr_end = addr + size;
+
+ while (addr < addr_end) {
+ pfn_t pfn = hat_getpfnum(kas.a_hat, addr);
+ if (pfn != PFN_INVALID && pfn <= physmax && pf_is_memory(pfn))
+ dump_addpage(as, addr, pfn);
+ addr += PAGESIZE;
+ dump_timeleft = dump_timeout;
+ }
+}
+
+static void
+segkmem_dump_range(void *arg, void *start, size_t size)
+{
+ caddr_t addr = start;
+ caddr_t addr_end = addr + size;
+
+ /*
+	 * If we are about to dump the range of addresses that we carved
+	 * out of the kernel heap for the large page heap, walk
+	 * heap_lp_arena to find which segments are actually populated.
+ */
+ if (SEGKMEM_USE_LARGEPAGES &&
+ addr == heap_lp_base && addr_end == heap_lp_end &&
+ vmem_size(heap_lp_arena, VMEM_ALLOC) < size) {
+ vmem_walk(heap_lp_arena, VMEM_ALLOC | VMEM_REENTRANT,
+ segkmem_xdump_range, arg);
+ } else {
+ segkmem_xdump_range(arg, start, size);
+ }
+}
+
+static void
+segkmem_dump(struct seg *seg)
+{
+ /*
+ * The kernel's heap_arena (represented by kvseg) is a very large
+ * VA space, most of which is typically unused. To speed up dumping
+ * we use vmem_walk() to quickly find the pieces of heap_arena that
+ * are actually in use. We do the same for heap32_arena and
+ * heap_core.
+ *
+ * We specify VMEM_REENTRANT to vmem_walk() because dump_addpage()
+ * may ultimately need to allocate memory. Reentrant walks are
+ * necessarily imperfect snapshots. The kernel heap continues
+ * to change during a live crash dump, for example. For a normal
+ * crash dump, however, we know that there won't be any other threads
+ * messing with the heap. Therefore, at worst, we may fail to dump
+ * the pages that get allocated by the act of dumping; but we will
+ * always dump every page that was allocated when the walk began.
+ *
+ * The other segkmem segments are dense (fully populated), so there's
+ * no need to use this technique when dumping them.
+ *
+ * Note: when adding special dump handling for any new sparsely-
+ * populated segments, be sure to add similar handling to the ::kgrep
+ * code in mdb.
+ */
+ if (seg == &kvseg) {
+ vmem_walk(heap_arena, VMEM_ALLOC | VMEM_REENTRANT,
+ segkmem_dump_range, seg->s_as);
+#ifndef __sparc
+ vmem_walk(heaptext_arena, VMEM_ALLOC | VMEM_REENTRANT,
+ segkmem_dump_range, seg->s_as);
+#endif
+ } else if (seg == &kvseg_core) {
+ vmem_walk(heap_core_arena, VMEM_ALLOC | VMEM_REENTRANT,
+ segkmem_dump_range, seg->s_as);
+ } else if (seg == &kvseg32) {
+ vmem_walk(heap32_arena, VMEM_ALLOC | VMEM_REENTRANT,
+ segkmem_dump_range, seg->s_as);
+ vmem_walk(heaptext_arena, VMEM_ALLOC | VMEM_REENTRANT,
+ segkmem_dump_range, seg->s_as);
+ } else {
+ segkmem_dump_range(seg->s_as, seg->s_base, seg->s_size);
+ }
+}
+
+/*
+ * Lock/unlock kmem pages over a given range [addr, addr+len).
+ * Returns a shadow list of pages in *ppp if ppp is not NULL
+ * and memory can be allocated to hold the shadow list.
+ */
+/*ARGSUSED*/
+static int
+segkmem_pagelock(struct seg *seg, caddr_t addr, size_t len,
+ page_t ***ppp, enum lock_type type, enum seg_rw rw)
+{
+ page_t **pplist, *pp;
+ pgcnt_t npages;
+ size_t nb;
+
+ if (segkp_bitmap && seg == &kvseg) {
+ /*
+ * If it is one of segkp pages, call into segkp.
+ */
+ if (BT_TEST(segkp_bitmap,
+ btop((uintptr_t)(addr - seg->s_base))))
+ return (SEGOP_PAGELOCK(segkp, addr, len, ppp,
+ type, rw));
+ }
+
+ if (type == L_PAGERECLAIM)
+ return (ENOTSUP);
+
+ npages = btopr(len);
+ nb = sizeof (page_t *) * npages;
+
+ if (type == L_PAGEUNLOCK) {
+ if ((pplist = *ppp) == NULL) {
+ /*
+ * No shadow list. Iterate over the range
+ * using page_find() and unlock the pages
+ * that we encounter.
+ */
+ while (npages--) {
+ pp = page_find(&kvp,
+ (u_offset_t)(uintptr_t)addr);
+ if (pp)
+ page_unlock(pp);
+ addr += PAGESIZE;
+ }
+ return (0);
+ }
+
+ while (npages--) {
+ pp = *pplist++;
+ if (pp)
+ page_unlock(pp);
+ }
+ kmem_free(*ppp, nb);
+ return (0);
+ }
+
+ ASSERT(type == L_PAGELOCK);
+
+ pplist = NULL;
+ if (ppp != NULL)
+ *ppp = pplist = kmem_alloc(nb, KM_NOSLEEP);
+
+ while (npages--) {
+ pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)addr, SE_SHARED);
+ /*
+ * We'd like to ASSERT(pp != NULL) here, but we can't
+ * because there are legitimate cases where the address
+ * isn't really mapped -- for instance, attaching a
+ * kernel debugger and poking at a non-existent address.
+ */
+ if (pplist)
+ *pplist++ = pp;
+ addr += PAGESIZE;
+ }
+ return (0);
+}
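+
+/*
+ * Illustrative sketch (hypothetical caller, not taken from this file): a
+ * lock/unlock pair through the generic segment interface for some range
+ * [addr, addr + len) inside kvseg, reusing the shadow list produced by
+ * L_PAGELOCK for the matching L_PAGEUNLOCK.
+ *
+ *	page_t **pplist;
+ *
+ *	if (SEGOP_PAGELOCK(&kvseg, addr, len, &pplist,
+ *	    L_PAGELOCK, S_READ) == 0) {
+ *		... the resident pages are now held SE_SHARED ...
+ *		(void) SEGOP_PAGELOCK(&kvseg, addr, len, &pplist,
+ *		    L_PAGEUNLOCK, S_READ);
+ *	}
+ *
+ * Note that entries in the shadow list may be NULL for addresses that are
+ * not actually mapped, as the comment in segkmem_pagelock() explains.
+ */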
+
+/*
+ * This is a dummy segkmem function overloaded to call segkp
+ * when segkp is under the heap.
+ */
+/* ARGSUSED */
+static int
+segkmem_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
+{
+ ASSERT(RW_LOCK_HELD(&seg->s_as->a_lock));
+
+ if (seg->s_as != &kas)
+ segkmem_badop();
+
+ if (segkp_bitmap && seg == &kvseg) {
+
+ /*
+ * If it is one of segkp pages, call into segkp.
+ */
+ if (BT_TEST(segkp_bitmap,
+ btop((uintptr_t)(addr - seg->s_base))))
+ return (SEGOP_GETMEMID(segkp, addr, memidp));
+ }
+ segkmem_badop();
+ return (0);
+}
+
+/*ARGSUSED*/
+static lgrp_mem_policy_info_t *
+segkmem_getpolicy(struct seg *seg, caddr_t addr)
+{
+ return (NULL);
+}
+
+
+static struct seg_ops segkmem_ops = {
+ SEGKMEM_BADOP(int), /* dup */
+ SEGKMEM_BADOP(int), /* unmap */
+ SEGKMEM_BADOP(void), /* free */
+ segkmem_fault,
+ SEGKMEM_BADOP(faultcode_t), /* faulta */
+ segkmem_setprot,
+ segkmem_checkprot,
+ segkmem_kluster,
+ SEGKMEM_BADOP(size_t), /* swapout */
+ SEGKMEM_BADOP(int), /* sync */
+ SEGKMEM_BADOP(size_t), /* incore */
+ SEGKMEM_BADOP(int), /* lockop */
+ SEGKMEM_BADOP(int), /* getprot */
+ SEGKMEM_BADOP(u_offset_t), /* getoffset */
+ SEGKMEM_BADOP(int), /* gettype */
+ SEGKMEM_BADOP(int), /* getvp */
+ SEGKMEM_BADOP(int), /* advise */
+ segkmem_dump,
+ segkmem_pagelock,
+ SEGKMEM_BADOP(int), /* setpgsz */
+ segkmem_getmemid,
+ segkmem_getpolicy, /* getpolicy */
+};
+
+int
+segkmem_create(struct seg *seg)
+{
+ ASSERT(seg->s_as == &kas && RW_WRITE_HELD(&kas.a_lock));
+ seg->s_ops = &segkmem_ops;
+ seg->s_data = NULL;
+ kas.a_size += seg->s_size;
+ return (0);
+}
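+
+/*
+ * Illustrative sketch, hedged: the actual call site lives outside this
+ * file (presumably the platform startup code), but attaching the primary
+ * kernel heap segment is expected to look roughly like
+ *
+ *	rw_enter(&kas.a_lock, RW_WRITER);
+ *	(void) seg_attach(&kas, kernelheap,
+ *	    ekernelheap - kernelheap, &kvseg);
+ *	(void) segkmem_create(&kvseg);
+ *	rw_exit(&kas.a_lock);
+ *
+ * which satisfies the ASSERT above that kas.a_lock is write-held.
+ */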
+
+/*ARGSUSED*/
+page_t *
+segkmem_page_create(void *addr, size_t size, int vmflag, void *arg)
+{
+ struct seg kseg;
+ int pgflags;
+
+ kseg.s_as = &kas;
+ pgflags = PG_EXCL;
+
+ if (segkmem_reloc == 0 || (vmflag & VM_NORELOC))
+ pgflags |= PG_NORELOC;
+ if ((vmflag & VM_NOSLEEP) == 0)
+ pgflags |= PG_WAIT;
+ if (vmflag & VM_PANIC)
+ pgflags |= PG_PANIC;
+ if (vmflag & VM_PUSHPAGE)
+ pgflags |= PG_PUSHPAGE;
+
+ return (page_create_va(&kvp, (u_offset_t)(uintptr_t)addr, size,
+ pgflags, &kseg, addr));
+}
+
+/*
+ * Allocate pages to back the virtual address range [addr, addr + size).
+ * If addr is NULL, allocate the virtual address space as well.
+ */
+void *
+segkmem_xalloc(vmem_t *vmp, void *inaddr, size_t size, int vmflag, uint_t attr,
+ page_t *(*page_create_func)(void *, size_t, int, void *), void *pcarg)
+{
+ page_t *ppl;
+ caddr_t addr = inaddr;
+ pgcnt_t npages = btopr(size);
+ int allocflag;
+
+ if (inaddr == NULL && (addr = vmem_alloc(vmp, size, vmflag)) == NULL)
+ return (NULL);
+
+ ASSERT(((uintptr_t)addr & PAGEOFFSET) == 0);
+
+ if (page_resv(npages, vmflag & VM_KMFLAGS) == 0) {
+ if (inaddr == NULL)
+ vmem_free(vmp, addr, size);
+ return (NULL);
+ }
+
+ ppl = page_create_func(addr, size, vmflag, pcarg);
+ if (ppl == NULL) {
+ if (inaddr == NULL)
+ vmem_free(vmp, addr, size);
+ page_unresv(npages);
+ return (NULL);
+ }
+
+ /*
+ * Under certain conditions, we need to let the HAT layer know
+ * that it cannot safely allocate memory. Allocations from
+ * the hat_memload vmem arena always need this, to prevent
+ * infinite recursion.
+ *
+ * In addition, the x86 hat cannot safely do memory
+ * allocations while in vmem_populate(), because there
+ * is no simple bound on its usage.
+ */
+ if (vmflag & VM_MEMLOAD)
+ allocflag = HAT_NO_KALLOC;
+#if defined(__x86)
+ else if (vmem_is_populator())
+ allocflag = HAT_NO_KALLOC;
+#endif
+ else
+ allocflag = 0;
+
+ while (ppl != NULL) {
+ page_t *pp = ppl;
+ page_sub(&ppl, pp);
+ ASSERT(page_iolock_assert(pp));
+ ASSERT(PAGE_EXCL(pp));
+ page_io_unlock(pp);
+ hat_memload(kas.a_hat, (caddr_t)(uintptr_t)pp->p_offset, pp,
+ (PROT_ALL & ~PROT_USER) | HAT_NOSYNC | attr,
+ HAT_LOAD_LOCK | allocflag);
+ pp->p_lckcnt = 1;
+#if defined(__x86)
+ page_downgrade(pp);
+#else
+ if (vmflag & SEGKMEM_SHARELOCKED)
+ page_downgrade(pp);
+ else
+ page_unlock(pp);
+#endif
+ }
+
+ return (addr);
+}
+
+void *
+segkmem_alloc(vmem_t *vmp, size_t size, int vmflag)
+{
+ void *addr;
+ segkmem_gc_list_t *gcp, **prev_gcpp;
+
+ if (kvseg.s_base == NULL) {
+#ifndef __sparc
+ if (bootops->bsys_alloc == NULL)
+ halt("Memory allocation between bop_alloc() and "
+ "kmem_alloc().\n");
+#endif
+
+ /*
+ * There's not a lot of memory to go around during boot,
+ * so recycle it if we can.
+ */
+ for (prev_gcpp = &segkmem_gc_list; (gcp = *prev_gcpp) != NULL;
+ prev_gcpp = &gcp->gc_next) {
+ if (gcp->gc_arena == vmp && gcp->gc_size == size) {
+ *prev_gcpp = gcp->gc_next;
+ return (gcp);
+ }
+ }
+
+ addr = vmem_alloc(vmp, size, vmflag | VM_PANIC);
+ if (boot_alloc(addr, size, BO_NO_ALIGN) != addr)
+ panic("segkmem_alloc: boot_alloc failed");
+ return (addr);
+ }
+ return (segkmem_xalloc(vmp, NULL, size, vmflag, 0,
+ segkmem_page_create, NULL));
+}
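+
+/*
+ * Illustrative sketch (the arena name and sizes are hypothetical): once
+ * kvseg is up, a subsystem can carve mapped kernel memory out of the heap
+ * by creating a vmem arena that imports from heap_arena through the same
+ * segkmem_alloc()/segkmem_free() pair used for the arenas wired up in
+ * kernelheap_init() above.
+ *
+ *	vmem_t *example_arena = vmem_create("example", NULL, 0, PAGESIZE,
+ *	    segkmem_alloc, segkmem_free, heap_arena, 0, VM_SLEEP);
+ *	void *p = vmem_alloc(example_arena, 4 * PAGESIZE, VM_SLEEP);
+ *	...
+ *	vmem_free(example_arena, p, 4 * PAGESIZE);
+ */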
+
+/*
+ * Any changes to this routine must also be carried over to
+ * devmap_free_pages() in the seg_dev driver. This is because
+ * we currently don't have a special kernel segment for non-paged
+ * kernel memory that is exported by drivers to user space.
+ */
+void
+segkmem_free(vmem_t *vmp, void *inaddr, size_t size)
+{
+ page_t *pp;
+ caddr_t addr = inaddr;
+ caddr_t eaddr;
+ pgcnt_t npages = btopr(size);
+
+ ASSERT(((uintptr_t)addr & PAGEOFFSET) == 0);
+
+ if (kvseg.s_base == NULL) {
+ segkmem_gc_list_t *gc = inaddr;
+ gc->gc_arena = vmp;
+ gc->gc_size = size;
+ gc->gc_next = segkmem_gc_list;
+ segkmem_gc_list = gc;
+ return;
+ }
+
+ hat_unload(kas.a_hat, addr, size, HAT_UNLOAD_UNLOCK);
+
+ for (eaddr = addr + size; addr < eaddr; addr += PAGESIZE) {
+#if defined(__x86)
+ pp = page_find(&kvp, (u_offset_t)(uintptr_t)addr);
+ if (pp == NULL)
+ panic("segkmem_free: page not found");
+ if (!page_tryupgrade(pp)) {
+ /*
+ * Some other thread has a sharelock. Wait for
+ * it to drop the lock so we can free this page.
+ */
+ page_unlock(pp);
+ pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)addr,
+ SE_EXCL);
+ }
+#else
+ pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)addr, SE_EXCL);
+#endif
+ if (pp == NULL)
+ panic("segkmem_free: page not found");
+ /* Clear p_lckcnt so page_destroy() doesn't update availrmem */
+ pp->p_lckcnt = 0;
+ page_destroy(pp, 0);
+ }
+ page_unresv(npages);
+
+ if (vmp != NULL)
+ vmem_free(vmp, inaddr, size);
+}
+
+void
+segkmem_gc(void)
+{
+ ASSERT(kvseg.s_base != NULL);
+ while (segkmem_gc_list != NULL) {
+ segkmem_gc_list_t *gc = segkmem_gc_list;
+ segkmem_gc_list = gc->gc_next;
+ segkmem_free(gc->gc_arena, gc, gc->gc_size);
+ }
+}
+
+/*
+ * Legacy entry points from here to end of file.
+ */
+void
+segkmem_mapin(struct seg *seg, void *addr, size_t size, uint_t vprot,
+ pfn_t pfn, uint_t flags)
+{
+ hat_unload(seg->s_as->a_hat, addr, size, HAT_UNLOAD_UNLOCK);
+ hat_devload(seg->s_as->a_hat, addr, size, pfn, vprot,
+ flags | HAT_LOAD_LOCK);
+}
+
+void
+segkmem_mapout(struct seg *seg, void *addr, size_t size)
+{
+ hat_unload(seg->s_as->a_hat, addr, size, HAT_UNLOAD_UNLOCK);
+}
+
+void *
+kmem_getpages(pgcnt_t npages, int kmflag)
+{
+ return (kmem_alloc(ptob(npages), kmflag));
+}
+
+void
+kmem_freepages(void *addr, pgcnt_t npages)
+{
+ kmem_free(addr, ptob(npages));
+}
+
+/*
+ * segkmem_page_create_large() allocates a large page to be used for the kmem
+ * caches. If kpr is enabled, we ask for a relocatable page unless requested
+ * otherwise. If kpr is disabled, we have to ask for a non-relocatable page.
+ */
+static page_t *
+segkmem_page_create_large(void *addr, size_t size, int vmflag, void *arg)
+{
+ int pgflags;
+
+ pgflags = PG_EXCL;
+
+ if (segkmem_reloc == 0 || (vmflag & VM_NORELOC))
+ pgflags |= PG_NORELOC;
+ if (!(vmflag & VM_NOSLEEP))
+ pgflags |= PG_WAIT;
+ if (vmflag & VM_PUSHPAGE)
+ pgflags |= PG_PUSHPAGE;
+
+ return (page_create_va_large(&kvp, (u_offset_t)(uintptr_t)addr, size,
+ pgflags, &kvseg, addr, arg));
+}
+
+/*
+ * Allocate a large page to back the virtual address range
+ * [addr, addr + size). If addr is NULL, allocate the virtual address
+ * space as well.
+ */
+static void *
+segkmem_xalloc_lp(vmem_t *vmp, void *inaddr, size_t size, int vmflag,
+ uint_t attr, page_t *(*page_create_func)(void *, size_t, int, void *),
+ void *pcarg)
+{
+ caddr_t addr = inaddr, pa;
+ size_t lpsize = segkmem_lpsize;
+ pgcnt_t npages = btopr(size);
+ pgcnt_t nbpages = btop(lpsize);
+ pgcnt_t nlpages = size >> segkmem_lpshift;
+ size_t ppasize = nbpages * sizeof (page_t *);
+ page_t *pp, *rootpp, **ppa, *pplist = NULL;
+ int i;
+
+ if (page_resv(npages, vmflag & VM_KMFLAGS) == 0) {
+ return (NULL);
+ }
+
+ /*
+	 * Allocate the array we need for hat_memload_array().
+	 * We use a separate arena to avoid recursion.
+	 * We will not need this array once hat_memload_array() learns to
+	 * handle pp++.
+ */
+ if ((ppa = vmem_alloc(segkmem_ppa_arena, ppasize, vmflag)) == NULL) {
+ goto fail_array_alloc;
+ }
+
+ if (inaddr == NULL && (addr = vmem_alloc(vmp, size, vmflag)) == NULL)
+ goto fail_vmem_alloc;
+
+ ASSERT(((uintptr_t)addr & (lpsize - 1)) == 0);
+
+ /* create all the pages */
+ for (pa = addr, i = 0; i < nlpages; i++, pa += lpsize) {
+ if ((pp = page_create_func(pa, lpsize, vmflag, pcarg)) == NULL)
+ goto fail_page_create;
+ page_list_concat(&pplist, &pp);
+ }
+
+	/* at this point we have all the resources to complete the request */
+ while ((rootpp = pplist) != NULL) {
+ for (i = 0; i < nbpages; i++) {
+ ASSERT(pplist != NULL);
+ pp = pplist;
+ page_sub(&pplist, pp);
+ ASSERT(page_iolock_assert(pp));
+ page_io_unlock(pp);
+ ppa[i] = pp;
+ }
+ /*
+ * Load the locked entry. It's OK to preload the entry into the
+ * TSB since we now support large mappings in the kernel TSB.
+ */
+ hat_memload_array(kas.a_hat,
+ (caddr_t)(uintptr_t)rootpp->p_offset, lpsize,
+ ppa, (PROT_ALL & ~PROT_USER) | HAT_NOSYNC | attr,
+ HAT_LOAD_LOCK);
+
+ for (--i; i >= 0; --i) {
+ ppa[i]->p_lckcnt = 1;
+ page_unlock(ppa[i]);
+ }
+ }
+
+ vmem_free(segkmem_ppa_arena, ppa, ppasize);
+ return (addr);
+
+fail_page_create:
+ while ((rootpp = pplist) != NULL) {
+ for (i = 0, pp = pplist; i < nbpages; i++, pp = pplist) {
+ ASSERT(pp != NULL);
+ page_sub(&pplist, pp);
+ ASSERT(page_iolock_assert(pp));
+ page_io_unlock(pp);
+ }
+ page_destroy_pages(rootpp);
+ }
+
+ if (inaddr == NULL)
+ vmem_free(vmp, addr, size);
+
+fail_vmem_alloc:
+ vmem_free(segkmem_ppa_arena, ppa, ppasize);
+
+fail_array_alloc:
+ page_unresv(npages);
+
+ return (NULL);
+}
+
+static void
+segkmem_free_one_lp(caddr_t addr, size_t size)
+{
+ page_t *pp, *rootpp = NULL;
+ pgcnt_t pgs_left = btopr(size);
+
+ ASSERT(size == segkmem_lpsize);
+
+ hat_unload(kas.a_hat, addr, size, HAT_UNLOAD_UNLOCK);
+
+ for (; pgs_left > 0; addr += PAGESIZE, pgs_left--) {
+ pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)addr, SE_EXCL);
+ if (pp == NULL)
+ panic("segkmem_free_one_lp: page not found");
+ ASSERT(PAGE_EXCL(pp));
+ pp->p_lckcnt = 0;
+ if (rootpp == NULL)
+ rootpp = pp;
+ }
+ ASSERT(rootpp != NULL);
+ page_destroy_pages(rootpp);
+
+ /* page_unresv() is done by the caller */
+}
+
+/*
+ * This function is called to import new spans into vmem arenas such as
+ * kmem_default_arena and kmem_oversize_arena. It first tries to import
+ * spans from the large page arena, kmem_lp_arena. To do this it may have
+ * to round the requested size up to the kmem_lp_arena quantum. If it
+ * cannot satisfy the upgraded request, it falls back to regular
+ * segkmem_alloc(), which satisfies the request by importing from the
+ * "*vmp" arena.
+ */
+void *
+segkmem_alloc_lp(vmem_t *vmp, size_t *sizep, int vmflag)
+{
+ size_t size;
+ kthread_t *t = curthread;
+ segkmem_lpcb_t *lpcb = &segkmem_lpcb;
+
+ ASSERT(sizep != NULL);
+
+ size = *sizep;
+
+ if (lpcb->lp_uselp && !(t->t_flag & T_PANIC) &&
+ !(vmflag & SEGKMEM_SHARELOCKED)) {
+
+ size_t kmemlp_qnt = segkmem_kmemlp_quantum;
+ size_t asize = P2ROUNDUP(size, kmemlp_qnt);
+ void *addr = NULL;
+ ulong_t *lpthrtp = &lpcb->lp_throttle;
+ ulong_t lpthrt = *lpthrtp;
+ int dowakeup = 0;
+ int doalloc = 1;
+
+ ASSERT(kmem_lp_arena != NULL);
+ ASSERT(asize >= size);
+
+ if (lpthrt != 0) {
+ /* try to update the throttle value */
+ lpthrt = atomic_add_long_nv(lpthrtp, 1);
+ if (lpthrt >= segkmem_lpthrottle_max) {
+ lpthrt = atomic_cas_ulong(lpthrtp, lpthrt,
+ segkmem_lpthrottle_max / 4);
+ }
+
+ /*
+			 * Once we get above the throttle start, back off
+			 * exponentially on trying large pages and on reaping.
+ */
+ if (lpthrt > segkmem_lpthrottle_start &&
+ (lpthrt & (lpthrt - 1))) {
+ atomic_add_64(&lpcb->allocs_throttled, 1L);
+ lpthrt--;
+ if ((lpthrt & (lpthrt - 1)) == 0)
+ kmem_reap();
+ return (segkmem_alloc(vmp, size, vmflag));
+ }
+ }
+
+ if (!(vmflag & VM_NOSLEEP) &&
+ segkmem_heaplp_quantum >= (8 * kmemlp_qnt) &&
+ vmem_size(kmem_lp_arena, VMEM_FREE) <= kmemlp_qnt &&
+ asize < (segkmem_heaplp_quantum - kmemlp_qnt)) {
+
+ /*
+			 * We are low on free memory in kmem_lp_arena, so we
+			 * let only one thread allocate a heap_lp quantum-sized
+			 * chunk that everybody is going to share.
+ */
+ mutex_enter(&lpcb->lp_lock);
+
+ if (lpcb->lp_wait) {
+
+ /* we are not the first one - wait */
+ cv_wait(&lpcb->lp_cv, &lpcb->lp_lock);
+ if (vmem_size(kmem_lp_arena, VMEM_FREE) <
+ kmemlp_qnt) {
+ doalloc = 0;
+ }
+ } else if (vmem_size(kmem_lp_arena, VMEM_FREE) <=
+ kmemlp_qnt) {
+
+ /*
+ * we are the first one, make sure we import
+ * a large page
+ */
+ if (asize == kmemlp_qnt)
+ asize += kmemlp_qnt;
+ dowakeup = 1;
+ lpcb->lp_wait = 1;
+ }
+
+ mutex_exit(&lpcb->lp_lock);
+ }
+
+ /*
+ * VM_ABORT flag prevents sleeps in vmem_xalloc when
+ * large pages are not available. In that case this allocation
+ * attempt will fail and we will retry allocation with small
+ * pages. We also do not want to panic if this allocation fails
+ * because we are going to retry.
+ */
+ if (doalloc) {
+ addr = vmem_alloc(kmem_lp_arena, asize,
+ (vmflag | VM_ABORT) & ~VM_PANIC);
+
+ if (dowakeup) {
+ mutex_enter(&lpcb->lp_lock);
+ ASSERT(lpcb->lp_wait != 0);
+ lpcb->lp_wait = 0;
+ cv_broadcast(&lpcb->lp_cv);
+ mutex_exit(&lpcb->lp_lock);
+ }
+ }
+
+ if (addr != NULL) {
+ *sizep = asize;
+ *lpthrtp = 0;
+ return (addr);
+ }
+
+ if (vmflag & VM_NOSLEEP)
+ atomic_add_64(&lpcb->nosleep_allocs_failed, 1L);
+ else
+ atomic_add_64(&lpcb->sleep_allocs_failed, 1L);
+ atomic_add_64(&lpcb->alloc_bytes_failed, size);
+
+ /* if large page throttling is not started yet do it */
+ if (segkmem_use_lpthrottle && lpthrt == 0) {
+ lpthrt = atomic_cas_ulong(lpthrtp, lpthrt, 1);
+ }
+ }
+ return (segkmem_alloc(vmp, size, vmflag));
+}
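+
+/*
+ * Illustrative sketch (a standalone model, not from the original source) of
+ * the exponential backoff used above: once lp_throttle exceeds
+ * segkmem_lpthrottle_start, the code falls back to small pages unless the
+ * throttle count is a power of two, which it tests with the usual
+ * (x & (x - 1)) == 0 idiom.  A minimal model of that test:
+ *
+ *	static int
+ *	is_power_of_two(ulong_t x)
+ *	{
+ *		return (x != 0 && (x & (x - 1)) == 0);
+ *	}
+ *
+ * so large page retries happen at throttle counts 1, 2, 4, 8, ... and
+ * become progressively rarer as failures accumulate.
+ */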
+
+void
+segkmem_free_lp(vmem_t *vmp, void *inaddr, size_t size)
+{
+ if (kmem_lp_arena == NULL || !IS_KMEM_VA_LARGEPAGE((caddr_t)inaddr)) {
+ segkmem_free(vmp, inaddr, size);
+ } else {
+ vmem_free(kmem_lp_arena, inaddr, size);
+ }
+}
+
+/*
+ * segkmem_alloc_lpi() imports virtual memory from the large page heap
+ * arena into the kmem_lp arena. In the process it maps the imported
+ * segment with large pages.
+ */
+static void *
+segkmem_alloc_lpi(vmem_t *vmp, size_t size, int vmflag)
+{
+ segkmem_lpcb_t *lpcb = &segkmem_lpcb;
+ void *addr;
+
+ ASSERT(size != 0);
+ ASSERT(vmp == heap_lp_arena);
+
+	/* do not allow the large page heap to grow beyond its limits */
+ if (vmem_size(vmp, VMEM_ALLOC) >= segkmem_kmemlp_max) {
+ atomic_add_64(&lpcb->allocs_limited, 1);
+ return (NULL);
+ }
+
+ addr = segkmem_xalloc_lp(vmp, NULL, size, vmflag, 0,
+ segkmem_page_create_large, NULL);
+ return (addr);
+}
+
+/*
+ * segkmem_free_lpi() returns virtual memory from the kmem_lp arena back
+ * to the large page heap arena. Before doing this it unmaps the segment
+ * and frees the large pages used to map it.
+ */
+static void
+segkmem_free_lpi(vmem_t *vmp, void *inaddr, size_t size)
+{
+ pgcnt_t nlpages = size >> segkmem_lpshift;
+ size_t lpsize = segkmem_lpsize;
+ caddr_t addr = inaddr;
+ pgcnt_t npages = btopr(size);
+ int i;
+
+ ASSERT(vmp == heap_lp_arena);
+ ASSERT(IS_KMEM_VA_LARGEPAGE(addr));
+ ASSERT(((uintptr_t)inaddr & (lpsize - 1)) == 0);
+
+ for (i = 0; i < nlpages; i++) {
+ segkmem_free_one_lp(addr, lpsize);
+ addr += lpsize;
+ }
+
+ page_unresv(npages);
+
+ vmem_free(vmp, inaddr, size);
+}
+
+/*
+ * This function is called at system boot time by kmem_init(), right after
+ * the /etc/system file has been read. Based on the hardware configuration
+ * and /etc/system settings, it determines whether the system is going to
+ * use large pages. The initialization necessary to actually start using
+ * large pages happens later in the boot process, after
+ * segkmem_heap_lp_init() is called.
+ */
+int
+segkmem_lpsetup()
+{
+ int use_large_pages = 0;
+
+#ifdef __sparc
+
+ size_t memtotal = physmem * PAGESIZE;
+
+ if (heap_lp_base == NULL) {
+ segkmem_lpsize = PAGESIZE;
+ return (0);
+ }
+
+ /* get a platform dependent value of large page size for kernel heap */
+ segkmem_lpsize = get_segkmem_lpsize(segkmem_lpsize);
+
+ if (segkmem_lpsize <= PAGESIZE) {
+ /*
+		 * Put the virtual space reserved for the large page kernel
+		 * heap back into the regular heap.
+ */
+ vmem_xfree(heap_arena, heap_lp_base,
+ heap_lp_end - heap_lp_base);
+ heap_lp_base = NULL;
+ heap_lp_end = NULL;
+ segkmem_lpsize = PAGESIZE;
+ return (0);
+ }
+
+ /* set heap_lp quantum if necessary */
+ if (segkmem_heaplp_quantum == 0 ||
+ (segkmem_heaplp_quantum & (segkmem_heaplp_quantum - 1)) ||
+ P2PHASE(segkmem_heaplp_quantum, segkmem_lpsize)) {
+ segkmem_heaplp_quantum = segkmem_lpsize;
+ }
+
+ /* set kmem_lp quantum if necessary */
+ if (segkmem_kmemlp_quantum == 0 ||
+ (segkmem_kmemlp_quantum & (segkmem_kmemlp_quantum - 1)) ||
+ segkmem_kmemlp_quantum > segkmem_heaplp_quantum) {
+ segkmem_kmemlp_quantum = segkmem_heaplp_quantum;
+ }
+
+ /* set total amount of memory allowed for large page kernel heap */
+ if (segkmem_kmemlp_max == 0) {
+ if (segkmem_kmemlp_pcnt == 0 || segkmem_kmemlp_pcnt > 100)
+ segkmem_kmemlp_pcnt = 25;
+ segkmem_kmemlp_max = (memtotal * 100) / segkmem_kmemlp_pcnt;
+ }
+ segkmem_kmemlp_max = P2ROUNDUP(segkmem_kmemlp_max,
+ segkmem_heaplp_quantum);
+
+	/* fix lp kmem preallocation request if necessary */
+ if (segkmem_kmemlp_min) {
+ segkmem_kmemlp_min = P2ROUNDUP(segkmem_kmemlp_min,
+ segkmem_heaplp_quantum);
+ if (segkmem_kmemlp_min > segkmem_kmemlp_max)
+ segkmem_kmemlp_min = segkmem_kmemlp_max;
+ }
+
+ use_large_pages = 1;
+ segkmem_lpshift = page_get_shift(page_szc(segkmem_lpsize));
+
+#endif
+ return (use_large_pages);
+}
+
+#ifdef __sparc
+
+
+static void *
+segkmem_alloc_ppa(vmem_t *vmp, size_t size, int vmflag)
+{
+ size_t ppaquantum = btopr(segkmem_lpsize) * sizeof (page_t *);
+ void *addr;
+
+ if (ppaquantum <= PAGESIZE)
+ return (segkmem_alloc(vmp, size, vmflag));
+
+ ASSERT((size & (ppaquantum - 1)) == 0);
+
+ addr = vmem_xalloc(vmp, size, ppaquantum, 0, 0, NULL, NULL, vmflag);
+ if (addr != NULL && segkmem_xalloc(vmp, addr, size, vmflag, 0,
+ segkmem_page_create, NULL) == NULL) {
+ vmem_xfree(vmp, addr, size);
+ addr = NULL;
+ }
+
+ return (addr);
+}
+
+static void
+segkmem_free_ppa(vmem_t *vmp, void *addr, size_t size)
+{
+ size_t ppaquantum = btopr(segkmem_lpsize) * sizeof (page_t *);
+
+ ASSERT(addr != NULL);
+
+ if (ppaquantum <= PAGESIZE) {
+ segkmem_free(vmp, addr, size);
+ } else {
+ segkmem_free(NULL, addr, size);
+ vmem_xfree(vmp, addr, size);
+ }
+}
+
+void
+segkmem_heap_lp_init()
+{
+ segkmem_lpcb_t *lpcb = &segkmem_lpcb;
+ size_t heap_lp_size = heap_lp_end - heap_lp_base;
+ size_t lpsize = segkmem_lpsize;
+ size_t ppaquantum;
+ void *addr;
+
+ if (segkmem_lpsize <= PAGESIZE) {
+ ASSERT(heap_lp_base == NULL);
+ ASSERT(heap_lp_end == NULL);
+ return;
+ }
+
+ ASSERT(segkmem_heaplp_quantum >= lpsize);
+ ASSERT((segkmem_heaplp_quantum & (lpsize - 1)) == 0);
+ ASSERT(lpcb->lp_uselp == 0);
+ ASSERT(heap_lp_base != NULL);
+ ASSERT(heap_lp_end != NULL);
+ ASSERT(heap_lp_base < heap_lp_end);
+ ASSERT(heap_lp_arena == NULL);
+ ASSERT(((uintptr_t)heap_lp_base & (lpsize - 1)) == 0);
+ ASSERT(((uintptr_t)heap_lp_end & (lpsize - 1)) == 0);
+
+ /* create large page heap arena */
+ heap_lp_arena = vmem_create("heap_lp", heap_lp_base, heap_lp_size,
+ segkmem_heaplp_quantum, NULL, NULL, NULL, 0, VM_SLEEP);
+
+ ASSERT(heap_lp_arena != NULL);
+
+ /* This arena caches memory already mapped by large pages */
+ kmem_lp_arena = vmem_create("kmem_lp", NULL, 0, segkmem_kmemlp_quantum,
+ segkmem_alloc_lpi, segkmem_free_lpi, heap_lp_arena, 0, VM_SLEEP);
+
+ ASSERT(kmem_lp_arena != NULL);
+
+ mutex_init(&lpcb->lp_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&lpcb->lp_cv, NULL, CV_DEFAULT, NULL);
+
+ /*
+	 * This arena is used for the array of page_t pointers necessary
+	 * to call hat_memload_array().
+ */
+ ppaquantum = btopr(lpsize) * sizeof (page_t *);
+ segkmem_ppa_arena = vmem_create("segkmem_ppa", NULL, 0, ppaquantum,
+ segkmem_alloc_ppa, segkmem_free_ppa, heap_arena, ppaquantum,
+ VM_SLEEP);
+
+ ASSERT(segkmem_ppa_arena != NULL);
+
+	/* preallocate some memory for the lp kernel heap */
+ if (segkmem_kmemlp_min) {
+
+ ASSERT(P2PHASE(segkmem_kmemlp_min,
+ segkmem_heaplp_quantum) == 0);
+
+ if ((addr = segkmem_alloc_lpi(heap_lp_arena,
+ segkmem_kmemlp_min, VM_SLEEP)) != NULL) {
+
+ addr = vmem_add(kmem_lp_arena, addr,
+ segkmem_kmemlp_min, VM_SLEEP);
+ ASSERT(addr != NULL);
+ }
+ }
+
+ lpcb->lp_uselp = 1;
+}
+
+#endif
diff --git a/usr/src/uts/common/vm/seg_kmem.h b/usr/src/uts/common/vm/seg_kmem.h
new file mode 100644
index 0000000000..a1fcf43643
--- /dev/null
+++ b/usr/src/uts/common/vm/seg_kmem.h
@@ -0,0 +1,129 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _VM_SEG_KMEM_H
+#define _VM_SEG_KMEM_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+#include <sys/vnode.h>
+#include <sys/vmem.h>
+#include <vm/as.h>
+#include <vm/seg.h>
+#include <vm/page.h>
+
+/*
+ * VM - Kernel Segment Driver
+ */
+
+#if defined(_KERNEL)
+
+extern char *kernelheap; /* start of primary kernel heap */
+extern char *ekernelheap; /* end of primary kernel heap */
+extern char *heap_lp_base; /* start of kernel large page heap arena */
+extern char *heap_lp_end; /* end of kernel large page heap arena */
+extern struct seg kvseg; /* primary kernel heap segment */
+extern struct seg kvseg_core; /* "core" kernel heap segment */
+extern vmem_t *heap_arena; /* primary kernel heap arena */
+extern vmem_t *hat_memload_arena; /* HAT translation arena */
+extern struct seg kvseg32; /* 32-bit kernel heap segment */
+extern vmem_t *heap32_arena; /* 32-bit kernel heap arena */
+extern vmem_t *heaptext_arena; /* kernel text arena, from heap */
+extern struct ctx *kctx; /* kernel context */
+extern struct as kas; /* kernel address space */
+extern struct vnode kvp; /* vnode for all segkmem pages */
+extern int segkmem_reloc; /* enable/disable segkmem relocatable pages */
+extern vmem_t *static_arena; /* arena for caches to import static memory */
+extern vmem_t *static_alloc_arena; /* arena for allocating static memory */
+
+extern int segkmem_create(struct seg *);
+extern page_t *segkmem_page_create(void *, size_t, int, void *);
+extern void *segkmem_xalloc(vmem_t *, void *, size_t, int, uint_t,
+ page_t *(*page_create_func)(void *, size_t, int, void *), void *);
+extern void *segkmem_alloc(vmem_t *, size_t, int);
+extern void *segkmem_alloc_permanent(vmem_t *, size_t, int);
+extern void segkmem_free(vmem_t *, void *, size_t);
+
+extern void *boot_alloc(void *, size_t, uint_t);
+extern void boot_mapin(caddr_t addr, size_t size);
+extern void kernelheap_init(void *, void *, char *, void *, void *);
+extern void kernelheap_extend(void *, void *);
+extern void segkmem_gc(void);
+
+/*
+ * Flags for segkmem_xalloc().
+ *
+ * SEGKMEM_SHARELOCKED requests that pages be returned locked SE_SHARED
+ * rather than unlocked, which is now the default. Note that
+ * memory returned by SEGKMEM_SHARELOCKED cannot be freed by segkmem_free().
+ * This is a hack for seg_dev that should be cleaned up in the future.
+ */
+#define SEGKMEM_SHARELOCKED 0x20000
+
+/*
+ * Large page for kmem caches support
+ */
+typedef struct segkmem_lpcb {
+ kmutex_t lp_lock;
+ kcondvar_t lp_cv;
+ uint_t lp_wait;
+ uint_t lp_uselp;
+ ulong_t lp_throttle;
+
+ /* stats */
+ uint64_t sleep_allocs_failed;
+ uint64_t nosleep_allocs_failed;
+ uint64_t allocs_throttled;
+ uint64_t allocs_limited;
+ uint64_t alloc_bytes_failed;
+} segkmem_lpcb_t;
+
+extern void *segkmem_alloc_lp(vmem_t *, size_t *, int);
+extern void segkmem_free_lp(vmem_t *, void *, size_t);
+extern int segkmem_lpsetup();
+extern void segkmem_heap_lp_init(void);
+
+extern size_t segkmem_lpsize;
+extern size_t segkmem_heaplp_quantum;
+extern size_t segkmem_kmemlp_max;
+
+#define SEGKMEM_USE_LARGEPAGES (segkmem_lpsize > PAGESIZE)
+
+#define IS_KMEM_VA_LARGEPAGE(vaddr) \
+ (((vaddr) >= heap_lp_base) && ((vaddr) < heap_lp_end))
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _VM_SEG_KMEM_H */
diff --git a/usr/src/uts/common/vm/seg_kp.c b/usr/src/uts/common/vm/seg_kp.c
new file mode 100644
index 0000000000..9c7b0710f3
--- /dev/null
+++ b/usr/src/uts/common/vm/seg_kp.c
@@ -0,0 +1,1444 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * segkp is a segment driver that administers the allocation and deallocation
+ * of pageable variable size chunks of kernel virtual address space. Each
+ * allocated resource is page-aligned.
+ *
+ * The user may specify whether the resource should be initialized to 0,
+ * include a redzone, or locked in memory.
+ */
+
+#include <sys/types.h>
+#include <sys/t_lock.h>
+#include <sys/thread.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/buf.h>
+#include <sys/mman.h>
+#include <sys/vnode.h>
+#include <sys/cmn_err.h>
+#include <sys/swap.h>
+#include <sys/tuneable.h>
+#include <sys/kmem.h>
+#include <sys/vmem.h>
+#include <sys/cred.h>
+#include <sys/dumphdr.h>
+#include <sys/debug.h>
+#include <sys/vtrace.h>
+#include <sys/stack.h>
+#include <sys/atomic.h>
+#include <sys/archsystm.h>
+#include <sys/lgrp.h>
+
+#include <vm/as.h>
+#include <vm/seg.h>
+#include <vm/seg_kp.h>
+#include <vm/seg_kmem.h>
+#include <vm/anon.h>
+#include <vm/page.h>
+#include <vm/hat.h>
+#include <sys/bitmap.h>
+
+/*
+ * Private seg op routines
+ */
+static void segkp_badop(void);
+static void segkp_dump(struct seg *seg);
+static int segkp_checkprot(struct seg *seg, caddr_t addr, size_t len,
+ uint_t prot);
+static int segkp_kluster(struct seg *seg, caddr_t addr, ssize_t delta);
+static int segkp_pagelock(struct seg *seg, caddr_t addr, size_t len,
+ struct page ***page, enum lock_type type,
+ enum seg_rw rw);
+static void segkp_insert(struct seg *seg, struct segkp_data *kpd);
+static void segkp_delete(struct seg *seg, struct segkp_data *kpd);
+static caddr_t segkp_get_internal(struct seg *seg, size_t len, uint_t flags,
+ struct segkp_data **tkpd, struct anon_map *amp);
+static void segkp_release_internal(struct seg *seg,
+ struct segkp_data *kpd, size_t len);
+static int segkp_unlock(struct hat *hat, struct seg *seg, caddr_t vaddr,
+ size_t len, struct segkp_data *kpd, uint_t flags);
+static int segkp_load(struct hat *hat, struct seg *seg, caddr_t vaddr,
+ size_t len, struct segkp_data *kpd, uint_t flags);
+static struct segkp_data *segkp_find(struct seg *seg, caddr_t vaddr);
+static int segkp_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp);
+static lgrp_mem_policy_info_t *segkp_getpolicy(struct seg *seg,
+ caddr_t addr);
+
+/*
+ * Lock used to protect the hash table(s) and caches.
+ */
+static kmutex_t segkp_lock;
+
+/*
+ * The segkp caches
+ */
+static struct segkp_cache segkp_cache[SEGKP_MAX_CACHE];
+
+#define SEGKP_BADOP(t) (t(*)())segkp_badop
+
+/*
+ * When there are fewer than red_minavail bytes left on the stack,
+ * segkp_map_red() will map in the redzone (if called). 5000 seems
+ * to work reasonably well...
+ */
+long red_minavail = 5000;
+
+/*
+ * Will be set to 1 for 32-bit x86 systems only, in startup.c.
+ */
+int segkp_fromheap = 0;
+ulong_t *segkp_bitmap;
+
+/*
+ * If segkp_map_red() is called with the redzone already mapped and
+ * with less than RED_DEEP_THRESHOLD bytes available on the stack,
+ * then the stack situation has become quite serious; if much more stack
+ * is consumed, we have the potential of scrogging the next thread/LWP
+ * structure. To help debug the "can't happen" panics which may
+ * result from this condition, we record lbolt and the calling thread
+ * in red_deep_lbolt and red_deep_thread respectively.
+ */
+#define RED_DEEP_THRESHOLD 2000
+
+clock_t red_deep_lbolt;
+kthread_t *red_deep_thread;
+
+uint32_t red_nmapped;
+uint32_t red_closest = UINT_MAX;
+uint32_t red_ndoubles;
+
+pgcnt_t anon_segkp_pages_locked; /* See vm/anon.h */
+
+static struct seg_ops segkp_ops = {
+ SEGKP_BADOP(int), /* dup */
+ SEGKP_BADOP(int), /* unmap */
+ SEGKP_BADOP(void), /* free */
+ segkp_fault,
+ SEGKP_BADOP(faultcode_t), /* faulta */
+ SEGKP_BADOP(int), /* setprot */
+ segkp_checkprot,
+ segkp_kluster,
+ SEGKP_BADOP(size_t), /* swapout */
+ SEGKP_BADOP(int), /* sync */
+ SEGKP_BADOP(size_t), /* incore */
+ SEGKP_BADOP(int), /* lockop */
+ SEGKP_BADOP(int), /* getprot */
+ SEGKP_BADOP(u_offset_t), /* getoffset */
+ SEGKP_BADOP(int), /* gettype */
+ SEGKP_BADOP(int), /* getvp */
+ SEGKP_BADOP(int), /* advise */
+ segkp_dump, /* dump */
+ segkp_pagelock, /* pagelock */
+ SEGKP_BADOP(int), /* setpgsz */
+ segkp_getmemid, /* getmemid */
+ segkp_getpolicy, /* getpolicy */
+};
+
+
+static void
+segkp_badop(void)
+{
+ panic("segkp_badop");
+ /*NOTREACHED*/
+}
+
+static void segkpinit_mem_config(struct seg *);
+
+static uint32_t segkp_indel;
+
+/*
+ * Allocate the segment-specific private data struct and fill it in
+ * with the per-kp-segment mutex, anon pointer array, and hash table.
+ */
+int
+segkp_create(struct seg *seg)
+{
+ struct segkp_segdata *kpsd;
+ size_t np;
+
+ ASSERT(seg != NULL && seg->s_as == &kas);
+ ASSERT(RW_WRITE_HELD(&seg->s_as->a_lock));
+
+ if (seg->s_size & PAGEOFFSET) {
+ panic("Bad segkp size");
+ /*NOTREACHED*/
+ }
+
+ kpsd = kmem_zalloc(sizeof (struct segkp_segdata), KM_SLEEP);
+
+ /*
+ * Allocate the virtual memory for segkp and initialize it
+ */
+ if (segkp_fromheap) {
+ np = btop(kvseg.s_size);
+ segkp_bitmap = kmem_zalloc(BT_SIZEOFMAP(np), KM_SLEEP);
+ kpsd->kpsd_arena = vmem_create("segkp", NULL, 0, PAGESIZE,
+ vmem_alloc, vmem_free, heap_arena, 5 * PAGESIZE, VM_SLEEP);
+ } else {
+ segkp_bitmap = NULL;
+ np = btop(seg->s_size);
+ kpsd->kpsd_arena = vmem_create("segkp", seg->s_base,
+ seg->s_size, PAGESIZE, NULL, NULL, NULL, 5 * PAGESIZE,
+ VM_SLEEP);
+ }
+
+ kpsd->kpsd_anon = anon_create(np, ANON_SLEEP | ANON_ALLOC_FORCE);
+
+ kpsd->kpsd_hash = kmem_zalloc(SEGKP_HASHSZ * sizeof (struct segkp *),
+ KM_SLEEP);
+ seg->s_data = (void *)kpsd;
+ seg->s_ops = &segkp_ops;
+ segkpinit_mem_config(seg);
+ return (0);
+}
+
+
+/*
+ * Find an unused cache slot ('freelist') and initialize it with the given attributes.
+ */
+void *
+segkp_cache_init(struct seg *seg, int maxsize, size_t len, uint_t flags)
+{
+ int i;
+
+ if ((flags & KPD_NO_ANON) && !(flags & KPD_LOCKED))
+ return ((void *)-1);
+
+ mutex_enter(&segkp_lock);
+ for (i = 0; i < SEGKP_MAX_CACHE; i++) {
+ if (segkp_cache[i].kpf_inuse)
+ continue;
+ segkp_cache[i].kpf_inuse = 1;
+ segkp_cache[i].kpf_max = maxsize;
+ segkp_cache[i].kpf_flags = flags;
+ segkp_cache[i].kpf_seg = seg;
+ segkp_cache[i].kpf_len = len;
+ mutex_exit(&segkp_lock);
+ return ((void *)(uintptr_t)i);
+ }
+ mutex_exit(&segkp_lock);
+ return ((void *)-1);
+}
+
+/*
+ * Free all the cache resources.
+ */
+void
+segkp_cache_free(void)
+{
+ struct segkp_data *kpd;
+ struct seg *seg;
+ int i;
+
+ mutex_enter(&segkp_lock);
+ for (i = 0; i < SEGKP_MAX_CACHE; i++) {
+ if (!segkp_cache[i].kpf_inuse)
+ continue;
+ /*
+ * Disconnect the freelist and process each element
+ */
+ kpd = segkp_cache[i].kpf_list;
+ seg = segkp_cache[i].kpf_seg;
+ segkp_cache[i].kpf_list = NULL;
+ segkp_cache[i].kpf_count = 0;
+ mutex_exit(&segkp_lock);
+
+ while (kpd != NULL) {
+ struct segkp_data *next;
+
+ next = kpd->kp_next;
+ segkp_release_internal(seg, kpd, kpd->kp_len);
+ kpd = next;
+ }
+ mutex_enter(&segkp_lock);
+ }
+ mutex_exit(&segkp_lock);
+}
+
+/*
+ * There are two entry paths into segkp_get_internal(). The first uses a
+ * cookie to access a pool of cached segkp resources. The second does not
+ * use the cache.
+ */
+caddr_t
+segkp_get(struct seg *seg, size_t len, uint_t flags)
+{
+ struct segkp_data *kpd = NULL;
+
+ if (segkp_get_internal(seg, len, flags, &kpd, NULL) != NULL) {
+ kpd->kp_cookie = -1;
+ return (stom(kpd->kp_base, flags));
+ }
+ return (NULL);
+}
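+
+/*
+ * Illustrative sketch (hypothetical caller; the flag combination and size
+ * are invented for the example): allocate a zeroed, pageable, four-page
+ * resource with a redzone from the global segkp segment and release it
+ * when done.  The redzone page is added internally, so the caller asks
+ * only for the usable length.
+ *
+ *	caddr_t va = segkp_get(segkp, 4 * PAGESIZE,
+ *	    KPD_HASREDZONE | KPD_ZERO);
+ *	if (va != NULL) {
+ *		... use the four pages starting at va ...
+ *		segkp_release(segkp, va);
+ *	}
+ */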
+
+/*
+ * Return a 'cached' segkp address
+ */
+caddr_t
+segkp_cache_get(void *cookie)
+{
+ struct segkp_cache *freelist = NULL;
+ struct segkp_data *kpd = NULL;
+ int index = (int)(uintptr_t)cookie;
+ struct seg *seg;
+ size_t len;
+ uint_t flags;
+
+ if (index < 0 || index >= SEGKP_MAX_CACHE)
+ return (NULL);
+ freelist = &segkp_cache[index];
+
+ mutex_enter(&segkp_lock);
+ seg = freelist->kpf_seg;
+ flags = freelist->kpf_flags;
+ if (freelist->kpf_list != NULL) {
+ kpd = freelist->kpf_list;
+ freelist->kpf_list = kpd->kp_next;
+ freelist->kpf_count--;
+ mutex_exit(&segkp_lock);
+ kpd->kp_next = NULL;
+ segkp_insert(seg, kpd);
+ return (stom(kpd->kp_base, flags));
+ }
+ len = freelist->kpf_len;
+ mutex_exit(&segkp_lock);
+ if (segkp_get_internal(seg, len, flags, &kpd, NULL) != NULL) {
+ kpd->kp_cookie = index;
+ return (stom(kpd->kp_base, flags));
+ }
+ return (NULL);
+}
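+
+/*
+ * Illustrative sketch (the count, length, and flags below are hypothetical):
+ * a subsystem that repeatedly needs identical segkp resources can set up a
+ * freelist once and then allocate from it, letting segkp_release() return
+ * entries to the cache instead of tearing them down.
+ *
+ *	void *cookie = segkp_cache_init(segkp, 32, 8 * PAGESIZE,
+ *	    KPD_HASREDZONE | KPD_LOCKED | KPD_NO_ANON);
+ *	caddr_t va = segkp_cache_get(cookie);
+ *	if (va != NULL) {
+ *		... use the resource ...
+ *		segkp_release(segkp, va);
+ *	}
+ */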
+
+caddr_t
+segkp_get_withanonmap(
+ struct seg *seg,
+ size_t len,
+ uint_t flags,
+ struct anon_map *amp)
+{
+ struct segkp_data *kpd = NULL;
+
+ ASSERT(amp != NULL);
+ flags |= KPD_HASAMP;
+ if (segkp_get_internal(seg, len, flags, &kpd, amp) != NULL) {
+ kpd->kp_cookie = -1;
+ return (stom(kpd->kp_base, flags));
+ }
+ return (NULL);
+}
+
+/*
+ * This does the real work of segkp allocation.
+ * Return the base addr to the client. len must be page-aligned. A null
+ * value is returned if there are no more vm resources (e.g. pages, swap).
+ * The len and base recorded in the private data structure include the
+ * redzone and the redzone length (if applicable). If the user requests a
+ * redzone, either the first or the last page is left unmapped, depending
+ * on whether stacks grow toward low or high memory.
+ *
+ * The client may also specify a no-wait flag. If that is set, the request
+ * will choose a non-blocking path when requesting resources.
+ * The default is to make the client wait.
+ */
+static caddr_t
+segkp_get_internal(
+ struct seg *seg,
+ size_t len,
+ uint_t flags,
+ struct segkp_data **tkpd,
+ struct anon_map *amp)
+{
+ struct segkp_segdata *kpsd = (struct segkp_segdata *)seg->s_data;
+ struct segkp_data *kpd;
+ caddr_t vbase = NULL; /* always first virtual, may not be mapped */
+ pgcnt_t np = 0; /* number of pages in the resource */
+ pgcnt_t segkpindex;
+ long i;
+ caddr_t va;
+ pgcnt_t pages = 0;
+ ulong_t anon_idx = 0;
+ int kmflag = (flags & KPD_NOWAIT) ? KM_NOSLEEP : KM_SLEEP;
+ caddr_t s_base = (segkp_fromheap) ? kvseg.s_base : seg->s_base;
+
+ if (len & PAGEOFFSET) {
+ panic("segkp_get: len is not page-aligned");
+ /*NOTREACHED*/
+ }
+
+ ASSERT(((flags & KPD_HASAMP) == 0) == (amp == NULL));
+
+ /* Only allow KPD_NO_ANON if we are going to lock it down */
+ if ((flags & (KPD_LOCKED|KPD_NO_ANON)) == KPD_NO_ANON)
+ return (NULL);
+
+ if ((kpd = kmem_zalloc(sizeof (struct segkp_data), kmflag)) == NULL)
+ return (NULL);
+ /*
+ * Fix up the len to reflect the REDZONE if applicable
+ */
+ if (flags & KPD_HASREDZONE)
+ len += PAGESIZE;
+ np = btop(len);
+
+ vbase = vmem_alloc(SEGKP_VMEM(seg), len, kmflag | VM_BESTFIT);
+ if (vbase == NULL) {
+ kmem_free(kpd, sizeof (struct segkp_data));
+ return (NULL);
+ }
+
+ /* If locking, reserve physical memory */
+ if (flags & KPD_LOCKED) {
+ pages = btop(SEGKP_MAPLEN(len, flags));
+ if (page_resv(pages, kmflag) == 0) {
+ vmem_free(SEGKP_VMEM(seg), vbase, len);
+ kmem_free(kpd, sizeof (struct segkp_data));
+ return (NULL);
+ }
+ if ((flags & KPD_NO_ANON) == 0)
+ atomic_add_long(&anon_segkp_pages_locked, pages);
+ }
+
+ /*
+ * Reserve sufficient swap space for this vm resource. We'll
+ * actually allocate it in the loop below, but reserving it
+ * here allows us to back out more gracefully than if we
+ * had an allocation failure in the body of the loop.
+ *
+ * Note that we don't need swap space for the red zone page.
+ */
+ if (amp != NULL) {
+ ASSERT((flags & KPD_NO_ANON) == 0);
+ /* The reserve has been done and the anon_hdr is separate. */
+ anon_idx = 0;
+ kpd->kp_anon_idx = anon_idx;
+ kpd->kp_anon = amp->ahp;
+
+ TRACE_5(TR_FAC_VM, TR_ANON_SEGKP, "anon segkp:%p %p %lu %u %u",
+ kpd, vbase, len, flags, 1);
+
+ } else if ((flags & KPD_NO_ANON) == 0) {
+ if (anon_resv(SEGKP_MAPLEN(len, flags)) == 0) {
+ if (flags & KPD_LOCKED) {
+ atomic_add_long(&anon_segkp_pages_locked,
+ -pages);
+ page_unresv(pages);
+ }
+ vmem_free(SEGKP_VMEM(seg), vbase, len);
+ kmem_free(kpd, sizeof (struct segkp_data));
+ return (NULL);
+ }
+ anon_idx = ((uintptr_t)(vbase - s_base)) >> PAGESHIFT;
+ kpd->kp_anon_idx = anon_idx;
+ kpd->kp_anon = kpsd->kpsd_anon;
+
+ TRACE_5(TR_FAC_VM, TR_ANON_SEGKP, "anon segkp:%p %p %lu %u %u",
+ kpd, vbase, len, flags, 1);
+ } else {
+ kpd->kp_anon = NULL;
+ kpd->kp_anon_idx = 0;
+ }
+
+ /*
+ * Allocate page and anon resources for the virtual address range
+ * except the redzone
+ */
+ if (segkp_fromheap)
+ segkpindex = btop((uintptr_t)(vbase - kvseg.s_base));
+ for (i = 0, va = vbase; i < np; i++, va += PAGESIZE) {
+ page_t *pl[2];
+ struct vnode *vp;
+ anoff_t off;
+ int err;
+ page_t *pp = NULL;
+
+ /*
+ * Mark this page to be a segkp page in the bitmap.
+ */
+ if (segkp_fromheap) {
+ BT_ATOMIC_SET(segkp_bitmap, segkpindex);
+ segkpindex++;
+ }
+
+ /*
+ * If this page is the red zone page, we don't need swap
+ * space for it. Note that we skip over the code that
+ * establishes MMU mappings, so that the page remains
+ * invalid.
+ */
+ if ((flags & KPD_HASREDZONE) && KPD_REDZONE(kpd) == i)
+ continue;
+
+ if (kpd->kp_anon != NULL) {
+ struct anon *ap;
+
+ ASSERT(anon_get_ptr(kpd->kp_anon, anon_idx + i)
+ == NULL);
+ /*
+ * Determine the "vp" and "off" of the anon slot.
+ */
+ ap = anon_alloc(NULL, 0);
+ if (amp != NULL)
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
+ (void) anon_set_ptr(kpd->kp_anon, anon_idx + i,
+ ap, ANON_SLEEP);
+ if (amp != NULL)
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ swap_xlate(ap, &vp, &off);
+
+ /*
+ * Create a page with the specified identity. The
+ * page is returned with the "shared" lock held.
+ */
+ err = VOP_GETPAGE(vp, (offset_t)off, PAGESIZE,
+ NULL, pl, PAGESIZE, seg, va, S_CREATE,
+ kcred);
+ if (err) {
+ /*
+ * XXX - This should not fail.
+ */
+ panic("segkp_get: no pages");
+ /*NOTREACHED*/
+ }
+ pp = pl[0];
+ } else {
+ ASSERT(page_exists(&kvp,
+ (u_offset_t)(uintptr_t)va) == NULL);
+
+ if ((pp = page_create_va(&kvp,
+ (u_offset_t)(uintptr_t)va, PAGESIZE,
+ (flags & KPD_NOWAIT ? 0 : PG_WAIT) | PG_EXCL |
+ PG_NORELOC, seg, va)) == NULL) {
+ /*
+ * Legitimize resource; then destroy it.
+ * Easier than trying to unwind here.
+ */
+ kpd->kp_flags = flags;
+ kpd->kp_base = vbase;
+ kpd->kp_len = len;
+ segkp_release_internal(seg, kpd, va - vbase);
+ return (NULL);
+ }
+ page_io_unlock(pp);
+ }
+
+ if (flags & KPD_ZERO)
+ pagezero(pp, 0, PAGESIZE);
+
+ /*
+ * Load and lock an MMU translation for the page.
+ */
+ hat_memload(seg->s_as->a_hat, va, pp, (PROT_READ|PROT_WRITE),
+ ((flags & KPD_LOCKED) ? HAT_LOAD_LOCK : HAT_LOAD));
+
+ /*
+ * Now, release lock on the page.
+ */
+ if (flags & KPD_LOCKED)
+ page_downgrade(pp);
+ else
+ page_unlock(pp);
+ }
+
+ kpd->kp_flags = flags;
+ kpd->kp_base = vbase;
+ kpd->kp_len = len;
+ segkp_insert(seg, kpd);
+ *tkpd = kpd;
+ return (stom(kpd->kp_base, flags));
+}
+
+/*
+ * Release the resource to the cache if the pool (designated by the cookie)
+ * has fewer than the maximum allowable entries. If it is inserted into the
+ * cache, segkp_delete() ensures the element is taken off the active list.
+ */
+void
+segkp_release(struct seg *seg, caddr_t vaddr)
+{
+ struct segkp_cache *freelist;
+ struct segkp_data *kpd = NULL;
+
+ if ((kpd = segkp_find(seg, vaddr)) == NULL) {
+ panic("segkp_release: null kpd");
+ /*NOTREACHED*/
+ }
+
+ if (kpd->kp_cookie != -1) {
+ freelist = &segkp_cache[kpd->kp_cookie];
+ mutex_enter(&segkp_lock);
+ if (!segkp_indel && freelist->kpf_count < freelist->kpf_max) {
+ segkp_delete(seg, kpd);
+ kpd->kp_next = freelist->kpf_list;
+ freelist->kpf_list = kpd;
+ freelist->kpf_count++;
+ mutex_exit(&segkp_lock);
+ return;
+ } else {
+ mutex_exit(&segkp_lock);
+ kpd->kp_cookie = -1;
+ }
+ }
+ segkp_release_internal(seg, kpd, kpd->kp_len);
+}
+
+/*
+ * Free the entire resource. segkp_unlock gets called with the start of the
+ * mapped portion of the resource. The length is the size of the mapped
+ * portion.
+ */
+static void
+segkp_release_internal(struct seg *seg, struct segkp_data *kpd, size_t len)
+{
+ caddr_t va;
+ long i;
+ long redzone;
+ size_t np;
+ page_t *pp;
+ struct vnode *vp;
+ anoff_t off;
+ struct anon *ap;
+ pgcnt_t segkpindex;
+
+ ASSERT(kpd != NULL);
+ ASSERT((kpd->kp_flags & KPD_HASAMP) == 0 || kpd->kp_cookie == -1);
+ np = btop(len);
+
+ /* Remove from active hash list */
+ if (kpd->kp_cookie == -1) {
+ mutex_enter(&segkp_lock);
+ segkp_delete(seg, kpd);
+ mutex_exit(&segkp_lock);
+ }
+
+ /*
+ * Precompute redzone page index.
+ */
+ redzone = -1;
+ if (kpd->kp_flags & KPD_HASREDZONE)
+ redzone = KPD_REDZONE(kpd);
+
+
+ va = kpd->kp_base;
+
+ hat_unload(seg->s_as->a_hat, va, (np << PAGESHIFT),
+ ((kpd->kp_flags & KPD_LOCKED) ? HAT_UNLOAD_UNLOCK : HAT_UNLOAD));
+ /*
+ * Free up those anon resources that are quiescent.
+ */
+ if (segkp_fromheap)
+ segkpindex = btop((uintptr_t)(va - kvseg.s_base));
+ for (i = 0; i < np; i++, va += PAGESIZE) {
+
+ /*
+ * Clear the bit for this page from the bitmap.
+ */
+ if (segkp_fromheap) {
+ BT_ATOMIC_CLEAR(segkp_bitmap, segkpindex);
+ segkpindex++;
+ }
+
+ if (i == redzone)
+ continue;
+ if (kpd->kp_anon) {
+ /*
+ * Free up anon resources and destroy the
+ * associated pages.
+ *
+ * Release the lock if there is one. Have to get the
+ * page to do this, unfortunately.
+ */
+ if (kpd->kp_flags & KPD_LOCKED) {
+ ap = anon_get_ptr(kpd->kp_anon,
+ kpd->kp_anon_idx + i);
+ swap_xlate(ap, &vp, &off);
+ /* Find the shared-locked page. */
+ pp = page_find(vp, (u_offset_t)off);
+ if (pp == NULL) {
+ panic("segkp_release: "
+ "kp_anon: no page to unlock ");
+ /*NOTREACHED*/
+ }
+ page_unlock(pp);
+ }
+ if ((kpd->kp_flags & KPD_HASAMP) == 0) {
+ anon_free(kpd->kp_anon, kpd->kp_anon_idx + i,
+ PAGESIZE);
+ anon_unresv(PAGESIZE);
+ }
+ TRACE_5(TR_FAC_VM,
+ TR_ANON_SEGKP, "anon segkp:%p %p %lu %u %u",
+ kpd, va, PAGESIZE, 0, 0);
+ } else {
+ if (kpd->kp_flags & KPD_LOCKED) {
+ pp = page_find(&kvp, (u_offset_t)(uintptr_t)va);
+ if (pp == NULL) {
+ panic("segkp_release: "
+ "no page to unlock");
+ /*NOTREACHED*/
+ }
+ /*
+ * We should just upgrade the lock here
+ * but there is no upgrade that waits.
+ */
+ page_unlock(pp);
+ }
+ pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)va,
+ SE_EXCL);
+ if (pp != NULL)
+ page_destroy(pp, 0);
+ }
+ }
+
+ /* If locked, release physical memory reservation */
+ if (kpd->kp_flags & KPD_LOCKED) {
+ pgcnt_t pages = btop(SEGKP_MAPLEN(kpd->kp_len, kpd->kp_flags));
+ if ((kpd->kp_flags & KPD_NO_ANON) == 0)
+ atomic_add_long(&anon_segkp_pages_locked, -pages);
+ page_unresv(pages);
+ }
+
+ vmem_free(SEGKP_VMEM(seg), kpd->kp_base, kpd->kp_len);
+ kmem_free(kpd, sizeof (struct segkp_data));
+}
+
+/*
+ * segkp_map_red() will check the current frame pointer against the
+ * stack base. If the amount of stack remaining is questionable
+ * (less than red_minavail), then segkp_map_red() will map in the redzone
+ * and return 1. Otherwise, it will return 0. segkp_map_red() can
+ * _only_ be called when:
+ *
+ * - it is safe to sleep on page_create_va().
+ * - the caller is non-swappable.
+ *
+ * It is up to the caller to remember whether segkp_map_red() successfully
+ * mapped the redzone, and, if so, to call segkp_unmap_red() at a later
+ * time. Note that the caller must _remain_ non-swappable until after
+ * calling segkp_unmap_red().
+ *
+ * Currently, this routine is only called from pagefault() (which necessarily
+ * satisfies the above conditions).
+ */
+#if defined(STACK_GROWTH_DOWN)
+int
+segkp_map_red(void)
+{
+ uintptr_t fp = STACK_BIAS + (uintptr_t)getfp();
+#ifndef _LP64
+ caddr_t stkbase;
+#endif
+
+ ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
+
+ /*
+ * Optimize for the common case where we simply return.
+ */
+ if ((curthread->t_red_pp == NULL) &&
+ (fp - (uintptr_t)curthread->t_stkbase >= red_minavail))
+ return (0);
+
+#if defined(_LP64)
+ /*
+ * XXX We probably need something better than this.
+ */
+ panic("kernel stack overflow");
+ /*NOTREACHED*/
+#else /* _LP64 */
+ if (curthread->t_red_pp == NULL) {
+ page_t *red_pp;
+ struct seg kseg;
+
+ caddr_t red_va = (caddr_t)
+ (((uintptr_t)curthread->t_stkbase & (uintptr_t)PAGEMASK) -
+ PAGESIZE);
+
+ ASSERT(page_exists(&kvp, (u_offset_t)(uintptr_t)red_va) ==
+ NULL);
+
+ /*
+		 * Allocate physical memory for the red page.
+ */
+ /*
+ * No PG_NORELOC here to avoid waits. Unlikely to get
+ * a relocate happening in the short time the page exists
+ * and it will be OK anyway.
+ */
+
+ kseg.s_as = &kas;
+ red_pp = page_create_va(&kvp, (u_offset_t)(uintptr_t)red_va,
+ PAGESIZE, PG_WAIT | PG_EXCL, &kseg, red_va);
+ ASSERT(red_pp != NULL);
+
+ /*
+ * So we now have a page to jam into the redzone...
+ */
+ page_io_unlock(red_pp);
+
+ hat_memload(kas.a_hat, red_va, red_pp,
+ (PROT_READ|PROT_WRITE), HAT_LOAD_LOCK);
+ page_downgrade(red_pp);
+
+ /*
+ * The page is left SE_SHARED locked so we can hold on to
+ * the page_t pointer.
+ */
+ curthread->t_red_pp = red_pp;
+
+ atomic_add_32(&red_nmapped, 1);
+ while (fp - (uintptr_t)curthread->t_stkbase < red_closest) {
+ (void) cas32(&red_closest, red_closest,
+ (uint32_t)(fp - (uintptr_t)curthread->t_stkbase));
+ }
+ return (1);
+ }
+
+ stkbase = (caddr_t)(((uintptr_t)curthread->t_stkbase &
+ (uintptr_t)PAGEMASK) - PAGESIZE);
+
+ atomic_add_32(&red_ndoubles, 1);
+
+ if (fp - (uintptr_t)stkbase < RED_DEEP_THRESHOLD) {
+ /*
+ * Oh boy. We're already deep within the mapped-in
+ * redzone page, and the caller is trying to prepare
+ * for a deep stack run. We're running without a
+ * redzone right now: if the caller plows off the
+ * end of the stack, it'll plow another thread or
+ * LWP structure. That situation could result in
+ * a very hard-to-debug panic, so, in the spirit of
+ * recording the name of one's killer in one's own
+ * blood, we're going to record lbolt and the calling
+ * thread.
+ */
+ red_deep_lbolt = lbolt;
+ red_deep_thread = curthread;
+ }
+
+ /*
+ * If this is a DEBUG kernel, and we've run too deep for comfort, toss.
+ */
+ ASSERT(fp - (uintptr_t)stkbase >= RED_DEEP_THRESHOLD);
+ return (0);
+#endif /* _LP64 */
+}
+
+void
+segkp_unmap_red(void)
+{
+ page_t *pp;
+ caddr_t red_va = (caddr_t)(((uintptr_t)curthread->t_stkbase &
+ (uintptr_t)PAGEMASK) - PAGESIZE);
+
+ ASSERT(curthread->t_red_pp != NULL);
+ ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
+
+ /*
+ * Because we locked the mapping down, we can't simply rely
+ * on page_destroy() to clean everything up; we need to call
+ * hat_unload() to explicitly unlock the mapping resources.
+ */
+ hat_unload(kas.a_hat, red_va, PAGESIZE, HAT_UNLOAD_UNLOCK);
+
+ pp = curthread->t_red_pp;
+
+ ASSERT(pp == page_find(&kvp, (u_offset_t)(uintptr_t)red_va));
+
+ /*
+ * Need to upgrade the SE_SHARED lock to SE_EXCL.
+ */
+ if (!page_tryupgrade(pp)) {
+ /*
+		 * As there is no upgrade that waits, release the
+		 * SE_SHARED lock and wait for SE_EXCL.
+ */
+ page_unlock(pp);
+ pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)red_va, SE_EXCL);
+ /* pp may be NULL here, hence the test below */
+ }
+
+ /*
+ * Destroy the page, with dontfree set to zero (i.e. free it).
+ */
+ if (pp != NULL)
+ page_destroy(pp, 0);
+ curthread->t_red_pp = NULL;
+}
+#else
+#error Red stacks only supported with downwards stack growth.
+#endif
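For reference, the contract spelled out in the block comment above reduces to the caller-side pattern sketched here. This is illustrative only (the function name example_fault_path() and its body are hypothetical); the real caller is pagefault().

	static void
	example_fault_path(void)
	{
		int mapped_red;

		/* the caller is assumed to already be non-swappable (TS_DONT_SWAP) */
		mapped_red = segkp_map_red();

		/* ... deep stack work happens here ... */

		/* must stay non-swappable until the redzone is unmapped again */
		if (mapped_red)
			segkp_unmap_red();
	}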
+
+/*
+ * Handle a fault on an address corresponding to one of the
+ * resources in the segkp segment.
+ */
+faultcode_t
+segkp_fault(
+ struct hat *hat,
+ struct seg *seg,
+ caddr_t vaddr,
+ size_t len,
+ enum fault_type type,
+ enum seg_rw rw)
+{
+ struct segkp_data *kpd = NULL;
+ int err;
+
+ ASSERT(seg->s_as == &kas && RW_READ_HELD(&seg->s_as->a_lock));
+
+ /*
+ * Sanity checks.
+ */
+ if (type == F_PROT) {
+ panic("segkp_fault: unexpected F_PROT fault");
+ /*NOTREACHED*/
+ }
+
+ if ((kpd = segkp_find(seg, vaddr)) == NULL)
+ return (FC_NOMAP);
+
+ mutex_enter(&kpd->kp_lock);
+
+ if (type == F_SOFTLOCK) {
+ ASSERT(!(kpd->kp_flags & KPD_LOCKED));
+ /*
+ * The F_SOFTLOCK case has more stringent
+ * range requirements: the given range must exactly coincide
+ * with the resource's mapped portion. Note reference to
+		 * with the resource's mapped portion. Note that a reference
+		 * to the redzone is handled, since vaddr would not equal base.
+ if (vaddr != stom(kpd->kp_base, kpd->kp_flags) ||
+ len != SEGKP_MAPLEN(kpd->kp_len, kpd->kp_flags)) {
+ mutex_exit(&kpd->kp_lock);
+ return (FC_MAKE_ERR(EFAULT));
+ }
+
+ if ((err = segkp_load(hat, seg, vaddr, len, kpd, KPD_LOCKED))) {
+ mutex_exit(&kpd->kp_lock);
+ return (FC_MAKE_ERR(err));
+ }
+ kpd->kp_flags |= KPD_LOCKED;
+ mutex_exit(&kpd->kp_lock);
+ return (0);
+ }
+
+ if (type == F_INVAL) {
+ ASSERT(!(kpd->kp_flags & KPD_NO_ANON));
+
+ /*
+ * Check if we touched the redzone. Somewhat optimistic
+ * here if we are touching the redzone of our own stack
+ * since we wouldn't have a stack to get this far...
+ */
+ if ((kpd->kp_flags & KPD_HASREDZONE) &&
+ btop((uintptr_t)(vaddr - kpd->kp_base)) == KPD_REDZONE(kpd))
+ panic("segkp_fault: accessing redzone");
+
+ /*
+ * This fault may occur while the page is being F_SOFTLOCK'ed.
+ * Return since a 2nd segkp_load is unnecessary and also would
+ * result in the page being locked twice and eventually
+ * hang the thread_reaper thread.
+ */
+ if (kpd->kp_flags & KPD_LOCKED) {
+ mutex_exit(&kpd->kp_lock);
+ return (0);
+ }
+
+ err = segkp_load(hat, seg, vaddr, len, kpd, kpd->kp_flags);
+ mutex_exit(&kpd->kp_lock);
+ return (err ? FC_MAKE_ERR(err) : 0);
+ }
+
+ if (type == F_SOFTUNLOCK) {
+ uint_t flags;
+
+ /*
+ * Make sure the addr is LOCKED and it has anon backing
+ * before unlocking
+ */
+ if ((kpd->kp_flags & (KPD_LOCKED|KPD_NO_ANON)) == KPD_NO_ANON) {
+ panic("segkp_fault: bad unlock");
+ /*NOTREACHED*/
+ }
+
+ if (vaddr != stom(kpd->kp_base, kpd->kp_flags) ||
+ len != SEGKP_MAPLEN(kpd->kp_len, kpd->kp_flags)) {
+ panic("segkp_fault: bad range");
+ /*NOTREACHED*/
+ }
+
+ if (rw == S_WRITE)
+ flags = kpd->kp_flags | KPD_WRITEDIRTY;
+ else
+ flags = kpd->kp_flags;
+ err = segkp_unlock(hat, seg, vaddr, len, kpd, flags);
+ kpd->kp_flags &= ~KPD_LOCKED;
+ mutex_exit(&kpd->kp_lock);
+ return (err ? FC_MAKE_ERR(err) : 0);
+ }
+ mutex_exit(&kpd->kp_lock);
+ panic("segkp_fault: bogus fault type: %d\n", type);
+ /*NOTREACHED*/
+}
+
+/*
+ * Check that the given protections suffice over the range specified by
+ * vaddr and len. For this segment type, the only issue is whether or
+ * not the range lies completely within the mapped part of an allocated
+ * resource.
+ */
+/* ARGSUSED */
+static int
+segkp_checkprot(struct seg *seg, caddr_t vaddr, size_t len, uint_t prot)
+{
+ struct segkp_data *kpd = NULL;
+ caddr_t mbase;
+ size_t mlen;
+
+ if ((kpd = segkp_find(seg, vaddr)) == NULL)
+ return (EACCES);
+
+ mutex_enter(&kpd->kp_lock);
+ mbase = stom(kpd->kp_base, kpd->kp_flags);
+ mlen = SEGKP_MAPLEN(kpd->kp_len, kpd->kp_flags);
+ if (len > mlen || vaddr < mbase ||
+ ((vaddr + len) > (mbase + mlen))) {
+ mutex_exit(&kpd->kp_lock);
+ return (EACCES);
+ }
+ mutex_exit(&kpd->kp_lock);
+ return (0);
+}
+
+
+/*
+ * Check to see if it makes sense to do kluster/read ahead to
+ * addr + delta relative to the mapping at addr. We assume here
+ * that delta is a signed PAGESIZE'd multiple (which can be negative).
+ *
+ * For seg_u we always "approve" of this action from our standpoint.
+ */
+/*ARGSUSED*/
+static int
+segkp_kluster(struct seg *seg, caddr_t addr, ssize_t delta)
+{
+ return (0);
+}
+
+/*
+ * Load and possibly lock intra-slot resources in the range given by
+ * vaddr and len.
+ */
+static int
+segkp_load(
+ struct hat *hat,
+ struct seg *seg,
+ caddr_t vaddr,
+ size_t len,
+ struct segkp_data *kpd,
+ uint_t flags)
+{
+ caddr_t va;
+ caddr_t vlim;
+ ulong_t i;
+ uint_t lock;
+
+ ASSERT(MUTEX_HELD(&kpd->kp_lock));
+
+ len = P2ROUNDUP(len, PAGESIZE);
+
+ /* If locking, reserve physical memory */
+ if (flags & KPD_LOCKED) {
+ pgcnt_t pages = btop(len);
+ if ((kpd->kp_flags & KPD_NO_ANON) == 0)
+ atomic_add_long(&anon_segkp_pages_locked, pages);
+ (void) page_resv(pages, KM_SLEEP);
+ }
+
+ /*
+ * Loop through the pages in the given range.
+ */
+ va = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
+ vaddr = va;
+ vlim = va + len;
+ lock = flags & KPD_LOCKED;
+ i = ((uintptr_t)(va - kpd->kp_base)) >> PAGESHIFT;
+ for (; va < vlim; va += PAGESIZE, i++) {
+ page_t *pl[2]; /* second element NULL terminator */
+ struct vnode *vp;
+ anoff_t off;
+ int err;
+ struct anon *ap;
+
+ /*
+ * Summon the page. If it's not resident, arrange
+ * for synchronous i/o to pull it in.
+ */
+ ap = anon_get_ptr(kpd->kp_anon, kpd->kp_anon_idx + i);
+ swap_xlate(ap, &vp, &off);
+
+ /*
+ * The returned page list will have exactly one entry,
+ * which is returned to us already kept.
+ */
+ err = VOP_GETPAGE(vp, (offset_t)off, PAGESIZE, NULL,
+ pl, PAGESIZE, seg, va, S_READ, kcred);
+
+ if (err) {
+ /*
+ * Back out of what we've done so far.
+ */
+ (void) segkp_unlock(hat, seg, vaddr,
+ (va - vaddr), kpd, flags);
+ return (err);
+ }
+
+ /*
+ * Load an MMU translation for the page.
+ */
+ hat_memload(hat, va, pl[0], (PROT_READ|PROT_WRITE),
+ lock ? HAT_LOAD_LOCK : HAT_LOAD);
+
+ if (!lock) {
+ /*
+ * Now, release "shared" lock on the page.
+ */
+ page_unlock(pl[0]);
+ }
+ }
+ return (0);
+}
+
+/*
+ * At the very least, unload the mmu translations and unlock the range if
+ * it is locked. Can be called with the flag value KPD_WRITEDIRTY, which
+ * specifies that any dirty pages should be written to disk.
+ */
+static int
+segkp_unlock(
+ struct hat *hat,
+ struct seg *seg,
+ caddr_t vaddr,
+ size_t len,
+ struct segkp_data *kpd,
+ uint_t flags)
+{
+ caddr_t va;
+ caddr_t vlim;
+ ulong_t i;
+ struct page *pp;
+ struct vnode *vp;
+ anoff_t off;
+ struct anon *ap;
+
+#ifdef lint
+ seg = seg;
+#endif /* lint */
+
+ ASSERT(MUTEX_HELD(&kpd->kp_lock));
+
+ /*
+	 * Loop through the pages in the given range. It is assumed
+	 * that segkp_unlock is called with a page-aligned base.
+ */
+ va = vaddr;
+ vlim = va + len;
+ i = ((uintptr_t)(va - kpd->kp_base)) >> PAGESHIFT;
+ hat_unload(hat, va, len,
+ ((flags & KPD_LOCKED) ? HAT_UNLOAD_UNLOCK : HAT_UNLOAD));
+ for (; va < vlim; va += PAGESIZE, i++) {
+ /*
+ * Find the page associated with this part of the
+ * slot, tracking it down through its associated swap
+ * space.
+ */
+ ap = anon_get_ptr(kpd->kp_anon, kpd->kp_anon_idx + i);
+ swap_xlate(ap, &vp, &off);
+
+ if (flags & KPD_LOCKED) {
+			if ((pp = page_find(vp, off)) == NULL) {
+				panic("segkp_softunlock: missing page");
+				/*NOTREACHED*/
+			}
+ } else {
+ /*
+ * Nothing to do if the slot is not locked and the
+ * page doesn't exist.
+ */
+ if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL)
+ continue;
+ }
+
+ /*
+ * If the page doesn't have any translations, is
+ * dirty and not being shared, then push it out
+ * asynchronously and avoid waiting for the
+ * pageout daemon to do it for us.
+ *
+ * XXX - Do we really need to get the "exclusive"
+ * lock via an upgrade?
+ */
+ if ((flags & KPD_WRITEDIRTY) && !hat_page_is_mapped(pp) &&
+ hat_ismod(pp) && page_tryupgrade(pp)) {
+ /*
+ * Hold the vnode before releasing the page lock to
+ * prevent it from being freed and re-used by some
+ * other thread.
+ */
+ VN_HOLD(vp);
+ page_unlock(pp);
+
+ /*
+ * Want most powerful credentials we can get so
+ * use kcred.
+ */
+ (void) VOP_PUTPAGE(vp, (offset_t)off, PAGESIZE,
+ B_ASYNC | B_FREE, kcred);
+ VN_RELE(vp);
+ } else {
+ page_unlock(pp);
+ }
+ }
+
+ /* If unlocking, release physical memory */
+ if (flags & KPD_LOCKED) {
+ pgcnt_t pages = btopr(len);
+ if ((kpd->kp_flags & KPD_NO_ANON) == 0)
+ atomic_add_long(&anon_segkp_pages_locked, -pages);
+ page_unresv(pages);
+ }
+ return (0);
+}
+
+/*
+ * Insert the kpd in the hash table.
+ */
+static void
+segkp_insert(struct seg *seg, struct segkp_data *kpd)
+{
+ struct segkp_segdata *kpsd = (struct segkp_segdata *)seg->s_data;
+ int index;
+
+ /*
+ * Insert the kpd based on the address that will be returned
+ * via segkp_release.
+ */
+ index = SEGKP_HASH(stom(kpd->kp_base, kpd->kp_flags));
+ mutex_enter(&segkp_lock);
+ kpd->kp_next = kpsd->kpsd_hash[index];
+ kpsd->kpsd_hash[index] = kpd;
+ mutex_exit(&segkp_lock);
+}
+
+/*
+ * Remove kpd from the hash table.
+ */
+static void
+segkp_delete(struct seg *seg, struct segkp_data *kpd)
+{
+ struct segkp_segdata *kpsd = (struct segkp_segdata *)seg->s_data;
+ struct segkp_data **kpp;
+ int index;
+
+ ASSERT(MUTEX_HELD(&segkp_lock));
+
+ index = SEGKP_HASH(stom(kpd->kp_base, kpd->kp_flags));
+ for (kpp = &kpsd->kpsd_hash[index];
+ *kpp != NULL; kpp = &((*kpp)->kp_next)) {
+ if (*kpp == kpd) {
+ *kpp = kpd->kp_next;
+ return;
+ }
+ }
+ panic("segkp_delete: unable to find element to delete");
+ /*NOTREACHED*/
+}
+
+/*
+ * Find the kpd associated with a vaddr.
+ *
+ * Most of the callers of segkp_find will pass the vaddr that
+ * hashes to the desired index, but there are cases where
+ * this is not true in which case we have to (potentially) scan
+ * the whole table looking for it. This should be very rare
+ * (e.g. a segkp_fault(F_INVAL) on an address somewhere in the
+ * middle of the segkp_data region).
+ */
+static struct segkp_data *
+segkp_find(struct seg *seg, caddr_t vaddr)
+{
+ struct segkp_segdata *kpsd = (struct segkp_segdata *)seg->s_data;
+ struct segkp_data *kpd;
+ int i;
+ int stop;
+
+ i = stop = SEGKP_HASH(vaddr);
+ mutex_enter(&segkp_lock);
+ do {
+ for (kpd = kpsd->kpsd_hash[i]; kpd != NULL;
+ kpd = kpd->kp_next) {
+ if (vaddr >= kpd->kp_base &&
+ vaddr < kpd->kp_base + kpd->kp_len) {
+ mutex_exit(&segkp_lock);
+ return (kpd);
+ }
+ }
+ if (--i < 0)
+ i = SEGKP_HASHSZ - 1; /* Wrap */
+ } while (i != stop);
+ mutex_exit(&segkp_lock);
+ return (NULL); /* Not found */
+}
+
+/*
+ * returns size of swappable area.
+ */
+size_t
+swapsize(caddr_t v)
+{
+ struct segkp_data *kpd;
+
+ if ((kpd = segkp_find(segkp, v)) != NULL)
+ return (SEGKP_MAPLEN(kpd->kp_len, kpd->kp_flags));
+ else
+		return (0);
+}
+
+/*
+ * Dump out all the active segkp pages
+ */
+static void
+segkp_dump(struct seg *seg)
+{
+ int i;
+ struct segkp_data *kpd;
+ struct segkp_segdata *kpsd = (struct segkp_segdata *)seg->s_data;
+
+ for (i = 0; i < SEGKP_HASHSZ; i++) {
+ for (kpd = kpsd->kpsd_hash[i];
+ kpd != NULL; kpd = kpd->kp_next) {
+ pfn_t pfn;
+ caddr_t addr;
+ caddr_t eaddr;
+
+ addr = kpd->kp_base;
+ eaddr = addr + kpd->kp_len;
+ while (addr < eaddr) {
+ ASSERT(seg->s_as == &kas);
+ pfn = hat_getpfnum(seg->s_as->a_hat, addr);
+ if (pfn != PFN_INVALID)
+ dump_addpage(seg->s_as, addr, pfn);
+ addr += PAGESIZE;
+ dump_timeleft = dump_timeout;
+ }
+ }
+ }
+}
+
+/*ARGSUSED*/
+static int
+segkp_pagelock(struct seg *seg, caddr_t addr, size_t len,
+ struct page ***ppp, enum lock_type type, enum seg_rw rw)
+{
+ return (ENOTSUP);
+}
+
+/*ARGSUSED*/
+static int
+segkp_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
+{
+ return (ENODEV);
+}
+
+/*ARGSUSED*/
+static lgrp_mem_policy_info_t *
+segkp_getpolicy(struct seg *seg, caddr_t addr)
+{
+ return (NULL);
+}
+
+#include <sys/mem_config.h>
+
+/*ARGSUSED*/
+static void
+segkp_mem_config_post_add(void *arg, pgcnt_t delta_pages)
+{}
+
+/*
+ * During memory delete, turn off caches so that pages are not held.
+ * A better solution may be to unlock the pages while they are
+ * in the cache so that they may be collected naturally.
+ */
+
+/*ARGSUSED*/
+static int
+segkp_mem_config_pre_del(void *arg, pgcnt_t delta_pages)
+{
+ atomic_add_32(&segkp_indel, 1);
+ segkp_cache_free();
+ return (0);
+}
+
+/*ARGSUSED*/
+static void
+segkp_mem_config_post_del(void *arg, pgcnt_t delta_pages, int cancelled)
+{
+ atomic_add_32(&segkp_indel, -1);
+}
+
+static kphysm_setup_vector_t segkp_mem_config_vec = {
+ KPHYSM_SETUP_VECTOR_VERSION,
+ segkp_mem_config_post_add,
+ segkp_mem_config_pre_del,
+ segkp_mem_config_post_del,
+};
+
+static void
+segkpinit_mem_config(struct seg *seg)
+{
+ int ret;
+
+ ret = kphysm_setup_func_register(&segkp_mem_config_vec, (void *)seg);
+ ASSERT(ret == 0);
+}
diff --git a/usr/src/uts/common/vm/seg_kp.h b/usr/src/uts/common/vm/seg_kp.h
new file mode 100644
index 0000000000..64fa883cc9
--- /dev/null
+++ b/usr/src/uts/common/vm/seg_kp.h
@@ -0,0 +1,165 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _VM_SEG_KP_H
+#define _VM_SEG_KP_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * segkp (as in kernel pageable) is a segment driver that supports allocation
+ * of page-aligned, variable-sized vm resources.
+ *
+ * Each vm resource represents a page-aligned range of virtual addresses.
+ * The caller may specify whether the resource should include a redzone,
+ * be locked down, or be zero initialized.
+ */
+
+#include <vm/seg.h>
+#include <sys/vmem.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+/*
+ * Private information per overall segkp segment (as opposed
+ * to per resource within segment). There are as many anon slots
+ * allocated as there are pages in the segment.
+ */
+struct segkp_segdata {
+ struct anon_hdr *kpsd_anon; /* anon structs */
+ vmem_t *kpsd_arena; /* virtual memory descriptor */
+ struct segkp_data **kpsd_hash; /* Hash table for lookups */
+};
+
+#define SEGKP_VMEM(seg) (((struct segkp_segdata *)(seg)->s_data)->kpsd_arena)
+
+/*
+ * A hash table is used to aid in the lookup of kpds based on vaddr.
+ * Since the heaviest use of segkp occurs from segkp_*get and segkp_*release,
+ * the hashing is based on the vaddr used by these routines.
+ */
+#define SEGKP_HASHSZ 256 /* power of two */
+#define SEGKP_HASHMASK (SEGKP_HASHSZ - 1)
+#define SEGKP_HASH(vaddr) \
+ ((int)(((uintptr_t)vaddr >> PAGESHIFT) & SEGKP_HASHMASK))
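To make the hashing concrete: a resource is bucketed by the page frame number of its lookup address, masked to the table size. A small worked example, assuming 8K pages (PAGESHIFT == 13, which is platform dependent):

	SEGKP_HASH((caddr_t)0x30002000) == ((0x30002000 >> 13) & 0xff) == 0x01
	SEGKP_HASH((caddr_t)0x30004000) == 0x02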
+
+struct segkp_data {
+ kmutex_t kp_lock; /* per resource lock */
+ caddr_t kp_base; /* starting addr of chunk */
+ size_t kp_len; /* # of bytes */
+ uint_t kp_flags; /* state info */
+ int kp_cookie; /* index into cache array */
+ ulong_t kp_anon_idx; /* index into main anon array */
+ /* in segkp_segdata */
+ struct anon_hdr *kp_anon; /* anon structs */
+ struct segkp_data *kp_next; /* ptr to next in hash chain */
+};
+
+/*
+ * Flag bits
+ *
+ */
+#define KPD_ZERO 0x01 /* initialize resource with 0 */
+#define KPD_LOCKED 0x02 /* resources locked */
+#define KPD_NO_ANON 0x04 /* no swap resources required */
+#define KPD_HASREDZONE 0x08 /* include a redzone */
+#define KPD_NOWAIT 0x10 /* do not wait for res. if unavail. */
+#define KPD_WRITEDIRTY 0x20 /* dirty pages should be flushed */
+#define KPD_HASAMP 0x40 /* anon_hdr managed by caller */
+
+/*
+ * A cache of segkp elements may be created via segkp_cache_init().
+ * The elements on the freelist all have the same len and flags value.
+ * The cookie passed to the client is an index into the freelist array.
+ */
+struct segkp_cache {
+ int kpf_max; /* max # of elements allowed */
+ int kpf_count; /* current no. of elments */
+	int	kpf_count;	/* current no. of elements */
+ uint_t kpf_flags; /* seg_kp flag value */
+ size_t kpf_len; /* len of resource */
+ struct seg *kpf_seg; /* segment */
+ struct segkp_data *kpf_list; /* list of kpd's */
+};
+#define SEGKP_MAX_CACHE 4 /* Number of caches maintained */
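A rough usage sketch of the cache interface declared later in this header, assuming cached slots are handed back through segkp_release(); the maximum count, length, and flags below are illustrative only and do not reflect any particular client:

	void	*cookie;
	caddr_t	va;

	/* create a cache of 32K slots with a redzone (illustrative parameters) */
	cookie = segkp_cache_init(segkp, 16, 32 * 1024, KPD_HASREDZONE);

	/* subsequent allocations of that shape can come from the cache */
	va = segkp_cache_get(cookie);

	/* kp_cookie records which cache, if any, a slot belongs to */
	segkp_release(segkp, va);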
+
+/*
+ * Define redzone, and stack_to_memory macros.
+ * The redzone is PAGESIZE bytes.
+ */
+#ifdef STACK_GROWTH_DOWN
+#define KPD_REDZONE(kpd) (0)
+#define stom(v, flags) (((flags) & KPD_HASREDZONE) ? (v) + PAGESIZE : (v))
+
+#else /* STACK_GROWTH_DOWN */
+
+#define KPD_REDZONE(kpd) (btop(kpd->kp_len) - 1)
+#define	stom(v, flags)	(v)
+#endif /* STACK_GROWTH_DOWN */
+
+#define SEGKP_MAPLEN(len, flags) \
+ (((flags) & KPD_HASREDZONE) ? (len) - PAGESIZE : (len))
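A worked example of these macros, assuming STACK_GROWTH_DOWN and 8K pages (both platform dependent): for a 32K slot allocated with KPD_HASREDZONE,

	KPD_REDZONE(kpd)            == 0		(page 0 of the slot is the redzone)
	stom(kp_base, kp_flags)     == kp_base + PAGESIZE
	SEGKP_MAPLEN(32K, kp_flags) == 24K		(three of the four pages are mapped)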
+
+extern struct seg *segkp;
+/* If segkp becomes more than one seg this test will need changing. */
+#define SEG_IS_SEGKP(SEG) ((SEG) == segkp)
+
+/*
+ * Public routine declarations not part of the segment ops vector go here.
+ */
+int segkp_create(struct seg *seg);
+caddr_t segkp_get(struct seg *seg, size_t len, uint_t flags);
+void segkp_release(struct seg *seg, caddr_t vaddr);
+void * segkp_cache_init(struct seg *seg, int maxsize, size_t len,
+ uint_t flags);
+void	segkp_cache_free(void);
+caddr_t segkp_cache_get(void *cookie);
+int segkp_map_red(void);
+void segkp_unmap_red(void);
+size_t swapsize(caddr_t v);
+
+/* Special currently only used by schedctl. */
+struct anon_map; /* Make the compiler happy about the next line. */
+caddr_t segkp_get_withanonmap(struct seg *, size_t, uint_t, struct anon_map *);
+
+/*
+ * We allow explicit calls to segkp_fault, even though it's part
+ * of the segkp ops vector.
+ */
+faultcode_t segkp_fault(struct hat *hat, struct seg *seg, caddr_t addr,
+ size_t len, enum fault_type type, enum seg_rw rw);
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _VM_SEG_KP_H */
diff --git a/usr/src/uts/common/vm/seg_kpm.c b/usr/src/uts/common/vm/seg_kpm.c
new file mode 100644
index 0000000000..73b7dbe94c
--- /dev/null
+++ b/usr/src/uts/common/vm/seg_kpm.c
@@ -0,0 +1,323 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * Kernel Physical Mapping (kpm) segment driver (segkpm).
+ *
+ * Along with the hat_kpm* interfaces, this driver provides an alternative
+ * mechanism for kernel mappings within the 64-bit Solaris operating system,
+ * which allows the mapping of all physical memory into the kernel address
+ * space at once. This is feasible in 64-bit kernels, e.g. for UltraSPARC II
+ * and later processors, since the available VA range is much larger than
+ * possible physical memory. Currently all physical memory is supported;
+ * it is represented by the list of memory segments (memsegs).
+ *
+ * Segkpm mappings also have very low overhead, and large pages are used
+ * (when possible) to minimize the TLB and TSB footprint. The scheme is also
+ * extendable to architectures other than SPARC (e.g. AMD64). Its main
+ * advantage is the avoidance of the TLB-shootdown X-calls, which are
+ * normally needed when a kernel (global) mapping has to be removed.
+ *
+ * The first example of a kernel facility that uses the segkpm mapping scheme
+ * is seg_map, where it is used as an alternative to hat_memload().
+ * See also the hat layer for more information about the hat_kpm* routines.
+ * The kpm facility can be turned off at boot time (e.g. via /etc/system).
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/vnode.h>
+#include <sys/cmn_err.h>
+#include <sys/debug.h>
+#include <sys/thread.h>
+#include <sys/cpuvar.h>
+#include <sys/bitmap.h>
+#include <sys/atomic.h>
+
+#include <vm/seg_kmem.h>
+#include <vm/seg_kpm.h>
+#include <vm/hat.h>
+#include <vm/as.h>
+#include <vm/seg.h>
+#include <vm/page.h>
+
+/*
+ * Global kpm controls.
+ * See also platform and mmu specific controls.
+ *
+ * kpm_enable -- global on/off switch for segkpm.
+ * . Set by default on 64bit platforms that have kpm support.
+ * . Will be disabled from platform layer if not supported.
+ * . Can be disabled via /etc/system.
+ *
+ * kpm_smallpages -- use only regular/system pagesize for kpm mappings.
+ * . Can be useful for critical debugging of kpm clients.
+ * . Set to zero by default for platforms that support kpm large pages.
+ * The use of kpm large pages reduces the footprint of kpm meta data
+ * and has all the other advantages of using large pages (e.g TLB
+ * miss reduction).
+ * . Set by default for platforms that don't support kpm large pages or
+ * where large pages cannot be used for other reasons (e.g. there are
+ * only few full associative TLB entries available for large pages).
+ *
+ * segmap_kpm -- separate on/off switch for segmap using segkpm:
+ * . Set by default.
+ * . Will be disabled when kpm_enable is zero.
+ * . Will be disabled when MAXBSIZE != PAGESIZE.
+ * . Can be disabled via /etc/system.
+ *
+ */
+int kpm_enable = 1;
+int kpm_smallpages = 0;
+int segmap_kpm = 1;
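As the comment above notes, these switches can be overridden at boot time. For illustration, /etc/system entries of the following form would turn the facility off (shown as an example only; whether this is advisable depends on the platform):

	set kpm_enable=0
	set segmap_kpm=0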
+
+/*
+ * Private seg op routines.
+ */
+faultcode_t segkpm_fault(struct hat *hat, struct seg *seg, caddr_t addr,
+ size_t len, enum fault_type type, enum seg_rw rw);
+static void segkpm_dump(struct seg *);
+static void segkpm_badop(void);
+static int segkpm_notsup(void);
+
+#define SEGKPM_BADOP(t) (t(*)())segkpm_badop
+#define SEGKPM_NOTSUP (int(*)())segkpm_notsup
+
+static struct seg_ops segkpm_ops = {
+ SEGKPM_BADOP(int), /* dup */
+ SEGKPM_BADOP(int), /* unmap */
+ SEGKPM_BADOP(void), /* free */
+ segkpm_fault,
+ SEGKPM_BADOP(int), /* faulta */
+ SEGKPM_BADOP(int), /* setprot */
+ SEGKPM_BADOP(int), /* checkprot */
+ SEGKPM_BADOP(int), /* kluster */
+ SEGKPM_BADOP(size_t), /* swapout */
+ SEGKPM_BADOP(int), /* sync */
+ SEGKPM_BADOP(size_t), /* incore */
+ SEGKPM_BADOP(int), /* lockop */
+ SEGKPM_BADOP(int), /* getprot */
+ SEGKPM_BADOP(u_offset_t), /* getoffset */
+ SEGKPM_BADOP(int), /* gettype */
+ SEGKPM_BADOP(int), /* getvp */
+ SEGKPM_BADOP(int), /* advise */
+ segkpm_dump, /* dump */
+ SEGKPM_NOTSUP, /* pagelock */
+ SEGKPM_BADOP(int), /* setpgsz */
+ SEGKPM_BADOP(int), /* getmemid */
+};
+
+/*
+ * kpm_pgsz and kpm_pgshft are set by platform layer.
+ */
+size_t kpm_pgsz; /* kpm page size */
+uint_t kpm_pgshft; /* kpm page shift */
+u_offset_t kpm_pgoff; /* kpm page offset mask */
+uint_t kpmp2pshft; /* kpm page to page shift */
+pgcnt_t kpmpnpgs; /* how many pages per kpm page */
+
+
+#ifdef SEGKPM_SUPPORT
+
+int
+segkpm_create(struct seg *seg, void *argsp)
+{
+ struct segkpm_data *skd;
+ struct segkpm_crargs *b = (struct segkpm_crargs *)argsp;
+ ushort_t *p;
+ int i, j;
+
+ ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock));
+ ASSERT(btokpmp(seg->s_size) >= 1 &&
+ kpmpageoff((uintptr_t)seg->s_base) == 0 &&
+ kpmpageoff((uintptr_t)seg->s_base + seg->s_size) == 0);
+
+ skd = kmem_zalloc(sizeof (struct segkpm_data), KM_SLEEP);
+
+ seg->s_data = (void *)skd;
+ seg->s_ops = &segkpm_ops;
+ skd->skd_prot = b->prot;
+
+ /*
+	 * (1) Segkpm virtual addresses are based on physical addresses.
+	 *     Because of this, in contrast to other segment drivers, it is
+ * often required to allocate a page first to be able to
+ * calculate the final segkpm virtual address.
+ * (2) Page allocation is done by calling page_create_va(),
+ * one important input argument is a virtual address (also
+ * expressed by the "va" in the function name). This function
+ * is highly optimized to select the right page for an optimal
+ * processor and platform support (e.g. virtual addressed
+ * caches (VAC), physical addressed caches, NUMA).
+ *
+	 * Because of (1), the approach is to generate a fake virtual
+	 * address for calling page_create_va(). In order to exploit
+	 * the abilities of (2), especially to utilize the cache
+	 * hierarchy and to avoid VAC alias conflicts, the selection
+	 * has to be done carefully: a separate counter is provided
+	 * for each virtual color. The count values are used to cycle
+	 * through all cache lines and correspond to the cache bins.
+ */
+ skd->skd_nvcolors = b->nvcolors;
+
+ p = skd->skd_va_select =
+ kmem_zalloc(NCPU * b->nvcolors * sizeof (ushort_t), KM_SLEEP);
+
+ for (i = 0; i < NCPU; i++)
+ for (j = 0; j < b->nvcolors; j++, p++)
+ *p = j;
+
+ return (0);
+}
+
+/*
+ * This routine is called via a machine specific fault handling
+ * routine.
+ */
+/* ARGSUSED */
+faultcode_t
+segkpm_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len,
+ enum fault_type type, enum seg_rw rw)
+{
+ faultcode_t error;
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ error = hat_kpm_fault(hat, addr);
+
+ return (error);
+}
+
+#define addr_to_vcolor(addr, vcolors) \
+ ((int)(((uintptr_t)(addr) & ((vcolors << PAGESHIFT) - 1)) >> PAGESHIFT))
+
+/*
+ * Create a virtual address that can be used for invocations of
+ * page_create_va. Goal is to utilize the cache hierarchy (round
+ * robin bins) and to select the right color for virtual indexed
+ * caches. It isn't exact since we also increment the bin counter
+ * when the caller uses VOP_GETPAGE and gets a hit in the page
+ * cache, but we keep the bins turning for cache distribution
+ * (see also segkpm_create block comment).
+ */
+caddr_t
+segkpm_create_va(u_offset_t off)
+{
+ int vcolor;
+ ushort_t *p;
+ struct segkpm_data *skd = (struct segkpm_data *)segkpm->s_data;
+ int nvcolors = skd->skd_nvcolors;
+ caddr_t va;
+
+ vcolor = (nvcolors > 1) ? addr_to_vcolor(off, nvcolors) : 0;
+ p = &skd->skd_va_select[(CPU->cpu_id * nvcolors) + vcolor];
+ va = (caddr_t)ptob(*p);
+
+ atomic_add_16(p, nvcolors);
+
+ return (va);
+}
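A small worked example of the color math used above, assuming 8K pages (PAGESHIFT == 13) and nvcolors == 2; both values are platform dependent and shown only for illustration:

	addr_to_vcolor(0x0000, 2) == 0
	addr_to_vcolor(0x2000, 2) == 1	(bit 13 of the offset selects the color)
	addr_to_vcolor(0x4000, 2) == 0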
+
+/*
+ * Unload mapping if the instance has an active kpm mapping.
+ */
+void
+segkpm_mapout_validkpme(struct kpme *kpme)
+{
+ caddr_t vaddr;
+ page_t *pp;
+
+retry:
+ if ((pp = kpme->kpe_page) == NULL) {
+ return;
+ }
+
+ if (page_lock(pp, SE_SHARED, (kmutex_t *)NULL, P_RECLAIM) == 0)
+ goto retry;
+
+ /*
+ * Check if segkpm mapping is not unloaded in the meantime
+ */
+ if (kpme->kpe_page == NULL) {
+ page_unlock(pp);
+ return;
+ }
+
+ vaddr = hat_kpm_page2va(pp, 1);
+ hat_kpm_mapout(pp, kpme, vaddr);
+ page_unlock(pp);
+}
+
+static void
+segkpm_badop()
+{
+ panic("segkpm_badop");
+}
+
+#else /* SEGKPM_SUPPORT */
+
+/* segkpm stubs */
+
+/*ARGSUSED*/
+int segkpm_create(struct seg *seg, void *argsp) { return (0); }
+
+/* ARGSUSED */
+faultcode_t
+segkpm_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len,
+ enum fault_type type, enum seg_rw rw)
+{
+ return ((faultcode_t)0);
+}
+
+/* ARGSUSED */
+caddr_t segkpm_create_va(u_offset_t off) { return (NULL); }
+
+/* ARGSUSED */
+void segkpm_mapout_validkpme(struct kpme *kpme) {}
+
+static void
+segkpm_badop() {}
+
+#endif /* SEGKPM_SUPPORT */
+
+static int
+segkpm_notsup()
+{
+ return (ENOTSUP);
+}
+
+/*
+ * segkpm pages are not dumped, so we just return
+ */
+/*ARGSUSED*/
+static void
+segkpm_dump(struct seg *seg)
+{}
diff --git a/usr/src/uts/common/vm/seg_kpm.h b/usr/src/uts/common/vm/seg_kpm.h
new file mode 100644
index 0000000000..0b766bbaf4
--- /dev/null
+++ b/usr/src/uts/common/vm/seg_kpm.h
@@ -0,0 +1,118 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _VM_SEG_KPM_H
+#define _VM_SEG_KPM_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Kernel Physical Mapping (segkpm) segment driver.
+ */
+
+#include <vm/kpm.h>
+
+struct segkpm_data {
+ ushort_t *skd_va_select; /* page_create_va kpm vaddr bin count */
+ short skd_nvcolors; /* VAC colors to deal with */
+ uchar_t skd_prot;
+};
+
+/*
+ * segkpm create needs some platform knowledge
+ */
+struct segkpm_crargs {
+ uint_t prot;
+ short nvcolors; /* VAC # virtual colors, 0 for PAC. */
+};
+
+extern struct seg *segkpm;
+extern u_offset_t kpm_pgoff;
+extern size_t kpm_pgsz;
+extern uint_t kpm_pgshft;
+extern uint_t kpmp2pshft;
+extern pgcnt_t kpmpnpgs;
+
+/* kpm controls */
+extern int kpm_enable;
+extern int kpm_smallpages;
+extern int segmap_kpm;
+
+/*
+ * kpm_page_t macros:
+ * . bytes (b) to kpm pages (kpmp)
+ * . pages (p) to kpm pages (kpmp), and back (with and without roundup)
+ * . kpm page offset in bytes
+ * . pages (p) modulo kpm pages (kpmp)
+ */
+#define btokpmp(x) ((x) >> kpm_pgshft)
+#define btokpmpr(x) (((x) + kpm_pgoff) >> kpm_pgshft)
+#define ptokpmp(x) ((x) >> kpmp2pshft)
+#define ptokpmpr(x) (((x) + (kpmpnpgs - 1)) >> kpmp2pshft)
+#define kpmptop(x) ((x) << kpmp2pshft)
+#define kpmpageoff(x) ((x) & kpm_pgoff)
+#define pmodkpmp(x) ((x) & (kpmpnpgs - 1))
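A worked example of these conversions, using assumed values of kpm_pgsz == 4M (kpm_pgshft == 22) and PAGESIZE == 8K (so kpmp2pshft == 9 and kpmpnpgs == 512); the real values are set by the platform layer:

	btokpmp(0x1000000)   == 4	(16M of bytes span 4 kpm pages)
	ptokpmpr(513)        == 2	(513 small pages round up to 2 kpm pages)
	kpmpageoff(0x400123) == 0x123
	pmodkpmp(1000)       == 488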
+
+#ifdef SEGKPM_SUPPORT
+
+#define IS_KPM_ADDR(addr) \
+ ((addr) >= segkpm->s_base && (addr) < (segkpm->s_base + segkpm->s_size))
+
+#define KPMPAGE_T_SZ \
+ ((kpm_smallpages == 0) ? sizeof (kpm_page_t) : sizeof (kpm_spage_t))
+
+#else /* SEGKPM_SUPPORT */
+
+#define IS_KPM_ADDR(addr) (segkpm != NULL)
+#define KPMPAGE_T_SZ (0)
+
+#endif /* SEGKPM_SUPPORT */
+
+#ifdef _KERNEL
+/*
+ * Public seg_kpm segment operations.
+ */
+extern int segkpm_create(struct seg *, void *);
+extern faultcode_t segkpm_fault(struct hat *, struct seg *, caddr_t,
+ size_t, enum fault_type, enum seg_rw);
+
+/*
+ * Public seg_kpm interfaces.
+ */
+extern caddr_t segkpm_create_va(u_offset_t);
+extern void segkpm_mapout_validkpme(struct kpme *);
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _VM_SEG_KPM_H */
diff --git a/usr/src/uts/common/vm/seg_map.c b/usr/src/uts/common/vm/seg_map.c
new file mode 100644
index 0000000000..d4b6a16ca4
--- /dev/null
+++ b/usr/src/uts/common/vm/seg_map.c
@@ -0,0 +1,2345 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * VM - generic vnode mapping segment.
+ *
+ * The segmap driver is used only by the kernel to get faster (than seg_vn)
+ * mappings [lower routine overhead; more persistent cache] to random
+ * vnode/offsets. Note that the kernel may (and does) use seg_vn as well.
+ */
+
+#include <sys/types.h>
+#include <sys/t_lock.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/buf.h>
+#include <sys/systm.h>
+#include <sys/vnode.h>
+#include <sys/mman.h>
+#include <sys/errno.h>
+#include <sys/cred.h>
+#include <sys/kmem.h>
+#include <sys/vtrace.h>
+#include <sys/cmn_err.h>
+#include <sys/debug.h>
+#include <sys/thread.h>
+#include <sys/dumphdr.h>
+#include <sys/bitmap.h>
+#include <sys/lgrp.h>
+
+#include <vm/seg_kmem.h>
+#include <vm/hat.h>
+#include <vm/as.h>
+#include <vm/seg.h>
+#include <vm/seg_kpm.h>
+#include <vm/seg_map.h>
+#include <vm/page.h>
+#include <vm/pvn.h>
+#include <vm/rm.h>
+
+/*
+ * Private seg op routines.
+ */
+static void segmap_free(struct seg *seg);
+faultcode_t segmap_fault(struct hat *hat, struct seg *seg, caddr_t addr,
+ size_t len, enum fault_type type, enum seg_rw rw);
+static faultcode_t segmap_faulta(struct seg *seg, caddr_t addr);
+static int segmap_checkprot(struct seg *seg, caddr_t addr, size_t len,
+ uint_t prot);
+static int segmap_kluster(struct seg *seg, caddr_t addr, ssize_t);
+static int segmap_getprot(struct seg *seg, caddr_t addr, size_t len,
+ uint_t *protv);
+static u_offset_t segmap_getoffset(struct seg *seg, caddr_t addr);
+static int segmap_gettype(struct seg *seg, caddr_t addr);
+static int segmap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp);
+static void segmap_dump(struct seg *seg);
+static int segmap_pagelock(struct seg *seg, caddr_t addr, size_t len,
+ struct page ***ppp, enum lock_type type,
+ enum seg_rw rw);
+static void segmap_badop(void);
+static int segmap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp);
+static lgrp_mem_policy_info_t *segmap_getpolicy(struct seg *seg,
+ caddr_t addr);
+
+/* segkpm support */
+static caddr_t segmap_pagecreate_kpm(struct seg *, vnode_t *, u_offset_t,
+ struct smap *, enum seg_rw);
+struct smap *get_smap_kpm(caddr_t, page_t **);
+
+#define SEGMAP_BADOP(t) (t(*)())segmap_badop
+
+static struct seg_ops segmap_ops = {
+ SEGMAP_BADOP(int), /* dup */
+ SEGMAP_BADOP(int), /* unmap */
+ segmap_free,
+ segmap_fault,
+ segmap_faulta,
+ SEGMAP_BADOP(int), /* setprot */
+ segmap_checkprot,
+ segmap_kluster,
+ SEGMAP_BADOP(size_t), /* swapout */
+ SEGMAP_BADOP(int), /* sync */
+ SEGMAP_BADOP(size_t), /* incore */
+ SEGMAP_BADOP(int), /* lockop */
+ segmap_getprot,
+ segmap_getoffset,
+ segmap_gettype,
+ segmap_getvp,
+ SEGMAP_BADOP(int), /* advise */
+ segmap_dump,
+ segmap_pagelock, /* pagelock */
+ SEGMAP_BADOP(int), /* setpgsz */
+ segmap_getmemid, /* getmemid */
+ segmap_getpolicy, /* getpolicy */
+};
+
+/*
+ * Private segmap routines.
+ */
+static void segmap_unlock(struct hat *hat, struct seg *seg, caddr_t addr,
+ size_t len, enum seg_rw rw, struct smap *smp);
+static void segmap_smapadd(struct smap *smp);
+static struct smap *segmap_hashin(struct smap *smp, struct vnode *vp,
+ u_offset_t off, int hashid);
+static void segmap_hashout(struct smap *smp);
+
+
+/*
+ * Statistics for segmap operations.
+ *
+ * No explicit locking to protect these stats.
+ */
+struct segmapcnt segmapcnt = {
+ { "fault", KSTAT_DATA_ULONG },
+ { "faulta", KSTAT_DATA_ULONG },
+ { "getmap", KSTAT_DATA_ULONG },
+ { "get_use", KSTAT_DATA_ULONG },
+ { "get_reclaim", KSTAT_DATA_ULONG },
+ { "get_reuse", KSTAT_DATA_ULONG },
+ { "get_unused", KSTAT_DATA_ULONG },
+ { "get_nofree", KSTAT_DATA_ULONG },
+ { "rel_async", KSTAT_DATA_ULONG },
+ { "rel_write", KSTAT_DATA_ULONG },
+ { "rel_free", KSTAT_DATA_ULONG },
+ { "rel_abort", KSTAT_DATA_ULONG },
+ { "rel_dontneed", KSTAT_DATA_ULONG },
+ { "release", KSTAT_DATA_ULONG },
+ { "pagecreate", KSTAT_DATA_ULONG },
+ { "free_notfree", KSTAT_DATA_ULONG },
+ { "free_dirty", KSTAT_DATA_ULONG },
+ { "free", KSTAT_DATA_ULONG },
+ { "stolen", KSTAT_DATA_ULONG },
+ { "get_nomtx", KSTAT_DATA_ULONG }
+};
+
+kstat_named_t *segmapcnt_ptr = (kstat_named_t *)&segmapcnt;
+uint_t segmapcnt_ndata = sizeof (segmapcnt) / sizeof (kstat_named_t);
+
+/*
+ * Return number of map pages in segment.
+ */
+#define MAP_PAGES(seg) ((seg)->s_size >> MAXBSHIFT)
+
+/*
+ * Translate addr into smap number within segment.
+ */
+#define MAP_PAGE(seg, addr) (((addr) - (seg)->s_base) >> MAXBSHIFT)
+
+/*
+ * Translate addr in seg into struct smap pointer.
+ */
+#define GET_SMAP(seg, addr) \
+ &(((struct segmap_data *)((seg)->s_data))->smd_sm[MAP_PAGE(seg, addr)])
+
+/*
+ * Bit in map (16 bit bitmap).
+ */
+#define SMAP_BIT_MASK(bitindex) (1 << ((bitindex) & 0xf))
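A worked example, assuming MAXBSIZE == 8K (MAXBSHIFT == 13): each smap slot covers one MAXBSIZE window of the segment, so

	MAP_PAGE(seg, seg->s_base)          == 0
	MAP_PAGE(seg, seg->s_base + 0x6000) == 3
	SMAP_BIT_MASK(3)                    == 0x8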
+
+static int smd_colormsk = 0;
+static int smd_ncolor = 0;
+static int smd_nfree = 0;
+static int smd_freemsk = 0;
+#ifdef DEBUG
+static int *colors_used;
+#endif
+static struct smap *smd_smap;
+static struct smaphash *smd_hash;
+#ifdef SEGMAP_HASHSTATS
+static unsigned int *smd_hash_len;
+#endif
+static struct smfree *smd_free;
+static ulong_t smd_hashmsk = 0;
+
+#define SEGMAP_MAXCOLOR 2
+#define SEGMAP_CACHE_PAD 64
+
+union segmap_cpu {
+ struct {
+ uint32_t scpu_free_ndx[SEGMAP_MAXCOLOR];
+ struct smap *scpu_last_smap;
+ ulong_t scpu_getmap;
+ ulong_t scpu_release;
+ ulong_t scpu_get_reclaim;
+ ulong_t scpu_fault;
+ ulong_t scpu_pagecreate;
+ ulong_t scpu_get_reuse;
+ } scpu;
+ char scpu_pad[SEGMAP_CACHE_PAD];
+};
+static union segmap_cpu *smd_cpu;
+
+/*
+ * There are three locks in seg_map:
+ * - per freelist mutexes
+ * - per hashchain mutexes
+ * - per smap mutexes
+ *
+ * The lock ordering is to get the smap mutex to lock down the slot
+ * first then the hash lock (for hash in/out (vp, off) list) or the
+ * freelist lock to put the slot back on the free list.
+ *
+ * The hash search is done by only holding the hashchain lock, when a wanted
+ * slot is found, we drop the hashchain lock then lock the slot so there
+ * is no overlapping of hashchain and smap locks. After the slot is
+ * locked, we verify again if the slot is still what we are looking
+ * for.
+ *
+ * Allocation of a free slot is done by holding the freelist lock,
+ * then locking the smap slot at the head of the freelist. This is
+ * in reversed lock order so mutex_tryenter() is used.
+ *
+ * The smap lock protects all fields in smap structure except for
+ * the link fields for hash/free lists which are protected by
+ * hashchain and freelist locks.
+ */
+
+#define SHASHMTX(hashid) (&smd_hash[hashid].sh_mtx)
+
+#define SMP2SMF(smp) (&smd_free[(smp - smd_smap) & smd_freemsk])
+#define SMP2SMF_NDX(smp) (ushort_t)((smp - smd_smap) & smd_freemsk)
+
+#define SMAPMTX(smp) (&smp->sm_mtx)
+
+#define SMAP_HASHFUNC(vp, off, hashid) \
+ { \
+ hashid = ((((uintptr_t)(vp) >> 6) + ((uintptr_t)(vp) >> 3) + \
+ ((off) >> MAXBSHIFT)) & smd_hashmsk); \
+ }
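The hash-lookup protocol described in the locking comment above can be sketched as follows; this is an illustrative fragment (the enclosing declarations of vp, off, hashid, and smp are assumed), not a copy of the real lookup code:

	SMAP_HASHFUNC(vp, off, hashid);

	mutex_enter(SHASHMTX(hashid));
	for (smp = smd_hash[hashid].sh_hash_list; smp != NULL;
	    smp = smp->sm_hash)
		if (smp->sm_vp == vp && smp->sm_off == off)
			break;
	mutex_exit(SHASHMTX(hashid));	/* never hold hash and smap locks together */

	if (smp != NULL) {
		mutex_enter(SMAPMTX(smp));
		/* re-verify: the slot may have been reused while unlocked */
		if (smp->sm_vp != vp || smp->sm_off != off) {
			mutex_exit(SMAPMTX(smp));
			smp = NULL;
		}
	}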
+
+/*
+ * The most frequently updated kstat counters are kept in the
+ * per cpu array to avoid hot cache blocks. The update function
+ * sums the cpu local counters to update the global counters.
+ */
+
+/* ARGSUSED */
+int
+segmap_kstat_update(kstat_t *ksp, int rw)
+{
+ int i;
+ ulong_t getmap, release, get_reclaim;
+ ulong_t fault, pagecreate, get_reuse;
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+ getmap = release = get_reclaim = (ulong_t)0;
+ fault = pagecreate = get_reuse = (ulong_t)0;
+ for (i = 0; i < max_ncpus; i++) {
+ getmap += smd_cpu[i].scpu.scpu_getmap;
+ release += smd_cpu[i].scpu.scpu_release;
+ get_reclaim += smd_cpu[i].scpu.scpu_get_reclaim;
+ fault += smd_cpu[i].scpu.scpu_fault;
+ pagecreate += smd_cpu[i].scpu.scpu_pagecreate;
+ get_reuse += smd_cpu[i].scpu.scpu_get_reuse;
+ }
+ segmapcnt.smp_getmap.value.ul = getmap;
+ segmapcnt.smp_release.value.ul = release;
+ segmapcnt.smp_get_reclaim.value.ul = get_reclaim;
+ segmapcnt.smp_fault.value.ul = fault;
+ segmapcnt.smp_pagecreate.value.ul = pagecreate;
+ segmapcnt.smp_get_reuse.value.ul = get_reuse;
+ return (0);
+}
+
+int
+segmap_create(struct seg *seg, void *argsp)
+{
+ struct segmap_data *smd;
+ struct smap *smp;
+ struct smfree *sm;
+ struct segmap_crargs *a = (struct segmap_crargs *)argsp;
+ struct smaphash *shashp;
+ union segmap_cpu *scpu;
+ long i, npages;
+ size_t hashsz;
+ uint_t nfreelist;
+ extern void prefetch_smap_w(void *);
+ extern int max_ncpus;
+
+ ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock));
+
+ if (((uintptr_t)seg->s_base | seg->s_size) & MAXBOFFSET) {
+ panic("segkmap not MAXBSIZE aligned");
+ /*NOTREACHED*/
+ }
+
+ smd = kmem_zalloc(sizeof (struct segmap_data), KM_SLEEP);
+
+ seg->s_data = (void *)smd;
+ seg->s_ops = &segmap_ops;
+ smd->smd_prot = a->prot;
+
+ /*
+ * Scale the number of smap freelists to be
+ * proportional to max_ncpus * number of virtual colors.
+ * The caller can over-ride this scaling by providing
+ * a non-zero a->nfreelist argument.
+ */
+ nfreelist = a->nfreelist;
+ if (nfreelist == 0)
+ nfreelist = max_ncpus;
+ else if (nfreelist < 0 || nfreelist > 4 * max_ncpus) {
+ cmn_err(CE_WARN, "segmap_create: nfreelist out of range "
+ "%d, using %d", nfreelist, max_ncpus);
+ nfreelist = max_ncpus;
+ }
+ if (nfreelist & (nfreelist - 1)) {
+ /* round up nfreelist to the next power of two. */
+ nfreelist = 1 << (highbit(nfreelist));
+ }
+
+ /*
+ * Get the number of virtual colors - must be a power of 2.
+ */
+ if (a->shmsize)
+ smd_ncolor = a->shmsize >> MAXBSHIFT;
+ else
+ smd_ncolor = 1;
+ ASSERT((smd_ncolor & (smd_ncolor - 1)) == 0);
+ ASSERT(smd_ncolor <= SEGMAP_MAXCOLOR);
+ smd_colormsk = smd_ncolor - 1;
+ smd->smd_nfree = smd_nfree = smd_ncolor * nfreelist;
+ smd_freemsk = smd_nfree - 1;
+
+ /*
+ * Allocate and initialize the freelist headers.
+ * Note that sm_freeq[1] starts out as the release queue. This
+ * is known when the smap structures are initialized below.
+ */
+ smd_free = smd->smd_free =
+ kmem_zalloc(smd_nfree * sizeof (struct smfree), KM_SLEEP);
+ for (i = 0; i < smd_nfree; i++) {
+ sm = &smd->smd_free[i];
+ mutex_init(&sm->sm_freeq[0].smq_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&sm->sm_freeq[1].smq_mtx, NULL, MUTEX_DEFAULT, NULL);
+ sm->sm_allocq = &sm->sm_freeq[0];
+ sm->sm_releq = &sm->sm_freeq[1];
+ }
+
+ /*
+ * Allocate and initialize the smap hash chain headers.
+ * Compute hash size rounding down to the next power of two.
+ */
+ npages = MAP_PAGES(seg);
+ smd->smd_npages = npages;
+ hashsz = npages / SMAP_HASHAVELEN;
+ hashsz = 1 << (highbit(hashsz)-1);
+ smd_hashmsk = hashsz - 1;
+ smd_hash = smd->smd_hash =
+ kmem_alloc(hashsz * sizeof (struct smaphash), KM_SLEEP);
+#ifdef SEGMAP_HASHSTATS
+ smd_hash_len =
+ kmem_zalloc(hashsz * sizeof (unsigned int), KM_SLEEP);
+#endif
+ for (i = 0, shashp = smd_hash; i < hashsz; i++, shashp++) {
+ shashp->sh_hash_list = NULL;
+ mutex_init(&shashp->sh_mtx, NULL, MUTEX_DEFAULT, NULL);
+ }
+
+ /*
+ * Allocate and initialize the smap structures.
+ * Link all slots onto the appropriate freelist.
+ * The smap array is large enough to affect boot time
+	 * on large systems, so use memory prefetching and only
+	 * go through the array once. Inline an optimized version
+ * of segmap_smapadd to add structures to freelists with
+ * knowledge that no locks are needed here.
+ */
+ smd_smap = smd->smd_sm =
+ kmem_alloc(sizeof (struct smap) * npages, KM_SLEEP);
+
+ for (smp = &smd->smd_sm[MAP_PAGES(seg) - 1];
+ smp >= smd->smd_sm; smp--) {
+ struct smap *smpfreelist;
+ struct sm_freeq *releq;
+
+ prefetch_smap_w((char *)smp);
+
+ smp->sm_vp = NULL;
+ smp->sm_hash = NULL;
+ smp->sm_off = 0;
+ smp->sm_bitmap = 0;
+ smp->sm_refcnt = 0;
+ mutex_init(&smp->sm_mtx, NULL, MUTEX_DEFAULT, NULL);
+ smp->sm_free_ndx = SMP2SMF_NDX(smp);
+
+ sm = SMP2SMF(smp);
+ releq = sm->sm_releq;
+
+ smpfreelist = releq->smq_free;
+ if (smpfreelist == 0) {
+ releq->smq_free = smp->sm_next = smp->sm_prev = smp;
+ } else {
+ smp->sm_next = smpfreelist;
+ smp->sm_prev = smpfreelist->sm_prev;
+ smpfreelist->sm_prev = smp;
+ smp->sm_prev->sm_next = smp;
+ releq->smq_free = smp->sm_next;
+ }
+
+ /*
+ * sm_flag = 0 (no SM_QNDX_ZERO) implies smap on sm_freeq[1]
+ */
+ smp->sm_flags = 0;
+
+#ifdef SEGKPM_SUPPORT
+ /*
+ * Due to the fragile prefetch loop no
+ * separate function is used here.
+ */
+ smp->sm_kpme_next = NULL;
+ smp->sm_kpme_prev = NULL;
+ smp->sm_kpme_page = NULL;
+#endif
+ }
+
+ /*
+ * Allocate the per color indices that distribute allocation
+ * requests over the free lists. Each cpu will have a private
+ * rotor index to spread the allocations even across the available
+ * smap freelists. Init the scpu_last_smap field to the first
+ * smap element so there is no need to check for NULL.
+ */
+ smd_cpu =
+ kmem_zalloc(sizeof (union segmap_cpu) * max_ncpus, KM_SLEEP);
+ for (i = 0, scpu = smd_cpu; i < max_ncpus; i++, scpu++) {
+ int j;
+ for (j = 0; j < smd_ncolor; j++)
+ scpu->scpu.scpu_free_ndx[j] = j;
+ scpu->scpu.scpu_last_smap = smd_smap;
+ }
+
+#ifdef DEBUG
+ /*
+ * Keep track of which colors are used more often.
+ */
+ colors_used = kmem_zalloc(smd_nfree * sizeof (int), KM_SLEEP);
+#endif /* DEBUG */
+
+ return (0);
+}
+
+static void
+segmap_free(struct seg *seg)
+{
+ ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock));
+}
+
+/*
+ * Do a F_SOFTUNLOCK call over the range requested.
+ * The range must have already been F_SOFTLOCK'ed.
+ */
+static void
+segmap_unlock(
+ struct hat *hat,
+ struct seg *seg,
+ caddr_t addr,
+ size_t len,
+ enum seg_rw rw,
+ struct smap *smp)
+{
+ page_t *pp;
+ caddr_t adr;
+ u_offset_t off;
+ struct vnode *vp;
+ kmutex_t *smtx;
+
+ ASSERT(smp->sm_refcnt > 0);
+
+#ifdef lint
+ seg = seg;
+#endif
+
+ if (segmap_kpm && IS_KPM_ADDR(addr)) {
+
+ /*
+ * We're called only from segmap_fault and this was a
+ * NOP in case of a kpm based smap, so dangerous things
+ * must have happened in the meantime. Pages are prefaulted
+ * and locked in segmap_getmapflt and they will not be
+ * unlocked until segmap_release.
+ */
+ panic("segmap_unlock: called with kpm addr %p", (void *)addr);
+ /*NOTREACHED*/
+ }
+
+ vp = smp->sm_vp;
+ off = smp->sm_off + (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
+
+ hat_unlock(hat, addr, P2ROUNDUP(len, PAGESIZE));
+ for (adr = addr; adr < addr + len; adr += PAGESIZE, off += PAGESIZE) {
+ ushort_t bitmask;
+
+ /*
+ * Use page_find() instead of page_lookup() to
+ * find the page since we know that it has
+ * "shared" lock.
+ */
+ pp = page_find(vp, off);
+ if (pp == NULL) {
+ panic("segmap_unlock: page not found");
+ /*NOTREACHED*/
+ }
+
+ if (rw == S_WRITE) {
+ hat_setrefmod(pp);
+ } else if (rw != S_OTHER) {
+ TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
+ "segmap_fault:pp %p vp %p offset %llx",
+ pp, vp, off);
+ hat_setref(pp);
+ }
+
+ /*
+ * Clear bitmap, if the bit corresponding to "off" is set,
+ * since the page and translation are being unlocked.
+ */
+ bitmask = SMAP_BIT_MASK((off - smp->sm_off) >> PAGESHIFT);
+
+ /*
+ * Large Files: Following assertion is to verify
+ * the correctness of the cast to (int) above.
+ */
+ ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
+ smtx = SMAPMTX(smp);
+ mutex_enter(smtx);
+ if (smp->sm_bitmap & bitmask) {
+ smp->sm_bitmap &= ~bitmask;
+ }
+ mutex_exit(smtx);
+
+ page_unlock(pp);
+ }
+}
+
+#define MAXPPB (MAXBSIZE/4096) /* assumes minimum page size of 4k */
+
+/*
+ * This routine is called via a machine specific fault handling
+ * routine. It is also called by software routines wishing to
+ * lock or unlock a range of addresses.
+ *
+ * Note that this routine expects a page-aligned "addr".
+ */
+faultcode_t
+segmap_fault(
+ struct hat *hat,
+ struct seg *seg,
+ caddr_t addr,
+ size_t len,
+ enum fault_type type,
+ enum seg_rw rw)
+{
+ struct segmap_data *smd = (struct segmap_data *)seg->s_data;
+ struct smap *smp;
+ page_t *pp, **ppp;
+ struct vnode *vp;
+ u_offset_t off;
+ page_t *pl[MAXPPB + 1];
+ uint_t prot;
+ u_offset_t addroff;
+ caddr_t adr;
+ int err;
+ u_offset_t sm_off;
+ int hat_flag;
+
+ if (segmap_kpm && IS_KPM_ADDR(addr)) {
+ int newpage;
+ kmutex_t *smtx;
+
+ /*
+ * Pages are successfully prefaulted and locked in
+ * segmap_getmapflt and can't be unlocked until
+ * segmap_release. No hat mappings have to be locked
+ * and they also can't be unlocked as long as the
+ * caller owns an active kpm addr.
+ */
+#ifndef DEBUG
+ if (type != F_SOFTUNLOCK)
+ return (0);
+#endif
+
+ if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
+ panic("segmap_fault: smap not found "
+ "for addr %p", (void *)addr);
+ /*NOTREACHED*/
+ }
+
+ smtx = SMAPMTX(smp);
+#ifdef DEBUG
+ newpage = smp->sm_flags & SM_KPM_NEWPAGE;
+ if (newpage) {
+ cmn_err(CE_WARN, "segmap_fault: newpage? smp %p",
+ (void *)smp);
+ }
+
+ if (type != F_SOFTUNLOCK) {
+ mutex_exit(smtx);
+ return (0);
+ }
+#endif
+ mutex_exit(smtx);
+ vp = smp->sm_vp;
+ sm_off = smp->sm_off;
+
+ if (vp == NULL)
+ return (FC_MAKE_ERR(EIO));
+
+ ASSERT(smp->sm_refcnt > 0);
+
+ addroff = (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
+ if (addroff + len > MAXBSIZE)
+ panic("segmap_fault: endaddr %p exceeds MAXBSIZE chunk",
+ (void *)(addr + len));
+
+ off = sm_off + addroff;
+
+ pp = page_find(vp, off);
+
+ if (pp == NULL)
+ panic("segmap_fault: softunlock page not found");
+
+ /*
+ * Set ref bit also here in case of S_OTHER to avoid the
+ * overhead of supporting other cases than F_SOFTUNLOCK
+ * with segkpm. We can do this because the underlying
+ * pages are locked anyway.
+ */
+ if (rw == S_WRITE) {
+ hat_setrefmod(pp);
+ } else {
+ TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
+ "segmap_fault:pp %p vp %p offset %llx",
+ pp, vp, off);
+ hat_setref(pp);
+ }
+
+ return (0);
+ }
+
+ smd_cpu[CPU->cpu_seqid].scpu.scpu_fault++;
+ smp = GET_SMAP(seg, addr);
+ vp = smp->sm_vp;
+ sm_off = smp->sm_off;
+
+ if (vp == NULL)
+ return (FC_MAKE_ERR(EIO));
+
+ ASSERT(smp->sm_refcnt > 0);
+
+ addroff = (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
+ if (addroff + len > MAXBSIZE) {
+ panic("segmap_fault: endaddr %p "
+ "exceeds MAXBSIZE chunk", (void *)(addr + len));
+ /*NOTREACHED*/
+ }
+ off = sm_off + addroff;
+
+ /*
+ * First handle the easy stuff
+ */
+ if (type == F_SOFTUNLOCK) {
+ segmap_unlock(hat, seg, addr, len, rw, smp);
+ return (0);
+ }
+
+ TRACE_3(TR_FAC_VM, TR_SEGMAP_GETPAGE,
+ "segmap_getpage:seg %p addr %p vp %p", seg, addr, vp);
+ err = VOP_GETPAGE(vp, (offset_t)off, len, &prot, pl, MAXBSIZE,
+ seg, addr, rw, CRED());
+
+ if (err)
+ return (FC_MAKE_ERR(err));
+
+ prot &= smd->smd_prot;
+
+ /*
+ * Handle all pages returned in the pl[] array.
+ * This loop is coded on the assumption that if
+ * there was no error from the VOP_GETPAGE routine,
+ * that the page list returned will contain all the
+ * needed pages for the vp from [off..off + len].
+ */
+ ppp = pl;
+ while ((pp = *ppp++) != NULL) {
+ u_offset_t poff;
+ ASSERT(pp->p_vnode == vp);
+ hat_flag = HAT_LOAD;
+
+ /*
+ * Verify that the pages returned are within the range
+ * of this segmap region. Note that it is theoretically
+ * possible for pages outside this range to be returned,
+ * but it is not very likely. If we cannot use the
+ * page here, just release it and go on to the next one.
+ */
+ if (pp->p_offset < sm_off ||
+ pp->p_offset >= sm_off + MAXBSIZE) {
+ (void) page_release(pp, 1);
+ continue;
+ }
+
+ ASSERT(hat == kas.a_hat);
+ poff = pp->p_offset;
+ adr = addr + (poff - off);
+ if (adr >= addr && adr < addr + len) {
+ hat_setref(pp);
+ TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
+ "segmap_fault:pp %p vp %p offset %llx",
+ pp, vp, poff);
+ if (type == F_SOFTLOCK)
+ hat_flag = HAT_LOAD_LOCK;
+ }
+
+ /*
+ * Deal with VMODSORT pages here. If we know this is a write
+ * do the setmod now and allow write protection.
+ * As long as it's modified or not S_OTHER, remove write
+ * protection. With S_OTHER it's up to the FS to deal with this.
+ */
+ if (IS_VMODSORT(vp)) {
+ if (rw == S_WRITE)
+ hat_setmod(pp);
+ else if (rw != S_OTHER && !hat_ismod(pp))
+ prot &= ~PROT_WRITE;
+ }
+
+ hat_memload(hat, adr, pp, prot, hat_flag);
+ if (hat_flag != HAT_LOAD_LOCK)
+ page_unlock(pp);
+ }
+ return (0);
+}
+
+/*
+ * This routine is used to start I/O on pages asynchronously.
+ */
+static faultcode_t
+segmap_faulta(struct seg *seg, caddr_t addr)
+{
+ struct smap *smp;
+ struct vnode *vp;
+ u_offset_t off;
+ int err;
+
+ if (segmap_kpm && IS_KPM_ADDR(addr)) {
+ int newpage;
+ kmutex_t *smtx;
+
+ /*
+ * Pages are successfully prefaulted and locked in
+ * segmap_getmapflt and can't be unlocked until
+ * segmap_release. No hat mappings have to be locked
+ * and they also can't be unlocked as long as the
+ * caller owns an active kpm addr.
+ */
+#ifdef DEBUG
+ if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
+ panic("segmap_faulta: smap not found "
+ "for addr %p", (void *)addr);
+ /*NOTREACHED*/
+ }
+
+ smtx = SMAPMTX(smp);
+ newpage = smp->sm_flags & SM_KPM_NEWPAGE;
+ mutex_exit(smtx);
+ if (newpage)
+ cmn_err(CE_WARN, "segmap_faulta: newpage? smp %p",
+ (void *)smp);
+#endif
+ return (0);
+ }
+
+ segmapcnt.smp_faulta.value.ul++;
+ smp = GET_SMAP(seg, addr);
+
+ ASSERT(smp->sm_refcnt > 0);
+
+ vp = smp->sm_vp;
+ off = smp->sm_off;
+
+ if (vp == NULL) {
+ cmn_err(CE_WARN, "segmap_faulta - no vp");
+ return (FC_MAKE_ERR(EIO));
+ }
+
+ TRACE_3(TR_FAC_VM, TR_SEGMAP_GETPAGE,
+ "segmap_getpage:seg %p addr %p vp %p", seg, addr, vp);
+
+ err = VOP_GETPAGE(vp, (offset_t)(off + ((offset_t)((uintptr_t)addr
+ & MAXBOFFSET))), PAGESIZE, (uint_t *)NULL, (page_t **)NULL, 0,
+ seg, addr, S_READ, CRED());
+
+ if (err)
+ return (FC_MAKE_ERR(err));
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+segmap_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
+{
+ struct segmap_data *smd = (struct segmap_data *)seg->s_data;
+
+ ASSERT(seg->s_as && RW_LOCK_HELD(&seg->s_as->a_lock));
+
+ /*
+ * Need not acquire the segment lock since
+ * "smd_prot" is a read-only field.
+ */
+ return (((smd->smd_prot & prot) != prot) ? EACCES : 0);
+}
+
+static int
+segmap_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
+{
+ struct segmap_data *smd = (struct segmap_data *)seg->s_data;
+ size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1;
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ if (pgno != 0) {
+ do
+ protv[--pgno] = smd->smd_prot;
+ while (pgno != 0);
+ }
+ return (0);
+}
+
+static u_offset_t
+segmap_getoffset(struct seg *seg, caddr_t addr)
+{
+ struct segmap_data *smd = (struct segmap_data *)seg->s_data;
+
+ ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
+
+ return ((u_offset_t)smd->smd_sm->sm_off + (addr - seg->s_base));
+}
+
+/*ARGSUSED*/
+static int
+segmap_gettype(struct seg *seg, caddr_t addr)
+{
+ ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
+
+ return (MAP_SHARED);
+}
+
+/*ARGSUSED*/
+static int
+segmap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
+{
+ struct segmap_data *smd = (struct segmap_data *)seg->s_data;
+
+ ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
+
+ /* XXX - This doesn't make any sense */
+ *vpp = smd->smd_sm->sm_vp;
+ return (0);
+}
+
+/*
+ * Check to see if it makes sense to do kluster/read ahead to
+ * addr + delta relative to the mapping at addr. We assume here
+ * that delta is a signed PAGESIZE'd multiple (which can be negative).
+ *
+ * For segmap we always "approve" of this action from our standpoint.
+ */
+/*ARGSUSED*/
+static int
+segmap_kluster(struct seg *seg, caddr_t addr, ssize_t delta)
+{
+ return (0);
+}
+
+static void
+segmap_badop()
+{
+ panic("segmap_badop");
+ /*NOTREACHED*/
+}
+
+/*
+ * Special private segmap operations
+ */
+
+/*
+ * Add smap to the appropriate free list.
+ */
+static void
+segmap_smapadd(struct smap *smp)
+{
+ struct smfree *sm;
+ struct smap *smpfreelist;
+ struct sm_freeq *releq;
+
+ ASSERT(MUTEX_HELD(SMAPMTX(smp)));
+
+ if (smp->sm_refcnt != 0) {
+ panic("segmap_smapadd");
+ /*NOTREACHED*/
+ }
+
+ sm = &smd_free[smp->sm_free_ndx];
+ /*
+ * Add to the tail of the release queue
+ * Note that sm_releq and sm_allocq could toggle
+ * before we get the lock. This does not affect
+ * correctness as the 2 queues are only maintained
+ * to reduce lock pressure.
+ */
+ releq = sm->sm_releq;
+ if (releq == &sm->sm_freeq[0])
+ smp->sm_flags |= SM_QNDX_ZERO;
+ else
+ smp->sm_flags &= ~SM_QNDX_ZERO;
+ mutex_enter(&releq->smq_mtx);
+ smpfreelist = releq->smq_free;
+ if (smpfreelist == 0) {
+ int want;
+
+ releq->smq_free = smp->sm_next = smp->sm_prev = smp;
+ /*
+ * Both queue mutexes held to set sm_want;
+ * snapshot the value before dropping releq mutex.
+ * If sm_want appears after the releq mutex is dropped,
+ * then the smap just freed is already gone.
+ */
+ want = sm->sm_want;
+ mutex_exit(&releq->smq_mtx);
+ /*
+		 * See if there was a waiter before dropping the releq mutex,
+		 * then recheck after obtaining the sm_freeq[0] mutex, as
+		 * another thread may have already signaled the waiter.
+ */
+ if (want) {
+ mutex_enter(&sm->sm_freeq[0].smq_mtx);
+ if (sm->sm_want)
+ cv_signal(&sm->sm_free_cv);
+ mutex_exit(&sm->sm_freeq[0].smq_mtx);
+ }
+ } else {
+ smp->sm_next = smpfreelist;
+ smp->sm_prev = smpfreelist->sm_prev;
+ smpfreelist->sm_prev = smp;
+ smp->sm_prev->sm_next = smp;
+ mutex_exit(&releq->smq_mtx);
+ }
+}
+
+
+static struct smap *
+segmap_hashin(struct smap *smp, struct vnode *vp, u_offset_t off, int hashid)
+{
+ struct smap **hpp;
+ struct smap *tmp;
+ kmutex_t *hmtx;
+
+ ASSERT(MUTEX_HELD(SMAPMTX(smp)));
+ ASSERT(smp->sm_vp == NULL);
+ ASSERT(smp->sm_hash == NULL);
+ ASSERT(smp->sm_prev == NULL);
+ ASSERT(smp->sm_next == NULL);
+ ASSERT(hashid >= 0 && hashid <= smd_hashmsk);
+
+ hmtx = SHASHMTX(hashid);
+
+ mutex_enter(hmtx);
+ /*
+	 * First we need to verify that no one has created an smp
+	 * with (vp, off) as its tag before us.
+ */
+ for (tmp = smd_hash[hashid].sh_hash_list;
+ tmp != NULL; tmp = tmp->sm_hash)
+ if (tmp->sm_vp == vp && tmp->sm_off == off)
+ break;
+
+ if (tmp == NULL) {
+ /*
+ * No one created one yet.
+ *
+ * Funniness here - we don't increment the ref count on the
+ * vnode * even though we have another pointer to it here.
+ * The reason for this is that we don't want the fact that
+ * a seg_map entry somewhere refers to a vnode to prevent the
+ * vnode * itself from going away. This is because this
+ * reference to the vnode is a "soft one". In the case where
+ * a mapping is being used by a rdwr [or directory routine?]
+ * there already has to be a non-zero ref count on the vnode.
+		 * In the case where the vp has been freed and the smap
+ * structure is on the free list, there are no pages in memory
+ * that can refer to the vnode. Thus even if we reuse the same
+ * vnode/smap structure for a vnode which has the same
+ * address but represents a different object, we are ok.
+ */
+ smp->sm_vp = vp;
+ smp->sm_off = off;
+
+ hpp = &smd_hash[hashid].sh_hash_list;
+ smp->sm_hash = *hpp;
+ *hpp = smp;
+#ifdef SEGMAP_HASHSTATS
+ smd_hash_len[hashid]++;
+#endif
+ }
+ mutex_exit(hmtx);
+
+ return (tmp);
+}
+
+static void
+segmap_hashout(struct smap *smp)
+{
+ struct smap **hpp, *hp;
+ struct vnode *vp;
+ kmutex_t *mtx;
+ int hashid;
+ u_offset_t off;
+
+ ASSERT(MUTEX_HELD(SMAPMTX(smp)));
+
+ vp = smp->sm_vp;
+ off = smp->sm_off;
+
+ SMAP_HASHFUNC(vp, off, hashid); /* macro assigns hashid */
+ mtx = SHASHMTX(hashid);
+ mutex_enter(mtx);
+
+ hpp = &smd_hash[hashid].sh_hash_list;
+ for (;;) {
+ hp = *hpp;
+ if (hp == NULL) {
+ panic("segmap_hashout");
+ /*NOTREACHED*/
+ }
+ if (hp == smp)
+ break;
+ hpp = &hp->sm_hash;
+ }
+
+ *hpp = smp->sm_hash;
+ smp->sm_hash = NULL;
+#ifdef SEGMAP_HASHSTATS
+ smd_hash_len[hashid]--;
+#endif
+ mutex_exit(mtx);
+
+ smp->sm_vp = NULL;
+ smp->sm_off = (u_offset_t)0;
+
+}
+
+/*
+ * Attempt to free unmodified, unmapped, and unlocked segmap
+ * pages.
+ */
+void
+segmap_pagefree(struct vnode *vp, u_offset_t off)
+{
+ u_offset_t pgoff;
+ page_t *pp;
+
+ for (pgoff = off; pgoff < off + MAXBSIZE; pgoff += PAGESIZE) {
+
+ if ((pp = page_lookup_nowait(vp, pgoff, SE_EXCL)) == NULL)
+ continue;
+
+ switch (page_release(pp, 1)) {
+ case PGREL_NOTREL:
+ segmapcnt.smp_free_notfree.value.ul++;
+ break;
+ case PGREL_MOD:
+ segmapcnt.smp_free_dirty.value.ul++;
+ break;
+ case PGREL_CLEAN:
+ segmapcnt.smp_free.value.ul++;
+ break;
+ }
+ }
+}
+
+/*
+ * Locks held on entry: smap lock
+ * Locks held on exit:  smap lock
+ */
+
+static void
+grab_smp(struct smap *smp, page_t *pp)
+{
+ ASSERT(MUTEX_HELD(SMAPMTX(smp)));
+ ASSERT(smp->sm_refcnt == 0);
+
+ if (smp->sm_vp != (struct vnode *)NULL) {
+ struct vnode *vp = smp->sm_vp;
+ u_offset_t off = smp->sm_off;
+ /*
+ * Destroy old vnode association and
+ * unload any hardware translations to
+ * the old object.
+ */
+ smd_cpu[CPU->cpu_seqid].scpu.scpu_get_reuse++;
+ segmap_hashout(smp);
+
+ /*
+ * This node is off freelist and hashlist,
+ * so there is no reason to drop/reacquire sm_mtx
+ * across calls to hat_unload.
+ */
+ if (segmap_kpm) {
+ caddr_t vaddr;
+ int hat_unload_needed = 0;
+
+ /*
+ * unload kpm mapping
+ */
+ if (pp != NULL) {
+ vaddr = hat_kpm_page2va(pp, 1);
+ hat_kpm_mapout(pp, GET_KPME(smp), vaddr);
+ page_unlock(pp);
+ }
+
+ /*
+ * Check if we have (also) the rare case of a
+ * non kpm mapping.
+ */
+ if (smp->sm_flags & SM_NOTKPM_RELEASED) {
+ hat_unload_needed = 1;
+ smp->sm_flags &= ~SM_NOTKPM_RELEASED;
+ }
+
+ if (hat_unload_needed) {
+ hat_unload(kas.a_hat, segkmap->s_base +
+ ((smp - smd_smap) * MAXBSIZE),
+ MAXBSIZE, HAT_UNLOAD);
+ }
+
+ } else {
+ ASSERT(smp->sm_flags & SM_NOTKPM_RELEASED);
+ smp->sm_flags &= ~SM_NOTKPM_RELEASED;
+ hat_unload(kas.a_hat, segkmap->s_base +
+ ((smp - smd_smap) * MAXBSIZE),
+ MAXBSIZE, HAT_UNLOAD);
+ }
+ segmap_pagefree(vp, off);
+ }
+}
+
+static struct smap *
+get_free_smp(int free_ndx)
+{
+ struct smfree *sm;
+ kmutex_t *smtx;
+ struct smap *smp, *first;
+ struct sm_freeq *allocq, *releq;
+ struct kpme *kpme;
+ page_t *pp = NULL;
+ int end_ndx, page_locked = 0;
+
+ end_ndx = free_ndx;
+ sm = &smd_free[free_ndx];
+
+retry_queue:
+ allocq = sm->sm_allocq;
+ mutex_enter(&allocq->smq_mtx);
+
+ if ((smp = allocq->smq_free) == NULL) {
+
+skip_queue:
+ /*
+ * The alloc list is empty or this queue is being skipped;
+ * first see if the allocq toggled.
+ */
+ if (sm->sm_allocq != allocq) {
+ /* queue changed */
+ mutex_exit(&allocq->smq_mtx);
+ goto retry_queue;
+ }
+ releq = sm->sm_releq;
+ if (!mutex_tryenter(&releq->smq_mtx)) {
+ /* cannot get releq; a free smp may be there now */
+ mutex_exit(&allocq->smq_mtx);
+
+ /*
+ * This loop could spin forever if this thread has
+ * higher priority than the thread that is holding
+ * releq->smq_mtx. In order to force the other thread
+ * to run, we'll lock/unlock the mutex which is safe
+ * since we just unlocked the allocq mutex.
+ */
+ mutex_enter(&releq->smq_mtx);
+ mutex_exit(&releq->smq_mtx);
+ goto retry_queue;
+ }
+ if (releq->smq_free == NULL) {
+ /*
+ * This freelist is empty.
+ * This should not happen unless clients
+ * are failing to release the segmap
+ * window after accessing the data.
+ * Before resorting to sleeping, try
+ * the next list of the same color.
+ */
+ free_ndx = (free_ndx + smd_ncolor) & smd_freemsk;
+ if (free_ndx != end_ndx) {
+ mutex_exit(&releq->smq_mtx);
+ mutex_exit(&allocq->smq_mtx);
+ sm = &smd_free[free_ndx];
+ goto retry_queue;
+ }
+ /*
+ * Tried all freelists of the same color once,
+ * wait on this list and hope something gets freed.
+ */
+ segmapcnt.smp_get_nofree.value.ul++;
+ sm->sm_want++;
+ mutex_exit(&sm->sm_freeq[1].smq_mtx);
+ cv_wait(&sm->sm_free_cv,
+ &sm->sm_freeq[0].smq_mtx);
+ sm->sm_want--;
+ mutex_exit(&sm->sm_freeq[0].smq_mtx);
+ sm = &smd_free[free_ndx];
+ goto retry_queue;
+ } else {
+ /*
+ * Something on the rele queue; flip the alloc
+ * and rele queues and retry.
+ */
+ sm->sm_allocq = releq;
+ sm->sm_releq = allocq;
+ mutex_exit(&allocq->smq_mtx);
+ mutex_exit(&releq->smq_mtx);
+ if (page_locked) {
+ delay(hz >> 2);
+ page_locked = 0;
+ }
+ goto retry_queue;
+ }
+ } else {
+ /*
+ * Fastpath the case we get the smap mutex
+ * on the first try.
+ */
+ first = smp;
+next_smap:
+ smtx = SMAPMTX(smp);
+ if (!mutex_tryenter(smtx)) {
+ /*
+ * Another thread is trying to reclaim this slot.
+ * Skip to the next queue or smap.
+ */
+ if ((smp = smp->sm_next) == first) {
+ goto skip_queue;
+ } else {
+ goto next_smap;
+ }
+ } else {
+ /*
+ * if kpme exists, get shared lock on the page
+ */
+ if (segmap_kpm && smp->sm_vp != NULL) {
+
+ kpme = GET_KPME(smp);
+ pp = kpme->kpe_page;
+
+ if (pp != NULL) {
+ if (!page_trylock(pp, SE_SHARED)) {
+ smp = smp->sm_next;
+ mutex_exit(smtx);
+ page_locked = 1;
+
+ pp = NULL;
+
+ if (smp == first) {
+ goto skip_queue;
+ } else {
+ goto next_smap;
+ }
+ } else {
+ if (kpme->kpe_page == NULL) {
+ page_unlock(pp);
+ pp = NULL;
+ }
+ }
+ }
+ }
+
+ /*
+ * At this point, we've selected smp. Remove smp
+ * from its freelist. If smp is the first one in
+ * the freelist, update the head of the freelist.
+ */
+ if (first == smp) {
+ ASSERT(first == allocq->smq_free);
+ allocq->smq_free = smp->sm_next;
+ }
+
+ /*
+ * if the head of the freelist still points to smp,
+ * then there are no more free smaps in that list.
+ */
+ if (allocq->smq_free == smp)
+ /*
+ * Took the last one
+ */
+ allocq->smq_free = NULL;
+ else {
+ smp->sm_prev->sm_next = smp->sm_next;
+ smp->sm_next->sm_prev = smp->sm_prev;
+ }
+ mutex_exit(&allocq->smq_mtx);
+ smp->sm_prev = smp->sm_next = NULL;
+
+ /*
+ * if pp != NULL, pp must have been locked;
+ * grab_smp() unlocks pp.
+ */
+ ASSERT((pp == NULL) || PAGE_LOCKED(pp));
+ grab_smp(smp, pp);
+ /* return smp locked. */
+ ASSERT(SMAPMTX(smp) == smtx);
+ ASSERT(MUTEX_HELD(smtx));
+ return (smp);
+ }
+ }
+}
+
+/*
+ * Special public segmap operations
+ */
+
+/*
+ * Create pages (without using VOP_GETPAGE) and load up translations to them.
+ * If softlock is TRUE, then set things up so that it looks like a call
+ * to segmap_fault with F_SOFTLOCK.
+ *
+ * Returns 1 if a page is created by calling page_create_va(), or 0 otherwise.
+ *
+ * All fields in the generic segment (struct seg) are considered to be
+ * read-only for "segmap" even though the kernel address space (kas) may
+ * not be locked, hence no lock is needed to access them.
+ */
+int
+segmap_pagecreate(struct seg *seg, caddr_t addr, size_t len, int softlock)
+{
+ struct segmap_data *smd = (struct segmap_data *)seg->s_data;
+ page_t *pp;
+ u_offset_t off;
+ struct smap *smp;
+ struct vnode *vp;
+ caddr_t eaddr;
+ int newpage = 0;
+ uint_t prot;
+ kmutex_t *smtx;
+ int hat_flag;
+
+ ASSERT(seg->s_as == &kas);
+
+ if (segmap_kpm && IS_KPM_ADDR(addr)) {
+ /*
+ * Pages are successfully prefaulted and locked in
+ * segmap_getmapflt and can't be unlocked until
+ * segmap_release. The SM_KPM_NEWPAGE flag is set
+		 * in segmap_pagecreate_kpm when new pages are created,
+		 * and it is returned as the "newpage" indication here.
+ */
+ if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
+ panic("segmap_pagecreate: smap not found "
+ "for addr %p", (void *)addr);
+ /*NOTREACHED*/
+ }
+
+ smtx = SMAPMTX(smp);
+ newpage = smp->sm_flags & SM_KPM_NEWPAGE;
+ smp->sm_flags &= ~SM_KPM_NEWPAGE;
+ mutex_exit(smtx);
+
+ return (newpage);
+ }
+
+ smd_cpu[CPU->cpu_seqid].scpu.scpu_pagecreate++;
+
+ eaddr = addr + len;
+ addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
+
+ smp = GET_SMAP(seg, addr);
+
+ /*
+ * We don't grab smp mutex here since we assume the smp
+ * has a refcnt set already which prevents the slot from
+ * changing its id.
+ */
+ ASSERT(smp->sm_refcnt > 0);
+
+ vp = smp->sm_vp;
+ off = smp->sm_off + ((u_offset_t)((uintptr_t)addr & MAXBOFFSET));
+ prot = smd->smd_prot;
+
+ for (; addr < eaddr; addr += PAGESIZE, off += PAGESIZE) {
+ hat_flag = HAT_LOAD;
+ pp = page_lookup(vp, off, SE_SHARED);
+ if (pp == NULL) {
+ ushort_t bitindex;
+
+ if ((pp = page_create_va(vp, off,
+ PAGESIZE, PG_WAIT, seg, addr)) == NULL) {
+ panic("segmap_pagecreate: page_create failed");
+ /*NOTREACHED*/
+ }
+ newpage = 1;
+ page_io_unlock(pp);
+
+ /*
+ * Since pages created here do not contain valid
+ * data until the caller writes into them, the
+ * "exclusive" lock will not be dropped to prevent
+ * other users from accessing the page. We also
+ * have to lock the translation to prevent a fault
+			 * from occurring when the virtual address mapped by
+ * this page is written into. This is necessary to
+ * avoid a deadlock since we haven't dropped the
+ * "exclusive" lock.
+ */
+ bitindex = (ushort_t)((off - smp->sm_off) >> PAGESHIFT);
+
+ /*
+ * Large Files: The following assertion is to
+ * verify the cast above.
+ */
+ ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
+ smtx = SMAPMTX(smp);
+ mutex_enter(smtx);
+ smp->sm_bitmap |= SMAP_BIT_MASK(bitindex);
+ mutex_exit(smtx);
+
+ hat_flag = HAT_LOAD_LOCK;
+ } else if (softlock) {
+ hat_flag = HAT_LOAD_LOCK;
+ }
+
+ if (IS_VMODSORT(pp->p_vnode) && (prot & PROT_WRITE))
+ hat_setmod(pp);
+
+ hat_memload(kas.a_hat, addr, pp, prot, hat_flag);
+
+ if (hat_flag != HAT_LOAD_LOCK)
+ page_unlock(pp);
+
+ TRACE_5(TR_FAC_VM, TR_SEGMAP_PAGECREATE,
+ "segmap_pagecreate:seg %p addr %p pp %p vp %p offset %llx",
+ seg, addr, pp, vp, off);
+ }
+
+ return (newpage);
+}
+
+void
+segmap_pageunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw)
+{
+ struct smap *smp;
+ ushort_t bitmask;
+ page_t *pp;
+ struct vnode *vp;
+ u_offset_t off;
+ caddr_t eaddr;
+ kmutex_t *smtx;
+
+ ASSERT(seg->s_as == &kas);
+
+ eaddr = addr + len;
+ addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
+
+ if (segmap_kpm && IS_KPM_ADDR(addr)) {
+ /*
+ * Pages are successfully prefaulted and locked in
+ * segmap_getmapflt and can't be unlocked until
+ * segmap_release, so no pages or hat mappings have
+ * to be unlocked at this point.
+ */
+#ifdef DEBUG
+ if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
+ panic("segmap_pageunlock: smap not found "
+ "for addr %p", (void *)addr);
+ /*NOTREACHED*/
+ }
+
+ ASSERT(smp->sm_refcnt > 0);
+ mutex_exit(SMAPMTX(smp));
+#endif
+ return;
+ }
+
+ smp = GET_SMAP(seg, addr);
+ smtx = SMAPMTX(smp);
+
+ ASSERT(smp->sm_refcnt > 0);
+
+ vp = smp->sm_vp;
+ off = smp->sm_off + ((u_offset_t)((uintptr_t)addr & MAXBOFFSET));
+
+ for (; addr < eaddr; addr += PAGESIZE, off += PAGESIZE) {
+ bitmask = SMAP_BIT_MASK((int)(off - smp->sm_off) >> PAGESHIFT);
+
+ /*
+		 * Large Files: The following assertion is to verify
+ * the correctness of the cast to (int) above.
+ */
+ ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
+
+ /*
+ * If the bit corresponding to "off" is set,
+ * clear this bit in the bitmap, unlock translations,
+ * and release the "exclusive" lock on the page.
+ */
+ if (smp->sm_bitmap & bitmask) {
+ mutex_enter(smtx);
+ smp->sm_bitmap &= ~bitmask;
+ mutex_exit(smtx);
+
+ hat_unlock(kas.a_hat, addr, PAGESIZE);
+
+ /*
+ * Use page_find() instead of page_lookup() to
+ * find the page since we know that it has
+ * "exclusive" lock.
+ */
+ pp = page_find(vp, off);
+ if (pp == NULL) {
+ panic("segmap_pageunlock: page not found");
+ /*NOTREACHED*/
+ }
+ if (rw == S_WRITE) {
+ hat_setrefmod(pp);
+ } else if (rw != S_OTHER) {
+ hat_setref(pp);
+ }
+
+ page_unlock(pp);
+ }
+ }
+}
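A minimal consumer sketch of the segmap_pagecreate()/segmap_pageunlock()
pairing above, assuming a hypothetical helper that rewrites one full
MAXBSIZE block; the helper name and the bzero() fill are illustrative only
and are not part of this file:

	static int
	example_rewrite_block(struct vnode *vp, u_offset_t off)
	{
		caddr_t base;
		int newpage;

		/* Map the block; no read I/O is needed since it is rewritten. */
		base = segmap_getmap(segkmap, vp, off);

		/* Create the backing pages without calling VOP_GETPAGE. */
		newpage = segmap_pagecreate(segkmap, base, MAXBSIZE, 0);

		/* Fill the freshly created (or existing) pages. */
		bzero(base, MAXBSIZE);

		/* Unlock translations that were locked for new pages. */
		if (newpage)
			segmap_pageunlock(segkmap, base, MAXBSIZE, S_WRITE);

		/* Write the block back and drop the slot. */
		return (segmap_release(segkmap, base, SM_WRITE));
	}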
+
+caddr_t
+segmap_getmap(struct seg *seg, struct vnode *vp, u_offset_t off)
+{
+ return (segmap_getmapflt(seg, vp, off, MAXBSIZE, 0, S_OTHER));
+}
+
+/*
+ * This is the magic virtual address that offset 0 of an ELF
+ * file gets mapped to in user space. This is used to pick
+ * the vac color on the freelist.
+ */
+#define ELF_OFFZERO_VA (0x10000)
+/*
+ * segmap_getmapflt allocates a MAXBSIZE-sized slot to map the vnode vp
+ * in the range [off, off + len). off doesn't need to be MAXBSIZE aligned.
+ * The return address is always MAXBSIZE aligned.
+ *
+ * If forcefault is nonzero and the MMU translations haven't yet been created,
+ * segmap_getmapflt will call segmap_fault(..., F_INVAL, rw) to create them.
+ */
+caddr_t
+segmap_getmapflt(
+ struct seg *seg,
+ struct vnode *vp,
+ u_offset_t off,
+ size_t len,
+ int forcefault,
+ enum seg_rw rw)
+{
+ struct smap *smp, *nsmp;
+ extern struct vnode *common_specvp();
+ caddr_t baseaddr; /* MAXBSIZE aligned */
+ u_offset_t baseoff;
+ int newslot;
+ caddr_t vaddr;
+ int color, hashid;
+ kmutex_t *hashmtx, *smapmtx;
+ struct smfree *sm;
+ page_t *pp;
+ struct kpme *kpme;
+ uint_t prot;
+ caddr_t base;
+ page_t *pl[MAXPPB + 1];
+ int error;
+ int is_kpm = 1;
+
+ ASSERT(seg->s_as == &kas);
+ ASSERT(seg == segkmap);
+
+ baseoff = off & (offset_t)MAXBMASK;
+ if (off + len > baseoff + MAXBSIZE) {
+ panic("segmap_getmap bad len");
+ /*NOTREACHED*/
+ }
+
+ /*
+ * If this is a block device we have to be sure to use the
+ * "common" block device vnode for the mapping.
+ */
+ if (vp->v_type == VBLK)
+ vp = common_specvp(vp);
+
+ smd_cpu[CPU->cpu_seqid].scpu.scpu_getmap++;
+
+ if (segmap_kpm == 0 ||
+ (forcefault == SM_PAGECREATE && rw != S_WRITE)) {
+ is_kpm = 0;
+ }
+
+ SMAP_HASHFUNC(vp, off, hashid); /* macro assigns hashid */
+ hashmtx = SHASHMTX(hashid);
+
+retry_hash:
+ mutex_enter(hashmtx);
+ for (smp = smd_hash[hashid].sh_hash_list;
+ smp != NULL; smp = smp->sm_hash)
+ if (smp->sm_vp == vp && smp->sm_off == baseoff)
+ break;
+ mutex_exit(hashmtx);
+
+vrfy_smp:
+ if (smp != NULL) {
+
+ ASSERT(vp->v_count != 0);
+
+ /*
+ * Get smap lock and recheck its tag. The hash lock
+ * is dropped since the hash is based on (vp, off)
+ * and (vp, off) won't change when we have smap mtx.
+ */
+ smapmtx = SMAPMTX(smp);
+ mutex_enter(smapmtx);
+ if (smp->sm_vp != vp || smp->sm_off != baseoff) {
+ mutex_exit(smapmtx);
+ goto retry_hash;
+ }
+
+ if (smp->sm_refcnt == 0) {
+
+ smd_cpu[CPU->cpu_seqid].scpu.scpu_get_reclaim++;
+
+ /*
+ * Could still be on the free list. However, this
+ * could also be an smp that is transitioning from
+ * the free list when we have too much contention
+ * for the smapmtx's. In this case, we have an
+ * unlocked smp that is not on the free list any
+ * longer, but still has a 0 refcnt. The only way
+ * to be sure is to check the freelist pointers.
+ * Since we now have the smapmtx, we are guaranteed
+ * that the (vp, off) won't change, so we are safe
+ * to reclaim it. get_free_smp() knows that this
+ * can happen, and it will check the refcnt.
+ */
+
+ if ((smp->sm_next != NULL)) {
+ struct sm_freeq *freeq;
+
+ ASSERT(smp->sm_prev != NULL);
+ sm = &smd_free[smp->sm_free_ndx];
+
+ if (smp->sm_flags & SM_QNDX_ZERO)
+ freeq = &sm->sm_freeq[0];
+ else
+ freeq = &sm->sm_freeq[1];
+
+ mutex_enter(&freeq->smq_mtx);
+ if (freeq->smq_free != smp) {
+ /*
+ * fastpath normal case
+ */
+ smp->sm_prev->sm_next = smp->sm_next;
+ smp->sm_next->sm_prev = smp->sm_prev;
+ } else if (smp == smp->sm_next) {
+ /*
+ * Taking the last smap on freelist
+ */
+ freeq->smq_free = NULL;
+ } else {
+ /*
+ * Reclaiming 1st smap on list
+ */
+ freeq->smq_free = smp->sm_next;
+ smp->sm_prev->sm_next = smp->sm_next;
+ smp->sm_next->sm_prev = smp->sm_prev;
+ }
+ mutex_exit(&freeq->smq_mtx);
+ smp->sm_prev = smp->sm_next = NULL;
+ } else {
+ ASSERT(smp->sm_prev == NULL);
+ segmapcnt.smp_stolen.value.ul++;
+ }
+
+ } else {
+ segmapcnt.smp_get_use.value.ul++;
+ }
+ smp->sm_refcnt++; /* another user */
+
+ /*
+ * We don't invoke segmap_fault via TLB miss, so we set ref
+ * and mod bits in advance. For S_OTHER we set them in
+ * segmap_fault F_SOFTUNLOCK.
+ */
+ if (is_kpm) {
+ if (rw == S_WRITE) {
+ smp->sm_flags |= SM_WRITE_DATA;
+ } else if (rw == S_READ) {
+ smp->sm_flags |= SM_READ_DATA;
+ }
+ }
+ mutex_exit(smapmtx);
+
+ newslot = 0;
+ } else {
+
+ uint32_t free_ndx, *free_ndxp;
+ union segmap_cpu *scpu;
+
+ /*
+ * On a PAC machine or a machine with anti-alias
+ * hardware, smd_colormsk will be zero.
+ *
+ * On a VAC machine- pick color by offset in the file
+ * so we won't get VAC conflicts on elf files.
+ * On data files, color does not matter but we
+ * don't know what kind of file it is so we always
+ * pick color by offset. This causes color
+ * corresponding to file offset zero to be used more
+ * heavily.
+ */
+ color = (baseoff >> MAXBSHIFT) & smd_colormsk;
+ scpu = smd_cpu+CPU->cpu_seqid;
+ free_ndxp = &scpu->scpu.scpu_free_ndx[color];
+ free_ndx = (*free_ndxp += smd_ncolor) & smd_freemsk;
+#ifdef DEBUG
+ colors_used[free_ndx]++;
+#endif /* DEBUG */
+
+ /*
+ * Get a locked smp slot from the free list.
+ */
+ smp = get_free_smp(free_ndx);
+ smapmtx = SMAPMTX(smp);
+
+ ASSERT(smp->sm_vp == NULL);
+
+ if ((nsmp = segmap_hashin(smp, vp, baseoff, hashid)) != NULL) {
+ /*
+ * Failed to hashin, there exists one now.
+ * Return the smp we just allocated.
+ */
+ segmap_smapadd(smp);
+ mutex_exit(smapmtx);
+
+ smp = nsmp;
+ goto vrfy_smp;
+ }
+ smp->sm_refcnt++; /* another user */
+
+ /*
+ * We don't invoke segmap_fault via TLB miss, so we set ref
+ * and mod bits in advance. For S_OTHER we set them in
+ * segmap_fault F_SOFTUNLOCK.
+ */
+ if (is_kpm) {
+ if (rw == S_WRITE) {
+ smp->sm_flags |= SM_WRITE_DATA;
+ } else if (rw == S_READ) {
+ smp->sm_flags |= SM_READ_DATA;
+ }
+ }
+ mutex_exit(smapmtx);
+
+ newslot = 1;
+ }
+
+ if (!is_kpm)
+ goto use_segmap_range;
+
+ /*
+ * Use segkpm
+ */
+ ASSERT(PAGESIZE == MAXBSIZE);
+
+ /*
+ * remember the last smp faulted on this cpu.
+ */
+ (smd_cpu+CPU->cpu_seqid)->scpu.scpu_last_smap = smp;
+
+ if (forcefault == SM_PAGECREATE) {
+ baseaddr = segmap_pagecreate_kpm(seg, vp, baseoff, smp, rw);
+ return (baseaddr);
+ }
+
+ if (newslot == 0 &&
+ (pp = GET_KPME(smp)->kpe_page) != NULL) {
+
+ /* fastpath */
+ switch (rw) {
+ case S_READ:
+ case S_WRITE:
+ if (page_trylock(pp, SE_SHARED)) {
+ if (PP_ISFREE(pp) ||
+ !(pp->p_vnode == vp &&
+ pp->p_offset == baseoff)) {
+ page_unlock(pp);
+ pp = page_lookup(vp, baseoff,
+ SE_SHARED);
+ }
+ } else {
+ pp = page_lookup(vp, baseoff, SE_SHARED);
+ }
+
+ if (pp == NULL) {
+ ASSERT(GET_KPME(smp)->kpe_page == NULL);
+ break;
+ }
+
+ if (rw == S_WRITE &&
+ hat_page_getattr(pp, P_MOD | P_REF) !=
+ (P_MOD | P_REF)) {
+ page_unlock(pp);
+ break;
+ }
+
+ /*
+ * We have the p_selock as reader, grab_smp
+ * can't hit us, we have bumped the smap
+ * refcnt and hat_pageunload needs the
+ * p_selock exclusive.
+ */
+ kpme = GET_KPME(smp);
+ if (kpme->kpe_page == pp) {
+ baseaddr = hat_kpm_page2va(pp, 0);
+ } else if (kpme->kpe_page == NULL) {
+ baseaddr = hat_kpm_mapin(pp, kpme);
+ } else {
+ panic("segmap_getmapflt: stale "
+ "kpme page, kpme %p", (void *)kpme);
+ /*NOTREACHED*/
+ }
+
+ /*
+ * We don't invoke segmap_fault via TLB miss,
+ * so we set ref and mod bits in advance.
+			 * For S_OTHER we set them in segmap_fault
+ * F_SOFTUNLOCK.
+ */
+ if (rw == S_READ && !hat_isref(pp))
+ hat_setref(pp);
+
+ return (baseaddr);
+ default:
+ break;
+ }
+ }
+
+ base = segkpm_create_va(baseoff);
+ error = VOP_GETPAGE(vp, (offset_t)baseoff, len, &prot, pl, MAXBSIZE,
+ seg, base, rw, CRED());
+
+ pp = pl[0];
+ if (error || pp == NULL) {
+ /*
+ * Use segmap address slot and let segmap_fault deal
+ * with the error cases. There is no error return
+ * possible here.
+ */
+ goto use_segmap_range;
+ }
+
+ ASSERT(pl[1] == NULL);
+
+ /*
+ * When prot is not returned w/ PROT_ALL the returned pages
+ * are not backed by fs blocks. For most of the segmap users
+	 * this is no problem; they don't write to the pages in the
+	 * same request and therefore don't rely on a following
+	 * trap driven segmap_fault. With SM_LOCKPROTO users it
+	 * is safer to use segkmap addresses so that protection
+	 * faults can be handled by segmap_fault.
+ */
+ if (prot != PROT_ALL && forcefault == SM_LOCKPROTO) {
+ /*
+ * Use segmap address slot and let segmap_fault
+ * do the error return.
+ */
+ ASSERT(rw != S_WRITE);
+ ASSERT(PAGE_LOCKED(pp));
+ page_unlock(pp);
+ forcefault = 0;
+ goto use_segmap_range;
+ }
+
+ /*
+ * We have the p_selock as reader, grab_smp can't hit us, we
+ * have bumped the smap refcnt and hat_pageunload needs the
+ * p_selock exclusive.
+ */
+ kpme = GET_KPME(smp);
+ if (kpme->kpe_page == pp) {
+ baseaddr = hat_kpm_page2va(pp, 0);
+ } else if (kpme->kpe_page == NULL) {
+ baseaddr = hat_kpm_mapin(pp, kpme);
+ } else {
+ panic("segmap_getmapflt: stale kpme page after "
+ "VOP_GETPAGE, kpme %p", (void *)kpme);
+ /*NOTREACHED*/
+ }
+
+ smd_cpu[CPU->cpu_seqid].scpu.scpu_fault++;
+
+ return (baseaddr);
+
+
+use_segmap_range:
+ baseaddr = seg->s_base + ((smp - smd_smap) * MAXBSIZE);
+ TRACE_4(TR_FAC_VM, TR_SEGMAP_GETMAP,
+ "segmap_getmap:seg %p addr %p vp %p offset %llx",
+ seg, baseaddr, vp, baseoff);
+
+ /*
+ * Prefault the translations
+ */
+ vaddr = baseaddr + (off - baseoff);
+ if (forcefault && (newslot || !hat_probe(kas.a_hat, vaddr))) {
+
+ caddr_t pgaddr = (caddr_t)((uintptr_t)vaddr &
+ (uintptr_t)PAGEMASK);
+
+ (void) segmap_fault(kas.a_hat, seg, pgaddr,
+ (vaddr + len - pgaddr + PAGESIZE - 1) & (uintptr_t)PAGEMASK,
+ F_INVAL, rw);
+ }
+
+ return (baseaddr);
+}
+
+int
+segmap_release(struct seg *seg, caddr_t addr, uint_t flags)
+{
+ struct smap *smp;
+ int error;
+ int bflags = 0;
+ struct vnode *vp;
+ u_offset_t offset;
+ kmutex_t *smtx;
+ int is_kpm = 0;
+ page_t *pp;
+
+ if (segmap_kpm && IS_KPM_ADDR(addr)) {
+
+ if (((uintptr_t)addr & MAXBOFFSET) != 0) {
+ panic("segmap_release: addr %p not "
+ "MAXBSIZE aligned", (void *)addr);
+ /*NOTREACHED*/
+ }
+
+ if ((smp = get_smap_kpm(addr, &pp)) == NULL) {
+ panic("segmap_release: smap not found "
+ "for addr %p", (void *)addr);
+ /*NOTREACHED*/
+ }
+
+ TRACE_3(TR_FAC_VM, TR_SEGMAP_RELMAP,
+ "segmap_relmap:seg %p addr %p smp %p",
+ seg, addr, smp);
+
+ smtx = SMAPMTX(smp);
+
+ /*
+		 * For compatibility reasons segmap_pagecreate_kpm sets this
+		 * flag so that a following segmap_pagecreate can return
+		 * it as the "newpage" indication. When segmap_pagecreate is
+		 * not called at all, we clear it now.
+ */
+ smp->sm_flags &= ~SM_KPM_NEWPAGE;
+ is_kpm = 1;
+ if (smp->sm_flags & SM_WRITE_DATA) {
+ hat_setrefmod(pp);
+ } else if (smp->sm_flags & SM_READ_DATA) {
+ hat_setref(pp);
+ }
+ } else {
+ if (addr < seg->s_base || addr >= seg->s_base + seg->s_size ||
+ ((uintptr_t)addr & MAXBOFFSET) != 0) {
+ panic("segmap_release: bad addr %p", (void *)addr);
+ /*NOTREACHED*/
+ }
+ smp = GET_SMAP(seg, addr);
+
+ TRACE_3(TR_FAC_VM, TR_SEGMAP_RELMAP,
+ "segmap_relmap:seg %p addr %p smp %p",
+ seg, addr, smp);
+
+ smtx = SMAPMTX(smp);
+ mutex_enter(smtx);
+ smp->sm_flags |= SM_NOTKPM_RELEASED;
+ }
+
+ ASSERT(smp->sm_refcnt > 0);
+
+ /*
+ * Need to call VOP_PUTPAGE() if any flags (except SM_DONTNEED)
+ * are set.
+ */
+ if ((flags & ~SM_DONTNEED) != 0) {
+ if (flags & SM_WRITE)
+ segmapcnt.smp_rel_write.value.ul++;
+ if (flags & SM_ASYNC) {
+ bflags |= B_ASYNC;
+ segmapcnt.smp_rel_async.value.ul++;
+ }
+ if (flags & SM_INVAL) {
+ bflags |= B_INVAL;
+ segmapcnt.smp_rel_abort.value.ul++;
+ }
+ if (flags & SM_DESTROY) {
+ bflags |= (B_INVAL|B_TRUNC);
+ segmapcnt.smp_rel_abort.value.ul++;
+ }
+ if (smp->sm_refcnt == 1) {
+ /*
+ * We only bother doing the FREE and DONTNEED flags
+ * if no one else is still referencing this mapping.
+ */
+ if (flags & SM_FREE) {
+ bflags |= B_FREE;
+ segmapcnt.smp_rel_free.value.ul++;
+ }
+ if (flags & SM_DONTNEED) {
+ bflags |= B_DONTNEED;
+ segmapcnt.smp_rel_dontneed.value.ul++;
+ }
+ }
+ } else {
+ smd_cpu[CPU->cpu_seqid].scpu.scpu_release++;
+ }
+
+ vp = smp->sm_vp;
+ offset = smp->sm_off;
+
+ if (--smp->sm_refcnt == 0) {
+
+ if (is_kpm) {
+ smp->sm_flags &= ~(SM_WRITE_DATA | SM_READ_DATA);
+ }
+ if (flags & (SM_INVAL|SM_DESTROY)) {
+ segmap_hashout(smp); /* remove map info */
+ if (is_kpm) {
+ hat_kpm_mapout(pp, GET_KPME(smp), addr);
+ if (smp->sm_flags & SM_NOTKPM_RELEASED) {
+ smp->sm_flags &= ~SM_NOTKPM_RELEASED;
+ hat_unload(kas.a_hat, addr, MAXBSIZE,
+ HAT_UNLOAD);
+ }
+
+ } else {
+ if (segmap_kpm)
+ segkpm_mapout_validkpme(GET_KPME(smp));
+
+ smp->sm_flags &= ~SM_NOTKPM_RELEASED;
+ hat_unload(kas.a_hat, addr, MAXBSIZE,
+ HAT_UNLOAD);
+ }
+ }
+ segmap_smapadd(smp); /* add to free list */
+ }
+
+ mutex_exit(smtx);
+
+ if (is_kpm)
+ page_unlock(pp);
+ /*
+ * Now invoke VOP_PUTPAGE() if any flags (except SM_DONTNEED)
+ * are set.
+ */
+ if ((flags & ~SM_DONTNEED) != 0) {
+ error = VOP_PUTPAGE(vp, offset, MAXBSIZE,
+ bflags, CRED());
+ } else {
+ error = 0;
+ }
+
+ return (error);
+}
+
+/*
+ * Dump the pages belonging to this segmap segment.
+ */
+static void
+segmap_dump(struct seg *seg)
+{
+ struct segmap_data *smd;
+ struct smap *smp, *smp_end;
+ page_t *pp;
+ pfn_t pfn;
+ u_offset_t off;
+ caddr_t addr;
+
+ smd = (struct segmap_data *)seg->s_data;
+ addr = seg->s_base;
+ for (smp = smd->smd_sm, smp_end = smp + smd->smd_npages;
+ smp < smp_end; smp++) {
+
+ if (smp->sm_refcnt) {
+ for (off = 0; off < MAXBSIZE; off += PAGESIZE) {
+ int we_own_it = 0;
+
+ /*
+ * If pp == NULL, the page either does
+ * not exist or is exclusively locked.
+ * So determine if it exists before
+ * searching for it.
+ */
+ if ((pp = page_lookup_nowait(smp->sm_vp,
+ smp->sm_off + off, SE_SHARED)))
+ we_own_it = 1;
+ else
+ pp = page_exists(smp->sm_vp,
+ smp->sm_off + off);
+
+ if (pp) {
+ pfn = page_pptonum(pp);
+ dump_addpage(seg->s_as,
+ addr + off, pfn);
+ if (we_own_it)
+ page_unlock(pp);
+ }
+ dump_timeleft = dump_timeout;
+ }
+ }
+ addr += MAXBSIZE;
+ }
+}
+
+/*ARGSUSED*/
+static int
+segmap_pagelock(struct seg *seg, caddr_t addr, size_t len,
+ struct page ***ppp, enum lock_type type, enum seg_rw rw)
+{
+ return (ENOTSUP);
+}
+
+static int
+segmap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
+{
+ struct segmap_data *smd = (struct segmap_data *)seg->s_data;
+
+ memidp->val[0] = (uintptr_t)smd->smd_sm->sm_vp;
+ memidp->val[1] = smd->smd_sm->sm_off + (uintptr_t)(addr - seg->s_base);
+ return (0);
+}
+
+/*ARGSUSED*/
+static lgrp_mem_policy_info_t *
+segmap_getpolicy(struct seg *seg, caddr_t addr)
+{
+ return (NULL);
+}
+
+
+#ifdef SEGKPM_SUPPORT
+
+/*
+ * segkpm support routines
+ */
+
+static caddr_t
+segmap_pagecreate_kpm(struct seg *seg, vnode_t *vp, u_offset_t off,
+ struct smap *smp, enum seg_rw rw)
+{
+ caddr_t base;
+ page_t *pp;
+ int newpage = 0;
+ struct kpme *kpme;
+
+ ASSERT(smp->sm_refcnt > 0);
+
+ if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {
+ kmutex_t *smtx;
+
+ base = segkpm_create_va(off);
+
+ if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT,
+ seg, base)) == NULL) {
+ panic("segmap_pagecreate_kpm: "
+ "page_create failed");
+ /*NOTREACHED*/
+ }
+
+ newpage = 1;
+ page_io_unlock(pp);
+ ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
+
+ /*
+ * Mark this here until the following segmap_pagecreate
+ * or segmap_release.
+ */
+ smtx = SMAPMTX(smp);
+ mutex_enter(smtx);
+ smp->sm_flags |= SM_KPM_NEWPAGE;
+ mutex_exit(smtx);
+ }
+
+ kpme = GET_KPME(smp);
+ if (!newpage && kpme->kpe_page == pp)
+ base = hat_kpm_page2va(pp, 0);
+ else
+ base = hat_kpm_mapin(pp, kpme);
+
+ /*
+ * FS code may decide not to call segmap_pagecreate and we
+ * don't invoke segmap_fault via TLB miss, so we have to set
+ * ref and mod bits in advance.
+ */
+ if (rw == S_WRITE) {
+ hat_setrefmod(pp);
+ } else {
+ ASSERT(rw == S_READ);
+ hat_setref(pp);
+ }
+
+ smd_cpu[CPU->cpu_seqid].scpu.scpu_pagecreate++;
+
+ return (base);
+}
+
+/*
+ * Find the smap structure corresponding to the
+ * KPM addr and return it locked.
+ */
+struct smap *
+get_smap_kpm(caddr_t addr, page_t **ppp)
+{
+ struct smap *smp;
+ struct vnode *vp;
+ u_offset_t offset;
+ caddr_t baseaddr = (caddr_t)((uintptr_t)addr & MAXBMASK);
+ int hashid;
+ kmutex_t *hashmtx;
+ page_t *pp;
+ union segmap_cpu *scpu;
+
+ pp = hat_kpm_vaddr2page(baseaddr);
+
+ ASSERT(pp && !PP_ISFREE(pp));
+ ASSERT(PAGE_LOCKED(pp));
+ ASSERT(((uintptr_t)pp->p_offset & MAXBOFFSET) == 0);
+
+ vp = pp->p_vnode;
+ offset = pp->p_offset;
+ ASSERT(vp != NULL);
+
+ /*
+ * Assume the last smap used on this cpu is the one needed.
+ */
+ scpu = smd_cpu+CPU->cpu_seqid;
+ smp = scpu->scpu.scpu_last_smap;
+ mutex_enter(&smp->sm_mtx);
+ if (smp->sm_vp == vp && smp->sm_off == offset) {
+ ASSERT(smp->sm_refcnt > 0);
+ } else {
+ /*
+ * Assumption wrong, find the smap on the hash chain.
+ */
+ mutex_exit(&smp->sm_mtx);
+ SMAP_HASHFUNC(vp, offset, hashid); /* macro assigns hashid */
+ hashmtx = SHASHMTX(hashid);
+
+ mutex_enter(hashmtx);
+ smp = smd_hash[hashid].sh_hash_list;
+ for (; smp != NULL; smp = smp->sm_hash) {
+ if (smp->sm_vp == vp && smp->sm_off == offset)
+ break;
+ }
+ mutex_exit(hashmtx);
+ if (smp) {
+ mutex_enter(&smp->sm_mtx);
+ ASSERT(smp->sm_vp == vp && smp->sm_off == offset);
+ }
+ }
+
+ if (ppp)
+ *ppp = smp ? pp : NULL;
+
+ return (smp);
+}
+
+#else /* SEGKPM_SUPPORT */
+
+/* segkpm stubs */
+
+/*ARGSUSED*/
+static caddr_t
+segmap_pagecreate_kpm(struct seg *seg, vnode_t *vp, u_offset_t off,
+ struct smap *smp, enum seg_rw rw)
+{
+ return (NULL);
+}
+
+/*ARGSUSED*/
+struct smap *
+get_smap_kpm(caddr_t addr, page_t **ppp)
+{
+ return (NULL);
+}
+
+#endif /* SEGKPM_SUPPORT */
diff --git a/usr/src/uts/common/vm/seg_map.h b/usr/src/uts/common/vm/seg_map.h
new file mode 100644
index 0000000000..339dabe674
--- /dev/null
+++ b/usr/src/uts/common/vm/seg_map.h
@@ -0,0 +1,294 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+#ifndef _VM_SEG_MAP_H
+#define _VM_SEG_MAP_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * When segmap is created it is possible to program its behavior,
+ * using the create args [needed for performance reasons].
+ * Segmap creates n lists of pages.
+ * For VAC machines, there will be at least one free list
+ * per color. If more than one free list per color is needed,
+ * set nfreelist as needed.
+ *
+ * For PAC machines, it will be treated as VAC with only one
+ * color; every page is of the same color. Again, set nfreelist
+ * to get more than one free list.
+ */
+struct segmap_crargs {
+ uint_t prot;
+ uint_t shmsize; /* shm_alignment for VAC, 0 for PAC. */
+ uint_t nfreelist; /* number of freelist per color, >= 1 */
+};
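A minimal sketch of how these creation arguments might be filled in when
the segment is attached at startup; the values shown and the surrounding
startup context are illustrative assumptions, not the actual platform code:

	struct segmap_crargs a;

	a.prot = PROT_READ | PROT_WRITE;
	a.shmsize = 0;		/* PAC machine: no VAC alignment needed */
	a.nfreelist = 1;	/* one freelist per color */
	(void) segmap_create(segkmap, &a);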
+
+#include <vm/kpm.h>
+
+/*
+ * Each smap struct represents a MAXBSIZE sized mapping to the
+ * <sm_vp, sm_off> given in the structure. The location of
+ * the structure in the array gives the virtual address of the
+ * mapping. Structure rearranged for 64bit sm_off.
+ */
+struct smap {
+ kmutex_t sm_mtx; /* protect non-list fields */
+ struct vnode *sm_vp; /* vnode pointer (if mapped) */
+ struct smap *sm_hash; /* hash pointer */
+ struct smap *sm_next; /* next pointer */
+ struct smap *sm_prev; /* previous pointer */
+ u_offset_t sm_off; /* file offset for mapping */
+ ushort_t sm_bitmap; /* bit map for locked translations */
+ ushort_t sm_refcnt; /* reference count for uses */
+ ushort_t sm_flags; /* smap flags */
+ ushort_t sm_free_ndx; /* freelist */
+#ifdef SEGKPM_SUPPORT
+ struct kpme sm_kpme; /* segkpm */
+#endif
+};
+
+#ifdef SEGKPM_SUPPORT
+#define GET_KPME(smp) (&(smp)->sm_kpme)
+#define sm_kpme_next sm_kpme.kpe_next
+#define sm_kpme_prev sm_kpme.kpe_prev
+#define sm_kpme_page sm_kpme.kpe_page
+#else
+#define GET_KPME(smp) ((struct kpme *)NULL)
+#endif
+
+/* sm_flags */
+#define	SM_KPM_NEWPAGE	   0x00000001	/* page created in segmap_getmapflt */
+#define SM_NOTKPM_RELEASED 0x00000002 /* released smap not in segkpm mode */
+#define SM_QNDX_ZERO 0x00000004 /* on the index 0 freelist */
+#define SM_READ_DATA 0x00000010 /* page created for read */
+#define SM_WRITE_DATA 0x00000020 /* page created for write */
+
+/*
+ * Multiple smap free lists are maintained so that allocations
+ * will scale with cpu count. Each free list is made up of 2 queues
+ * so that allocations and deallocations can proceed concurrently.
+ * Each queue structure is padded to 64 bytes to avoid false sharing.
+ */
+#define SM_FREEQ_PAD (64 - sizeof (struct smap *) - sizeof (kmutex_t))
+struct sm_freeq {
+ struct smap *smq_free; /* points into freelist */
+ kmutex_t smq_mtx; /* protects smq_free */
+ char smq_pad[SM_FREEQ_PAD];
+};
+
+struct smfree {
+ struct sm_freeq sm_freeq[2]; /* alloc and release queues */
+ struct sm_freeq *sm_allocq; /* current allocq */
+ struct sm_freeq *sm_releq; /* current releq */
+ kcondvar_t sm_free_cv;
+ ushort_t sm_want; /* someone wants a slot of this color */
+};
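A minimal sketch (not the actual code path) of how the two queues trade
roles: get_free_smp() flips sm_allocq and sm_releq when the alloc queue
runs dry while freed smaps have accumulated on the release queue. Both
queue mutexes are assumed to be held by the caller, and the helper name
is illustrative only:

	static void
	example_flip_queues(struct smfree *sm)
	{
		struct sm_freeq *allocq = sm->sm_allocq;
		struct sm_freeq *releq = sm->sm_releq;

		if (allocq->smq_free == NULL && releq->smq_free != NULL) {
			sm->sm_allocq = releq;	/* freed slots now allocatable */
			sm->sm_releq = allocq;	/* empty queue takes releases */
		}
	}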
+
+/*
+ * Cached smaps are kept on hash chains to enable fast reclaim lookups.
+ */
+struct smaphash {
+ kmutex_t sh_mtx; /* protects this hash chain */
+ struct smap *sh_hash_list; /* start of hash chain */
+};
+
+/*
+ * (Semi) private data maintained by the segmap driver per SEGMENT mapping
+ * All fields in segmap_data are read-only after the segment is created.
+ *
+ */
+
+struct segmap_data {
+ struct smap *smd_sm; /* array of smap structures */
+ long smd_npages; /* size of smap array */
+ struct smfree *smd_free; /* ptr to freelist header array */
+ struct smaphash *smd_hash; /* ptr to hash header array */
+ int smd_nfree; /* number of free lists */
+ uchar_t smd_prot; /* protections for all smap's */
+};
+
+/*
+ * Statistics for segmap operations.
+ *
+ * No explicit locking to protect these stats.
+ */
+struct segmapcnt {
+ kstat_named_t smp_fault; /* number of segmap_faults */
+ kstat_named_t smp_faulta; /* number of segmap_faultas */
+ kstat_named_t smp_getmap; /* number of segmap_getmaps */
+ kstat_named_t smp_get_use; /* getmaps that reuse existing map */
+ kstat_named_t smp_get_reclaim; /* getmaps that do a reclaim */
+ kstat_named_t smp_get_reuse; /* getmaps that reuse a slot */
+ kstat_named_t smp_get_unused; /* getmaps that reuse existing map */
+ kstat_named_t smp_get_nofree; /* getmaps with no free slots */
+ kstat_named_t smp_rel_async; /* releases that are async */
+ kstat_named_t smp_rel_write; /* releases that write */
+ kstat_named_t smp_rel_free; /* releases that free */
+ kstat_named_t smp_rel_abort; /* releases that abort */
+ kstat_named_t smp_rel_dontneed; /* releases with dontneed set */
+ kstat_named_t smp_release; /* releases with no other action */
+ kstat_named_t smp_pagecreate; /* pagecreates */
+ kstat_named_t smp_free_notfree; /* pages not freed in */
+ /* segmap_pagefree */
+	kstat_named_t	smp_free_dirty;	/* dirty pages freed */
+					/* in segmap_pagefree */
+	kstat_named_t	smp_free;	/* clean pages freed in */
+ /* segmap_pagefree */
+ kstat_named_t smp_stolen; /* segmap_getmapflt() stole */
+ /* from get_free_smp() */
+ kstat_named_t smp_get_nomtx; /* free smaps but no mutex */
+};
+
+/*
+ * These are flags used on release. Some of these might get handled
+ * by segment operations needed for msync (when we figure them out).
+ * SM_ASYNC modifies SM_WRITE. SM_DONTNEED modifies SM_FREE. SM_FREE
+ * and SM_INVAL as well as SM_FREE and SM_DESTROY are mutually exclusive.
+ * SM_DESTROY behaves like SM_INVAL but also forces the pages to be
+ * destroyed -- this prevents them from being written to the backing
+ * store.
+ */
+#define SM_WRITE 0x01 /* write back the pages upon release */
+#define SM_ASYNC 0x02 /* do the write asynchronously */
+#define SM_FREE 0x04 /* put pages back on free list */
+#define SM_INVAL 0x08 /* invalidate page (no caching) */
+#define SM_DONTNEED 0x10 /* less likely to be needed soon */
+#define SM_DESTROY 0x20 /* invalidate page, don't write back */
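A brief illustrative combination of these flags, consistent with the
modifier rules above (SM_ASYNC modifies SM_WRITE, SM_DONTNEED modifies
SM_FREE); the helper is hypothetical:

	static int
	example_release(caddr_t base, int dirty)
	{
		if (dirty)	/* queue an asynchronous write-back */
			return (segmap_release(segkmap, base,
			    SM_WRITE | SM_ASYNC));
		/* clean and not needed again: just free the pages */
		return (segmap_release(segkmap, base, SM_FREE | SM_DONTNEED));
	}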
+
+/*
+ * These are the forcefault flags used on getmapflt.
+ *
+ * The original semantics were extended to allow using the segkpm mapping
+ * scheme w/o a major segmap interface change for MAXBSIZE == PAGESIZE
+ * (which is required to enable segkpm for MAXBSIZE > PAGESIZE).
+ * Most segmap consumers need not be changed at all, or only need to
+ * be changed slightly, to take advantage of segkpm. Because the segkpm
+ * virtual address is based on the physical address of a page, a page is
+ * required to determine the virtual address (return value). Pages mapped
+ * with segkpm are always at least read locked and are hence protected
+ * from pageout or fsflush from segmap_getmap until segmap_release. This
+ * implies that the segkpm mappings are locked within this period too.
+ * No trap driven segmap_fault's are possible in segkpm mode.
+ *
+ * The following combinations of "forcefault" and "rw" allow segkpm mode.
+ * (1) SM_FAULT, S_READ
+ * (2) SM_FAULT, S_WRITE
+ * (3) SM_PAGECREATE, S_WRITE
+ * (4) SM_LOCKPROTO, {S_READ, S_WRITE, S_OTHER}
+ *
+ * The regular additional operations (come in pairs in most of the cases):
+ * . segmap_pagecreate/segmap_pageunlock
+ * . segmap_fault(F_SOFTLOCK)/segmap_fault(F_SOFTUNLOCK)
+ *
+ * are mostly a no-op in segkpm mode with the following exceptions:
+ * . The "newpage" return value of segmap_pagecreate is still supported
+ * for zeroout operations needed on newly created pages.
+ *
+ * . segmap_fault() must follow when an error could be expected in
+ * the VOP_GETPAGE. In segkpm mode this error is recognized in
+ * segmap_getmapflt and returned from the following segmap_fault()
+ * call. The "hole" optimization (read only after first VOP_GETPAGE
+ * mapping in segmap_getmapflt followed by a trap driven protection
+ * fault and a second VOP_GETPAGE via segmap_fault) cannot be used.
+ *
+ * . segmap_fault(F_SOFTUNLOCK) must follow when segmap_getmapflt was
+ * called w/ (SM_LOCKPROTO, S_OTHER). S_WRITE has to be applied when
+ * the page should be marked "dirty". Otherwise the page is not
+ * written to the backing store later (as mentioned above, no page
+ * or protection faults are possible in segkpm mode). Caller cannot
+ * use only S_OTHER and rely on a protection fault to force the page
+ * to become dirty.
+ *
+ * . The segmap_pagecreate parameter softlock is ignored; pages and
+ * mappings are locked anyway.
+ *
+ * SM_LOCKPROTO is used in the fbio layer and some special segmap consumers.
+ */
+#define SM_PAGECREATE 0x00 /* create page in segkpm mode, no I/O */
+#define SM_FAULT 0x01 /* fault in page if necessary */
+#define SM_LOCKPROTO 0x02 /* lock/unlock protocol used */
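A minimal sketch of combination (1) above, (SM_FAULT, S_READ), as a
hypothetical file system read routine might use it; uiomove() is the
standard kernel uio copy routine, and the helper name, error handling,
and single-block assumption are illustrative only:

	static int
	example_read_block(struct vnode *vp, u_offset_t off, size_t n,
	    struct uio *uio)
	{
		caddr_t base;
		int error;

		/* n must not cross a MAXBSIZE boundary from off. */
		base = segmap_getmapflt(segkmap, vp, off, n, SM_FAULT, S_READ);
		error = uiomove(base + (off & MAXBOFFSET), n, UIO_READ, uio);
		return (segmap_release(segkmap, base, error ? 0 : SM_FREE));
	}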
+
+#define MAXBSHIFT 13 /* log2(MAXBSIZE) */
+
+#define MAXBOFFSET (MAXBSIZE - 1)
+#define MAXBMASK (~MAXBOFFSET)
+
+/*
+ * SMAP_HASHAVELEN is the average length desired for this chain, from
+ * which the size of the smd_hash table is derived at segment create time.
+ * SMAP_HASHVPSHIFT is defined so that 1 << SMAP_HASHVPSHIFT is the
+ * approximate size of a vnode struct.
+ */
+#define SMAP_HASHAVELEN 4
+#define SMAP_HASHVPSHIFT 6
+
+
+#ifdef _KERNEL
+/*
+ * The kernel generic mapping segment.
+ */
+extern struct seg *segkmap;
+
+/*
+ * Public seg_map segment operations.
+ */
+extern int segmap_create(struct seg *, void *);
+extern int segmap_pagecreate(struct seg *, caddr_t, size_t, int);
+extern void segmap_pageunlock(struct seg *, caddr_t, size_t, enum seg_rw);
+extern faultcode_t segmap_fault(struct hat *, struct seg *, caddr_t, size_t,
+ enum fault_type, enum seg_rw);
+extern caddr_t segmap_getmap(struct seg *, struct vnode *, u_offset_t);
+extern caddr_t segmap_getmapflt(struct seg *, struct vnode *, u_offset_t,
+ size_t, int, enum seg_rw);
+extern int segmap_release(struct seg *, caddr_t, uint_t);
+extern void segmap_flush(struct seg *, struct vnode *);
+extern void segmap_inval(struct seg *, struct vnode *, u_offset_t);
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _VM_SEG_MAP_H */
diff --git a/usr/src/uts/common/vm/seg_spt.c b/usr/src/uts/common/vm/seg_spt.c
new file mode 100644
index 0000000000..a97719ad5f
--- /dev/null
+++ b/usr/src/uts/common/vm/seg_spt.c
@@ -0,0 +1,2701 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/user.h>
+#include <sys/mman.h>
+#include <sys/kmem.h>
+#include <sys/sysmacros.h>
+#include <sys/cmn_err.h>
+#include <sys/systm.h>
+#include <sys/tuneable.h>
+#include <vm/hat.h>
+#include <vm/seg.h>
+#include <vm/as.h>
+#include <vm/anon.h>
+#include <vm/page.h>
+#include <sys/buf.h>
+#include <sys/swap.h>
+#include <sys/atomic.h>
+#include <vm/seg_spt.h>
+#include <sys/debug.h>
+#include <sys/vtrace.h>
+#include <sys/shm.h>
+#include <sys/lgrp.h>
+#include <sys/vmsystm.h>
+
+#include <sys/tnf_probe.h>
+
+#define SEGSPTADDR (caddr_t)0x0
+
+/*
+ * # pages used for spt
+ */
+static size_t spt_used;
+
+/*
+ * segspt_minfree is the memory left for the system after ISM
+ * has locked its pages; it is set to 5% of availrmem in
+ * sptcreate when ISM is created. ISM should not use more
+ * than ~90% of availrmem; if it does, the performance of
+ * the system may decrease. Machines with large memories may
+ * be able to use more memory for ISM, so we set the default
+ * segspt_minfree to 5% (which gives ISM a maximum of 95% of
+ * availrmem). If somebody wants even more memory for ISM
+ * (risking hanging the system) they can patch segspt_minfree
+ * to a smaller number.
+ */
+pgcnt_t segspt_minfree = 0;
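For example, if availrmem is 1,000,000 pages when the first ISM segment is
created, sptcreate() below sets segspt_minfree to availrmem / 20 = 50,000
pages, i.e. roughly 400 MB assuming 8 KB pages, which is then kept out of
ISM's reach.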
+
+static int segspt_create(struct seg *seg, caddr_t argsp);
+static int segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize);
+static void segspt_free(struct seg *seg);
+static void segspt_free_pages(struct seg *seg, caddr_t addr, size_t len);
+static lgrp_mem_policy_info_t *segspt_getpolicy(struct seg *seg, caddr_t addr);
+
+static void
+segspt_badop()
+{
+ panic("segspt_badop called");
+ /*NOTREACHED*/
+}
+
+#define SEGSPT_BADOP(t) (t(*)())segspt_badop
+
+struct seg_ops segspt_ops = {
+ SEGSPT_BADOP(int), /* dup */
+ segspt_unmap,
+ segspt_free,
+ SEGSPT_BADOP(int), /* fault */
+ SEGSPT_BADOP(faultcode_t), /* faulta */
+ SEGSPT_BADOP(int), /* setprot */
+ SEGSPT_BADOP(int), /* checkprot */
+ SEGSPT_BADOP(int), /* kluster */
+ SEGSPT_BADOP(size_t), /* swapout */
+ SEGSPT_BADOP(int), /* sync */
+ SEGSPT_BADOP(size_t), /* incore */
+ SEGSPT_BADOP(int), /* lockop */
+ SEGSPT_BADOP(int), /* getprot */
+ SEGSPT_BADOP(u_offset_t), /* getoffset */
+ SEGSPT_BADOP(int), /* gettype */
+ SEGSPT_BADOP(int), /* getvp */
+ SEGSPT_BADOP(int), /* advise */
+ SEGSPT_BADOP(void), /* dump */
+ SEGSPT_BADOP(int), /* pagelock */
+ SEGSPT_BADOP(int), /* setpgsz */
+ SEGSPT_BADOP(int), /* getmemid */
+ segspt_getpolicy, /* getpolicy */
+};
+
+static int segspt_shmdup(struct seg *seg, struct seg *newseg);
+static int segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize);
+static void segspt_shmfree(struct seg *seg);
+static faultcode_t segspt_shmfault(struct hat *hat, struct seg *seg,
+ caddr_t addr, size_t len, enum fault_type type, enum seg_rw rw);
+static faultcode_t segspt_shmfaulta(struct seg *seg, caddr_t addr);
+static int segspt_shmsetprot(register struct seg *seg, register caddr_t addr,
+ register size_t len, register uint_t prot);
+static int segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size,
+ uint_t prot);
+static int segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta);
+static size_t segspt_shmswapout(struct seg *seg);
+static size_t segspt_shmincore(struct seg *seg, caddr_t addr, size_t len,
+ register char *vec);
+static int segspt_shmsync(struct seg *seg, register caddr_t addr, size_t len,
+ int attr, uint_t flags);
+static int segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len,
+ int attr, int op, ulong_t *lockmap, size_t pos);
+static int segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len,
+ uint_t *protv);
+static u_offset_t segspt_shmgetoffset(struct seg *seg, caddr_t addr);
+static int segspt_shmgettype(struct seg *seg, caddr_t addr);
+static int segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp);
+static int segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len,
+ uint_t behav);
+static void segspt_shmdump(struct seg *seg);
+static int segspt_shmpagelock(struct seg *, caddr_t, size_t,
+ struct page ***, enum lock_type, enum seg_rw);
+static int segspt_shmsetpgsz(struct seg *, caddr_t, size_t, uint_t);
+static int segspt_shmgetmemid(struct seg *, caddr_t, memid_t *);
+static lgrp_mem_policy_info_t *segspt_shmgetpolicy(struct seg *, caddr_t);
+
+struct seg_ops segspt_shmops = {
+ segspt_shmdup,
+ segspt_shmunmap,
+ segspt_shmfree,
+ segspt_shmfault,
+ segspt_shmfaulta,
+ segspt_shmsetprot,
+ segspt_shmcheckprot,
+ segspt_shmkluster,
+ segspt_shmswapout,
+ segspt_shmsync,
+ segspt_shmincore,
+ segspt_shmlockop,
+ segspt_shmgetprot,
+ segspt_shmgetoffset,
+ segspt_shmgettype,
+ segspt_shmgetvp,
+ segspt_shmadvise, /* advise */
+ segspt_shmdump,
+ segspt_shmpagelock,
+ segspt_shmsetpgsz,
+ segspt_shmgetmemid,
+ segspt_shmgetpolicy,
+};
+
+static void segspt_purge(struct seg *seg);
+static int segspt_reclaim(struct seg *, caddr_t, size_t, struct page **,
+ enum seg_rw);
+static int spt_anon_getpages(struct seg *seg, caddr_t addr, size_t len,
+ page_t **ppa);
+
+
+
+/*ARGSUSED*/
+int
+sptcreate(size_t size, struct seg **sptseg, struct anon_map *amp,
+ uint_t prot, uint_t flags, uint_t share_szc)
+{
+ int err;
+ struct as *newas;
+ struct segspt_crargs sptcargs;
+
+#ifdef DEBUG
+ TNF_PROBE_1(sptcreate, "spt", /* CSTYLED */,
+ tnf_ulong, size, size );
+#endif
+	if (segspt_minfree == 0)	/* leave min 5% of availrmem */
+		segspt_minfree = availrmem/20;	/* for the system */
+
+ if (!hat_supported(HAT_SHARED_PT, (void *)0))
+ return (EINVAL);
+
+ /*
+ * get a new as for this shared memory segment
+ */
+ newas = as_alloc();
+ sptcargs.amp = amp;
+ sptcargs.prot = prot;
+ sptcargs.flags = flags;
+ sptcargs.szc = share_szc;
+
+ /*
+ * create a shared page table (spt) segment
+ */
+
+ if (err = as_map(newas, SEGSPTADDR, size, segspt_create, &sptcargs)) {
+ as_free(newas);
+ return (err);
+ }
+ *sptseg = sptcargs.seg_spt;
+ return (0);
+}
+
+void
+sptdestroy(struct as *as, struct anon_map *amp)
+{
+
+#ifdef DEBUG
+ TNF_PROBE_0(sptdestroy, "spt", /* CSTYLED */);
+#endif
+ (void) as_unmap(as, SEGSPTADDR, amp->size);
+ as_free(as);
+}
+
+/*
+ * called from seg_free().
+ * free (i.e., unlock, unmap, return to free list)
+ * all the pages in the given seg.
+ */
+void
+segspt_free(struct seg *seg)
+{
+ struct spt_data *sptd = (struct spt_data *)seg->s_data;
+
+ ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ if (sptd != NULL) {
+ if (sptd->spt_realsize)
+ segspt_free_pages(seg, seg->s_base, sptd->spt_realsize);
+
+ if (sptd->spt_ppa_lckcnt)
+ kmem_free(sptd->spt_ppa_lckcnt,
+ sizeof (*sptd->spt_ppa_lckcnt)
+ * btopr(sptd->spt_amp->size));
+ kmem_free(sptd->spt_vp, sizeof (*sptd->spt_vp));
+ mutex_destroy(&sptd->spt_lock);
+ kmem_free(sptd, sizeof (*sptd));
+ }
+}
+
+/*ARGSUSED*/
+static int
+segspt_shmsync(struct seg *seg, caddr_t addr, size_t len, int attr,
+ uint_t flags)
+{
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ return (0);
+}
+
+/*ARGSUSED*/
+static size_t
+segspt_shmincore(struct seg *seg, caddr_t addr, size_t len, char *vec)
+{
+ caddr_t eo_seg;
+ pgcnt_t npages;
+ struct shm_data *shmd = (struct shm_data *)seg->s_data;
+ struct seg *sptseg;
+ struct spt_data *sptd;
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+#ifdef lint
+ seg = seg;
+#endif
+ sptseg = shmd->shm_sptseg;
+ sptd = sptseg->s_data;
+
+ if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
+ eo_seg = addr + len;
+ while (addr < eo_seg) {
+ /* page exists, and it's locked. */
+ *vec++ = SEG_PAGE_INCORE | SEG_PAGE_LOCKED |
+ SEG_PAGE_ANON;
+ addr += PAGESIZE;
+ }
+ return (len);
+ } else {
+ struct anon_map *amp = shmd->shm_amp;
+ struct anon *ap;
+ page_t *pp;
+ pgcnt_t anon_index;
+ struct vnode *vp;
+ u_offset_t off;
+ ulong_t i;
+ int ret;
+ anon_sync_obj_t cookie;
+
+ addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
+ anon_index = seg_page(seg, addr);
+ npages = btopr(len);
+ if (anon_index + npages > btopr(shmd->shm_amp->size)) {
+ return (EINVAL);
+ }
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
+ for (i = 0; i < npages; i++, anon_index++) {
+ ret = 0;
+ anon_array_enter(amp, anon_index, &cookie);
+ ap = anon_get_ptr(amp->ahp, anon_index);
+ if (ap != NULL) {
+ swap_xlate(ap, &vp, &off);
+ anon_array_exit(&cookie);
+ pp = page_lookup_nowait(vp, off, SE_SHARED);
+ if (pp != NULL) {
+ ret |= SEG_PAGE_INCORE | SEG_PAGE_ANON;
+ page_unlock(pp);
+ }
+ } else {
+ anon_array_exit(&cookie);
+ }
+ if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) {
+ ret |= SEG_PAGE_LOCKED;
+ }
+ *vec++ = (char)ret;
+ }
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ return (len);
+ }
+}
+
+static int
+segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize)
+{
+ size_t share_size;
+
+ ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ /*
+ * seg.s_size may have been rounded up to the largest page size
+ * in shmat().
+	 * XXX This should be cleaned up. sptdestroy should take a length
+ * argument which should be the same as sptcreate. Then
+ * this rounding would not be needed (or is done in shm.c)
+ * Only the check for full segment will be needed.
+ *
+ * XXX -- shouldn't raddr == 0 always? These tests don't seem
+ * to be useful at all.
+ */
+ share_size = page_get_pagesize(seg->s_szc);
+ ssize = P2ROUNDUP(ssize, share_size);
+
+ if (raddr == seg->s_base && ssize == seg->s_size) {
+ seg_free(seg);
+ return (0);
+ } else
+ return (EINVAL);
+}
+
+int
+segspt_create(struct seg *seg, caddr_t argsp)
+{
+ int err;
+ caddr_t addr = seg->s_base;
+ struct spt_data *sptd;
+ struct segspt_crargs *sptcargs = (struct segspt_crargs *)argsp;
+ struct anon_map *amp = sptcargs->amp;
+ struct cred *cred = CRED();
+ ulong_t i, j, anon_index = 0;
+ pgcnt_t npages = btopr(amp->size);
+ struct vnode *vp;
+ page_t **ppa;
+ uint_t hat_flags;
+
+ /*
+ * We are holding the a_lock on the underlying dummy as,
+ * so we can make calls to the HAT layer.
+ */
+ ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
+
+#ifdef DEBUG
+ TNF_PROBE_2(segspt_create, "spt", /* CSTYLED */,
+ tnf_opaque, addr, addr,
+ tnf_ulong, len, seg->s_size);
+#endif
+ if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
+ if (err = anon_swap_adjust(npages))
+ return (err);
+ }
+ err = ENOMEM;
+
+ if ((sptd = kmem_zalloc(sizeof (*sptd), KM_NOSLEEP)) == NULL)
+ goto out1;
+
+ if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
+ if ((ppa = kmem_zalloc(((sizeof (page_t *)) * npages),
+ KM_NOSLEEP)) == NULL)
+ goto out2;
+ }
+
+ mutex_init(&sptd->spt_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ if ((vp = kmem_zalloc(sizeof (*vp), KM_NOSLEEP)) == NULL)
+ goto out3;
+
+ seg->s_ops = &segspt_ops;
+ sptd->spt_vp = vp;
+ sptd->spt_amp = amp;
+ sptd->spt_prot = sptcargs->prot;
+ sptd->spt_flags = sptcargs->flags;
+ seg->s_data = (caddr_t)sptd;
+ sptd->spt_ppa = NULL;
+ sptd->spt_ppa_lckcnt = NULL;
+ seg->s_szc = sptcargs->szc;
+
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
+ amp->a_szc = seg->s_szc;
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+
+ /*
+ * Set policy to affect initial allocation of pages in
+ * anon_map_createpages()
+ */
+ (void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, amp, anon_index,
+ NULL, 0, ptob(npages));
+
+ if (sptcargs->flags & SHM_PAGEABLE) {
+ size_t share_sz;
+ pgcnt_t new_npgs, more_pgs;
+ struct anon_hdr *nahp;
+
+ share_sz = page_get_pagesize(seg->s_szc);
+ if (!IS_P2ALIGNED(amp->size, share_sz)) {
+ /*
+			 * We round the size of the anon array up to a 4 M
+			 * boundary because we always create 4 M of page(s)
+			 * when locking and faulting pages, and that way we
+			 * don't have to check all the corner cases, e.g.
+			 * whether there is enough space to allocate a 4 M
+			 * page.
+ */
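+			/*
+			 * Illustrative arithmetic (hypothetical sizes,
+			 * assuming a 4 M share_sz): for an amp->size of
+			 * 10 M, P2ROUNDUP yields 12 M, so new_npgs covers
+			 * 12 M and the extra more_pgs pages of swap are
+			 * reserved before the anon array is grown below.
+			 */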
+ new_npgs = btop(P2ROUNDUP(amp->size, share_sz));
+ more_pgs = new_npgs - npages;
+
+ if (anon_resv(ptob(more_pgs)) == 0) {
+ err = ENOMEM;
+ goto out4;
+ }
+ nahp = anon_create(new_npgs, ANON_SLEEP);
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
+ (void) anon_copy_ptr(amp->ahp, 0, nahp, 0, npages,
+ ANON_SLEEP);
+ anon_release(amp->ahp, npages);
+ amp->ahp = nahp;
+ amp->swresv = amp->size = ptob(new_npgs);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ npages = new_npgs;
+ }
+
+ sptd->spt_ppa_lckcnt = kmem_zalloc(npages *
+ sizeof (*sptd->spt_ppa_lckcnt), KM_SLEEP);
+ sptd->spt_pcachecnt = 0;
+ sptd->spt_realsize = ptob(npages);
+ sptcargs->seg_spt = seg;
+ return (0);
+ }
+
+ /*
+ * get array of pages for each anon slot in amp
+ */
+ if ((err = anon_map_createpages(amp, anon_index, ptob(npages), ppa,
+ seg, addr, S_CREATE, cred)) != 0)
+ goto out4;
+
+ /*
+	 * addr is the initial address corresponding to the first page in the ppa list
+ */
+ for (i = 0; i < npages; i++) {
+ /* attempt to lock all pages */
+ if (!page_pp_lock(ppa[i], 0, 1)) {
+ /*
+ * if unable to lock any page, unlock all
+ * of them and return error
+ */
+ for (j = 0; j < i; j++)
+ page_pp_unlock(ppa[j], 0, 1);
+ for (i = 0; i < npages; i++) {
+ page_unlock(ppa[i]);
+ }
+ err = ENOMEM;
+ goto out4;
+ }
+ }
+
+ /*
+	 * Some platforms assume that ISM mappings are HAT_LOAD_LOCK
+	 * for the entire life of the segment, for example platforms
+	 * that do not support Dynamic Reconfiguration.
+ */
+ hat_flags = HAT_LOAD_SHARE;
+ if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, NULL))
+ hat_flags |= HAT_LOAD_LOCK;
+
+ hat_memload_array(seg->s_as->a_hat, addr, ptob(npages),
+ ppa, sptd->spt_prot, hat_flags);
+
+ /*
+ * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
+ * we will leave the pages locked SE_SHARED for the life
+ * of the ISM segment. This will prevent any calls to
+ * hat_pageunload() on this ISM segment for those platforms.
+ */
+ if (!(hat_flags & HAT_LOAD_LOCK)) {
+ /*
+ * On platforms that support HAT_DYNAMIC_ISM_UNMAP,
+ * we no longer need to hold the SE_SHARED lock on the pages,
+ * since L_PAGELOCK and F_SOFTLOCK calls will grab the
+ * SE_SHARED lock on the pages as necessary.
+ */
+ for (i = 0; i < npages; i++)
+ page_unlock(ppa[i]);
+ }
+ sptd->spt_pcachecnt = 0;
+ kmem_free(ppa, ((sizeof (page_t *)) * npages));
+ sptd->spt_realsize = ptob(npages);
+ atomic_add_long(&spt_used, npages);
+ sptcargs->seg_spt = seg;
+ return (0);
+
+out4:
+ seg->s_data = NULL;
+ kmem_free(vp, sizeof (*vp));
+out3:
+ mutex_destroy(&sptd->spt_lock);
+ if ((sptcargs->flags & SHM_PAGEABLE) == 0)
+ kmem_free(ppa, (sizeof (*ppa) * npages));
+out2:
+ kmem_free(sptd, sizeof (*sptd));
+out1:
+ if ((sptcargs->flags & SHM_PAGEABLE) == 0)
+ anon_swap_restore(npages);
+ return (err);
+}
+
+/*ARGSUSED*/
+void
+segspt_free_pages(struct seg *seg, caddr_t addr, size_t len)
+{
+ struct page *pp;
+ struct spt_data *sptd = (struct spt_data *)seg->s_data;
+ pgcnt_t npages;
+ ulong_t anon_idx;
+ struct anon_map *amp;
+ struct anon *ap;
+ struct vnode *vp;
+ u_offset_t off;
+ uint_t hat_flags;
+ int root = 0;
+ pgcnt_t pgs, curnpgs = 0;
+ page_t *rootpp;
+
+ ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ len = P2ROUNDUP(len, PAGESIZE);
+
+ npages = btop(len);
+
+ hat_flags = HAT_UNLOAD_UNLOCK;
+ if ((hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) ||
+ (sptd->spt_flags & SHM_PAGEABLE)) {
+ hat_flags = HAT_UNLOAD;
+ }
+
+ hat_unload(seg->s_as->a_hat, addr, len, hat_flags);
+
+ amp = sptd->spt_amp;
+ if (sptd->spt_flags & SHM_PAGEABLE)
+ npages = btop(amp->size);
+
+ ASSERT(amp);
+ for (anon_idx = 0; anon_idx < npages; anon_idx++) {
+ if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
+ if ((ap = anon_get_ptr(amp->ahp, anon_idx)) == NULL) {
+ panic("segspt_free_pages: null app");
+ /*NOTREACHED*/
+ }
+ } else {
+ if ((ap = anon_get_next_ptr(amp->ahp, &anon_idx))
+ == NULL)
+ continue;
+ }
+ ASSERT(ANON_ISBUSY(anon_get_slot(amp->ahp, anon_idx)) == 0);
+ swap_xlate(ap, &vp, &off);
+
+ /*
+ * If this platform supports HAT_DYNAMIC_ISM_UNMAP,
+		 * the pages will not be holding the SE_SHARED lock at
+		 * this point.
+		 *
+		 * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
+		 * the pages are still held SE_SHARED locked from the
+		 * original segspt_create().
+ *
+ * Our goal is to get SE_EXCL lock on each page, remove
+ * permanent lock on it and invalidate the page.
+ */
+ if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
+ if (hat_flags == HAT_UNLOAD)
+ pp = page_lookup(vp, off, SE_EXCL);
+ else {
+ if ((pp = page_find(vp, off)) == NULL) {
+ panic("segspt_free_pages: "
+ "page not locked");
+ /*NOTREACHED*/
+ }
+ if (!page_tryupgrade(pp)) {
+ page_unlock(pp);
+ pp = page_lookup(vp, off, SE_EXCL);
+ }
+ }
+ if (pp == NULL) {
+ panic("segspt_free_pages: "
+ "page not in the system");
+ /*NOTREACHED*/
+ }
+ page_pp_unlock(pp, 0, 1);
+ } else {
+ if ((pp = page_lookup(vp, off, SE_EXCL)) == NULL)
+ continue;
+ page_pp_unlock(pp, 0, 0);
+ }
+ /*
+ * It's logical to invalidate the pages here as in most cases
+ * these were created by segspt.
+ */
+ if (pp->p_szc != 0) {
+ /*
+ * For DISM swap is released in shm_rm_amp.
+ */
+ if ((sptd->spt_flags & SHM_PAGEABLE) == 0 &&
+ ap->an_pvp != NULL) {
+ panic("segspt_free_pages: pvp non NULL");
+ /*NOTREACHED*/
+ }
+ if (root == 0) {
+ ASSERT(curnpgs == 0);
+ root = 1;
+ rootpp = pp;
+ pgs = curnpgs = page_get_pagecnt(pp->p_szc);
+ ASSERT(pgs > 1);
+ ASSERT(IS_P2ALIGNED(pgs, pgs));
+ ASSERT(!(page_pptonum(pp) & (pgs - 1)));
+ curnpgs--;
+ } else if ((page_pptonum(pp) & (pgs - 1)) == pgs - 1) {
+ ASSERT(curnpgs == 1);
+ ASSERT(page_pptonum(pp) ==
+ page_pptonum(rootpp) + (pgs - 1));
+ page_destroy_pages(rootpp);
+ root = 0;
+ curnpgs = 0;
+ } else {
+ ASSERT(curnpgs > 1);
+ ASSERT(page_pptonum(pp) ==
+ page_pptonum(rootpp) + (pgs - curnpgs));
+ curnpgs--;
+ }
+ } else {
+ if (root != 0 || curnpgs != 0) {
+ panic("segspt_free_pages: bad large page");
+ /*NOTREACHED*/
+ }
+ /*LINTED: constant in conditional context */
+ VN_DISPOSE(pp, B_INVAL, 0, kcred);
+ }
+ }
+
+ if (root != 0 || curnpgs != 0) {
+ panic("segspt_free_pages: bad large page");
+ /*NOTREACHED*/
+ }
+
+ /*
+ * mark that pages have been released
+ */
+ sptd->spt_realsize = 0;
+
+ if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
+ atomic_add_long(&spt_used, -npages);
+ anon_swap_restore(npages);
+ }
+}
+
+/*
+ * Get memory allocation policy info for specified address in given segment
+ */
+static lgrp_mem_policy_info_t *
+segspt_getpolicy(struct seg *seg, caddr_t addr)
+{
+ struct anon_map *amp;
+ ulong_t anon_index;
+ lgrp_mem_policy_info_t *policy_info;
+ struct spt_data *spt_data;
+
+ ASSERT(seg != NULL);
+
+ /*
+ * Get anon_map from segspt
+ *
+ * Assume that no lock needs to be held on anon_map, since
+ * it should be protected by its reference count which must be
+ * nonzero for an existing segment
+ * Need to grab readers lock on policy tree though
+ */
+ spt_data = (struct spt_data *)seg->s_data;
+ if (spt_data == NULL)
+ return (NULL);
+ amp = spt_data->spt_amp;
+ ASSERT(amp->refcnt != 0);
+
+ /*
+ * Get policy info
+ *
+ * Assume starting anon index of 0
+ */
+ anon_index = seg_page(seg, addr);
+ policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0);
+
+ return (policy_info);
+}
+
+/*
+ * DISM only.
+ * Return locked pages over a given range.
+ *
+ * We will cache all DISM locked pages and save the pplist for the
+ * entire segment in the ppa field of the underlying DISM segment structure.
+ * Later, during a call to segspt_reclaim() we will use this ppa array
+ * to page_unlock() all of the pages and then we will free this ppa list.
+ */
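+/*
+ * Overview of the cases handled below: L_PAGEUNLOCK drops a reference
+ * on a previously returned list, L_PAGERECLAIM forces an immediate
+ * segspt_reclaim() of the cached list, and the remaining L_PAGELOCK
+ * path either finds the cached ppa[] in the seg pcache or builds and
+ * inserts a new one.
+ */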
+/*ARGSUSED*/
+static int
+segspt_dismpagelock(struct seg *seg, caddr_t addr, size_t len,
+ struct page ***ppp, enum lock_type type, enum seg_rw rw)
+{
+ struct shm_data *shmd = (struct shm_data *)seg->s_data;
+ struct seg *sptseg = shmd->shm_sptseg;
+ struct spt_data *sptd = sptseg->s_data;
+ pgcnt_t pg_idx, npages, tot_npages, npgs;
+ struct page **pplist, **pl, **ppa, *pp;
+ struct anon_map *amp;
+ spgcnt_t an_idx;
+ int ret = ENOTSUP;
+ uint_t pl_built = 0;
+ struct anon *ap;
+ struct vnode *vp;
+ u_offset_t off;
+ pgcnt_t claim_availrmem = 0;
+ uint_t szc;
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ /*
+ * We want to lock/unlock the entire ISM segment. Therefore,
+	 * we will be using the underlying sptseg and its base address
+ * and length for the caching arguments.
+ */
+ ASSERT(sptseg);
+ ASSERT(sptd);
+
+ pg_idx = seg_page(seg, addr);
+ npages = btopr(len);
+
+ /*
+ * check if the request is larger than number of pages covered
+ * by amp
+ */
+ if (pg_idx + npages > btopr(sptd->spt_amp->size)) {
+ *ppp = NULL;
+ return (ENOTSUP);
+ }
+
+ if (type == L_PAGEUNLOCK) {
+ ASSERT(sptd->spt_ppa != NULL);
+
+ seg_pinactive(seg, seg->s_base, sptd->spt_amp->size,
+ sptd->spt_ppa, sptd->spt_prot, segspt_reclaim);
+
+ /*
+ * If someone is blocked while unmapping, we purge
+ * segment page cache and thus reclaim pplist synchronously
+ * without waiting for seg_pasync_thread. This speeds up
+ * unmapping in cases where munmap(2) is called, while
+ * raw async i/o is still in progress or where a thread
+ * exits on data fault in a multithreaded application.
+ */
+ if (AS_ISUNMAPWAIT(seg->s_as) && (shmd->shm_softlockcnt > 0)) {
+ segspt_purge(seg);
+ }
+ return (0);
+ } else if (type == L_PAGERECLAIM) {
+ ASSERT(sptd->spt_ppa != NULL);
+ (void) segspt_reclaim(seg, seg->s_base, sptd->spt_amp->size,
+ sptd->spt_ppa, sptd->spt_prot);
+ return (0);
+ }
+
+ if (sptd->spt_flags & DISM_PPA_CHANGED) {
+ segspt_purge(seg);
+ /*
+		 * For DISM the ppa array needs to be rebuilt since the
+		 * number of locked pages could have changed.
+ */
+ *ppp = NULL;
+ return (ENOTSUP);
+ }
+
+ /*
+ * First try to find pages in segment page cache, without
+ * holding the segment lock.
+ */
+ pplist = seg_plookup(seg, seg->s_base, sptd->spt_amp->size,
+ sptd->spt_prot);
+ if (pplist != NULL) {
+ ASSERT(sptd->spt_ppa != NULL);
+ ASSERT(sptd->spt_ppa == pplist);
+ ppa = sptd->spt_ppa;
+ for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
+ if (ppa[an_idx] == NULL) {
+ seg_pinactive(seg, seg->s_base,
+ sptd->spt_amp->size, ppa,
+ sptd->spt_prot, segspt_reclaim);
+ *ppp = NULL;
+ return (ENOTSUP);
+ }
+ if ((szc = ppa[an_idx]->p_szc) != 0) {
+ npgs = page_get_pagecnt(szc);
+ an_idx = P2ROUNDUP(an_idx + 1, npgs);
+ } else {
+ an_idx++;
+ }
+ }
+ /*
+ * Since we cache the entire DISM segment, we want to
+ * set ppp to point to the first slot that corresponds
+ * to the requested addr, i.e. pg_idx.
+ */
+ *ppp = &(sptd->spt_ppa[pg_idx]);
+ return (0);
+ }
+
+ /* The L_PAGELOCK case... */
+ mutex_enter(&sptd->spt_lock);
+ /*
+ * try to find pages in segment page cache with mutex
+ */
+ pplist = seg_plookup(seg, seg->s_base, sptd->spt_amp->size,
+ sptd->spt_prot);
+ if (pplist != NULL) {
+ ASSERT(sptd->spt_ppa != NULL);
+ ASSERT(sptd->spt_ppa == pplist);
+ ppa = sptd->spt_ppa;
+ for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
+ if (ppa[an_idx] == NULL) {
+ mutex_exit(&sptd->spt_lock);
+ seg_pinactive(seg, seg->s_base,
+ sptd->spt_amp->size, ppa,
+ sptd->spt_prot, segspt_reclaim);
+ *ppp = NULL;
+ return (ENOTSUP);
+ }
+ if ((szc = ppa[an_idx]->p_szc) != 0) {
+ npgs = page_get_pagecnt(szc);
+ an_idx = P2ROUNDUP(an_idx + 1, npgs);
+ } else {
+ an_idx++;
+ }
+ }
+ /*
+ * Since we cache the entire DISM segment, we want to
+ * set ppp to point to the first slot that corresponds
+ * to the requested addr, i.e. pg_idx.
+ */
+ mutex_exit(&sptd->spt_lock);
+ *ppp = &(sptd->spt_ppa[pg_idx]);
+ return (0);
+ }
+ if (seg_pinsert_check(seg, sptd->spt_amp->size, SEGP_FORCE_WIRED) ==
+ SEGP_FAIL) {
+ mutex_exit(&sptd->spt_lock);
+ *ppp = NULL;
+ return (ENOTSUP);
+ }
+
+ /*
+ * No need to worry about protections because DISM pages are always rw.
+ */
+ pl = pplist = NULL;
+ amp = sptd->spt_amp;
+
+ /*
+ * Do we need to build the ppa array?
+ */
+ if (sptd->spt_ppa == NULL) {
+ pgcnt_t lpg_cnt = 0;
+
+ pl_built = 1;
+ tot_npages = btopr(sptd->spt_amp->size);
+
+ ASSERT(sptd->spt_pcachecnt == 0);
+ pplist = kmem_zalloc(sizeof (page_t *) * tot_npages, KM_SLEEP);
+ pl = pplist;
+
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
+ for (an_idx = 0; an_idx < tot_npages; ) {
+ ap = anon_get_ptr(amp->ahp, an_idx);
+ /*
+			 * Cache only mlocked pages. For large pages, if one
+			 * (constituent) page is mlocked, all pages of that
+			 * large page are cached as well. This allows quick
+			 * lookups in the ppa array.
+ */
+ if ((ap != NULL) && (lpg_cnt != 0 ||
+ (sptd->spt_ppa_lckcnt[an_idx] != 0))) {
+
+ swap_xlate(ap, &vp, &off);
+ pp = page_lookup(vp, off, SE_SHARED);
+ ASSERT(pp != NULL);
+ if (lpg_cnt == 0) {
+ npgs = page_get_pagecnt(pp->p_szc);
+ if (!IS_P2ALIGNED(an_idx, npgs)) {
+ an_idx = P2ALIGN(an_idx, npgs);
+ page_unlock(pp);
+ continue;
+ }
+ }
+ if (++lpg_cnt == npgs)
+ lpg_cnt = 0;
+
+ /*
+ * availrmem is decremented only
+ * for unlocked pages
+ */
+ if (sptd->spt_ppa_lckcnt[an_idx] == 0)
+ claim_availrmem++;
+ pplist[an_idx] = pp;
+ }
+ an_idx++;
+ }
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+
+ mutex_enter(&freemem_lock);
+ if (availrmem < tune.t_minarmem + claim_availrmem) {
+ mutex_exit(&freemem_lock);
+ ret = FC_MAKE_ERR(ENOMEM);
+ claim_availrmem = 0;
+ goto insert_fail;
+ } else {
+ availrmem -= claim_availrmem;
+ }
+ mutex_exit(&freemem_lock);
+
+ sptd->spt_ppa = pl;
+ } else {
+ /*
+ * We already have a valid ppa[].
+ */
+ pl = sptd->spt_ppa;
+ }
+
+ ASSERT(pl != NULL);
+
+ ret = seg_pinsert(seg, seg->s_base, sptd->spt_amp->size,
+ pl, sptd->spt_prot, SEGP_FORCE_WIRED | SEGP_ASYNC_FLUSH,
+ segspt_reclaim);
+ if (ret == SEGP_FAIL) {
+ /*
+ * seg_pinsert failed. We return
+ * ENOTSUP, so that the as_pagelock() code will
+ * then try the slower F_SOFTLOCK path.
+ */
+ sptd->spt_ppa = NULL;
+ ret = ENOTSUP;
+ goto insert_fail;
+ }
+
+ /*
+ * In either case, we increment softlockcnt on the 'real' segment.
+ */
+ sptd->spt_pcachecnt++;
+ atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), 1);
+
+ ppa = sptd->spt_ppa;
+ for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
+ if (ppa[an_idx] == NULL) {
+ mutex_exit(&sptd->spt_lock);
+ seg_pinactive(seg, seg->s_base, sptd->spt_amp->size,
+ pl, sptd->spt_prot, segspt_reclaim);
+ *ppp = NULL;
+ return (ENOTSUP);
+ }
+ if ((szc = ppa[an_idx]->p_szc) != 0) {
+ npgs = page_get_pagecnt(szc);
+ an_idx = P2ROUNDUP(an_idx + 1, npgs);
+ } else {
+ an_idx++;
+ }
+ }
+ /*
+ * We can now drop the sptd->spt_lock since the ppa[]
+	 * exists and we have incremented pcachecnt.
+ */
+ mutex_exit(&sptd->spt_lock);
+
+ /*
+ * Since we cache the entire segment, we want to
+ * set ppp to point to the first slot that corresponds
+ * to the requested addr, i.e. pg_idx.
+ */
+ *ppp = &(sptd->spt_ppa[pg_idx]);
+ return (ret);
+
+insert_fail:
+ /*
+	 * We only reach this code if we tried and failed, so we can
+	 * drop the lock on the dummy seg now that we have failed to
+	 * set up a new ppa[].
+ */
+ mutex_exit(&sptd->spt_lock);
+
+ if (pl_built) {
+ mutex_enter(&freemem_lock);
+ availrmem += claim_availrmem;
+ mutex_exit(&freemem_lock);
+
+ /*
+ * We created pl and we need to destroy it.
+ */
+ pplist = pl;
+ for (an_idx = 0; an_idx < tot_npages; an_idx++) {
+ if (pplist[an_idx] != NULL)
+ page_unlock(pplist[an_idx]);
+ }
+ kmem_free(pl, sizeof (page_t *) * tot_npages);
+ }
+
+ if (shmd->shm_softlockcnt <= 0) {
+ if (AS_ISUNMAPWAIT(seg->s_as)) {
+ mutex_enter(&seg->s_as->a_contents);
+ if (AS_ISUNMAPWAIT(seg->s_as)) {
+ AS_CLRUNMAPWAIT(seg->s_as);
+ cv_broadcast(&seg->s_as->a_cv);
+ }
+ mutex_exit(&seg->s_as->a_contents);
+ }
+ }
+ *ppp = NULL;
+ return (ret);
+}
+
+
+
+/*
+ * return locked pages over a given range.
+ *
+ * We will cache the entire ISM segment and save the pplist for the
+ * entire segment in the ppa field of the underlying ISM segment structure.
+ * Later, during a call to segspt_reclaim() we will use this ppa array
+ * to page_unlock() all of the pages and then we will free this ppa list.
+ */
+/*ARGSUSED*/
+static int
+segspt_shmpagelock(struct seg *seg, caddr_t addr, size_t len,
+ struct page ***ppp, enum lock_type type, enum seg_rw rw)
+{
+ struct shm_data *shmd = (struct shm_data *)seg->s_data;
+ struct seg *sptseg = shmd->shm_sptseg;
+ struct spt_data *sptd = sptseg->s_data;
+ pgcnt_t np, page_index, npages;
+ caddr_t a, spt_base;
+ struct page **pplist, **pl, *pp;
+ struct anon_map *amp;
+ ulong_t anon_index;
+ int ret = ENOTSUP;
+ uint_t pl_built = 0;
+ struct anon *ap;
+ struct vnode *vp;
+ u_offset_t off;
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ /*
+ * We want to lock/unlock the entire ISM segment. Therefore,
+	 * we will be using the underlying sptseg and its base address
+ * and length for the caching arguments.
+ */
+ ASSERT(sptseg);
+ ASSERT(sptd);
+
+ if (sptd->spt_flags & SHM_PAGEABLE) {
+ return (segspt_dismpagelock(seg, addr, len, ppp, type, rw));
+ }
+
+ page_index = seg_page(seg, addr);
+ npages = btopr(len);
+
+ /*
+ * check if the request is larger than number of pages covered
+ * by amp
+ */
+ if (page_index + npages > btopr(sptd->spt_amp->size)) {
+ *ppp = NULL;
+ return (ENOTSUP);
+ }
+
+ if (type == L_PAGEUNLOCK) {
+
+ ASSERT(sptd->spt_ppa != NULL);
+
+ seg_pinactive(seg, seg->s_base, sptd->spt_amp->size,
+ sptd->spt_ppa, sptd->spt_prot, segspt_reclaim);
+
+ /*
+ * If someone is blocked while unmapping, we purge
+ * segment page cache and thus reclaim pplist synchronously
+ * without waiting for seg_pasync_thread. This speeds up
+ * unmapping in cases where munmap(2) is called, while
+ * raw async i/o is still in progress or where a thread
+ * exits on data fault in a multithreaded application.
+ */
+ if (AS_ISUNMAPWAIT(seg->s_as) && (shmd->shm_softlockcnt > 0)) {
+ segspt_purge(seg);
+ }
+ return (0);
+ } else if (type == L_PAGERECLAIM) {
+ ASSERT(sptd->spt_ppa != NULL);
+
+ (void) segspt_reclaim(seg, seg->s_base, sptd->spt_amp->size,
+ sptd->spt_ppa, sptd->spt_prot);
+ return (0);
+ }
+
+ /*
+ * First try to find pages in segment page cache, without
+ * holding the segment lock.
+ */
+ pplist = seg_plookup(seg, seg->s_base, sptd->spt_amp->size,
+ sptd->spt_prot);
+ if (pplist != NULL) {
+ ASSERT(sptd->spt_ppa == pplist);
+ ASSERT(sptd->spt_ppa[page_index]);
+ /*
+ * Since we cache the entire ISM segment, we want to
+ * set ppp to point to the first slot that corresponds
+ * to the requested addr, i.e. page_index.
+ */
+ *ppp = &(sptd->spt_ppa[page_index]);
+ return (0);
+ }
+
+ /* The L_PAGELOCK case... */
+ mutex_enter(&sptd->spt_lock);
+
+ /*
+ * try to find pages in segment page cache
+ */
+ pplist = seg_plookup(seg, seg->s_base, sptd->spt_amp->size,
+ sptd->spt_prot);
+ if (pplist != NULL) {
+ ASSERT(sptd->spt_ppa == pplist);
+ /*
+ * Since we cache the entire segment, we want to
+ * set ppp to point to the first slot that corresponds
+ * to the requested addr, i.e. page_index.
+ */
+ mutex_exit(&sptd->spt_lock);
+ *ppp = &(sptd->spt_ppa[page_index]);
+ return (0);
+ }
+
+ if (seg_pinsert_check(seg, sptd->spt_amp->size, SEGP_FORCE_WIRED) ==
+ SEGP_FAIL) {
+ mutex_exit(&sptd->spt_lock);
+ *ppp = NULL;
+ return (ENOTSUP);
+ }
+
+ /*
+ * No need to worry about protections because ISM pages
+ * are always rw.
+ */
+ pl = pplist = NULL;
+
+ /*
+ * Do we need to build the ppa array?
+ */
+ if (sptd->spt_ppa == NULL) {
+ ASSERT(sptd->spt_ppa == pplist);
+
+ spt_base = sptseg->s_base;
+ pl_built = 1;
+
+ /*
+ * availrmem is decremented once during anon_swap_adjust()
+ * and is incremented during the anon_unresv(), which is
+ * called from shm_rm_amp() when the segment is destroyed.
+ */
+ amp = sptd->spt_amp;
+ ASSERT(amp != NULL);
+
+ /* pcachecnt is protected by sptd->spt_lock */
+ ASSERT(sptd->spt_pcachecnt == 0);
+ pplist = kmem_zalloc(sizeof (page_t *)
+ * btopr(sptd->spt_amp->size), KM_SLEEP);
+ pl = pplist;
+
+ anon_index = seg_page(sptseg, spt_base);
+
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
+ for (a = spt_base; a < (spt_base + sptd->spt_amp->size);
+ a += PAGESIZE, anon_index++, pplist++) {
+ ap = anon_get_ptr(amp->ahp, anon_index);
+ ASSERT(ap != NULL);
+ swap_xlate(ap, &vp, &off);
+ pp = page_lookup(vp, off, SE_SHARED);
+ ASSERT(pp != NULL);
+ *pplist = pp;
+ }
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+
+ if (a < (spt_base + sptd->spt_amp->size)) {
+ ret = ENOTSUP;
+ goto insert_fail;
+ }
+ sptd->spt_ppa = pl;
+ } else {
+ /*
+ * We already have a valid ppa[].
+ */
+ pl = sptd->spt_ppa;
+ }
+
+ ASSERT(pl != NULL);
+
+ ret = seg_pinsert(seg, seg->s_base, sptd->spt_amp->size,
+ pl, sptd->spt_prot, SEGP_FORCE_WIRED, segspt_reclaim);
+ if (ret == SEGP_FAIL) {
+ /*
+ * seg_pinsert failed. We return
+ * ENOTSUP, so that the as_pagelock() code will
+ * then try the slower F_SOFTLOCK path.
+ */
+ if (pl_built) {
+ /*
+ * No one else has referenced the ppa[].
+ * We created it and we need to destroy it.
+ */
+ sptd->spt_ppa = NULL;
+ }
+ ret = ENOTSUP;
+ goto insert_fail;
+ }
+
+ /*
+ * In either case, we increment softlockcnt on the 'real' segment.
+ */
+ sptd->spt_pcachecnt++;
+ atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), 1);
+
+ /*
+ * We can now drop the sptd->spt_lock since the ppa[]
+	 * exists and we have incremented pcachecnt.
+ */
+ mutex_exit(&sptd->spt_lock);
+
+ /*
+ * Since we cache the entire segment, we want to
+ * set ppp to point to the first slot that corresponds
+ * to the requested addr, i.e. page_index.
+ */
+ *ppp = &(sptd->spt_ppa[page_index]);
+ return (ret);
+
+insert_fail:
+ /*
+	 * We only reach this code if we tried and failed, so we can
+	 * drop the lock on the dummy seg now that we have failed to
+	 * set up a new ppa[].
+ */
+ mutex_exit(&sptd->spt_lock);
+
+ if (pl_built) {
+ /*
+ * We created pl and we need to destroy it.
+ */
+ pplist = pl;
+ np = (((uintptr_t)(a - spt_base)) >> PAGESHIFT);
+ while (np) {
+ page_unlock(*pplist);
+ np--;
+ pplist++;
+ }
+ kmem_free(pl, sizeof (page_t *) *
+ btopr(sptd->spt_amp->size));
+ }
+ if (shmd->shm_softlockcnt <= 0) {
+ if (AS_ISUNMAPWAIT(seg->s_as)) {
+ mutex_enter(&seg->s_as->a_contents);
+ if (AS_ISUNMAPWAIT(seg->s_as)) {
+ AS_CLRUNMAPWAIT(seg->s_as);
+ cv_broadcast(&seg->s_as->a_cv);
+ }
+ mutex_exit(&seg->s_as->a_contents);
+ }
+ }
+ *ppp = NULL;
+ return (ret);
+}
+
+/*
+ * purge any cached pages in the I/O page cache
+ */
+static void
+segspt_purge(struct seg *seg)
+{
+ seg_ppurge(seg);
+}
+
+static int
+segspt_reclaim(struct seg *seg, caddr_t addr, size_t len, struct page **pplist,
+ enum seg_rw rw)
+{
+ struct shm_data *shmd = (struct shm_data *)seg->s_data;
+ struct seg *sptseg;
+ struct spt_data *sptd;
+ pgcnt_t npages, i, free_availrmem = 0;
+ int done = 0;
+
+#ifdef lint
+ addr = addr;
+#endif
+ sptseg = shmd->shm_sptseg;
+ sptd = sptseg->s_data;
+ npages = (len >> PAGESHIFT);
+ ASSERT(npages);
+ ASSERT(sptd->spt_pcachecnt != 0);
+ ASSERT(sptd->spt_ppa == pplist);
+ ASSERT(npages == btopr(sptd->spt_amp->size));
+
+ /*
+ * Acquire the lock on the dummy seg and destroy the
+ * ppa array IF this is the last pcachecnt.
+ */
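+	/*
+	 * Note that spt_pcachecnt is incremented once for each
+	 * successful seg_pinsert() in the pagelock routines above, so
+	 * only the reclaim that drops it back to zero unlocks the
+	 * pages and frees the ppa[] list.
+	 */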
+ mutex_enter(&sptd->spt_lock);
+ if (--sptd->spt_pcachecnt == 0) {
+ for (i = 0; i < npages; i++) {
+ if (pplist[i] == NULL) {
+ continue;
+ }
+ if (rw == S_WRITE) {
+ hat_setrefmod(pplist[i]);
+ } else {
+ hat_setref(pplist[i]);
+ }
+ if ((sptd->spt_flags & SHM_PAGEABLE) &&
+ (sptd->spt_ppa_lckcnt[i] == 0))
+ free_availrmem++;
+ page_unlock(pplist[i]);
+ }
+ if (sptd->spt_flags & SHM_PAGEABLE) {
+ mutex_enter(&freemem_lock);
+ availrmem += free_availrmem;
+ mutex_exit(&freemem_lock);
+ }
+ /*
+		 * Since we want to cache/uncache the entire ISM segment,
+		 * we track the pplist in a segspt-specific field, ppa,
+		 * which is initialized at the time we add an entry to
+		 * the cache.
+ */
+ ASSERT(sptd->spt_pcachecnt == 0);
+ kmem_free(pplist, sizeof (page_t *) * npages);
+ sptd->spt_ppa = NULL;
+ sptd->spt_flags &= ~DISM_PPA_CHANGED;
+ done = 1;
+ }
+ mutex_exit(&sptd->spt_lock);
+ /*
+ * Now decrement softlockcnt.
+ */
+ atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), -1);
+
+ if (shmd->shm_softlockcnt <= 0) {
+ if (AS_ISUNMAPWAIT(seg->s_as)) {
+ mutex_enter(&seg->s_as->a_contents);
+ if (AS_ISUNMAPWAIT(seg->s_as)) {
+ AS_CLRUNMAPWAIT(seg->s_as);
+ cv_broadcast(&seg->s_as->a_cv);
+ }
+ mutex_exit(&seg->s_as->a_contents);
+ }
+ }
+ return (done);
+}
+
+/*
+ * Do a F_SOFTUNLOCK call over the range requested.
+ * The range must have already been F_SOFTLOCK'ed.
+ *
+ * The calls to acquire and release the anon map lock mutex were
+ * removed in order to avoid a deadly embrace during a DR
+ * memory delete operation. (E.g. DR blocks while waiting for an
+ * exclusive lock on a page that is being used for kaio; the
+ * thread that will complete the kaio and call segspt_softunlock
+ * blocks on the anon map lock; another thread holding the anon
+ * map lock blocks on another page lock via the segspt_shmfault
+ * -> page_lookup -> page_lookup_create -> page_lock_es code flow.)
+ *
+ * The appropriateness of the removal is based upon the following:
+ * 1. If we are holding a segment's reader lock and the page is held
+ * shared, then the corresponding element in anonmap which points to
+ * anon struct cannot change and there is no need to acquire the
+ * anonymous map lock.
+ * 2. Threads in segspt_softunlock have a reader lock on the segment
+ * and already have the shared page lock, so we are guaranteed that
+ * the anon map slot cannot change and therefore can call anon_get_ptr()
+ * without grabbing the anonymous map lock.
+ * 3. Threads that softlock a shared page break copy-on-write, even if
+ * it's a read. Thus cow faults can be ignored with respect to soft
+ * unlocking, since the breaking of cow means that the anon slot(s) will
+ * not be shared.
+ */
+static void
+segspt_softunlock(struct seg *seg, caddr_t sptseg_addr,
+ size_t len, enum seg_rw rw)
+{
+ struct shm_data *shmd = (struct shm_data *)seg->s_data;
+ struct seg *sptseg;
+ struct spt_data *sptd;
+ page_t *pp;
+ caddr_t adr;
+ struct vnode *vp;
+ u_offset_t offset;
+ ulong_t anon_index;
+ struct anon_map *amp; /* XXX - for locknest */
+ struct anon *ap = NULL;
+ pgcnt_t npages;
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ sptseg = shmd->shm_sptseg;
+ sptd = sptseg->s_data;
+
+ /*
+ * Some platforms assume that ISM mappings are HAT_LOAD_LOCK
+ * and therefore their pages are SE_SHARED locked
+ * for the entire life of the segment.
+ */
+ if ((!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) &&
+ ((sptd->spt_flags & SHM_PAGEABLE) == 0)) {
+ goto softlock_decrement;
+ }
+
+ /*
+ * Any thread is free to do a page_find and
+ * page_unlock() on the pages within this seg.
+ *
+ * We are already holding the as->a_lock on the user's
+ * real segment, but we need to hold the a_lock on the
+ * underlying dummy as. This is mostly to satisfy the
+ * underlying HAT layer.
+ */
+ AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER);
+ hat_unlock(sptseg->s_as->a_hat, sptseg_addr, len);
+ AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock);
+
+ amp = sptd->spt_amp;
+ ASSERT(amp != NULL);
+ anon_index = seg_page(sptseg, sptseg_addr);
+
+ for (adr = sptseg_addr; adr < sptseg_addr + len; adr += PAGESIZE) {
+ ap = anon_get_ptr(amp->ahp, anon_index++);
+ ASSERT(ap != NULL);
+ swap_xlate(ap, &vp, &offset);
+
+ /*
+ * Use page_find() instead of page_lookup() to
+ * find the page since we know that it has a
+ * "shared" lock.
+ */
+ pp = page_find(vp, offset);
+ ASSERT(ap == anon_get_ptr(amp->ahp, anon_index - 1));
+ if (pp == NULL) {
+ panic("segspt_softunlock: "
+ "addr %p, ap %p, vp %p, off %llx",
+ (void *)adr, (void *)ap, (void *)vp, offset);
+ /*NOTREACHED*/
+ }
+
+ if (rw == S_WRITE) {
+ hat_setrefmod(pp);
+ } else if (rw != S_OTHER) {
+ hat_setref(pp);
+ }
+ page_unlock(pp);
+ }
+
+softlock_decrement:
+ npages = btopr(len);
+ atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), -npages);
+ if (shmd->shm_softlockcnt == 0) {
+ /*
+ * All SOFTLOCKS are gone. Wakeup any waiting
+ * unmappers so they can try again to unmap.
+ * Check for waiters first without the mutex
+ * held so we don't always grab the mutex on
+ * softunlocks.
+ */
+ if (AS_ISUNMAPWAIT(seg->s_as)) {
+ mutex_enter(&seg->s_as->a_contents);
+ if (AS_ISUNMAPWAIT(seg->s_as)) {
+ AS_CLRUNMAPWAIT(seg->s_as);
+ cv_broadcast(&seg->s_as->a_cv);
+ }
+ mutex_exit(&seg->s_as->a_contents);
+ }
+ }
+}
+
+int
+segspt_shmattach(struct seg *seg, caddr_t *argsp)
+{
+ struct shm_data *shmd_arg = (struct shm_data *)argsp;
+ struct shm_data *shmd;
+ struct anon_map *shm_amp = shmd_arg->shm_amp;
+ struct spt_data *sptd;
+ int error = 0;
+
+ ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ shmd = kmem_zalloc((sizeof (*shmd)), KM_NOSLEEP);
+ if (shmd == NULL)
+ return (ENOMEM);
+
+ shmd->shm_sptas = shmd_arg->shm_sptas;
+ shmd->shm_amp = shm_amp;
+ shmd->shm_sptseg = shmd_arg->shm_sptseg;
+
+ (void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, shm_amp, 0,
+ NULL, 0, seg->s_size);
+
+ seg->s_data = (void *)shmd;
+ seg->s_ops = &segspt_shmops;
+ seg->s_szc = shmd->shm_sptseg->s_szc;
+ sptd = shmd->shm_sptseg->s_data;
+
+ if (sptd->spt_flags & SHM_PAGEABLE) {
+ if ((shmd->shm_vpage = kmem_zalloc(btopr(shm_amp->size),
+ KM_NOSLEEP)) == NULL) {
+ seg->s_data = (void *)NULL;
+ kmem_free(shmd, (sizeof (*shmd)));
+ return (ENOMEM);
+ }
+ shmd->shm_lckpgs = 0;
+ if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
+ if ((error = hat_share(seg->s_as->a_hat, seg->s_base,
+ shmd_arg->shm_sptas->a_hat, SEGSPTADDR,
+ seg->s_size, seg->s_szc)) != 0) {
+ kmem_free(shmd->shm_vpage,
+ btopr(shm_amp->size));
+ }
+ }
+ } else {
+ error = hat_share(seg->s_as->a_hat, seg->s_base,
+ shmd_arg->shm_sptas->a_hat, SEGSPTADDR,
+ seg->s_size, seg->s_szc);
+ }
+ if (error) {
+ seg->s_szc = 0;
+ seg->s_data = (void *)NULL;
+ kmem_free(shmd, (sizeof (*shmd)));
+ } else {
+ ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER);
+ shm_amp->refcnt++;
+ ANON_LOCK_EXIT(&shm_amp->a_rwlock);
+ }
+ return (error);
+}
+
+int
+segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize)
+{
+ struct shm_data *shmd = (struct shm_data *)seg->s_data;
+ int reclaim = 1;
+
+ ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
+retry:
+ if (shmd->shm_softlockcnt > 0) {
+ if (reclaim == 1) {
+ segspt_purge(seg);
+ reclaim = 0;
+ goto retry;
+ }
+ return (EAGAIN);
+ }
+
+ if (ssize != seg->s_size) {
+#ifdef DEBUG
+ cmn_err(CE_WARN, "Incompatible ssize %lx s_size %lx\n",
+ ssize, seg->s_size);
+#endif
+ return (EINVAL);
+ }
+
+ (void) segspt_shmlockop(seg, raddr, shmd->shm_amp->size, 0, MC_UNLOCK,
+ NULL, 0);
+ hat_unshare(seg->s_as->a_hat, raddr, ssize, seg->s_szc);
+
+ seg_free(seg);
+
+ return (0);
+}
+
+void
+segspt_shmfree(struct seg *seg)
+{
+ struct shm_data *shmd = (struct shm_data *)seg->s_data;
+ struct anon_map *shm_amp = shmd->shm_amp;
+
+ ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ (void) segspt_shmlockop(seg, seg->s_base, shm_amp->size, 0,
+ MC_UNLOCK, NULL, 0);
+
+ /*
+ * Need to increment refcnt when attaching
+ * and decrement when detaching because of dup().
+ */
+ ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER);
+ shm_amp->refcnt--;
+ ANON_LOCK_EXIT(&shm_amp->a_rwlock);
+
+ if (shmd->shm_vpage) { /* only for DISM */
+ kmem_free(shmd->shm_vpage, btopr(shm_amp->size));
+ shmd->shm_vpage = NULL;
+ }
+ kmem_free(shmd, sizeof (*shmd));
+}
+
+/*ARGSUSED*/
+int
+segspt_shmsetprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
+{
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ /*
+	 * A shared page table is more than a shared mapping. An
+	 * individual process sharing the page tables can't change
+	 * prot because there is only one set of page tables.
+	 * This will be allowed once private page tables are
+	 * supported.
+ */
+/* need to return correct status error? */
+ return (0);
+}
+
+
+faultcode_t
+segspt_dismfault(struct hat *hat, struct seg *seg, caddr_t addr,
+ size_t len, enum fault_type type, enum seg_rw rw)
+{
+ struct shm_data *shmd = (struct shm_data *)seg->s_data;
+ struct seg *sptseg = shmd->shm_sptseg;
+ struct as *curspt = shmd->shm_sptas;
+ struct spt_data *sptd = sptseg->s_data;
+ pgcnt_t npages;
+ size_t share_sz, size;
+ caddr_t segspt_addr, shm_addr;
+ page_t **ppa;
+ int i;
+ ulong_t an_idx = 0;
+ int err = 0;
+
+#ifdef lint
+ hat = hat;
+#endif
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ /*
+ * Because of the way spt is implemented
+ * the realsize of the segment does not have to be
+ * equal to the segment size itself. The segment size is
+ * often in multiples of a page size larger than PAGESIZE.
+ * The realsize is rounded up to the nearest PAGESIZE
+ * based on what the user requested. This is a bit of
+	 * ugliness that is historical but not easily fixed
+ * without re-designing the higher levels of ISM.
+ */
+ ASSERT(addr >= seg->s_base);
+ if (((addr + len) - seg->s_base) > sptd->spt_realsize)
+ return (FC_NOMAP);
+ /*
+ * For all of the following cases except F_PROT, we need to
+ * make any necessary adjustments to addr and len
+ * and get all of the necessary page_t's into an array called ppa[].
+ *
+ * The code in shmat() forces base addr and len of ISM segment
+ * to be aligned to largest page size supported. Therefore,
+ * we are able to handle F_SOFTLOCK and F_INVAL calls in "large
+ * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK
+ * in large pagesize chunks, or else we will screw up the HAT
+ * layer by calling hat_memload_array() with differing page sizes
+ * over a given virtual range.
+ */
+ share_sz = page_get_pagesize(sptseg->s_szc);
+ shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_sz);
+ size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), share_sz);
+ npages = btopr(size);
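+	/*
+	 * Worked example of the alignment above (hypothetical numbers):
+	 * with a 4 M share_sz, a fault at seg->s_base + 5 M for a 2 M
+	 * length gives shm_addr = seg->s_base + 4 M and size =
+	 * P2ROUNDUP(3 M, 4 M) = 4 M, i.e. one full large-page chunk;
+	 * shmat() has already aligned seg->s_base itself to share_sz.
+	 */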
+
+ /*
+ * Now we need to convert from addr in segshm to addr in segspt.
+ */
+ an_idx = seg_page(seg, shm_addr);
+ segspt_addr = sptseg->s_base + ptob(an_idx);
+
+ ASSERT((segspt_addr + ptob(npages)) <=
+ (sptseg->s_base + sptd->spt_realsize));
+ ASSERT(segspt_addr < (sptseg->s_base + sptseg->s_size));
+
+ switch (type) {
+
+ case F_SOFTLOCK:
+
+ mutex_enter(&freemem_lock);
+ if (availrmem < tune.t_minarmem + npages) {
+ mutex_exit(&freemem_lock);
+ return (FC_MAKE_ERR(ENOMEM));
+ } else {
+ availrmem -= npages;
+ }
+ mutex_exit(&freemem_lock);
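+		/*
+		 * Note that the F_SOFTLOCK path claims npages from
+		 * availrmem up front (refusing the fault if that would
+		 * drop availrmem below tune.t_minarmem); the claim is
+		 * returned on error in the F_INVAL code below and in
+		 * the F_SOFTUNLOCK case.
+		 */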
+ atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages);
+ /*
+ * Fall through to the F_INVAL case to load up the hat layer
+ * entries with the HAT_LOAD_LOCK flag.
+ */
+ /* FALLTHRU */
+ case F_INVAL:
+
+ if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC))
+ return (FC_NOMAP);
+
+ ppa = kmem_zalloc(npages * sizeof (page_t *), KM_SLEEP);
+
+ err = spt_anon_getpages(sptseg, segspt_addr, size, ppa);
+ if (err != 0) {
+ if (type == F_SOFTLOCK) {
+ mutex_enter(&freemem_lock);
+ availrmem += npages;
+ mutex_exit(&freemem_lock);
+ atomic_add_long((ulong_t *)(
+ &(shmd->shm_softlockcnt)), -npages);
+ }
+ goto dism_err;
+ }
+ AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER);
+ if (type == F_SOFTLOCK) {
+
+ /*
+ * Load up the translation keeping it
+ * locked and don't unlock the page.
+ */
+ hat_memload_array(sptseg->s_as->a_hat, segspt_addr,
+ size, ppa, sptd->spt_prot,
+ HAT_LOAD_LOCK | HAT_LOAD_SHARE);
+ } else {
+ if (hat == seg->s_as->a_hat) {
+
+ /*
+ * Migrate pages marked for migration
+ */
+ if (lgrp_optimizations())
+ page_migrate(seg, shm_addr, ppa,
+ npages);
+
+ /* CPU HAT */
+ hat_memload_array(sptseg->s_as->a_hat,
+ segspt_addr, size, ppa, sptd->spt_prot,
+ HAT_LOAD_SHARE);
+ } else {
+ /* XHAT. Pass real address */
+ hat_memload_array(hat, shm_addr,
+ size, ppa, sptd->spt_prot, HAT_LOAD_SHARE);
+ }
+
+ /*
+ * And now drop the SE_SHARED lock(s).
+ */
+ for (i = 0; i < npages; i++)
+ page_unlock(ppa[i]);
+ }
+
+ if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
+ if (hat_share(seg->s_as->a_hat, shm_addr,
+ curspt->a_hat, segspt_addr, ptob(npages),
+ seg->s_szc) != 0) {
+ panic("hat_share err in DISM fault");
+ /* NOTREACHED */
+ }
+ }
+ AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock);
+dism_err:
+ kmem_free(ppa, npages * sizeof (page_t *));
+ return (err);
+
+ case F_SOFTUNLOCK:
+
+ mutex_enter(&freemem_lock);
+ availrmem += npages;
+ mutex_exit(&freemem_lock);
+
+ /*
+		 * This is a bit ugly: we pass in the real seg pointer,
+ * but the segspt_addr is the virtual address within the
+ * dummy seg.
+ */
+ segspt_softunlock(seg, segspt_addr, size, rw);
+ return (0);
+
+ case F_PROT:
+
+ /*
+ * This takes care of the unusual case where a user
+ * allocates a stack in shared memory and a register
+ * window overflow is written to that stack page before
+ * it is otherwise modified.
+ *
+ * We can get away with this because ISM segments are
+ * always rw. Other than this unusual case, there
+ * should be no instances of protection violations.
+ */
+ return (0);
+
+ default:
+#ifdef DEBUG
+ panic("segspt_dismfault default type?");
+#else
+ return (FC_NOMAP);
+#endif
+ }
+}
+
+
+faultcode_t
+segspt_shmfault(struct hat *hat, struct seg *seg, caddr_t addr,
+ size_t len, enum fault_type type, enum seg_rw rw)
+{
+ struct shm_data *shmd = (struct shm_data *)seg->s_data;
+ struct seg *sptseg = shmd->shm_sptseg;
+ struct as *curspt = shmd->shm_sptas;
+ struct spt_data *sptd = sptseg->s_data;
+ pgcnt_t npages;
+ size_t share_size, size;
+ caddr_t sptseg_addr, shm_addr;
+ page_t *pp, **ppa;
+ int i;
+ u_offset_t offset;
+ ulong_t anon_index = 0;
+ struct vnode *vp;
+ struct anon_map *amp; /* XXX - for locknest */
+ struct anon *ap = NULL;
+ anon_sync_obj_t cookie;
+
+#ifdef lint
+ hat = hat;
+#endif
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ if (sptd->spt_flags & SHM_PAGEABLE) {
+ return (segspt_dismfault(hat, seg, addr, len, type, rw));
+ }
+
+ /*
+ * Because of the way spt is implemented
+ * the realsize of the segment does not have to be
+ * equal to the segment size itself. The segment size is
+ * often in multiples of a page size larger than PAGESIZE.
+ * The realsize is rounded up to the nearest PAGESIZE
+ * based on what the user requested. This is a bit of
+	 * ugliness that is historical but not easily fixed
+ * without re-designing the higher levels of ISM.
+ */
+ ASSERT(addr >= seg->s_base);
+ if (((addr + len) - seg->s_base) > sptd->spt_realsize)
+ return (FC_NOMAP);
+ /*
+ * For all of the following cases except F_PROT, we need to
+ * make any necessary adjustments to addr and len
+ * and get all of the necessary page_t's into an array called ppa[].
+ *
+ * The code in shmat() forces base addr and len of ISM segment
+ * to be aligned to largest page size supported. Therefore,
+ * we are able to handle F_SOFTLOCK and F_INVAL calls in "large
+ * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK
+ * in large pagesize chunks, or else we will screw up the HAT
+ * layer by calling hat_memload_array() with differing page sizes
+ * over a given virtual range.
+ */
+ share_size = page_get_pagesize(sptseg->s_szc);
+ shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_size);
+ size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), share_size);
+ npages = btopr(size);
+
+ /*
+ * Now we need to convert from addr in segshm to addr in segspt.
+ */
+ anon_index = seg_page(seg, shm_addr);
+ sptseg_addr = sptseg->s_base + ptob(anon_index);
+
+ /*
+ * And now we may have to adjust npages downward if we have
+ * exceeded the realsize of the segment or initial anon
+ * allocations.
+ */
+ if ((sptseg_addr + ptob(npages)) >
+ (sptseg->s_base + sptd->spt_realsize))
+ size = (sptseg->s_base + sptd->spt_realsize) - sptseg_addr;
+
+ npages = btopr(size);
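+	/*
+	 * Illustrative example of the clamping above (hypothetical
+	 * sizes): if spt_realsize is 6 M but the segment was rounded
+	 * up to 8 M, a fault whose aligned chunk starts 4 M into the
+	 * dummy seg is clamped to size = 2 M, so hat_memload_array()
+	 * below never maps beyond the pages that really exist.
+	 */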
+
+ ASSERT(sptseg_addr < (sptseg->s_base + sptseg->s_size));
+ ASSERT((sptd->spt_flags & SHM_PAGEABLE) == 0);
+
+ switch (type) {
+
+ case F_SOFTLOCK:
+
+ /*
+ * availrmem is decremented once during anon_swap_adjust()
+ * and is incremented during the anon_unresv(), which is
+ * called from shm_rm_amp() when the segment is destroyed.
+ */
+ atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages);
+ /*
+ * Some platforms assume that ISM pages are SE_SHARED
+ * locked for the entire life of the segment.
+ */
+ if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0))
+ return (0);
+ /*
+ * Fall through to the F_INVAL case to load up the hat layer
+ * entries with the HAT_LOAD_LOCK flag.
+ */
+
+ /* FALLTHRU */
+ case F_INVAL:
+
+ if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC))
+ return (FC_NOMAP);
+
+ /*
+ * Some platforms that do NOT support DYNAMIC_ISM_UNMAP
+ * may still rely on this call to hat_share(). That
+		 * would imply that those HATs can fault on a
+ * HAT_LOAD_LOCK translation, which would seem
+ * contradictory.
+ */
+ if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
+ if (hat_share(seg->s_as->a_hat, seg->s_base,
+ curspt->a_hat, sptseg->s_base,
+ sptseg->s_size, sptseg->s_szc) != 0) {
+ panic("hat_share error in ISM fault");
+ /*NOTREACHED*/
+ }
+ return (0);
+ }
+ ppa = kmem_zalloc(sizeof (page_t *) * npages, KM_SLEEP);
+
+ /*
+		 * I see no need to lock the real seg here,
+		 * because all of our work will be on the underlying
+ * dummy seg.
+ *
+ * sptseg_addr and npages now account for large pages.
+ */
+ amp = sptd->spt_amp;
+ ASSERT(amp != NULL);
+ anon_index = seg_page(sptseg, sptseg_addr);
+
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
+ for (i = 0; i < npages; i++) {
+ anon_array_enter(amp, anon_index, &cookie);
+ ap = anon_get_ptr(amp->ahp, anon_index++);
+ ASSERT(ap != NULL);
+ swap_xlate(ap, &vp, &offset);
+ anon_array_exit(&cookie);
+ pp = page_lookup(vp, offset, SE_SHARED);
+ ASSERT(pp != NULL);
+ ppa[i] = pp;
+ }
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ ASSERT(i == npages);
+
+ /*
+ * We are already holding the as->a_lock on the user's
+ * real segment, but we need to hold the a_lock on the
+ * underlying dummy as. This is mostly to satisfy the
+ * underlying HAT layer.
+ */
+ AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER);
+ if (type == F_SOFTLOCK) {
+ /*
+ * Load up the translation keeping it
+ * locked and don't unlock the page.
+ */
+ hat_memload_array(sptseg->s_as->a_hat, sptseg_addr,
+ ptob(npages), ppa, sptd->spt_prot,
+ HAT_LOAD_LOCK | HAT_LOAD_SHARE);
+ } else {
+ if (hat == seg->s_as->a_hat) {
+
+ /*
+ * Migrate pages marked for migration.
+ */
+ if (lgrp_optimizations())
+ page_migrate(seg, shm_addr, ppa,
+ npages);
+
+ /* CPU HAT */
+ hat_memload_array(sptseg->s_as->a_hat,
+ sptseg_addr, ptob(npages), ppa,
+ sptd->spt_prot, HAT_LOAD_SHARE);
+ } else {
+ /* XHAT. Pass real address */
+ hat_memload_array(hat, shm_addr,
+ ptob(npages), ppa, sptd->spt_prot,
+ HAT_LOAD_SHARE);
+ }
+
+ /*
+ * And now drop the SE_SHARED lock(s).
+ */
+ for (i = 0; i < npages; i++)
+ page_unlock(ppa[i]);
+ }
+ AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock);
+
+ kmem_free(ppa, sizeof (page_t *) * npages);
+ return (0);
+ case F_SOFTUNLOCK:
+
+		 * This is a bit ugly: we pass in the real seg pointer,
+ * This is a bit ugly, we pass in the real seg pointer,
+ * but the sptseg_addr is the virtual address within the
+ * dummy seg.
+ */
+ segspt_softunlock(seg, sptseg_addr, ptob(npages), rw);
+ return (0);
+
+ case F_PROT:
+
+ /*
+ * This takes care of the unusual case where a user
+ * allocates a stack in shared memory and a register
+ * window overflow is written to that stack page before
+ * it is otherwise modified.
+ *
+ * We can get away with this because ISM segments are
+ * always rw. Other than this unusual case, there
+ * should be no instances of protection violations.
+ */
+ return (0);
+
+ default:
+#ifdef DEBUG
+ cmn_err(CE_WARN, "segspt_shmfault default type?");
+#endif
+ return (FC_NOMAP);
+ }
+}
+
+/*ARGSUSED*/
+static faultcode_t
+segspt_shmfaulta(struct seg *seg, caddr_t addr)
+{
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta)
+{
+ return (0);
+}
+
+/*ARGSUSED*/
+static size_t
+segspt_shmswapout(struct seg *seg)
+{
+ return (0);
+}
+
+/*
+ * duplicate the shared page tables
+ */
+int
+segspt_shmdup(struct seg *seg, struct seg *newseg)
+{
+ struct shm_data *shmd = (struct shm_data *)seg->s_data;
+ struct anon_map *amp = shmd->shm_amp;
+ struct shm_data *shmd_new;
+ struct seg *spt_seg = shmd->shm_sptseg;
+ struct spt_data *sptd = spt_seg->s_data;
+
+ ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ shmd_new = kmem_zalloc((sizeof (*shmd_new)), KM_SLEEP);
+ newseg->s_data = (void *)shmd_new;
+ shmd_new->shm_sptas = shmd->shm_sptas;
+ shmd_new->shm_amp = amp;
+ shmd_new->shm_sptseg = shmd->shm_sptseg;
+ newseg->s_ops = &segspt_shmops;
+ newseg->s_szc = seg->s_szc;
+ ASSERT(seg->s_szc == shmd->shm_sptseg->s_szc);
+
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
+ amp->refcnt++;
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+
+ if (sptd->spt_flags & SHM_PAGEABLE) {
+ shmd_new->shm_vpage = kmem_zalloc(btopr(amp->size), KM_SLEEP);
+ shmd_new->shm_lckpgs = 0;
+ }
+ return (hat_share(newseg->s_as->a_hat, newseg->s_base,
+ shmd->shm_sptas->a_hat, SEGSPTADDR, seg->s_size, seg->s_szc));
+}
+
+/*ARGSUSED*/
+int
+segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot)
+{
+ struct shm_data *shmd = (struct shm_data *)seg->s_data;
+ struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ /*
+ * ISM segment is always rw.
+ */
+ return (((sptd->spt_prot & prot) != prot) ? EACCES : 0);
+}
+
+/*
+ * Return an array of locked large pages; for empty slots allocate
+ * private zero-filled anon pages.
+ */
+static int
+spt_anon_getpages(
+ struct seg *sptseg,
+ caddr_t sptaddr,
+ size_t len,
+ page_t *ppa[])
+{
+ struct spt_data *sptd = sptseg->s_data;
+ struct anon_map *amp = sptd->spt_amp;
+ enum seg_rw rw = sptd->spt_prot;
+ uint_t szc = sptseg->s_szc;
+ size_t pg_sz, share_sz = page_get_pagesize(szc);
+ pgcnt_t lp_npgs;
+ caddr_t lp_addr, e_sptaddr;
+ uint_t vpprot, ppa_szc = 0;
+ struct vpage *vpage = NULL;
+ ulong_t j, ppa_idx;
+ int err, ierr = 0;
+ pgcnt_t an_idx;
+ anon_sync_obj_t cookie;
+
+ ASSERT(IS_P2ALIGNED(sptaddr, share_sz) && IS_P2ALIGNED(len, share_sz));
+ ASSERT(len != 0);
+
+ pg_sz = share_sz;
+ lp_npgs = btop(pg_sz);
+ lp_addr = sptaddr;
+ e_sptaddr = sptaddr + len;
+ an_idx = seg_page(sptseg, sptaddr);
+ ppa_idx = 0;
+
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
+ /*CONSTCOND*/
+ while (1) {
+ for (; lp_addr < e_sptaddr;
+ an_idx += lp_npgs, lp_addr += pg_sz,
+ ppa_idx += lp_npgs) {
+
+ anon_array_enter(amp, an_idx, &cookie);
+ ppa_szc = (uint_t)-1;
+ ierr = anon_map_getpages(amp, an_idx, szc, sptseg,
+ lp_addr, sptd->spt_prot, &vpprot, &ppa[ppa_idx],
+ &ppa_szc, vpage, rw, 0, segvn_anypgsz, kcred);
+ anon_array_exit(&cookie);
+
+ if (ierr != 0) {
+ if (ierr > 0) {
+ err = FC_MAKE_ERR(ierr);
+ goto lpgs_err;
+ }
+ break;
+ }
+ }
+ if (lp_addr == e_sptaddr) {
+ break;
+ }
+ ASSERT(lp_addr < e_sptaddr);
+
+ /*
+ * ierr == -1 means we failed to allocate a large page.
+ * so do a size down operation.
+ *
+ * ierr == -2 means some other process that privately shares
+ * pages with this process has allocated a larger page and we
+ * need to retry with larger pages. So do a size up
+ * operation. This relies on the fact that large pages are
+ * never partially shared i.e. if we share any constituent
+ * page of a large page with another process we must share the
+		 * entire large page. Note this cannot happen for the SOFTLOCK
+		 * case, unless the current address (lp_addr) is at the beginning
+ * of the next page size boundary because the other process
+ * couldn't have relocated locked pages.
+ */
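+		/*
+		 * For example, if allocation at the current szc fails
+		 * with ierr == -1 and segvn_anypgsz is nonzero, the
+		 * remaining range is retried one page size smaller;
+		 * with ierr == -2 it is retried one page size larger.
+		 */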
+ ASSERT(ierr == -1 || ierr == -2);
+ if (segvn_anypgsz) {
+ ASSERT(ierr == -2 || szc != 0);
+ ASSERT(ierr == -1 || szc < sptseg->s_szc);
+ szc = (ierr == -1) ? szc - 1 : szc + 1;
+ } else {
+ /*
+ * For faults and segvn_anypgsz == 0
+ * we need to be careful not to loop forever
+ * if existing page is found with szc other
+ * than 0 or seg->s_szc. This could be due
+ * to page relocations on behalf of DR or
+ * more likely large page creation. For this
+ * case simply re-size to existing page's szc
+ * if returned by anon_map_getpages().
+ */
+ if (ppa_szc == (uint_t)-1) {
+ szc = (ierr == -1) ? 0 : sptseg->s_szc;
+ } else {
+ ASSERT(ppa_szc <= sptseg->s_szc);
+ ASSERT(ierr == -2 || ppa_szc < szc);
+ ASSERT(ierr == -1 || ppa_szc > szc);
+ szc = ppa_szc;
+ }
+ }
+ pg_sz = page_get_pagesize(szc);
+ lp_npgs = btop(pg_sz);
+ ASSERT(IS_P2ALIGNED(lp_addr, pg_sz));
+ }
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ return (0);
+
+lpgs_err:
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ for (j = 0; j < ppa_idx; j++)
+ page_unlock(ppa[j]);
+ return (err);
+}
+
+int
+spt_lockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages,
+ page_t **ppa, ulong_t *lockmap, size_t pos)
+{
+ struct shm_data *shmd = seg->s_data;
+ struct spt_data *sptd = shmd->shm_sptseg->s_data;
+ ulong_t i;
+ int kernel;
+
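+	/*
+	 * Note that shm_vpage[] records which slots this attaching
+	 * process has already locked (DISM_PG_LOCKED), while
+	 * spt_ppa_lckcnt[] counts locks across all attached processes
+	 * and is capped at DISM_LOCK_MAX.
+	 */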
+ for (i = 0; i < npages; anon_index++, pos++, i++) {
+ if (!(shmd->shm_vpage[anon_index] & DISM_PG_LOCKED)) {
+ if (sptd->spt_ppa_lckcnt[anon_index] <
+ (ushort_t)DISM_LOCK_MAX) {
+ if (++sptd->spt_ppa_lckcnt[anon_index] ==
+ (ushort_t)DISM_LOCK_MAX) {
+ cmn_err(CE_WARN,
+ "DISM page lock limit "
+ "reached on DISM offset 0x%lx\n",
+ anon_index << PAGESHIFT);
+ }
+ kernel = (sptd->spt_ppa &&
+ sptd->spt_ppa[anon_index]) ? 1 : 0;
+ if (!page_pp_lock(ppa[i], 0, kernel)) {
+ /* unlock rest of the pages */
+ for (; i < npages; i++)
+ page_unlock(ppa[i]);
+ sptd->spt_ppa_lckcnt[anon_index]--;
+ return (EAGAIN);
+ }
+ shmd->shm_lckpgs++;
+ shmd->shm_vpage[anon_index] |= DISM_PG_LOCKED;
+ if (lockmap != NULL)
+ BT_SET(lockmap, pos);
+ }
+ }
+ page_unlock(ppa[i]);
+ }
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len,
+ int attr, int op, ulong_t *lockmap, size_t pos)
+{
+ struct shm_data *shmd = seg->s_data;
+ struct seg *sptseg = shmd->shm_sptseg;
+ struct spt_data *sptd = sptseg->s_data;
+ pgcnt_t npages, a_npages;
+ page_t **ppa;
+ pgcnt_t an_idx, a_an_idx, ppa_idx;
+ caddr_t spt_addr, a_addr; /* spt and aligned address */
+ size_t a_len; /* aligned len */
+ size_t share_sz;
+ ulong_t i;
+ int sts = 0;
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
+ return (0);
+ }
+
+ addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
+ an_idx = seg_page(seg, addr);
+ npages = btopr(len);
+
+ if (an_idx + npages > btopr(shmd->shm_amp->size)) {
+ return (ENOMEM);
+ }
+
+ if (op == MC_LOCK) {
+ /*
+		 * Need to align the addr and size request if they are not
+		 * aligned, so we can always allocate large page(s); however,
+		 * we only lock what was requested in the initial request.
+ */
+ share_sz = page_get_pagesize(sptseg->s_szc);
+ a_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_sz);
+ a_len = P2ROUNDUP((uintptr_t)(((addr + len) - a_addr)),
+ share_sz);
+ a_npages = btop(a_len);
+ a_an_idx = seg_page(seg, a_addr);
+ spt_addr = sptseg->s_base + ptob(a_an_idx);
+ ppa_idx = an_idx - a_an_idx;
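+		/*
+		 * Hypothetical example of the offsets above, assuming a
+		 * 4 M share_sz and 8 K PAGESIZE: locking 8 K that starts
+		 * 16 K into a large page aligns a_addr back to the large
+		 * page, giving a_npages = 512 and ppa_idx = 2, so
+		 * spt_lockpages() is handed &ppa[2] while the surrounding
+		 * constituent pages are simply unlocked afterwards.
+		 */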
+
+ if ((ppa = kmem_zalloc(((sizeof (page_t *)) * a_npages),
+ KM_NOSLEEP)) == NULL) {
+ return (ENOMEM);
+ }
+
+ /*
+ * Don't cache any new pages for IO and
+ * flush any cached pages.
+ */
+ mutex_enter(&sptd->spt_lock);
+ if (sptd->spt_ppa != NULL)
+ sptd->spt_flags |= DISM_PPA_CHANGED;
+
+ sts = spt_anon_getpages(sptseg, spt_addr, a_len, ppa);
+ if (sts != 0) {
+ mutex_exit(&sptd->spt_lock);
+ kmem_free(ppa, ((sizeof (page_t *)) * a_npages));
+ return (sts);
+ }
+
+ sts = spt_lockpages(seg, an_idx, npages,
+ &ppa[ppa_idx], lockmap, pos);
+ /*
+ * unlock remaining pages for requests which are not
+ * aligned or not in 4 M chunks
+ */
+ for (i = 0; i < ppa_idx; i++)
+ page_unlock(ppa[i]);
+ for (i = ppa_idx + npages; i < a_npages; i++)
+ page_unlock(ppa[i]);
+ if (sptd->spt_ppa != NULL)
+ sptd->spt_flags |= DISM_PPA_CHANGED;
+ mutex_exit(&sptd->spt_lock);
+
+ kmem_free(ppa, ((sizeof (page_t *)) * a_npages));
+
+ } else if (op == MC_UNLOCK) { /* unlock */
+ struct anon_map *amp;
+ struct anon *ap;
+ struct vnode *vp;
+ u_offset_t off;
+ struct page *pp;
+ int kernel;
+ anon_sync_obj_t cookie;
+
+ amp = sptd->spt_amp;
+ mutex_enter(&sptd->spt_lock);
+ if (shmd->shm_lckpgs == 0) {
+ mutex_exit(&sptd->spt_lock);
+ return (0);
+ }
+ /*
+ * Don't cache new IO pages.
+ */
+ if (sptd->spt_ppa != NULL)
+ sptd->spt_flags |= DISM_PPA_CHANGED;
+
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
+ for (i = 0; i < npages; i++, an_idx++) {
+ if (shmd->shm_vpage[an_idx] & DISM_PG_LOCKED) {
+ anon_array_enter(amp, an_idx, &cookie);
+ ap = anon_get_ptr(amp->ahp, an_idx);
+ ASSERT(ap);
+ ASSERT(sptd->spt_ppa_lckcnt[an_idx] > 0);
+
+ swap_xlate(ap, &vp, &off);
+ anon_array_exit(&cookie);
+ pp = page_lookup(vp, off, SE_SHARED);
+ ASSERT(pp);
+ /*
+				 * availrmem is decremented only for pages
+				 * which are not in the seg pcache; for pages
+				 * in the seg pcache, availrmem was already
+				 * decremented in _dismpagelock() (if they
+				 * were not locked here).
+ */
+ kernel = (sptd->spt_ppa &&
+ sptd->spt_ppa[an_idx]) ? 1 : 0;
+ page_pp_unlock(pp, 0, kernel);
+ page_unlock(pp);
+ shmd->shm_vpage[an_idx] &= ~DISM_PG_LOCKED;
+ sptd->spt_ppa_lckcnt[an_idx]--;
+ shmd->shm_lckpgs--;
+ }
+ }
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ if (sptd->spt_ppa != NULL)
+ sptd->spt_flags |= DISM_PPA_CHANGED;
+ mutex_exit(&sptd->spt_lock);
+ }
+ return (sts);
+}
+
+/*ARGSUSED*/
+int
+segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
+{
+ struct shm_data *shmd = (struct shm_data *)seg->s_data;
+ struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
+ spgcnt_t pgno = seg_page(seg, addr+len) - seg_page(seg, addr) + 1;
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ /*
+ * ISM segment is always rw.
+ */
+ while (--pgno >= 0)
+ *protv++ = sptd->spt_prot;
+ return (0);
+}
+
+/*ARGSUSED*/
+u_offset_t
+segspt_shmgetoffset(struct seg *seg, caddr_t addr)
+{
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ /* Offset does not matter in ISM memory */
+
+ return ((u_offset_t)0);
+}
+
+/* ARGSUSED */
+int
+segspt_shmgettype(struct seg *seg, caddr_t addr)
+{
+ struct shm_data *shmd = (struct shm_data *)seg->s_data;
+ struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ /*
+	 * The shared memory mapping is always MAP_SHARED; swap is only
+	 * reserved for DISM.
+ */
+ return (MAP_SHARED |
+ ((sptd->spt_flags & SHM_PAGEABLE) ? 0 : MAP_NORESERVE));
+}
+
+/*ARGSUSED*/
+int
+segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
+{
+ struct shm_data *shmd = (struct shm_data *)seg->s_data;
+ struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ *vpp = sptd->spt_vp;
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
+{
+ struct shm_data *shmd = (struct shm_data *)seg->s_data;
+ struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
+ struct anon_map *amp;
+ pgcnt_t pg_idx;
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ if (behav == MADV_FREE) {
+ if ((sptd->spt_flags & SHM_PAGEABLE) == 0)
+ return (0);
+
+ amp = sptd->spt_amp;
+ pg_idx = seg_page(seg, addr);
+
+ mutex_enter(&sptd->spt_lock);
+ if (sptd->spt_ppa != NULL)
+ sptd->spt_flags |= DISM_PPA_CHANGED;
+ mutex_exit(&sptd->spt_lock);
+
+ /*
+ * Purge all DISM cached pages
+ */
+ seg_ppurge_seg(segspt_reclaim);
+
+ mutex_enter(&sptd->spt_lock);
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
+ anon_disclaim(amp, pg_idx, len, ANON_PGLOOKUP_BLK);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ mutex_exit(&sptd->spt_lock);
+ } else if (lgrp_optimizations() && (behav == MADV_ACCESS_LWP ||
+ behav == MADV_ACCESS_MANY || behav == MADV_ACCESS_DEFAULT)) {
+ int already_set;
+ ulong_t anon_index;
+ lgrp_mem_policy_t policy;
+ caddr_t shm_addr;
+ size_t share_size;
+ size_t size;
+ struct seg *sptseg = shmd->shm_sptseg;
+ caddr_t sptseg_addr;
+
+ /*
+ * Align address and length to page size of underlying segment
+ */
+ share_size = page_get_pagesize(shmd->shm_sptseg->s_szc);
+ shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_size);
+ size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)),
+ share_size);
+
+ amp = shmd->shm_amp;
+ anon_index = seg_page(seg, shm_addr);
+
+ /*
+ * And now we may have to adjust size downward if we have
+ * exceeded the realsize of the segment or initial anon
+ * allocations.
+ */
+ sptseg_addr = sptseg->s_base + ptob(anon_index);
+ if ((sptseg_addr + size) >
+ (sptseg->s_base + sptd->spt_realsize))
+ size = (sptseg->s_base + sptd->spt_realsize) -
+ sptseg_addr;
+
+ /*
+ * Set memory allocation policy for this segment
+ */
+ policy = lgrp_madv_to_policy(behav, len, MAP_SHARED);
+ already_set = lgrp_shm_policy_set(policy, amp, anon_index,
+ NULL, 0, len);
+
+ /*
+		 * If a random memory allocation policy is set already,
+		 * don't bother reapplying it.
+ */
+ if (already_set && !LGRP_MEM_POLICY_REAPPLICABLE(policy))
+ return (0);
+
+ /*
+ * Mark any existing pages in the given range for
+		 * migration, flushing the I/O page cache and using the
+		 * underlying segment to calculate the anon index and to
+		 * get the anon_map and vnode pointers.
+ */
+ if (shmd->shm_softlockcnt > 0)
+ segspt_purge(seg);
+
+ page_mark_migrate(seg, shm_addr, size, amp, 0, NULL, 0, 0);
+ }
+
+ return (0);
+}
+
+/*ARGSUSED*/
+void
+segspt_shmdump(struct seg *seg)
+{
+ /* no-op for ISM segment */
+}
+
+/*ARGSUSED*/
+static faultcode_t
+segspt_shmsetpgsz(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
+{
+ return (ENOTSUP);
+}
+
+/*
+ * get a memory ID for an addr in a given segment
+ */
+static int
+segspt_shmgetmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
+{
+ struct shm_data *shmd = (struct shm_data *)seg->s_data;
+ struct anon *ap;
+ size_t anon_index;
+ struct anon_map *amp = shmd->shm_amp;
+ struct spt_data *sptd = shmd->shm_sptseg->s_data;
+ struct seg *sptseg = shmd->shm_sptseg;
+ anon_sync_obj_t cookie;
+
+ anon_index = seg_page(seg, addr);
+
+ if (addr > (seg->s_base + sptd->spt_realsize)) {
+ return (EFAULT);
+ }
+
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
+ anon_array_enter(amp, anon_index, &cookie);
+ ap = anon_get_ptr(amp->ahp, anon_index);
+ if (ap == NULL) {
+ struct page *pp;
+ caddr_t spt_addr = sptseg->s_base + ptob(anon_index);
+
+ pp = anon_zero(sptseg, spt_addr, &ap, kcred);
+ if (pp == NULL) {
+ anon_array_exit(&cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ return (ENOMEM);
+ }
+ (void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP);
+ page_unlock(pp);
+ }
+ anon_array_exit(&cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ memidp->val[0] = (uintptr_t)ap;
+ memidp->val[1] = (uintptr_t)addr & PAGEOFFSET;
+ return (0);
+}
+
+/*
+ * Get memory allocation policy info for specified address in given segment
+ */
+static lgrp_mem_policy_info_t *
+segspt_shmgetpolicy(struct seg *seg, caddr_t addr)
+{
+ struct anon_map *amp;
+ ulong_t anon_index;
+ lgrp_mem_policy_info_t *policy_info;
+ struct shm_data *shm_data;
+
+ ASSERT(seg != NULL);
+
+ /*
+ * Get anon_map from segshm
+ *
+ * Assume that no lock needs to be held on anon_map, since
+ * it should be protected by its reference count which must be
+ * nonzero for an existing segment
+ * Need to grab readers lock on policy tree though
+ */
+ shm_data = (struct shm_data *)seg->s_data;
+ if (shm_data == NULL)
+ return (NULL);
+ amp = shm_data->shm_amp;
+ ASSERT(amp->refcnt != 0);
+
+ /*
+ * Get policy info
+ *
+ * Assume starting anon index of 0
+ */
+ anon_index = seg_page(seg, addr);
+ policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0);
+
+ return (policy_info);
+}
diff --git a/usr/src/uts/common/vm/seg_spt.h b/usr/src/uts/common/vm/seg_spt.h
new file mode 100644
index 0000000000..fb97c77fcf
--- /dev/null
+++ b/usr/src/uts/common/vm/seg_spt.h
@@ -0,0 +1,155 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _VM_SEG_SPT_H
+#define _VM_SEG_SPT_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef _ASM
+
+#include <sys/types.h>
+#include <sys/t_lock.h>
+#include <sys/lgrp.h>
+
+/*
+ * Passed data when creating spt segment.
+ */
+struct segspt_crargs {
+ struct seg *seg_spt;
+ struct anon_map *amp;
+ uint_t prot;
+ uint_t flags;
+ uint_t szc;
+};
+
+typedef struct spt_data {
+	struct vnode	*spt_vp;	/* dummy vnode for the segment */
+	struct anon_map	*spt_amp;	/* anon map backing the pages */
+	size_t		spt_realsize;	/* actual size of the segment */
+	struct page	**spt_ppa;	/* cached page list (seg pcache) */
+	ushort_t	*spt_ppa_lckcnt; /* per-page DISM lock counts */
+	uint_t		spt_prot;	/* segment protections */
+	kmutex_t	spt_lock;	/* protects spt_ppa and spt_flags */
+ size_t spt_pcachecnt; /* # of times in pcache */
+ uint_t spt_flags; /* Dynamic ISM or regular ISM */
+ /*
+ * Initial memory allocation policy
+ * used during pre-allocation done in shmat()
+ */
+ lgrp_mem_policy_info_t spt_policy_info;
+} spt_data_t;
+
+/*
+ * Private data for spt_shm segment.
+ */
+typedef struct shm_data {
+	struct as	*shm_sptas;	/* address space of the spt segment */
+	struct anon_map *shm_amp;	/* anon map shared with the spt seg */
+ size_t shm_softlockcnt; /* # outstanding lock operations */
+ struct seg *shm_sptseg; /* pointer to spt segment */
+ char *shm_vpage; /* indicating locked pages */
+ spgcnt_t shm_lckpgs; /* # of locked pages per attached seg */
+ /*
+ * Memory allocation policy after shmat()
+ */
+ lgrp_mem_policy_info_t shm_policy_info;
+} shm_data_t;
+
+#define DISM_PG_LOCKED 0x1 /* DISM page is locked */
+#define DISM_PPA_CHANGED 0x2 /* DISM new lock, need to rebuild ppa */
+
+#define DISM_LOCK_MAX 0xfffe /* max number of locks per DISM page */
+#endif
+
+#ifdef _KERNEL
+
+#ifndef _ASM
+
+/*
+ * Functions used in shm.c to call ISM.
+ */
+int sptcreate(size_t size, struct seg **sptseg, struct anon_map *amp,
+ uint_t prot, uint_t flags, uint_t szc);
+void sptdestroy(struct as *, struct anon_map *);
+int segspt_shmattach(struct seg *, caddr_t *);
+
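+/*
+ * isspt() yields the SPT address space pointer for a shared memory id
+ * structure (or NULL if no SPT segment exists).  spt_locked() and
+ * spt_pageable() test the shmat() flags; a request that is both locked
+ * (SHM_SHARE_MMU) and pageable (SHM_PAGEABLE) is invalid.
+ */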
+#define isspt(sp) ((sp)->shm_sptinfo ? (sp)->shm_sptinfo->sptas : NULL)
+#define spt_locked(a) ((a) & SHM_SHARE_MMU)
+#define spt_pageable(a) ((a) & SHM_PAGEABLE)
+#define spt_invalid(a) (spt_locked((a)) && spt_pageable((a)))
+
+/*
+ * This can be applied to a segment with seg->s_ops == &segspt_shmops
+ * to determine the real size of the ISM segment.
+ */
+#define spt_realsize(seg) (((struct spt_data *)(((struct shm_data *)\
+ ((seg)->s_data))->shm_sptseg->s_data))->spt_realsize)
+
+/*
+ * This can be applied to a segment with seg->s_ops == &segspt_ops
+ * to determine the flags of the {D}ISM segment.
+ */
+#define spt_flags(seg) (((struct spt_data *)((seg)->s_data))->spt_flags)
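+/*
+ * For example, a caller holding the spt segment can test for DISM with
+ * (spt_flags(seg) & SHM_PAGEABLE).
+ */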
+
+/*
+ * For large page support
+ */
+extern int segvn_anypgsz;
+
+#endif
+
+/*
+ * In a 64-bit address space, we'll try to put ISM segments between
+ * PREDISM_BASE and PREDISM_BOUND. The HAT may use these constants to
+ * predict that a VA is contained by an ISM segment, which may optimize
+ * translation. The range must _only_ be treated as advisory; ISM segments
+ * may fall outside of the range, and non-ISM segments may be contained
+ * within the range.
+ * To avoid collisions between ISM/DISM addresses and, e.g., process heap
+ * addresses, we try to put ISM/DISM segments above PREDISM_1T_BASE (1T).
+ * The HAT still expects that any VA larger than PREDISM_BASE may belong
+ * to ISM/DISM (so on a TLB miss it will probe first for a 4M
+ * translation).
+ */
+#define PREDISM_BASESHIFT 33
+#define PREDISM_1T_BASESHIFT 40
+#define PREDISM_BASE ((uintptr_t)1 << PREDISM_BASESHIFT)
+#define PREDISM_1T_BASE ((uintptr_t)1 << PREDISM_1T_BASESHIFT)
+#define PREDISM_BOUND ((uintptr_t)1 << 63)
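+/*
+ * With the shifts above, PREDISM_BASE is 8G, PREDISM_1T_BASE is 1T and
+ * PREDISM_BOUND is 2^63.  An advisory HAT-side check built on these
+ * constants might look like (illustrative only):
+ *
+ *	if (va >= PREDISM_BASE && va < PREDISM_BOUND)
+ *		probe for a large (e.g. 4M) ISM translation first;
+ */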
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _VM_SEG_SPT_H */
diff --git a/usr/src/uts/common/vm/seg_vn.c b/usr/src/uts/common/vm/seg_vn.c
new file mode 100644
index 0000000000..86e57227f8
--- /dev/null
+++ b/usr/src/uts/common/vm/seg_vn.c
@@ -0,0 +1,7745 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * VM - shared or copy-on-write from a vnode/anonymous memory.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/t_lock.h>
+#include <sys/errno.h>
+#include <sys/systm.h>
+#include <sys/mman.h>
+#include <sys/debug.h>
+#include <sys/cred.h>
+#include <sys/vmsystm.h>
+#include <sys/tuneable.h>
+#include <sys/bitmap.h>
+#include <sys/swap.h>
+#include <sys/kmem.h>
+#include <sys/sysmacros.h>
+#include <sys/vtrace.h>
+#include <sys/cmn_err.h>
+#include <sys/vm.h>
+#include <sys/dumphdr.h>
+#include <sys/lgrp.h>
+
+#include <vm/hat.h>
+#include <vm/as.h>
+#include <vm/seg.h>
+#include <vm/seg_vn.h>
+#include <vm/pvn.h>
+#include <vm/anon.h>
+#include <vm/page.h>
+#include <vm/vpage.h>
+
+/*
+ * Private seg op routines.
+ */
+static int segvn_dup(struct seg *seg, struct seg *newseg);
+static int segvn_unmap(struct seg *seg, caddr_t addr, size_t len);
+static void segvn_free(struct seg *seg);
+static faultcode_t segvn_fault(struct hat *hat, struct seg *seg,
+ caddr_t addr, size_t len, enum fault_type type,
+ enum seg_rw rw);
+static faultcode_t segvn_faulta(struct seg *seg, caddr_t addr);
+static int segvn_setprot(struct seg *seg, caddr_t addr,
+ size_t len, uint_t prot);
+static int segvn_checkprot(struct seg *seg, caddr_t addr,
+ size_t len, uint_t prot);
+static int segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta);
+static size_t segvn_swapout(struct seg *seg);
+static int segvn_sync(struct seg *seg, caddr_t addr, size_t len,
+ int attr, uint_t flags);
+static size_t segvn_incore(struct seg *seg, caddr_t addr, size_t len,
+ char *vec);
+static int segvn_lockop(struct seg *seg, caddr_t addr, size_t len,
+ int attr, int op, ulong_t *lockmap, size_t pos);
+static int segvn_getprot(struct seg *seg, caddr_t addr, size_t len,
+ uint_t *protv);
+static u_offset_t segvn_getoffset(struct seg *seg, caddr_t addr);
+static int segvn_gettype(struct seg *seg, caddr_t addr);
+static int segvn_getvp(struct seg *seg, caddr_t addr,
+ struct vnode **vpp);
+static int segvn_advise(struct seg *seg, caddr_t addr, size_t len,
+ uint_t behav);
+static void segvn_dump(struct seg *seg);
+static int segvn_pagelock(struct seg *seg, caddr_t addr, size_t len,
+ struct page ***ppp, enum lock_type type, enum seg_rw rw);
+static int segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len,
+ uint_t szc);
+static int segvn_getmemid(struct seg *seg, caddr_t addr,
+ memid_t *memidp);
+static lgrp_mem_policy_info_t *segvn_getpolicy(struct seg *, caddr_t);
+
+struct seg_ops segvn_ops = {
+ segvn_dup,
+ segvn_unmap,
+ segvn_free,
+ segvn_fault,
+ segvn_faulta,
+ segvn_setprot,
+ segvn_checkprot,
+ segvn_kluster,
+ segvn_swapout,
+ segvn_sync,
+ segvn_incore,
+ segvn_lockop,
+ segvn_getprot,
+ segvn_getoffset,
+ segvn_gettype,
+ segvn_getvp,
+ segvn_advise,
+ segvn_dump,
+ segvn_pagelock,
+ segvn_setpagesize,
+ segvn_getmemid,
+ segvn_getpolicy,
+};
+
+/*
+ * Common zfod structures, provided as a shorthand for others to use.
+ */
+static segvn_crargs_t zfod_segvn_crargs =
+ SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
+static segvn_crargs_t kzfod_segvn_crargs =
+ SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_USER,
+ PROT_ALL & ~PROT_USER);
+static segvn_crargs_t stack_noexec_crargs =
+ SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_EXEC, PROT_ALL);
+
+caddr_t zfod_argsp = (caddr_t)&zfod_segvn_crargs; /* user zfod argsp */
+caddr_t kzfod_argsp = (caddr_t)&kzfod_segvn_crargs; /* kernel zfod argsp */
+caddr_t stack_exec_argsp = (caddr_t)&zfod_segvn_crargs; /* executable stack */
+caddr_t stack_noexec_argsp = (caddr_t)&stack_noexec_crargs; /* noexec stack */
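+/*
+ * These argsp values are meant to be handed to segvn_create() via as_map();
+ * a zero-fill user mapping, for example, is typically created with a call
+ * such as as_map(as, addr, len, segvn_create, zfod_argsp).
+ */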
+
+#define vpgtob(n) ((n) * sizeof (struct vpage)) /* For brevity */
+
+size_t segvn_comb_thrshld = UINT_MAX; /* patchable -- see 1196681 */
+
+static int segvn_concat(struct seg *, struct seg *, int);
+static int segvn_extend_prev(struct seg *, struct seg *,
+ struct segvn_crargs *, size_t);
+static int segvn_extend_next(struct seg *, struct seg *,
+ struct segvn_crargs *, size_t);
+static void segvn_softunlock(struct seg *, caddr_t, size_t, enum seg_rw);
+static void segvn_pagelist_rele(page_t **);
+static void segvn_setvnode_mpss(vnode_t *);
+static void segvn_relocate_pages(page_t **, page_t *);
+static int segvn_full_szcpages(page_t **, uint_t, int *, uint_t *);
+static int segvn_fill_vp_pages(struct segvn_data *, vnode_t *, u_offset_t,
+ uint_t, page_t **, page_t **, uint_t *, int *);
+static faultcode_t segvn_fault_vnodepages(struct hat *, struct seg *, caddr_t,
+ caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
+static faultcode_t segvn_fault_anonpages(struct hat *, struct seg *, caddr_t,
+ caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
+static faultcode_t segvn_faultpage(struct hat *, struct seg *, caddr_t,
+ u_offset_t, struct vpage *, page_t **, uint_t,
+ enum fault_type, enum seg_rw, int);
+static void segvn_vpage(struct seg *);
+
+static void segvn_purge(struct seg *seg);
+static int segvn_reclaim(struct seg *, caddr_t, size_t, struct page **,
+ enum seg_rw);
+
+static int sameprot(struct seg *, caddr_t, size_t);
+
+static int segvn_demote_range(struct seg *, caddr_t, size_t, int);
+static int segvn_clrszc(struct seg *);
+static struct seg *segvn_split_seg(struct seg *, caddr_t);
+static int segvn_claim_pages(struct seg *, struct vpage *, u_offset_t,
+ ulong_t, uint_t);
+
+static struct kmem_cache *segvn_cache;
+
+#ifdef VM_STATS
+static struct segvnvmstats_str {
+ ulong_t fill_vp_pages[31];
+ ulong_t fltvnpages[49];
+ ulong_t fullszcpages[10];
+ ulong_t relocatepages[3];
+ ulong_t fltanpages[17];
+ ulong_t pagelock[3];
+ ulong_t demoterange[3];
+} segvnvmstats;
+#endif /* VM_STATS */
+
+#define SDR_RANGE 1 /* demote entire range */
+#define SDR_END 2 /* demote non aligned ends only */
+
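+/*
+ * CALC_LPG_REGION() computes the large-page region covering
+ * [addr, addr + len): lpgaddr is addr rounded down and lpgeaddr is
+ * addr + len rounded up to a pgsz boundary, both asserted to lie within
+ * the segment.  A zero length collapses both to addr.
+ */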
+#define CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr) { \
+ if ((len) != 0) { \
+ lpgaddr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz); \
+ ASSERT(lpgaddr >= (seg)->s_base); \
+ lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)((addr) + \
+ (len)), pgsz); \
+ ASSERT(lpgeaddr > lpgaddr); \
+ ASSERT(lpgeaddr <= (seg)->s_base + (seg)->s_size); \
+ } else { \
+ lpgeaddr = lpgaddr = (addr); \
+ } \
+ }
+
+/*ARGSUSED*/
+static int
+segvn_cache_constructor(void *buf, void *cdrarg, int kmflags)
+{
+ struct segvn_data *svd = buf;
+
+ rw_init(&svd->lock, NULL, RW_DEFAULT, NULL);
+ mutex_init(&svd->segp_slock, NULL, MUTEX_DEFAULT, NULL);
+ return (0);
+}
+
+/*ARGSUSED1*/
+static void
+segvn_cache_destructor(void *buf, void *cdrarg)
+{
+ struct segvn_data *svd = buf;
+
+ rw_destroy(&svd->lock);
+ mutex_destroy(&svd->segp_slock);
+}
+
+/*
+ * Patching this variable to non-zero allows the system to run with
+ * stacks marked as "not executable". It's a bit of a kludge, but is
+ * provided as a tweakable for platforms that export those ABIs
+ * (e.g. sparc V8) that have executable stacks enabled by default.
+ * There are also some restrictions for platforms that don't actually
+ * implement 'noexec' protections.
+ *
+ * Once enabled, the system is (therefore) unable to provide a fully
+ * ABI-compliant execution environment, though practically speaking,
+ * most everything works. The exceptions are generally some interpreters
+ * and debuggers that create executable code on the stack and jump
+ * into it (without explicitly mprotecting the address range to include
+ * PROT_EXEC).
+ *
+ * One important class of applications that are disabled is those
+ * that have been transformed into malicious agents using one of the
+ * numerous "buffer overflow" attacks. See 4007890.
+ */
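+/*
+ * The usual way to enable this is "set noexec_user_stack = 1" in
+ * /etc/system, followed by a reboot.
+ */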
+int noexec_user_stack = 0;
+int noexec_user_stack_log = 1;
+
+int segvn_lpg_disable = 0;
+uint_t segvn_maxpgszc = 0;
+
+ulong_t segvn_fltvnpages_clrszc_err;
+ulong_t segvn_setpgsz_align_err;
+ulong_t segvn_setpgsz_getattr_err;
+ulong_t segvn_setpgsz_eof_err;
+ulong_t segvn_faultvnmpss_align_err1;
+ulong_t segvn_faultvnmpss_align_err2;
+ulong_t segvn_faultvnmpss_align_err3;
+ulong_t segvn_faultvnmpss_align_err4;
+ulong_t segvn_faultvnmpss_align_err5;
+ulong_t segvn_vmpss_pageio_deadlk_err;
+
+/*
+ * Initialize segvn data structures
+ */
+void
+segvn_init(void)
+{
+ uint_t maxszc;
+ uint_t szc;
+ size_t pgsz;
+
+ segvn_cache = kmem_cache_create("segvn_cache",
+ sizeof (struct segvn_data), 0,
+ segvn_cache_constructor, segvn_cache_destructor, NULL,
+ NULL, NULL, 0);
+
+ if (segvn_lpg_disable != 0)
+ return;
+ szc = maxszc = page_num_pagesizes() - 1;
+ if (szc == 0) {
+ segvn_lpg_disable = 1;
+ return;
+ }
+ if (page_get_pagesize(0) != PAGESIZE) {
+ panic("segvn_init: bad szc 0");
+ /*NOTREACHED*/
+ }
+ while (szc != 0) {
+ pgsz = page_get_pagesize(szc);
+ if (pgsz <= PAGESIZE || !IS_P2ALIGNED(pgsz, pgsz)) {
+ panic("segvn_init: bad szc %d", szc);
+ /*NOTREACHED*/
+ }
+ szc--;
+ }
+ if (segvn_maxpgszc == 0 || segvn_maxpgszc > maxszc)
+ segvn_maxpgszc = maxszc;
+}
+
+#define SEGVN_PAGEIO ((void *)0x1)
+#define SEGVN_NOPAGEIO ((void *)0x2)
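+/*
+ * Sentinel values cached in vp->v_mpssdata by segvn_setvnode_mpss() to
+ * record whether the vnode's filesystem supports VOP_PAGEIO(), which the
+ * large-page (MPSS) fault path relies on.
+ */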
+
+static void
+segvn_setvnode_mpss(vnode_t *vp)
+{
+ int err;
+
+ ASSERT(vp->v_mpssdata == NULL ||
+ vp->v_mpssdata == SEGVN_PAGEIO ||
+ vp->v_mpssdata == SEGVN_NOPAGEIO);
+
+ if (vp->v_mpssdata == NULL) {
+ if (vn_vmpss_usepageio(vp)) {
+ err = VOP_PAGEIO(vp, (page_t *)NULL,
+ (u_offset_t)0, 0, 0, CRED());
+ } else {
+ err = ENOSYS;
+ }
+ /*
+ * set v_mpssdata just once per vnode life
+ * so that it never changes.
+ */
+ mutex_enter(&vp->v_lock);
+ if (vp->v_mpssdata == NULL) {
+ if (err == EINVAL) {
+ vp->v_mpssdata = SEGVN_PAGEIO;
+ } else {
+ vp->v_mpssdata = SEGVN_NOPAGEIO;
+ }
+ }
+ mutex_exit(&vp->v_lock);
+ }
+}
+
+int
+segvn_create(struct seg *seg, void *argsp)
+{
+ struct segvn_crargs *a = (struct segvn_crargs *)argsp;
+ struct segvn_data *svd;
+ size_t swresv = 0;
+ struct cred *cred;
+ struct anon_map *amp;
+ int error = 0;
+ size_t pgsz;
+ lgrp_mem_policy_t mpolicy = LGRP_MEM_POLICY_DEFAULT;
+
+
+ ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ if (a->type != MAP_PRIVATE && a->type != MAP_SHARED) {
+ panic("segvn_create type");
+ /*NOTREACHED*/
+ }
+
+ /*
+ * Check arguments. If a shared anon structure is given then
+ * it is illegal to also specify a vp.
+ */
+ if (a->amp != NULL && a->vp != NULL) {
+ panic("segvn_create anon_map");
+ /*NOTREACHED*/
+ }
+
+ /* MAP_NORESERVE on a MAP_SHARED segment is meaningless. */
+ if (a->type == MAP_SHARED)
+ a->flags &= ~MAP_NORESERVE;
+
+ if (a->szc != 0) {
+ if (segvn_lpg_disable != 0 || a->amp != NULL ||
+ (a->type == MAP_SHARED && a->vp == NULL) ||
+ (a->flags & MAP_NORESERVE) || seg->s_as == &kas) {
+ a->szc = 0;
+ } else {
+ if (a->szc > segvn_maxpgszc)
+ a->szc = segvn_maxpgszc;
+ pgsz = page_get_pagesize(a->szc);
+ if (!IS_P2ALIGNED(seg->s_base, pgsz) ||
+ !IS_P2ALIGNED(seg->s_size, pgsz)) {
+ a->szc = 0;
+ } else if (a->vp != NULL) {
+ extern struct vnode kvp;
+ if (IS_SWAPFSVP(a->vp) || a->vp == &kvp) {
+ /*
+ * paranoid check.
+ * hat_page_demote() is not supported
+ * on swapfs pages.
+ */
+ a->szc = 0;
+ } else if (map_addr_vacalign_check(seg->s_base,
+ a->offset & PAGEMASK)) {
+ a->szc = 0;
+ }
+ }
+ }
+ }
+
+ /*
+ * If segment may need private pages, reserve them now.
+ */
+ if (!(a->flags & MAP_NORESERVE) && ((a->vp == NULL && a->amp == NULL) ||
+ (a->type == MAP_PRIVATE && (a->prot & PROT_WRITE)))) {
+ if (anon_resv(seg->s_size) == 0)
+ return (EAGAIN);
+ swresv = seg->s_size;
+ TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
+ seg, swresv, 1);
+ }
+
+ /*
+ * Reserve any mapping structures that may be required.
+ */
+ hat_map(seg->s_as->a_hat, seg->s_base, seg->s_size, HAT_MAP);
+
+ if (a->cred) {
+ cred = a->cred;
+ crhold(cred);
+ } else {
+ crhold(cred = CRED());
+ }
+
+ /* Inform the vnode of the new mapping */
+ if (a->vp) {
+ error = VOP_ADDMAP(a->vp, a->offset & PAGEMASK,
+ seg->s_as, seg->s_base, seg->s_size, a->prot,
+ a->maxprot, a->type, cred);
+ if (error) {
+ if (swresv != 0) {
+ anon_unresv(swresv);
+ TRACE_3(TR_FAC_VM, TR_ANON_PROC,
+ "anon proc:%p %lu %u",
+ seg, swresv, 0);
+ }
+ crfree(cred);
+ hat_unload(seg->s_as->a_hat, seg->s_base,
+ seg->s_size, HAT_UNLOAD_UNMAP);
+ return (error);
+ }
+ }
+
+ /*
+ * If more than one segment in the address space, and
+ * they're adjacent virtually, try to concatenate them.
+ * Don't concatenate if an explicit anon_map structure
+ * was supplied (e.g., SystemV shared memory).
+ */
+ if (a->amp == NULL) {
+ struct seg *pseg, *nseg;
+ struct segvn_data *psvd, *nsvd;
+ lgrp_mem_policy_t ppolicy, npolicy;
+ uint_t lgrp_mem_policy_flags = 0;
+ extern lgrp_mem_policy_t lgrp_mem_default_policy;
+
+ /*
+ * Memory policy flags (lgrp_mem_policy_flags) is valid when
+ * extending stack/heap segments.
+ */
+ if ((a->vp == NULL) && (a->type == MAP_PRIVATE) &&
+ !(a->flags & MAP_NORESERVE) && (seg->s_as != &kas)) {
+ lgrp_mem_policy_flags = a->lgrp_mem_policy_flags;
+ } else {
+ /*
+ * Get policy when not extending it from another segment
+ */
+ mpolicy = lgrp_mem_policy_default(seg->s_size, a->type);
+ }
+
+ /*
+ * First, try to concatenate the previous and new segments
+ */
+ pseg = AS_SEGPREV(seg->s_as, seg);
+ if (pseg != NULL &&
+ pseg->s_base + pseg->s_size == seg->s_base &&
+ pseg->s_ops == &segvn_ops) {
+ /*
+ * Get memory allocation policy from previous segment.
+ * When extension is specified (e.g. for heap) apply
+ * this policy to the new segment regardless of the
+ * outcome of segment concatenation. Extension occurs
+			 * for a non-default policy; otherwise the default
+			 * policy is used, based on the extended segment size.
+ */
+ psvd = (struct segvn_data *)pseg->s_data;
+ ppolicy = psvd->policy_info.mem_policy;
+ if (lgrp_mem_policy_flags ==
+ LGRP_MP_FLAG_EXTEND_UP) {
+ if (ppolicy != lgrp_mem_default_policy) {
+ mpolicy = ppolicy;
+ } else {
+ mpolicy = lgrp_mem_policy_default(
+ pseg->s_size + seg->s_size,
+ a->type);
+ }
+ }
+
+ if (mpolicy == ppolicy &&
+ (pseg->s_size + seg->s_size <=
+ segvn_comb_thrshld || psvd->amp == NULL) &&
+ segvn_extend_prev(pseg, seg, a, swresv) == 0) {
+ /*
+ * success! now try to concatenate
+ * with following seg
+ */
+ crfree(cred);
+ nseg = AS_SEGNEXT(pseg->s_as, pseg);
+ if (nseg != NULL &&
+ nseg != pseg &&
+ nseg->s_ops == &segvn_ops &&
+ pseg->s_base + pseg->s_size ==
+ nseg->s_base)
+ (void) segvn_concat(pseg, nseg, 0);
+ ASSERT(pseg->s_szc == 0 ||
+ (a->szc == pseg->s_szc &&
+ IS_P2ALIGNED(pseg->s_base, pgsz) &&
+ IS_P2ALIGNED(pseg->s_size, pgsz)));
+ return (0);
+ }
+ }
+
+ /*
+ * Failed, so try to concatenate with following seg
+ */
+ nseg = AS_SEGNEXT(seg->s_as, seg);
+ if (nseg != NULL &&
+ seg->s_base + seg->s_size == nseg->s_base &&
+ nseg->s_ops == &segvn_ops) {
+ /*
+ * Get memory allocation policy from next segment.
+ * When extension is specified (e.g. for stack) apply
+ * this policy to the new segment regardless of the
+ * outcome of segment concatenation. Extension occurs
+			 * for a non-default policy; otherwise the default
+			 * policy is used, based on the extended segment size.
+ */
+ nsvd = (struct segvn_data *)nseg->s_data;
+ npolicy = nsvd->policy_info.mem_policy;
+ if (lgrp_mem_policy_flags ==
+ LGRP_MP_FLAG_EXTEND_DOWN) {
+ if (npolicy != lgrp_mem_default_policy) {
+ mpolicy = npolicy;
+ } else {
+ mpolicy = lgrp_mem_policy_default(
+ nseg->s_size + seg->s_size,
+ a->type);
+ }
+ }
+
+ if (mpolicy == npolicy &&
+ segvn_extend_next(seg, nseg, a, swresv) == 0) {
+ crfree(cred);
+ ASSERT(nseg->s_szc == 0 ||
+ (a->szc == nseg->s_szc &&
+ IS_P2ALIGNED(nseg->s_base, pgsz) &&
+ IS_P2ALIGNED(nseg->s_size, pgsz)));
+ return (0);
+ }
+ }
+ }
+
+ if (a->vp != NULL) {
+ VN_HOLD(a->vp);
+ if (a->type == MAP_SHARED)
+ lgrp_shm_policy_init(NULL, a->vp);
+ }
+ svd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
+
+ seg->s_ops = &segvn_ops;
+ seg->s_data = (void *)svd;
+ seg->s_szc = a->szc;
+
+ svd->vp = a->vp;
+ /*
+ * Anonymous mappings have no backing file so the offset is meaningless.
+ */
+ svd->offset = a->vp ? (a->offset & PAGEMASK) : 0;
+ svd->prot = a->prot;
+ svd->maxprot = a->maxprot;
+ svd->pageprot = 0;
+ svd->type = a->type;
+ svd->vpage = NULL;
+ svd->cred = cred;
+ svd->advice = MADV_NORMAL;
+ svd->pageadvice = 0;
+ svd->flags = (ushort_t)a->flags;
+ svd->softlockcnt = 0;
+ if (a->szc != 0 && a->vp != NULL) {
+ segvn_setvnode_mpss(a->vp);
+ }
+
+ amp = a->amp;
+ if ((svd->amp = amp) == NULL) {
+ svd->anon_index = 0;
+ if (svd->type == MAP_SHARED) {
+ svd->swresv = 0;
+ /*
+ * Shared mappings to a vp need no other setup.
+ * If we have a shared mapping to an anon_map object
+ * which hasn't been allocated yet, allocate the
+ * struct now so that it will be properly shared
+ * by remembering the swap reservation there.
+ */
+ if (a->vp == NULL) {
+ svd->amp = anonmap_alloc(seg->s_size, swresv);
+ svd->amp->a_szc = seg->s_szc;
+ }
+ } else {
+ /*
+ * Private mapping (with or without a vp).
+ * Allocate anon_map when needed.
+ */
+ svd->swresv = swresv;
+ }
+ } else {
+ pgcnt_t anon_num;
+
+ /*
+ * Mapping to an existing anon_map structure without a vp.
+		 * For now we will ensure that the segment size isn't larger
+ * than the size - offset gives us. Later on we may wish to
+ * have the anon array dynamically allocated itself so that
+ * we don't always have to allocate all the anon pointer slots.
+ * This of course involves adding extra code to check that we
+ * aren't trying to use an anon pointer slot beyond the end
+ * of the currently allocated anon array.
+ */
+ if ((amp->size - a->offset) < seg->s_size) {
+ panic("segvn_create anon_map size");
+ /*NOTREACHED*/
+ }
+
+ anon_num = btopr(a->offset);
+
+ if (a->type == MAP_SHARED) {
+ /*
+ * SHARED mapping to a given anon_map.
+ */
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
+ amp->refcnt++;
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ svd->anon_index = anon_num;
+ svd->swresv = 0;
+ } else {
+ /*
+ * PRIVATE mapping to a given anon_map.
+ * Make sure that all the needed anon
+ * structures are created (so that we will
+ * share the underlying pages if nothing
+ * is written by this mapping) and then
+ * duplicate the anon array as is done
+ * when a privately mapped segment is dup'ed.
+ */
+ struct anon *ap;
+ caddr_t addr;
+ caddr_t eaddr;
+ ulong_t anon_idx;
+ int hat_flag = HAT_LOAD;
+
+ if (svd->flags & MAP_TEXT) {
+ hat_flag |= HAT_LOAD_TEXT;
+ }
+
+ svd->amp = anonmap_alloc(seg->s_size, 0);
+ svd->amp->a_szc = seg->s_szc;
+ svd->anon_index = 0;
+ svd->swresv = swresv;
+
+ /*
+ * Prevent 2 threads from allocating anon
+ * slots simultaneously.
+ */
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
+ eaddr = seg->s_base + seg->s_size;
+
+ for (anon_idx = anon_num, addr = seg->s_base;
+ addr < eaddr; addr += PAGESIZE, anon_idx++) {
+ page_t *pp;
+
+ if ((ap = anon_get_ptr(amp->ahp,
+ anon_idx)) != NULL)
+ continue;
+
+ /*
+ * Allocate the anon struct now.
+ * Might as well load up translation
+ * to the page while we're at it...
+ */
+ pp = anon_zero(seg, addr, &ap, cred);
+ if (ap == NULL || pp == NULL) {
+ panic("segvn_create anon_zero");
+ /*NOTREACHED*/
+ }
+
+ /*
+ * Re-acquire the anon_map lock and
+ * initialize the anon array entry.
+ */
+ ASSERT(anon_get_ptr(amp->ahp,
+ anon_idx) == NULL);
+ (void) anon_set_ptr(amp->ahp, anon_idx, ap,
+ ANON_SLEEP);
+
+ ASSERT(seg->s_szc == 0);
+ ASSERT(!IS_VMODSORT(pp->p_vnode));
+
+ hat_memload(seg->s_as->a_hat, addr, pp,
+ svd->prot & ~PROT_WRITE, hat_flag);
+
+ page_unlock(pp);
+ }
+ ASSERT(seg->s_szc == 0);
+ anon_dup(amp->ahp, anon_num, svd->amp->ahp,
+ 0, seg->s_size);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ }
+ }
+
+ /*
+ * Set default memory allocation policy for segment
+ *
+ * Always set policy for private memory at least for initialization
+ * even if this is a shared memory segment
+ */
+ (void) lgrp_privm_policy_set(mpolicy, &svd->policy_info, seg->s_size);
+
+ if (svd->type == MAP_SHARED)
+ (void) lgrp_shm_policy_set(mpolicy, svd->amp, svd->anon_index,
+ svd->vp, svd->offset, seg->s_size);
+
+ return (0);
+}
+
+/*
+ * Concatenate two existing segments, if possible.
+ * Return 0 on success, -1 if two segments are not compatible
+ * or -2 on memory allocation failure.
+ * If private == 1 then try and concat segments with private pages.
+ */
+static int
+segvn_concat(struct seg *seg1, struct seg *seg2, int private)
+{
+ struct segvn_data *svd1 = seg1->s_data;
+ struct segvn_data *svd2 = seg2->s_data;
+ struct anon_map *amp1 = svd1->amp;
+ struct anon_map *amp2 = svd2->amp;
+ struct vpage *vpage1 = svd1->vpage;
+ struct vpage *vpage2 = svd2->vpage, *nvpage = NULL;
+ size_t size, nvpsize;
+ pgcnt_t npages1, npages2;
+
+ ASSERT(seg1->s_as && seg2->s_as && seg1->s_as == seg2->s_as);
+ ASSERT(AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock));
+ ASSERT(seg1->s_ops == seg2->s_ops);
+
+ /* both segments exist, try to merge them */
+#define incompat(x) (svd1->x != svd2->x)
+ if (incompat(vp) || incompat(maxprot) ||
+ (!svd1->pageadvice && !svd2->pageadvice && incompat(advice)) ||
+ (!svd1->pageprot && !svd2->pageprot && incompat(prot)) ||
+ incompat(type) || incompat(cred) || incompat(flags) ||
+ seg1->s_szc != seg2->s_szc || incompat(policy_info.mem_policy) ||
+ (svd2->softlockcnt > 0))
+ return (-1);
+#undef incompat
+
+ /*
+ * vp == NULL implies zfod, offset doesn't matter
+ */
+ if (svd1->vp != NULL &&
+ svd1->offset + seg1->s_size != svd2->offset) {
+ return (-1);
+ }
+
+ /*
+ * Fail early if we're not supposed to concatenate
+ * private pages.
+ */
+ if ((private == 0 || svd1->type != MAP_PRIVATE) &&
+ (amp1 != NULL || amp2 != NULL)) {
+ return (-1);
+ }
+
+ /*
+ * If either seg has vpages, create a new merged vpage array.
+ */
+ if (vpage1 != NULL || vpage2 != NULL) {
+ struct vpage *vp;
+
+ npages1 = seg_pages(seg1);
+ npages2 = seg_pages(seg2);
+ nvpsize = vpgtob(npages1 + npages2);
+
+ if ((nvpage = kmem_zalloc(nvpsize, KM_NOSLEEP)) == NULL) {
+ return (-2);
+ }
+ if (vpage1 != NULL) {
+ bcopy(vpage1, nvpage, vpgtob(npages1));
+ }
+ if (vpage2 != NULL) {
+ bcopy(vpage2, nvpage + npages1, vpgtob(npages2));
+ }
+ for (vp = nvpage; vp < nvpage + npages1; vp++) {
+ if (svd2->pageprot && !svd1->pageprot) {
+ VPP_SETPROT(vp, svd1->prot);
+ }
+ if (svd2->pageadvice && !svd1->pageadvice) {
+ VPP_SETADVICE(vp, svd1->advice);
+ }
+ }
+ for (vp = nvpage + npages1;
+ vp < nvpage + npages1 + npages2; vp++) {
+ if (svd1->pageprot && !svd2->pageprot) {
+ VPP_SETPROT(vp, svd2->prot);
+ }
+ if (svd1->pageadvice && !svd2->pageadvice) {
+ VPP_SETADVICE(vp, svd2->advice);
+ }
+ }
+ }
+
+ /*
+ * If either segment has private pages, create a new merged anon
+ * array.
+ */
+ if (amp1 != NULL || amp2 != NULL) {
+ struct anon_hdr *nahp;
+ struct anon_map *namp = NULL;
+ size_t asize = seg1->s_size + seg2->s_size;
+
+ if ((nahp = anon_create(btop(asize), ANON_NOSLEEP)) == NULL) {
+ if (nvpage != NULL) {
+ kmem_free(nvpage, nvpsize);
+ }
+ return (-2);
+ }
+ if (amp1 != NULL) {
+ /*
+ * XXX anon rwlock is not really needed because
+ * this is a private segment and we are writers.
+ */
+ ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
+ ASSERT(amp1->refcnt == 1);
+ if (anon_copy_ptr(amp1->ahp, svd1->anon_index,
+ nahp, 0, btop(seg1->s_size), ANON_NOSLEEP)) {
+ anon_release(nahp, btop(asize));
+ ANON_LOCK_EXIT(&amp1->a_rwlock);
+ if (nvpage != NULL) {
+ kmem_free(nvpage, nvpsize);
+ }
+ return (-2);
+ }
+ }
+ if (amp2 != NULL) {
+ ANON_LOCK_ENTER(&amp2->a_rwlock, RW_WRITER);
+ ASSERT(amp2->refcnt == 1);
+ if (anon_copy_ptr(amp2->ahp, svd2->anon_index,
+ nahp, btop(seg1->s_size), btop(seg2->s_size),
+ ANON_NOSLEEP)) {
+ anon_release(nahp, btop(asize));
+ ANON_LOCK_EXIT(&amp2->a_rwlock);
+ if (amp1 != NULL) {
+ ANON_LOCK_EXIT(&amp1->a_rwlock);
+ }
+ if (nvpage != NULL) {
+ kmem_free(nvpage, nvpsize);
+ }
+ return (-2);
+ }
+ }
+ if (amp1 != NULL) {
+ namp = amp1;
+ anon_release(amp1->ahp, btop(amp1->size));
+ }
+ if (amp2 != NULL) {
+ if (namp == NULL) {
+ ASSERT(amp1 == NULL);
+ namp = amp2;
+ anon_release(amp2->ahp, btop(amp2->size));
+ } else {
+ amp2->refcnt--;
+ ANON_LOCK_EXIT(&amp2->a_rwlock);
+ anonmap_free(amp2);
+ }
+ svd2->amp = NULL; /* needed for seg_free */
+ }
+ namp->ahp = nahp;
+ namp->size = asize;
+ svd1->amp = namp;
+ svd1->anon_index = 0;
+ ANON_LOCK_EXIT(&namp->a_rwlock);
+ }
+ /*
+ * Now free the old vpage structures.
+ */
+ if (nvpage != NULL) {
+ if (vpage1 != NULL) {
+ kmem_free(vpage1, vpgtob(npages1));
+ }
+ if (vpage2 != NULL) {
+ svd2->vpage = NULL;
+ kmem_free(vpage2, vpgtob(npages2));
+ }
+ if (svd2->pageprot) {
+ svd1->pageprot = 1;
+ }
+ if (svd2->pageadvice) {
+ svd1->pageadvice = 1;
+ }
+ svd1->vpage = nvpage;
+ }
+
+ /* all looks ok, merge segments */
+ svd1->swresv += svd2->swresv;
+ svd2->swresv = 0; /* so seg_free doesn't release swap space */
+ size = seg2->s_size;
+ seg_free(seg2);
+ seg1->s_size += size;
+ return (0);
+}
+
+/*
+ * Extend the previous segment (seg1) to include the
+ * new segment (seg2 + a), if possible.
+ * Return 0 on success.
+ */
+static int
+segvn_extend_prev(
+	struct seg *seg1,
+	struct seg *seg2,
+	struct segvn_crargs *a,
+	size_t swresv)
+{
+ struct segvn_data *svd1 = (struct segvn_data *)seg1->s_data;
+ size_t size;
+ struct anon_map *amp1;
+ struct vpage *new_vpage;
+
+ /*
+ * We don't need any segment level locks for "segvn" data
+ * since the address space is "write" locked.
+ */
+ ASSERT(seg1->s_as && AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock));
+
+ /* second segment is new, try to extend first */
+ /* XXX - should also check cred */
+ if (svd1->vp != a->vp || svd1->maxprot != a->maxprot ||
+ (!svd1->pageprot && (svd1->prot != a->prot)) ||
+ svd1->type != a->type || svd1->flags != a->flags ||
+ seg1->s_szc != a->szc)
+ return (-1);
+
+ /* vp == NULL implies zfod, offset doesn't matter */
+ if (svd1->vp != NULL &&
+ svd1->offset + seg1->s_size != (a->offset & PAGEMASK))
+ return (-1);
+
+ amp1 = svd1->amp;
+ if (amp1) {
+ pgcnt_t newpgs;
+
+ /*
+ * Segment has private pages, can data structures
+ * be expanded?
+ *
+ * Acquire the anon_map lock to prevent it from changing,
+ * if it is shared. This ensures that the anon_map
+ * will not change while a thread which has a read/write
+ * lock on an address space references it.
+ * XXX - Don't need the anon_map lock at all if "refcnt"
+ * is 1.
+ *
+ * Can't grow a MAP_SHARED segment with an anonmap because
+ * there may be existing anon slots where we want to extend
+ * the segment and we wouldn't know what to do with them
+		 * (e.g., for tmpfs the right thing is to just leave them there,
+ * for /dev/zero they should be cleared out).
+ */
+ if (svd1->type == MAP_SHARED)
+ return (-1);
+
+ ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
+ if (amp1->refcnt > 1) {
+ ANON_LOCK_EXIT(&amp1->a_rwlock);
+ return (-1);
+ }
+ newpgs = anon_grow(amp1->ahp, &svd1->anon_index,
+ btop(seg1->s_size), btop(seg2->s_size), ANON_NOSLEEP);
+
+ if (newpgs == 0) {
+ ANON_LOCK_EXIT(&amp1->a_rwlock);
+ return (-1);
+ }
+ amp1->size = ptob(newpgs);
+ ANON_LOCK_EXIT(&amp1->a_rwlock);
+ }
+ if (svd1->vpage != NULL) {
+ new_vpage =
+ kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)),
+ KM_NOSLEEP);
+ if (new_vpage == NULL)
+ return (-1);
+ bcopy(svd1->vpage, new_vpage, vpgtob(seg_pages(seg1)));
+ kmem_free(svd1->vpage, vpgtob(seg_pages(seg1)));
+ svd1->vpage = new_vpage;
+ if (svd1->pageprot) {
+ struct vpage *vp, *evp;
+
+ vp = new_vpage + seg_pages(seg1);
+ evp = vp + seg_pages(seg2);
+ for (; vp < evp; vp++)
+ VPP_SETPROT(vp, a->prot);
+ }
+ }
+ size = seg2->s_size;
+ seg_free(seg2);
+ seg1->s_size += size;
+ svd1->swresv += swresv;
+ return (0);
+}
+
+/*
+ * Extend the next segment (seg2) to include the
+ * new segment (seg1 + a), if possible.
+ * Return 0 on success.
+ */
+static int
+segvn_extend_next(
+ struct seg *seg1,
+ struct seg *seg2,
+ struct segvn_crargs *a,
+ size_t swresv)
+{
+ struct segvn_data *svd2 = (struct segvn_data *)seg2->s_data;
+ size_t size;
+ struct anon_map *amp2;
+ struct vpage *new_vpage;
+
+ /*
+ * We don't need any segment level locks for "segvn" data
+ * since the address space is "write" locked.
+ */
+ ASSERT(seg2->s_as && AS_WRITE_HELD(seg2->s_as, &seg2->s_as->a_lock));
+
+ /* first segment is new, try to extend second */
+ /* XXX - should also check cred */
+ if (svd2->vp != a->vp || svd2->maxprot != a->maxprot ||
+ (!svd2->pageprot && (svd2->prot != a->prot)) ||
+ svd2->type != a->type || svd2->flags != a->flags ||
+ seg2->s_szc != a->szc)
+ return (-1);
+ /* vp == NULL implies zfod, offset doesn't matter */
+ if (svd2->vp != NULL &&
+ (a->offset & PAGEMASK) + seg1->s_size != svd2->offset)
+ return (-1);
+
+ amp2 = svd2->amp;
+ if (amp2) {
+ pgcnt_t newpgs;
+
+ /*
+ * Segment has private pages, can data structures
+ * be expanded?
+ *
+ * Acquire the anon_map lock to prevent it from changing,
+ * if it is shared. This ensures that the anon_map
+ * will not change while a thread which has a read/write
+ * lock on an address space references it.
+ *
+ * XXX - Don't need the anon_map lock at all if "refcnt"
+ * is 1.
+ */
+ if (svd2->type == MAP_SHARED)
+ return (-1);
+
+ ANON_LOCK_ENTER(&amp2->a_rwlock, RW_WRITER);
+ if (amp2->refcnt > 1) {
+ ANON_LOCK_EXIT(&amp2->a_rwlock);
+ return (-1);
+ }
+ newpgs = anon_grow(amp2->ahp, &svd2->anon_index,
+ btop(seg2->s_size), btop(seg1->s_size),
+ ANON_NOSLEEP | ANON_GROWDOWN);
+
+ if (newpgs == 0) {
+ ANON_LOCK_EXIT(&amp2->a_rwlock);
+ return (-1);
+ }
+ amp2->size = ptob(newpgs);
+ ANON_LOCK_EXIT(&amp2->a_rwlock);
+ }
+ if (svd2->vpage != NULL) {
+ new_vpage =
+ kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)),
+ KM_NOSLEEP);
+ if (new_vpage == NULL) {
+ /* Not merging segments so adjust anon_index back */
+ if (amp2)
+ svd2->anon_index += seg_pages(seg1);
+ return (-1);
+ }
+ bcopy(svd2->vpage, new_vpage + seg_pages(seg1),
+ vpgtob(seg_pages(seg2)));
+ kmem_free(svd2->vpage, vpgtob(seg_pages(seg2)));
+ svd2->vpage = new_vpage;
+ if (svd2->pageprot) {
+ struct vpage *vp, *evp;
+
+ vp = new_vpage;
+ evp = vp + seg_pages(seg1);
+ for (; vp < evp; vp++)
+ VPP_SETPROT(vp, a->prot);
+ }
+ }
+ size = seg1->s_size;
+ seg_free(seg1);
+ seg2->s_size += size;
+ seg2->s_base -= size;
+ svd2->offset -= size;
+ svd2->swresv += swresv;
+ return (0);
+}
+
+static int
+segvn_dup(struct seg *seg, struct seg *newseg)
+{
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+ struct segvn_data *newsvd;
+ pgcnt_t npages = seg_pages(seg);
+ int error = 0;
+ uint_t prot;
+ size_t len;
+
+ ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ /*
+ * If segment has anon reserved, reserve more for the new seg.
+ * For a MAP_NORESERVE segment swresv will be a count of all the
+ * allocated anon slots; thus we reserve for the child as many slots
+ * as the parent has allocated. This semantic prevents the child or
+ * parent from dieing during a copy-on-write fault caused by trying
+ * to write a shared pre-existing anon page.
+ */
+ if ((len = svd->swresv) != 0) {
+ if (anon_resv(svd->swresv) == 0)
+ return (ENOMEM);
+
+ TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
+ seg, len, 0);
+ }
+
+ newsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
+
+ newseg->s_ops = &segvn_ops;
+ newseg->s_data = (void *)newsvd;
+ newseg->s_szc = seg->s_szc;
+
+ if ((newsvd->vp = svd->vp) != NULL) {
+ VN_HOLD(svd->vp);
+ if (svd->type == MAP_SHARED)
+ lgrp_shm_policy_init(NULL, svd->vp);
+ }
+ newsvd->offset = svd->offset;
+ newsvd->prot = svd->prot;
+ newsvd->maxprot = svd->maxprot;
+ newsvd->pageprot = svd->pageprot;
+ newsvd->type = svd->type;
+ newsvd->cred = svd->cred;
+ crhold(newsvd->cred);
+ newsvd->advice = svd->advice;
+ newsvd->pageadvice = svd->pageadvice;
+ newsvd->swresv = svd->swresv;
+ newsvd->flags = svd->flags;
+ newsvd->softlockcnt = 0;
+ newsvd->policy_info = svd->policy_info;
+ if ((newsvd->amp = svd->amp) == NULL) {
+ /*
+ * Not attaching to a shared anon object.
+ */
+ newsvd->anon_index = 0;
+ } else {
+ struct anon_map *amp;
+
+ amp = svd->amp;
+ if (svd->type == MAP_SHARED) {
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
+ amp->refcnt++;
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ newsvd->anon_index = svd->anon_index;
+ } else {
+ int reclaim = 1;
+
+ /*
+ * Allocate and initialize new anon_map structure.
+ */
+ newsvd->amp = anonmap_alloc(newseg->s_size, 0);
+ newsvd->amp->a_szc = newseg->s_szc;
+ newsvd->anon_index = 0;
+
+ /*
+ * We don't have to acquire the anon_map lock
+ * for the new segment (since it belongs to an
+ * address space that is still not associated
+ * with any process), or the segment in the old
+ * address space (since all threads in it
+ * are stopped while duplicating the address space).
+ */
+
+ /*
+ * The goal of the following code is to make sure that
+ * softlocked pages do not end up as copy on write
+ * pages. This would cause problems where one
+ * thread writes to a page that is COW and a different
+ * thread in the same process has softlocked it. The
+ * softlock lock would move away from this process
+ * because the write would cause this process to get
+ * a copy (without the softlock).
+ *
+ * The strategy here is to just break the
+ * sharing on pages that could possibly be
+ * softlocked.
+ */
+retry:
+ if (svd->softlockcnt) {
+ struct anon *ap, *newap;
+ size_t i;
+ uint_t vpprot;
+ page_t *anon_pl[1+1], *pp;
+ caddr_t addr;
+ ulong_t anon_idx = 0;
+
+ /*
+				 * The softlock count might be nonzero
+ * because some pages are still stuck in the
+ * cache for lazy reclaim. Flush the cache
+ * now. This should drop the count to zero.
+ * [or there is really I/O going on to these
+ * pages]. Note, we have the writers lock so
+ * nothing gets inserted during the flush.
+ */
+ if (reclaim == 1) {
+ segvn_purge(seg);
+ reclaim = 0;
+ goto retry;
+ }
+ i = btopr(seg->s_size);
+ addr = seg->s_base;
+ /*
+ * XXX break cow sharing using PAGESIZE
+ * pages. They will be relocated into larger
+ * pages at fault time.
+ */
+ while (i-- > 0) {
+ if (ap = anon_get_ptr(amp->ahp,
+ anon_idx)) {
+ error = anon_getpage(&ap,
+ &vpprot, anon_pl, PAGESIZE,
+ seg, addr, S_READ,
+ svd->cred);
+ if (error) {
+ newsvd->vpage = NULL;
+ goto out;
+ }
+ /*
+ * prot need not be computed
+					 * below because anon_private is
+ * going to ignore it anyway
+ * as child doesn't inherit
+ * pagelock from parent.
+ */
+ prot = svd->pageprot ?
+ VPP_PROT(
+ &svd->vpage[
+ seg_page(seg, addr)])
+ : svd->prot;
+ pp = anon_private(&newap,
+ newseg, addr, prot,
+ anon_pl[0], 0,
+ newsvd->cred);
+ if (pp == NULL) {
+ /* no mem abort */
+ newsvd->vpage = NULL;
+ error = ENOMEM;
+ goto out;
+ }
+ (void) anon_set_ptr(
+ newsvd->amp->ahp, anon_idx,
+ newap, ANON_SLEEP);
+ page_unlock(pp);
+ }
+ addr += PAGESIZE;
+ anon_idx++;
+ }
+ } else { /* common case */
+ if (seg->s_szc != 0) {
+ /*
+ * If at least one of anon slots of a
+ * large page exists then make sure
+ * all anon slots of a large page
+ * exist to avoid partial cow sharing
+ * of a large page in the future.
+ */
+ anon_dup_fill_holes(amp->ahp,
+ svd->anon_index, newsvd->amp->ahp,
+ 0, seg->s_size, seg->s_szc,
+ svd->vp != NULL);
+ } else {
+ anon_dup(amp->ahp, svd->anon_index,
+ newsvd->amp->ahp, 0, seg->s_size);
+ }
+
+ hat_clrattr(seg->s_as->a_hat, seg->s_base,
+ seg->s_size, PROT_WRITE);
+ }
+ }
+ }
+ /*
+ * If necessary, create a vpage structure for the new segment.
+ * Do not copy any page lock indications.
+ */
+ if (svd->vpage != NULL) {
+ uint_t i;
+ struct vpage *ovp = svd->vpage;
+ struct vpage *nvp;
+
+ nvp = newsvd->vpage =
+ kmem_alloc(vpgtob(npages), KM_SLEEP);
+ for (i = 0; i < npages; i++) {
+ *nvp = *ovp++;
+ VPP_CLRPPLOCK(nvp++);
+ }
+ } else
+ newsvd->vpage = NULL;
+
+ /* Inform the vnode of the new mapping */
+ if (newsvd->vp != NULL) {
+ error = VOP_ADDMAP(newsvd->vp, (offset_t)newsvd->offset,
+ newseg->s_as, newseg->s_base, newseg->s_size, newsvd->prot,
+ newsvd->maxprot, newsvd->type, newsvd->cred);
+ }
+out:
+ return (error);
+}
+
+
+/*
+ * callback function used by segvn_unmap to invoke free_vp_pages() for only
+ * those pages actually processed by the HAT
+ */
+extern int free_pages;
+
+static void
+segvn_hat_unload_callback(hat_callback_t *cb)
+{
+ struct seg *seg = cb->hcb_data;
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+ size_t len;
+ u_offset_t off;
+
+ ASSERT(svd->vp != NULL);
+ ASSERT(cb->hcb_end_addr > cb->hcb_start_addr);
+ ASSERT(cb->hcb_start_addr >= seg->s_base);
+
+ len = cb->hcb_end_addr - cb->hcb_start_addr;
+ off = cb->hcb_start_addr - seg->s_base;
+ free_vp_pages(svd->vp, svd->offset + off, len);
+}
+
+
+static int
+segvn_unmap(struct seg *seg, caddr_t addr, size_t len)
+{
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+ struct segvn_data *nsvd;
+ struct seg *nseg;
+ struct anon_map *amp;
+ pgcnt_t opages; /* old segment size in pages */
+ pgcnt_t npages; /* new segment size in pages */
+ pgcnt_t dpages; /* pages being deleted (unmapped) */
+ hat_callback_t callback; /* used for free_vp_pages() */
+ hat_callback_t *cbp = NULL;
+ caddr_t nbase;
+ size_t nsize;
+ size_t oswresv;
+ int reclaim = 1;
+
+ /*
+ * We don't need any segment level locks for "segvn" data
+ * since the address space is "write" locked.
+ */
+ ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ /*
+ * Fail the unmap if pages are SOFTLOCKed through this mapping.
+ * softlockcnt is protected from change by the as write lock.
+ */
+retry:
+ if (svd->softlockcnt > 0) {
+ /*
+ * since we do have the writers lock nobody can fill
+ * the cache during the purge. The flush either succeeds
+ * or we still have pending I/Os.
+ */
+ if (reclaim == 1) {
+ segvn_purge(seg);
+ reclaim = 0;
+ goto retry;
+ }
+ return (EAGAIN);
+ }
+
+ /*
+ * Check for bad sizes
+ */
+ if (addr < seg->s_base || addr + len > seg->s_base + seg->s_size ||
+ (len & PAGEOFFSET) || ((uintptr_t)addr & PAGEOFFSET)) {
+ panic("segvn_unmap");
+ /*NOTREACHED*/
+ }
+
+ if (seg->s_szc != 0) {
+ size_t pgsz = page_get_pagesize(seg->s_szc);
+ int err;
+ if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) {
+ ASSERT(seg->s_base != addr || seg->s_size != len);
+ VM_STAT_ADD(segvnvmstats.demoterange[0]);
+ err = segvn_demote_range(seg, addr, len, SDR_END);
+ if (err == 0) {
+ return (IE_RETRY);
+ }
+ return (err);
+ }
+ }
+
+ /* Inform the vnode of the unmapping. */
+ if (svd->vp) {
+ int error;
+
+ error = VOP_DELMAP(svd->vp,
+ (offset_t)svd->offset + (uintptr_t)(addr - seg->s_base),
+ seg->s_as, addr, len, svd->prot, svd->maxprot,
+ svd->type, svd->cred);
+
+ if (error == EAGAIN)
+ return (error);
+ }
+ /*
+ * Remove any page locks set through this mapping.
+ */
+ (void) segvn_lockop(seg, addr, len, 0, MC_UNLOCK, NULL, 0);
+
+ /*
+ * Unload any hardware translations in the range to be taken out.
+ * Use a callback to invoke free_vp_pages() effectively.
+ */
+ if (svd->vp != NULL && free_pages != 0) {
+ callback.hcb_data = seg;
+ callback.hcb_function = segvn_hat_unload_callback;
+ cbp = &callback;
+ }
+ hat_unload_callback(seg->s_as->a_hat, addr, len, HAT_UNLOAD_UNMAP, cbp);
+
+ /*
+ * Check for entire segment
+ */
+ if (addr == seg->s_base && len == seg->s_size) {
+ seg_free(seg);
+ return (0);
+ }
+
+ opages = seg_pages(seg);
+ dpages = btop(len);
+ npages = opages - dpages;
+ amp = svd->amp;
+
+ /*
+ * Check for beginning of segment
+ */
+ if (addr == seg->s_base) {
+ if (svd->vpage != NULL) {
+ size_t nbytes;
+ struct vpage *ovpage;
+
+ ovpage = svd->vpage; /* keep pointer to vpage */
+
+ nbytes = vpgtob(npages);
+ svd->vpage = kmem_alloc(nbytes, KM_SLEEP);
+ bcopy(&ovpage[dpages], svd->vpage, nbytes);
+
+ /* free up old vpage */
+ kmem_free(ovpage, vpgtob(opages));
+ }
+ if (amp != NULL) {
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
+ if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) {
+ /*
+ * Free up now unused parts of anon_map array.
+ */
+ if (seg->s_szc != 0) {
+ anon_free_pages(amp->ahp,
+ svd->anon_index, len, seg->s_szc);
+ } else {
+ anon_free(amp->ahp, svd->anon_index,
+ len);
+ }
+
+ /*
+ * Unreserve swap space for the unmapped chunk
+ * of this segment in case it's MAP_SHARED
+ */
+ if (svd->type == MAP_SHARED) {
+ anon_unresv(len);
+ amp->swresv -= len;
+ }
+ }
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ svd->anon_index += dpages;
+ }
+ if (svd->vp != NULL)
+ svd->offset += len;
+
+ if (svd->swresv) {
+ if (svd->flags & MAP_NORESERVE) {
+ ASSERT(amp);
+ oswresv = svd->swresv;
+
+ svd->swresv = ptob(anon_pages(amp->ahp,
+ svd->anon_index, npages));
+ anon_unresv(oswresv - svd->swresv);
+ } else {
+ anon_unresv(len);
+ svd->swresv -= len;
+ }
+ TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
+ seg, len, 0);
+ }
+
+ seg->s_base += len;
+ seg->s_size -= len;
+ return (0);
+ }
+
+ /*
+ * Check for end of segment
+ */
+ if (addr + len == seg->s_base + seg->s_size) {
+ if (svd->vpage != NULL) {
+ size_t nbytes;
+ struct vpage *ovpage;
+
+ ovpage = svd->vpage; /* keep pointer to vpage */
+
+ nbytes = vpgtob(npages);
+ svd->vpage = kmem_alloc(nbytes, KM_SLEEP);
+ bcopy(ovpage, svd->vpage, nbytes);
+
+ /* free up old vpage */
+ kmem_free(ovpage, vpgtob(opages));
+
+ }
+ if (amp != NULL) {
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
+ if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) {
+ /*
+ * Free up now unused parts of anon_map array
+ */
+ if (seg->s_szc != 0) {
+ ulong_t an_idx = svd->anon_index +
+ npages;
+ anon_free_pages(amp->ahp, an_idx,
+ len, seg->s_szc);
+ } else {
+ anon_free(amp->ahp,
+ svd->anon_index + npages, len);
+ }
+ /*
+ * Unreserve swap space for the unmapped chunk
+ * of this segment in case it's MAP_SHARED
+ */
+ if (svd->type == MAP_SHARED) {
+ anon_unresv(len);
+ amp->swresv -= len;
+ }
+ }
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ }
+
+ if (svd->swresv) {
+ if (svd->flags & MAP_NORESERVE) {
+ ASSERT(amp);
+ oswresv = svd->swresv;
+ svd->swresv = ptob(anon_pages(amp->ahp,
+ svd->anon_index, npages));
+ anon_unresv(oswresv - svd->swresv);
+ } else {
+ anon_unresv(len);
+ svd->swresv -= len;
+ }
+ TRACE_3(TR_FAC_VM, TR_ANON_PROC,
+ "anon proc:%p %lu %u", seg, len, 0);
+ }
+
+ seg->s_size -= len;
+ return (0);
+ }
+
+ /*
+ * The section to go is in the middle of the segment,
+ * have to make it into two segments. nseg is made for
+ * the high end while seg is cut down at the low end.
+ */
+ nbase = addr + len; /* new seg base */
+ nsize = (seg->s_base + seg->s_size) - nbase; /* new seg size */
+ seg->s_size = addr - seg->s_base; /* shrink old seg */
+ nseg = seg_alloc(seg->s_as, nbase, nsize);
+ if (nseg == NULL) {
+ panic("segvn_unmap seg_alloc");
+ /*NOTREACHED*/
+ }
+ nseg->s_ops = seg->s_ops;
+ nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
+ nseg->s_data = (void *)nsvd;
+ nseg->s_szc = seg->s_szc;
+ *nsvd = *svd;
+ nsvd->offset = svd->offset + (uintptr_t)(nseg->s_base - seg->s_base);
+ nsvd->swresv = 0;
+ nsvd->softlockcnt = 0;
+
+ if (svd->vp != NULL) {
+ VN_HOLD(nsvd->vp);
+ if (nsvd->type == MAP_SHARED)
+ lgrp_shm_policy_init(NULL, nsvd->vp);
+ }
+ crhold(svd->cred);
+
+ if (svd->vpage == NULL) {
+ nsvd->vpage = NULL;
+ } else {
+ /* need to split vpage into two arrays */
+ size_t nbytes;
+ struct vpage *ovpage;
+
+ ovpage = svd->vpage; /* keep pointer to vpage */
+
+ npages = seg_pages(seg); /* seg has shrunk */
+ nbytes = vpgtob(npages);
+ svd->vpage = kmem_alloc(nbytes, KM_SLEEP);
+
+ bcopy(ovpage, svd->vpage, nbytes);
+
+ npages = seg_pages(nseg);
+ nbytes = vpgtob(npages);
+ nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP);
+
+ bcopy(&ovpage[opages - npages], nsvd->vpage, nbytes);
+
+ /* free up old vpage */
+ kmem_free(ovpage, vpgtob(opages));
+ }
+
+ if (amp == NULL) {
+ nsvd->amp = NULL;
+ nsvd->anon_index = 0;
+ } else {
+ /*
+ * Need to create a new anon map for the new segment.
+ * We'll also allocate a new smaller array for the old
+ * smaller segment to save space.
+ */
+ opages = btop((uintptr_t)(addr - seg->s_base));
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
+ if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) {
+ /*
+ * Free up now unused parts of anon_map array
+ */
+ if (seg->s_szc != 0) {
+ ulong_t an_idx = svd->anon_index + opages;
+ anon_free_pages(amp->ahp, an_idx, len,
+ seg->s_szc);
+ } else {
+ anon_free(amp->ahp, svd->anon_index + opages,
+ len);
+ }
+
+ /*
+ * Unreserve swap space for the unmapped chunk
+ * of this segment in case it's MAP_SHARED
+ */
+ if (svd->type == MAP_SHARED) {
+ anon_unresv(len);
+ amp->swresv -= len;
+ }
+ }
+
+ nsvd->anon_index = svd->anon_index +
+ btop((uintptr_t)(nseg->s_base - seg->s_base));
+ if (svd->type == MAP_SHARED) {
+ ASSERT(seg->s_szc == 0);
+ amp->refcnt++;
+ nsvd->amp = amp;
+ } else {
+ struct anon_map *namp;
+ struct anon_hdr *nahp;
+
+ ASSERT(svd->type == MAP_PRIVATE);
+ nahp = anon_create(btop(seg->s_size), ANON_SLEEP);
+ namp = anonmap_alloc(nseg->s_size, 0);
+ namp->a_szc = seg->s_szc;
+ (void) anon_copy_ptr(amp->ahp, svd->anon_index, nahp,
+ 0, btop(seg->s_size), ANON_SLEEP);
+ (void) anon_copy_ptr(amp->ahp, nsvd->anon_index,
+ namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP);
+ anon_release(amp->ahp, btop(amp->size));
+ svd->anon_index = 0;
+ nsvd->anon_index = 0;
+ amp->ahp = nahp;
+ amp->size = seg->s_size;
+ nsvd->amp = namp;
+ }
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ }
+ if (svd->swresv) {
+ if (svd->flags & MAP_NORESERVE) {
+ ASSERT(amp);
+ oswresv = svd->swresv;
+ svd->swresv = ptob(anon_pages(amp->ahp,
+ svd->anon_index, btop(seg->s_size)));
+ nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp,
+ nsvd->anon_index, btop(nseg->s_size)));
+ ASSERT(oswresv >= (svd->swresv + nsvd->swresv));
+ anon_unresv(oswresv - (svd->swresv + nsvd->swresv));
+ } else {
+ if (seg->s_size + nseg->s_size + len != svd->swresv) {
+ panic("segvn_unmap: "
+ "cannot split swap reservation");
+ /*NOTREACHED*/
+ }
+ anon_unresv(len);
+ svd->swresv = seg->s_size;
+ nsvd->swresv = nseg->s_size;
+ }
+ TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
+ seg, len, 0);
+ }
+
+ return (0); /* I'm glad that's all over with! */
+}
+
+static void
+segvn_free(struct seg *seg)
+{
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+ pgcnt_t npages = seg_pages(seg);
+ struct anon_map *amp;
+ size_t len;
+
+ /*
+ * We don't need any segment level locks for "segvn" data
+ * since the address space is "write" locked.
+ */
+ ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ /*
+ * Be sure to unlock pages. XXX Why do things get free'ed instead
+ * of unmapped? XXX
+ */
+ (void) segvn_lockop(seg, seg->s_base, seg->s_size,
+ 0, MC_UNLOCK, NULL, 0);
+
+ /*
+ * Deallocate the vpage and anon pointers if necessary and possible.
+ */
+ if (svd->vpage != NULL) {
+ kmem_free(svd->vpage, vpgtob(npages));
+ svd->vpage = NULL;
+ }
+ if ((amp = svd->amp) != NULL) {
+ /*
+ * If there are no more references to this anon_map
+ * structure, then deallocate the structure after freeing
+ * up all the anon slot pointers that we can.
+ */
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
+ if (--amp->refcnt == 0) {
+ if (svd->type == MAP_PRIVATE) {
+ /*
+ * Private - we only need to anon_free
+ * the part that this segment refers to.
+ */
+ if (seg->s_szc != 0) {
+ anon_free_pages(amp->ahp,
+ svd->anon_index, seg->s_size,
+ seg->s_szc);
+ } else {
+ anon_free(amp->ahp, svd->anon_index,
+ seg->s_size);
+ }
+ } else {
+ /*
+ * Shared - anon_free the entire
+ * anon_map's worth of stuff and
+ * release any swap reservation.
+ */
+ ASSERT(seg->s_szc == 0);
+ anon_free(amp->ahp, 0, amp->size);
+ if ((len = amp->swresv) != 0) {
+ anon_unresv(len);
+ TRACE_3(TR_FAC_VM, TR_ANON_PROC,
+ "anon proc:%p %lu %u",
+ seg, len, 0);
+ }
+ }
+ svd->amp = NULL;
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ anonmap_free(amp);
+ } else if (svd->type == MAP_PRIVATE) {
+ /*
+ * We had a private mapping which still has
+ * a held anon_map so just free up all the
+ * anon slot pointers that we were using.
+ */
+ if (seg->s_szc != 0) {
+ anon_free_pages(amp->ahp, svd->anon_index,
+ seg->s_size, seg->s_szc);
+ } else {
+ anon_free(amp->ahp, svd->anon_index,
+ seg->s_size);
+ }
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ } else {
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ }
+ }
+
+ /*
+ * Release swap reservation.
+ */
+ if ((len = svd->swresv) != 0) {
+ anon_unresv(svd->swresv);
+ TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
+ seg, len, 0);
+ svd->swresv = 0;
+ }
+ /*
+ * Release claim on vnode, credentials, and finally free the
+ * private data.
+ */
+ if (svd->vp != NULL) {
+ if (svd->type == MAP_SHARED)
+ lgrp_shm_policy_fini(NULL, svd->vp);
+ VN_RELE(svd->vp);
+ svd->vp = NULL;
+ }
+ crfree(svd->cred);
+ svd->cred = NULL;
+
+ seg->s_data = NULL;
+ kmem_cache_free(segvn_cache, svd);
+}
+
+/*
+ * Do a F_SOFTUNLOCK call over the range requested. The range must have
+ * already been F_SOFTLOCK'ed.
+ * Caller must always match addr and len of a softunlock with a previous
+ * softlock with exactly the same addr and len.
+ */
+static void
+segvn_softunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw)
+{
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+ page_t *pp;
+ caddr_t adr;
+ struct vnode *vp;
+ u_offset_t offset;
+ ulong_t anon_index;
+ struct anon_map *amp;
+ struct anon *ap = NULL;
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+ ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock));
+
+ if ((amp = svd->amp) != NULL)
+ anon_index = svd->anon_index + seg_page(seg, addr);
+
+ hat_unlock(seg->s_as->a_hat, addr, len);
+ for (adr = addr; adr < addr + len; adr += PAGESIZE) {
+ if (amp != NULL) {
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
+ if ((ap = anon_get_ptr(amp->ahp, anon_index++))
+ != NULL) {
+ swap_xlate(ap, &vp, &offset);
+ } else {
+ vp = svd->vp;
+ offset = svd->offset +
+ (uintptr_t)(adr - seg->s_base);
+ }
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ } else {
+ vp = svd->vp;
+ offset = svd->offset +
+ (uintptr_t)(adr - seg->s_base);
+ }
+
+ /*
+ * Use page_find() instead of page_lookup() to
+ * find the page since we know that it is locked.
+ */
+ pp = page_find(vp, offset);
+ if (pp == NULL) {
+ panic(
+ "segvn_softunlock: addr %p, ap %p, vp %p, off %llx",
+ (void *)adr, (void *)ap, (void *)vp, offset);
+ /*NOTREACHED*/
+ }
+
+ if (rw == S_WRITE) {
+ hat_setrefmod(pp);
+ if (seg->s_as->a_vbits)
+ hat_setstat(seg->s_as, adr, PAGESIZE,
+ P_REF | P_MOD);
+ } else if (rw != S_OTHER) {
+ hat_setref(pp);
+ if (seg->s_as->a_vbits)
+ hat_setstat(seg->s_as, adr, PAGESIZE, P_REF);
+ }
+ TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT,
+ "segvn_fault:pp %p vp %p offset %llx", pp, vp, offset);
+ page_unlock(pp);
+ }
+ mutex_enter(&freemem_lock); /* for availrmem */
+ availrmem += btop(len);
+ segvn_pages_locked -= btop(len);
+ svd->softlockcnt -= btop(len);
+ mutex_exit(&freemem_lock);
+ if (svd->softlockcnt == 0) {
+ /*
+ * All SOFTLOCKS are gone. Wakeup any waiting
+ * unmappers so they can try again to unmap.
+ * Check for waiters first without the mutex
+ * held so we don't always grab the mutex on
+ * softunlocks.
+ */
+ if (AS_ISUNMAPWAIT(seg->s_as)) {
+ mutex_enter(&seg->s_as->a_contents);
+ if (AS_ISUNMAPWAIT(seg->s_as)) {
+ AS_CLRUNMAPWAIT(seg->s_as);
+ cv_broadcast(&seg->s_as->a_cv);
+ }
+ mutex_exit(&seg->s_as->a_contents);
+ }
+ }
+}
+
+#define PAGE_HANDLED ((page_t *)-1)
+
+/*
+ * Release all the pages in the NULL terminated ppp list
+ * which haven't already been converted to PAGE_HANDLED.
+ */
+static void
+segvn_pagelist_rele(page_t **ppp)
+{
+ for (; *ppp != NULL; ppp++) {
+ if (*ppp != PAGE_HANDLED)
+ page_unlock(*ppp);
+ }
+}
+
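+/*
+ * When stealcow is set (the default) and freemem drops below minfree,
+ * segvn_faultpage() may steal the original object page for a copy-on-write
+ * fault (see the STEAL_PAGE handling below) instead of copying it.
+ */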
+static int stealcow = 1;
+
+/*
+ * Workaround for viking chip bug. See bug id 1220902.
+ * To fix this down in pagefault() would require importing so much of the
+ * as (address space) and segvn code as to be unmaintainable.
+ */
+int enable_mbit_wa = 0;
+
+/*
+ * Handles all the dirty work of getting the right
+ * anonymous pages and loading up the translations.
+ * This routine is called only from segvn_fault()
+ * when looping over the range of addresses requested.
+ *
+ * The basic algorithm here is:
+ * If this is an anon_zero case
+ * Call anon_zero to allocate page
+ * Load up translation
+ * Return
+ * endif
+ * If this is an anon page
+ * Use anon_getpage to get the page
+ * else
+ * Find page in pl[] list passed in
+ * endif
+ * If not a cow
+ * Load up the translation to the page
+ * return
+ * endif
+ * Call anon_private to handle cow
+ * Load up (writable) translation to new page
+ */
+static faultcode_t
+segvn_faultpage(
+ struct hat *hat, /* the hat to use for mapping */
+ struct seg *seg, /* seg_vn of interest */
+ caddr_t addr, /* address in as */
+ u_offset_t off, /* offset in vp */
+ struct vpage *vpage, /* pointer to vpage for vp, off */
+ page_t *pl[], /* object source page pointer */
+ uint_t vpprot, /* access allowed to object pages */
+ enum fault_type type, /* type of fault */
+ enum seg_rw rw, /* type of access at fault */
+ int brkcow) /* we may need to break cow */
+{
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+ page_t *pp, **ppp;
+ uint_t pageflags = 0;
+ page_t *anon_pl[1 + 1];
+ page_t *opp = NULL; /* original page */
+ uint_t prot;
+ int err;
+ int cow;
+ int claim;
+ int steal = 0;
+ ulong_t anon_index;
+ struct anon *ap, *oldap;
+ struct anon_map *amp;
+ int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD;
+ int anon_lock = 0;
+ anon_sync_obj_t cookie;
+
+ if (svd->flags & MAP_TEXT) {
+ hat_flag |= HAT_LOAD_TEXT;
+ }
+
+ ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock));
+ ASSERT(seg->s_szc == 0);
+
+ /*
+ * Initialize protection value for this page.
+ * If we have per page protection values check it now.
+ */
+ if (svd->pageprot) {
+ uint_t protchk;
+
+ switch (rw) {
+ case S_READ:
+ protchk = PROT_READ;
+ break;
+ case S_WRITE:
+ protchk = PROT_WRITE;
+ break;
+ case S_EXEC:
+ protchk = PROT_EXEC;
+ break;
+ case S_OTHER:
+ default:
+ protchk = PROT_READ | PROT_WRITE | PROT_EXEC;
+ break;
+ }
+
+ prot = VPP_PROT(vpage);
+ if ((prot & protchk) == 0)
+ return (FC_PROT); /* illegal access type */
+ } else {
+ prot = svd->prot;
+ }
+
+ if (type == F_SOFTLOCK) {
+ mutex_enter(&freemem_lock);
+ if (availrmem <= tune.t_minarmem) {
+ mutex_exit(&freemem_lock);
+ return (FC_MAKE_ERR(ENOMEM)); /* out of real memory */
+ } else {
+ svd->softlockcnt++;
+ availrmem--;
+ segvn_pages_locked++;
+ }
+ mutex_exit(&freemem_lock);
+ }
+
+ /*
+ * Always acquire the anon array lock to prevent 2 threads from
+ * allocating separate anon slots for the same "addr".
+ */
+
+ if ((amp = svd->amp) != NULL) {
+ ASSERT(RW_READ_HELD(&amp->a_rwlock));
+ anon_index = svd->anon_index + seg_page(seg, addr);
+ anon_array_enter(amp, anon_index, &cookie);
+ anon_lock = 1;
+ }
+
+ if (svd->vp == NULL && amp != NULL) {
+ if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) {
+ /*
+ * Allocate a (normally) writable anonymous page of
+ * zeroes. If no advance reservations, reserve now.
+ */
+ if (svd->flags & MAP_NORESERVE) {
+ if (anon_resv(ptob(1))) {
+ svd->swresv += ptob(1);
+ } else {
+ err = ENOMEM;
+ goto out;
+ }
+ }
+ if ((pp = anon_zero(seg, addr, &ap,
+ svd->cred)) == NULL) {
+ err = ENOMEM;
+ goto out; /* out of swap space */
+ }
+ /*
+ * Re-acquire the anon_map lock and
+ * initialize the anon array entry.
+ */
+ (void) anon_set_ptr(amp->ahp, anon_index, ap,
+ ANON_SLEEP);
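+			/*
+			 * With the M-bit workaround enabled, simulate the
+			 * modify bit in software: set it on write faults,
+			 * otherwise drop PROT_WRITE from the mapping until
+			 * the page is actually modified so the first write
+			 * refaults here.
+			 */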
+ if (enable_mbit_wa) {
+ if (rw == S_WRITE)
+ hat_setmod(pp);
+ else if (!hat_ismod(pp))
+ prot &= ~PROT_WRITE;
+ }
+ /*
+ * If AS_PAGLCK is set in a_flags (via memcntl(2)
+ * with MC_LOCKAS, MCL_FUTURE) and this is a
+ * MAP_NORESERVE segment, we may need to
+ * permanently lock the page as it is being faulted
+ * for the first time. The following text applies
+ * only to MAP_NORESERVE segments:
+ *
+ * As per memcntl(2), if this segment was created
+ * after MCL_FUTURE was applied (a "future"
+ * segment), its pages must be locked. If this
+ * segment existed at MCL_FUTURE application (a
+ * "past" segment), the interface is unclear.
+ *
+ * We decide to lock only if vpage is present:
+ *
+ * - "future" segments will have a vpage array (see
+ * as_map), and so will be locked as required
+ *
+ * - "past" segments may not have a vpage array,
+ * depending on whether events (such as
+ * mprotect) have occurred. Locking if vpage
+ * exists will preserve legacy behavior. Not
+ * locking if vpage is absent, will not break
+ * the interface or legacy behavior. Note that
+ * allocating vpage here if it's absent requires
+ * upgrading the segvn reader lock, the cost of
+ * which does not seem worthwhile.
+ */
+ if (AS_ISPGLCK(seg->s_as) && vpage != NULL &&
+ (svd->flags & MAP_NORESERVE)) {
+ claim = VPP_PROT(vpage) & PROT_WRITE;
+ ASSERT(svd->type == MAP_PRIVATE);
+ if (page_pp_lock(pp, claim, 0))
+ VPP_SETPPLOCK(vpage);
+ }
+
+ /*
+ * Handle pages that have been marked for migration
+ */
+ if (lgrp_optimizations())
+ page_migrate(seg, addr, &pp, 1);
+ hat_memload(hat, addr, pp, prot, hat_flag);
+
+ if (!(hat_flag & HAT_LOAD_LOCK))
+ page_unlock(pp);
+
+ anon_array_exit(&cookie);
+ return (0);
+ }
+ }
+
+ /*
+ * Obtain the page structure via anon_getpage() if it is
+ * a private copy of an object (the result of a previous
+ * copy-on-write).
+ */
+ if (amp != NULL) {
+ if ((ap = anon_get_ptr(amp->ahp, anon_index)) != NULL) {
+ err = anon_getpage(&ap, &vpprot, anon_pl, PAGESIZE,
+ seg, addr, rw, svd->cred);
+ if (err)
+ goto out;
+
+ if (svd->type == MAP_SHARED) {
+ /*
+ * If this is a shared mapping to an
+ * anon_map, then ignore the write
+ * permissions returned by anon_getpage().
+ * They apply to the private mappings
+ * of this anon_map.
+ */
+ vpprot |= PROT_WRITE;
+ }
+ opp = anon_pl[0];
+ }
+ }
+
+ /*
+ * Search the pl[] list passed in if it is from the
+ * original object (i.e., not a private copy).
+ */
+ if (opp == NULL) {
+ /*
+ * Find original page. We must be bringing it in
+ * from the list in pl[].
+ */
+ for (ppp = pl; (opp = *ppp) != NULL; ppp++) {
+ if (opp == PAGE_HANDLED)
+ continue;
+ ASSERT(opp->p_vnode == svd->vp); /* XXX */
+ if (opp->p_offset == off)
+ break;
+ }
+ if (opp == NULL) {
+ panic("segvn_faultpage not found");
+ /*NOTREACHED*/
+ }
+ *ppp = PAGE_HANDLED;
+
+ }
+
+ ASSERT(PAGE_LOCKED(opp));
+
+ TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT,
+ "segvn_fault:pp %p vp %p offset %llx",
+ opp, NULL, 0);
+
+ /*
+ * The fault is treated as a copy-on-write fault if a
+ * write occurs on a private segment and the object
+ * page (i.e., mapping) is write protected. We assume
+ * that fatal protection checks have already been made.
+ */
+
+ cow = brkcow && ((vpprot & PROT_WRITE) == 0);
+
+ /*
+ * If not a copy-on-write case load the translation
+ * and return.
+ */
+ if (cow == 0) {
+ if (IS_VMODSORT(opp->p_vnode) || enable_mbit_wa) {
+ if (rw == S_WRITE)
+ hat_setmod(opp);
+ else if (rw != S_OTHER && !hat_ismod(opp))
+ prot &= ~PROT_WRITE;
+ }
+
+ /*
+ * Handle pages that have been marked for migration
+ */
+ if (lgrp_optimizations())
+ page_migrate(seg, addr, &opp, 1);
+
+ hat_memload(hat, addr, opp, prot & vpprot, hat_flag);
+
+ if (!(hat_flag & HAT_LOAD_LOCK))
+ page_unlock(opp);
+
+ if (anon_lock) {
+ anon_array_exit(&cookie);
+ }
+ return (0);
+ }
+
+ hat_setref(opp);
+
+ ASSERT(amp != NULL && anon_lock);
+
+ /*
+ * Steal the page only if it isn't a private page
+ * since stealing a private page is not worth the effort.
+ */
+ if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL)
+ steal = 1;
+
+ /*
+ * Steal the original page if the following conditions are true:
+ *
+ * We are low on memory, the page is not private, page is not
+ * shared, not modified, not `locked' or if we have it `locked'
+ * (i.e., p_cowcnt == 1 and p_lckcnt == 0, which also implies
+ * that the page is not shared) and if it doesn't have any
+ * translations. page_struct_lock isn't needed to look at p_cowcnt
+ * and p_lckcnt because we first get exclusive lock on page.
+ */
+ (void) hat_pagesync(opp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD);
+
+ if (stealcow && freemem < minfree && steal &&
+ page_tryupgrade(opp) && !hat_ismod(opp) &&
+ ((opp->p_lckcnt == 0 && opp->p_cowcnt == 0) ||
+ (opp->p_lckcnt == 0 && opp->p_cowcnt == 1 &&
+ vpage != NULL && VPP_ISPPLOCK(vpage)))) {
+ /*
+ * Check if this page has other translations
+ * after unloading our translation.
+ */
+ if (hat_page_is_mapped(opp)) {
+ hat_unload(seg->s_as->a_hat, addr, PAGESIZE,
+ HAT_UNLOAD);
+ }
+
+ /*
+ * hat_unload() might sync back someone else's recent
+ * modification, so check again.
+ */
+ if (!hat_ismod(opp) && !hat_page_is_mapped(opp))
+ pageflags |= STEAL_PAGE;
+ }
+
+ /*
+ * If we have a vpage pointer, see if it indicates that we have
+ * ``locked'' the page we map -- if so, tell anon_private to
+ * transfer the locking resource to the new page.
+ *
+ * See Statement at the beginning of segvn_lockop regarding
+ * the way lockcnts/cowcnts are handled during COW.
+ *
+ */
+ if (vpage != NULL && VPP_ISPPLOCK(vpage))
+ pageflags |= LOCK_PAGE;
+
+ /*
+ * Allocate a private page and perform the copy.
+ * For MAP_NORESERVE reserve swap space now, unless this
+ * is a cow fault on an existing anon page in which case
+ * MAP_NORESERVE will have made advance reservations.
+ */
+ if ((svd->flags & MAP_NORESERVE) && (ap == NULL)) {
+ if (anon_resv(ptob(1))) {
+ svd->swresv += ptob(1);
+ } else {
+ page_unlock(opp);
+ err = ENOMEM;
+ goto out;
+ }
+ }
+ oldap = ap;
+ pp = anon_private(&ap, seg, addr, prot, opp, pageflags, svd->cred);
+ if (pp == NULL) {
+ err = ENOMEM; /* out of swap space */
+ goto out;
+ }
+
+ /*
+ * If we copied away from an anonymous page, then
+ * we are one step closer to freeing up an anon slot.
+ *
+ * NOTE: The original anon slot must be released while
+ * holding the "anon_map" lock. This is necessary to prevent
+ * other threads from obtaining a pointer to the anon slot
+ * which may be freed if its "refcnt" is 1.
+ */
+ if (oldap != NULL)
+ anon_decref(oldap);
+
+ (void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP);
+
+ ASSERT(!IS_VMODSORT(pp->p_vnode));
+ if (enable_mbit_wa) {
+ if (rw == S_WRITE)
+ hat_setmod(pp);
+ else if (!hat_ismod(pp))
+ prot &= ~PROT_WRITE;
+ }
+
+ /*
+ * Handle pages that have been marked for migration
+ */
+ if (lgrp_optimizations())
+ page_migrate(seg, addr, &pp, 1);
+ hat_memload(hat, addr, pp, prot, hat_flag);
+
+ if (!(hat_flag & HAT_LOAD_LOCK))
+ page_unlock(pp);
+
+ ASSERT(anon_lock);
+ anon_array_exit(&cookie);
+ return (0);
+out:
+ if (anon_lock)
+ anon_array_exit(&cookie);
+
+ if (type == F_SOFTLOCK) {
+ mutex_enter(&freemem_lock);
+ availrmem++;
+ segvn_pages_locked--;
+ svd->softlockcnt--;
+ mutex_exit(&freemem_lock);
+ }
+ return (FC_MAKE_ERR(err));
+}
+
+/*
+ * relocate a bunch of smaller targ pages into one large repl page. all targ
+ * pages must be complete pages smaller than replacement pages.
+ * it's assumed that no page's szc can change since they are all PAGESIZE or
+ * complete large pages locked SHARED.
+ */
+static void
+segvn_relocate_pages(page_t **targ, page_t *replacement)
+{
+ page_t *pp;
+ pgcnt_t repl_npgs, curnpgs;
+ pgcnt_t i;
+ uint_t repl_szc = replacement->p_szc;
+ page_t *first_repl = replacement;
+ page_t *repl;
+ spgcnt_t npgs;
+
+ VM_STAT_ADD(segvnvmstats.relocatepages[0]);
+
+ ASSERT(repl_szc != 0);
+ npgs = repl_npgs = page_get_pagecnt(repl_szc);
+
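+	/*
+	 * First pass: relocate each targ page (a PAGESIZE page or a complete
+	 * smaller large page rooted at targ[i]) onto the corresponding
+	 * constituent pages of the replacement large page.
+	 */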
+ i = 0;
+ while (repl_npgs) {
+ spgcnt_t nreloc;
+ int err;
+ ASSERT(replacement != NULL);
+ pp = targ[i];
+ ASSERT(pp->p_szc < repl_szc);
+ ASSERT(PAGE_EXCL(pp));
+ ASSERT(!PP_ISFREE(pp));
+ curnpgs = page_get_pagecnt(pp->p_szc);
+ if (curnpgs == 1) {
+ VM_STAT_ADD(segvnvmstats.relocatepages[1]);
+ repl = replacement;
+ page_sub(&replacement, repl);
+ ASSERT(PAGE_EXCL(repl));
+ ASSERT(!PP_ISFREE(repl));
+ ASSERT(repl->p_szc == repl_szc);
+ } else {
+ page_t *repl_savepp;
+ int j;
+ VM_STAT_ADD(segvnvmstats.relocatepages[2]);
+ repl_savepp = replacement;
+ for (j = 0; j < curnpgs; j++) {
+ repl = replacement;
+ page_sub(&replacement, repl);
+ ASSERT(PAGE_EXCL(repl));
+ ASSERT(!PP_ISFREE(repl));
+ ASSERT(repl->p_szc == repl_szc);
+ ASSERT(page_pptonum(targ[i + j]) ==
+ page_pptonum(targ[i]) + j);
+ }
+ repl = repl_savepp;
+ ASSERT(IS_P2ALIGNED(page_pptonum(repl), curnpgs));
+ }
+ err = page_relocate(&pp, &repl, 0, 1, &nreloc, NULL);
+ if (err || nreloc != curnpgs) {
+ panic("segvn_relocate_pages: "
+ "page_relocate failed err=%d curnpgs=%ld "
+ "nreloc=%ld", err, curnpgs, nreloc);
+ }
+ ASSERT(curnpgs <= repl_npgs);
+ repl_npgs -= curnpgs;
+ i += curnpgs;
+ }
+ ASSERT(replacement == NULL);
+
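+	/*
+	 * Second pass: point targ[] at the replacement constituent pages and
+	 * downgrade their exclusive locks to SHARED for the caller.
+	 */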
+ repl = first_repl;
+ repl_npgs = npgs;
+ for (i = 0; i < repl_npgs; i++) {
+ ASSERT(PAGE_EXCL(repl));
+ ASSERT(!PP_ISFREE(repl));
+ targ[i] = repl;
+ page_downgrade(targ[i]);
+ repl = page_next(repl);
+ }
+}
+
+/*
+ * Check if all pages in ppa array are complete smaller than szc pages and
+ * their roots will still be aligned relative to their current size if the
+ * entire ppa array is relocated into one szc page. If these conditions are
+ * not met return 0.
+ *
+ * If all pages are properly aligned attempt to upgrade their locks
+ * to exclusive mode. If it fails set *upgrdfail to 1 and return 0.
+ * upgrdfail was set to 0 by caller.
+ *
+ * Return 1 if all pages are aligned and locked exclusively.
+ *
+ * If all pages in ppa array happen to be physically contiguous to make one
+ * szc page and all exclusive locks are successfully obtained promote the page
+ * size to szc and set *pszc to szc. Return 1 with pages locked shared.
+ */
+static int
+segvn_full_szcpages(page_t **ppa, uint_t szc, int *upgrdfail, uint_t *pszc)
+{
+ page_t *pp;
+ pfn_t pfn;
+ pgcnt_t totnpgs = page_get_pagecnt(szc);
+ pfn_t first_pfn;
+ int contig = 1;
+ pgcnt_t i;
+ pgcnt_t j;
+ uint_t curszc;
+ pgcnt_t curnpgs;
+ int root = 0;
+
+ ASSERT(szc > 0);
+
+ VM_STAT_ADD(segvnvmstats.fullszcpages[0]);
+
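+	/*
+	 * Pass 1: verify every page is a complete page smaller than szc whose
+	 * root stays properly aligned, and note whether the whole ppa range
+	 * happens to be physically contiguous.
+	 */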
+ for (i = 0; i < totnpgs; i++) {
+ pp = ppa[i];
+ ASSERT(PAGE_SHARED(pp));
+ ASSERT(!PP_ISFREE(pp));
+ pfn = page_pptonum(pp);
+ if (i == 0) {
+ if (!IS_P2ALIGNED(pfn, totnpgs)) {
+ contig = 0;
+ } else {
+ first_pfn = pfn;
+ }
+ } else if (contig && pfn != first_pfn + i) {
+ contig = 0;
+ }
+ if (pp->p_szc == 0) {
+ if (root) {
+ VM_STAT_ADD(segvnvmstats.fullszcpages[1]);
+ return (0);
+ }
+ } else if (!root) {
+ if ((curszc = pp->p_szc) >= szc) {
+ VM_STAT_ADD(segvnvmstats.fullszcpages[2]);
+ return (0);
+ }
+ if (curszc == 0) {
+ /*
+ * p_szc changed means we don't have all pages
+ * locked. return failure.
+ */
+ VM_STAT_ADD(segvnvmstats.fullszcpages[3]);
+ return (0);
+ }
+ curnpgs = page_get_pagecnt(curszc);
+ if (!IS_P2ALIGNED(pfn, curnpgs) ||
+ !IS_P2ALIGNED(i, curnpgs)) {
+ VM_STAT_ADD(segvnvmstats.fullszcpages[4]);
+ return (0);
+ }
+ root = 1;
+ } else {
+ ASSERT(i > 0);
+ VM_STAT_ADD(segvnvmstats.fullszcpages[5]);
+ if (pp->p_szc != curszc) {
+ VM_STAT_ADD(segvnvmstats.fullszcpages[6]);
+ return (0);
+ }
+ if (pfn - 1 != page_pptonum(ppa[i - 1])) {
+ panic("segvn_full_szcpages: "
+ "large page not physically contiguous");
+ }
+ if (P2PHASE(pfn, curnpgs) == curnpgs - 1) {
+ root = 0;
+ }
+ }
+ }
+
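+	/*
+	 * Pass 2: upgrade all shared locks to exclusive; on failure downgrade
+	 * what we upgraded and report the offending page's szc to the caller.
+	 */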
+ for (i = 0; i < totnpgs; i++) {
+ ASSERT(ppa[i]->p_szc < szc);
+ if (!page_tryupgrade(ppa[i])) {
+ for (j = 0; j < i; j++) {
+ page_downgrade(ppa[j]);
+ }
+ *pszc = ppa[i]->p_szc;
+ *upgrdfail = 1;
+ VM_STAT_ADD(segvnvmstats.fullszcpages[7]);
+ return (0);
+ }
+ }
+
+ /*
+	 * When a page is put on a free cachelist its szc is set to 0. If the
+	 * file system reclaimed pages from the cachelist the targ pages will
+	 * be physically contiguous with 0 p_szc. In this case just upgrade
+	 * the szc of the targ pages without any relocations.
+ * To avoid any hat issues with previous small mappings
+ * hat_pageunload() the target pages first.
+ */
+ if (contig) {
+ VM_STAT_ADD(segvnvmstats.fullszcpages[8]);
+ for (i = 0; i < totnpgs; i++) {
+ (void) hat_pageunload(ppa[i], HAT_FORCE_PGUNLOAD);
+ }
+ for (i = 0; i < totnpgs; i++) {
+ ppa[i]->p_szc = szc;
+ }
+ for (i = 0; i < totnpgs; i++) {
+ ASSERT(PAGE_EXCL(ppa[i]));
+ page_downgrade(ppa[i]);
+ }
+ if (pszc != NULL) {
+ *pszc = szc;
+ }
+ }
+ VM_STAT_ADD(segvnvmstats.fullszcpages[9]);
+ return (1);
+}
+
+/*
+ * Create physically contiguous pages for [vp, off] - [vp, off +
+ * page_size(szc)) range and for private segment return them in ppa array.
+ * Pages are created either via IO or relocations.
+ *
+ * Return 1 on success and 0 on failure.
+ *
+ * If physically contiguous pages already exist for this range return 1
+ * without filling the ppa array. Caller initializes ppa[0] as NULL to detect
+ * that the ppa array wasn't filled. In this case the caller fills the ppa
+ * array via VOP_GETPAGE().
+ */
+
+static int
+segvn_fill_vp_pages(struct segvn_data *svd, vnode_t *vp, u_offset_t off,
+ uint_t szc, page_t **ppa, page_t **ppplist, uint_t *ret_pszc,
+ int *downsize)
+{
+ page_t *pplist = *ppplist;
+ size_t pgsz = page_get_pagesize(szc);
+ pgcnt_t pages = btop(pgsz);
+ ulong_t start_off = off;
+ u_offset_t eoff = off + pgsz;
+ spgcnt_t nreloc;
+ u_offset_t io_off = off;
+ size_t io_len;
+ page_t *io_pplist = NULL;
+ page_t *done_pplist = NULL;
+ pgcnt_t pgidx = 0;
+ page_t *pp;
+ page_t *newpp;
+ page_t *targpp;
+ int io_err = 0;
+ int i;
+ pfn_t pfn;
+ ulong_t ppages;
+ page_t *targ_pplist = NULL;
+ page_t *repl_pplist = NULL;
+ page_t *tmp_pplist;
+ int nios = 0;
+ uint_t pszc;
+ struct vattr va;
+
+ VM_STAT_ADD(segvnvmstats.fill_vp_pages[0]);
+
+ ASSERT(szc != 0);
+ ASSERT(pplist->p_szc == szc);
+
+ /*
+ * downsize will be set to 1 only if we fail to lock pages. this will
+ * allow subsequent faults to try to relocate the page again. If we
+ * fail due to misalignment don't downsize and let the caller map the
+ * whole region with small mappings to avoid more faults into the area
+ * where we can't get large pages anyway.
+ */
+ *downsize = 0;
+
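+	/*
+	 * Walk the range a page at a time: pages we create here are gathered
+	 * on io_pplist for a single VOP_PAGEIO() read, while pages that
+	 * already exist are queued on targ_pplist/repl_pplist for relocation
+	 * into the preallocated large page.
+	 */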
+ while (off < eoff) {
+ newpp = pplist;
+ ASSERT(newpp != NULL);
+ ASSERT(PAGE_EXCL(newpp));
+ ASSERT(!PP_ISFREE(newpp));
+ /*
+ * we pass NULL for nrelocp to page_lookup_create()
+ * so that it doesn't relocate. We relocate here
+ * later only after we make sure we can lock all
+ * pages in the range we handle and they are all
+ * aligned.
+ */
+ pp = page_lookup_create(vp, off, SE_SHARED, newpp, NULL, 0);
+ ASSERT(pp != NULL);
+ ASSERT(!PP_ISFREE(pp));
+ ASSERT(pp->p_vnode == vp);
+ ASSERT(pp->p_offset == off);
+ if (pp == newpp) {
+ VM_STAT_ADD(segvnvmstats.fill_vp_pages[1]);
+ page_sub(&pplist, pp);
+ ASSERT(PAGE_EXCL(pp));
+ ASSERT(page_iolock_assert(pp));
+ page_list_concat(&io_pplist, &pp);
+ off += PAGESIZE;
+ continue;
+ }
+ VM_STAT_ADD(segvnvmstats.fill_vp_pages[2]);
+ pfn = page_pptonum(pp);
+ pszc = pp->p_szc;
+ if (pszc >= szc && targ_pplist == NULL && io_pplist == NULL &&
+ IS_P2ALIGNED(pfn, pages)) {
+ ASSERT(repl_pplist == NULL);
+ ASSERT(done_pplist == NULL);
+ ASSERT(pplist == *ppplist);
+ page_unlock(pp);
+ page_free_replacement_page(pplist);
+ page_create_putback(pages);
+ *ppplist = NULL;
+ VM_STAT_ADD(segvnvmstats.fill_vp_pages[3]);
+ return (1);
+ }
+ if (pszc >= szc) {
+ page_unlock(pp);
+ segvn_faultvnmpss_align_err1++;
+ goto out;
+ }
+ ppages = page_get_pagecnt(pszc);
+ if (!IS_P2ALIGNED(pfn, ppages)) {
+ ASSERT(pszc > 0);
+ /*
+ * sizing down to pszc won't help.
+ */
+ page_unlock(pp);
+ segvn_faultvnmpss_align_err2++;
+ goto out;
+ }
+ pfn = page_pptonum(newpp);
+ if (!IS_P2ALIGNED(pfn, ppages)) {
+ ASSERT(pszc > 0);
+ /*
+ * sizing down to pszc won't help.
+ */
+ page_unlock(pp);
+ segvn_faultvnmpss_align_err3++;
+ goto out;
+ }
+ if (!PAGE_EXCL(pp)) {
+ VM_STAT_ADD(segvnvmstats.fill_vp_pages[4]);
+ page_unlock(pp);
+ *downsize = 1;
+ *ret_pszc = pp->p_szc;
+ goto out;
+ }
+ targpp = pp;
+ if (io_pplist != NULL) {
+ VM_STAT_ADD(segvnvmstats.fill_vp_pages[5]);
+ io_len = off - io_off;
+ /*
+ * Some file systems like NFS don't check EOF
+ * conditions in VOP_PAGEIO(). Check it here
+ * now that pages are locked SE_EXCL. Any file
+ * truncation will wait until the pages are
+ * unlocked so no need to worry that file will
+ * be truncated after we check its size here.
+ * XXX fix NFS to remove this check.
+ */
+ va.va_mask = AT_SIZE;
+ if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred) != 0) {
+ VM_STAT_ADD(segvnvmstats.fill_vp_pages[6]);
+ page_unlock(targpp);
+ goto out;
+ }
+ if (btopr(va.va_size) < btopr(io_off + io_len)) {
+ VM_STAT_ADD(segvnvmstats.fill_vp_pages[7]);
+ *downsize = 1;
+ *ret_pszc = 0;
+ page_unlock(targpp);
+ goto out;
+ }
+ io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len,
+ B_READ, svd->cred);
+ if (io_err) {
+ VM_STAT_ADD(segvnvmstats.fill_vp_pages[8]);
+ page_unlock(targpp);
+ if (io_err == EDEADLK) {
+ segvn_vmpss_pageio_deadlk_err++;
+ }
+ goto out;
+ }
+ nios++;
+ VM_STAT_ADD(segvnvmstats.fill_vp_pages[9]);
+ while (io_pplist != NULL) {
+ pp = io_pplist;
+ page_sub(&io_pplist, pp);
+ ASSERT(page_iolock_assert(pp));
+ page_io_unlock(pp);
+ pgidx = (pp->p_offset - start_off) >>
+ PAGESHIFT;
+ ASSERT(pgidx < pages);
+ ppa[pgidx] = pp;
+ page_list_concat(&done_pplist, &pp);
+ }
+ }
+ pp = targpp;
+ ASSERT(PAGE_EXCL(pp));
+ ASSERT(pp->p_szc <= pszc);
+ if (pszc != 0 && !group_page_trylock(pp, SE_EXCL)) {
+ VM_STAT_ADD(segvnvmstats.fill_vp_pages[10]);
+ page_unlock(pp);
+ *downsize = 1;
+ *ret_pszc = pp->p_szc;
+ goto out;
+ }
+ VM_STAT_ADD(segvnvmstats.fill_vp_pages[11]);
+ /*
+		 * page szc could have changed before the entire group was
+ * locked. reread page szc.
+ */
+ pszc = pp->p_szc;
+ ppages = page_get_pagecnt(pszc);
+
+ /* link just the roots */
+ page_list_concat(&targ_pplist, &pp);
+ page_sub(&pplist, newpp);
+ page_list_concat(&repl_pplist, &newpp);
+ off += PAGESIZE;
+ while (--ppages != 0) {
+ newpp = pplist;
+ page_sub(&pplist, newpp);
+ off += PAGESIZE;
+ }
+ io_off = off;
+ }
+ if (io_pplist != NULL) {
+ VM_STAT_ADD(segvnvmstats.fill_vp_pages[12]);
+ io_len = eoff - io_off;
+ va.va_mask = AT_SIZE;
+ if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred) != 0) {
+ VM_STAT_ADD(segvnvmstats.fill_vp_pages[13]);
+ goto out;
+ }
+ if (btopr(va.va_size) < btopr(io_off + io_len)) {
+ VM_STAT_ADD(segvnvmstats.fill_vp_pages[14]);
+ *downsize = 1;
+ *ret_pszc = 0;
+ goto out;
+ }
+ io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len,
+ B_READ, svd->cred);
+ if (io_err) {
+ VM_STAT_ADD(segvnvmstats.fill_vp_pages[15]);
+ if (io_err == EDEADLK) {
+ segvn_vmpss_pageio_deadlk_err++;
+ }
+ goto out;
+ }
+ nios++;
+ while (io_pplist != NULL) {
+ pp = io_pplist;
+ page_sub(&io_pplist, pp);
+ ASSERT(page_iolock_assert(pp));
+ page_io_unlock(pp);
+ pgidx = (pp->p_offset - start_off) >> PAGESHIFT;
+ ASSERT(pgidx < pages);
+ ppa[pgidx] = pp;
+ }
+ }
+ /*
+ * we're now bound to succeed or panic.
+ * remove pages from done_pplist. it's not needed anymore.
+ */
+ while (done_pplist != NULL) {
+ pp = done_pplist;
+ page_sub(&done_pplist, pp);
+ }
+ VM_STAT_ADD(segvnvmstats.fill_vp_pages[16]);
+ ASSERT(pplist == NULL);
+ *ppplist = NULL;
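+	/*
+	 * Relocate the remaining existing pages (roots on targ_pplist) into
+	 * the preallocated replacement pages and record them in ppa[].
+	 */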
+ while (targ_pplist != NULL) {
+ int ret;
+ VM_STAT_ADD(segvnvmstats.fill_vp_pages[17]);
+ ASSERT(repl_pplist);
+ pp = targ_pplist;
+ page_sub(&targ_pplist, pp);
+ pgidx = (pp->p_offset - start_off) >> PAGESHIFT;
+ newpp = repl_pplist;
+ page_sub(&repl_pplist, newpp);
+#ifdef DEBUG
+ pfn = page_pptonum(pp);
+ pszc = pp->p_szc;
+ ppages = page_get_pagecnt(pszc);
+ ASSERT(IS_P2ALIGNED(pfn, ppages));
+ pfn = page_pptonum(newpp);
+ ASSERT(IS_P2ALIGNED(pfn, ppages));
+ ASSERT(P2PHASE(pfn, pages) == pgidx);
+#endif
+ nreloc = 0;
+ ret = page_relocate(&pp, &newpp, 0, 1, &nreloc, NULL);
+ if (ret != 0 || nreloc == 0) {
+ panic("segvn_fill_vp_pages: "
+ "page_relocate failed");
+ }
+ pp = newpp;
+ while (nreloc-- != 0) {
+ ASSERT(PAGE_EXCL(pp));
+ ASSERT(pp->p_vnode == vp);
+ ASSERT(pgidx ==
+ ((pp->p_offset - start_off) >> PAGESHIFT));
+ ppa[pgidx++] = pp;
+ pp = page_next(pp);
+ }
+ }
+
+ if (svd->type == MAP_PRIVATE) {
+ VM_STAT_ADD(segvnvmstats.fill_vp_pages[18]);
+ for (i = 0; i < pages; i++) {
+ ASSERT(ppa[i] != NULL);
+ ASSERT(PAGE_EXCL(ppa[i]));
+ ASSERT(ppa[i]->p_vnode == vp);
+ ASSERT(ppa[i]->p_offset ==
+ start_off + (i << PAGESHIFT));
+ page_downgrade(ppa[i]);
+ }
+ ppa[pages] = NULL;
+ } else {
+ VM_STAT_ADD(segvnvmstats.fill_vp_pages[19]);
+ /*
+		 * the caller will still call VOP_GETPAGE() for shared segments
+		 * to check FS write permissions. For private segments we map
+		 * the file read only anyway, so no VOP_GETPAGE() is needed.
+ */
+ for (i = 0; i < pages; i++) {
+ ASSERT(ppa[i] != NULL);
+ ASSERT(PAGE_EXCL(ppa[i]));
+ ASSERT(ppa[i]->p_vnode == vp);
+ ASSERT(ppa[i]->p_offset ==
+ start_off + (i << PAGESHIFT));
+ page_unlock(ppa[i]);
+ }
+ ppa[0] = NULL;
+ }
+
+ return (1);
+out:
+ /*
+ * Do the cleanup. Unlock target pages we didn't relocate. They are
+ * linked on targ_pplist by root pages. reassemble unused replacement
+ * and io pages back to pplist.
+ */
+ if (io_pplist != NULL) {
+ VM_STAT_ADD(segvnvmstats.fill_vp_pages[20]);
+ pp = io_pplist;
+ do {
+ ASSERT(pp->p_vnode == vp);
+ ASSERT(pp->p_offset == io_off);
+ ASSERT(page_iolock_assert(pp));
+ page_io_unlock(pp);
+ page_hashout(pp, NULL);
+ io_off += PAGESIZE;
+ } while ((pp = pp->p_next) != io_pplist);
+ page_list_concat(&io_pplist, &pplist);
+ pplist = io_pplist;
+ }
+ tmp_pplist = NULL;
+ while (targ_pplist != NULL) {
+ VM_STAT_ADD(segvnvmstats.fill_vp_pages[21]);
+ pp = targ_pplist;
+ ASSERT(PAGE_EXCL(pp));
+ page_sub(&targ_pplist, pp);
+
+ pszc = pp->p_szc;
+ ppages = page_get_pagecnt(pszc);
+ ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages));
+
+ if (pszc != 0) {
+ group_page_unlock(pp);
+ }
+ page_unlock(pp);
+
+ pp = repl_pplist;
+ ASSERT(pp != NULL);
+ ASSERT(PAGE_EXCL(pp));
+ ASSERT(pp->p_szc == szc);
+ page_sub(&repl_pplist, pp);
+
+ ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages));
+
+ /* relink replacement page */
+ page_list_concat(&tmp_pplist, &pp);
+ while (--ppages != 0) {
+ VM_STAT_ADD(segvnvmstats.fill_vp_pages[22]);
+ pp = page_next(pp);
+ ASSERT(PAGE_EXCL(pp));
+ ASSERT(pp->p_szc == szc);
+ page_list_concat(&tmp_pplist, &pp);
+ }
+ }
+ if (tmp_pplist != NULL) {
+ VM_STAT_ADD(segvnvmstats.fill_vp_pages[23]);
+ page_list_concat(&tmp_pplist, &pplist);
+ pplist = tmp_pplist;
+ }
+ /*
+ * at this point all pages are either on done_pplist or
+ * pplist. They can't be all on done_pplist otherwise
+ * we'd've been done.
+ */
+ ASSERT(pplist != NULL);
+ if (nios != 0) {
+ VM_STAT_ADD(segvnvmstats.fill_vp_pages[24]);
+ pp = pplist;
+ do {
+ VM_STAT_ADD(segvnvmstats.fill_vp_pages[25]);
+ ASSERT(pp->p_szc == szc);
+ ASSERT(PAGE_EXCL(pp));
+ ASSERT(pp->p_vnode != vp);
+ pp->p_szc = 0;
+ } while ((pp = pp->p_next) != pplist);
+
+ pp = done_pplist;
+ do {
+ VM_STAT_ADD(segvnvmstats.fill_vp_pages[26]);
+ ASSERT(pp->p_szc == szc);
+ ASSERT(PAGE_EXCL(pp));
+ ASSERT(pp->p_vnode == vp);
+ pp->p_szc = 0;
+ } while ((pp = pp->p_next) != done_pplist);
+
+ while (pplist != NULL) {
+ VM_STAT_ADD(segvnvmstats.fill_vp_pages[27]);
+ pp = pplist;
+ page_sub(&pplist, pp);
+ page_free(pp, 0);
+ }
+
+ while (done_pplist != NULL) {
+ VM_STAT_ADD(segvnvmstats.fill_vp_pages[28]);
+ pp = done_pplist;
+ page_sub(&done_pplist, pp);
+ page_unlock(pp);
+ }
+ *ppplist = NULL;
+ return (0);
+ }
+ ASSERT(pplist == *ppplist);
+ if (io_err) {
+ VM_STAT_ADD(segvnvmstats.fill_vp_pages[29]);
+ /*
+ * don't downsize on io error.
+ * see if vop_getpage succeeds.
+ * pplist may still be used in this case
+ * for relocations.
+ */
+ return (0);
+ }
+ VM_STAT_ADD(segvnvmstats.fill_vp_pages[30]);
+ page_free_replacement_page(pplist);
+ page_create_putback(pages);
+ *ppplist = NULL;
+ return (0);
+}
+
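+/*
+ * When segvn_anypgsz is non-zero, anonymous large page faults may retry with
+ * intermediate page sizes (one szc at a time) rather than falling back
+ * directly to PAGESIZE or back up to seg->s_szc (see the ierr == -1/-2
+ * handling in segvn_fault_anonpages()).
+ */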
+int segvn_anypgsz = 0;
+
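+/*
+ * Undo the availrmem/segvn_pages_locked/softlockcnt accounting charged for
+ * a F_SOFTLOCK fault over `pages' pages when the fault fails or must be
+ * retried.
+ */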
+#define SEGVN_RESTORE_SOFTLOCK(type, pages) \
+ if ((type) == F_SOFTLOCK) { \
+ mutex_enter(&freemem_lock); \
+ availrmem += (pages); \
+ segvn_pages_locked -= (pages); \
+ svd->softlockcnt -= (pages); \
+ mutex_exit(&freemem_lock); \
+ }
+
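+/*
+ * For VMODSORT vnodes either mark every page in ppa modified on a write
+ * fault, or, if the mapping would otherwise be writable, strip PROT_WRITE
+ * when any page is not yet modified so the first write refaults and sets
+ * the mod bit.
+ */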
+#define SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot) \
+ if (IS_VMODSORT((ppa)[0]->p_vnode)) { \
+ if ((rw) == S_WRITE) { \
+ for (i = 0; i < (pages); i++) { \
+ ASSERT((ppa)[i]->p_vnode == \
+ (ppa)[0]->p_vnode); \
+ hat_setmod((ppa)[i]); \
+ } \
+ } else if ((rw) != S_OTHER && \
+ ((prot) & (vpprot) & PROT_WRITE)) { \
+ for (i = 0; i < (pages); i++) { \
+ ASSERT((ppa)[i]->p_vnode == \
+ (ppa)[0]->p_vnode); \
+ if (!hat_ismod((ppa)[i])) { \
+ prot &= ~PROT_WRITE; \
+ break; \
+ } \
+ } \
+ } \
+ }
+
+#ifdef VM_STATS
+
+#define SEGVN_VMSTAT_FLTVNPAGES(idx) \
+ VM_STAT_ADD(segvnvmstats.fltvnpages[(idx)]);
+
+#else /* VM_STATS */
+
+#define SEGVN_VMSTAT_FLTVNPAGES(idx)
+
+#endif
+
+static faultcode_t
+segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr,
+ caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr,
+ caddr_t eaddr, int brkcow)
+{
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+ struct anon_map *amp = svd->amp;
+ uchar_t segtype = svd->type;
+ uint_t szc = seg->s_szc;
+ size_t pgsz = page_get_pagesize(szc);
+ size_t maxpgsz = pgsz;
+ pgcnt_t pages = btop(pgsz);
+ pgcnt_t maxpages = pages;
+ size_t ppasize = (pages + 1) * sizeof (page_t *);
+ caddr_t a = lpgaddr;
+ caddr_t maxlpgeaddr = lpgeaddr;
+ u_offset_t off = svd->offset + (uintptr_t)(a - seg->s_base);
+ ulong_t aindx = svd->anon_index + seg_page(seg, a);
+ struct vpage *vpage = (svd->vpage != NULL) ?
+ &svd->vpage[seg_page(seg, a)] : NULL;
+ vnode_t *vp = svd->vp;
+ page_t **ppa;
+ uint_t pszc;
+ size_t ppgsz;
+ pgcnt_t ppages;
+ faultcode_t err = 0;
+ int ierr;
+ int vop_size_err = 0;
+ uint_t protchk, prot, vpprot;
+ ulong_t i;
+ int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD;
+ anon_sync_obj_t an_cookie;
+ enum seg_rw arw;
+ int alloc_failed = 0;
+ int adjszc_chk;
+ struct vattr va;
+ int xhat = 0;
+ page_t *pplist;
+ pfn_t pfn;
+ int physcontig;
+ int upgrdfail;
+ int segvn_anypgsz_vnode = 0; /* for now map vnode with 2 page sizes */
+
+ ASSERT(szc != 0);
+ ASSERT(vp != NULL);
+ ASSERT(brkcow == 0 || amp != NULL);
+ ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */
+ ASSERT(!(svd->flags & MAP_NORESERVE));
+ ASSERT(type != F_SOFTUNLOCK);
+ ASSERT(IS_P2ALIGNED(a, maxpgsz));
+ ASSERT(amp == NULL || IS_P2ALIGNED(aindx, maxpages));
+ ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock));
+ ASSERT(seg->s_szc < NBBY * sizeof (int));
+
+ VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltvnpages[0]);
+ VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltvnpages[1]);
+
+ if (svd->flags & MAP_TEXT) {
+ hat_flag |= HAT_LOAD_TEXT;
+ }
+
+ if (svd->pageprot) {
+ switch (rw) {
+ case S_READ:
+ protchk = PROT_READ;
+ break;
+ case S_WRITE:
+ protchk = PROT_WRITE;
+ break;
+ case S_EXEC:
+ protchk = PROT_EXEC;
+ break;
+ case S_OTHER:
+ default:
+ protchk = PROT_READ | PROT_WRITE | PROT_EXEC;
+ break;
+ }
+ } else {
+ prot = svd->prot;
+ /* caller has already done segment level protection check. */
+ }
+
+ if (seg->s_as->a_hat != hat) {
+ xhat = 1;
+ }
+
+ if (rw == S_WRITE && segtype == MAP_PRIVATE) {
+ SEGVN_VMSTAT_FLTVNPAGES(2);
+ arw = S_READ;
+ } else {
+ arw = rw;
+ }
+
+ ppa = kmem_alloc(ppasize, KM_SLEEP);
+
+ VM_STAT_COND_ADD(amp != NULL, segvnvmstats.fltvnpages[3]);
+
+ for (;;) {
+ adjszc_chk = 0;
+ for (; a < lpgeaddr; a += pgsz, off += pgsz, aindx += pages) {
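+			/*
+			 * After the first pass through the loop try to grow
+			 * the mapping size back towards seg->s_szc once the
+			 * address is aligned for a larger page size that has
+			 * not already failed allocation.
+			 */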
+ if (adjszc_chk) {
+ while (szc < seg->s_szc) {
+ uintptr_t e;
+ uint_t tszc;
+ tszc = segvn_anypgsz_vnode ? szc + 1 :
+ seg->s_szc;
+ ppgsz = page_get_pagesize(tszc);
+ if (!IS_P2ALIGNED(a, ppgsz) ||
+ ((alloc_failed >> tszc) &
+ 0x1)) {
+ break;
+ }
+ SEGVN_VMSTAT_FLTVNPAGES(4);
+ szc = tszc;
+ pgsz = ppgsz;
+ pages = btop(pgsz);
+ e = P2ROUNDUP((uintptr_t)eaddr, pgsz);
+ lpgeaddr = (caddr_t)e;
+ }
+ }
+
+ again:
+ if (IS_P2ALIGNED(a, maxpgsz) && amp != NULL) {
+ ASSERT(IS_P2ALIGNED(aindx, maxpages));
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
+ anon_array_enter(amp, aindx, &an_cookie);
+ if (anon_get_ptr(amp->ahp, aindx) != NULL) {
+ SEGVN_VMSTAT_FLTVNPAGES(5);
+ if (anon_pages(amp->ahp, aindx,
+ maxpages) != maxpages) {
+ panic("segvn_fault_vnodepages:"
+ " empty anon slots\n");
+ }
+ anon_array_exit(&an_cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ err = segvn_fault_anonpages(hat, seg,
+ a, a + maxpgsz, type, rw,
+ MAX(a, addr),
+ MIN(a + maxpgsz, eaddr), brkcow);
+ if (err != 0) {
+ SEGVN_VMSTAT_FLTVNPAGES(6);
+ goto out;
+ }
+ if (szc < seg->s_szc) {
+ szc = seg->s_szc;
+ pgsz = maxpgsz;
+ pages = maxpages;
+ lpgeaddr = maxlpgeaddr;
+ }
+ goto next;
+ } else if (anon_pages(amp->ahp, aindx,
+ maxpages)) {
+ panic("segvn_fault_vnodepages:"
+ " non empty anon slots\n");
+ } else {
+ SEGVN_VMSTAT_FLTVNPAGES(7);
+ anon_array_exit(&an_cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ }
+ }
+ ASSERT(!brkcow || IS_P2ALIGNED(a, maxpgsz));
+
+ if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) {
+ ASSERT(vpage != NULL);
+ prot = VPP_PROT(vpage);
+ ASSERT(sameprot(seg, a, maxpgsz));
+ if ((prot & protchk) == 0) {
+ SEGVN_VMSTAT_FLTVNPAGES(8);
+ err = FC_PROT;
+ goto out;
+ }
+ }
+ if (type == F_SOFTLOCK) {
+ mutex_enter(&freemem_lock);
+ if (availrmem < tune.t_minarmem + pages) {
+ mutex_exit(&freemem_lock);
+ err = FC_MAKE_ERR(ENOMEM);
+ goto out;
+ } else {
+ availrmem -= pages;
+ segvn_pages_locked += pages;
+ svd->softlockcnt += pages;
+ }
+ mutex_exit(&freemem_lock);
+ }
+
+ pplist = NULL;
+ physcontig = 0;
+ ppa[0] = NULL;
+ if (!brkcow && szc &&
+ !page_exists_physcontig(vp, off, szc,
+ segtype == MAP_PRIVATE ? ppa : NULL)) {
+ SEGVN_VMSTAT_FLTVNPAGES(9);
+ if (page_alloc_pages(seg, a, &pplist, NULL,
+ szc, 0)) {
+ SEGVN_RESTORE_SOFTLOCK(type, pages);
+ SEGVN_VMSTAT_FLTVNPAGES(10);
+ pszc = 0;
+ ierr = -1;
+ alloc_failed |= (1 << szc);
+ break;
+ }
+ if (vp->v_mpssdata == SEGVN_PAGEIO) {
+ int downsize;
+ SEGVN_VMSTAT_FLTVNPAGES(11);
+ physcontig = segvn_fill_vp_pages(svd,
+ vp, off, szc, ppa, &pplist,
+ &pszc, &downsize);
+ ASSERT(!physcontig || pplist == NULL);
+ if (!physcontig && downsize) {
+ SEGVN_RESTORE_SOFTLOCK(type,
+ pages);
+ ASSERT(pplist == NULL);
+ SEGVN_VMSTAT_FLTVNPAGES(12);
+ ierr = -1;
+ break;
+ }
+ ASSERT(!physcontig ||
+ segtype == MAP_PRIVATE ||
+ ppa[0] == NULL);
+ if (physcontig && ppa[0] == NULL) {
+ physcontig = 0;
+ }
+ }
+ } else if (!brkcow && szc && ppa[0] != NULL) {
+ SEGVN_VMSTAT_FLTVNPAGES(13);
+ ASSERT(segtype == MAP_PRIVATE);
+ physcontig = 1;
+ }
+
+ if (!physcontig) {
+ SEGVN_VMSTAT_FLTVNPAGES(14);
+ ppa[0] = NULL;
+ ierr = VOP_GETPAGE(vp, (offset_t)off, pgsz,
+ &vpprot, ppa, pgsz, seg, a, arw,
+ svd->cred);
+ if (segtype == MAP_PRIVATE) {
+ SEGVN_VMSTAT_FLTVNPAGES(15);
+ vpprot &= ~PROT_WRITE;
+ }
+ } else {
+ ASSERT(segtype == MAP_PRIVATE);
+ SEGVN_VMSTAT_FLTVNPAGES(16);
+ vpprot = PROT_ALL & ~PROT_WRITE;
+ ierr = 0;
+ }
+
+ if (ierr != 0) {
+ SEGVN_VMSTAT_FLTVNPAGES(17);
+ if (pplist != NULL) {
+ SEGVN_VMSTAT_FLTVNPAGES(18);
+ page_free_replacement_page(pplist);
+ page_create_putback(pages);
+ }
+ SEGVN_RESTORE_SOFTLOCK(type, pages);
+ if (a + pgsz <= eaddr) {
+ SEGVN_VMSTAT_FLTVNPAGES(19);
+ err = FC_MAKE_ERR(ierr);
+ goto out;
+ }
+ va.va_mask = AT_SIZE;
+ if (VOP_GETATTR(vp, &va, 0, svd->cred) != 0) {
+ SEGVN_VMSTAT_FLTVNPAGES(20);
+ err = FC_MAKE_ERR(EIO);
+ goto out;
+ }
+ if (btopr(va.va_size) >= btopr(off + pgsz)) {
+ SEGVN_VMSTAT_FLTVNPAGES(21);
+ err = FC_MAKE_ERR(EIO);
+ goto out;
+ }
+ if (btopr(va.va_size) <
+ btopr(off + (eaddr - a))) {
+ SEGVN_VMSTAT_FLTVNPAGES(22);
+ err = FC_MAKE_ERR(EIO);
+ goto out;
+ }
+ if (brkcow || type == F_SOFTLOCK) {
+ /* can't reduce map area */
+ SEGVN_VMSTAT_FLTVNPAGES(23);
+ vop_size_err = 1;
+ goto out;
+ }
+ SEGVN_VMSTAT_FLTVNPAGES(24);
+ ASSERT(szc != 0);
+ pszc = 0;
+ ierr = -1;
+ break;
+ }
+
+ if (amp != NULL) {
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
+ anon_array_enter(amp, aindx, &an_cookie);
+ }
+ if (amp != NULL &&
+ anon_get_ptr(amp->ahp, aindx) != NULL) {
+ ulong_t taindx = P2ALIGN(aindx, maxpages);
+
+ SEGVN_VMSTAT_FLTVNPAGES(25);
+ if (anon_pages(amp->ahp, taindx, maxpages) !=
+ maxpages) {
+ panic("segvn_fault_vnodepages:"
+ " empty anon slots\n");
+ }
+ for (i = 0; i < pages; i++) {
+ page_unlock(ppa[i]);
+ }
+ anon_array_exit(&an_cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ if (pplist != NULL) {
+ page_free_replacement_page(pplist);
+ page_create_putback(pages);
+ }
+ SEGVN_RESTORE_SOFTLOCK(type, pages);
+ if (szc < seg->s_szc) {
+ SEGVN_VMSTAT_FLTVNPAGES(26);
+ /*
+ * For private segments SOFTLOCK
+ * either always breaks cow (any rw
+ * type except S_READ_NOCOW) or
+ * address space is locked as writer
+ * (S_READ_NOCOW case) and anon slots
+ * can't show up on second check.
+ * Therefore if we are here for
+ * SOFTLOCK case it must be a cow
+ * break but cow break never reduces
+ * szc. Thus the assert below.
+ */
+ ASSERT(!brkcow && type != F_SOFTLOCK);
+ pszc = seg->s_szc;
+ ierr = -2;
+ break;
+ }
+ ASSERT(IS_P2ALIGNED(a, maxpgsz));
+ goto again;
+ }
+#ifdef DEBUG
+ if (amp != NULL) {
+ ulong_t taindx = P2ALIGN(aindx, maxpages);
+ ASSERT(!anon_pages(amp->ahp, taindx, maxpages));
+ }
+#endif /* DEBUG */
+
+ if (brkcow) {
+ ASSERT(amp != NULL);
+ ASSERT(pplist == NULL);
+ ASSERT(szc == seg->s_szc);
+ ASSERT(IS_P2ALIGNED(a, maxpgsz));
+ ASSERT(IS_P2ALIGNED(aindx, maxpages));
+ SEGVN_VMSTAT_FLTVNPAGES(27);
+ ierr = anon_map_privatepages(amp, aindx, szc,
+ seg, a, prot, ppa, vpage, segvn_anypgsz,
+ svd->cred);
+ if (ierr != 0) {
+ SEGVN_VMSTAT_FLTVNPAGES(28);
+ anon_array_exit(&an_cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ SEGVN_RESTORE_SOFTLOCK(type, pages);
+ err = FC_MAKE_ERR(ierr);
+ goto out;
+ }
+
+ ASSERT(!IS_VMODSORT(ppa[0]->p_vnode));
+ /*
+ * p_szc can't be changed for locked
+ * swapfs pages.
+ */
+ hat_memload_array(hat, a, pgsz, ppa, prot,
+ hat_flag);
+
+ if (!(hat_flag & HAT_LOAD_LOCK)) {
+ SEGVN_VMSTAT_FLTVNPAGES(29);
+ for (i = 0; i < pages; i++) {
+ page_unlock(ppa[i]);
+ }
+ }
+ anon_array_exit(&an_cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ goto next;
+ }
+
+ pfn = page_pptonum(ppa[0]);
+ /*
+			 * hat_page_demote() needs an EXCL lock on one of
+			 * constituent page_t's and it decreases root's p_szc
+			 * last. This means that if root's p_szc is equal to
+			 * szc and all its constituent pages are locked,
+			 * hat_page_demote() that could have changed p_szc to
+			 * szc is already done and no new hat_page_demote()
+			 * can start for this large page.
+ */
+
+ /*
+ * we need to make sure same mapping size is used for
+ * the same address range if there's a possibility the
+			 * address is already mapped because hat layer panics
+ * when translation is loaded for the range already
+ * mapped with a different page size. We achieve it
+ * by always using largest page size possible subject
+ * to the constraints of page size, segment page size
+ * and page alignment. Since mappings are invalidated
+ * when those constraints change and make it
+ * impossible to use previously used mapping size no
+ * mapping size conflicts should happen.
+ */
+
+ chkszc:
+ if ((pszc = ppa[0]->p_szc) == szc &&
+ IS_P2ALIGNED(pfn, pages)) {
+
+ SEGVN_VMSTAT_FLTVNPAGES(30);
+#ifdef DEBUG
+ for (i = 0; i < pages; i++) {
+ ASSERT(PAGE_LOCKED(ppa[i]));
+ ASSERT(!PP_ISFREE(ppa[i]));
+ ASSERT(page_pptonum(ppa[i]) ==
+ pfn + i);
+ ASSERT(ppa[i]->p_szc == szc);
+ ASSERT(ppa[i]->p_vnode == vp);
+ ASSERT(ppa[i]->p_offset ==
+ off + (i << PAGESHIFT));
+ }
+#endif
+ /*
+ * All pages are of szc we need and they are
+ * all locked so they can't change szc. load
+ * translations.
+ *
+ * if page got promoted since last check
+ * we don't need pplist.
+ */
+ if (pplist != NULL) {
+ page_free_replacement_page(pplist);
+ page_create_putback(pages);
+ }
+ if (PP_ISMIGRATE(ppa[0])) {
+ page_migrate(seg, a, ppa, pages);
+ }
+ SEGVN_UPDATE_MODBITS(ppa, pages, rw,
+ prot, vpprot);
+ if (!xhat) {
+ hat_memload_array(hat, a, pgsz, ppa,
+ prot & vpprot, hat_flag);
+ } else {
+ /*
+ * avoid large xhat mappings to FS
+ * pages so that hat_page_demote()
+ * doesn't need to check for xhat
+ * large mappings.
+ */
+ for (i = 0; i < pages; i++) {
+ hat_memload(hat,
+ a + (i << PAGESHIFT),
+ ppa[i], prot & vpprot,
+ hat_flag);
+ }
+ }
+
+ if (!(hat_flag & HAT_LOAD_LOCK)) {
+ for (i = 0; i < pages; i++) {
+ page_unlock(ppa[i]);
+ }
+ }
+ if (amp != NULL) {
+ anon_array_exit(&an_cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ }
+ goto next;
+ }
+
+ /*
+ * See if upsize is possible.
+ */
+ if (pszc > szc && szc < seg->s_szc &&
+ (segvn_anypgsz_vnode || pszc >= seg->s_szc)) {
+ pgcnt_t aphase;
+ uint_t pszc1 = MIN(pszc, seg->s_szc);
+ ppgsz = page_get_pagesize(pszc1);
+ ppages = btop(ppgsz);
+ aphase = btop(P2PHASE((uintptr_t)a, ppgsz));
+
+ SEGVN_VMSTAT_FLTVNPAGES(31);
+ if (aphase != P2PHASE(pfn, ppages)) {
+ segvn_faultvnmpss_align_err4++;
+ } else if (type == F_SOFTLOCK &&
+ a != lpgaddr &&
+ !IS_P2ALIGNED(pfn,
+ page_get_pagecnt(ppa[0]->p_szc))) {
+ /*
+					 * if we locked previous offsets for a
+					 * smaller szc page, a larger page can't
+ * be here since one needs excl locks
+ * to promote page size.
+ */
+ panic("segvn_fault_vnodepages: "
+ "unexpected larger than szc page"
+ " found after SOFTLOCK");
+ } else {
+ SEGVN_VMSTAT_FLTVNPAGES(32);
+ if (pplist != NULL) {
+ page_t *pl = pplist;
+ page_free_replacement_page(pl);
+ page_create_putback(pages);
+ }
+ for (i = 0; i < pages; i++) {
+ page_unlock(ppa[i]);
+ }
+ if (amp != NULL) {
+ anon_array_exit(&an_cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ }
+ SEGVN_RESTORE_SOFTLOCK(type, pages);
+ pszc = pszc1;
+ ierr = -2;
+ break;
+ }
+ }
+
+ /*
+ * check if we should use smallest mapping size.
+ */
+ upgrdfail = 0;
+ if (szc == 0 || xhat ||
+ (pszc >= szc &&
+ !IS_P2ALIGNED(pfn, pages)) ||
+ (pszc < szc &&
+ !segvn_full_szcpages(ppa, szc, &upgrdfail,
+ &pszc))) {
+
+ if (upgrdfail) {
+ /*
+ * segvn_full_szcpages failed to lock
+ * all pages EXCL. Size down.
+ */
+ ASSERT(pszc < szc);
+
+ SEGVN_VMSTAT_FLTVNPAGES(33);
+
+ if (pplist != NULL) {
+ page_t *pl = pplist;
+ page_free_replacement_page(pl);
+ page_create_putback(pages);
+ }
+
+ for (i = 0; i < pages; i++) {
+ page_unlock(ppa[i]);
+ }
+ if (amp != NULL) {
+ anon_array_exit(&an_cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ }
+ SEGVN_RESTORE_SOFTLOCK(type, pages);
+ ierr = -1;
+ break;
+ }
+ if (szc != 0 && !xhat) {
+ segvn_faultvnmpss_align_err5++;
+ }
+ SEGVN_VMSTAT_FLTVNPAGES(34);
+ if (pplist != NULL) {
+ page_free_replacement_page(pplist);
+ page_create_putback(pages);
+ }
+ SEGVN_UPDATE_MODBITS(ppa, pages, rw,
+ prot, vpprot);
+ for (i = 0; i < pages; i++) {
+ hat_memload(hat, a + (i << PAGESHIFT),
+ ppa[i], prot & vpprot, hat_flag);
+ }
+ if (!(hat_flag & HAT_LOAD_LOCK)) {
+ for (i = 0; i < pages; i++) {
+ page_unlock(ppa[i]);
+ }
+ }
+ if (amp != NULL) {
+ anon_array_exit(&an_cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ }
+ goto next;
+ }
+
+ if (pszc == szc) {
+ /*
+ * segvn_full_szcpages() upgraded pages szc.
+ */
+ ASSERT(pszc == ppa[0]->p_szc);
+ ASSERT(IS_P2ALIGNED(pfn, pages));
+ goto chkszc;
+ }
+
+ if (pszc > szc) {
+ kmutex_t *szcmtx;
+ SEGVN_VMSTAT_FLTVNPAGES(35);
+ /*
+ * p_szc of ppa[0] can change since we haven't
+ * locked all constituent pages. Call
+				 * page_szc_lock() to prevent szc changes.
+ * This should be a rare case that happens when
+ * multiple segments use a different page size
+ * to map the same file offsets.
+ */
+ szcmtx = page_szc_lock(ppa[0]);
+ pszc = ppa[0]->p_szc;
+ ASSERT(szcmtx != NULL || pszc == 0);
+ ASSERT(ppa[0]->p_szc <= pszc);
+ if (pszc <= szc) {
+ SEGVN_VMSTAT_FLTVNPAGES(36);
+ if (szcmtx != NULL) {
+ mutex_exit(szcmtx);
+ }
+ goto chkszc;
+ }
+ if (pplist != NULL) {
+ /*
+ * page got promoted since last check.
+					 * we don't need the preallocated large
+ * page.
+ */
+ SEGVN_VMSTAT_FLTVNPAGES(37);
+ page_free_replacement_page(pplist);
+ page_create_putback(pages);
+ }
+ SEGVN_UPDATE_MODBITS(ppa, pages, rw,
+ prot, vpprot);
+ hat_memload_array(hat, a, pgsz, ppa,
+ prot & vpprot, hat_flag);
+ mutex_exit(szcmtx);
+ if (!(hat_flag & HAT_LOAD_LOCK)) {
+ for (i = 0; i < pages; i++) {
+ page_unlock(ppa[i]);
+ }
+ }
+ if (amp != NULL) {
+ anon_array_exit(&an_cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ }
+ goto next;
+ }
+
+ /*
+ * if page got demoted since last check
+ * we could have not allocated larger page.
+ * allocate now.
+ */
+ if (pplist == NULL &&
+ page_alloc_pages(seg, a, &pplist, NULL, szc, 0)) {
+ SEGVN_VMSTAT_FLTVNPAGES(38);
+ for (i = 0; i < pages; i++) {
+ page_unlock(ppa[i]);
+ }
+ if (amp != NULL) {
+ anon_array_exit(&an_cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ }
+ SEGVN_RESTORE_SOFTLOCK(type, pages);
+ ierr = -1;
+ alloc_failed |= (1 << szc);
+ break;
+ }
+
+ SEGVN_VMSTAT_FLTVNPAGES(39);
+
+ segvn_relocate_pages(ppa, pplist);
+
+ SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot);
+ hat_memload_array(hat, a, pgsz, ppa, prot & vpprot,
+ hat_flag);
+ if (!(hat_flag & HAT_LOAD_LOCK)) {
+ for (i = 0; i < pages; i++) {
+ ASSERT(PAGE_SHARED(ppa[i]));
+ page_unlock(ppa[i]);
+ }
+ }
+ if (amp != NULL) {
+ anon_array_exit(&an_cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ }
+
+ next:
+ if (vpage != NULL) {
+ vpage += pages;
+ }
+ adjszc_chk = 1;
+ }
+ if (a == lpgeaddr)
+ break;
+ ASSERT(a < lpgeaddr);
+ /*
+		 * ierr == -1 means we failed to map with a large page
+		 * (either due to allocation/relocation failures or
+		 * misalignment with other mappings to this file).
+		 *
+		 * ierr == -2 means some other thread allocated a large page
+		 * after we gave up trying to map with a large page. retry
+		 * with a larger mapping.
+ */
+ ASSERT(ierr == -1 || ierr == -2);
+ ASSERT(ierr == -2 || szc != 0);
+ ASSERT(ierr == -1 || szc < seg->s_szc);
+ if (ierr == -2) {
+ SEGVN_VMSTAT_FLTVNPAGES(40);
+ ASSERT(pszc > szc && pszc <= seg->s_szc);
+ szc = pszc;
+ } else if (segvn_anypgsz_vnode) {
+ SEGVN_VMSTAT_FLTVNPAGES(41);
+ szc--;
+ } else {
+ SEGVN_VMSTAT_FLTVNPAGES(42);
+ ASSERT(pszc < szc);
+ /*
+ * other process created pszc large page.
+ * but we still have to drop to 0 szc.
+ */
+ szc = 0;
+ }
+
+ pgsz = page_get_pagesize(szc);
+ pages = btop(pgsz);
+ ASSERT(type != F_SOFTLOCK || ierr == -1 ||
+ (IS_P2ALIGNED(a, pgsz) && IS_P2ALIGNED(lpgeaddr, pgsz)));
+ if (type == F_SOFTLOCK) {
+ /*
+ * For softlocks we cannot reduce the fault area
+ * (calculated based on the largest page size for this
+			 * segment) for size downs and a is already aligned
+			 * to the next page size as asserted above for size
+			 * ups. Therefore just continue in case of softlock.
+ */
+ SEGVN_VMSTAT_FLTVNPAGES(43);
+ continue; /* keep lint happy */
+ } else if (ierr == -2) {
+
+ /*
+ * Size up case. Note lpgaddr may only be needed for
+ * softlock case so we don't adjust it here.
+ */
+ a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz);
+ ASSERT(a >= lpgaddr);
+ lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz);
+ off = svd->offset + (uintptr_t)(a - seg->s_base);
+ aindx = svd->anon_index + seg_page(seg, a);
+ vpage = (svd->vpage != NULL) ?
+ &svd->vpage[seg_page(seg, a)] : NULL;
+ } else {
+ /*
+ * Size down case. Note lpgaddr may only be needed for
+ * softlock case so we don't adjust it here.
+ */
+ ASSERT(IS_P2ALIGNED(a, pgsz));
+ ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz));
+ lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz);
+ ASSERT(a < lpgeaddr);
+ if (a < addr) {
+ SEGVN_VMSTAT_FLTVNPAGES(44);
+ /*
+ * The beginning of the large page region can
+ * be pulled to the right to make a smaller
+ * region. We haven't yet faulted a single
+ * page.
+ */
+ a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz);
+ ASSERT(a >= lpgaddr);
+ off = svd->offset +
+ (uintptr_t)(a - seg->s_base);
+ aindx = svd->anon_index + seg_page(seg, a);
+ vpage = (svd->vpage != NULL) ?
+ &svd->vpage[seg_page(seg, a)] : NULL;
+ }
+ }
+ }
+out:
+ kmem_free(ppa, ppasize);
+ if (!err && !vop_size_err) {
+ SEGVN_VMSTAT_FLTVNPAGES(45);
+ return (0);
+ }
+ if (type == F_SOFTLOCK && a > lpgaddr) {
+ SEGVN_VMSTAT_FLTVNPAGES(46);
+ segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER);
+ }
+ if (!vop_size_err) {
+ SEGVN_VMSTAT_FLTVNPAGES(47);
+ return (err);
+ }
+ ASSERT(brkcow || type == F_SOFTLOCK);
+ /*
+ * Large page end is mapped beyond the end of file and it's a cow
+ * fault or softlock so we can't reduce the map area. For now just
+ * demote the segment. This should really only happen if the end of
+ * the file changed after the mapping was established since when large
+ * page segments are created we make sure they don't extend beyond the
+ * end of the file.
+ */
+ SEGVN_VMSTAT_FLTVNPAGES(48);
+
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
+ err = 0;
+ if (seg->s_szc != 0) {
+ err = segvn_clrszc(seg);
+ if (err != 0) {
+ segvn_fltvnpages_clrszc_err++;
+ }
+ }
+ ASSERT(err || seg->s_szc == 0);
+ SEGVN_LOCK_DOWNGRADE(seg->s_as, &svd->lock);
+ /* segvn_fault will do its job as if szc had been zero to begin with */
+ return (err == 0 ? IE_RETRY : FC_MAKE_ERR(err));
+}
+
+/*
+ * This routine will attempt to fault in one large page.
+ * it will use smaller pages if that fails.
+ * It should only be called for pure anonymous segments.
+ */
+static faultcode_t
+segvn_fault_anonpages(struct hat *hat, struct seg *seg, caddr_t lpgaddr,
+ caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr,
+ caddr_t eaddr, int brkcow)
+{
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+ struct anon_map *amp = svd->amp;
+ uchar_t segtype = svd->type;
+ uint_t szc = seg->s_szc;
+ size_t pgsz = page_get_pagesize(szc);
+ size_t maxpgsz = pgsz;
+ pgcnt_t pages = btop(pgsz);
+ size_t ppasize = pages * sizeof (page_t *);
+ caddr_t a = lpgaddr;
+ ulong_t aindx = svd->anon_index + seg_page(seg, a);
+ struct vpage *vpage = (svd->vpage != NULL) ?
+ &svd->vpage[seg_page(seg, a)] : NULL;
+ page_t **ppa;
+ uint_t ppa_szc;
+ faultcode_t err;
+ int ierr;
+ uint_t protchk, prot, vpprot;
+ int i;
+ int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD;
+ anon_sync_obj_t cookie;
+
+ ASSERT(szc != 0);
+ ASSERT(amp != NULL);
+ ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */
+ ASSERT(!(svd->flags & MAP_NORESERVE));
+ ASSERT(type != F_SOFTUNLOCK);
+ ASSERT(segtype == MAP_PRIVATE);
+ ASSERT(IS_P2ALIGNED(a, maxpgsz));
+
+ ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock));
+
+ VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltanpages[0]);
+ VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltanpages[1]);
+
+ if (svd->flags & MAP_TEXT) {
+ hat_flag |= HAT_LOAD_TEXT;
+ }
+
+ if (svd->pageprot) {
+ switch (rw) {
+ case S_READ:
+ protchk = PROT_READ;
+ break;
+ case S_WRITE:
+ protchk = PROT_WRITE;
+ break;
+ case S_EXEC:
+ protchk = PROT_EXEC;
+ break;
+ case S_OTHER:
+ default:
+ protchk = PROT_READ | PROT_WRITE | PROT_EXEC;
+ break;
+ }
+ VM_STAT_ADD(segvnvmstats.fltanpages[2]);
+ } else {
+ prot = svd->prot;
+ /* caller has already done segment level protection check. */
+ }
+
+ ppa = kmem_alloc(ppasize, KM_SLEEP);
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
+ for (;;) {
+ for (; a < lpgeaddr; a += pgsz, aindx += pages) {
+ if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) {
+ VM_STAT_ADD(segvnvmstats.fltanpages[3]);
+ ASSERT(vpage != NULL);
+ prot = VPP_PROT(vpage);
+ ASSERT(sameprot(seg, a, maxpgsz));
+ if ((prot & protchk) == 0) {
+ err = FC_PROT;
+ goto error;
+ }
+ }
+ if (type == F_SOFTLOCK) {
+ mutex_enter(&freemem_lock);
+ if (availrmem < tune.t_minarmem + pages) {
+ mutex_exit(&freemem_lock);
+ err = FC_MAKE_ERR(ENOMEM);
+ goto error;
+ } else {
+ availrmem -= pages;
+ segvn_pages_locked += pages;
+ svd->softlockcnt += pages;
+ }
+ mutex_exit(&freemem_lock);
+ }
+ anon_array_enter(amp, aindx, &cookie);
+ ppa_szc = (uint_t)-1;
+ ierr = anon_map_getpages(amp, aindx, szc, seg, a,
+ prot, &vpprot, ppa, &ppa_szc, vpage, rw, brkcow,
+ segvn_anypgsz, svd->cred);
+ if (ierr != 0) {
+ anon_array_exit(&cookie);
+ VM_STAT_ADD(segvnvmstats.fltanpages[4]);
+ if (type == F_SOFTLOCK) {
+ VM_STAT_ADD(segvnvmstats.fltanpages[5]);
+ mutex_enter(&freemem_lock);
+ availrmem += pages;
+ segvn_pages_locked -= pages;
+ svd->softlockcnt -= pages;
+ mutex_exit(&freemem_lock);
+ }
+ if (ierr > 0) {
+ VM_STAT_ADD(segvnvmstats.fltanpages[6]);
+ err = FC_MAKE_ERR(ierr);
+ goto error;
+ }
+ break;
+ }
+
+ ASSERT(!IS_VMODSORT(ppa[0]->p_vnode));
+
+ /*
+ * Handle pages that have been marked for migration
+ */
+ if (lgrp_optimizations())
+ page_migrate(seg, a, ppa, pages);
+
+ hat_memload_array(hat, a, pgsz, ppa,
+ prot & vpprot, hat_flag);
+
+ if (hat_flag & HAT_LOAD_LOCK) {
+ VM_STAT_ADD(segvnvmstats.fltanpages[7]);
+ } else {
+ VM_STAT_ADD(segvnvmstats.fltanpages[8]);
+ for (i = 0; i < pages; i++)
+ page_unlock(ppa[i]);
+ }
+ if (vpage != NULL)
+ vpage += pages;
+
+ anon_array_exit(&cookie);
+ }
+ if (a == lpgeaddr)
+ break;
+ ASSERT(a < lpgeaddr);
+ /*
+ * ierr == -1 means we failed to allocate a large page,
+ * so do a size down operation.
+ *
+ * ierr == -2 means some other process that privately shares
+ * pages with this process has allocated a larger page and we
+ * need to retry with larger pages. So do a size up
+ * operation. This relies on the fact that large pages are
+ * never partially shared, i.e. if we share any constituent
+ * page of a large page with another process we must share the
+ * entire large page. Note this cannot happen for the SOFTLOCK
+ * case, unless the current address (a) is at the beginning of
+ * the next page size boundary, because the other process
+ * couldn't have relocated locked pages.
+ */
+ ASSERT(ierr == -1 || ierr == -2);
+ if (segvn_anypgsz) {
+ ASSERT(ierr == -2 || szc != 0);
+ ASSERT(ierr == -1 || szc < seg->s_szc);
+ szc = (ierr == -1) ? szc - 1 : szc + 1;
+ } else {
+ /*
+ * For non-COW faults and segvn_anypgsz == 0
+ * we need to be careful not to loop forever
+ * if existing page is found with szc other
+ * than 0 or seg->s_szc. This could be due
+ * to page relocations on behalf of DR or
+ * more likely large page creation. For this
+ * case simply re-size to existing page's szc
+ * if returned by anon_map_getpages().
+ */
+ if (ppa_szc == (uint_t)-1) {
+ szc = (ierr == -1) ? 0 : seg->s_szc;
+ } else {
+ ASSERT(ppa_szc <= seg->s_szc);
+ ASSERT(ierr == -2 || ppa_szc < szc);
+ ASSERT(ierr == -1 || ppa_szc > szc);
+ szc = ppa_szc;
+ }
+ }
+
+ pgsz = page_get_pagesize(szc);
+ pages = btop(pgsz);
+ ASSERT(type != F_SOFTLOCK || ierr == -1 ||
+ (IS_P2ALIGNED(a, pgsz) && IS_P2ALIGNED(lpgeaddr, pgsz)));
+ if (type == F_SOFTLOCK) {
+ /*
+ * For softlocks we cannot reduce the fault area
+ * (calculated based on the largest page size for this
+ * segment) in the size down case, and in the size up
+ * case a is already aligned to the next page size, as
+ * asserted above. Therefore just continue in the
+ * softlock case.
+ */
+ VM_STAT_ADD(segvnvmstats.fltanpages[9]);
+ continue; /* keep lint happy */
+ } else if (ierr == -2) {
+
+ /*
+ * Size up case. Note lpgaddr may only be needed for
+ * softlock case so we don't adjust it here.
+ */
+ VM_STAT_ADD(segvnvmstats.fltanpages[10]);
+ a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz);
+ ASSERT(a >= lpgaddr);
+ lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz);
+ aindx = svd->anon_index + seg_page(seg, a);
+ vpage = (svd->vpage != NULL) ?
+ &svd->vpage[seg_page(seg, a)] : NULL;
+ } else {
+ /*
+ * Size down case. Note lpgaddr may only be needed for
+ * softlock case so we don't adjust it here.
+ */
+ VM_STAT_ADD(segvnvmstats.fltanpages[11]);
+ ASSERT(IS_P2ALIGNED(a, pgsz));
+ ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz));
+ lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz);
+ ASSERT(a < lpgeaddr);
+ if (a < addr) {
+ /*
+ * The beginning of the large page region can
+ * be pulled to the right to make a smaller
+ * region. We haven't yet faulted a single
+ * page.
+ */
+ VM_STAT_ADD(segvnvmstats.fltanpages[12]);
+ a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz);
+ ASSERT(a >= lpgaddr);
+ aindx = svd->anon_index + seg_page(seg, a);
+ vpage = (svd->vpage != NULL) ?
+ &svd->vpage[seg_page(seg, a)] : NULL;
+ }
+ }
+ }
+ VM_STAT_ADD(segvnvmstats.fltanpages[13]);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ kmem_free(ppa, ppasize);
+ return (0);
+error:
+ VM_STAT_ADD(segvnvmstats.fltanpages[14]);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ kmem_free(ppa, ppasize);
+ if (type == F_SOFTLOCK && a > lpgaddr) {
+ VM_STAT_ADD(segvnvmstats.fltanpages[15]);
+ segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER);
+ }
+ return (err);
+}
+
+int fltadvice = 1; /* set to free behind pages for sequential access */
+
+/*
+ * This routine is called via a machine specific fault handling routine.
+ * It is also called by software routines wishing to lock or unlock
+ * a range of addresses.
+ *
+ * Here is the basic algorithm:
+ * If unlocking
+ * Call segvn_softunlock
+ * Return
+ * endif
+ * Checking and set up work
+ * If we will need some non-anonymous pages
+ * Call VOP_GETPAGE over the range of non-anonymous pages
+ * endif
+ * Loop over all addresses requested
+ * Call segvn_faultpage passing in page list
+ * to load up translations and handle anonymous pages
+ * endloop
+ * Load up translation to any additional pages in page list not
+ * already handled that fit into this segment
+ */
+static faultcode_t
+segvn_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len,
+ enum fault_type type, enum seg_rw rw)
+{
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+ page_t **plp, **ppp, *pp;
+ u_offset_t off;
+ caddr_t a;
+ struct vpage *vpage;
+ uint_t vpprot, prot;
+ int err;
+ page_t *pl[PVN_GETPAGE_NUM + 1];
+ size_t plsz, pl_alloc_sz;
+ size_t page;
+ ulong_t anon_index;
+ struct anon_map *amp;
+ int dogetpage = 0;
+ caddr_t lpgaddr, lpgeaddr;
+ size_t pgsz;
+ anon_sync_obj_t cookie;
+ int brkcow = BREAK_COW_SHARE(rw, type, svd->type);
+
+ /*
+ * S_READ_NOCOW is like S_READ except that the caller advises
+ * there is no need to copy-on-write for softlock because it
+ * holds the address space locked as writer and thus prevents
+ * any copy-on-write of a softlocked page by another thread.
+ * The S_READ_NOCOW vs S_READ distinction is only needed for
+ * BREAK_COW_SHARE(); after that we treat S_READ_NOCOW as just
+ * S_READ.
+ */
+ if (rw == S_READ_NOCOW) {
+ rw = S_READ;
+ ASSERT(type == F_SOFTLOCK &&
+ AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
+ }
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ /*
+ * First handle the easy stuff
+ */
+ if (type == F_SOFTUNLOCK) {
+ SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
+ pgsz = (seg->s_szc == 0) ? PAGESIZE :
+ page_get_pagesize(seg->s_szc);
+ VM_STAT_COND_ADD(pgsz > PAGESIZE, segvnvmstats.fltanpages[16]);
+ CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr);
+ segvn_softunlock(seg, lpgaddr, lpgeaddr - lpgaddr, rw);
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (0);
+ }
+
+top:
+ SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
+
+ /*
+ * If we have the same protections for the entire segment,
+ * ensure that the access being attempted is legitimate.
+ */
+
+ if (svd->pageprot == 0) {
+ uint_t protchk;
+
+ switch (rw) {
+ case S_READ:
+ protchk = PROT_READ;
+ break;
+ case S_WRITE:
+ protchk = PROT_WRITE;
+ break;
+ case S_EXEC:
+ protchk = PROT_EXEC;
+ break;
+ case S_OTHER:
+ default:
+ protchk = PROT_READ | PROT_WRITE | PROT_EXEC;
+ break;
+ }
+
+ if ((svd->prot & protchk) == 0) {
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (FC_PROT); /* illegal access type */
+ }
+ }
+
+ /*
+ * Check to see if we need to allocate an anon_map structure.
+ */
+ if (svd->amp == NULL && (svd->vp == NULL || brkcow)) {
+ /*
+ * Drop the "read" lock on the segment and acquire
+ * the "write" version since we have to allocate the
+ * anon_map.
+ */
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
+
+ if (svd->amp == NULL) {
+ svd->amp = anonmap_alloc(seg->s_size, 0);
+ svd->amp->a_szc = seg->s_szc;
+ }
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+
+ /*
+ * Start all over again since segment protections
+ * may have changed after we dropped the "read" lock.
+ */
+ goto top;
+ }
+
+ amp = svd->amp;
+
+ /*
+ * MADV_SEQUENTIAL work is ignored for large page segments.
+ */
+ if (seg->s_szc != 0) {
+ pgsz = page_get_pagesize(seg->s_szc);
+ ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock));
+ /*
+ * We may need to do relocations so purge seg_pcache to allow
+ * pages to be locked exclusively.
+ */
+ if (svd->softlockcnt != 0)
+ segvn_purge(seg);
+ CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr);
+ if (svd->vp == NULL) {
+ ASSERT(svd->type == MAP_PRIVATE);
+ err = segvn_fault_anonpages(hat, seg, lpgaddr,
+ lpgeaddr, type, rw, addr, addr + len, brkcow);
+ } else {
+ err = segvn_fault_vnodepages(hat, seg, lpgaddr,
+ lpgeaddr, type, rw, addr, addr + len, brkcow);
+ if (err == IE_RETRY) {
+ ASSERT(seg->s_szc == 0);
+ ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock));
+ goto cont;
+ }
+ }
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (err);
+ }
+
+cont:
+ page = seg_page(seg, addr);
+ if (amp != NULL) {
+ anon_index = svd->anon_index + page;
+
+ if ((type == F_PROT) && (rw == S_READ) &&
+ svd->type == MAP_PRIVATE && svd->pageprot == 0) {
+ size_t index = anon_index;
+ struct anon *ap;
+
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
+ /*
+ * The fast path could apply to S_WRITE also, except
+ * that the protection fault could be caused by a lazy
+ * TLB flush when going from RO to RW. In this case the
+ * PTE is already RW, but the stale RO entry in another
+ * CPU's TLB causes the fault. Since hat_chgprot won't
+ * do anything if the PTE doesn't change, we may end up
+ * faulting indefinitely until the RO TLB entry gets
+ * replaced.
+ */
+ for (a = addr; a < addr + len; a += PAGESIZE, index++) {
+ anon_array_enter(amp, index, &cookie);
+ ap = anon_get_ptr(amp->ahp, index);
+ anon_array_exit(&cookie);
+ if ((ap == NULL) || (ap->an_refcnt != 1)) {
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ goto slow;
+ }
+ }
+ hat_chgprot(seg->s_as->a_hat, addr, len, svd->prot);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (0);
+ }
+ }
+slow:
+
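+ /*
+ * General path: set up per-page state, optionally free behind for
+ * MADV_SEQUENTIAL, call VOP_GETPAGE() for any vnode backed pages and
+ * then fault in each page in the range via segvn_faultpage().
+ */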
+ if (svd->vpage == NULL)
+ vpage = NULL;
+ else
+ vpage = &svd->vpage[page];
+
+ off = svd->offset + (uintptr_t)(addr - seg->s_base);
+
+ /*
+ * If MADV_SEQUENTIAL has been set for the particular page we
+ * are faulting on, free behind all pages in the segment and put
+ * them on the free list.
+ */
+ if ((page != 0) && fltadvice) { /* not if first page in segment */
+ struct vpage *vpp;
+ ulong_t fanon_index;
+ size_t fpage;
+ u_offset_t pgoff, fpgoff;
+ struct vnode *fvp;
+ struct anon *fap = NULL;
+
+ if (svd->advice == MADV_SEQUENTIAL ||
+ (svd->pageadvice &&
+ VPP_ADVICE(vpage) == MADV_SEQUENTIAL)) {
+ pgoff = off - PAGESIZE;
+ fpage = page - 1;
+ if (vpage != NULL)
+ vpp = &svd->vpage[fpage];
+ if (amp != NULL)
+ fanon_index = svd->anon_index + fpage;
+
+ while (pgoff > svd->offset) {
+ if (svd->advice != MADV_SEQUENTIAL &&
+ (!svd->pageadvice || (vpage &&
+ VPP_ADVICE(vpp) != MADV_SEQUENTIAL)))
+ break;
+
+ /*
+ * If this is an anon page, we must find the
+ * correct <vp, offset> for it
+ */
+ fap = NULL;
+ if (amp != NULL) {
+ ANON_LOCK_ENTER(&amp->a_rwlock,
+ RW_READER);
+ anon_array_enter(amp, fanon_index,
+ &cookie);
+ fap = anon_get_ptr(amp->ahp,
+ fanon_index);
+ if (fap != NULL) {
+ swap_xlate(fap, &fvp, &fpgoff);
+ } else {
+ fpgoff = pgoff;
+ fvp = svd->vp;
+ }
+ anon_array_exit(&cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ } else {
+ fpgoff = pgoff;
+ fvp = svd->vp;
+ }
+ if (fvp == NULL)
+ break; /* XXX */
+ /*
+ * Skip pages that are free or have an
+ * "exclusive" lock.
+ */
+ pp = page_lookup_nowait(fvp, fpgoff, SE_SHARED);
+ if (pp == NULL)
+ break;
+ /*
+ * We don't need the page_struct_lock to test
+ * as this is only advisory; even if we
+ * acquire it someone might race in and lock
+ * the page after we unlock and before the
+ * PUTPAGE, then VOP_PUTPAGE will do nothing.
+ */
+ if (pp->p_lckcnt == 0 && pp->p_cowcnt == 0) {
+ /*
+ * Hold the vnode before releasing
+ * the page lock to prevent it from
+ * being freed and re-used by some
+ * other thread.
+ */
+ VN_HOLD(fvp);
+ page_unlock(pp);
+ /*
+ * We should build a page list
+ * to kluster putpages XXX
+ */
+ (void) VOP_PUTPAGE(fvp,
+ (offset_t)fpgoff, PAGESIZE,
+ (B_DONTNEED|B_FREE|B_ASYNC),
+ svd->cred);
+ VN_RELE(fvp);
+ } else {
+ /*
+ * XXX - Should the loop terminate if
+ * the page is `locked'?
+ */
+ page_unlock(pp);
+ }
+ --vpp;
+ --fanon_index;
+ pgoff -= PAGESIZE;
+ }
+ }
+ }
+
+ plp = pl;
+ *plp = NULL;
+ pl_alloc_sz = 0;
+
+ /*
+ * See if we need to call VOP_GETPAGE for
+ * *any* of the range being faulted on.
+ * We can skip all of this work if there
+ * was no original vnode.
+ */
+ if (svd->vp != NULL) {
+ u_offset_t vp_off;
+ size_t vp_len;
+ struct anon *ap;
+ vnode_t *vp;
+
+ vp_off = off;
+ vp_len = len;
+
+ if (amp == NULL)
+ dogetpage = 1;
+ else {
+ /*
+ * Only acquire reader lock to prevent amp->ahp
+ * from being changed. It's ok to miss pages,
+ * hence we don't do anon_array_enter
+ */
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
+ ap = anon_get_ptr(amp->ahp, anon_index);
+
+ if (len <= PAGESIZE)
+ /* inline non_anon() */
+ dogetpage = (ap == NULL);
+ else
+ dogetpage = non_anon(amp->ahp, anon_index,
+ &vp_off, &vp_len);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ }
+
+ if (dogetpage) {
+ enum seg_rw arw;
+ struct as *as = seg->s_as;
+
+ if (len > ptob((sizeof (pl) / sizeof (pl[0])) - 1)) {
+ /*
+ * Page list won't fit in local array,
+ * allocate one of the needed size.
+ */
+ pl_alloc_sz =
+ (btop(len) + 1) * sizeof (page_t *);
+ plp = kmem_alloc(pl_alloc_sz, KM_SLEEP);
+ plp[0] = NULL;
+ plsz = len;
+ } else if (rw == S_WRITE && svd->type == MAP_PRIVATE ||
+ rw == S_OTHER ||
+ (((size_t)(addr + PAGESIZE) <
+ (size_t)(seg->s_base + seg->s_size)) &&
+ hat_probe(as->a_hat, addr + PAGESIZE))) {
+ /*
+ * Ask VOP_GETPAGE to return the exact number
+ * of pages if
+ * (a) this is a COW fault, or
+ * (b) this is a software fault, or
+ * (c) next page is already mapped.
+ */
+ plsz = len;
+ } else {
+ /*
+ * Ask VOP_GETPAGE to return adjacent pages
+ * within the segment.
+ */
+ plsz = MIN((size_t)PVN_GETPAGE_SZ, (size_t)
+ ((seg->s_base + seg->s_size) - addr));
+ ASSERT((addr + plsz) <=
+ (seg->s_base + seg->s_size));
+ }
+
+ /*
+ * Need to get some non-anonymous pages.
+ * We need to make only one call to GETPAGE to do
+ * this to prevent certain deadlocking conditions
+ * when we are doing locking. In this case
+ * non_anon() should have picked up the smallest
+ * range which includes all the non-anonymous
+ * pages in the requested range. We have to
+ * be careful regarding which rw flag to pass in
+ * because on a private mapping, the underlying
+ * object is never allowed to be written.
+ */
+ if (rw == S_WRITE && svd->type == MAP_PRIVATE) {
+ arw = S_READ;
+ } else {
+ arw = rw;
+ }
+ vp = svd->vp;
+ TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE,
+ "segvn_getpage:seg %p addr %p vp %p",
+ seg, addr, vp);
+ err = VOP_GETPAGE(vp, (offset_t)vp_off, vp_len,
+ &vpprot, plp, plsz, seg, addr + (vp_off - off), arw,
+ svd->cred);
+ if (err) {
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ segvn_pagelist_rele(plp);
+ if (pl_alloc_sz)
+ kmem_free(plp, pl_alloc_sz);
+ return (FC_MAKE_ERR(err));
+ }
+ if (svd->type == MAP_PRIVATE)
+ vpprot &= ~PROT_WRITE;
+ }
+ }
+
+ /*
+ * N.B. at this time the plp array has all the needed non-anon
+ * pages in addition to (possibly) having some adjacent pages.
+ */
+
+ /*
+ * Always acquire the anon_array_lock to prevent
+ * 2 threads from allocating separate anon slots for
+ * the same "addr".
+ *
+ * If this is a copy-on-write fault and we don't already
+ * have the anon_array_lock, acquire it to prevent the
+ * fault routine from handling multiple copy-on-write faults
+ * on the same "addr" in the same address space.
+ *
+ * Only one thread should deal with the fault since after
+ * it is handled, the other threads can acquire a translation
+ * to the newly created private page. This prevents two or
+ * more threads from creating different private pages for the
+ * same fault.
+ *
+ * We grab "serialization" lock here if this is a MAP_PRIVATE segment
+ * to prevent deadlock between this thread and another thread
+ * which has soft-locked this page and wants to acquire serial_lock.
+ * ( bug 4026339 )
+ *
+ * The fix for bug 4026339 becomes unnecessary when using the
+ * locking scheme with a per amp rwlock and a global set of hash
+ * locks, anon_array_lock. If we steal a vnode page when low
+ * on memory and upgrade the page lock through page_rename,
+ * then the page is PAGE_HANDLED and nothing needs to be done
+ * for this page after returning from segvn_faultpage.
+ *
+ * But really, the page lock should be downgraded after
+ * the stolen page is page_rename'd.
+ */
+
+ if (amp != NULL)
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
+
+ /*
+ * Ok, now loop over the address range and handle faults
+ */
+ for (a = addr; a < addr + len; a += PAGESIZE, off += PAGESIZE) {
+ err = segvn_faultpage(hat, seg, a, off, vpage, plp, vpprot,
+ type, rw, brkcow);
+ if (err) {
+ if (amp != NULL)
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ if (type == F_SOFTLOCK && a > addr)
+ segvn_softunlock(seg, addr, (a - addr),
+ S_OTHER);
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ segvn_pagelist_rele(plp);
+ if (pl_alloc_sz)
+ kmem_free(plp, pl_alloc_sz);
+ return (err);
+ }
+ if (vpage) {
+ vpage++;
+ } else if (svd->vpage) {
+ page = seg_page(seg, addr);
+ vpage = &svd->vpage[++page];
+ }
+ }
+
+ /* Didn't get pages from the underlying fs so we're done */
+ if (!dogetpage)
+ goto done;
+
+ /*
+ * Now handle any other pages in the list returned.
+ * If the page can be used, load up the translations now.
+ * Note that the for loop will only be entered if "plp"
+ * is pointing to a non-NULL page pointer which means that
+ * VOP_GETPAGE() was called and vpprot has been initialized.
+ */
+ if (svd->pageprot == 0)
+ prot = svd->prot & vpprot;
+
+
+ /*
+ * Large Files: diff should be an unsigned value because we
+ * have supported > 2GB segment sizes since 2.5.1, and when a
+ * large file of size > 2GB gets mapped into an address space
+ * the diff value can be > 2GB.
+ */
+
+ for (ppp = plp; (pp = *ppp) != NULL; ppp++) {
+ size_t diff;
+ struct anon *ap;
+ int anon_index;
+ anon_sync_obj_t cookie;
+ int hat_flag = HAT_LOAD_ADV;
+
+ if (svd->flags & MAP_TEXT) {
+ hat_flag |= HAT_LOAD_TEXT;
+ }
+
+ if (pp == PAGE_HANDLED)
+ continue;
+
+ if (pp->p_offset >= svd->offset &&
+ (pp->p_offset < svd->offset + seg->s_size)) {
+
+ diff = pp->p_offset - svd->offset;
+
+ /*
+ * Large Files: Following is the assertion
+ * validating the above cast.
+ */
+ ASSERT(svd->vp == pp->p_vnode);
+
+ page = btop(diff);
+ if (svd->pageprot)
+ prot = VPP_PROT(&svd->vpage[page]) & vpprot;
+
+ /*
+ * Prevent other threads in the address space from
+ * creating private pages (i.e., allocating anon slots)
+ * while we are in the process of loading translations
+ * to additional pages returned by the underlying
+ * object.
+ */
+ if (amp != NULL) {
+ anon_index = svd->anon_index + page;
+ anon_array_enter(amp, anon_index, &cookie);
+ ap = anon_get_ptr(amp->ahp, anon_index);
+ }
+ if ((amp == NULL) || (ap == NULL)) {
+ if (IS_VMODSORT(pp->p_vnode) ||
+ enable_mbit_wa) {
+ if (rw == S_WRITE)
+ hat_setmod(pp);
+ else if (rw != S_OTHER &&
+ !hat_ismod(pp))
+ prot &= ~PROT_WRITE;
+ }
+ /*
+ * Skip mapping read ahead pages marked
+ * for migration, so they will get migrated
+ * properly on fault
+ */
+ if ((prot & PROT_READ) && !PP_ISMIGRATE(pp)) {
+ hat_memload(hat, seg->s_base + diff,
+ pp, prot, hat_flag);
+ }
+ }
+ if (amp != NULL)
+ anon_array_exit(&cookie);
+ }
+ page_unlock(pp);
+ }
+done:
+ if (amp != NULL)
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ if (pl_alloc_sz)
+ kmem_free(plp, pl_alloc_sz);
+ return (0);
+}
+
+/*
+ * This routine is used to start I/O on pages asynchronously. XXX it will
+ * only create PAGESIZE pages. At fault time they will be relocated into
+ * larger pages.
+ */
+static faultcode_t
+segvn_faulta(struct seg *seg, caddr_t addr)
+{
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+ int err;
+ struct anon_map *amp;
+ vnode_t *vp;
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
+ if ((amp = svd->amp) != NULL) {
+ struct anon *ap;
+
+ /*
+ * Take the reader lock to prevent amp->ahp from being changed.
+ * This is advisory; it's ok to miss a page, so
+ * we don't call anon_array_enter() to lock the slot.
+ */
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
+ if ((ap = anon_get_ptr(amp->ahp,
+ svd->anon_index + seg_page(seg, addr))) != NULL) {
+
+ err = anon_getpage(&ap, NULL, NULL,
+ 0, seg, addr, S_READ, svd->cred);
+
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ if (err)
+ return (FC_MAKE_ERR(err));
+ return (0);
+ }
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ }
+
+ if (svd->vp == NULL) {
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (0); /* zfod page - do nothing now */
+ }
+
+ vp = svd->vp;
+ TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE,
+ "segvn_getpage:seg %p addr %p vp %p", seg, addr, vp);
+ err = VOP_GETPAGE(vp,
+ (offset_t)(svd->offset + (uintptr_t)(addr - seg->s_base)),
+ PAGESIZE, NULL, NULL, 0, seg, addr,
+ S_OTHER, svd->cred);
+
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ if (err)
+ return (FC_MAKE_ERR(err));
+ return (0);
+}
+
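+/*
+ * Change the protections of the range [addr, addr + len) within the
+ * segment to prot, adjusting swap reservations and page lock/cow claims
+ * where needed.
+ */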
+static int
+segvn_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
+{
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+ struct vpage *svp, *evp;
+ struct vnode *vp;
+ size_t pgsz;
+ pgcnt_t pgcnt;
+ anon_sync_obj_t cookie;
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ if ((svd->maxprot & prot) != prot)
+ return (EACCES); /* violated maxprot */
+
+ SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
+
+ /* return if prot is the same */
+ if (!svd->pageprot && svd->prot == prot) {
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (0);
+ }
+
+ /*
+ * Since we change protections we first have to flush the cache.
+ * This makes sure all the pagelock calls have to recheck
+ * protections.
+ */
+ if (svd->softlockcnt > 0) {
+ /*
+ * Since we do have the segvn writers lock nobody can fill
+ * the cache with entries belonging to this seg during
+ * the purge. The flush either succeeds or we still have
+ * pending I/Os.
+ */
+ segvn_purge(seg);
+ if (svd->softlockcnt > 0) {
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (EAGAIN);
+ }
+ }
+
+ if (seg->s_szc != 0) {
+ int err;
+ pgsz = page_get_pagesize(seg->s_szc);
+ pgcnt = pgsz >> PAGESHIFT;
+ ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
+ if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) {
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ ASSERT(seg->s_base != addr || seg->s_size != len);
+ /*
+ * If we are holding the as lock as a reader then
+ * we need to return IE_RETRY and let the as
+ * layer drop and re-acquire the lock as a writer.
+ */
+ if (AS_READ_HELD(seg->s_as, &seg->s_as->a_lock))
+ return (IE_RETRY);
+ VM_STAT_ADD(segvnvmstats.demoterange[1]);
+ err = segvn_demote_range(seg, addr, len, SDR_END);
+ if (err == 0)
+ return (IE_RETRY);
+ if (err == ENOMEM)
+ return (IE_NOMEM);
+ return (err);
+ }
+ }
+
+
+ /*
+ * If it's a private mapping and we're making it writable
+ * and no swap space has been reserved, have to reserve
+ * it all now. If it's a private mapping to a file (i.e., vp != NULL)
+ * and we're removing write permission on the entire segment and
+ * we haven't modified any pages, we can release the swap space.
+ */
+ if (svd->type == MAP_PRIVATE) {
+ if (prot & PROT_WRITE) {
+ size_t sz;
+ if (svd->swresv == 0 && !(svd->flags & MAP_NORESERVE)) {
+ if (anon_resv(seg->s_size) == 0) {
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (IE_NOMEM);
+ }
+ sz = svd->swresv = seg->s_size;
+ TRACE_3(TR_FAC_VM, TR_ANON_PROC,
+ "anon proc:%p %lu %u",
+ seg, sz, 1);
+ }
+ } else {
+ /*
+ * Swap space is released only if this segment
+ * does not map anonymous memory, since read faults
+ * on such segments still need an anon slot to read
+ * in the data.
+ */
+ if (svd->swresv != 0 && svd->vp != NULL &&
+ svd->amp == NULL && addr == seg->s_base &&
+ len == seg->s_size && svd->pageprot == 0) {
+ anon_unresv(svd->swresv);
+ svd->swresv = 0;
+ TRACE_3(TR_FAC_VM, TR_ANON_PROC,
+ "anon proc:%p %lu %u",
+ seg, 0, 0);
+ }
+ }
+ }
+
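+ /*
+ * If the change covers the entire segment and there are no per-page
+ * protections, just update the segment-level protection; otherwise
+ * update the per-page vpage protections below.
+ */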
+ if (addr == seg->s_base && len == seg->s_size && svd->pageprot == 0) {
+ if (svd->prot == prot) {
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (0); /* all done */
+ }
+ svd->prot = (uchar_t)prot;
+ } else {
+ struct anon *ap = NULL;
+ page_t *pp;
+ u_offset_t offset, off;
+ struct anon_map *amp;
+ ulong_t anon_idx = 0;
+
+ /*
+ * A vpage structure exists or else the change does not
+ * involve the entire segment. Establish a vpage structure
+ * if none is there. Then, for each page in the range,
+ * adjust its individual permissions. Note that write-
+ * enabling a MAP_PRIVATE page can affect the claims for
+ * locked down memory. Overcommitting memory terminates
+ * the operation.
+ */
+ segvn_vpage(seg);
+ if ((amp = svd->amp) != NULL) {
+ anon_idx = svd->anon_index + seg_page(seg, addr);
+ ASSERT(seg->s_szc == 0 ||
+ IS_P2ALIGNED(anon_idx, pgcnt));
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
+ }
+
+ offset = svd->offset + (uintptr_t)(addr - seg->s_base);
+ evp = &svd->vpage[seg_page(seg, addr + len)];
+
+ /*
+ * See Statement at the beginning of segvn_lockop regarding
+ * the way cowcnts and lckcnts are handled.
+ */
+ for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) {
+
+ ASSERT(seg->s_szc == 0 ||
+ (svd->vp != NULL || svd->type == MAP_PRIVATE));
+
+ if (seg->s_szc != 0 && svd->type == MAP_PRIVATE) {
+ if (amp != NULL) {
+ anon_array_enter(amp, anon_idx,
+ &cookie);
+ }
+ if (IS_P2ALIGNED(anon_idx, pgcnt) &&
+ !segvn_claim_pages(seg, svp, offset,
+ anon_idx, prot)) {
+ if (amp != NULL) {
+ anon_array_exit(&cookie);
+ }
+ break;
+ }
+ if (amp != NULL) {
+ anon_array_exit(&cookie);
+ }
+ anon_idx++;
+ } else {
+ if (amp != NULL) {
+ anon_array_enter(amp, anon_idx,
+ &cookie);
+ ap = anon_get_ptr(amp->ahp, anon_idx++);
+ }
+
+ if (VPP_ISPPLOCK(svp) &&
+ (VPP_PROT(svp) != prot) &&
+ (svd->type == MAP_PRIVATE)) {
+
+ if (amp == NULL || ap == NULL) {
+ vp = svd->vp;
+ off = offset;
+ } else
+ swap_xlate(ap, &vp, &off);
+ if (amp != NULL)
+ anon_array_exit(&cookie);
+
+ if ((pp = page_lookup(vp, off,
+ SE_SHARED)) == NULL) {
+ panic("segvn_setprot: no page");
+ /*NOTREACHED*/
+ }
+ ASSERT(seg->s_szc == 0);
+ if ((VPP_PROT(svp) ^ prot) &
+ PROT_WRITE) {
+ if (prot & PROT_WRITE) {
+ if (!page_addclaim(pp)) {
+ page_unlock(pp);
+ break;
+ }
+ } else {
+ if (!page_subclaim(pp)) {
+ page_unlock(pp);
+ break;
+ }
+ }
+ }
+ page_unlock(pp);
+ } else if (amp != NULL)
+ anon_array_exit(&cookie);
+ }
+ VPP_SETPROT(svp, prot);
+ offset += PAGESIZE;
+ }
+ if (amp != NULL)
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+
+ /*
+ * Did we terminate prematurely? If so, simply unload
+ * the translations to the things we've updated so far.
+ */
+ if (svp != evp) {
+ len = (svp - &svd->vpage[seg_page(seg, addr)]) *
+ PAGESIZE;
+ ASSERT(seg->s_szc == 0 || IS_P2ALIGNED(len, pgsz));
+ if (len != 0)
+ hat_unload(seg->s_as->a_hat, addr,
+ len, HAT_UNLOAD);
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (IE_NOMEM);
+ }
+ }
+
+ if ((prot & PROT_WRITE) != 0 || (prot & ~PROT_USER) == PROT_NONE) {
+ /*
+ * Either private or shared data with write access (in
+ * which case we need to throw out all former translations
+ * so that we get the right translations set up on fault
+ * and we don't allow write access to any copy-on-write pages
+ * that might be around or to prevent write access to pages
+ * representing holes in a file), or we don't have permission
+ * to access the memory at all (in which case we have to
+ * unload any current translations that might exist).
+ */
+ hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD);
+ } else {
+ /*
+ * A shared mapping or a private mapping in which write
+ * protection is going to be denied - just change all the
+ * protections over the range of addresses in question.
+ * segvn does not support any other attributes other
+ * than prot so we can use hat_chgattr.
+ */
+ hat_chgattr(seg->s_as->a_hat, addr, len, prot);
+ }
+
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+
+ return (0);
+}
+
+/*
+ * segvn_setpagesize is called via SEGOP_SETPAGESIZE from as_setpagesize,
+ * to determine if the seg is capable of mapping the requested szc and,
+ * if so, to change the segment's page size to szc.
+ */
+static int
+segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
+{
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+ struct segvn_data *nsvd;
+ struct anon_map *amp = svd->amp;
+ struct seg *nseg;
+ caddr_t eaddr = addr + len, a;
+ size_t pgsz = page_get_pagesize(szc);
+ int err;
+ u_offset_t off = svd->offset + (uintptr_t)(addr - seg->s_base);
+ extern struct vnode kvp;
+
+ ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
+ ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size);
+
+ if (seg->s_szc == szc || segvn_lpg_disable != 0) {
+ return (0);
+ }
+
+ /*
+ * addr should always be pgsz aligned but eaddr may be misaligned if
+ * it's at the end of the segment.
+ *
+ * XXX we should assert this condition since as_setpagesize() logic
+ * guarantees it.
+ */
+ if (!IS_P2ALIGNED(addr, pgsz) ||
+ (!IS_P2ALIGNED(eaddr, pgsz) &&
+ eaddr != seg->s_base + seg->s_size)) {
+
+ segvn_setpgsz_align_err++;
+ return (EINVAL);
+ }
+
+ if ((svd->vp == NULL && svd->type == MAP_SHARED) ||
+ (svd->flags & MAP_NORESERVE) || seg->s_as == &kas ||
+ szc > segvn_maxpgszc) {
+ return (EINVAL);
+ }
+
+ /* paranoid check */
+ if (svd->vp != NULL &&
+ (IS_SWAPFSVP(svd->vp) || svd->vp == &kvp)) {
+ return (EINVAL);
+ }
+
+ if (seg->s_szc == 0 && svd->vp != NULL &&
+ map_addr_vacalign_check(addr, off)) {
+ return (EINVAL);
+ }
+
+ /*
+ * Check that protections are the same within new page
+ * size boundaries.
+ */
+ if (svd->pageprot) {
+ for (a = addr; a < eaddr; a += pgsz) {
+ if ((a + pgsz) > eaddr) {
+ if (!sameprot(seg, a, eaddr - a)) {
+ return (EINVAL);
+ }
+ } else {
+ if (!sameprot(seg, a, pgsz)) {
+ return (EINVAL);
+ }
+ }
+ }
+ }
+
+ /*
+ * Since we are changing page size we first have to flush
+ * the cache. This makes sure all the pagelock calls have
+ * to recheck protections.
+ */
+ if (svd->softlockcnt > 0) {
+ /*
+ * Since we do have the segvn writers lock nobody can fill
+ * the cache with entries belonging to this seg during
+ * the purge. The flush either succeeds or we still have
+ * pending I/Os.
+ */
+ segvn_purge(seg);
+ if (svd->softlockcnt > 0) {
+ return (EAGAIN);
+ }
+ }
+
+ /*
+ * Operation for sub range of existing segment.
+ */
+ if (addr != seg->s_base || eaddr != (seg->s_base + seg->s_size)) {
+ if (szc < seg->s_szc) {
+ VM_STAT_ADD(segvnvmstats.demoterange[2]);
+ err = segvn_demote_range(seg, addr, len, SDR_RANGE);
+ if (err == 0) {
+ return (IE_RETRY);
+ }
+ if (err == ENOMEM) {
+ return (IE_NOMEM);
+ }
+ return (err);
+ }
+ if (addr != seg->s_base) {
+ nseg = segvn_split_seg(seg, addr);
+ if (eaddr != (nseg->s_base + nseg->s_size)) {
+ /* eaddr is szc aligned */
+ (void) segvn_split_seg(nseg, eaddr);
+ }
+ return (IE_RETRY);
+ }
+ if (eaddr != (seg->s_base + seg->s_size)) {
+ /* eaddr is szc aligned */
+ (void) segvn_split_seg(seg, eaddr);
+ }
+ return (IE_RETRY);
+ }
+
+ /*
+ * Break any low level sharing and reset seg->s_szc to 0.
+ */
+ if ((err = segvn_clrszc(seg)) != 0) {
+ if (err == ENOMEM) {
+ err = IE_NOMEM;
+ }
+ return (err);
+ }
+ ASSERT(seg->s_szc == 0);
+
+ /*
+ * If the end of the current segment is not pgsz aligned
+ * then attempt to concatenate with the next segment.
+ */
+ if (!IS_P2ALIGNED(eaddr, pgsz)) {
+ nseg = AS_SEGNEXT(seg->s_as, seg);
+ if (nseg == NULL || nseg == seg || eaddr != nseg->s_base) {
+ return (ENOMEM);
+ }
+ if (nseg->s_ops != &segvn_ops) {
+ return (EINVAL);
+ }
+ nsvd = (struct segvn_data *)nseg->s_data;
+ if (nsvd->softlockcnt > 0) {
+ segvn_purge(nseg);
+ if (nsvd->softlockcnt > 0) {
+ return (EAGAIN);
+ }
+ }
+ err = segvn_clrszc(nseg);
+ if (err == ENOMEM) {
+ err = IE_NOMEM;
+ }
+ if (err != 0) {
+ return (err);
+ }
+ err = segvn_concat(seg, nseg, 1);
+ if (err == -1) {
+ return (EINVAL);
+ }
+ if (err == -2) {
+ return (IE_NOMEM);
+ }
+ return (IE_RETRY);
+ }
+
+ /*
+ * May need to re-align anon array to
+ * new szc.
+ */
+ if (amp != NULL) {
+ pgcnt_t pgcnt = pgsz >> PAGESHIFT;
+ if (!IS_P2ALIGNED(svd->anon_index, pgcnt)) {
+ struct anon_hdr *nahp;
+
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
+ ASSERT(amp->refcnt == 1);
+ nahp = anon_create(btop(amp->size), ANON_NOSLEEP);
+ if (nahp == NULL) {
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ return (IE_NOMEM);
+ }
+ if (anon_copy_ptr(amp->ahp, svd->anon_index,
+ nahp, 0, btop(seg->s_size), ANON_NOSLEEP)) {
+ anon_release(nahp, btop(amp->size));
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ return (IE_NOMEM);
+ }
+ anon_release(amp->ahp, btop(amp->size));
+ amp->ahp = nahp;
+ svd->anon_index = 0;
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ }
+ }
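+ /*
+ * For vnode backed segments verify that the file extends at least
+ * to the end of the segment before allowing the larger page size.
+ */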
+ if (svd->vp != NULL && szc != 0) {
+ struct vattr va;
+ u_offset_t eoffpage = svd->offset;
+ va.va_mask = AT_SIZE;
+ eoffpage += seg->s_size;
+ eoffpage = btopr(eoffpage);
+ if (VOP_GETATTR(svd->vp, &va, 0, svd->cred) != 0) {
+ segvn_setpgsz_getattr_err++;
+ return (EINVAL);
+ }
+ if (btopr(va.va_size) < eoffpage) {
+ segvn_setpgsz_eof_err++;
+ return (EINVAL);
+ }
+ if (amp != NULL) {
+ /*
+ * anon_fill_cow_holes() may call VOP_GETPAGE().
+ * Don't take the anon map lock here to avoid holding it
+ * across VOP_GETPAGE() calls that may call back into
+ * segvn for klustering checks. We don't really need the
+ * anon map lock here since it's a private segment and
+ * we hold the as level lock as writers.
+ */
+ if ((err = anon_fill_cow_holes(seg, seg->s_base,
+ amp->ahp, svd->anon_index, svd->vp, svd->offset,
+ seg->s_size, szc, svd->prot, svd->vpage,
+ svd->cred)) != 0) {
+ return (EINVAL);
+ }
+ }
+ segvn_setvnode_mpss(svd->vp);
+ }
+
+ if (amp != NULL) {
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
+ amp->a_szc = szc;
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ }
+
+ seg->s_szc = szc;
+
+ return (0);
+}
+
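+/*
+ * Break any low level page size sharing for this segment and reset
+ * seg->s_szc to 0: unload existing translations and, for private
+ * anonymous mappings, demote or privately copy the anon pages as needed.
+ */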
+static int
+segvn_clrszc(struct seg *seg)
+{
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+ struct anon_map *amp = svd->amp;
+ size_t pgsz;
+ pgcnt_t pages;
+ int err = 0;
+ caddr_t a = seg->s_base;
+ caddr_t ea = a + seg->s_size;
+ ulong_t an_idx = svd->anon_index;
+ vnode_t *vp = svd->vp;
+ struct vpage *vpage = svd->vpage;
+ page_t *anon_pl[1 + 1], *pp;
+ struct anon *ap, *oldap;
+ uint_t prot = svd->prot, vpprot;
+
+ ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) ||
+ SEGVN_WRITE_HELD(seg->s_as, &svd->lock));
+ ASSERT(svd->type == MAP_PRIVATE ||
+ (vp != NULL && svd->amp == NULL));
+
+ if (vp == NULL && amp == NULL) {
+ seg->s_szc = 0;
+ return (0);
+ }
+
+ /*
+ * do HAT_UNLOAD_UNMAP since we are changing the pagesize.
+ * unload argument is 0 when we are freeing the segment
+ * and unload was already done.
+ */
+ hat_unload(seg->s_as->a_hat, seg->s_base, seg->s_size,
+ HAT_UNLOAD_UNMAP);
+
+ if (amp == NULL) {
+ seg->s_szc = 0;
+ return (0);
+ }
+
+ pgsz = page_get_pagesize(seg->s_szc);
+ pages = btop(pgsz);
+
+ /*
+ * XXX anon rwlock is not really needed because this is a
+ * private segment and we are writers.
+ */
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
+
+ for (; a < ea; a += pgsz, an_idx += pages) {
+ if ((oldap = anon_get_ptr(amp->ahp, an_idx)) != NULL) {
+ if (svd->pageprot != 0) {
+ ASSERT(vpage != NULL);
+ prot = VPP_PROT(vpage);
+ ASSERT(sameprot(seg, a, pgsz));
+ }
+ if (seg->s_szc != 0) {
+ ASSERT(vp == NULL || anon_pages(amp->ahp,
+ an_idx, pages) == pages);
+ if ((err = anon_map_demotepages(amp, an_idx,
+ seg, a, prot, vpage, svd->cred)) != 0) {
+ goto out;
+ }
+ } else {
+ if (oldap->an_refcnt == 1) {
+ continue;
+ }
+ if ((err = anon_getpage(&oldap, &vpprot,
+ anon_pl, PAGESIZE, seg, a, S_READ,
+ svd->cred))) {
+ goto out;
+ }
+ if ((pp = anon_private(&ap, seg, a, prot,
+ anon_pl[0], 0, svd->cred)) == NULL) {
+ err = ENOMEM;
+ goto out;
+ }
+ anon_decref(oldap);
+ (void) anon_set_ptr(amp->ahp, an_idx, ap,
+ ANON_SLEEP);
+ page_unlock(pp);
+ }
+ }
+ vpage = (vpage == NULL) ? NULL : vpage + pages;
+ }
+
+ amp->a_szc = 0;
+ seg->s_szc = 0;
+out:
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ return (err);
+}
+
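+/*
+ * Adjust the lock/cow claims (p_lckcnt/p_cowcnt) on the constituent pages
+ * of one large page when write permission is added to or removed from
+ * locked pages. Returns non-zero on success (or if no adjustment was
+ * needed) and 0 if the claim could not be granted.
+ */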
+static int
+segvn_claim_pages(
+ struct seg *seg,
+ struct vpage *svp,
+ u_offset_t off,
+ ulong_t anon_idx,
+ uint_t prot)
+{
+ pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc);
+ size_t ppasize = (pgcnt + 1) * sizeof (page_t *);
+ page_t **ppa;
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+ struct anon_map *amp = svd->amp;
+ struct vpage *evp = svp + pgcnt;
+ caddr_t addr = ((uintptr_t)(svp - svd->vpage) << PAGESHIFT)
+ + seg->s_base;
+ struct anon *ap;
+ struct vnode *vp = svd->vp;
+ page_t *pp;
+ pgcnt_t pg_idx, i;
+ int err = 0;
+ anoff_t aoff;
+ int anon = (amp != NULL) ? 1 : 0;
+
+ ASSERT(svd->type == MAP_PRIVATE);
+ ASSERT(svd->vpage != NULL);
+ ASSERT(seg->s_szc != 0);
+ ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
+ ASSERT(amp == NULL || IS_P2ALIGNED(anon_idx, pgcnt));
+ ASSERT(sameprot(seg, addr, pgcnt << PAGESHIFT));
+
+ if (VPP_PROT(svp) == prot)
+ return (1);
+ if (!((VPP_PROT(svp) ^ prot) & PROT_WRITE))
+ return (1);
+
+ ppa = kmem_alloc(ppasize, KM_SLEEP);
+ if (anon && vp != NULL) {
+ if (anon_get_ptr(amp->ahp, anon_idx) == NULL) {
+ anon = 0;
+ ASSERT(!anon_pages(amp->ahp, anon_idx, pgcnt));
+ }
+ ASSERT(!anon ||
+ anon_pages(amp->ahp, anon_idx, pgcnt) == pgcnt);
+ }
+
+ for (*ppa = NULL, pg_idx = 0; svp < evp; svp++, anon_idx++) {
+ if (!VPP_ISPPLOCK(svp))
+ continue;
+ if (anon) {
+ ap = anon_get_ptr(amp->ahp, anon_idx);
+ if (ap == NULL) {
+ panic("segvn_claim_pages: no anon slot");
+ }
+ swap_xlate(ap, &vp, &aoff);
+ off = (u_offset_t)aoff;
+ }
+ ASSERT(vp != NULL);
+ if ((pp = page_lookup(vp,
+ (u_offset_t)off, SE_SHARED)) == NULL) {
+ panic("segvn_claim_pages: no page");
+ }
+ ppa[pg_idx++] = pp;
+ off += PAGESIZE;
+ }
+
+ if (ppa[0] == NULL) {
+ kmem_free(ppa, ppasize);
+ return (1);
+ }
+
+ ASSERT(pg_idx <= pgcnt);
+ ppa[pg_idx] = NULL;
+
+ if (prot & PROT_WRITE)
+ err = page_addclaim_pages(ppa);
+ else
+ err = page_subclaim_pages(ppa);
+
+ for (i = 0; i < pg_idx; i++) {
+ ASSERT(ppa[i] != NULL);
+ page_unlock(ppa[i]);
+ }
+
+ kmem_free(ppa, ppasize);
+ return (err);
+}
+
+/*
+ * Returns the right (upper address) segment if a split occurred.
+ * If the address is equal to the beginning or end of its segment it returns
+ * the current segment.
+ */
+static struct seg *
+segvn_split_seg(struct seg *seg, caddr_t addr)
+{
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+ struct seg *nseg;
+ size_t nsize;
+ struct segvn_data *nsvd;
+
+ ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
+ ASSERT(svd->type == MAP_PRIVATE || svd->amp == NULL);
+ ASSERT(addr >= seg->s_base);
+ ASSERT(addr <= seg->s_base + seg->s_size);
+
+ if (addr == seg->s_base || addr == seg->s_base + seg->s_size)
+ return (seg);
+
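+ /*
+ * Shrink the existing segment to [s_base, addr) and allocate a new
+ * segment for [addr, old end); the new segment gets a copy of the
+ * segvn private data.
+ */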
+ nsize = seg->s_base + seg->s_size - addr;
+ seg->s_size = addr - seg->s_base;
+ nseg = seg_alloc(seg->s_as, addr, nsize);
+ ASSERT(nseg != NULL);
+ nseg->s_ops = seg->s_ops;
+ nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
+ nseg->s_data = (void *)nsvd;
+ nseg->s_szc = seg->s_szc;
+ *nsvd = *svd;
+ rw_init(&nsvd->lock, NULL, RW_DEFAULT, NULL);
+
+ if (nsvd->vp != NULL) {
+ VN_HOLD(nsvd->vp);
+ nsvd->offset = svd->offset +
+ (uintptr_t)(nseg->s_base - seg->s_base);
+ if (nsvd->type == MAP_SHARED)
+ lgrp_shm_policy_init(NULL, nsvd->vp);
+ } else {
+ /*
+ * The offset for an anonymous segment has no significance in
+ * terms of an offset into a file. If we were to use the above
+ * calculation instead, the structures read out of
+ * /proc/<pid>/xmap would be more difficult to decipher since
+ * it would be unclear whether two seemingly contiguous
+ * prxmap_t structures represented different segments or a
+ * single segment that had been split up into multiple prxmap_t
+ * structures (e.g. if some part of the segment had not yet
+ * been faulted in).
+ */
+ nsvd->offset = 0;
+ }
+
+ ASSERT(svd->softlockcnt == 0);
+ crhold(svd->cred);
+
+ if (svd->vpage != NULL) {
+ size_t bytes = vpgtob(seg_pages(seg));
+ size_t nbytes = vpgtob(seg_pages(nseg));
+ struct vpage *ovpage = svd->vpage;
+
+ svd->vpage = kmem_alloc(bytes, KM_SLEEP);
+ bcopy(ovpage, svd->vpage, bytes);
+ nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP);
+ bcopy(ovpage + seg_pages(seg), nsvd->vpage, nbytes);
+ kmem_free(ovpage, bytes + nbytes);
+ }
+ if (svd->amp != NULL) {
+ struct anon_map *oamp = svd->amp, *namp;
+ struct anon_hdr *nahp;
+
+ ANON_LOCK_ENTER(&oamp->a_rwlock, RW_WRITER);
+ ASSERT(oamp->refcnt == 1);
+ nahp = anon_create(btop(seg->s_size), ANON_SLEEP);
+ (void) anon_copy_ptr(oamp->ahp, svd->anon_index,
+ nahp, 0, btop(seg->s_size), ANON_SLEEP);
+
+ namp = anonmap_alloc(nseg->s_size, 0);
+ namp->a_szc = nseg->s_szc;
+ (void) anon_copy_ptr(oamp->ahp,
+ svd->anon_index + btop(seg->s_size),
+ namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP);
+ anon_release(oamp->ahp, btop(oamp->size));
+ oamp->ahp = nahp;
+ oamp->size = seg->s_size;
+ svd->anon_index = 0;
+ nsvd->amp = namp;
+ nsvd->anon_index = 0;
+ ANON_LOCK_EXIT(&oamp->a_rwlock);
+ }
+
+ /*
+ * Split amount of swap reserve
+ */
+ if (svd->swresv) {
+ /*
+ * For MAP_NORESERVE, only allocate swap reserve for pages
+ * being used. Other segments get enough to cover whole
+ * segment.
+ */
+ if (svd->flags & MAP_NORESERVE) {
+ size_t oswresv;
+
+ ASSERT(svd->amp);
+ oswresv = svd->swresv;
+ svd->swresv = ptob(anon_pages(svd->amp->ahp,
+ svd->anon_index, btop(seg->s_size)));
+ nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp,
+ nsvd->anon_index, btop(nseg->s_size)));
+ ASSERT(oswresv >= (svd->swresv + nsvd->swresv));
+ } else {
+ ASSERT(svd->swresv == seg->s_size + nseg->s_size);
+ svd->swresv = seg->s_size;
+ nsvd->swresv = nseg->s_size;
+ }
+ }
+
+ return (nseg);
+}
+
+
+/*
+ * Called on memory operations (unmap, setprot, setpagesize) for a subset
+ * of a large page segment to demote either the whole memory range
+ * (SDR_RANGE) or just its ends (SDR_END), as given by addr/len.
+ *
+ * Returns 0 on success and an errno, including ENOMEM, on failure.
+ */
+static int
+segvn_demote_range(struct seg *seg, caddr_t addr, size_t len, int flag)
+{
+ caddr_t eaddr = addr + len;
+ caddr_t lpgaddr, lpgeaddr;
+ struct seg *nseg;
+ struct seg *badseg1 = NULL;
+ struct seg *badseg2 = NULL;
+ size_t pgsz;
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+ int err;
+
+ ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
+ ASSERT(seg->s_szc != 0);
+ pgsz = page_get_pagesize(seg->s_szc);
+ ASSERT(seg->s_base != addr || seg->s_size != len);
+ ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size);
+ ASSERT(svd->softlockcnt == 0);
+ ASSERT(svd->type == MAP_PRIVATE ||
+ (svd->vp != NULL && svd->amp == NULL));
+
+ CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr);
+ ASSERT(flag == SDR_RANGE || eaddr < lpgeaddr || addr > lpgaddr);
+ if (flag == SDR_RANGE) {
+ /* demote entire range */
+ badseg1 = nseg = segvn_split_seg(seg, lpgaddr);
+ (void) segvn_split_seg(nseg, lpgeaddr);
+ ASSERT(badseg1->s_base == lpgaddr);
+ ASSERT(badseg1->s_size == lpgeaddr - lpgaddr);
+ } else if (addr != lpgaddr) {
+ ASSERT(flag == SDR_END);
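+ /*
+ * addr falls in the middle of a large page: demote the large
+ * page containing addr and, if eaddr is also misaligned, the
+ * one containing eaddr as well.
+ */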
+ badseg1 = nseg = segvn_split_seg(seg, lpgaddr);
+ if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz &&
+ eaddr < lpgaddr + 2 * pgsz) {
+ (void) segvn_split_seg(nseg, lpgeaddr);
+ ASSERT(badseg1->s_base == lpgaddr);
+ ASSERT(badseg1->s_size == 2 * pgsz);
+ } else {
+ nseg = segvn_split_seg(nseg, lpgaddr + pgsz);
+ ASSERT(badseg1->s_base == lpgaddr);
+ ASSERT(badseg1->s_size == pgsz);
+ if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz) {
+ ASSERT(lpgeaddr - lpgaddr > 2 * pgsz);
+ nseg = segvn_split_seg(nseg, lpgeaddr - pgsz);
+ badseg2 = nseg;
+ (void) segvn_split_seg(nseg, lpgeaddr);
+ ASSERT(badseg2->s_base == lpgeaddr - pgsz);
+ ASSERT(badseg2->s_size == pgsz);
+ }
+ }
+ } else {
+ ASSERT(flag == SDR_END);
+ ASSERT(eaddr < lpgeaddr);
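+ /*
+ * Only eaddr is misaligned: demote just the large page
+ * containing eaddr.
+ */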
+ badseg1 = nseg = segvn_split_seg(seg, lpgeaddr - pgsz);
+ (void) segvn_split_seg(nseg, lpgeaddr);
+ ASSERT(badseg1->s_base == lpgeaddr - pgsz);
+ ASSERT(badseg1->s_size == pgsz);
+ }
+
+ ASSERT(badseg1 != NULL);
+ ASSERT(badseg1->s_szc != 0);
+ ASSERT(page_get_pagesize(badseg1->s_szc) == pgsz);
+ ASSERT(flag == SDR_RANGE || badseg1->s_size == pgsz ||
+ badseg1->s_size == 2 * pgsz);
+ if (err = segvn_clrszc(badseg1)) {
+ return (err);
+ }
+ ASSERT(badseg1->s_szc == 0);
+
+ if (badseg2 == NULL)
+ return (0);
+ ASSERT(badseg2->s_szc != 0);
+ ASSERT(page_get_pagesize(badseg2->s_szc) == pgsz);
+ ASSERT(badseg2->s_size == pgsz);
+ ASSERT(sameprot(badseg2, badseg2->s_base, badseg2->s_size));
+ if (err = segvn_clrszc(badseg2)) {
+ return (err);
+ }
+ ASSERT(badseg2->s_szc == 0);
+ return (0);
+}
+
+static int
+segvn_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
+{
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+ struct vpage *vp, *evp;
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
+ /*
+ * If segment protection can be used, simply check against them.
+ */
+ if (svd->pageprot == 0) {
+ int err;
+
+ err = ((svd->prot & prot) != prot) ? EACCES : 0;
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (err);
+ }
+
+ /*
+ * Have to check down to the vpage level.
+ */
+ evp = &svd->vpage[seg_page(seg, addr + len)];
+ for (vp = &svd->vpage[seg_page(seg, addr)]; vp < evp; vp++) {
+ if ((VPP_PROT(vp) & prot) != prot) {
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (EACCES);
+ }
+ }
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (0);
+}
+
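+/*
+ * Return the protections of the pages in the range [addr, addr + len)
+ * in the protv array, one entry per page.
+ */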
+static int
+segvn_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
+{
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+ size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1;
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ if (pgno != 0) {
+ SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
+ if (svd->pageprot == 0) {
+ do
+ protv[--pgno] = svd->prot;
+ while (pgno != 0);
+ } else {
+ size_t pgoff = seg_page(seg, addr);
+
+ do {
+ pgno--;
+ protv[pgno] = VPP_PROT(&svd->vpage[pgno+pgoff]);
+ } while (pgno != 0);
+ }
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ }
+ return (0);
+}
+
+static u_offset_t
+segvn_getoffset(struct seg *seg, caddr_t addr)
+{
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ return (svd->offset + (uintptr_t)(addr - seg->s_base));
+}
+
+/*ARGSUSED*/
+static int
+segvn_gettype(struct seg *seg, caddr_t addr)
+{
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ return (svd->type | (svd->flags & MAP_NORESERVE));
+}
+
+/*ARGSUSED*/
+static int
+segvn_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
+{
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ *vpp = svd->vp;
+ return (0);
+}
+
+/*
+ * Check to see if it makes sense to do kluster/read ahead to
+ * addr + delta relative to the mapping at addr. We assume here
+ * that delta is a signed PAGESIZE'd multiple (which can be negative).
+ *
+ * For segvn, we currently "approve" of the action if we are
+ * still in the segment and it maps from the same vp/off,
+ * or if the advice stored in segvn_data or vpages allows it.
+ * Currently, klustering is disallowed only if MADV_RANDOM is set.
+ */
+static int
+segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta)
+{
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+ struct anon *oap, *ap;
+ ssize_t pd;
+ size_t page;
+ struct vnode *vp1, *vp2;
+ u_offset_t off1, off2;
+ struct anon_map *amp;
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+ ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) ||
+ SEGVN_LOCK_HELD(seg->s_as, &svd->lock));
+
+ if (addr + delta < seg->s_base ||
+ addr + delta >= (seg->s_base + seg->s_size))
+ return (-1); /* exceeded segment bounds */
+
+ pd = delta / (ssize_t)PAGESIZE; /* divide to preserve sign bit */
+ page = seg_page(seg, addr);
+
+ /*
+ * Check to see if either of the pages addr or addr + delta
+ * have advice set that prevents klustering (if MADV_RANDOM advice
+ * is set for entire segment, or MADV_SEQUENTIAL is set and delta
+ * is negative).
+ */
+ if (svd->advice == MADV_RANDOM ||
+ svd->advice == MADV_SEQUENTIAL && delta < 0)
+ return (-1);
+ else if (svd->pageadvice && svd->vpage) {
+ struct vpage *bvpp, *evpp;
+
+ bvpp = &svd->vpage[page];
+ evpp = &svd->vpage[page + pd];
+ if (VPP_ADVICE(bvpp) == MADV_RANDOM ||
+ VPP_ADVICE(evpp) == MADV_SEQUENTIAL && delta < 0)
+ return (-1);
+ if (VPP_ADVICE(bvpp) != VPP_ADVICE(evpp) &&
+ VPP_ADVICE(evpp) == MADV_RANDOM)
+ return (-1);
+ }
+
+ if (svd->type == MAP_SHARED)
+ return (0); /* shared mapping - all ok */
+
+ if ((amp = svd->amp) == NULL)
+ return (0); /* off original vnode */
+
+ page += svd->anon_index;
+
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
+
+ oap = anon_get_ptr(amp->ahp, page);
+ ap = anon_get_ptr(amp->ahp, page + pd);
+
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+
+ if ((oap == NULL && ap != NULL) || (oap != NULL && ap == NULL)) {
+ return (-1); /* one with and one without an anon */
+ }
+
+ if (oap == NULL) { /* implies that ap == NULL */
+ return (0); /* off original vnode */
+ }
+
+ /*
+ * Now we know we have two anon pointers - check to
+ * see if they happen to be properly allocated.
+ */
+
+ /*
+ * XXX We cheat here and don't lock the anon slots. We can't because
+ * we may have been called from the anon layer which might already
+ * have locked them. We are holding a refcnt on the slots so they
+ * can't disappear. The worst that will happen is we'll get the wrong
+ * names (vp, off) for the slots and make a poor klustering decision.
+ */
+ swap_xlate(ap, &vp1, &off1);
+ swap_xlate(oap, &vp2, &off2);
+
+
+ if (!VOP_CMP(vp1, vp2) || off1 - off2 != delta)
+ return (-1);
+ return (0);
+}
+
+/*
+ * Swap the pages of seg out to secondary storage, returning the
+ * number of bytes of storage freed.
+ *
+ * The basic idea is first to unload all translations and then to call
+ * VOP_PUTPAGE() for all newly-unmapped pages, to push them out to the
+ * swap device. Pages to which other segments have mappings will remain
+ * mapped and won't be swapped. Our caller (as_swapout) has already
+ * performed the unloading step.
+ *
+ * The value returned is intended to correlate well with the process's
+ * memory requirements. However, there are some caveats:
+ * 1) When given a shared segment as argument, this routine will
+ * only succeed in swapping out pages for the last sharer of the
+ * segment. (Previous callers will only have decremented mapping
+ * reference counts.)
+ * 2) We assume that the hat layer maintains a large enough translation
+ * cache to capture process reference patterns.
+ */
+static size_t
+segvn_swapout(struct seg *seg)
+{
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+ struct anon_map *amp;
+ pgcnt_t pgcnt = 0;
+ pgcnt_t npages;
+ pgcnt_t page;
+ ulong_t anon_index;
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
+ /*
+ * Find pages unmapped by our caller and force them
+ * out to the virtual swap device.
+ */
+ if ((amp = svd->amp) != NULL)
+ anon_index = svd->anon_index;
+ npages = seg->s_size >> PAGESHIFT;
+ for (page = 0; page < npages; page++) {
+ page_t *pp;
+ struct anon *ap;
+ struct vnode *vp;
+ u_offset_t off;
+ anon_sync_obj_t cookie;
+
+ /*
+ * Obtain <vp, off> pair for the page, then look it up.
+ *
+ * Note that this code is willing to consider regular
+ * pages as well as anon pages. Is this appropriate here?
+ */
+ ap = NULL;
+ if (amp != NULL) {
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
+ anon_array_enter(amp, anon_index + page, &cookie);
+ ap = anon_get_ptr(amp->ahp, anon_index + page);
+ if (ap != NULL) {
+ swap_xlate(ap, &vp, &off);
+ } else {
+ vp = svd->vp;
+ off = svd->offset + ptob(page);
+ }
+ anon_array_exit(&cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ } else {
+ vp = svd->vp;
+ off = svd->offset + ptob(page);
+ }
+ if (vp == NULL) { /* untouched zfod page */
+ ASSERT(ap == NULL);
+ continue;
+ }
+
+ pp = page_lookup_nowait(vp, off, SE_SHARED);
+ if (pp == NULL)
+ continue;
+
+
+ /*
+ * Examine the page to see whether it can be tossed out,
+ * keeping track of how many we've found.
+ */
+ if (!page_tryupgrade(pp)) {
+ /*
+ * If the page has an i/o lock and no mappings,
+ * it's very likely that the page is being
+ * written out as a result of klustering.
+ * Assume this is so and take credit for it here.
+ */
+ if (!page_io_trylock(pp)) {
+ if (!hat_page_is_mapped(pp))
+ pgcnt++;
+ } else {
+ page_io_unlock(pp);
+ }
+ page_unlock(pp);
+ continue;
+ }
+ ASSERT(!page_iolock_assert(pp));
+
+
+ /*
+ * Skip if page is locked or has mappings.
+ * We don't need the page_struct_lock to look at lckcnt
+ * and cowcnt because the page is exclusive locked.
+ */
+ if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
+ hat_page_is_mapped(pp)) {
+ page_unlock(pp);
+ continue;
+ }
+
+ /*
+ * dispose skips large pages so try to demote first.
+ */
+ if (pp->p_szc != 0 && !page_try_demote_pages(pp)) {
+ page_unlock(pp);
+ /*
+ * XXX should skip the remaining page_t's of this
+ * large page.
+ */
+ continue;
+ }
+
+ ASSERT(pp->p_szc == 0);
+
+ /*
+ * No longer mapped -- we can toss it out. How
+ * we do so depends on whether or not it's dirty.
+ */
+ if (hat_ismod(pp) && pp->p_vnode) {
+ /*
+ * We must clean the page before it can be
+ * freed. Setting B_FREE will cause pvn_done
+ * to free the page when the i/o completes.
+ * XXX: This also causes it to be accounted
+ * as a pageout instead of a swap: need
+ * B_SWAPOUT bit to use instead of B_FREE.
+ *
+ * Hold the vnode before releasing the page lock
+ * to prevent it from being freed and re-used by
+ * some other thread.
+ */
+ VN_HOLD(vp);
+ page_unlock(pp);
+
+ /*
+ * Queue all i/o requests for the pageout thread
+ * to avoid saturating the pageout devices.
+ */
+ if (!queue_io_request(vp, off))
+ VN_RELE(vp);
+ } else {
+ /*
+ * The page was clean, free it.
+ *
+ * XXX: Can we ever encounter modified pages
+ * with no associated vnode here?
+ */
+ ASSERT(pp->p_vnode != NULL);
+ /*LINTED: constant in conditional context*/
+ VN_DISPOSE(pp, B_FREE, 0, kcred);
+ }
+
+ /*
+ * Credit now even if i/o is in progress.
+ */
+ pgcnt++;
+ }
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+
+ /*
+ * Wakeup pageout to initiate i/o on all queued requests.
+ */
+ cv_signal_pageout();
+ return (ptob(pgcnt));
+}
+
+/*
+ * Synchronize primary storage cache with real object in virtual memory.
+ *
+ * XXX - Anonymous pages should not be sync'ed out at all.
+ */
+static int
+segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
+{
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+ struct vpage *vpp;
+ page_t *pp;
+ u_offset_t offset;
+ struct vnode *vp;
+ u_offset_t off;
+ caddr_t eaddr;
+ int bflags;
+ int err = 0;
+ int segtype;
+ int pageprot;
+ int prot;
+ ulong_t anon_index;
+ struct anon_map *amp;
+ struct anon *ap;
+ anon_sync_obj_t cookie;
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
+
+ if (svd->softlockcnt > 0) {
+ /*
+ * flush all pages from seg cache
+ * otherwise we may deadlock in swap_putpage
+ * for B_INVAL page (4175402).
+ *
+ * Even if we grab segvn WRITER's lock or segp_slock
+ * here, there might be another thread which could've
+ * successfully performed lookup/insert just before
+ * we acquired the lock here. So, grabbing either
+ * lock here is of not much use. Until we devise
+ * a strategy at upper layers to solve the
+ * synchronization issues completely, we expect
+ * applications to handle this appropriately.
+ */
+ segvn_purge(seg);
+ if (svd->softlockcnt > 0) {
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (EAGAIN);
+ }
+ }
+
+ vpp = svd->vpage;
+ offset = svd->offset + (uintptr_t)(addr - seg->s_base);
+ bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) |
+ ((flags & MS_INVALIDATE) ? B_INVAL : 0);
+
+ if (attr) {
+ pageprot = attr & ~(SHARED|PRIVATE);
+ segtype = (attr & SHARED) ? MAP_SHARED : MAP_PRIVATE;
+
+ /*
+ * We are done if the segment types don't match
+ * or if we have segment level protections and
+ * they don't match.
+ */
+ if (svd->type != segtype) {
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (0);
+ }
+ if (vpp == NULL) {
+ if (svd->prot != pageprot) {
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (0);
+ }
+ prot = svd->prot;
+ } else
+ vpp = &svd->vpage[seg_page(seg, addr)];
+
+ } else if (svd->vp && svd->amp == NULL &&
+ (flags & MS_INVALIDATE) == 0) {
+
+ /*
+ * No attributes, no anonymous pages and MS_INVALIDATE flag
+ * is not on, just use one big request.
+ */
+ err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len,
+ bflags, svd->cred);
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (err);
+ }
+
+ if ((amp = svd->amp) != NULL)
+ anon_index = svd->anon_index + seg_page(seg, addr);
+
+ for (eaddr = addr + len; addr < eaddr; addr += PAGESIZE) {
+ ap = NULL;
+ if (amp != NULL) {
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
+ anon_array_enter(amp, anon_index, &cookie);
+ ap = anon_get_ptr(amp->ahp, anon_index++);
+ if (ap != NULL) {
+ swap_xlate(ap, &vp, &off);
+ } else {
+ vp = svd->vp;
+ off = offset;
+ }
+ anon_array_exit(&cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ } else {
+ vp = svd->vp;
+ off = offset;
+ }
+ offset += PAGESIZE;
+
+ if (vp == NULL) /* untouched zfod page */
+ continue;
+
+ if (attr) {
+ if (vpp) {
+ prot = VPP_PROT(vpp);
+ vpp++;
+ }
+ if (prot != pageprot) {
+ continue;
+ }
+ }
+
+ /*
+ * See if any of these pages are locked -- if so, then we
+ * will have to truncate an invalidate request at the first
+ * locked one. We don't need the page_struct_lock to test
+ * as this is only advisory; even if we acquire it someone
+ * might race in and lock the page after we unlock and before
+ * we do the PUTPAGE, then PUTPAGE simply does nothing.
+ */
+ if (flags & MS_INVALIDATE) {
+ if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) {
+ if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
+ page_unlock(pp);
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (EBUSY);
+ }
+ if (ap != NULL && pp->p_szc != 0 &&
+ page_tryupgrade(pp)) {
+ if (pp->p_lckcnt == 0 &&
+ pp->p_cowcnt == 0) {
+ /*
+ * swapfs VN_DISPOSE() won't
+ * invalidate large pages.
+ * Attempt to demote.
+ * XXX can't help it if it
+ * fails. But for swapfs
+ * pages it is no big deal.
+ */
+ (void) page_try_demote_pages(
+ pp);
+ }
+ }
+ page_unlock(pp);
+ }
+ } else if (svd->type == MAP_SHARED && amp != NULL) {
+ /*
+			 * Avoid writing ISM's large pages out to disk
+ * because segspt_free_pages() relies on NULL an_pvp
+ * of anon slots of such pages.
+ */
+
+ ASSERT(svd->vp == NULL);
+ /*
+ * swapfs uses page_lookup_nowait if not freeing or
+ * invalidating and skips a page if
+ * page_lookup_nowait returns NULL.
+ */
+ pp = page_lookup_nowait(vp, off, SE_SHARED);
+ if (pp == NULL) {
+ continue;
+ }
+ if (pp->p_szc != 0) {
+ page_unlock(pp);
+ continue;
+ }
+
+ /*
+ * Note ISM pages are created large so (vp, off)'s
+ * page cannot suddenly become large after we unlock
+ * pp.
+ */
+ page_unlock(pp);
+ }
+ /*
+ * XXX - Should ultimately try to kluster
+ * calls to VOP_PUTPAGE() for performance.
+ */
+ VN_HOLD(vp);
+ err = VOP_PUTPAGE(vp, (offset_t)off, PAGESIZE,
+ bflags, svd->cred);
+ VN_RELE(vp);
+ if (err)
+ break;
+ }
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (err);
+}
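At the system-call level this routine is reached through msync(3C) and memcntl(2) MC_SYNC: MS_ASYNC becomes B_ASYNC, MS_INVALIDATE becomes B_INVAL, and a page held by p_lckcnt/p_cowcnt turns an invalidate request into EBUSY. A minimal user-level sketch of those paths (the temporary file name is arbitrary, and mlock() may require the proc_lock_memory privilege):

#include <sys/types.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

int
main(void)
{
	size_t len = 8192;
	int fd = open("/tmp/msync_demo", O_RDWR | O_CREAT, 0644);
	char *p;

	if (fd == -1 || ftruncate(fd, (off_t)len) == -1)
		exit(1);
	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		exit(1);

	p[0] = 'x';				/* dirty the first page */

	/* MS_ASYNC: segvn_sync() issues VOP_PUTPAGE() with B_ASYNC set. */
	if (msync(p, len, MS_ASYNC) == -1)
		perror("msync MS_ASYNC");

	/* A locked page cannot be invalidated; expect EBUSY. */
	if (mlock(p, len) == 0) {
		if (msync(p, len, MS_INVALIDATE) == -1 && errno == EBUSY)
			(void) printf("MS_INVALIDATE on locked page: EBUSY\n");
		(void) munlock(p, len);
	}

	(void) munmap(p, len);
	(void) close(fd);
	return (0);
}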
+
+/*
+ * Determine if we have data corresponding to pages in the
+ * primary storage virtual memory cache (i.e., "in core").
+ */
+static size_t
+segvn_incore(struct seg *seg, caddr_t addr, size_t len, char *vec)
+{
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+ struct vnode *vp, *avp;
+ u_offset_t offset, aoffset;
+ size_t p, ep;
+ int ret;
+ struct vpage *vpp;
+ page_t *pp;
+ uint_t start;
+ struct anon_map *amp; /* XXX - for locknest */
+ struct anon *ap;
+ uint_t attr;
+ anon_sync_obj_t cookie;
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
+ if (svd->amp == NULL && svd->vp == NULL) {
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ bzero(vec, btopr(len));
+ return (len); /* no anonymous pages created yet */
+ }
+
+ p = seg_page(seg, addr);
+ ep = seg_page(seg, addr + len);
+ start = svd->vp ? SEG_PAGE_VNODEBACKED : 0;
+
+ amp = svd->amp;
+ for (; p < ep; p++, addr += PAGESIZE) {
+ vpp = (svd->vpage) ? &svd->vpage[p]: NULL;
+ ret = start;
+ ap = NULL;
+ avp = NULL;
+ /* Grab the vnode/offset for the anon slot */
+ if (amp != NULL) {
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
+ anon_array_enter(amp, svd->anon_index + p, &cookie);
+ ap = anon_get_ptr(amp->ahp, svd->anon_index + p);
+ if (ap != NULL) {
+ swap_xlate(ap, &avp, &aoffset);
+ }
+ anon_array_exit(&cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ }
+ if ((avp != NULL) && page_exists(avp, aoffset)) {
+ /* A page exists for the anon slot */
+ ret |= SEG_PAGE_INCORE;
+
+ /*
+ * If page is mapped and writable
+ */
+ attr = (uint_t)0;
+ if ((hat_getattr(seg->s_as->a_hat, addr,
+ &attr) != -1) && (attr & PROT_WRITE)) {
+ ret |= SEG_PAGE_ANON;
+ }
+ /*
+ * Don't get page_struct lock for lckcnt and cowcnt,
+ * since this is purely advisory.
+ */
+ if ((pp = page_lookup_nowait(avp, aoffset,
+ SE_SHARED)) != NULL) {
+ if (pp->p_lckcnt)
+ ret |= SEG_PAGE_SOFTLOCK;
+ if (pp->p_cowcnt)
+ ret |= SEG_PAGE_HASCOW;
+ page_unlock(pp);
+ }
+ }
+
+ /* Gather vnode statistics */
+ vp = svd->vp;
+ offset = svd->offset + (uintptr_t)(addr - seg->s_base);
+
+ if (vp != NULL) {
+ /*
+ * Try to obtain a "shared" lock on the page
+ * without blocking. If this fails, determine
+ * if the page is in memory.
+ */
+ pp = page_lookup_nowait(vp, offset, SE_SHARED);
+ if ((pp == NULL) && (page_exists(vp, offset))) {
+ /* Page is incore, and is named */
+ ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE);
+ }
+ /*
+ * Don't get page_struct lock for lckcnt and cowcnt,
+ * since this is purely advisory.
+ */
+ if (pp != NULL) {
+ ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE);
+ if (pp->p_lckcnt)
+ ret |= SEG_PAGE_SOFTLOCK;
+ if (pp->p_cowcnt)
+ ret |= SEG_PAGE_HASCOW;
+ page_unlock(pp);
+ }
+ }
+
+ /* Gather virtual page information */
+ if (vpp) {
+ if (VPP_ISPPLOCK(vpp))
+ ret |= SEG_PAGE_LOCKED;
+ vpp++;
+ }
+
+ *vec++ = (char)ret;
+ }
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (len);
+}
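segvn_incore() is the segment-driver half of mincore(2). Only the low-order "page is in core" indication reaches the caller; the SEG_PAGE_* detail bits (vnode-backed, softlocked, has-COW) stay inside the kernel. A small sketch over an anonymous mapping, assuming MAP_ANON is available:

#include <sys/types.h>
#include <sys/mman.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	size_t pgsz = (size_t)sysconf(_SC_PAGESIZE);
	size_t len = 4 * pgsz;
	char vec[4];
	int i;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANON, -1, 0);

	if (p == MAP_FAILED)
		exit(1);

	p[0] = 1;			/* fault in only the first page */

	if (mincore(p, len, vec) == 0) {
		for (i = 0; i < 4; i++)
			(void) printf("page %d: %s\n", i,
			    (vec[i] & 1) ? "in core" : "not resident");
	}
	(void) munmap(p, len);
	return (0);
}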
+
+/*
+ * Statement for p_cowcnts/p_lckcnts.
+ *
+ * p_cowcnt is updated while mlock/munlocking MAP_PRIVATE and PROT_WRITE region
+ * irrespective of the following factors or anything else:
+ *
+ * (1) anon slots are populated or not
+ * (2) cow is broken or not
+ * (3) refcnt on ap is 1 or greater than 1
+ *
+ * If it's not MAP_PRIVATE and PROT_WRITE, p_lckcnt is updated during mlock
+ * and munlock.
+ *
+ *
+ * Handling p_cowcnts/p_lckcnts during copy-on-write fault:
+ *
+ * if vpage has PROT_WRITE
+ * transfer cowcnt on the oldpage -> cowcnt on the newpage
+ * else
+ * transfer lckcnt on the oldpage -> lckcnt on the newpage
+ *
+ * During copy-on-write, decrement p_cowcnt on the oldpage and increment
+ * p_cowcnt on the newpage *if* the corresponding vpage has PROT_WRITE.
+ *
+ * We may also break COW if softlocking on read access in the physio case.
+ * In this case, vpage may not have PROT_WRITE. So, we need to decrement
+ * p_lckcnt on the oldpage and increment p_lckcnt on the newpage *if* the
+ * vpage doesn't have PROT_WRITE.
+ *
+ *
+ * Handling p_cowcnts/p_lckcnts during mprotect on mlocked region:
+ *
+ * If a MAP_PRIVATE region loses PROT_WRITE, we decrement p_cowcnt and
+ * increment p_lckcnt by calling page_subclaim() which takes care of
+ * availrmem accounting and p_lckcnt overflow.
+ *
+ * If a MAP_PRIVATE region gains PROT_WRITE, we decrement p_lckcnt and
+ * increment p_cowcnt by calling page_addclaim() which takes care of
+ * availrmem availability and p_cowcnt overflow.
+ */
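The mprotect transition in that statement can be set up from user level: mlock a private, writable mapping and then drop or regain PROT_WRITE. The per-page claim moves between p_cowcnt and p_lckcnt through page_subclaim()/page_addclaim() inside the kernel and is not directly observable; the sketch below only drives the scenario (mlock() may require privilege):

#include <sys/types.h>
#include <sys/mman.h>
#include <unistd.h>
#include <stdlib.h>

int
main(void)
{
	size_t len = (size_t)sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANON, -1, 0);

	if (p == MAP_FAILED)
		exit(1);
	p[0] = 1;				/* populate the anon page */

	if (mlock(p, len) != 0)			/* page_pp_lock(): p_cowcnt++ */
		exit(1);

	/* Losing PROT_WRITE: the claim shifts p_cowcnt -> p_lckcnt. */
	(void) mprotect(p, len, PROT_READ);

	/* Regaining PROT_WRITE: the claim shifts back p_lckcnt -> p_cowcnt. */
	(void) mprotect(p, len, PROT_READ | PROT_WRITE);

	(void) munlock(p, len);
	(void) munmap(p, len);
	return (0);
}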
+
+/*
+ * Lock down (or unlock) pages mapped by this segment.
+ *
+ * XXX only creates PAGESIZE pages if anon slots are not initialized.
+ * At fault time they will be relocated into larger pages.
+ */
+static int
+segvn_lockop(struct seg *seg, caddr_t addr, size_t len,
+ int attr, int op, ulong_t *lockmap, size_t pos)
+{
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+ struct vpage *vpp;
+ struct vpage *evp;
+ page_t *pp;
+ u_offset_t offset;
+ u_offset_t off;
+ int segtype;
+ int pageprot;
+ int claim;
+ struct vnode *vp;
+ ulong_t anon_index;
+ struct anon_map *amp;
+ struct anon *ap;
+ struct vattr va;
+ anon_sync_obj_t cookie;
+
+ /*
+ * Hold write lock on address space because may split or concatenate
+ * segments
+ */
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
+ if (attr) {
+ pageprot = attr & ~(SHARED|PRIVATE);
+ segtype = attr & SHARED ? MAP_SHARED : MAP_PRIVATE;
+
+ /*
+ * We are done if the segment types don't match
+ * or if we have segment level protections and
+ * they don't match.
+ */
+ if (svd->type != segtype) {
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (0);
+ }
+ if (svd->pageprot == 0 && svd->prot != pageprot) {
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (0);
+ }
+ }
+
+ /*
+ * If we're locking, then we must create a vpage structure if
+ * none exists. If we're unlocking, then check to see if there
+ * is a vpage -- if not, then we could not have locked anything.
+ */
+
+ if ((vpp = svd->vpage) == NULL) {
+ if (op == MC_LOCK)
+ segvn_vpage(seg);
+ else {
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (0);
+ }
+ }
+
+ /*
+ * The anonymous data vector (i.e., previously
+ * unreferenced mapping to swap space) can be allocated
+ * by lazily testing for its existence.
+ */
+ if (op == MC_LOCK && svd->amp == NULL && svd->vp == NULL) {
+ svd->amp = anonmap_alloc(seg->s_size, 0);
+ svd->amp->a_szc = seg->s_szc;
+ }
+
+ if ((amp = svd->amp) != NULL) {
+ anon_index = svd->anon_index + seg_page(seg, addr);
+ }
+
+ offset = svd->offset + (uintptr_t)(addr - seg->s_base);
+ evp = &svd->vpage[seg_page(seg, addr + len)];
+
+ /*
+ * Loop over all pages in the range. Process if we're locking and
+ * page has not already been locked in this mapping; or if we're
+ * unlocking and the page has been locked.
+ */
+ for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp;
+ vpp++, pos++, addr += PAGESIZE, offset += PAGESIZE, anon_index++) {
+ if ((attr == 0 || VPP_PROT(vpp) == pageprot) &&
+ ((op == MC_LOCK && !VPP_ISPPLOCK(vpp)) ||
+ (op == MC_UNLOCK && VPP_ISPPLOCK(vpp)))) {
+
+ if (amp != NULL)
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
+ /*
+ * If this isn't a MAP_NORESERVE segment and
+ * we're locking, allocate anon slots if they
+ * don't exist. The page is brought in later on.
+ */
+ if (op == MC_LOCK && svd->vp == NULL &&
+ ((svd->flags & MAP_NORESERVE) == 0) &&
+ amp != NULL &&
+ ((ap = anon_get_ptr(amp->ahp, anon_index))
+ == NULL)) {
+ anon_array_enter(amp, anon_index, &cookie);
+
+ if ((ap = anon_get_ptr(amp->ahp,
+ anon_index)) == NULL) {
+ pp = anon_zero(seg, addr, &ap,
+ svd->cred);
+ if (pp == NULL) {
+ anon_array_exit(&cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ SEGVN_LOCK_EXIT(seg->s_as,
+ &svd->lock);
+ return (ENOMEM);
+ }
+ ASSERT(anon_get_ptr(amp->ahp,
+ anon_index) == NULL);
+ (void) anon_set_ptr(amp->ahp,
+ anon_index, ap, ANON_SLEEP);
+ page_unlock(pp);
+ }
+ anon_array_exit(&cookie);
+ }
+
+ /*
+ * Get name for page, accounting for
+ * existence of private copy.
+ */
+ ap = NULL;
+ if (amp != NULL) {
+ anon_array_enter(amp, anon_index, &cookie);
+ ap = anon_get_ptr(amp->ahp, anon_index);
+ if (ap != NULL) {
+ swap_xlate(ap, &vp, &off);
+ } else {
+ if (svd->vp == NULL &&
+ (svd->flags & MAP_NORESERVE)) {
+ anon_array_exit(&cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ continue;
+ }
+ vp = svd->vp;
+ off = offset;
+ }
+ anon_array_exit(&cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ } else {
+ vp = svd->vp;
+ off = offset;
+ }
+
+ /*
+ * Get page frame. It's ok if the page is
+ * not available when we're unlocking, as this
+ * may simply mean that a page we locked got
+ * truncated out of existence after we locked it.
+ *
+ * Invoke VOP_GETPAGE() to obtain the page struct
+			 * since we may need to read it from disk if it has
+ * been paged out.
+ */
+ if (op != MC_LOCK)
+ pp = page_lookup(vp, off, SE_SHARED);
+ else {
+ page_t *pl[1 + 1];
+ int error;
+
+ ASSERT(vp != NULL);
+
+ error = VOP_GETPAGE(vp, (offset_t)off, PAGESIZE,
+ (uint_t *)NULL, pl, PAGESIZE, seg, addr,
+ S_OTHER, svd->cred);
+
+ /*
+ * If the error is EDEADLK then we must bounce
+ * up and drop all vm subsystem locks and then
+ * retry the operation later
+ * This behavior is a temporary measure because
+ * ufs/sds logging is badly designed and will
+ * deadlock if we don't allow this bounce to
+ * happen. The real solution is to re-design
+ * the logging code to work properly. See bug
+ * 4125102 for details of the problem.
+ */
+ if (error == EDEADLK) {
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (error);
+ }
+ /*
+ * Quit if we fail to fault in the page. Treat
+ * the failure as an error, unless the addr
+ * is mapped beyond the end of a file.
+ */
+ if (error && svd->vp) {
+ va.va_mask = AT_SIZE;
+ if (VOP_GETATTR(svd->vp, &va, 0,
+ svd->cred) != 0) {
+ SEGVN_LOCK_EXIT(seg->s_as,
+ &svd->lock);
+ return (EIO);
+ }
+ if (btopr(va.va_size) >=
+ btopr(off + 1)) {
+ SEGVN_LOCK_EXIT(seg->s_as,
+ &svd->lock);
+ return (EIO);
+ }
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (0);
+ } else if (error) {
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (EIO);
+ }
+ pp = pl[0];
+ ASSERT(pp != NULL);
+ }
+
+ /*
+ * See Statement at the beginning of this routine.
+ *
+ * claim is always set if MAP_PRIVATE and PROT_WRITE
+ * irrespective of following factors:
+ *
+ * (1) anon slots are populated or not
+ * (2) cow is broken or not
+ * (3) refcnt on ap is 1 or greater than 1
+ *
+ * See 4140683 for details
+ */
+ claim = ((VPP_PROT(vpp) & PROT_WRITE) &&
+ (svd->type == MAP_PRIVATE));
+
+ /*
+ * Perform page-level operation appropriate to
+ * operation. If locking, undo the SOFTLOCK
+ * performed to bring the page into memory
+ * after setting the lock. If unlocking,
+ * and no page was found, account for the claim
+ * separately.
+ */
+ if (op == MC_LOCK) {
+ int ret = 1; /* Assume success */
+
+ /*
+ * Make sure another thread didn't lock
+ * the page after we released the segment
+ * lock.
+ */
+ if ((attr == 0 || VPP_PROT(vpp) == pageprot) &&
+ !VPP_ISPPLOCK(vpp)) {
+ ret = page_pp_lock(pp, claim, 0);
+ if (ret != 0) {
+ VPP_SETPPLOCK(vpp);
+ if (lockmap != (ulong_t *)NULL)
+ BT_SET(lockmap, pos);
+ }
+ }
+ page_unlock(pp);
+ if (ret == 0) {
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (EAGAIN);
+ }
+ } else {
+ if (pp != NULL) {
+ if ((attr == 0 ||
+ VPP_PROT(vpp) == pageprot) &&
+ VPP_ISPPLOCK(vpp))
+ page_pp_unlock(pp, claim, 0);
+ page_unlock(pp);
+ }
+ VPP_CLRPPLOCK(vpp);
+ }
+ }
+ }
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (0);
+}
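segvn_lockop() also honors memcntl(2)'s attr argument: pages are locked or unlocked only when their per-page protections and the segment's share type match the selection, which is the attr/pageprot test at the top of the loop above. A hedged user-level sketch of that selective form (the attr encoding follows memcntl(2); locking memory may require privilege):

#include <sys/types.h>
#include <sys/mman.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	size_t len = 4 * (size_t)sysconf(_SC_PAGESIZE);
	caddr_t p = mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANON, -1, 0);

	if (p == MAP_FAILED)
		exit(1);

	/* Lock only private pages currently mapped PROT_READ|PROT_WRITE. */
	if (memcntl(p, len, MC_LOCK, (caddr_t)0,
	    PRIVATE | PROT_READ | PROT_WRITE, 0) == -1)
		perror("memcntl MC_LOCK");

	if (memcntl(p, len, MC_UNLOCK, (caddr_t)0,
	    PRIVATE | PROT_READ | PROT_WRITE, 0) == -1)
		perror("memcntl MC_UNLOCK");

	(void) munmap(p, len);
	return (0);
}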
+
+/*
+ * Set advice from user for specified pages
+ * There are 5 types of advice:
+ * MADV_NORMAL - Normal (default) behavior (whatever that is)
+ * MADV_RANDOM - Random page references
+ * do not allow readahead or 'klustering'
+ * MADV_SEQUENTIAL - Sequential page references
+ * Pages previous to the one currently being
+ * accessed (determined by fault) are 'not needed'
+ * and are freed immediately
+ * MADV_WILLNEED - Pages are likely to be used (fault ahead in mctl)
+ * MADV_DONTNEED - Pages are not needed (synced out in mctl)
+ * MADV_FREE - Contents can be discarded
+ * MADV_ACCESS_DEFAULT- Default access
+ * MADV_ACCESS_LWP - Next LWP will access heavily
+ * MADV_ACCESS_MANY- Many LWPs or processes will access heavily
+ */
+static int
+segvn_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
+{
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+ size_t page;
+ int err = 0;
+ int already_set;
+ struct anon_map *amp;
+ ulong_t anon_index;
+ struct seg *next;
+ lgrp_mem_policy_t policy;
+ struct seg *prev;
+ struct vnode *vp;
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ /*
+ * In case of MADV_FREE, we won't be modifying any segment private
+ * data structures; so, we only need to grab READER's lock
+ */
+ if (behav != MADV_FREE)
+ SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
+ else
+ SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
+
+ /*
+ * Large pages are assumed to be only turned on when accesses to the
+ * segment's address range have spatial and temporal locality. That
+ * justifies ignoring MADV_SEQUENTIAL for large page segments.
+ * Also, ignore advice affecting lgroup memory allocation
+ * if don't need to do lgroup optimizations on this system
+ */
+
+ if ((behav == MADV_SEQUENTIAL && seg->s_szc != 0) ||
+ (!lgrp_optimizations() && (behav == MADV_ACCESS_DEFAULT ||
+ behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY))) {
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (0);
+ }
+
+ if (behav == MADV_SEQUENTIAL || behav == MADV_ACCESS_DEFAULT ||
+ behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY) {
+ /*
+ * Since we are going to unload hat mappings
+ * we first have to flush the cache. Otherwise
+ * this might lead to system panic if another
+ * thread is doing physio on the range whose
+ * mappings are unloaded by madvise(3C).
+ */
+ if (svd->softlockcnt > 0) {
+ /*
+ * Since we do have the segvn writers lock
+ * nobody can fill the cache with entries
+ * belonging to this seg during the purge.
+ * The flush either succeeds or we still
+			 * have pending I/Os. In the latter case,
+ * madvise(3C) fails.
+ */
+ segvn_purge(seg);
+ if (svd->softlockcnt > 0) {
+ /*
+ * Since madvise(3C) is advisory and
+ * it's not part of UNIX98, madvise(3C)
+ * failure here doesn't cause any hardship.
+ * Note that we don't block in "as" layer.
+ */
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (EAGAIN);
+ }
+ }
+ }
+
+ amp = svd->amp;
+ vp = svd->vp;
+ if (behav == MADV_FREE) {
+ /*
+ * MADV_FREE is not supported for segments with
+ * underlying object; if anonmap is NULL, anon slots
+ * are not yet populated and there is nothing for
+ * us to do. As MADV_FREE is advisory, we don't
+ * return error in either case.
+ */
+ if (vp || amp == NULL) {
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (0);
+ }
+
+ page = seg_page(seg, addr);
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
+ anon_disclaim(amp, svd->anon_index + page, len, 0);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (0);
+ }
+
+ /*
+ * If advice is to be applied to entire segment,
+ * use advice field in seg_data structure
+ * otherwise use appropriate vpage entry.
+ */
+ if ((addr == seg->s_base) && (len == seg->s_size)) {
+ switch (behav) {
+ case MADV_ACCESS_LWP:
+ case MADV_ACCESS_MANY:
+ case MADV_ACCESS_DEFAULT:
+ /*
+ * Set memory allocation policy for this segment
+ */
+ policy = lgrp_madv_to_policy(behav, len, svd->type);
+ if (svd->type == MAP_SHARED)
+ already_set = lgrp_shm_policy_set(policy, amp,
+ svd->anon_index, vp, svd->offset, len);
+ else {
+ /*
+ * For private memory, need writers lock on
+ * address space because the segment may be
+ * split or concatenated when changing policy
+ */
+ if (AS_READ_HELD(seg->s_as,
+ &seg->s_as->a_lock)) {
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (IE_RETRY);
+ }
+
+ already_set = lgrp_privm_policy_set(policy,
+ &svd->policy_info, len);
+ }
+
+ /*
+ * If policy set already and it shouldn't be reapplied,
+ * don't do anything.
+ */
+ if (already_set &&
+ !LGRP_MEM_POLICY_REAPPLICABLE(policy))
+ break;
+
+ /*
+ * Mark any existing pages in given range for
+ * migration
+ */
+ page_mark_migrate(seg, addr, len, amp, svd->anon_index,
+ vp, svd->offset, 1);
+
+ /*
+ * If same policy set already or this is a shared
+ * memory segment, don't need to try to concatenate
+ * segment with adjacent ones.
+ */
+ if (already_set || svd->type == MAP_SHARED)
+ break;
+
+ /*
+ * Try to concatenate this segment with previous
+ * one and next one, since we changed policy for
+ * this one and it may be compatible with adjacent
+ * ones now.
+ */
+ prev = AS_SEGPREV(seg->s_as, seg);
+ next = AS_SEGNEXT(seg->s_as, seg);
+
+ if (next && next->s_ops == &segvn_ops &&
+ addr + len == next->s_base)
+ (void) segvn_concat(seg, next, 1);
+
+ if (prev && prev->s_ops == &segvn_ops &&
+ addr == prev->s_base + prev->s_size) {
+ /*
+ * Drop lock for private data of current
+ * segment before concatenating (deleting) it
+ * and return IE_REATTACH to tell as_ctl() that
+ * current segment has changed
+ */
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ if (!segvn_concat(prev, seg, 1))
+ err = IE_REATTACH;
+
+ return (err);
+ }
+ break;
+
+ case MADV_SEQUENTIAL:
+ /*
+ * unloading mapping guarantees
+ * detection in segvn_fault
+ */
+ ASSERT(seg->s_szc == 0);
+ hat_unload(seg->s_as->a_hat, addr, len,
+ HAT_UNLOAD);
+ /* FALLTHROUGH */
+ case MADV_NORMAL:
+ case MADV_RANDOM:
+ svd->advice = (uchar_t)behav;
+ svd->pageadvice = 0;
+ break;
+ case MADV_WILLNEED: /* handled in memcntl */
+ case MADV_DONTNEED: /* handled in memcntl */
+ case MADV_FREE: /* handled above */
+ break;
+ default:
+ err = EINVAL;
+ }
+ } else {
+ caddr_t eaddr;
+ struct seg *new_seg;
+ struct segvn_data *new_svd;
+ u_offset_t off;
+ caddr_t oldeaddr;
+
+ page = seg_page(seg, addr);
+
+ segvn_vpage(seg);
+
+ switch (behav) {
+ struct vpage *bvpp, *evpp;
+
+ case MADV_ACCESS_LWP:
+ case MADV_ACCESS_MANY:
+ case MADV_ACCESS_DEFAULT:
+ /*
+ * Set memory allocation policy for portion of this
+ * segment
+ */
+
+ /*
+ * Align address and length of advice to page
+ * boundaries for large pages
+ */
+ if (seg->s_szc != 0) {
+ size_t pgsz;
+
+ pgsz = page_get_pagesize(seg->s_szc);
+ addr = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz);
+ len = P2ROUNDUP(len, pgsz);
+ }
+
+ /*
+ * Check to see whether policy is set already
+ */
+ policy = lgrp_madv_to_policy(behav, len, svd->type);
+
+ anon_index = svd->anon_index + page;
+ off = svd->offset + (uintptr_t)(addr - seg->s_base);
+
+ if (svd->type == MAP_SHARED)
+ already_set = lgrp_shm_policy_set(policy, amp,
+ anon_index, vp, off, len);
+ else
+ already_set =
+ (policy == svd->policy_info.mem_policy);
+
+ /*
+ * If policy set already and it shouldn't be reapplied,
+ * don't do anything.
+ */
+ if (already_set &&
+ !LGRP_MEM_POLICY_REAPPLICABLE(policy))
+ break;
+
+ /*
+ * For private memory, need writers lock on
+ * address space because the segment may be
+ * split or concatenated when changing policy
+ */
+ if (svd->type == MAP_PRIVATE &&
+ AS_READ_HELD(seg->s_as, &seg->s_as->a_lock)) {
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (IE_RETRY);
+ }
+
+ /*
+ * Mark any existing pages in given range for
+ * migration
+ */
+ page_mark_migrate(seg, addr, len, amp, svd->anon_index,
+ vp, svd->offset, 1);
+
+ /*
+ * Don't need to try to split or concatenate
+ * segments, since policy is same or this is a shared
+ * memory segment
+ */
+ if (already_set || svd->type == MAP_SHARED)
+ break;
+
+ /*
+ * Split off new segment if advice only applies to a
+ * portion of existing segment starting in middle
+ */
+ new_seg = NULL;
+ eaddr = addr + len;
+ oldeaddr = seg->s_base + seg->s_size;
+ if (addr > seg->s_base) {
+ /*
+ * Must flush I/O page cache
+ * before splitting segment
+ */
+ if (svd->softlockcnt > 0)
+ segvn_purge(seg);
+
+ /*
+ * Split segment and return IE_REATTACH to tell
+ * as_ctl() that current segment changed
+ */
+ new_seg = segvn_split_seg(seg, addr);
+ new_svd = (struct segvn_data *)new_seg->s_data;
+ err = IE_REATTACH;
+
+ /*
+ * If new segment ends where old one
+ * did, try to concatenate the new
+ * segment with next one.
+ */
+ if (eaddr == oldeaddr) {
+ /*
+ * Set policy for new segment
+ */
+ (void) lgrp_privm_policy_set(policy,
+ &new_svd->policy_info,
+ new_seg->s_size);
+
+ next = AS_SEGNEXT(new_seg->s_as,
+ new_seg);
+
+ if (next &&
+ next->s_ops == &segvn_ops &&
+ eaddr == next->s_base)
+ (void) segvn_concat(new_seg,
+ next, 1);
+ }
+ }
+
+ /*
+ * Split off end of existing segment if advice only
+ * applies to a portion of segment ending before
+ * end of the existing segment
+ */
+ if (eaddr < oldeaddr) {
+ /*
+ * Must flush I/O page cache
+ * before splitting segment
+ */
+ if (svd->softlockcnt > 0)
+ segvn_purge(seg);
+
+ /*
+ * If beginning of old segment was already
+ * split off, use new segment to split end off
+ * from.
+ */
+ if (new_seg != NULL && new_seg != seg) {
+ /*
+ * Split segment
+ */
+ (void) segvn_split_seg(new_seg, eaddr);
+
+ /*
+ * Set policy for new segment
+ */
+ (void) lgrp_privm_policy_set(policy,
+ &new_svd->policy_info,
+ new_seg->s_size);
+ } else {
+ /*
+ * Split segment and return IE_REATTACH
+ * to tell as_ctl() that current
+ * segment changed
+ */
+ (void) segvn_split_seg(seg, eaddr);
+ err = IE_REATTACH;
+
+ (void) lgrp_privm_policy_set(policy,
+ &svd->policy_info, seg->s_size);
+
+ /*
+ * If new segment starts where old one
+ * did, try to concatenate it with
+ * previous segment.
+ */
+ if (addr == seg->s_base) {
+ prev = AS_SEGPREV(seg->s_as,
+ seg);
+
+ /*
+ * Drop lock for private data
+ * of current segment before
+ * concatenating (deleting) it
+ */
+ if (prev &&
+ prev->s_ops ==
+ &segvn_ops &&
+ addr == prev->s_base +
+ prev->s_size) {
+ SEGVN_LOCK_EXIT(
+ seg->s_as,
+ &svd->lock);
+ (void) segvn_concat(
+ prev, seg, 1);
+ return (err);
+ }
+ }
+ }
+ }
+ break;
+ case MADV_SEQUENTIAL:
+ ASSERT(seg->s_szc == 0);
+ hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD);
+ /* FALLTHROUGH */
+ case MADV_NORMAL:
+ case MADV_RANDOM:
+ bvpp = &svd->vpage[page];
+ evpp = &svd->vpage[page + (len >> PAGESHIFT)];
+ for (; bvpp < evpp; bvpp++)
+ VPP_SETADVICE(bvpp, behav);
+ svd->advice = MADV_NORMAL;
+ break;
+ case MADV_WILLNEED: /* handled in memcntl */
+ case MADV_DONTNEED: /* handled in memcntl */
+ case MADV_FREE: /* handled above */
+ break;
+ default:
+ err = EINVAL;
+ }
+ }
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (err);
+}
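This routine is the backend of madvise(3C) (memcntl(2) MC_ADVISE). A short sketch exercising a few of the behaviors listed above on an anonymous mapping; MADV_ACCESS_LWP is a Solaris-specific lgroup hint and, as the code shows, is quietly ignored when lgroup optimizations are off:

#include <sys/types.h>
#include <sys/mman.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	size_t len = 16 * (size_t)sysconf(_SC_PAGESIZE);
	caddr_t p = mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANON, -1, 0);

	if (p == MAP_FAILED)
		exit(1);
	p[0] = 1;				/* populate one anon page */

	/* Sequential access: pages behind the fault become reclaimable. */
	if (madvise(p, len, MADV_SEQUENTIAL) == -1)
		perror("madvise MADV_SEQUENTIAL");

	/* The next LWP will use this range heavily (lgroup placement hint). */
	if (madvise(p, len, MADV_ACCESS_LWP) == -1)
		perror("madvise MADV_ACCESS_LWP");

	/* Contents are disposable; anon slots may be discarded. */
	if (madvise(p, len, MADV_FREE) == -1)
		perror("madvise MADV_FREE");

	(void) munmap(p, len);
	return (0);
}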
+
+/*
+ * Create a vpage structure for this seg.
+ */
+static void
+segvn_vpage(struct seg *seg)
+{
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+ struct vpage *vp, *evp;
+
+ ASSERT(SEGVN_WRITE_HELD(seg->s_as, &svd->lock));
+
+ /*
+ * If no vpage structure exists, allocate one. Copy the protections
+ * and the advice from the segment itself to the individual pages.
+ */
+ if (svd->vpage == NULL) {
+ svd->pageprot = 1;
+ svd->pageadvice = 1;
+ svd->vpage = kmem_zalloc(seg_pages(seg) * sizeof (struct vpage),
+ KM_SLEEP);
+ evp = &svd->vpage[seg_page(seg, seg->s_base + seg->s_size)];
+ for (vp = svd->vpage; vp < evp; vp++) {
+ VPP_SETPROT(vp, svd->prot);
+ VPP_SETADVICE(vp, svd->advice);
+ }
+ }
+}
+
+/*
+ * Dump the pages belonging to this segvn segment.
+ */
+static void
+segvn_dump(struct seg *seg)
+{
+ struct segvn_data *svd;
+ page_t *pp;
+ struct anon_map *amp;
+ ulong_t anon_index;
+ struct vnode *vp;
+ u_offset_t off, offset;
+ pfn_t pfn;
+ pgcnt_t page, npages;
+ caddr_t addr;
+
+ npages = seg_pages(seg);
+ svd = (struct segvn_data *)seg->s_data;
+ vp = svd->vp;
+ off = offset = svd->offset;
+ addr = seg->s_base;
+
+ if ((amp = svd->amp) != NULL) {
+ anon_index = svd->anon_index;
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
+ }
+
+ for (page = 0; page < npages; page++, offset += PAGESIZE) {
+ struct anon *ap;
+ int we_own_it = 0;
+
+ if (amp && (ap = anon_get_ptr(svd->amp->ahp, anon_index++))) {
+ swap_xlate_nopanic(ap, &vp, &off);
+ } else {
+ vp = svd->vp;
+ off = offset;
+ }
+
+ /*
+ * If pp == NULL, the page either does not exist
+ * or is exclusively locked. So determine if it
+ * exists before searching for it.
+ */
+
+ if ((pp = page_lookup_nowait(vp, off, SE_SHARED)))
+ we_own_it = 1;
+ else
+ pp = page_exists(vp, off);
+
+ if (pp) {
+ pfn = page_pptonum(pp);
+ dump_addpage(seg->s_as, addr, pfn);
+ if (we_own_it)
+ page_unlock(pp);
+ }
+ addr += PAGESIZE;
+ dump_timeleft = dump_timeout;
+ }
+
+ if (amp != NULL)
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+}
+
+/*
+ * lock/unlock anon pages over a given range. Return shadow list
+ */
+static int
+segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp,
+ enum lock_type type, enum seg_rw rw)
+{
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+ size_t np, adjustpages = 0, npages = (len >> PAGESHIFT);
+ ulong_t anon_index;
+ uint_t protchk;
+ uint_t error;
+ struct anon_map *amp;
+ struct page **pplist, **pl, *pp;
+ caddr_t a;
+ size_t page;
+ caddr_t lpgaddr, lpgeaddr;
+
+ TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_START,
+ "segvn_pagelock: start seg %p addr %p", seg, addr);
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+ if (seg->s_szc != 0 && (type == L_PAGELOCK || type == L_PAGEUNLOCK)) {
+ /*
+ * We are adjusting the pagelock region to the large page size
+ * boundary because the unlocked part of a large page cannot
+ * be freed anyway unless all constituent pages of a large
+ * page are locked. Therefore this adjustment allows us to
+ * decrement availrmem by the right value (note we don't want
+		 * to just decrement availrmem by the large page size without
+ * adjusting addr and len because then we may end up
+ * decrementing availrmem by large page size for every
+ * constituent page locked by a new as_pagelock call).
+ * as_pageunlock caller must always match as_pagelock call's
+ * addr and len.
+ *
+ * Note segment's page size cannot change while we are holding
+ * as lock. And then it cannot change while softlockcnt is
+ * not 0. This will allow us to correctly recalculate large
+ * page size region for the matching pageunlock/reclaim call.
+ *
+ * for pageunlock *ppp points to the pointer of page_t that
+		 * corresponds to the real unadjusted start address. Similarly,
+		 * for pagelock *ppp must point to the pointer of page_t that
+ * corresponds to the real unadjusted start address.
+ */
+ size_t pgsz = page_get_pagesize(seg->s_szc);
+ CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr);
+ adjustpages = ((uintptr_t)(addr - lpgaddr)) >> PAGESHIFT;
+ }
+
+ if (type == L_PAGEUNLOCK) {
+
+ /*
+ * update hat ref bits for /proc. We need to make sure
+ * that threads tracing the ref and mod bits of the
+ * address space get the right data.
+ * Note: page ref and mod bits are updated at reclaim time
+ */
+ if (seg->s_as->a_vbits) {
+ for (a = addr; a < addr + len; a += PAGESIZE) {
+ if (rw == S_WRITE) {
+ hat_setstat(seg->s_as, a,
+ PAGESIZE, P_REF | P_MOD);
+ } else {
+ hat_setstat(seg->s_as, a,
+ PAGESIZE, P_REF);
+ }
+ }
+ }
+ SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
+ if (seg->s_szc != 0) {
+ VM_STAT_ADD(segvnvmstats.pagelock[0]);
+ seg_pinactive(seg, lpgaddr, lpgeaddr - lpgaddr,
+ *ppp - adjustpages, rw, segvn_reclaim);
+ } else {
+ seg_pinactive(seg, addr, len, *ppp, rw, segvn_reclaim);
+ }
+
+ /*
+ * If someone is blocked while unmapping, we purge
+ * segment page cache and thus reclaim pplist synchronously
+ * without waiting for seg_pasync_thread. This speeds up
+ * unmapping in cases where munmap(2) is called, while
+ * raw async i/o is still in progress or where a thread
+ * exits on data fault in a multithreaded application.
+ */
+ if (AS_ISUNMAPWAIT(seg->s_as) && (svd->softlockcnt > 0)) {
+ /*
+ * Even if we grab segvn WRITER's lock or segp_slock
+ * here, there might be another thread which could've
+ * successfully performed lookup/insert just before
+ * we acquired the lock here. So, grabbing either
+			 * lock here is not of much use. Until we devise
+ * a strategy at upper layers to solve the
+ * synchronization issues completely, we expect
+ * applications to handle this appropriately.
+ */
+ segvn_purge(seg);
+ }
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END,
+ "segvn_pagelock: unlock seg %p addr %p", seg, addr);
+ return (0);
+ } else if (type == L_PAGERECLAIM) {
+ VM_STAT_COND_ADD(seg->s_szc != 0, segvnvmstats.pagelock[1]);
+ SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
+ (void) segvn_reclaim(seg, addr, len, *ppp, rw);
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END,
+ "segvn_pagelock: reclaim seg %p addr %p", seg, addr);
+ return (0);
+ }
+
+ if (seg->s_szc != 0) {
+ VM_STAT_ADD(segvnvmstats.pagelock[2]);
+ addr = lpgaddr;
+ len = lpgeaddr - lpgaddr;
+ npages = (len >> PAGESHIFT);
+ }
+
+ /*
+	 * for now we only support pagelock to anon memory. We would have to
+	 * check protections for vnode objects and call into the vnode driver.
+ * That's too much for a fast path. Let the fault entry point handle it.
+ */
+ if (svd->vp != NULL) {
+ TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END,
+ "segvn_pagelock: mapped vnode seg %p addr %p", seg, addr);
+ *ppp = NULL;
+ return (ENOTSUP);
+ }
+
+ /*
+ * if anonmap is not yet created, let the fault entry point populate it
+ * with anon ptrs.
+ */
+ if ((amp = svd->amp) == NULL) {
+ TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END,
+ "segvn_pagelock: anonmap null seg %p addr %p", seg, addr);
+ *ppp = NULL;
+ return (EFAULT);
+ }
+
+ SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
+
+ /*
+ * we acquire segp_slock to prevent duplicate entries
+ * in seg_pcache
+ */
+ mutex_enter(&svd->segp_slock);
+
+ /*
+ * try to find pages in segment page cache
+ */
+ pplist = seg_plookup(seg, addr, len, rw);
+ if (pplist != NULL) {
+ mutex_exit(&svd->segp_slock);
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ *ppp = pplist + adjustpages;
+ TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_HIT_END,
+ "segvn_pagelock: cache hit seg %p addr %p", seg, addr);
+ return (0);
+ }
+
+ if (rw == S_READ) {
+ protchk = PROT_READ;
+ } else {
+ protchk = PROT_WRITE;
+ }
+
+ if (svd->pageprot == 0) {
+ if ((svd->prot & protchk) == 0) {
+ mutex_exit(&svd->segp_slock);
+ error = EFAULT;
+ goto out;
+ }
+ } else {
+ /*
+ * check page protections
+ */
+ for (a = addr; a < addr + len; a += PAGESIZE) {
+ struct vpage *vp;
+
+ vp = &svd->vpage[seg_page(seg, a)];
+ if ((VPP_PROT(vp) & protchk) == 0) {
+ mutex_exit(&svd->segp_slock);
+ error = EFAULT;
+ goto out;
+ }
+ }
+ }
+
+ mutex_enter(&freemem_lock);
+ if (availrmem < tune.t_minarmem + npages) {
+ mutex_exit(&freemem_lock);
+ mutex_exit(&svd->segp_slock);
+ error = ENOMEM;
+ goto out;
+ } else {
+ svd->softlockcnt += npages;
+ availrmem -= npages;
+ segvn_pages_locked += npages;
+ }
+ mutex_exit(&freemem_lock);
+
+ pplist = kmem_alloc(sizeof (page_t *) * npages, KM_SLEEP);
+ pl = pplist;
+ *ppp = pplist + adjustpages;
+
+ page = seg_page(seg, addr);
+ anon_index = svd->anon_index + page;
+
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
+ for (a = addr; a < addr + len; a += PAGESIZE, anon_index++) {
+ struct anon *ap;
+ struct vnode *vp;
+ u_offset_t off;
+ anon_sync_obj_t cookie;
+
+ anon_array_enter(amp, anon_index, &cookie);
+ ap = anon_get_ptr(amp->ahp, anon_index);
+ if (ap == NULL) {
+ anon_array_exit(&cookie);
+ break;
+ } else {
+ /*
+ * We must never use seg_pcache for COW pages
+ * because we might end up with original page still
+ * lying in seg_pcache even after private page is
+ * created. This leads to data corruption as
+ * aio_write refers to the page still in cache
+ * while all other accesses refer to the private
+ * page.
+ */
+ if (ap->an_refcnt != 1) {
+ anon_array_exit(&cookie);
+ break;
+ }
+ }
+ swap_xlate(ap, &vp, &off);
+ anon_array_exit(&cookie);
+
+ pp = page_lookup_nowait(vp, off, SE_SHARED);
+ if (pp == NULL) {
+ break;
+ }
+ *pplist++ = pp;
+ }
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+
+ if (a >= addr + len) {
+ (void) seg_pinsert(seg, addr, len, pl, rw, SEGP_ASYNC_FLUSH,
+ segvn_reclaim);
+ mutex_exit(&svd->segp_slock);
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_FILL_END,
+ "segvn_pagelock: cache fill seg %p addr %p", seg, addr);
+ return (0);
+ }
+
+ mutex_exit(&svd->segp_slock);
+ error = EFAULT;
+ pplist = pl;
+ np = ((uintptr_t)(a - addr)) >> PAGESHIFT;
+ while (np > (uint_t)0) {
+ page_unlock(*pplist);
+ np--;
+ pplist++;
+ }
+ kmem_free(pl, sizeof (page_t *) * npages);
+ mutex_enter(&freemem_lock);
+ svd->softlockcnt -= npages;
+ availrmem += npages;
+ segvn_pages_locked -= npages;
+ mutex_exit(&freemem_lock);
+ if (svd->softlockcnt <= 0) {
+ if (AS_ISUNMAPWAIT(seg->s_as)) {
+ mutex_enter(&seg->s_as->a_contents);
+ if (AS_ISUNMAPWAIT(seg->s_as)) {
+ AS_CLRUNMAPWAIT(seg->s_as);
+ cv_broadcast(&seg->s_as->a_cv);
+ }
+ mutex_exit(&seg->s_as->a_contents);
+ }
+ }
+
+out:
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ *ppp = NULL;
+ TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END,
+ "segvn_pagelock: cache miss seg %p addr %p", seg, addr);
+ return (error);
+}
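The large-page adjustment described at the top of segvn_pagelock() is plain alignment arithmetic: round the requested range out to pgsz boundaries and remember how many PAGESIZE pages the start moved, so *ppp can still point at the caller's unadjusted start. A self-contained illustration with hypothetical values (the kernel does this through the CALC_LPG_REGION() macro):

#include <stdio.h>

/* Local stand-ins for the kernel's P2ALIGN()/P2ROUNDUP() macros. */
#define	ALIGN_DOWN(x, a)	((x) & ~((a) - 1))
#define	ROUND_UP(x, a)		(-(-(x) & -(a)))

int
main(void)
{
	unsigned long pgsz = 4UL << 20;		/* hypothetical 4M large page */
	unsigned long pagesize = 1UL << 12;	/* base page size used here */
	unsigned long addr = 0x10003000UL;	/* requested lock range */
	unsigned long len = 0x2000UL;

	unsigned long lpgaddr = ALIGN_DOWN(addr, pgsz);
	unsigned long lpgeaddr = ROUND_UP(addr + len, pgsz);
	unsigned long adjustpages = (addr - lpgaddr) / pagesize;

	(void) printf("lock [%#lx, %#lx), *ppp offset %lu pages\n",
	    lpgaddr, lpgeaddr, adjustpages);
	return (0);
}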
+
+/*
+ * purge any cached pages in the I/O page cache
+ */
+static void
+segvn_purge(struct seg *seg)
+{
+ seg_ppurge(seg);
+}
+
+static int
+segvn_reclaim(struct seg *seg, caddr_t addr, size_t len, struct page **pplist,
+ enum seg_rw rw)
+{
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+ pgcnt_t np, npages;
+ struct page **pl;
+
+#ifdef lint
+ addr = addr;
+#endif
+
+ npages = np = (len >> PAGESHIFT);
+ ASSERT(npages);
+ pl = pplist;
+ if (seg->s_szc != 0) {
+ size_t pgsz = page_get_pagesize(seg->s_szc);
+ if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) {
+ panic("segvn_reclaim: unaligned addr or len");
+ /*NOTREACHED*/
+ }
+ }
+
+ while (np > (uint_t)0) {
+ if (rw == S_WRITE) {
+ hat_setrefmod(*pplist);
+ } else {
+ hat_setref(*pplist);
+ }
+ page_unlock(*pplist);
+ np--;
+ pplist++;
+ }
+ kmem_free(pl, sizeof (page_t *) * npages);
+
+ mutex_enter(&freemem_lock);
+ availrmem += npages;
+ segvn_pages_locked -= npages;
+ svd->softlockcnt -= npages;
+ mutex_exit(&freemem_lock);
+ if (svd->softlockcnt <= 0) {
+ if (AS_ISUNMAPWAIT(seg->s_as)) {
+ mutex_enter(&seg->s_as->a_contents);
+ if (AS_ISUNMAPWAIT(seg->s_as)) {
+ AS_CLRUNMAPWAIT(seg->s_as);
+ cv_broadcast(&seg->s_as->a_cv);
+ }
+ mutex_exit(&seg->s_as->a_contents);
+ }
+ }
+ return (0);
+}
+/*
+ * get a memory ID for an addr in a given segment
+ *
+ * XXX only creates PAGESIZE pages if anon slots are not initialized.
+ * At fault time they will be relocated into larger pages.
+ */
+static int
+segvn_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
+{
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+ struct anon *ap = NULL;
+ ulong_t anon_index;
+ struct anon_map *amp;
+ anon_sync_obj_t cookie;
+
+ if (svd->type == MAP_PRIVATE) {
+ memidp->val[0] = (uintptr_t)seg->s_as;
+ memidp->val[1] = (uintptr_t)addr;
+ return (0);
+ }
+
+ if (svd->type == MAP_SHARED) {
+ if (svd->vp) {
+ memidp->val[0] = (uintptr_t)svd->vp;
+ memidp->val[1] = (u_longlong_t)svd->offset +
+ (uintptr_t)(addr - seg->s_base);
+ return (0);
+ } else {
+
+ SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
+ if ((amp = svd->amp) != NULL) {
+ anon_index = svd->anon_index +
+ seg_page(seg, addr);
+ }
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+
+ ASSERT(amp != NULL);
+
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
+ anon_array_enter(amp, anon_index, &cookie);
+ ap = anon_get_ptr(amp->ahp, anon_index);
+ if (ap == NULL) {
+ page_t *pp;
+
+ pp = anon_zero(seg, addr, &ap, svd->cred);
+ if (pp == NULL) {
+ anon_array_exit(&cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ return (ENOMEM);
+ }
+ ASSERT(anon_get_ptr(amp->ahp, anon_index)
+ == NULL);
+ (void) anon_set_ptr(amp->ahp, anon_index,
+ ap, ANON_SLEEP);
+ page_unlock(pp);
+ }
+
+ anon_array_exit(&cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+
+ memidp->val[0] = (uintptr_t)ap;
+ memidp->val[1] = (uintptr_t)addr & PAGEOFFSET;
+ return (0);
+ }
+ }
+ return (EINVAL);
+}
+
+static int
+sameprot(struct seg *seg, caddr_t a, size_t len)
+{
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+ struct vpage *vpage;
+ spgcnt_t pages = btop(len);
+ uint_t prot;
+
+ if (svd->pageprot == 0)
+ return (1);
+
+ ASSERT(svd->vpage != NULL);
+
+ vpage = &svd->vpage[seg_page(seg, a)];
+ prot = VPP_PROT(vpage);
+ vpage++;
+ pages--;
+ while (pages-- > 0) {
+ if (prot != VPP_PROT(vpage))
+ return (0);
+ vpage++;
+ }
+ return (1);
+}
+
+/*
+ * Get memory allocation policy info for specified address in given segment
+ */
+static lgrp_mem_policy_info_t *
+segvn_getpolicy(struct seg *seg, caddr_t addr)
+{
+ struct anon_map *amp;
+ ulong_t anon_index;
+ lgrp_mem_policy_info_t *policy_info;
+ struct segvn_data *svn_data;
+ u_offset_t vn_off;
+ vnode_t *vp;
+
+ ASSERT(seg != NULL);
+
+ svn_data = (struct segvn_data *)seg->s_data;
+ if (svn_data == NULL)
+ return (NULL);
+
+ /*
+ * Get policy info for private or shared memory
+ */
+ if (svn_data->type != MAP_SHARED)
+ policy_info = &svn_data->policy_info;
+ else {
+ amp = svn_data->amp;
+ anon_index = svn_data->anon_index + seg_page(seg, addr);
+ vp = svn_data->vp;
+ vn_off = svn_data->offset + (uintptr_t)(addr - seg->s_base);
+ policy_info = lgrp_shm_policy_get(amp, anon_index, vp, vn_off);
+ }
+
+ return (policy_info);
+}
diff --git a/usr/src/uts/common/vm/seg_vn.h b/usr/src/uts/common/vm/seg_vn.h
new file mode 100644
index 0000000000..4f66d495dd
--- /dev/null
+++ b/usr/src/uts/common/vm/seg_vn.h
@@ -0,0 +1,168 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+#ifndef _VM_SEG_VN_H
+#define _VM_SEG_VN_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/lgrp.h>
+#include <vm/anon.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * A pointer to this structure is passed to segvn_create().
+ */
+typedef struct segvn_crargs {
+ struct vnode *vp; /* vnode mapped from */
+ struct cred *cred; /* credentials */
+ u_offset_t offset; /* starting offset of vnode for mapping */
+ uchar_t type; /* type of sharing done */
+ uchar_t prot; /* protections */
+ uchar_t maxprot; /* maximum protections */
+ uint_t flags; /* flags */
+ struct anon_map *amp; /* anon mapping to map to */
+ uint_t szc; /* max preferred page size code */
+ uint_t lgrp_mem_policy_flags;
+} segvn_crargs_t;
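A caller (typically a file system's segmap routine or the anonymous mmap path) fills one of these and hands it to as_map() with segvn_create as the creation function. A hedged fragment showing the usual shape; the variable names are placeholders for values the caller already holds:

	struct segvn_crargs vn_a;

	vn_a.vp = vp;			/* vnode being mapped, NULL for anon */
	vn_a.offset = off;		/* byte offset within the vnode */
	vn_a.type = MAP_SHARED;		/* or MAP_PRIVATE */
	vn_a.prot = prot;
	vn_a.maxprot = maxprot;
	vn_a.flags = flags;
	vn_a.cred = cr;
	vn_a.amp = NULL;		/* seg_vn allocates an anon_map if needed */
	vn_a.szc = 0;			/* no preferred large page size */
	vn_a.lgrp_mem_policy_flags = 0;

	error = as_map(as, addr, len, segvn_create, &vn_a);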
+
+/*
+ * (Semi) private data maintained by the seg_vn driver per segment mapping.
+ *
+ * The read/write segment lock protects all of segvn_data including the
+ * vpage array. All fields in segvn_data are treated as read-only when
+ * the "read" version of the address space and the segment locks are held.
+ * The "write" version of the segment lock, however, is required in order to
+ * update the following fields:
+ *
+ * pageprot
+ * prot
+ * amp
+ * vpage
+ *
+ * softlockcnt
+ * is written by acquiring either the readers lock on the segment and
+ * freemem lock, or any lock combination which guarantees exclusive use
+ *	of this segment (e.g., address space writers lock,
+ * address space readers lock + segment writers lock).
+ */
+typedef struct segvn_data {
+ krwlock_t lock; /* protect segvn_data and vpage array */
+ kmutex_t segp_slock; /* serialize insertions into seg_pcache */
+ uchar_t pageprot; /* true if per page protections present */
+ uchar_t prot; /* current segment prot if pageprot == 0 */
+ uchar_t maxprot; /* maximum segment protections */
+ uchar_t type; /* type of sharing done */
+ u_offset_t offset; /* starting offset of vnode for mapping */
+ struct vnode *vp; /* vnode that segment mapping is to */
+ ulong_t anon_index; /* starting index into anon_map anon array */
+ struct anon_map *amp; /* pointer to anon share structure, if needed */
+ struct vpage *vpage; /* per-page information, if needed */
+ struct cred *cred; /* mapping credentials */
+ size_t swresv; /* swap space reserved for this segment */
+ uchar_t advice; /* madvise flags for segment */
+ uchar_t pageadvice; /* true if per page advice set */
+ ushort_t flags; /* flags - from sys/mman.h */
+ ssize_t softlockcnt; /* # of pages SOFTLOCKED in seg */
+ lgrp_mem_policy_info_t policy_info; /* memory allocation policy */
+} segvn_data_t;
+
+#ifdef _KERNEL
+
+/*
+ * Macros for segvn segment driver locking.
+ */
+#define SEGVN_LOCK_ENTER(as, lock, type) rw_enter((lock), (type))
+#define SEGVN_LOCK_EXIT(as, lock) rw_exit((lock))
+#define SEGVN_LOCK_DOWNGRADE(as, lock) rw_downgrade((lock))
+
+/*
+ * Macros to test lock states.
+ */
+#define SEGVN_LOCK_HELD(as, lock) RW_LOCK_HELD((lock))
+#define SEGVN_READ_HELD(as, lock) RW_READ_HELD((lock))
+#define SEGVN_WRITE_HELD(as, lock) RW_WRITE_HELD((lock))
+
+/*
+ * Macro used to detect the need to break the sharing of COW pages
+ *
+ * The rw == S_WRITE is for the COW case
+ * rw == S_READ and type == SOFTLOCK is for the physio case
+ * We don't want to share a softlocked page because it can cause problems
+ * with multithreaded apps but if rw == S_READ_NOCOW it's ok to not break
+ * sharing of COW pages even in SOFTLOCK case.
+ */
+#define BREAK_COW_SHARE(rw, type, seg_type) ((rw == S_WRITE || \
+ (type == F_SOFTLOCK && rw != S_READ_NOCOW)) && \
+ seg_type == MAP_PRIVATE)
+
+#define SEGVN_ZFOD_ARGS(prot, max) \
+ { NULL, NULL, 0, MAP_PRIVATE, prot, max, 0, NULL, 0, 0 }
+
+#define AS_MAP_VNSEGS_USELPGS(crfp, argsp) \
+ ((crfp) == (int (*)())segvn_create && \
+ (((struct segvn_crargs *)(argsp))->flags & \
+ (MAP_TEXT | MAP_INITDATA)) && \
+ ((struct segvn_crargs *)(argsp))->vp != NULL && \
+ ((struct segvn_crargs *)(argsp))->amp == NULL)
+
+
+extern void segvn_init(void);
+extern int segvn_create(struct seg *, void *);
+
+extern struct seg_ops segvn_ops;
+
+/*
+ * Provided as shorthand for creating user zfod segments.
+ */
+extern caddr_t zfod_argsp;
+extern caddr_t kzfod_argsp;
+extern caddr_t stack_exec_argsp;
+extern caddr_t stack_noexec_argsp;
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _VM_SEG_VN_H */
diff --git a/usr/src/uts/common/vm/vm_anon.c b/usr/src/uts/common/vm/vm_anon.c
new file mode 100644
index 0000000000..b8da5c97c2
--- /dev/null
+++ b/usr/src/uts/common/vm/vm_anon.c
@@ -0,0 +1,3197 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * VM - anonymous pages.
+ *
+ * This layer sits immediately above the vm_swap layer. It manages
+ * physical pages that have no permanent identity in the file system
+ * name space, using the services of the vm_swap layer to allocate
+ * backing storage for these pages. Since these pages have no external
+ * identity, they are discarded when the last reference is removed.
+ *
+ * An important function of this layer is to manage low-level sharing
+ * of pages that are logically distinct but that happen to be
+ * physically identical (e.g., the corresponding pages of the processes
+ * resulting from a fork before one process or the other changes their
+ * contents). This pseudo-sharing is present only as an optimization
+ * and is not to be confused with true sharing in which multiple
+ * address spaces deliberately contain references to the same object;
+ * such sharing is managed at a higher level.
+ *
+ * The key data structure here is the anon struct, which contains a
+ * reference count for its associated physical page and a hint about
+ * the identity of that page. Anon structs typically live in arrays,
+ * with an instance's position in its array determining where the
+ * corresponding backing storage is allocated; however, the swap_xlate()
+ * routine abstracts away this representation information so that the
+ * rest of the anon layer need not know it. (See the swap layer for
+ * more details on anon struct layout.)
+ *
+ * In future versions of the system, the association between an
+ * anon struct and its position on backing store will change so that
+ * we don't require backing store for all anonymous pages in the
+ * system. This is an important consideration for large memory systems.
+ * We can also use this technique to delay binding physical locations
+ * to anonymous pages until pageout/swapout time where we can make
+ * smarter allocation decisions to improve anonymous klustering.
+ *
+ * Many of the routines defined here take a (struct anon **) argument,
+ * which allows the code at this level to manage anon pages directly,
+ * so that callers can regard anon structs as opaque objects and not be
+ * concerned with assigning or inspecting their contents.
+ *
+ * Clients of this layer refer to anon pages indirectly. That is, they
+ * maintain arrays of pointers to anon structs rather than maintaining
+ * anon structs themselves. The (struct anon **) arguments mentioned
+ * above are pointers to entries in these arrays. It is these arrays
+ * that capture the mapping between offsets within a given segment and
+ * the corresponding anonymous backing storage address.
+ */
+
+#ifdef DEBUG
+#define ANON_DEBUG
+#endif
+
+#include <sys/types.h>
+#include <sys/t_lock.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/mman.h>
+#include <sys/cred.h>
+#include <sys/thread.h>
+#include <sys/vnode.h>
+#include <sys/cpuvar.h>
+#include <sys/swap.h>
+#include <sys/cmn_err.h>
+#include <sys/vtrace.h>
+#include <sys/kmem.h>
+#include <sys/sysmacros.h>
+#include <sys/bitmap.h>
+#include <sys/vmsystm.h>
+#include <sys/debug.h>
+#include <sys/tnf_probe.h>
+#include <sys/lgrp.h>
+#include <sys/policy.h>
+#include <sys/condvar_impl.h>
+#include <sys/mutex_impl.h>
+
+#include <vm/as.h>
+#include <vm/hat.h>
+#include <vm/anon.h>
+#include <vm/page.h>
+#include <vm/vpage.h>
+#include <vm/seg.h>
+#include <vm/rm.h>
+
+#include <fs/fs_subr.h>
+
+int anon_debug;
+
+kmutex_t anoninfo_lock;
+struct k_anoninfo k_anoninfo;
+ani_free_t ani_free_pool[ANI_MAX_POOL];
+pad_mutex_t anon_array_lock[ANON_LOCKSIZE];
+kcondvar_t anon_array_cv[ANON_LOCKSIZE];
+
+/*
+ * Global hash table for (vp, off) -> anon slot
+ */
+extern int swap_maxcontig;
+size_t anon_hash_size;
+struct anon **anon_hash;
+
+static struct kmem_cache *anon_cache;
+static struct kmem_cache *anonmap_cache;
+
+#ifdef VM_STATS
+static struct anonvmstats_str {
+ ulong_t getpages[30];
+ ulong_t privatepages[10];
+ ulong_t demotepages[9];
+ ulong_t decrefpages[9];
+ ulong_t dupfillholes[4];
+ ulong_t freepages[1];
+} anonvmstats;
+#endif /* VM_STATS */
+
+
+/*ARGSUSED*/
+static int
+anonmap_cache_constructor(void *buf, void *cdrarg, int kmflags)
+{
+ struct anon_map *amp = buf;
+
+ rw_init(&amp->a_rwlock, NULL, RW_DEFAULT, NULL);
+ return (0);
+}
+
+/*ARGSUSED1*/
+static void
+anonmap_cache_destructor(void *buf, void *cdrarg)
+{
+ struct anon_map *amp = buf;
+
+ rw_destroy(&amp->a_rwlock);
+}
+
+kmutex_t anonhash_lock[AH_LOCK_SIZE];
+kmutex_t anonpages_hash_lock[AH_LOCK_SIZE];
+
+void
+anon_init(void)
+{
+ int i;
+
+ anon_hash_size = 1L << highbit(physmem / ANON_HASHAVELEN);
+
+ for (i = 0; i < AH_LOCK_SIZE; i++) {
+ mutex_init(&anonhash_lock[i], NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&anonpages_hash_lock[i], NULL, MUTEX_DEFAULT, NULL);
+ }
+
+ for (i = 0; i < ANON_LOCKSIZE; i++) {
+ mutex_init(&anon_array_lock[i].pad_mutex, NULL,
+ MUTEX_DEFAULT, NULL);
+ cv_init(&anon_array_cv[i], NULL, CV_DEFAULT, NULL);
+ }
+
+ anon_hash = (struct anon **)
+ kmem_zalloc(sizeof (struct anon *) * anon_hash_size, KM_SLEEP);
+ anon_cache = kmem_cache_create("anon_cache", sizeof (struct anon),
+ AN_CACHE_ALIGN, NULL, NULL, NULL, NULL, NULL, 0);
+ anonmap_cache = kmem_cache_create("anonmap_cache",
+ sizeof (struct anon_map), 0,
+ anonmap_cache_constructor, anonmap_cache_destructor, NULL,
+ NULL, NULL, 0);
+ swap_maxcontig = (1024 * 1024) >> PAGESHIFT; /* 1MB of pages */
+}
+
+/*
+ * Global anon slot hash table manipulation.
+ */
+
+static void
+anon_addhash(struct anon *ap)
+{
+ int index;
+
+ ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]));
+ index = ANON_HASH(ap->an_vp, ap->an_off);
+ ap->an_hash = anon_hash[index];
+ anon_hash[index] = ap;
+}
+
+static void
+anon_rmhash(struct anon *ap)
+{
+ struct anon **app;
+
+ ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]));
+
+ for (app = &anon_hash[ANON_HASH(ap->an_vp, ap->an_off)];
+ *app; app = &((*app)->an_hash)) {
+ if (*app == ap) {
+ *app = ap->an_hash;
+ break;
+ }
+ }
+}
+
+/*
+ * The anon array interfaces. Functions allocating,
+ * freeing array of pointers, and returning/setting
+ * entries in the array of pointers for a given offset.
+ *
+ * Create the list of pointers
+ */
+struct anon_hdr *
+anon_create(pgcnt_t npages, int flags)
+{
+ struct anon_hdr *ahp;
+ ulong_t nchunks;
+ int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
+
+ if ((ahp = kmem_zalloc(sizeof (struct anon_hdr), kmemflags)) == NULL) {
+ return (NULL);
+ }
+
+ mutex_init(&ahp->serial_lock, NULL, MUTEX_DEFAULT, NULL);
+ /*
+ * Single level case.
+ */
+ ahp->size = npages;
+ if (npages <= ANON_CHUNK_SIZE || (flags & ANON_ALLOC_FORCE)) {
+
+ if (flags & ANON_ALLOC_FORCE)
+ ahp->flags |= ANON_ALLOC_FORCE;
+
+ ahp->array_chunk = kmem_zalloc(
+ ahp->size * sizeof (struct anon *), kmemflags);
+
+ if (ahp->array_chunk == NULL) {
+ kmem_free(ahp, sizeof (struct anon_hdr));
+ return (NULL);
+ }
+ } else {
+ /*
+ * 2 Level case.
+ */
+ nchunks = (ahp->size + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT;
+
+ ahp->array_chunk = kmem_zalloc(nchunks * sizeof (ulong_t *),
+ kmemflags);
+
+ if (ahp->array_chunk == NULL) {
+ kmem_free(ahp, sizeof (struct anon_hdr));
+ return (NULL);
+ }
+ }
+ return (ahp);
+}
+
+/*
+ * Free the array of pointers
+ */
+void
+anon_release(struct anon_hdr *ahp, pgcnt_t npages)
+{
+ ulong_t i;
+ void **ppp;
+ ulong_t nchunks;
+
+ ASSERT(npages == ahp->size);
+
+ /*
+ * Single level case.
+ */
+ if (npages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) {
+ kmem_free(ahp->array_chunk, ahp->size * sizeof (struct anon *));
+ } else {
+ /*
+ * 2 level case.
+ */
+ nchunks = (ahp->size + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT;
+ for (i = 0; i < nchunks; i++) {
+ ppp = &ahp->array_chunk[i];
+ if (*ppp != NULL)
+ kmem_free(*ppp, PAGESIZE);
+ }
+ kmem_free(ahp->array_chunk, nchunks * sizeof (ulong_t *));
+ }
+ mutex_destroy(&ahp->serial_lock);
+ kmem_free(ahp, sizeof (struct anon_hdr));
+}
+
+/*
+ * Return the pointer from the list for a
+ * specified anon index.
+ */
+struct anon *
+anon_get_ptr(struct anon_hdr *ahp, ulong_t an_idx)
+{
+ struct anon **app;
+
+ ASSERT(an_idx < ahp->size);
+
+ /*
+ * Single level case.
+ */
+ if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) {
+ return ((struct anon *)
+ ((uintptr_t)ahp->array_chunk[an_idx] & ANON_PTRMASK));
+ } else {
+
+ /*
+ * 2 level case.
+ */
+ app = ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
+ if (app) {
+ return ((struct anon *)
+ ((uintptr_t)app[an_idx & ANON_CHUNK_OFF] &
+ ANON_PTRMASK));
+ } else {
+ return (NULL);
+ }
+ }
+}
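The one-level/two-level split above is just index arithmetic: the upper bits of an_idx select a lazily allocated chunk, the lower bits select the slot within that chunk, and a missing chunk simply means a NULL slot. A standalone sketch with a hypothetical chunk size standing in for ANON_CHUNK_SIZE:

#include <stdio.h>
#include <stdlib.h>

#define	CHUNK_SHIFT	9	/* hypothetical; the kernel derives it from PAGESIZE */
#define	CHUNK_SIZE	(1UL << CHUNK_SHIFT)
#define	CHUNK_OFF	(CHUNK_SIZE - 1)

int
main(void)
{
	/* Top level: an array of chunk pointers; chunks are filled lazily. */
	int **chunks = calloc(4, sizeof (int *));
	unsigned long an_idx = 1000;
	unsigned long c = an_idx >> CHUNK_SHIFT;	/* which chunk */
	unsigned long o = an_idx & CHUNK_OFF;		/* slot within it */

	if (chunks == NULL)
		return (1);

	/* anon_set_ptr(): allocate the second level on first use. */
	if (chunks[c] == NULL &&
	    (chunks[c] = calloc(CHUNK_SIZE, sizeof (int))) == NULL)
		return (1);
	chunks[c][o] = 42;

	/* anon_get_ptr(): an absent chunk means a NULL slot. */
	(void) printf("an_idx %lu -> chunk %lu, slot %lu, value %d\n",
	    an_idx, c, o, chunks[c][o]);

	free(chunks[c]);
	free(chunks);
	return (0);
}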
+
+/*
+ * Return the anon pointer for the first valid entry in the anon list,
+ * starting from the given index.
+ */
+struct anon *
+anon_get_next_ptr(struct anon_hdr *ahp, ulong_t *index)
+{
+ struct anon *ap;
+ struct anon **app;
+ ulong_t chunkoff;
+ ulong_t i;
+ ulong_t j;
+ pgcnt_t size;
+
+ i = *index;
+ size = ahp->size;
+
+ ASSERT(i < size);
+
+ if ((size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) {
+ /*
+ * 1 level case
+ */
+ while (i < size) {
+ ap = (struct anon *)
+ ((uintptr_t)ahp->array_chunk[i] & ANON_PTRMASK);
+ if (ap) {
+ *index = i;
+ return (ap);
+ }
+ i++;
+ }
+ } else {
+ /*
+ * 2 level case
+ */
+ chunkoff = i & ANON_CHUNK_OFF;
+ while (i < size) {
+ app = ahp->array_chunk[i >> ANON_CHUNK_SHIFT];
+ if (app)
+ for (j = chunkoff; j < ANON_CHUNK_SIZE; j++) {
+ ap = (struct anon *)
+ ((uintptr_t)app[j] &
+ ANON_PTRMASK);
+ if (ap) {
+ *index = i + (j - chunkoff);
+ return (ap);
+ }
+ }
+ chunkoff = 0;
+ i = (i + ANON_CHUNK_SIZE) & ~ANON_CHUNK_OFF;
+ }
+ }
+ *index = size;
+ return (NULL);
+}
+
+/*
+ * Set the list entry to a given pointer for a specified anon index
+ */
+int
+anon_set_ptr(struct anon_hdr *ahp, ulong_t an_idx, struct anon *ap, int flags)
+{
+ void **ppp;
+ struct anon **app;
+ int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
+ uintptr_t *ap_addr;
+
+ ASSERT(an_idx < ahp->size);
+
+ /*
+ * Single level case.
+ */
+ if (ahp->size <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) {
+ ap_addr = (uintptr_t *)&ahp->array_chunk[an_idx];
+ } else {
+
+ /*
+ * 2 level case.
+ */
+ ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
+
+ ASSERT(ppp != NULL);
+ if (*ppp == NULL) {
+ mutex_enter(&ahp->serial_lock);
+ ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
+ if (*ppp == NULL) {
+ *ppp = kmem_zalloc(PAGESIZE, kmemflags);
+ if (*ppp == NULL) {
+ mutex_exit(&ahp->serial_lock);
+ return (ENOMEM);
+ }
+ }
+ mutex_exit(&ahp->serial_lock);
+ }
+ app = *ppp;
+ ap_addr = (uintptr_t *)&app[an_idx & ANON_CHUNK_OFF];
+ }
+ *ap_addr = (*ap_addr & ~ANON_PTRMASK) | (uintptr_t)ap;
+ return (0);
+}
+
+/*
+ * Copy anon array into a given new anon array
+ */
+int
+anon_copy_ptr(struct anon_hdr *sahp, ulong_t s_idx,
+ struct anon_hdr *dahp, ulong_t d_idx,
+ pgcnt_t npages, int flags)
+{
+ void **sapp, **dapp;
+ void *ap;
+ int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
+
+ ASSERT((s_idx < sahp->size) && (d_idx < dahp->size));
+ ASSERT((npages <= sahp->size) && (npages <= dahp->size));
+
+ /*
+ * Both arrays are 1 level.
+ */
+ if (((sahp->size <= ANON_CHUNK_SIZE) &&
+ (dahp->size <= ANON_CHUNK_SIZE)) ||
+ ((sahp->flags & ANON_ALLOC_FORCE) &&
+ (dahp->flags & ANON_ALLOC_FORCE))) {
+
+ bcopy(&sahp->array_chunk[s_idx], &dahp->array_chunk[d_idx],
+ npages * sizeof (struct anon *));
+ return (0);
+ }
+
+ /*
+ * Both arrays are 2 levels.
+ */
+ if (sahp->size > ANON_CHUNK_SIZE &&
+ dahp->size > ANON_CHUNK_SIZE &&
+ ((sahp->flags & ANON_ALLOC_FORCE) == 0) &&
+ ((dahp->flags & ANON_ALLOC_FORCE) == 0)) {
+
+ ulong_t sapidx, dapidx;
+ ulong_t *sap, *dap;
+ ulong_t chknp;
+
+ while (npages != 0) {
+
+ sapidx = s_idx & ANON_CHUNK_OFF;
+ dapidx = d_idx & ANON_CHUNK_OFF;
+ chknp = ANON_CHUNK_SIZE - MAX(sapidx, dapidx);
+ if (chknp > npages)
+ chknp = npages;
+
+ sapp = &sahp->array_chunk[s_idx >> ANON_CHUNK_SHIFT];
+ if ((sap = *sapp) != NULL) {
+ dapp = &dahp->array_chunk[d_idx
+ >> ANON_CHUNK_SHIFT];
+ if ((dap = *dapp) == NULL) {
+ *dapp = kmem_zalloc(PAGESIZE,
+ kmemflags);
+ if ((dap = *dapp) == NULL)
+ return (ENOMEM);
+ }
+ bcopy((sap + sapidx), (dap + dapidx),
+ chknp << ANON_PTRSHIFT);
+ }
+ s_idx += chknp;
+ d_idx += chknp;
+ npages -= chknp;
+ }
+ return (0);
+ }
+
+ /*
+ * At least one of the arrays is 2 level.
+ */
+ while (npages--) {
+ if ((ap = anon_get_ptr(sahp, s_idx)) != NULL) {
+ ASSERT(!ANON_ISBUSY(anon_get_slot(sahp, s_idx)));
+ if (anon_set_ptr(dahp, d_idx, ap, flags) == ENOMEM)
+ return (ENOMEM);
+ }
+ s_idx++;
+ d_idx++;
+ }
+ return (0);
+}
+
+
+/*
+ * ANON_INITBUF is a convenience macro for anon_grow() below. It
+ * takes a buffer dst, which is at least as large as buffer src. It
+ * does a bcopy from src into dst, and then bzeros the extra bytes
+ * of dst. If tail is set, the data in src is tail aligned within
+ * dst instead of head aligned.
+ */
+
+#define ANON_INITBUF(src, srclen, dst, dstsize, tail) \
+ if (tail) { \
+ bzero((dst), (dstsize) - (srclen)); \
+ bcopy((src), (char *)(dst) + (dstsize) - (srclen), (srclen)); \
+ } else { \
+ bcopy((src), (dst), (srclen)); \
+ bzero((char *)(dst) + (srclen), (dstsize) - (srclen)); \
+ }
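+
+/*
+ * For illustration (hypothetical sizes): with src holding 3 pointers and
+ * dst sized for 8, ANON_INITBUF(src, 3 * sizeof (void *), dst,
+ * 8 * sizeof (void *), 0) copies src into dst[0..2] and zeros dst[3..7],
+ * while a non-zero tail zeros dst[0..4] and copies src into dst[5..7].
+ * anon_grow() below passes growdown as tail so that the existing entries
+ * end up at the high end of the new array when growing downward.
+ */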
+
+#define ANON_1_LEVEL_INC (ANON_CHUNK_SIZE / 8)
+#define ANON_2_LEVEL_INC (ANON_1_LEVEL_INC * ANON_CHUNK_SIZE)
+
+/*
+ * anon_grow() is used to efficiently extend an existing anon array.
+ * startidx_p points to the index into the anon array of the first page
+ * that is in use. curpages is the number of pages in use, starting at
+ * *startidx_p. newpages is the number of additional pages desired.
+ *
+ * If startidx_p == NULL, startidx is taken to be 0 and cannot be changed.
+ *
+ * The growth is done by creating a new top level of the anon array,
+ * and (if the array is 2-level) reusing the existing second level arrays.
+ *
+ * flags can be used to specify ANON_NOSLEEP and ANON_GROWDOWN.
+ *
+ * Returns the new number of pages in the anon array.
+ */
+
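+/*
+ * A small worked example (hypothetical numbers): with osz == 100,
+ * startidx == 20, curpages == 30 and newpages == 50, growing up keeps
+ * pages 0 through 49 and asks for totpages == 20 + 30 + 50 == 100, while
+ * growing down keeps pages 20 through 99 and asks for
+ * totpages == (100 - 20) + 50 == 130; the result is then rounded up to
+ * the next ANON_1_LEVEL_INC or ANON_2_LEVEL_INC multiple.
+ */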
+pgcnt_t
+anon_grow(struct anon_hdr *ahp, ulong_t *startidx_p, pgcnt_t curpages,
+ pgcnt_t newpages, int flags)
+{
+ ulong_t startidx = startidx_p ? *startidx_p : 0;
+ pgcnt_t osz = ahp->size, nsz;
+ pgcnt_t oelems, nelems, totpages;
+ void **level1;
+ int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
+ int growdown = (flags & ANON_GROWDOWN);
+ size_t newarrsz, oldarrsz;
+ void *level2;
+
+ ASSERT(!(startidx_p == NULL && growdown));
+ ASSERT(startidx + curpages <= ahp->size);
+
+ /*
+ * Determine the total number of pages needed in the new
+ * anon array. If growing down, totpages is all pages from
+ * startidx through the end of the array, plus <newpages>
+ * pages. If growing up, keep all pages from page 0 through
+ * the last page currently in use, plus <newpages> pages.
+ */
+
+ if (growdown)
+ totpages = osz - startidx + newpages;
+ else
+ totpages = startidx + curpages + newpages;
+
+ /* If the array is already large enough, just return. */
+
+ if (osz >= totpages) {
+ nsz = osz;
+ goto out;
+ }
+
+ /*
+ * osz/nsz are the total numbers of pages represented by the array.
+ * oelems/nelems are the number of pointers in the top level array.
+ *
+ * Will the new anon array be one level or two levels?
+ */
+
+ if (totpages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) {
+ nsz = P2ROUNDUP(totpages, ANON_1_LEVEL_INC);
+ oelems = osz;
+ nelems = nsz;
+ } else {
+ nsz = P2ROUNDUP(totpages, ANON_2_LEVEL_INC);
+ oelems = (osz + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT;
+ nelems = nsz >> ANON_CHUNK_SHIFT;
+ }
+
+ newarrsz = nelems * sizeof (void *);
+ level1 = kmem_alloc(newarrsz, kmemflags);
+ if (level1 == NULL)
+ return (0);
+
+ /* Are we converting from a one level to a two level anon array? */
+
+ if (nsz > ANON_CHUNK_SIZE && osz <= ANON_CHUNK_SIZE &&
+ !(ahp->flags & ANON_ALLOC_FORCE)) {
+ /*
+ * Yes, we're converting to a two-level array. Reuse the old level 1
+ * as the new level 2 if it is exactly PAGESIZE. Otherwise
+ * allocate a new level 2 and copy the old level 1 data into it.
+ */
+
+ if (osz == ANON_CHUNK_SIZE) {
+ level2 = (void *)ahp->array_chunk;
+ } else {
+ level2 = kmem_alloc(PAGESIZE, kmemflags);
+ if (level2 == NULL) {
+ kmem_free(level1, newarrsz);
+ return (0);
+ }
+ oldarrsz = osz * sizeof (void *);
+
+ ANON_INITBUF(ahp->array_chunk, oldarrsz,
+ level2, PAGESIZE, growdown);
+ kmem_free(ahp->array_chunk, oldarrsz);
+ }
+ bzero(level1, newarrsz);
+ if (growdown)
+ level1[nelems - 1] = level2;
+ else
+ level1[0] = level2;
+ } else {
+ oldarrsz = oelems * sizeof (void *);
+
+ ANON_INITBUF(ahp->array_chunk, oldarrsz,
+ level1, newarrsz, growdown);
+ kmem_free(ahp->array_chunk, oldarrsz);
+ }
+
+ ahp->array_chunk = level1;
+ ahp->size = nsz;
+out:
+ if (growdown)
+ *startidx_p = nsz - totpages;
+ return (nsz);
+}
+
+/*
+ * Called from clock handler to sync ani_free value.
+ */
+
+void
+set_anoninfo(void)
+{
+ int ix;
+ pgcnt_t total = 0;
+
+ for (ix = 0; ix < ANI_MAX_POOL; ix++) {
+ total += ani_free_pool[ix].ani_count;
+ }
+ k_anoninfo.ani_free = total;
+}
+
+/*
+ * Reserve anon space.
+ *
+ * It's no longer simply a matter of incrementing ani_resv to
+ * reserve swap space; we need to check memory-based as well
+ * as disk-backed (physical) swap. The following algorithm
+ * is used:
+ * Check the space on physical swap
+ * i.e. amount needed < ani_max - ani_phys_resv
+ * If we are swapping on swapfs check
+ * amount needed < (availrmem - swapfs_minfree)
+ * Since the algorithm to check for the quantity of swap space is
+ * almost the same as that for reserving it, we'll just use anon_resvmem
+ * with a flag to decrement availrmem.
+ *
+ * Return non-zero on success.
+ */
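+/*
+ * A small worked example (hypothetical numbers): reserving npages == 100
+ * when pswap_pages == 60 physical swap slots are free takes all 60 into
+ * ani_phys_resv (when takemem is set) and then needs mswap_pages == 40
+ * from availrmem, subject to the swapfs_minfree/swapfs_reserve checks
+ * below; if that memory check fails, the 60 physical slots are given back
+ * and the reservation fails.
+ */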
+int
+anon_resvmem(size_t size, uint_t takemem)
+{
+ pgcnt_t npages = btopr(size);
+ pgcnt_t mswap_pages = 0;
+ pgcnt_t pswap_pages = 0;
+
+ mutex_enter(&anoninfo_lock);
+
+ /*
+ * pswap_pages is the number of pages we can take from
+ * physical (i.e. disk-backed) swap.
+ */
+ ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
+ pswap_pages = k_anoninfo.ani_max - k_anoninfo.ani_phys_resv;
+
+ ANON_PRINT(A_RESV,
+ ("anon_resvmem: npages %lu takemem %u pswap %lu caller %p\n",
+ npages, takemem, pswap_pages, (void *)caller()));
+
+ if (npages <= pswap_pages) {
+ /*
+ * we have enough space on a physical swap
+ */
+ if (takemem)
+ k_anoninfo.ani_phys_resv += npages;
+ mutex_exit(&anoninfo_lock);
+ return (1);
+ } else if (pswap_pages != 0) {
+ /*
+ * we have some space on a physical swap
+ */
+ if (takemem) {
+ /*
+ * use up remainder of phys swap
+ */
+ k_anoninfo.ani_phys_resv += pswap_pages;
+ ASSERT(k_anoninfo.ani_phys_resv == k_anoninfo.ani_max);
+ }
+ }
+ /*
+ * Since (npages > pswap_pages) we need mem swap;
+ * mswap_pages is the number of pages needed from availrmem.
+ */
+ ASSERT(npages > pswap_pages);
+ mswap_pages = npages - pswap_pages;
+
+ ANON_PRINT(A_RESV, ("anon_resvmem: need %ld pages from memory\n",
+ mswap_pages));
+
+ /*
+ * priv processes can reserve memory as swap as long as availrmem
+ * remains greater than swapfs_minfree; in the case of non-priv
+ * processes, memory can be reserved as swap only if availrmem
+ * doesn't fall below (swapfs_minfree + swapfs_reserve). Thus,
+ * swapfs_reserve amount of memswap is not available to non-priv
+ * processes. This protects daemons such as the automounter from
+ * dying as a result of application processes eating away almost
+ * the entire memory-based swap. This safeguard becomes useless
+ * if apps are run with root access.
+ *
+ * swapfs_reserve is the minimum of 4MB or 1/16 of physmem.
+ *
+ */
+ mutex_enter(&freemem_lock);
+ if (availrmem > (swapfs_minfree + swapfs_reserve + mswap_pages) ||
+ (availrmem > (swapfs_minfree + mswap_pages) &&
+ secpolicy_resource(CRED()) == 0)) {
+
+ if (takemem) {
+ /*
+ * Take the memory from the rest of the system.
+ */
+ availrmem -= mswap_pages;
+ mutex_exit(&freemem_lock);
+ k_anoninfo.ani_mem_resv += mswap_pages;
+ ANI_ADD(mswap_pages);
+ ANON_PRINT((A_RESV | A_MRESV),
+ ("anon_resvmem: took %ld pages of availrmem\n",
+ mswap_pages));
+ } else {
+ mutex_exit(&freemem_lock);
+ }
+
+ ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
+ mutex_exit(&anoninfo_lock);
+ return (1);
+
+ } else {
+ /*
+ * Fail if not enough memory
+ */
+
+ if (takemem) {
+ k_anoninfo.ani_phys_resv -= pswap_pages;
+ }
+
+ mutex_exit(&freemem_lock);
+ mutex_exit(&anoninfo_lock);
+ ANON_PRINT(A_RESV,
+ ("anon_resvmem: not enough space from swapfs\n"));
+ return (0);
+ }
+}
+
+
+/*
+ * Give back an anon reservation.
+ */
+void
+anon_unresv(size_t size)
+{
+ pgcnt_t npages = btopr(size);
+ spgcnt_t mem_free_pages = 0;
+ pgcnt_t phys_free_slots;
+#ifdef ANON_DEBUG
+ pgcnt_t mem_resv;
+#endif
+
+ mutex_enter(&anoninfo_lock);
+
+ ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
+ /*
+ * If some of this reservation belonged to swapfs,
+ * give it back to availrmem.
+ * ani_mem_resv is the amount of availrmem swapfs has reserved,
+ * but some of that memory could be locked by segspt, so we can
+ * only return the non-locked part of ani_mem_resv to availrmem.
+ */
+ if (k_anoninfo.ani_mem_resv > k_anoninfo.ani_locked_swap) {
+ ANON_PRINT((A_RESV | A_MRESV),
+ ("anon_unresv: growing availrmem by %ld pages\n",
+ MIN(k_anoninfo.ani_mem_resv, npages)));
+
+ mem_free_pages = MIN((spgcnt_t)(k_anoninfo.ani_mem_resv -
+ k_anoninfo.ani_locked_swap), npages);
+ mutex_enter(&freemem_lock);
+ availrmem += mem_free_pages;
+ mutex_exit(&freemem_lock);
+ k_anoninfo.ani_mem_resv -= mem_free_pages;
+
+ ANI_ADD(-mem_free_pages);
+ }
+ /*
+ * The remainder of the pages is returned to phys swap
+ */
+ ASSERT(npages >= mem_free_pages);
+ phys_free_slots = npages - mem_free_pages;
+
+ if (phys_free_slots) {
+ k_anoninfo.ani_phys_resv -= phys_free_slots;
+ }
+
+#ifdef ANON_DEBUG
+ mem_resv = k_anoninfo.ani_mem_resv;
+#endif
+
+ ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
+ ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
+
+ mutex_exit(&anoninfo_lock);
+
+ ANON_PRINT(A_RESV, ("anon_unresv: %lu, tot %lu, caller %p\n",
+ npages, mem_resv, (void *)caller()));
+}
+
+/*
+ * Allocate an anon slot and return it with the lock held.
+ */
+struct anon *
+anon_alloc(struct vnode *vp, anoff_t off)
+{
+ struct anon *ap;
+ kmutex_t *ahm;
+
+ ap = kmem_cache_alloc(anon_cache, KM_SLEEP);
+ if (vp == NULL) {
+ swap_alloc(ap);
+ } else {
+ ap->an_vp = vp;
+ ap->an_off = off;
+ }
+ ap->an_refcnt = 1;
+ ap->an_pvp = NULL;
+ ap->an_poff = 0;
+ ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)];
+ mutex_enter(ahm);
+ anon_addhash(ap);
+ mutex_exit(ahm);
+ ANI_ADD(-1);
+ ANON_PRINT(A_ANON, ("anon_alloc: returning ap %p, vp %p\n",
+ (void *)ap, (ap ? (void *)ap->an_vp : NULL)));
+ return (ap);
+}
+
+/*
+ * Decrement the reference count of an anon page.
+ * If reference count goes to zero, free it and
+ * its associated page (if any).
+ */
+void
+anon_decref(struct anon *ap)
+{
+ page_t *pp;
+ struct vnode *vp;
+ anoff_t off;
+ kmutex_t *ahm;
+
+ ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)];
+ mutex_enter(ahm);
+ ASSERT(ap->an_refcnt != 0);
+ if (ap->an_refcnt == 0)
+ panic("anon_decref: slot count 0");
+ if (--ap->an_refcnt == 0) {
+ swap_xlate(ap, &vp, &off);
+ mutex_exit(ahm);
+
+ /*
+ * If there is a page for this anon slot we will need to
+ * call VN_DISPOSE to get rid of the vp association and
+ * put the page back on the free list as really free.
+ * Acquire the "exclusive" lock to ensure that any
+ * pending i/o always completes before the swap slot
+ * is freed.
+ */
+ pp = page_lookup(vp, (u_offset_t)off, SE_EXCL);
+
+ /*
+ * If there was a page, we've synchronized on it (getting
+ * the exclusive lock is as good as getting the iolock)
+ * so now we can free the physical backing store. Also, this
+ * is where we would free the name of the anonymous page
+ * (swap_free(ap)), a no-op in the current implementation.
+ */
+ mutex_enter(ahm);
+ ASSERT(ap->an_refcnt == 0);
+ anon_rmhash(ap);
+ if (ap->an_pvp)
+ swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE);
+ mutex_exit(ahm);
+
+ if (pp != NULL) {
+ /*LINTED: constant in conditional context */
+ VN_DISPOSE(pp, B_INVAL, 0, kcred);
+ }
+ ANON_PRINT(A_ANON, ("anon_decref: free ap %p, vp %p\n",
+ (void *)ap, (void *)ap->an_vp));
+ kmem_cache_free(anon_cache, ap);
+
+ ANI_ADD(1);
+ } else {
+ mutex_exit(ahm);
+ }
+}
+
+static int
+anon_share(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots)
+{
+ struct anon *ap;
+
+ while (nslots-- > 0) {
+ if ((ap = anon_get_ptr(ahp, anon_index)) != NULL &&
+ ap->an_refcnt > 1)
+ return (1);
+ anon_index++;
+ }
+
+ return (0);
+}
+
+static void
+anon_decref_pages(
+ struct anon_hdr *ahp,
+ ulong_t an_idx,
+ uint_t szc)
+{
+ struct anon *ap = anon_get_ptr(ahp, an_idx);
+ kmutex_t *ahmpages = NULL;
+ page_t *pp;
+ pgcnt_t pgcnt = page_get_pagecnt(szc);
+ pgcnt_t i;
+ struct vnode *vp;
+ anoff_t off;
+ kmutex_t *ahm;
+#ifdef DEBUG
+ int refcnt = 1;
+#endif
+
+ ASSERT(szc != 0);
+ ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
+ ASSERT(IS_P2ALIGNED(an_idx, pgcnt));
+
+ VM_STAT_ADD(anonvmstats.decrefpages[0]);
+
+ if (ap != NULL) {
+ ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, ap->an_off)];
+ mutex_enter(ahmpages);
+ ASSERT((refcnt = ap->an_refcnt) != 0);
+ VM_STAT_ADD(anonvmstats.decrefpages[1]);
+ if (ap->an_refcnt == 1) {
+ VM_STAT_ADD(anonvmstats.decrefpages[2]);
+ ASSERT(!anon_share(ahp, an_idx, pgcnt));
+ mutex_exit(ahmpages);
+ ahmpages = NULL;
+ }
+ }
+
+ i = 0;
+ while (i < pgcnt) {
+ if ((ap = anon_get_ptr(ahp, an_idx + i)) == NULL) {
+ ASSERT(refcnt == 1 && ahmpages == NULL);
+ i++;
+ continue;
+ }
+ ASSERT(ap->an_refcnt == refcnt);
+ ASSERT(ahmpages != NULL || ap->an_refcnt == 1);
+ ASSERT(ahmpages == NULL || ap->an_refcnt > 1);
+
+ if (ahmpages == NULL) {
+ swap_xlate(ap, &vp, &off);
+ pp = page_lookup(vp, (u_offset_t)off, SE_EXCL);
+ if (pp == NULL || pp->p_szc == 0) {
+ VM_STAT_ADD(anonvmstats.decrefpages[3]);
+ ahm = &anonhash_lock[AH_LOCK(ap->an_vp,
+ ap->an_off)];
+ (void) anon_set_ptr(ahp, an_idx + i, NULL,
+ ANON_SLEEP);
+ mutex_enter(ahm);
+ ap->an_refcnt--;
+ ASSERT(ap->an_refcnt == 0);
+ anon_rmhash(ap);
+ if (ap->an_pvp)
+ swap_phys_free(ap->an_pvp, ap->an_poff,
+ PAGESIZE);
+ mutex_exit(ahm);
+ if (pp != NULL) {
+ VM_STAT_ADD(anonvmstats.decrefpages[4]);
+ /*LINTED*/
+ VN_DISPOSE(pp, B_INVAL, 0, kcred);
+ }
+ kmem_cache_free(anon_cache, ap);
+ ANI_ADD(1);
+ i++;
+ } else {
+ pgcnt_t j;
+ pgcnt_t curpgcnt =
+ page_get_pagecnt(pp->p_szc);
+ size_t ppasize = curpgcnt * sizeof (page_t *);
+ page_t **ppa = kmem_alloc(ppasize, KM_SLEEP);
+ int dispose = 0;
+
+ VM_STAT_ADD(anonvmstats.decrefpages[5]);
+
+ ASSERT(pp->p_szc <= szc);
+ ASSERT(IS_P2ALIGNED(curpgcnt, curpgcnt));
+ ASSERT(IS_P2ALIGNED(i, curpgcnt));
+ ASSERT(i + curpgcnt <= pgcnt);
+ ASSERT(!(page_pptonum(pp) & (curpgcnt - 1)));
+ ppa[0] = pp;
+ for (j = i + 1; j < i + curpgcnt; j++) {
+ ap = anon_get_ptr(ahp, an_idx + j);
+ ASSERT(ap != NULL &&
+ ap->an_refcnt == 1);
+ swap_xlate(ap, &vp, &off);
+ pp = page_lookup(vp, (u_offset_t)off,
+ SE_EXCL);
+ if (pp == NULL)
+ panic("anon_decref_pages: "
+ "no page");
+
+ (void) hat_pageunload(pp,
+ HAT_FORCE_PGUNLOAD);
+ ASSERT(pp->p_szc == ppa[0]->p_szc);
+ ASSERT(page_pptonum(pp) - 1 ==
+ page_pptonum(ppa[j - i - 1]));
+ ppa[j - i] = pp;
+ if (ap->an_pvp != NULL &&
+ !vn_matchopval(ap->an_pvp,
+ VOPNAME_DISPOSE,
+ (fs_generic_func_p)fs_dispose))
+ dispose = 1;
+ }
+ if (!dispose) {
+ VM_STAT_ADD(anonvmstats.decrefpages[6]);
+ page_destroy_pages(ppa[0]);
+ } else {
+ VM_STAT_ADD(anonvmstats.decrefpages[7]);
+ for (j = 0; j < curpgcnt; j++) {
+ ASSERT(PAGE_EXCL(ppa[j]));
+ ppa[j]->p_szc = 0;
+ }
+ for (j = 0; j < curpgcnt; j++) {
+ ASSERT(!hat_page_is_mapped(
+ ppa[j]));
+ /*LINTED*/
+ VN_DISPOSE(ppa[j], B_INVAL, 0,
+ kcred);
+ }
+ }
+ kmem_free(ppa, ppasize);
+ for (j = i; j < i + curpgcnt; j++) {
+ ap = anon_get_ptr(ahp, an_idx + j);
+ ASSERT(ap != NULL &&
+ ap->an_refcnt == 1);
+ ahm = &anonhash_lock[AH_LOCK(ap->an_vp,
+ ap->an_off)];
+ (void) anon_set_ptr(ahp, an_idx + j,
+ NULL, ANON_SLEEP);
+ mutex_enter(ahm);
+ ap->an_refcnt--;
+ ASSERT(ap->an_refcnt == 0);
+ anon_rmhash(ap);
+ if (ap->an_pvp)
+ swap_phys_free(ap->an_pvp,
+ ap->an_poff, PAGESIZE);
+ mutex_exit(ahm);
+ kmem_cache_free(anon_cache, ap);
+ ANI_ADD(1);
+ }
+ i += curpgcnt;
+ }
+ } else {
+ VM_STAT_ADD(anonvmstats.decrefpages[8]);
+ (void) anon_set_ptr(ahp, an_idx + i, NULL, ANON_SLEEP);
+ ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)];
+ mutex_enter(ahm);
+ ap->an_refcnt--;
+ mutex_exit(ahm);
+ i++;
+ }
+ }
+
+ if (ahmpages != NULL) {
+ mutex_exit(ahmpages);
+ }
+}
+
+/*
+ * Duplicate references to size bytes worth of anon pages.
+ * Used when duplicating a segment that contains private anon pages.
+ * This code assumes that the procedure calling this one has already
+ * used hat_chgprot() to disable write access to the range of addresses
+ * that *old actually refers to.
+ */
+void
+anon_dup(struct anon_hdr *old, ulong_t old_idx, struct anon_hdr *new,
+ ulong_t new_idx, size_t size)
+{
+ spgcnt_t npages;
+ kmutex_t *ahm;
+ struct anon *ap;
+ ulong_t off;
+ ulong_t index;
+
+ npages = btopr(size);
+ while (npages > 0) {
+ index = old_idx;
+ if ((ap = anon_get_next_ptr(old, &index)) == NULL)
+ break;
+
+ ASSERT(!ANON_ISBUSY(anon_get_slot(old, index)));
+ off = index - old_idx;
+ npages -= off;
+ if (npages <= 0)
+ break;
+
+ (void) anon_set_ptr(new, new_idx + off, ap, ANON_SLEEP);
+ ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)];
+
+ mutex_enter(ahm);
+ ap->an_refcnt++;
+ mutex_exit(ahm);
+
+ off++;
+ new_idx += off;
+ old_idx += off;
+ npages--;
+ }
+}
+
+/*
+ * Just like anon_dup but also guarantees there are no holes (unallocated anon
+ * slots) within any large page region. That means if a large page region is
+ * empty in the old array it will be skipped. If there are one or more valid
+ * slots in a large page region of the old array it will make sure to fill in
+ * any unallocated ones and also copy them to the new array. If noalloc is 1,
+ * each large page region should either have no valid anon slots or all slots
+ * should be valid.
+ */
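+/*
+ * For illustration (hypothetical layout, noalloc == 0): with pgcnt == 4
+ * and an old region whose slots are {ap0, NULL, ap2, NULL}, the two holes
+ * are filled with freshly allocated anon slots in the old array and all
+ * four resulting slots are copied (with their refcnts bumped) into the
+ * new array; a region of four NULLs is skipped entirely.
+ */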
+void
+anon_dup_fill_holes(
+ struct anon_hdr *old,
+ ulong_t old_idx,
+ struct anon_hdr *new,
+ ulong_t new_idx,
+ size_t size,
+ uint_t szc,
+ int noalloc)
+{
+ struct anon *ap;
+ spgcnt_t npages;
+ kmutex_t *ahm, *ahmpages = NULL;
+ pgcnt_t pgcnt, i;
+ ulong_t index, off;
+#ifdef DEBUG
+ int refcnt;
+#endif
+
+ ASSERT(szc != 0);
+ pgcnt = page_get_pagecnt(szc);
+ ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
+ npages = btopr(size);
+ ASSERT(IS_P2ALIGNED(npages, pgcnt));
+ ASSERT(IS_P2ALIGNED(old_idx, pgcnt));
+
+ VM_STAT_ADD(anonvmstats.dupfillholes[0]);
+
+ while (npages > 0) {
+ index = old_idx;
+
+ /*
+ * Find the next valid slot.
+ */
+ if (anon_get_next_ptr(old, &index) == NULL)
+ break;
+
+ ASSERT(!ANON_ISBUSY(anon_get_slot(old, index)));
+ /*
+ * Now backup index to the beginning of the
+ * current large page region of the old array.
+ */
+ index = P2ALIGN(index, pgcnt);
+ off = index - old_idx;
+ ASSERT(IS_P2ALIGNED(off, pgcnt));
+ npages -= off;
+ if (npages <= 0)
+ break;
+
+ /*
+ * Fill and copy a large page regions worth
+ * of anon slots.
+ */
+ for (i = 0; i < pgcnt; i++) {
+ if ((ap = anon_get_ptr(old, index + i)) == NULL) {
+ if (noalloc) {
+ panic("anon_dup_fill_holes: "
+ "empty anon slot\n");
+ }
+ VM_STAT_ADD(anonvmstats.dupfillholes[1]);
+ ap = anon_alloc(NULL, 0);
+ (void) anon_set_ptr(old, index + i, ap,
+ ANON_SLEEP);
+ } else if (i == 0) {
+ /*
+ * make the increment of all refcnts of all
+ * anon slots of a large page appear atomic by
+ * getting an anonpages_hash_lock for the
+ * first anon slot of a large page.
+ */
+ int hash = AH_LOCK(ap->an_vp, ap->an_off);
+
+ VM_STAT_ADD(anonvmstats.dupfillholes[2]);
+
+ ahmpages = &anonpages_hash_lock[hash];
+ mutex_enter(ahmpages);
+ /*LINTED*/
+ ASSERT(refcnt = ap->an_refcnt);
+
+ VM_STAT_COND_ADD(ap->an_refcnt > 1,
+ anonvmstats.dupfillholes[3]);
+ }
+ (void) anon_set_ptr(new, new_idx + off + i, ap,
+ ANON_SLEEP);
+ ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)];
+ mutex_enter(ahm);
+ ASSERT(ahmpages != NULL || ap->an_refcnt == 1);
+ ASSERT(i == 0 || ahmpages == NULL ||
+ refcnt == ap->an_refcnt);
+ ap->an_refcnt++;
+ mutex_exit(ahm);
+ }
+ if (ahmpages != NULL) {
+ mutex_exit(ahmpages);
+ ahmpages = NULL;
+ }
+ off += pgcnt;
+ new_idx += off;
+ old_idx += off;
+ npages -= pgcnt;
+ }
+}
+
+/*
+ * Used when a segment with a vnode changes szc. Similarly to
+ * anon_dup_fill_holes() this makes sure each large page region either has no
+ * anon slots or all of them, but new slots are created by COWing the file
+ * pages. On entry no anon slots should be shared.
+ */
+int
+anon_fill_cow_holes(
+ struct seg *seg,
+ caddr_t addr,
+ struct anon_hdr *ahp,
+ ulong_t an_idx,
+ struct vnode *vp,
+ u_offset_t vp_off,
+ size_t size,
+ uint_t szc,
+ uint_t prot,
+ struct vpage vpage[],
+ struct cred *cred)
+{
+ struct anon *ap;
+ spgcnt_t npages;
+ pgcnt_t pgcnt, i;
+ ulong_t index, off;
+ int err = 0;
+ int pageflags = 0;
+
+ ASSERT(szc != 0);
+ pgcnt = page_get_pagecnt(szc);
+ ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
+ npages = btopr(size);
+ ASSERT(IS_P2ALIGNED(npages, pgcnt));
+ ASSERT(IS_P2ALIGNED(an_idx, pgcnt));
+
+ while (npages > 0) {
+ index = an_idx;
+
+ /*
+ * Find the next valid slot.
+ */
+ if (anon_get_next_ptr(ahp, &index) == NULL) {
+ break;
+ }
+
+ ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index)));
+ /*
+ * Now backup index to the beginning of the
+ * current large page region of the anon array.
+ */
+ index = P2ALIGN(index, pgcnt);
+ off = index - an_idx;
+ ASSERT(IS_P2ALIGNED(off, pgcnt));
+ npages -= off;
+ if (npages <= 0)
+ break;
+ an_idx += off;
+ vp_off += ptob(off);
+ addr += ptob(off);
+ if (vpage != NULL) {
+ vpage += off;
+ }
+
+ for (i = 0; i < pgcnt; i++, an_idx++, vp_off += PAGESIZE) {
+ if ((ap = anon_get_ptr(ahp, an_idx)) == NULL) {
+ page_t *pl[1 + 1];
+ page_t *pp;
+
+ err = VOP_GETPAGE(vp, vp_off, PAGESIZE, NULL,
+ pl, PAGESIZE, seg, addr, S_READ, cred);
+ if (err) {
+ break;
+ }
+ if (vpage != NULL) {
+ prot = VPP_PROT(vpage);
+ pageflags = VPP_ISPPLOCK(vpage) ?
+ LOCK_PAGE : 0;
+ }
+ pp = anon_private(&ap, seg, addr, prot, pl[0],
+ pageflags, cred);
+ if (pp == NULL) {
+ err = ENOMEM;
+ break;
+ }
+ (void) anon_set_ptr(ahp, an_idx, ap,
+ ANON_SLEEP);
+ page_unlock(pp);
+ }
+ ASSERT(ap->an_refcnt == 1);
+ addr += PAGESIZE;
+ if (vpage != NULL) {
+ vpage++;
+ }
+ }
+ npages -= pgcnt;
+ }
+
+ return (err);
+}
+
+/*
+ * Free a group of "size" anon pages, size in bytes,
+ * and clear out the pointers to the anon entries.
+ */
+void
+anon_free(struct anon_hdr *ahp, ulong_t index, size_t size)
+{
+ spgcnt_t npages;
+ struct anon *ap;
+ ulong_t old;
+
+ npages = btopr(size);
+
+ while (npages > 0) {
+ old = index;
+ if ((ap = anon_get_next_ptr(ahp, &index)) == NULL)
+ break;
+
+ ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index)));
+ npages -= index - old;
+ if (npages <= 0)
+ break;
+
+ (void) anon_set_ptr(ahp, index, NULL, ANON_SLEEP);
+ anon_decref(ap);
+ /*
+ * Bump index and decrement page count
+ */
+ index++;
+ npages--;
+ }
+}
+
+void
+anon_free_pages(
+ struct anon_hdr *ahp,
+ ulong_t an_idx,
+ size_t size,
+ uint_t szc)
+{
+ spgcnt_t npages;
+ pgcnt_t pgcnt;
+ ulong_t index, off;
+
+ ASSERT(szc != 0);
+ pgcnt = page_get_pagecnt(szc);
+ ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
+ npages = btopr(size);
+ ASSERT(IS_P2ALIGNED(npages, pgcnt));
+ ASSERT(IS_P2ALIGNED(an_idx, pgcnt));
+
+ VM_STAT_ADD(anonvmstats.freepages[0]);
+
+ while (npages > 0) {
+ index = an_idx;
+
+ /*
+ * Find the next valid slot.
+ */
+ if (anon_get_next_ptr(ahp, &index) == NULL)
+ break;
+
+ ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index)));
+ /*
+ * Now backup index to the beginning of the
+ * current large page region of the old array.
+ */
+ index = P2ALIGN(index, pgcnt);
+ off = index - an_idx;
+ ASSERT(IS_P2ALIGNED(off, pgcnt));
+ npages -= off;
+ if (npages <= 0)
+ break;
+
+ anon_decref_pages(ahp, index, szc);
+
+ off += pgcnt;
+ an_idx += off;
+ npages -= pgcnt;
+ }
+}
+
+/*
+ * Make anonymous pages discardable
+ */
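+/*
+ * Judging from the MADV_FREE_hit/miss counters updated below, this appears
+ * to be the path behind MADV_FREE-style advice. The ANON_PGLOOKUP_BLK flag
+ * selects a blocking page_lookup_create() instead of page_lookup_nowait()
+ * when grabbing each page SE_EXCL.
+ */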
+void
+anon_disclaim(struct anon_map *amp, ulong_t index, size_t size, int flags)
+{
+ spgcnt_t npages = btopr(size);
+ struct anon *ap;
+ struct vnode *vp;
+ anoff_t off;
+ page_t *pp, *root_pp;
+ kmutex_t *ahm;
+ pgcnt_t pgcnt;
+ ulong_t old_idx, idx, i;
+ struct anon_hdr *ahp = amp->ahp;
+ anon_sync_obj_t cookie;
+
+ ASSERT(RW_READ_HELD(&amp->a_rwlock));
+ pgcnt = 1;
+ for (; npages > 0; index = (pgcnt == 1) ? index + 1:
+ P2ROUNDUP(index + 1, pgcnt), npages -= pgcnt) {
+
+ /*
+ * get anon pointer and index for the first valid entry
+ * in the anon list, starting from "index"
+ */
+ old_idx = index;
+ if ((ap = anon_get_next_ptr(ahp, &index)) == NULL)
+ break;
+
+ /*
+ * decrement npages by number of NULL anon slots we skipped
+ */
+ npages -= index - old_idx;
+ if (npages <= 0)
+ break;
+
+ anon_array_enter(amp, index, &cookie);
+ ap = anon_get_ptr(ahp, index);
+ ASSERT(ap != NULL);
+
+ /*
+ * Get the anonymous page and try to lock it SE_EXCL.
+ * In the non-blocking case, if we can't grab the lock
+ * we skip to the next page.
+ * In the blocking case (ANON_PGLOOKUP_BLK), block
+ * until we grab the SE_EXCL lock.
+ */
+ swap_xlate(ap, &vp, &off);
+ if (flags & ANON_PGLOOKUP_BLK)
+ pp = page_lookup_create(vp, (u_offset_t)off,
+ SE_EXCL, NULL, NULL, SE_EXCL_WANTED);
+ else
+ pp = page_lookup_nowait(vp, (u_offset_t)off, SE_EXCL);
+ if (pp == NULL) {
+ segadvstat.MADV_FREE_miss.value.ul++;
+ pgcnt = 1;
+ anon_array_exit(&cookie);
+ continue;
+ }
+ pgcnt = page_get_pagecnt(pp->p_szc);
+
+ /*
+ * we cannot free a page which is permanently locked.
+ * The page_struct_lock need not be acquired to examine
+ * these fields since the page has an "exclusive" lock.
+ */
+ if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
+ page_unlock(pp);
+ segadvstat.MADV_FREE_miss.value.ul++;
+ anon_array_exit(&cookie);
+ continue;
+ }
+
+ ahm = &anonhash_lock[AH_LOCK(vp, off)];
+ mutex_enter(ahm);
+ ASSERT(ap->an_refcnt != 0);
+ /*
+ * skip this one if copy-on-write is not yet broken.
+ */
+ if (ap->an_refcnt > 1) {
+ mutex_exit(ahm);
+ page_unlock(pp);
+ segadvstat.MADV_FREE_miss.value.ul++;
+ anon_array_exit(&cookie);
+ continue;
+ }
+
+ if (pp->p_szc == 0) {
+ pgcnt = 1;
+
+ /*
+ * free swap slot;
+ */
+ if (ap->an_pvp) {
+ swap_phys_free(ap->an_pvp, ap->an_poff,
+ PAGESIZE);
+ ap->an_pvp = NULL;
+ ap->an_poff = 0;
+ }
+ mutex_exit(ahm);
+ segadvstat.MADV_FREE_hit.value.ul++;
+
+ /*
+ * while we are at it, unload all the translations
+ * and attempt to free the page.
+ */
+ (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
+ /*LINTED: constant in conditional context */
+ VN_DISPOSE(pp, B_FREE, 0, kcred);
+ anon_array_exit(&cookie);
+ continue;
+ }
+
+ pgcnt = page_get_pagecnt(pp->p_szc);
+ if (!IS_P2ALIGNED(index, pgcnt)) {
+ if (!page_try_demote_pages(pp)) {
+ mutex_exit(ahm);
+ page_unlock(pp);
+ segadvstat.MADV_FREE_miss.value.ul++;
+ anon_array_exit(&cookie);
+ continue;
+ } else {
+ pgcnt = 1;
+ if (ap->an_pvp) {
+ swap_phys_free(ap->an_pvp,
+ ap->an_poff, PAGESIZE);
+ ap->an_pvp = NULL;
+ ap->an_poff = 0;
+ }
+ mutex_exit(ahm);
+ (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
+ /*LINTED*/
+ VN_DISPOSE(pp, B_FREE, 0, kcred);
+ segadvstat.MADV_FREE_hit.value.ul++;
+ anon_array_exit(&cookie);
+ continue;
+ }
+ }
+ mutex_exit(ahm);
+ root_pp = pp;
+
+ /*
+ * try to lock remaining pages
+ */
+ for (idx = 1; idx < pgcnt; idx++) {
+ pp = page_next(pp);
+ if (!page_trylock(pp, SE_EXCL))
+ break;
+ if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
+ page_unlock(pp);
+ break;
+ }
+ }
+
+ if (idx == pgcnt) {
+ for (i = 0; i < pgcnt; i++) {
+ ap = anon_get_ptr(ahp, index + i);
+ if (ap == NULL)
+ break;
+ swap_xlate(ap, &vp, &off);
+ ahm = &anonhash_lock[AH_LOCK(vp, off)];
+ mutex_enter(ahm);
+ ASSERT(ap->an_refcnt != 0);
+
+ /*
+ * skip this one if copy-on-write
+ * is not yet broken.
+ */
+ if (ap->an_refcnt > 1) {
+ mutex_exit(ahm);
+ goto skiplp;
+ }
+ if (ap->an_pvp) {
+ swap_phys_free(ap->an_pvp,
+ ap->an_poff, PAGESIZE);
+ ap->an_pvp = NULL;
+ ap->an_poff = 0;
+ }
+ mutex_exit(ahm);
+ }
+ page_destroy_pages(root_pp);
+ segadvstat.MADV_FREE_hit.value.ul += pgcnt;
+ anon_array_exit(&cookie);
+ continue;
+ }
+skiplp:
+ segadvstat.MADV_FREE_miss.value.ul += pgcnt;
+ for (i = 0, pp = root_pp; i < idx; pp = page_next(pp), i++)
+ page_unlock(pp);
+ anon_array_exit(&cookie);
+ }
+}
+
+/*
+ * Return the kept page(s) and protections back to the segment driver.
+ */
+int
+anon_getpage(
+ struct anon **app,
+ uint_t *protp,
+ page_t *pl[],
+ size_t plsz,
+ struct seg *seg,
+ caddr_t addr,
+ enum seg_rw rw,
+ struct cred *cred)
+{
+ page_t *pp;
+ struct anon *ap = *app;
+ struct vnode *vp;
+ anoff_t off;
+ int err;
+ kmutex_t *ahm;
+
+ swap_xlate(ap, &vp, &off);
+
+ /*
+ * Look up the page. If the page is being paged in,
+ * wait for it to finish as we must return a list of
+ * pages since this routine acts like the VOP_GETPAGE
+ * routine does.
+ */
+ if (pl != NULL && (pp = page_lookup(vp, (u_offset_t)off, SE_SHARED))) {
+ ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)];
+ mutex_enter(ahm);
+ if (ap->an_refcnt == 1)
+ *protp = PROT_ALL;
+ else
+ *protp = PROT_ALL & ~PROT_WRITE;
+ mutex_exit(ahm);
+ pl[0] = pp;
+ pl[1] = NULL;
+ return (0);
+ }
+
+ /*
+ * Simply treat it as a vnode fault on the anon vp.
+ */
+
+ TRACE_3(TR_FAC_VM, TR_ANON_GETPAGE,
+ "anon_getpage:seg %x addr %x vp %x",
+ seg, addr, vp);
+
+ err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, protp, pl, plsz,
+ seg, addr, rw, cred);
+
+ if (err == 0 && pl != NULL) {
+ ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)];
+ mutex_enter(ahm);
+ if (ap->an_refcnt != 1)
+ *protp &= ~PROT_WRITE; /* make read-only */
+ mutex_exit(ahm);
+ }
+ return (err);
+}
+
+/*
+ * Creates or returns kept pages to the segment driver. Returns -1 if a large
+ * page cannot be allocated. Returns -2 if some other process has allocated a
+ * larger page.
+ *
+ * For a cow fault it will allocate any size pages to fill the requested area
+ * to avoid partially overwriting anon slots (i.e. sharing only some of the
+ * anon slots within a large page with other processes). This policy greatly
+ * simplifies large page freeing (a large page is only freed when all anon
+ * slot refcnts are 0).
+ */
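+/*
+ * Return value sketch (inferred from the code below): 0 means ppa[] holds
+ * the constituent pages and *protp the protections; -1 means a large page
+ * could not be allocated, so the caller may retry with a smaller szc;
+ * -2 means a larger page already exists (when known, its size is passed
+ * back via *ppa_szc), so the caller may retry with that size; any other
+ * non-zero value is an errno from the anon_getpage()/swap_getconpage()
+ * path.
+ */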
+int
+anon_map_getpages(
+ struct anon_map *amp,
+ ulong_t start_idx,
+ uint_t szc,
+ struct seg *seg,
+ caddr_t addr,
+ uint_t prot,
+ uint_t *protp,
+ page_t *ppa[],
+ uint_t *ppa_szc,
+ struct vpage vpage[],
+ enum seg_rw rw,
+ int brkcow,
+ int anypgsz,
+ struct cred *cred)
+{
+ pgcnt_t pgcnt;
+ struct anon *ap;
+ struct vnode *vp;
+ anoff_t off;
+ page_t *pp, *pl[2], *conpp = NULL;
+ caddr_t vaddr;
+ ulong_t pg_idx, an_idx, i;
+ spgcnt_t nreloc = 0;
+ int prealloc = 1;
+ int err, slotcreate;
+ uint_t vpprot;
+
+#if !defined(__i386) && !defined(__amd64)
+ ASSERT(seg->s_szc != 0);
+#endif
+ ASSERT(szc <= seg->s_szc);
+ ASSERT(ppa_szc != NULL);
+ ASSERT(rw != S_CREATE);
+
+ *protp = PROT_ALL;
+
+ VM_STAT_ADD(anonvmstats.getpages[0]);
+
+ if (szc == 0) {
+ VM_STAT_ADD(anonvmstats.getpages[1]);
+ if ((ap = anon_get_ptr(amp->ahp, start_idx)) != NULL) {
+ err = anon_getpage(&ap, protp, pl, PAGESIZE, seg,
+ addr, rw, cred);
+ if (err)
+ return (err);
+ ppa[0] = pl[0];
+ if (brkcow == 0 || (*protp & PROT_WRITE)) {
+ VM_STAT_ADD(anonvmstats.getpages[2]);
+ if (ppa[0]->p_szc != 0) {
+ VM_STAT_ADD(anonvmstats.getpages[3]);
+ *ppa_szc = ppa[0]->p_szc;
+ page_unlock(ppa[0]);
+ return (-2);
+ }
+ return (0);
+ }
+ panic("anon_map_getpages: cowfault for szc 0");
+ } else {
+ VM_STAT_ADD(anonvmstats.getpages[4]);
+ ppa[0] = anon_zero(seg, addr, &ap, cred);
+ if (ppa[0] == NULL)
+ return (ENOMEM);
+ (void) anon_set_ptr(amp->ahp, start_idx, ap,
+ ANON_SLEEP);
+ return (0);
+ }
+ }
+
+ pgcnt = page_get_pagecnt(szc);
+ ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
+ ASSERT(IS_P2ALIGNED(start_idx, pgcnt));
+
+ /*
+ * First we check for the case that the requested large
+ * page or larger page already exists in the system.
+ * Actually we only check if the first constituent page
+ * exists and only preallocate if it's not found.
+ */
+ ap = anon_get_ptr(amp->ahp, start_idx);
+ if (ap) {
+ uint_t pszc;
+ swap_xlate(ap, &vp, &off);
+ if (page_exists_forreal(vp, (u_offset_t)off, &pszc)) {
+ if (pszc > szc) {
+ *ppa_szc = pszc;
+ return (-2);
+ }
+ if (pszc == szc) {
+ prealloc = 0;
+ }
+ }
+ }
+
+ VM_STAT_COND_ADD(prealloc == 0, anonvmstats.getpages[5]);
+ VM_STAT_COND_ADD(prealloc != 0, anonvmstats.getpages[6]);
+
+top:
+ /*
+ * If a smaller page or no page at all was found,
+ * grab a large page off the freelist.
+ */
+ if (prealloc) {
+ ASSERT(conpp == NULL);
+ if (page_alloc_pages(seg, addr, NULL, ppa, szc, 0) != 0) {
+ VM_STAT_ADD(anonvmstats.getpages[7]);
+ if (brkcow == 0 ||
+ !anon_share(amp->ahp, start_idx, pgcnt)) {
+ /*
+ * If the refcnt's of all anon slots are <= 1
+ * they can't increase since we are holding
+ * the address space's lock. So segvn can
+ * safely decrease szc without risking a
+ * cow fault for a region smaller
+ * than the segment's largest page size.
+ */
+ VM_STAT_ADD(anonvmstats.getpages[8]);
+ return (-1);
+ }
+ docow:
+ /*
+ * This is a cow fault. Copy away the entire 1 large
+ * page region of this segment.
+ */
+ if (szc != seg->s_szc)
+ panic("anon_map_getpages: cowfault for szc %d",
+ szc);
+ vaddr = addr;
+ for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt;
+ pg_idx++, an_idx++, vaddr += PAGESIZE) {
+ if ((ap = anon_get_ptr(amp->ahp, an_idx)) !=
+ NULL) {
+ err = anon_getpage(&ap, &vpprot, pl,
+ PAGESIZE, seg, vaddr, rw, cred);
+ if (err) {
+ for (i = 0; i < pg_idx; i++) {
+ if ((pp = ppa[i]) !=
+ NULL)
+ page_unlock(pp);
+ }
+ return (err);
+ }
+ ppa[pg_idx] = pl[0];
+ } else {
+ /*
+ * Since this is a cowfault we know
+ * that this address space has a
+ * parent or children which means
+ * anon_dup_fill_holes() has initialized
+ * all anon slots within a large page
+ * region that had at least one anon
+ * slot at the time of fork().
+ */
+ panic("anon_map_getpages: "
+ "cowfault but anon slot is empty");
+ }
+ }
+ VM_STAT_ADD(anonvmstats.getpages[9]);
+ *protp = PROT_ALL;
+ return (anon_map_privatepages(amp, start_idx, szc, seg,
+ addr, prot, ppa, vpage, anypgsz, cred));
+ }
+ }
+
+ VM_STAT_ADD(anonvmstats.getpages[10]);
+
+ an_idx = start_idx;
+ pg_idx = 0;
+ vaddr = addr;
+ while (pg_idx < pgcnt) {
+ slotcreate = 0;
+ if ((ap = anon_get_ptr(amp->ahp, an_idx)) == NULL) {
+ VM_STAT_ADD(anonvmstats.getpages[11]);
+ /*
+ * For us to have decided not to preallocate
+ * would have meant that a large page
+ * was found, which also means that all of the
+ * anon slots for that page would have already
+ * been created for us.
+ */
+ if (prealloc == 0)
+ panic("anon_map_getpages: prealloc = 0");
+
+ slotcreate = 1;
+ ap = anon_alloc(NULL, 0);
+ }
+ swap_xlate(ap, &vp, &off);
+
+ /*
+ * Now set up our preallocated page to pass down
+ * to swap_getpage().
+ */
+ if (prealloc) {
+ ASSERT(ppa[pg_idx]->p_szc == szc);
+ conpp = ppa[pg_idx];
+ }
+ ASSERT(prealloc || conpp == NULL);
+
+ /*
+ * If we just created this anon slot then call
+ * with S_CREATE to prevent doing IO on the page.
+ * Similar to the anon_zero case.
+ */
+ err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE,
+ NULL, pl, PAGESIZE, conpp, &nreloc, seg, vaddr,
+ slotcreate == 1 ? S_CREATE : rw, cred);
+
+ if (err) {
+ VM_STAT_ADD(anonvmstats.getpages[12]);
+ ASSERT(slotcreate == 0);
+ goto io_err;
+ }
+
+ pp = pl[0];
+
+ if (pp->p_szc != szc) {
+ VM_STAT_ADD(anonvmstats.getpages[13]);
+ ASSERT(slotcreate == 0);
+ ASSERT(prealloc == 0);
+ ASSERT(pg_idx == 0);
+ if (pp->p_szc > szc) {
+ page_unlock(pp);
+ VM_STAT_ADD(anonvmstats.getpages[14]);
+ return (-2);
+ }
+ page_unlock(pp);
+ prealloc = 1;
+ goto top;
+ }
+
+ /*
+ * If we decided to preallocate but VOP_GETPAGE
+ * found a page in the system that satisfies our
+ * request then free up our preallocated large page
+ * and continue looping across the existing large
+ * page via VOP_GETPAGE.
+ */
+ if (prealloc && pp != ppa[pg_idx]) {
+ VM_STAT_ADD(anonvmstats.getpages[15]);
+ ASSERT(slotcreate == 0);
+ ASSERT(pg_idx == 0);
+ conpp = NULL;
+ prealloc = 0;
+ page_free_pages(ppa[0]);
+ }
+
+ if (prealloc && nreloc > 1) {
+ /*
+ * we have relocated out of a smaller large page.
+ * skip npgs - 1 iterations and continue which will
+ * increment by one the loop indices.
+ */
+ spgcnt_t npgs = nreloc;
+
+ VM_STAT_ADD(anonvmstats.getpages[16]);
+
+ ASSERT(pp == ppa[pg_idx]);
+ ASSERT(slotcreate == 0);
+ ASSERT(pg_idx + npgs <= pgcnt);
+ if ((*protp & PROT_WRITE) &&
+ anon_share(amp->ahp, an_idx, npgs)) {
+ *protp &= ~PROT_WRITE;
+ }
+ pg_idx += npgs;
+ an_idx += npgs;
+ vaddr += PAGESIZE * npgs;
+ continue;
+ }
+
+ VM_STAT_ADD(anonvmstats.getpages[17]);
+
+ /*
+ * Anon_zero case.
+ */
+ if (slotcreate) {
+ ASSERT(prealloc);
+ pagezero(pp, 0, PAGESIZE);
+ CPU_STATS_ADD_K(vm, zfod, 1);
+ hat_setrefmod(pp);
+ }
+
+ ASSERT(prealloc == 0 || ppa[pg_idx] == pp);
+ ASSERT(prealloc != 0 || PAGE_SHARED(pp));
+ ASSERT(prealloc == 0 || PAGE_EXCL(pp));
+
+ if (pg_idx > 0 &&
+ ((page_pptonum(pp) != page_pptonum(ppa[pg_idx - 1]) + 1) ||
+ (pp->p_szc != ppa[pg_idx - 1]->p_szc)))
+ panic("anon_map_getpages: unexpected page");
+
+ if (prealloc == 0) {
+ ppa[pg_idx] = pp;
+ }
+
+ if (ap->an_refcnt > 1) {
+ VM_STAT_ADD(anonvmstats.getpages[18]);
+ *protp &= ~PROT_WRITE;
+ }
+
+ /*
+ * If this is a new anon slot then initialize
+ * the anon array entry.
+ */
+ if (slotcreate) {
+ (void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP);
+ }
+ pg_idx++;
+ an_idx++;
+ vaddr += PAGESIZE;
+ }
+
+ /*
+ * Since preallocated pages come off the freelist
+ * they are locked SE_EXCL. Simply downgrade and return.
+ */
+ if (prealloc) {
+ VM_STAT_ADD(anonvmstats.getpages[19]);
+ conpp = NULL;
+ for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
+ page_downgrade(ppa[pg_idx]);
+ }
+ }
+ ASSERT(conpp == NULL);
+
+ if (brkcow == 0 || (*protp & PROT_WRITE)) {
+ VM_STAT_ADD(anonvmstats.getpages[20]);
+ return (0);
+ }
+
+ if (szc < seg->s_szc)
+ panic("anon_map_getpages: cowfault for szc %d", szc);
+
+ VM_STAT_ADD(anonvmstats.getpages[21]);
+
+ *protp = PROT_ALL;
+ return (anon_map_privatepages(amp, start_idx, szc, seg, addr, prot,
+ ppa, vpage, anypgsz, cred));
+io_err:
+ /*
+ * We got an IO error somewhere in our large page.
+ * If we were using a preallocated page then just demote
+ * all the constituent pages that we've succeeded with so far
+ * to PAGESIZE pages and leave them in the system
+ * unlocked.
+ */
+
+ ASSERT(err != -2 || pg_idx == 0);
+
+ VM_STAT_COND_ADD(err > 0, anonvmstats.getpages[22]);
+ VM_STAT_COND_ADD(err == -1, anonvmstats.getpages[23]);
+ VM_STAT_COND_ADD(err == -2, anonvmstats.getpages[24]);
+
+ if (prealloc) {
+ conpp = NULL;
+ if (pg_idx > 0) {
+ VM_STAT_ADD(anonvmstats.getpages[25]);
+ for (i = 0; i < pgcnt; i++) {
+ pp = ppa[i];
+ ASSERT(PAGE_EXCL(pp));
+ ASSERT(pp->p_szc == szc);
+ pp->p_szc = 0;
+ }
+ for (i = 0; i < pg_idx; i++) {
+ ASSERT(!hat_page_is_mapped(ppa[i]));
+ page_unlock(ppa[i]);
+ }
+ /*
+ * Now free up the remaining unused constituent
+ * pages.
+ */
+ while (pg_idx < pgcnt) {
+ ASSERT(!hat_page_is_mapped(ppa[pg_idx]));
+ page_free(ppa[pg_idx], 0);
+ pg_idx++;
+ }
+ } else {
+ VM_STAT_ADD(anonvmstats.getpages[26]);
+ page_free_pages(ppa[0]);
+ }
+ } else {
+ VM_STAT_ADD(anonvmstats.getpages[27]);
+ ASSERT(err > 0);
+ for (i = 0; i < pg_idx; i++)
+ page_unlock(ppa[i]);
+ }
+ ASSERT(conpp == NULL);
+ if (err != -1)
+ return (err);
+ /*
+ * we are here because we failed to relocate.
+ */
+ ASSERT(prealloc);
+ if (brkcow == 0 || !anon_share(amp->ahp, start_idx, pgcnt)) {
+ VM_STAT_ADD(anonvmstats.getpages[28]);
+ return (-1);
+ }
+ VM_STAT_ADD(anonvmstats.getpages[29]);
+ goto docow;
+}
+
+
+/*
+ * Turn a reference to an object or shared anon page
+ * into a private page with a copy of the data from the
+ * original page which is always locked by the caller.
+ * This routine unloads the translation and unlocks the
+ * original page, if it isn't being stolen, before returning
+ * to the caller.
+ *
+ * NOTE: The original anon slot is not freed by this routine.
+ * It must be freed by the caller while holding the
+ * "anon_map" lock to prevent races which can occur if
+ * a process has multiple lwps in its address space.
+ */
+page_t *
+anon_private(
+ struct anon **app,
+ struct seg *seg,
+ caddr_t addr,
+ uint_t prot,
+ page_t *opp,
+ int oppflags,
+ struct cred *cred)
+{
+ struct anon *old = *app;
+ struct anon *new;
+ page_t *pp = NULL;
+ struct vnode *vp;
+ anoff_t off;
+ page_t *anon_pl[1 + 1];
+ int err;
+
+ if (oppflags & STEAL_PAGE)
+ ASSERT(PAGE_EXCL(opp));
+ else
+ ASSERT(PAGE_LOCKED(opp));
+
+ CPU_STATS_ADD_K(vm, cow_fault, 1);
+
+ /* Kernel probe */
+ TNF_PROBE_1(anon_private, "vm pagefault", /* CSTYLED */,
+ tnf_opaque, address, addr);
+
+ *app = new = anon_alloc(NULL, 0);
+ swap_xlate(new, &vp, &off);
+
+ if (oppflags & STEAL_PAGE) {
+ page_rename(opp, vp, (u_offset_t)off);
+ pp = opp;
+ TRACE_5(TR_FAC_VM, TR_ANON_PRIVATE,
+ "anon_private:seg %p addr %x pp %p vp %p off %lx",
+ seg, addr, pp, vp, off);
+ hat_setmod(pp);
+
+ /* bug 4026339 */
+ page_downgrade(pp);
+ return (pp);
+ }
+
+ /*
+ * Call the VOP_GETPAGE routine to create the page, thereby
+ * enabling the vnode driver to allocate any filesystem
+ * space (e.g., disk block allocation for UFS). This also
+ * prevents more than one page from being added to the
+ * vnode at the same time.
+ */
+ err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, NULL,
+ anon_pl, PAGESIZE, seg, addr, S_CREATE, cred);
+ if (err)
+ goto out;
+
+ pp = anon_pl[0];
+
+ /*
+ * If the original page was locked, we need to move the lock
+ * to the new page by transferring 'cowcnt/lckcnt' of the original
+ * page to 'cowcnt/lckcnt' of the new page.
+ *
+ * See Statement at the beginning of segvn_lockop() and
+ * comments in page_pp_useclaim() regarding the way
+ * cowcnts/lckcnts are handled.
+ *
+ * Also availrmem must be decremented up front for read only mapping
+ * before calling page_pp_useclaim. page_pp_useclaim will bump it back
+ * if availrmem did not need to be decremented after all.
+ */
+ if (oppflags & LOCK_PAGE) {
+ if ((prot & PROT_WRITE) == 0) {
+ mutex_enter(&freemem_lock);
+ if (availrmem > pages_pp_maximum) {
+ availrmem--;
+ pages_useclaim++;
+ } else {
+ mutex_exit(&freemem_lock);
+ goto out;
+ }
+ mutex_exit(&freemem_lock);
+ }
+ page_pp_useclaim(opp, pp, prot & PROT_WRITE);
+ }
+
+ /*
+ * Now copy the contents from the original page,
+ * which is locked and loaded in the MMU by
+ * the caller to prevent yet another page fault.
+ */
+ ppcopy(opp, pp); /* XXX - should set mod bit in here */
+
+ hat_setrefmod(pp); /* mark as modified */
+
+ /*
+ * Unload the old translation.
+ */
+ hat_unload(seg->s_as->a_hat, addr, PAGESIZE, HAT_UNLOAD);
+
+ /*
+ * Free the unmapped, unmodified original page,
+ * or release the lock on the original page;
+ * otherwise the process will sleep forever in
+ * anon_decref() waiting for the "exclusive" lock
+ * on the page.
+ */
+ (void) page_release(opp, 1);
+
+ /*
+ * We are done with page creation, so downgrade the new
+ * page's selock to shared. This helps when multiple
+ * as_fault(...SOFTLOCK...) calls are done to the same
+ * page (aio).
+ */
+ page_downgrade(pp);
+
+ /*
+ * NOTE: The original anon slot must be freed by the
+ * caller while holding the "anon_map" lock, if we
+ * copied away from an anonymous page.
+ */
+ return (pp);
+
+out:
+ *app = old;
+ if (pp)
+ page_unlock(pp);
+ anon_decref(new);
+ page_unlock(opp);
+ return ((page_t *)NULL);
+}
+
+int
+anon_map_privatepages(
+ struct anon_map *amp,
+ ulong_t start_idx,
+ uint_t szc,
+ struct seg *seg,
+ caddr_t addr,
+ uint_t prot,
+ page_t *ppa[],
+ struct vpage vpage[],
+ int anypgsz,
+ struct cred *cred)
+{
+ pgcnt_t pgcnt;
+ struct vnode *vp;
+ anoff_t off;
+ page_t *pl[2], *conpp = NULL;
+ int err;
+ int prealloc = 1;
+ struct anon *ap, *oldap;
+ caddr_t vaddr;
+ page_t *pplist, *pp;
+ ulong_t pg_idx, an_idx;
+ spgcnt_t nreloc = 0;
+ int pagelock = 0;
+ kmutex_t *ahmpages = NULL;
+#ifdef DEBUG
+ int refcnt;
+#endif
+
+ ASSERT(szc != 0);
+ ASSERT(szc == seg->s_szc);
+
+ VM_STAT_ADD(anonvmstats.privatepages[0]);
+
+ pgcnt = page_get_pagecnt(szc);
+ ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
+ ASSERT(IS_P2ALIGNED(start_idx, pgcnt));
+
+ ASSERT(amp != NULL);
+ ap = anon_get_ptr(amp->ahp, start_idx);
+ ASSERT(ap == NULL || ap->an_refcnt >= 1);
+
+ VM_STAT_COND_ADD(ap == NULL, anonvmstats.privatepages[1]);
+
+ /*
+ * Now try to allocate the large page. If we fail then just
+ * let VOP_GETPAGE give us PAGESIZE pages. Normally we let
+ * the caller make this decision but to avoid added complexity
+ * it's simpler to handle that case here.
+ */
+ if (anypgsz == -1) {
+ VM_STAT_ADD(anonvmstats.privatepages[2]);
+ prealloc = 0;
+ } else if (page_alloc_pages(seg, addr, &pplist, NULL, szc,
+ anypgsz) != 0) {
+ VM_STAT_ADD(anonvmstats.privatepages[3]);
+ prealloc = 0;
+ }
+
+ /*
+ * make the decrement of all refcnts of all
+ * anon slots of a large page appear atomic by
+ * getting an anonpages_hash_lock for the
+ * first anon slot of a large page.
+ */
+ if (ap != NULL) {
+ ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp,
+ ap->an_off)];
+ mutex_enter(ahmpages);
+ if (ap->an_refcnt == 1) {
+ VM_STAT_ADD(anonvmstats.privatepages[4]);
+ ASSERT(!anon_share(amp->ahp, start_idx, pgcnt));
+ mutex_exit(ahmpages);
+
+ if (prealloc) {
+ page_free_replacement_page(pplist);
+ page_create_putback(pgcnt);
+ }
+ ASSERT(ppa[0]->p_szc <= szc);
+ if (ppa[0]->p_szc == szc) {
+ VM_STAT_ADD(anonvmstats.privatepages[5]);
+ return (0);
+ }
+ for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
+ ASSERT(ppa[pg_idx] != NULL);
+ page_unlock(ppa[pg_idx]);
+ }
+ return (-1);
+ }
+ }
+
+ /*
+ * If we are passed in the vpage array and this is
+ * not PROT_WRITE then we need to decrement availrmem
+ * up front before we try anything. If we need to and
+ * can't decrement availrmem then it's better to fail now
+ * than in the middle of processing the new large page.
+ * page_pp_useclaim() on behalf of each constituent page
+ * below will adjust availrmem back for the cases not needed.
+ */
+ if (vpage != NULL && (prot & PROT_WRITE) == 0) {
+ for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
+ if (VPP_ISPPLOCK(&vpage[pg_idx])) {
+ pagelock = 1;
+ break;
+ }
+ }
+ if (pagelock) {
+ VM_STAT_ADD(anonvmstats.privatepages[6]);
+ mutex_enter(&freemem_lock);
+ if (availrmem >= pages_pp_maximum + pgcnt) {
+ availrmem -= pgcnt;
+ pages_useclaim += pgcnt;
+ } else {
+ VM_STAT_ADD(anonvmstats.privatepages[7]);
+ mutex_exit(&freemem_lock);
+ if (ahmpages != NULL) {
+ mutex_exit(ahmpages);
+ }
+ if (prealloc) {
+ page_free_replacement_page(pplist);
+ page_create_putback(pgcnt);
+ }
+ for (pg_idx = 0; pg_idx < pgcnt; pg_idx++)
+ if (ppa[pg_idx] != NULL)
+ page_unlock(ppa[pg_idx]);
+ return (ENOMEM);
+ }
+ mutex_exit(&freemem_lock);
+ }
+ }
+
+ CPU_STATS_ADD_K(vm, cow_fault, pgcnt);
+
+ VM_STAT_ADD(anonvmstats.privatepages[8]);
+
+ an_idx = start_idx;
+ pg_idx = 0;
+ vaddr = addr;
+ for (; pg_idx < pgcnt; pg_idx++, an_idx++, vaddr += PAGESIZE) {
+ ASSERT(ppa[pg_idx] != NULL);
+ oldap = anon_get_ptr(amp->ahp, an_idx);
+ ASSERT(ahmpages != NULL || oldap == NULL);
+ ASSERT(ahmpages == NULL || oldap != NULL);
+ ASSERT(ahmpages == NULL || oldap->an_refcnt > 1);
+ ASSERT(ahmpages == NULL || pg_idx != 0 ||
+ (refcnt = oldap->an_refcnt));
+ ASSERT(ahmpages == NULL || pg_idx == 0 ||
+ refcnt == oldap->an_refcnt);
+
+ ap = anon_alloc(NULL, 0);
+
+ swap_xlate(ap, &vp, &off);
+
+ /*
+ * Now set up our preallocated page to pass down to
+ * swap_getpage().
+ */
+ if (prealloc) {
+ pp = pplist;
+ page_sub(&pplist, pp);
+ conpp = pp;
+ }
+
+ err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, NULL, pl,
+ PAGESIZE, conpp, &nreloc, seg, vaddr, S_CREATE, cred);
+
+ /*
+ * Impossible to fail since this is S_CREATE.
+ */
+ if (err)
+ panic("anon_map_privatepages: VOP_GETPAGE failed");
+
+ ASSERT(prealloc ? pp == pl[0] : pl[0]->p_szc == 0);
+ ASSERT(prealloc == 0 || nreloc == 1);
+
+ pp = pl[0];
+
+ /*
+ * If the original page was locked, we need to move
+ * the lock to the new page by transferring
+ * 'cowcnt/lckcnt' of the original page to 'cowcnt/lckcnt'
+ * of the new page. pg_idx can be used to index
+ * into the vpage array since the caller will guarantee
+ * that the vpage struct passed in corresponds to addr
+ * and forward.
+ */
+ if (vpage != NULL && VPP_ISPPLOCK(&vpage[pg_idx])) {
+ page_pp_useclaim(ppa[pg_idx], pp, prot & PROT_WRITE);
+ } else if (pagelock) {
+ mutex_enter(&freemem_lock);
+ availrmem++;
+ pages_useclaim--;
+ mutex_exit(&freemem_lock);
+ }
+
+ /*
+ * Now copy the contents from the original page.
+ */
+ ppcopy(ppa[pg_idx], pp);
+
+ hat_setrefmod(pp); /* mark as modified */
+
+ /*
+ * Release the lock on the original page,
+ * decrement the old slot, and downgrade the lock
+ * on the new copy.
+ */
+ page_unlock(ppa[pg_idx]);
+
+ if (!prealloc)
+ page_downgrade(pp);
+
+ ppa[pg_idx] = pp;
+
+ /*
+ * Now reflect the copy in the new anon array.
+ */
+ ASSERT(ahmpages == NULL || oldap->an_refcnt > 1);
+ if (oldap != NULL)
+ anon_decref(oldap);
+ (void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP);
+ }
+ if (ahmpages != NULL) {
+ mutex_exit(ahmpages);
+ }
+ ASSERT(prealloc == 0 || pplist == NULL);
+ if (prealloc) {
+ VM_STAT_ADD(anonvmstats.privatepages[9]);
+ for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
+ page_downgrade(ppa[pg_idx]);
+ }
+ }
+
+ /*
+ * Unload the old large page translation.
+ */
+ hat_unload(seg->s_as->a_hat, addr, pgcnt << PAGESHIFT, HAT_UNLOAD);
+ return (0);
+}
+
+/*
+ * Allocate a private zero-filled anon page.
+ */
+page_t *
+anon_zero(struct seg *seg, caddr_t addr, struct anon **app, struct cred *cred)
+{
+ struct anon *ap;
+ page_t *pp;
+ struct vnode *vp;
+ anoff_t off;
+ page_t *anon_pl[1 + 1];
+ int err;
+
+ /* Kernel probe */
+ TNF_PROBE_1(anon_zero, "vm pagefault", /* CSTYLED */,
+ tnf_opaque, address, addr);
+
+ *app = ap = anon_alloc(NULL, 0);
+ swap_xlate(ap, &vp, &off);
+
+ /*
+ * Call the VOP_GETPAGE routine to create the page, thereby
+ * enabling the vnode driver to allocate any filesystem
+ * dependent structures (e.g., disk block allocation for UFS).
+ * This also prevents more than one page from being added to
+ * the vnode at the same time since it is locked.
+ */
+ err = VOP_GETPAGE(vp, off, PAGESIZE, NULL,
+ anon_pl, PAGESIZE, seg, addr, S_CREATE, cred);
+ if (err) {
+ *app = NULL;
+ anon_decref(ap);
+ return (NULL);
+ }
+ pp = anon_pl[0];
+
+ pagezero(pp, 0, PAGESIZE); /* XXX - should set mod bit */
+ page_downgrade(pp);
+ CPU_STATS_ADD_K(vm, zfod, 1);
+ hat_setrefmod(pp); /* mark as modified so pageout writes back */
+ return (pp);
+}
+
+
+/*
+ * Allocate an array of private zero-filled anon pages for empty slots
+ * and kept pages for non-empty slots within the given range.
+ *
+ * NOTE: This routine will try to use large pages
+ * if available and supported by the underlying platform.
+ */
+int
+anon_map_createpages(
+ struct anon_map *amp,
+ ulong_t start_index,
+ size_t len,
+ page_t *ppa[],
+ struct seg *seg,
+ caddr_t addr,
+ enum seg_rw rw,
+ struct cred *cred)
+{
+
+ struct anon *ap;
+ struct vnode *ap_vp;
+ page_t *pp, *pplist, *anon_pl[1 + 1], *conpp = NULL;
+ int err = 0;
+ ulong_t p_index, index;
+ pgcnt_t npgs, pg_cnt;
+ spgcnt_t nreloc = 0;
+ uint_t l_szc, szc, prot;
+ anoff_t ap_off;
+ size_t pgsz;
+ lgrp_t *lgrp;
+
+ /*
+ * XXX For now only handle S_CREATE.
+ */
+ ASSERT(rw == S_CREATE);
+
+ index = start_index;
+ p_index = 0;
+ npgs = btopr(len);
+
+ /*
+ * If this platform supports multiple page sizes
+ * then try to allocate directly from the free
+ * list for pages larger than PAGESIZE.
+ *
+ * NOTE: When we have page_create_ru we can stop
+ * directly allocating from the freelist.
+ */
+ l_szc = seg->s_szc;
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
+ while (npgs) {
+
+ /*
+ * If the anon slot already exists
+ * (meaning the page has been created),
+ * then: 1) look up the page;
+ * 2) if the page is still in memory, get it;
+ * 3) if not, create a page and
+ * page it in from the physical swap device.
+ * These are done in anon_getpage().
+ */
+ ap = anon_get_ptr(amp->ahp, index);
+ if (ap) {
+ err = anon_getpage(&ap, &prot, anon_pl, PAGESIZE,
+ seg, addr, S_READ, cred);
+ if (err) {
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ panic("anon_map_createpages: anon_getpage");
+ }
+ pp = anon_pl[0];
+ ppa[p_index++] = pp;
+
+ addr += PAGESIZE;
+ index++;
+ npgs--;
+ continue;
+ }
+ /*
+ * Now try to allocate the largest page possible
+ * for the current address and range.
+ * Keep dropping down in page size until:
+ *
+ * 1) Properly aligned
+ * 2) Does not overlap existing anon pages
+ * 3) Fits in remaining range.
+ * 4) Able to allocate one.
+ *
+ * NOTE: XXX When page_create_ru is completed this code
+ * will change.
+ */
+ szc = l_szc;
+ pplist = NULL;
+ pg_cnt = 0;
+ while (szc) {
+ pgsz = page_get_pagesize(szc);
+ pg_cnt = pgsz >> PAGESHIFT;
+ if (IS_P2ALIGNED(addr, pgsz) && pg_cnt <= npgs &&
+ anon_pages(amp->ahp, index, pg_cnt) == 0) {
+ /*
+ * XXX
+ * Since we are faking page_create()
+ * we also need to do the freemem and
+ * pcf accounting.
+ */
+ (void) page_create_wait(pg_cnt, PG_WAIT);
+
+ /*
+ * Get lgroup to allocate next page of shared
+ * memory from and use it to specify where to
+ * allocate the physical memory
+ */
+ lgrp = lgrp_mem_choose(seg, addr, pgsz);
+
+ pplist = page_get_freelist(
+ (struct vnode *)NULL, (u_offset_t)0, seg,
+ addr, pgsz, 0, lgrp);
+
+ if (pplist == NULL) {
+ page_create_putback(pg_cnt);
+ }
+
+ /*
+ * If a request for a page of size
+ * larger than PAGESIZE failed
+ * then don't try that size anymore.
+ */
+ if (pplist == NULL) {
+ l_szc = szc - 1;
+ } else {
+ break;
+ }
+ }
+ szc--;
+ }
+
+ /*
+ * If just using PAGESIZE pages then don't
+ * directly allocate from the free list.
+ */
+ if (pplist == NULL) {
+ ASSERT(szc == 0);
+ pp = anon_zero(seg, addr, &ap, cred);
+ if (pp == NULL) {
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ panic("anon_map_createpages: anon_zero");
+ }
+ ppa[p_index++] = pp;
+
+ ASSERT(anon_get_ptr(amp->ahp, index) == NULL);
+ (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP);
+
+ addr += PAGESIZE;
+ index++;
+ npgs--;
+ continue;
+ }
+
+ /*
+ * pplist is a list of pg_cnt PAGESIZE pages.
+ * These pages are locked SE_EXCL since they
+ * came directly off the free list.
+ */
+ ASSERT(IS_P2ALIGNED(pg_cnt, pg_cnt));
+ ASSERT(IS_P2ALIGNED(index, pg_cnt));
+ ASSERT(conpp == NULL);
+ while (pg_cnt--) {
+
+ ap = anon_alloc(NULL, 0);
+ swap_xlate(ap, &ap_vp, &ap_off);
+
+ ASSERT(pplist != NULL);
+ pp = pplist;
+ page_sub(&pplist, pp);
+ PP_CLRFREE(pp);
+ PP_CLRAGED(pp);
+ conpp = pp;
+
+ err = swap_getconpage(ap_vp, ap_off, PAGESIZE,
+ (uint_t *)NULL, anon_pl, PAGESIZE, conpp, &nreloc,
+ seg, addr, S_CREATE, cred);
+
+ if (err) {
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ panic("anon_map_createpages: S_CREATE");
+ }
+
+ ASSERT(anon_pl[0] == pp);
+ ASSERT(nreloc == 1);
+ pagezero(pp, 0, PAGESIZE);
+ CPU_STATS_ADD_K(vm, zfod, 1);
+ hat_setrefmod(pp);
+
+ ASSERT(anon_get_ptr(amp->ahp, index) == NULL);
+ (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP);
+
+ ppa[p_index++] = pp;
+
+ addr += PAGESIZE;
+ index++;
+ npgs--;
+ }
+ conpp = NULL;
+ pg_cnt = pgsz >> PAGESHIFT;
+ p_index = p_index - pg_cnt;
+ while (pg_cnt--) {
+ page_downgrade(ppa[p_index++]);
+ }
+ }
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ return (0);
+}
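+
+/*
+ * Illustrative sketch only (not from the original file; the guard and the
+ * function and variable names below are hypothetical): a caller such as
+ * the shared page table code sizes a page_t pointer array to the request
+ * and asks for S_CREATE, the only mode handled above.
+ */
+#ifdef	ANON_EXAMPLES
+static int
+example_createpages(struct anon_map *amp, struct seg *seg, caddr_t addr,
+    size_t len, struct cred *cred)
+{
+	pgcnt_t npgs = btopr(len);
+	size_t ppasize = npgs * sizeof (page_t *);
+	page_t **ppa = kmem_zalloc(ppasize, KM_SLEEP);
+	int err;
+
+	err = anon_map_createpages(amp, 0, len, ppa, seg, addr,
+	    S_CREATE, cred);
+	/*
+	 * On success ppa[] holds the pages backing [addr, addr + len);
+	 * a real caller keeps the array around to manage those pages.
+	 */
+	kmem_free(ppa, ppasize);
+	return (err);
+}
+#endif	/* ANON_EXAMPLES */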
+
+int
+anon_map_demotepages(
+ struct anon_map *amp,
+ ulong_t start_idx,
+ struct seg *seg,
+ caddr_t addr,
+ uint_t prot,
+ struct vpage vpage[],
+ struct cred *cred)
+{
+ struct anon *ap;
+ uint_t szc = seg->s_szc;
+ pgcnt_t pgcnt = page_get_pagecnt(szc);
+ size_t ppasize = pgcnt * sizeof (page_t *);
+ page_t **ppa = kmem_alloc(ppasize, KM_SLEEP);
+ page_t *pp;
+ page_t *pl[2];
+ pgcnt_t i, pg_idx;
+ ulong_t an_idx;
+ caddr_t vaddr;
+ kmutex_t *ahmpages = NULL;
+ int err;
+ int retry = 0;
+ uint_t vpprot;
+
+ ASSERT(RW_WRITE_HELD(&amp->a_rwlock));
+ ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
+ ASSERT(IS_P2ALIGNED(start_idx, pgcnt));
+ ASSERT(ppa != NULL);
+
+ VM_STAT_ADD(anonvmstats.demotepages[0]);
+
+ ap = anon_get_ptr(amp->ahp, start_idx);
+ if (ap != NULL) {
+ VM_STAT_ADD(anonvmstats.demotepages[1]);
+ ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, ap->an_off)];
+ mutex_enter(ahmpages);
+ }
+top:
+ if (ap == NULL || ap->an_refcnt <= 1) {
+ int root = 0;
+ pgcnt_t npgs, curnpgs = 0;
+
+ VM_STAT_ADD(anonvmstats.demotepages[2]);
+
+ ASSERT(retry == 0 || ap != NULL);
+
+ if (ahmpages != NULL)
+ mutex_exit(ahmpages);
+ an_idx = start_idx;
+ for (i = 0; i < pgcnt; i++, an_idx++) {
+ ap = anon_get_ptr(amp->ahp, an_idx);
+ if (ap != NULL) {
+ ASSERT(ap->an_refcnt == 1);
+ pp = ppa[i] = page_lookup(ap->an_vp, ap->an_off,
+ SE_EXCL);
+ if (pp != NULL) {
+ (void) hat_pageunload(pp,
+ HAT_FORCE_PGUNLOAD);
+ }
+ } else {
+ ppa[i] = NULL;
+ }
+ }
+ for (i = 0; i < pgcnt; i++) {
+ if ((pp = ppa[i]) != NULL && pp->p_szc != 0) {
+ ASSERT(pp->p_szc <= szc);
+ if (!root) {
+ VM_STAT_ADD(anonvmstats.demotepages[3]);
+ if (curnpgs != 0)
+ panic("anon_map_demotepages: "
+ "bad large page");
+
+ root = 1;
+ curnpgs = npgs =
+ page_get_pagecnt(pp->p_szc);
+
+ ASSERT(npgs <= pgcnt);
+ ASSERT(IS_P2ALIGNED(npgs, npgs));
+ ASSERT(!(page_pptonum(pp) &
+ (npgs - 1)));
+ } else {
+ ASSERT(i > 0);
+ ASSERT(page_pptonum(pp) - 1 ==
+ page_pptonum(ppa[i - 1]));
+ if ((page_pptonum(pp) & (npgs - 1)) ==
+ npgs - 1)
+ root = 0;
+ }
+ ASSERT(PAGE_EXCL(pp));
+ pp->p_szc = 0;
+ curnpgs--;
+ }
+ }
+ if (root != 0 || curnpgs != 0)
+ panic("anon_map_demotepages: bad large page");
+
+ for (i = 0; i < pgcnt; i++) {
+ if ((pp = ppa[i]) != NULL) {
+ ASSERT(!hat_page_is_mapped(pp));
+ ASSERT(pp->p_szc == 0);
+ page_unlock(pp);
+ }
+ }
+ kmem_free(ppa, ppasize);
+ return (0);
+ }
+ ASSERT(ahmpages != NULL);
+ mutex_exit(ahmpages);
+ ahmpages = NULL;
+
+ VM_STAT_ADD(anonvmstats.demotepages[4]);
+
+ ASSERT(retry == 0); /* we can be here only once */
+
+ vaddr = addr;
+ for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt;
+ pg_idx++, an_idx++, vaddr += PAGESIZE) {
+ ap = anon_get_ptr(amp->ahp, an_idx);
+ if (ap == NULL)
+ panic("anon_map_demotepages: no anon slot");
+ err = anon_getpage(&ap, &vpprot, pl, PAGESIZE, seg, vaddr,
+ S_READ, cred);
+ if (err) {
+ for (i = 0; i < pg_idx; i++) {
+ if ((pp = ppa[i]) != NULL)
+ page_unlock(pp);
+ }
+ kmem_free(ppa, ppasize);
+ return (err);
+ }
+ ppa[pg_idx] = pl[0];
+ }
+
+ err = anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, ppa,
+ vpage, -1, cred);
+ if (err > 0) {
+ VM_STAT_ADD(anonvmstats.demotepages[5]);
+ kmem_free(ppa, ppasize);
+ return (err);
+ }
+ ASSERT(err == 0 || err == -1);
+ if (err == -1) {
+ VM_STAT_ADD(anonvmstats.demotepages[6]);
+ retry = 1;
+ goto top;
+ }
+ for (i = 0; i < pgcnt; i++) {
+ ASSERT(ppa[i] != NULL);
+ if (ppa[i]->p_szc != 0)
+ retry = 1;
+ page_unlock(ppa[i]);
+ }
+ if (retry) {
+ VM_STAT_ADD(anonvmstats.demotepages[7]);
+ goto top;
+ }
+
+ VM_STAT_ADD(anonvmstats.demotepages[8]);
+
+ kmem_free(ppa, ppasize);
+
+ return (0);
+}
+
+/*
+ * Allocate and initialize an anon_map structure for a segment,
+ * associating the given swap reservation with the new anon_map.
+ */
+struct anon_map *
+anonmap_alloc(size_t size, size_t swresv)
+{
+ struct anon_map *amp;
+
+ amp = kmem_cache_alloc(anonmap_cache, KM_SLEEP);
+
+ amp->refcnt = 1;
+ amp->size = size;
+
+ amp->ahp = anon_create(btopr(size), ANON_SLEEP);
+ amp->swresv = swresv;
+ amp->locality = 0;
+ amp->a_szc = 0;
+ return (amp);
+}
+
+void
+anonmap_free(struct anon_map *amp)
+{
+ ASSERT(amp->ahp);
+ ASSERT(amp->refcnt == 0);
+
+ lgrp_shm_policy_fini(amp, NULL);
+ anon_release(amp->ahp, btopr(amp->size));
+ kmem_cache_free(anonmap_cache, amp);
+}
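+
+/*
+ * Illustrative sketch (hypothetical helper, not from the original file):
+ * anonmap_alloc() hands back an anon_map with refcnt == 1 and
+ * anonmap_free() expects the count to have dropped to zero, so a typical
+ * holder releases its reference roughly as follows.
+ */
+#ifdef	ANON_EXAMPLES
+static void
+example_amp_release(struct anon_map *amp)
+{
+	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
+	if (--amp->refcnt == 0) {
+		ANON_LOCK_EXIT(&amp->a_rwlock);
+		anonmap_free(amp);
+	} else {
+		ANON_LOCK_EXIT(&amp->a_rwlock);
+	}
+}
+#endif	/* ANON_EXAMPLES */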
+
+/*
+ * Returns true if the ahp array has some empty slots.
+ * The offp and lenp parameters are in/out parameters. On entry
+ * these values represent the starting offset and length of the
+ * mapping. When true is returned, these values may be modified
+ * to be the largest range which includes empty slots.
+ */
+int
+non_anon(struct anon_hdr *ahp, ulong_t anon_idx, u_offset_t *offp,
+ size_t *lenp)
+{
+ ulong_t i, el;
+ ssize_t low, high;
+ struct anon *ap;
+
+ low = -1;
+ for (i = 0, el = *lenp; i < el; i += PAGESIZE, anon_idx++) {
+ ap = anon_get_ptr(ahp, anon_idx);
+ if (ap == NULL) {
+ if (low == -1)
+ low = i;
+ high = i;
+ }
+ }
+ if (low != -1) {
+ /*
+ * Found at least one non-anon page.
+ * Set up the off and len return values.
+ */
+ if (low != 0)
+ *offp += low;
+ *lenp = high - low + PAGESIZE;
+ return (1);
+ }
+ return (0);
+}
+
+/*
+ * Return a count of the number of existing anon pages in the anon array
+ * ahp in the range of nslots slots starting at anon_index. The array and
+ * slots must be guaranteed stable by the caller.
+ */
+pgcnt_t
+anon_pages(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots)
+{
+ pgcnt_t cnt = 0;
+
+ while (nslots-- > 0) {
+ if ((anon_get_ptr(ahp, anon_index)) != NULL)
+ cnt++;
+ anon_index++;
+ }
+ return (cnt);
+}
+
+/*
+ * Move reserved phys swap into memory swap (unreserve phys swap
+ * and reserve mem swap by the same amount).
+ * Used by segspt when it needs to lock reserved swap npages in memory.
+ */
+int
+anon_swap_adjust(pgcnt_t npages)
+{
+ pgcnt_t unlocked_mem_swap;
+
+ mutex_enter(&anoninfo_lock);
+
+ ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
+ ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
+
+ unlocked_mem_swap = k_anoninfo.ani_mem_resv
+ - k_anoninfo.ani_locked_swap;
+ if (npages > unlocked_mem_swap) {
+ spgcnt_t adjusted_swap = npages - unlocked_mem_swap;
+
+ /*
+ * if there is not enough unlocked mem swap we take missing
+ * amount from phys swap and give it to mem swap
+ */
+ mutex_enter(&freemem_lock);
+ if (availrmem < adjusted_swap + segspt_minfree) {
+ mutex_exit(&freemem_lock);
+ mutex_exit(&anoninfo_lock);
+ return (ENOMEM);
+ }
+ availrmem -= adjusted_swap;
+ mutex_exit(&freemem_lock);
+
+ k_anoninfo.ani_mem_resv += adjusted_swap;
+ ASSERT(k_anoninfo.ani_phys_resv >= adjusted_swap);
+ k_anoninfo.ani_phys_resv -= adjusted_swap;
+
+ ANI_ADD(adjusted_swap);
+ }
+ k_anoninfo.ani_locked_swap += npages;
+
+ ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
+ ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
+
+ mutex_exit(&anoninfo_lock);
+
+ return (0);
+}
+
+/*
+ * Unlock reserved mem swap so that when it is unreserved it
+ * can be moved back to phys (disk) swap.
+ */
+void
+anon_swap_restore(pgcnt_t npages)
+{
+ mutex_enter(&anoninfo_lock);
+
+ ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv);
+
+ ASSERT(k_anoninfo.ani_locked_swap >= npages);
+ k_anoninfo.ani_locked_swap -= npages;
+
+ ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv);
+
+ mutex_exit(&anoninfo_lock);
+}
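+
+/*
+ * Illustrative sketch (hypothetical caller, not from the original file):
+ * the segspt-style pairing is adjust-before-lock and restore-after-unlock,
+ * which keeps ani_locked_swap balanced.
+ */
+#ifdef	ANON_EXAMPLES
+static int
+example_lock_spt_pages(pgcnt_t npages)
+{
+	int err;
+
+	if ((err = anon_swap_adjust(npages)) != 0)
+		return (err);		/* ENOMEM: cannot lock that much */
+
+	/* ... lock the npages in memory here ... */
+
+	return (0);
+}
+
+static void
+example_unlock_spt_pages(pgcnt_t npages)
+{
+	/* ... unlock the npages first ... */
+	anon_swap_restore(npages);
+}
+#endif	/* ANON_EXAMPLES */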
+
+/*
+ * Return a pointer to the anon array slot for the
+ * specified anon index.
+ */
+ulong_t *
+anon_get_slot(struct anon_hdr *ahp, ulong_t an_idx)
+{
+ struct anon **app;
+ void **ppp;
+
+ ASSERT(an_idx < ahp->size);
+
+ /*
+ * Single level case.
+ */
+ if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) {
+ return ((ulong_t *)&ahp->array_chunk[an_idx]);
+ } else {
+
+ /*
+ * 2 level case.
+ */
+ ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
+ if (*ppp == NULL) {
+ mutex_enter(&ahp->serial_lock);
+ ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
+ if (*ppp == NULL)
+ *ppp = kmem_zalloc(PAGESIZE, KM_SLEEP);
+ mutex_exit(&ahp->serial_lock);
+ }
+ app = *ppp;
+ return ((ulong_t *)&app[an_idx & ANON_CHUNK_OFF]);
+ }
+}
+
+void
+anon_array_enter(struct anon_map *amp, ulong_t an_idx, anon_sync_obj_t *sobj)
+{
+ ulong_t *ap_slot;
+ kmutex_t *mtx;
+ kcondvar_t *cv;
+ int hash;
+
+ /*
+	 * Use szc to determine the anon slot(s) that should appear atomic.
+	 * If szc = 0, then lock the anon slot and mark it busy.
+	 * If szc > 0, then lock the range of slots by getting the
+	 * anon_array_lock for the first anon slot, and mark only the
+	 * first anon slot busy to represent the whole range being busy.
+ */
+
+ ASSERT(RW_READ_HELD(&amp->a_rwlock));
+ an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc));
+ hash = ANON_ARRAY_HASH(amp, an_idx);
+ sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex;
+ sobj->sync_cv = cv = &anon_array_cv[hash];
+ mutex_enter(mtx);
+ ap_slot = anon_get_slot(amp->ahp, an_idx);
+ while (ANON_ISBUSY(ap_slot))
+ cv_wait(cv, mtx);
+ ANON_SETBUSY(ap_slot);
+ sobj->sync_data = ap_slot;
+ mutex_exit(mtx);
+}
+
+void
+anon_array_exit(anon_sync_obj_t *sobj)
+{
+ mutex_enter(sobj->sync_mutex);
+ ASSERT(ANON_ISBUSY(sobj->sync_data));
+ ANON_CLRBUSY(sobj->sync_data);
+ if (CV_HAS_WAITERS(sobj->sync_cv))
+ cv_broadcast(sobj->sync_cv);
+ mutex_exit(sobj->sync_mutex);
+}
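+
+/*
+ * Illustrative sketch (hypothetical, not from the original file): an
+ * update of an anon slot is bracketed by anon_array_enter() and
+ * anon_array_exit() so that concurrent faults on the same index
+ * serialize, while the amp rwlock is held at least as reader (see the
+ * ASSERT in anon_array_enter()).
+ */
+#ifdef	ANON_EXAMPLES
+static void
+example_install_slot(struct anon_map *amp, ulong_t an_idx, struct anon *ap)
+{
+	anon_sync_obj_t cookie;
+
+	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
+	anon_array_enter(amp, an_idx, &cookie);
+	(void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP);
+	anon_array_exit(&cookie);
+	ANON_LOCK_EXIT(&amp->a_rwlock);
+}
+#endif	/* ANON_EXAMPLES */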
diff --git a/usr/src/uts/common/vm/vm_as.c b/usr/src/uts/common/vm/vm_as.c
new file mode 100644
index 0000000000..f54ae54359
--- /dev/null
+++ b/usr/src/uts/common/vm/vm_as.c
@@ -0,0 +1,2898 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * VM - address spaces.
+ */
+
+#include <sys/types.h>
+#include <sys/t_lock.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/systm.h>
+#include <sys/mman.h>
+#include <sys/sysmacros.h>
+#include <sys/cpuvar.h>
+#include <sys/sysinfo.h>
+#include <sys/kmem.h>
+#include <sys/vnode.h>
+#include <sys/vmsystm.h>
+#include <sys/cmn_err.h>
+#include <sys/debug.h>
+#include <sys/tnf_probe.h>
+#include <sys/vtrace.h>
+
+#include <vm/hat.h>
+#include <vm/xhat.h>
+#include <vm/as.h>
+#include <vm/seg.h>
+#include <vm/seg_vn.h>
+#include <vm/seg_dev.h>
+#include <vm/seg_kmem.h>
+#include <vm/seg_map.h>
+#include <vm/seg_spt.h>
+#include <vm/page.h>
+
+clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */
+
+static struct kmem_cache *as_cache;
+
+static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
+static void as_clearwatchprot(struct as *, caddr_t, size_t);
+
+
+/*
+ * Verifying the segment lists is very time-consuming; it may not always be
+ * desirable to define VERIFY_SEGLIST when DEBUG is set.
+ */
+#ifdef DEBUG
+#define VERIFY_SEGLIST
+int do_as_verify = 0;
+#endif
+
+/*
+ * Allocate a new callback data structure entry and fill in the events of
+ * interest, the address range of interest, and the callback argument.
+ * Link the entry on the as->a_callbacks list. A callback entry for the
+ * entire address space may be specified with vaddr = 0 and size = -1.
+ *
+ * CALLER'S RESPONSIBILITY: If not calling from within the process context for
+ * the specified as, the caller must guarantee persistence of the specified as
+ * for the duration of this function (eg. pages being locked within the as
+ * will guarantee persistence).
+ */
+int
+as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
+ caddr_t vaddr, size_t size, int sleepflag)
+{
+ struct as_callback *current_head, *cb;
+ caddr_t saddr;
+ size_t rsize;
+
+ /* callback function and an event are mandatory */
+ if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
+ return (EINVAL);
+
+ /* Adding a callback after as_free has been called is not allowed */
+ if (as == &kas)
+ return (ENOMEM);
+
+ /*
+ * vaddr = 0 and size = -1 is used to indicate that the callback range
+ * is the entire address space so no rounding is done in that case.
+ */
+ if (size != -1) {
+ saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
+ rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
+ (size_t)saddr;
+ /* check for wraparound */
+ if (saddr + rsize < saddr)
+ return (ENOMEM);
+ } else {
+ if (vaddr != 0)
+ return (EINVAL);
+ saddr = vaddr;
+ rsize = size;
+ }
+
+ /* Allocate and initialize a callback entry */
+ cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
+ if (cb == NULL)
+ return (EAGAIN);
+
+ cb->ascb_func = cb_func;
+ cb->ascb_arg = arg;
+ cb->ascb_events = events;
+ cb->ascb_saddr = saddr;
+ cb->ascb_len = rsize;
+
+ /* Add the entry to the list */
+ mutex_enter(&as->a_contents);
+ current_head = as->a_callbacks;
+ as->a_callbacks = cb;
+ cb->ascb_next = current_head;
+
+ /*
+ * The call to this function may lose in a race with
+ * a pertinent event - eg. a thread does long term memory locking
+ * but before the callback is added another thread executes as_unmap.
+ * A broadcast here resolves that.
+ */
+ if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
+ AS_CLRUNMAPWAIT(as);
+ cv_broadcast(&as->a_cv);
+ }
+
+ mutex_exit(&as->a_contents);
+ return (0);
+}
+
+/*
+ * Search the callback list for an entry which pertains to arg.
+ *
+ * This is called from within the client upon completion of the callback.
+ * RETURN VALUES:
+ * AS_CALLBACK_DELETED (callback entry found and deleted)
+ * AS_CALLBACK_NOTFOUND (no callback entry found - this is ok)
+ * AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this
+ * entry will be made in as_do_callbacks)
+ *
+ * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
+ * set, it indicates that as_do_callbacks is processing this entry. The
+ * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
+ * to unblock as_do_callbacks, in case it is blocked.
+ *
+ * CALLER'S RESPONSIBILITY: If not calling from within the process context for
+ * the specified as, the caller must guarantee persistence of the specified as
+ * for the duration of this function (eg. pages being locked within the as
+ * will guarantee persistence).
+ */
+uint_t
+as_delete_callback(struct as *as, void *arg)
+{
+ struct as_callback **prevcb = &as->a_callbacks;
+ struct as_callback *cb;
+ uint_t rc = AS_CALLBACK_NOTFOUND;
+
+ mutex_enter(&as->a_contents);
+ for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
+ if (cb->ascb_arg != arg)
+ continue;
+
+ /*
+ * If the events indicate AS_CALLBACK_CALLED, just clear
+ * AS_ALL_EVENT in the events field and wakeup the thread
+ * that may be waiting in as_do_callbacks. as_do_callbacks
+ * will take care of removing this entry from the list. In
+ * that case, return AS_CALLBACK_DELETE_DEFERRED. Otherwise
+ * (AS_CALLBACK_CALLED not set), just remove it from the
+ * list, return the memory and return AS_CALLBACK_DELETED.
+ */
+ if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
+ /* leave AS_CALLBACK_CALLED */
+ cb->ascb_events &= ~AS_ALL_EVENT;
+ rc = AS_CALLBACK_DELETE_DEFERRED;
+ cv_broadcast(&as->a_cv);
+ } else {
+ *prevcb = cb->ascb_next;
+ kmem_free(cb, sizeof (struct as_callback));
+ rc = AS_CALLBACK_DELETED;
+ }
+ break;
+ }
+ mutex_exit(&as->a_contents);
+ return (rc);
+}
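+
+/*
+ * Illustrative sketch (hypothetical driver code, not from the original
+ * file): a driver holding long-term page locks registers for unmap events
+ * on the locked range; its callback releases the locks and then deletes
+ * the callback entry, which is what unblocks as_execute_callback() below.
+ */
+#ifdef	AS_EXAMPLES
+static void
+example_unmap_cb(struct as *as, void *arg, uint_t events)
+{
+	/* ... release the driver's long-term page locks for arg ... */
+	(void) as_delete_callback(as, arg);
+}
+
+static int
+example_register_cb(struct as *as, void *arg, caddr_t addr, size_t len)
+{
+	return (as_add_callback(as, example_unmap_cb, arg,
+	    AS_UNMAP_EVENT | AS_UNMAPWAIT_EVENT, addr, len, KM_SLEEP));
+}
+#endif	/* AS_EXAMPLES */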
+
+/*
+ * Searches the as callback list for a matching entry.
+ * Returns a pointer to the first matching callback, or NULL if
+ * nothing is found.
+ * This function never sleeps so it is ok to call it with more
+ * locks held than the (required) a_contents mutex.
+ *
+ * See also comment on as_do_callbacks below.
+ */
+static struct as_callback *
+as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
+ size_t event_len)
+{
+ struct as_callback *cb;
+
+ ASSERT(MUTEX_HELD(&as->a_contents));
+ for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
+ /*
+ * If the callback has not already been called, then
+ * check if events or address range pertains. An event_len
+ * of zero means do an unconditional callback.
+ */
+ if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
+ ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
+ (event_addr + event_len < cb->ascb_saddr) ||
+ (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
+ continue;
+ }
+ break;
+ }
+ return (cb);
+}
+
+/*
+ * Executes a given callback and removes it from the callback list for
+ * this address space.
+ * This function may sleep so the caller must drop all locks except
+ * a_contents before calling this func.
+ *
+ * See also comments on as_do_callbacks below.
+ */
+static void
+as_execute_callback(struct as *as, struct as_callback *cb,
+ uint_t events)
+{
+ struct as_callback **prevcb;
+ void *cb_arg;
+
+ ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
+ cb->ascb_events |= AS_CALLBACK_CALLED;
+ mutex_exit(&as->a_contents);
+ (*cb->ascb_func)(as, cb->ascb_arg, events);
+ mutex_enter(&as->a_contents);
+ /*
+ * the callback function is required to delete the callback
+ * when the callback function determines it is OK for
+ * this thread to continue. as_delete_callback will clear
+ * the AS_ALL_EVENT in the events field when it is deleted.
+ * If the callback function called as_delete_callback,
+ * events will already be cleared and there will be no blocking.
+ */
+ while ((cb->ascb_events & events) != 0) {
+ cv_wait(&as->a_cv, &as->a_contents);
+ }
+ /*
+ * This entry needs to be taken off the list. Normally, the
+ * callback func itself does that, but unfortunately the list
+ * may have changed while the callback was running because the
+ * a_contents mutex was dropped and someone else other than the
+ * callback func itself could have called as_delete_callback,
+ * so we have to search to find this entry again. The entry
+ * must have AS_CALLBACK_CALLED, and have the same 'arg'.
+ */
+ cb_arg = cb->ascb_arg;
+ prevcb = &as->a_callbacks;
+ for (cb = as->a_callbacks; cb != NULL;
+ prevcb = &cb->ascb_next, cb = *prevcb) {
+ if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
+ (cb_arg != cb->ascb_arg)) {
+ continue;
+ }
+ *prevcb = cb->ascb_next;
+ kmem_free(cb, sizeof (struct as_callback));
+ break;
+ }
+}
+
+/*
+ * Check the callback list for a matching event and intersection of
+ * address range. If there is a match, invoke the callback. Skip an entry if:
+ *	- a callback is already in progress for this entry (AS_CALLBACK_CALLED)
+ *	- the event is not of interest
+ *	- the address range is not of interest
+ *
+ * An event_len of zero indicates a request for an unconditional callback
+ * (regardless of event), only the AS_CALLBACK_CALLED is checked. The
+ * a_contents lock must be dropped before a callback, so only one callback
+ * can be done before returning. Return -1 (true) if a callback was
+ * executed and removed from the list, else return 0 (false).
+ *
+ * The logically separate parts, i.e. finding a matching callback and
+ * executing a given callback have been separated into two functions
+ * so that they can be called with different sets of locks held beyond
+ * the always-required a_contents. as_find_callback does not sleep so
+ * it is ok to call it if more locks than a_contents (i.e. the a_lock
+ * rwlock) are held. as_execute_callback on the other hand may sleep
+ * so all locks beyond a_contents must be dropped by the caller if one
+ * does not want to end up comatose.
+ */
+static int
+as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
+ size_t event_len)
+{
+ struct as_callback *cb;
+
+ if ((cb = as_find_callback(as, events, event_addr, event_len))) {
+ as_execute_callback(as, cb, events);
+ return (-1);
+ }
+ return (0);
+}
+
+/*
+ * Search for the segment containing addr. If a segment containing addr
+ * exists, that segment is returned. If no such segment exists, and
+ * the list spans addresses greater than addr, then the first segment
+ * whose base is greater than addr is returned; otherwise, NULL is
+ * returned unless tail is true, in which case the last element of the
+ * list is returned.
+ *
+ * a_seglast is used to cache the last found segment for repeated
+ * searches to the same addr (which happens frequently).
+ */
+struct seg *
+as_findseg(struct as *as, caddr_t addr, int tail)
+{
+ struct seg *seg = as->a_seglast;
+ avl_index_t where;
+
+ ASSERT(AS_LOCK_HELD(as, &as->a_lock));
+
+ if (seg != NULL &&
+ seg->s_base <= addr &&
+ addr < seg->s_base + seg->s_size)
+ return (seg);
+
+ seg = avl_find(&as->a_segtree, &addr, &where);
+ if (seg != NULL)
+ return (as->a_seglast = seg);
+
+ seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
+ if (seg == NULL && tail)
+ seg = avl_last(&as->a_segtree);
+ return (as->a_seglast = seg);
+}
+
+#ifdef VERIFY_SEGLIST
+/*
+ * verify that the linked list is coherent
+ */
+static void
+as_verify(struct as *as)
+{
+ struct seg *seg, *seglast, *p, *n;
+ uint_t nsegs = 0;
+
+ if (do_as_verify == 0)
+ return;
+
+ seglast = as->a_seglast;
+
+ for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
+ ASSERT(seg->s_as == as);
+ p = AS_SEGPREV(as, seg);
+ n = AS_SEGNEXT(as, seg);
+ ASSERT(p == NULL || p->s_as == as);
+ ASSERT(p == NULL || p->s_base < seg->s_base);
+ ASSERT(n == NULL || n->s_base > seg->s_base);
+ ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
+ if (seg == seglast)
+ seglast = NULL;
+ nsegs++;
+ }
+ ASSERT(seglast == NULL);
+ ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
+}
+#endif /* VERIFY_SEGLIST */
+
+/*
+ * Add a new segment to the address space. The avl_find()
+ * may be expensive so we attempt to use the last segment accessed
+ * in as_gap() as an insertion point.
+ */
+int
+as_addseg(struct as *as, struct seg *newseg)
+{
+ struct seg *seg;
+ caddr_t addr;
+ caddr_t eaddr;
+ avl_index_t where;
+
+ ASSERT(AS_WRITE_HELD(as, &as->a_lock));
+
+ as->a_updatedir = 1; /* inform /proc */
+ gethrestime(&as->a_updatetime);
+
+ if (as->a_lastgaphl != NULL) {
+ struct seg *hseg = NULL;
+ struct seg *lseg = NULL;
+
+ if (as->a_lastgaphl->s_base > newseg->s_base) {
+ hseg = as->a_lastgaphl;
+ lseg = AVL_PREV(&as->a_segtree, hseg);
+ } else {
+ lseg = as->a_lastgaphl;
+ hseg = AVL_NEXT(&as->a_segtree, lseg);
+ }
+
+ if (hseg && lseg && lseg->s_base < newseg->s_base &&
+ hseg->s_base > newseg->s_base) {
+ avl_insert_here(&as->a_segtree, newseg, lseg,
+ AVL_AFTER);
+ as->a_lastgaphl = NULL;
+ as->a_seglast = newseg;
+ return (0);
+ }
+ as->a_lastgaphl = NULL;
+ }
+
+ addr = newseg->s_base;
+ eaddr = addr + newseg->s_size;
+again:
+
+ seg = avl_find(&as->a_segtree, &addr, &where);
+
+ if (seg == NULL)
+ seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
+
+ if (seg == NULL)
+ seg = avl_last(&as->a_segtree);
+
+ if (seg != NULL) {
+ caddr_t base = seg->s_base;
+
+ /*
+ * If top of seg is below the requested address, then
+ * the insertion point is at the end of the linked list,
+ * and seg points to the tail of the list. Otherwise,
+ * the insertion point is immediately before seg.
+ */
+ if (base + seg->s_size > addr) {
+ if (addr >= base || eaddr > base) {
+#ifdef __sparc
+ extern struct seg_ops segnf_ops;
+
+ /*
+ * no-fault segs must disappear if overlaid.
+ * XXX need new segment type so
+ * we don't have to check s_ops
+ */
+ if (seg->s_ops == &segnf_ops) {
+ seg_unmap(seg);
+ goto again;
+ }
+#endif
+ return (-1); /* overlapping segment */
+ }
+ }
+ }
+ as->a_seglast = newseg;
+ avl_insert(&as->a_segtree, newseg, where);
+
+#ifdef VERIFY_SEGLIST
+ as_verify(as);
+#endif
+ return (0);
+}
+
+struct seg *
+as_removeseg(struct as *as, struct seg *seg)
+{
+ avl_tree_t *t;
+
+ ASSERT(AS_WRITE_HELD(as, &as->a_lock));
+
+ as->a_updatedir = 1; /* inform /proc */
+ gethrestime(&as->a_updatetime);
+
+ if (seg == NULL)
+ return (NULL);
+
+ t = &as->a_segtree;
+ if (as->a_seglast == seg)
+ as->a_seglast = NULL;
+ as->a_lastgaphl = NULL;
+
+ /*
+ * if this segment is at an address higher than
+ * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
+ */
+ if (as->a_lastgap &&
+ (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
+ as->a_lastgap = AVL_NEXT(t, seg);
+
+ /*
+ * remove the segment from the seg tree
+ */
+ avl_remove(t, seg);
+
+#ifdef VERIFY_SEGLIST
+ as_verify(as);
+#endif
+ return (seg);
+}
+
+/*
+ * Find a segment containing addr.
+ */
+struct seg *
+as_segat(struct as *as, caddr_t addr)
+{
+ struct seg *seg = as->a_seglast;
+
+ ASSERT(AS_LOCK_HELD(as, &as->a_lock));
+
+ if (seg != NULL && seg->s_base <= addr &&
+ addr < seg->s_base + seg->s_size)
+ return (seg);
+
+ seg = avl_find(&as->a_segtree, &addr, NULL);
+ return (seg);
+}
+
+/*
+ * Serialize all searches for holes in an address space to
+ * prevent two or more threads from allocating the same virtual
+ * address range. The address space must not be "read/write"
+ * locked by the caller since we may block.
+ */
+void
+as_rangelock(struct as *as)
+{
+ mutex_enter(&as->a_contents);
+ while (AS_ISCLAIMGAP(as))
+ cv_wait(&as->a_cv, &as->a_contents);
+ AS_SETCLAIMGAP(as);
+ mutex_exit(&as->a_contents);
+}
+
+/*
+ * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
+ */
+void
+as_rangeunlock(struct as *as)
+{
+ mutex_enter(&as->a_contents);
+ AS_CLRCLAIMGAP(as);
+ cv_signal(&as->a_cv);
+ mutex_exit(&as->a_contents);
+}
+
+/*
+ * compare segments (or just an address) by segment address range
+ */
+static int
+as_segcompar(const void *x, const void *y)
+{
+ struct seg *a = (struct seg *)x;
+ struct seg *b = (struct seg *)y;
+
+ if (a->s_base < b->s_base)
+ return (-1);
+ if (a->s_base >= b->s_base + b->s_size)
+ return (1);
+ return (0);
+}
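+
+/*
+ * Note that only a->s_base of the "key" argument is read above; this is
+ * what lets avl_find() be handed a bare &addr in as_findseg() and
+ * as_segat(), provided s_base is the first member of struct seg.
+ */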
+
+
+void
+as_avlinit(struct as *as)
+{
+ avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
+ offsetof(struct seg, s_tree));
+ avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
+ offsetof(struct watched_page, wp_link));
+}
+
+/*ARGSUSED*/
+static int
+as_constructor(void *buf, void *cdrarg, int kmflags)
+{
+ struct as *as = buf;
+
+ mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
+ rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
+ as_avlinit(as);
+ return (0);
+}
+
+/*ARGSUSED1*/
+static void
+as_destructor(void *buf, void *cdrarg)
+{
+ struct as *as = buf;
+
+ avl_destroy(&as->a_segtree);
+ mutex_destroy(&as->a_contents);
+ cv_destroy(&as->a_cv);
+ rw_destroy(&as->a_lock);
+}
+
+void
+as_init(void)
+{
+ as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
+ as_constructor, as_destructor, NULL, NULL, NULL, 0);
+}
+
+/*
+ * Allocate and initialize an address space data structure.
+ * We call hat_alloc to allow any machine dependent
+ * information in the hat structure to be initialized.
+ */
+struct as *
+as_alloc(void)
+{
+ struct as *as;
+
+ as = kmem_cache_alloc(as_cache, KM_SLEEP);
+
+ as->a_flags = 0;
+ as->a_vbits = 0;
+ as->a_hrm = NULL;
+ as->a_seglast = NULL;
+ as->a_size = 0;
+ as->a_updatedir = 0;
+ gethrestime(&as->a_updatetime);
+ as->a_objectdir = NULL;
+ as->a_sizedir = 0;
+ as->a_userlimit = (caddr_t)USERLIMIT;
+ as->a_lastgap = NULL;
+ as->a_lastgaphl = NULL;
+ as->a_callbacks = NULL;
+
+ AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
+ as->a_hat = hat_alloc(as); /* create hat for default system mmu */
+ AS_LOCK_EXIT(as, &as->a_lock);
+
+ as->a_xhat = NULL;
+
+ return (as);
+}
+
+/*
+ * Free an address space data structure.
+ * Need to free the hat first and then
+ * all the segments on this as and finally
+ * the space for the as struct itself.
+ */
+void
+as_free(struct as *as)
+{
+ struct hat *hat = as->a_hat;
+ struct seg *seg, *next;
+ int called = 0;
+
+top:
+ /*
+ * Invoke ALL callbacks. as_do_callbacks will do one callback
+ * per call, and not return (-1) until the callback has completed.
+ * When as_do_callbacks returns zero, all callbacks have completed.
+ */
+ mutex_enter(&as->a_contents);
+ while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0));
+
+ /* This will prevent new XHATs from attaching to as */
+ if (!called)
+ AS_SETBUSY(as);
+ mutex_exit(&as->a_contents);
+ AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
+
+ if (!called) {
+ called = 1;
+ hat_free_start(hat);
+ if (as->a_xhat != NULL)
+ xhat_free_start_all(as);
+ }
+ for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
+ int err;
+
+ next = AS_SEGNEXT(as, seg);
+ err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
+ if (err == EAGAIN) {
+ mutex_enter(&as->a_contents);
+ if (as->a_callbacks) {
+ AS_LOCK_EXIT(as, &as->a_lock);
+ } else {
+ /*
+ * Memory is currently locked. Wait for a
+ * cv_signal that it has been unlocked, then
+ * try the operation again.
+ */
+ if (AS_ISUNMAPWAIT(as) == 0)
+ cv_broadcast(&as->a_cv);
+ AS_SETUNMAPWAIT(as);
+ AS_LOCK_EXIT(as, &as->a_lock);
+ while (AS_ISUNMAPWAIT(as))
+ cv_wait(&as->a_cv, &as->a_contents);
+ }
+ mutex_exit(&as->a_contents);
+ goto top;
+ } else {
+ /*
+ * We do not expect any other error return at this
+ * time. This is similar to an ASSERT in seg_unmap()
+ */
+ ASSERT(err == 0);
+ }
+ }
+ hat_free_end(hat);
+ if (as->a_xhat != NULL)
+ xhat_free_end_all(as);
+ AS_LOCK_EXIT(as, &as->a_lock);
+
+ /* /proc stuff */
+ ASSERT(avl_numnodes(&as->a_wpage) == 0);
+ if (as->a_objectdir) {
+ kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
+ as->a_objectdir = NULL;
+ as->a_sizedir = 0;
+ }
+
+ /*
+ * Free the struct as back to kmem. Assert it has no segments.
+ */
+ ASSERT(avl_numnodes(&as->a_segtree) == 0);
+ kmem_cache_free(as_cache, as);
+}
+
+int
+as_dup(struct as *as, struct as **outas)
+{
+ struct as *newas;
+ struct seg *seg, *newseg;
+ int error;
+
+ AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
+ as_clearwatch(as);
+ newas = as_alloc();
+ newas->a_userlimit = as->a_userlimit;
+ AS_LOCK_ENTER(newas, &newas->a_lock, RW_WRITER);
+
+ /* This will prevent new XHATs from attaching */
+ mutex_enter(&as->a_contents);
+ AS_SETBUSY(as);
+ mutex_exit(&as->a_contents);
+ mutex_enter(&newas->a_contents);
+ AS_SETBUSY(newas);
+ mutex_exit(&newas->a_contents);
+
+
+ for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
+
+ if (seg->s_flags & S_PURGE)
+ continue;
+
+ newseg = seg_alloc(newas, seg->s_base, seg->s_size);
+ if (newseg == NULL) {
+ AS_LOCK_EXIT(newas, &newas->a_lock);
+ as_setwatch(as);
+ mutex_enter(&as->a_contents);
+ AS_CLRBUSY(as);
+ mutex_exit(&as->a_contents);
+ AS_LOCK_EXIT(as, &as->a_lock);
+ as_free(newas);
+ return (-1);
+ }
+ if ((error = SEGOP_DUP(seg, newseg)) != 0) {
+ /*
+ * We call seg_free() on the new seg
+ * because the segment is not set up
+ * completely; i.e. it has no ops.
+ */
+ as_setwatch(as);
+ mutex_enter(&as->a_contents);
+ AS_CLRBUSY(as);
+ mutex_exit(&as->a_contents);
+ AS_LOCK_EXIT(as, &as->a_lock);
+ seg_free(newseg);
+ AS_LOCK_EXIT(newas, &newas->a_lock);
+ as_free(newas);
+ return (error);
+ }
+ newas->a_size += seg->s_size;
+ }
+
+ error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);
+ if (as->a_xhat != NULL)
+ error |= xhat_dup_all(as, newas, NULL, 0, HAT_DUP_ALL);
+
+ mutex_enter(&newas->a_contents);
+ AS_CLRBUSY(newas);
+ mutex_exit(&newas->a_contents);
+ AS_LOCK_EXIT(newas, &newas->a_lock);
+
+ as_setwatch(as);
+ mutex_enter(&as->a_contents);
+ AS_CLRBUSY(as);
+ mutex_exit(&as->a_contents);
+ AS_LOCK_EXIT(as, &as->a_lock);
+ if (error != 0) {
+ as_free(newas);
+ return (error);
+ }
+ *outas = newas;
+ return (0);
+}
+
+/*
+ * Handle a ``fault'' at addr for size bytes.
+ */
+faultcode_t
+as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
+ enum fault_type type, enum seg_rw rw)
+{
+ struct seg *seg;
+ caddr_t raddr; /* rounded down addr */
+ size_t rsize; /* rounded up size */
+ size_t ssize;
+ faultcode_t res = 0;
+ caddr_t addrsav;
+ struct seg *segsav;
+ int as_lock_held;
+ klwp_t *lwp = ttolwp(curthread);
+ int is_xhat = 0;
+ int holding_wpage = 0;
+ extern struct seg_ops segdev_ops;
+
+
+
+ if (as->a_hat != hat) {
+ /* This must be an XHAT then */
+ is_xhat = 1;
+
+ if ((type != F_INVAL) || (as == &kas))
+ return (FC_NOSUPPORT);
+ }
+
+retry:
+ if (!is_xhat) {
+ /*
+ * Indicate that the lwp is not to be stopped while waiting
+ * for a pagefault. This is to avoid deadlock while debugging
+ * a process via /proc over NFS (in particular).
+ */
+ if (lwp != NULL)
+ lwp->lwp_nostop++;
+
+ /*
+		 * The same length must be used when we softlock and softunlock.
+ * We don't support softunlocking lengths less than
+ * the original length when there is largepage support.
+ * See seg_dev.c for more comments.
+ */
+ switch (type) {
+
+ case F_SOFTLOCK:
+ CPU_STATS_ADD_K(vm, softlock, 1);
+ break;
+
+ case F_SOFTUNLOCK:
+ break;
+
+ case F_PROT:
+ CPU_STATS_ADD_K(vm, prot_fault, 1);
+ break;
+
+ case F_INVAL:
+ CPU_STATS_ENTER_K();
+ CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
+ if (as == &kas)
+ CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
+ CPU_STATS_EXIT_K();
+ break;
+ }
+ }
+
+ /* Kernel probe */
+ TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */,
+ tnf_opaque, address, addr,
+ tnf_fault_type, fault_type, type,
+ tnf_seg_access, access, rw);
+
+ raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
+ rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
+ (size_t)raddr;
+
+ /*
+ * XXX -- Don't grab the as lock for segkmap. We should grab it for
+ * correctness, but then we could be stuck holding this lock for
+ * a LONG time if the fault needs to be resolved on a slow
+ * filesystem, and then no-one will be able to exec new commands,
+ * as exec'ing requires the write lock on the as.
+ */
+ if (as == &kas && segkmap && segkmap->s_base <= raddr &&
+ raddr + size < segkmap->s_base + segkmap->s_size) {
+ /*
+ * if (as==&kas), this can't be XHAT: we've already returned
+ * FC_NOSUPPORT.
+ */
+ seg = segkmap;
+ as_lock_held = 0;
+ } else {
+ AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
+ if (is_xhat && avl_numnodes(&as->a_wpage) != 0) {
+ /*
+ * Grab and hold the writers' lock on the as
+ * if the fault is to a watched page.
+ * This will keep CPUs from "peeking" at the
+ * address range while we're temporarily boosting
+ * the permissions for the XHAT device to
+ * resolve the fault in the segment layer.
+ *
+ * We could check whether faulted address
+ * is within a watched page and only then grab
+ * the writer lock, but this is simpler.
+ */
+ AS_LOCK_EXIT(as, &as->a_lock);
+ AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
+ }
+
+ seg = as_segat(as, raddr);
+ if (seg == NULL) {
+ AS_LOCK_EXIT(as, &as->a_lock);
+ if ((lwp != NULL) && (!is_xhat))
+ lwp->lwp_nostop--;
+ return (FC_NOMAP);
+ }
+
+ as_lock_held = 1;
+ }
+
+ addrsav = raddr;
+ segsav = seg;
+
+ for (; rsize != 0; rsize -= ssize, raddr += ssize) {
+ if (raddr >= seg->s_base + seg->s_size) {
+ seg = AS_SEGNEXT(as, seg);
+ if (seg == NULL || raddr != seg->s_base) {
+ res = FC_NOMAP;
+ break;
+ }
+ }
+ if (raddr + rsize > seg->s_base + seg->s_size)
+ ssize = seg->s_base + seg->s_size - raddr;
+ else
+ ssize = rsize;
+
+ if (!is_xhat || (seg->s_ops != &segdev_ops)) {
+
+ if (is_xhat && avl_numnodes(&as->a_wpage) != 0 &&
+ pr_is_watchpage_as(raddr, rw, as)) {
+ /*
+ * Handle watch pages. If we're faulting on a
+ * watched page from an X-hat, we have to
+ * restore the original permissions while we
+ * handle the fault.
+ */
+ as_clearwatch(as);
+ holding_wpage = 1;
+ }
+
+ res = SEGOP_FAULT(hat, seg, raddr, ssize, type, rw);
+
+ /* Restore watchpoints */
+ if (holding_wpage) {
+ as_setwatch(as);
+ holding_wpage = 0;
+ }
+
+ if (res != 0)
+ break;
+ } else {
+ /* XHAT does not support seg_dev */
+ res = FC_NOSUPPORT;
+ break;
+ }
+ }
+
+ /*
+ * If we were SOFTLOCKing and encountered a failure,
+ * we must SOFTUNLOCK the range we already did. (Maybe we
+ * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
+ * right here...)
+ */
+ if (res != 0 && type == F_SOFTLOCK) {
+ for (seg = segsav; addrsav < raddr; addrsav += ssize) {
+ if (addrsav >= seg->s_base + seg->s_size)
+ seg = AS_SEGNEXT(as, seg);
+ ASSERT(seg != NULL);
+ /*
+ * Now call the fault routine again to perform the
+ * unlock using S_OTHER instead of the rw variable
+ * since we never got a chance to touch the pages.
+ */
+ if (raddr > seg->s_base + seg->s_size)
+ ssize = seg->s_base + seg->s_size - addrsav;
+ else
+ ssize = raddr - addrsav;
+ (void) SEGOP_FAULT(hat, seg, addrsav, ssize,
+ F_SOFTUNLOCK, S_OTHER);
+ }
+ }
+ if (as_lock_held)
+ AS_LOCK_EXIT(as, &as->a_lock);
+ if ((lwp != NULL) && (!is_xhat))
+ lwp->lwp_nostop--;
+ /*
+ * If the lower levels returned EDEADLK for a fault,
+	 * it means that we should retry the fault. Let's wait
+ * a bit also to let the deadlock causing condition clear.
+ * This is part of a gross hack to work around a design flaw
+ * in the ufs/sds logging code and should go away when the
+ * logging code is re-designed to fix the problem. See bug
+ * 4125102 for details of the problem.
+ */
+ if (FC_ERRNO(res) == EDEADLK) {
+ delay(deadlk_wait);
+ res = 0;
+ goto retry;
+ }
+ return (res);
+}
+
+
+
+/*
+ * Asynchronous ``fault'' at addr for size bytes.
+ */
+faultcode_t
+as_faulta(struct as *as, caddr_t addr, size_t size)
+{
+ struct seg *seg;
+ caddr_t raddr; /* rounded down addr */
+ size_t rsize; /* rounded up size */
+ faultcode_t res = 0;
+ klwp_t *lwp = ttolwp(curthread);
+
+retry:
+ /*
+ * Indicate that the lwp is not to be stopped while waiting
+ * for a pagefault. This is to avoid deadlock while debugging
+ * a process via /proc over NFS (in particular).
+ */
+ if (lwp != NULL)
+ lwp->lwp_nostop++;
+
+ raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
+ rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
+ (size_t)raddr;
+
+ AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
+ seg = as_segat(as, raddr);
+ if (seg == NULL) {
+ AS_LOCK_EXIT(as, &as->a_lock);
+ if (lwp != NULL)
+ lwp->lwp_nostop--;
+ return (FC_NOMAP);
+ }
+
+ for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
+ if (raddr >= seg->s_base + seg->s_size) {
+ seg = AS_SEGNEXT(as, seg);
+ if (seg == NULL || raddr != seg->s_base) {
+ res = FC_NOMAP;
+ break;
+ }
+ }
+ res = SEGOP_FAULTA(seg, raddr);
+ if (res != 0)
+ break;
+ }
+ AS_LOCK_EXIT(as, &as->a_lock);
+ if (lwp != NULL)
+ lwp->lwp_nostop--;
+ /*
+ * If the lower levels returned EDEADLK for a fault,
+	 * it means that we should retry the fault. Let's wait
+ * a bit also to let the deadlock causing condition clear.
+ * This is part of a gross hack to work around a design flaw
+ * in the ufs/sds logging code and should go away when the
+ * logging code is re-designed to fix the problem. See bug
+ * 4125102 for details of the problem.
+ */
+ if (FC_ERRNO(res) == EDEADLK) {
+ delay(deadlk_wait);
+ res = 0;
+ goto retry;
+ }
+ return (res);
+}
+
+/*
+ * Set the virtual mapping for the interval from [addr : addr + size)
+ * in address space `as' to have the specified protection.
+ * It is ok for the range to cross over several segments,
+ * as long as they are contiguous.
+ */
+int
+as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
+{
+ struct seg *seg;
+ struct as_callback *cb;
+ size_t ssize;
+ caddr_t raddr; /* rounded down addr */
+ size_t rsize; /* rounded up size */
+ int error = 0, writer = 0;
+ caddr_t saveraddr;
+ size_t saversize;
+
+setprot_top:
+ raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
+ rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
+ (size_t)raddr;
+
+ if (raddr + rsize < raddr) /* check for wraparound */
+ return (ENOMEM);
+
+ saveraddr = raddr;
+ saversize = rsize;
+
+ /*
+ * Normally we only lock the as as a reader. But
+	 * if due to setprot the segment driver needs to split
+	 * a segment it will return IE_RETRY. Therefore we re-acquire
+	 * the as lock as a writer so the segment driver can change
+	 * the seg list. The segment driver will also return IE_RETRY
+	 * after it has changed the segment list, so we keep
+	 * locking as a writer. Since these operations should be rare,
+	 * we want to only lock as a writer when necessary.
+ */
+ if (writer || avl_numnodes(&as->a_wpage) != 0) {
+ AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
+ } else {
+ AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
+ }
+
+ as_clearwatchprot(as, raddr, rsize);
+ seg = as_segat(as, raddr);
+ if (seg == NULL) {
+ as_setwatch(as);
+ AS_LOCK_EXIT(as, &as->a_lock);
+ return (ENOMEM);
+ }
+
+ for (; rsize != 0; rsize -= ssize, raddr += ssize) {
+ if (raddr >= seg->s_base + seg->s_size) {
+ seg = AS_SEGNEXT(as, seg);
+ if (seg == NULL || raddr != seg->s_base) {
+ error = ENOMEM;
+ break;
+ }
+ }
+ if ((raddr + rsize) > (seg->s_base + seg->s_size))
+ ssize = seg->s_base + seg->s_size - raddr;
+ else
+ ssize = rsize;
+ error = SEGOP_SETPROT(seg, raddr, ssize, prot);
+
+ if (error == IE_NOMEM) {
+ error = EAGAIN;
+ break;
+ }
+
+ if (error == IE_RETRY) {
+ AS_LOCK_EXIT(as, &as->a_lock);
+ writer = 1;
+ goto setprot_top;
+ }
+
+ if (error == EAGAIN) {
+ /*
+ * Make sure we have a_lock as writer.
+ */
+ if (writer == 0) {
+ AS_LOCK_EXIT(as, &as->a_lock);
+ writer = 1;
+ goto setprot_top;
+ }
+
+ /*
+ * Memory is currently locked. It must be unlocked
+ * before this operation can succeed through a retry.
+ * The possible reasons for locked memory and
+ * corresponding strategies for unlocking are:
+ * (1) Normal I/O
+ * wait for a signal that the I/O operation
+ * has completed and the memory is unlocked.
+ * (2) Asynchronous I/O
+ * The aio subsystem does not unlock pages when
+ * the I/O is completed. Those pages are unlocked
+ * when the application calls aiowait/aioerror.
+ * So, to prevent blocking forever, cv_broadcast()
+ * is done to wake up aio_cleanup_thread.
+ * Subsequently, segvn_reclaim will be called, and
+ * that will do AS_CLRUNMAPWAIT() and wake us up.
+ * (3) Long term page locking:
+ * Drivers intending to have pages locked for a
+ * period considerably longer than for normal I/O
+ * (essentially forever) may have registered for a
+ * callback so they may unlock these pages on
+ * request. This is needed to allow this operation
+ * to succeed. Each entry on the callback list is
+ * examined. If the event or address range pertains
+ * the callback is invoked (unless it already is in
+ * progress). The a_contents lock must be dropped
+ * before the callback, so only one callback can
+ * be done at a time. Go to the top and do more
+ * until zero is returned. If zero is returned,
+ * either there were no callbacks for this event
+ * or they were already in progress.
+ */
+ mutex_enter(&as->a_contents);
+ if (as->a_callbacks &&
+ (cb = as_find_callback(as, AS_SETPROT_EVENT,
+ seg->s_base, seg->s_size))) {
+ AS_LOCK_EXIT(as, &as->a_lock);
+ as_execute_callback(as, cb, AS_SETPROT_EVENT);
+ } else {
+ if (AS_ISUNMAPWAIT(as) == 0)
+ cv_broadcast(&as->a_cv);
+ AS_SETUNMAPWAIT(as);
+ AS_LOCK_EXIT(as, &as->a_lock);
+ while (AS_ISUNMAPWAIT(as))
+ cv_wait(&as->a_cv, &as->a_contents);
+ }
+ mutex_exit(&as->a_contents);
+ goto setprot_top;
+ } else if (error != 0)
+ break;
+ }
+ if (error != 0) {
+ as_setwatch(as);
+ } else {
+ as_setwatchprot(as, saveraddr, saversize, prot);
+ }
+ AS_LOCK_EXIT(as, &as->a_lock);
+ return (error);
+}
+
+/*
+ * Check to make sure that the interval [addr, addr + size)
+ * in address space `as' has at least the specified protection.
+ * It is ok for the range to cross over several segments, as long
+ * as they are contiguous.
+ */
+int
+as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
+{
+ struct seg *seg;
+ size_t ssize;
+ caddr_t raddr; /* rounded down addr */
+ size_t rsize; /* rounded up size */
+ int error = 0;
+
+ raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
+ rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
+ (size_t)raddr;
+
+ if (raddr + rsize < raddr) /* check for wraparound */
+ return (ENOMEM);
+
+ /*
+ * This is ugly as sin...
+ * Normally, we only acquire the address space readers lock.
+ * However, if the address space has watchpoints present,
+ * we must acquire the writer lock on the address space for
+ * the benefit of as_clearwatchprot() and as_setwatchprot().
+ */
+ if (avl_numnodes(&as->a_wpage) != 0)
+ AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
+ else
+ AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
+ as_clearwatchprot(as, raddr, rsize);
+ seg = as_segat(as, raddr);
+ if (seg == NULL) {
+ as_setwatch(as);
+ AS_LOCK_EXIT(as, &as->a_lock);
+ return (ENOMEM);
+ }
+
+ for (; rsize != 0; rsize -= ssize, raddr += ssize) {
+ if (raddr >= seg->s_base + seg->s_size) {
+ seg = AS_SEGNEXT(as, seg);
+ if (seg == NULL || raddr != seg->s_base) {
+ error = ENOMEM;
+ break;
+ }
+ }
+ if ((raddr + rsize) > (seg->s_base + seg->s_size))
+ ssize = seg->s_base + seg->s_size - raddr;
+ else
+ ssize = rsize;
+
+ error = SEGOP_CHECKPROT(seg, raddr, ssize, prot);
+ if (error != 0)
+ break;
+ }
+ as_setwatch(as);
+ AS_LOCK_EXIT(as, &as->a_lock);
+ return (error);
+}
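+
+/*
+ * Illustrative sketch (hypothetical, not from the original file): a caller
+ * that wants to fail early rather than fault later can verify access to a
+ * user range up front.
+ */
+#ifdef	AS_EXAMPLES
+static int
+example_verify_readable(struct as *as, caddr_t uaddr, size_t len)
+{
+	if (as_checkprot(as, uaddr, len, PROT_READ) != 0)
+		return (EFAULT);
+	return (0);
+}
+#endif	/* AS_EXAMPLES */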
+
+int
+as_unmap(struct as *as, caddr_t addr, size_t size)
+{
+ struct seg *seg, *seg_next;
+ struct as_callback *cb;
+ caddr_t raddr, eaddr;
+ size_t ssize;
+ int err;
+
+top:
+ raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
+ eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
+ (uintptr_t)PAGEMASK);
+
+ AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
+
+ as->a_updatedir = 1; /* inform /proc */
+ gethrestime(&as->a_updatetime);
+
+ /*
+ * Use as_findseg to find the first segment in the range, then
+	 * step through the segments in address order.
+ */
+ as_clearwatchprot(as, raddr, eaddr - raddr);
+
+ for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
+ if (eaddr <= seg->s_base)
+ break; /* eaddr was in a gap; all done */
+
+ /* this is implied by the test above */
+ ASSERT(raddr < eaddr);
+
+ if (raddr < seg->s_base)
+ raddr = seg->s_base; /* raddr was in a gap */
+
+ if (eaddr > (seg->s_base + seg->s_size))
+ ssize = seg->s_base + seg->s_size - raddr;
+ else
+ ssize = eaddr - raddr;
+
+ /*
+ * Save next segment pointer since seg can be
+ * destroyed during the segment unmap operation.
+ */
+ seg_next = AS_SEGNEXT(as, seg);
+
+ err = SEGOP_UNMAP(seg, raddr, ssize);
+ if (err == EAGAIN) {
+ /*
+ * Memory is currently locked. It must be unlocked
+ * before this operation can succeed through a retry.
+ * The possible reasons for locked memory and
+ * corresponding strategies for unlocking are:
+ * (1) Normal I/O
+ * wait for a signal that the I/O operation
+ * has completed and the memory is unlocked.
+ * (2) Asynchronous I/O
+ * The aio subsystem does not unlock pages when
+ * the I/O is completed. Those pages are unlocked
+ * when the application calls aiowait/aioerror.
+ * So, to prevent blocking forever, cv_broadcast()
+ * is done to wake up aio_cleanup_thread.
+ * Subsequently, segvn_reclaim will be called, and
+ * that will do AS_CLRUNMAPWAIT() and wake us up.
+ * (3) Long term page locking:
+ * Drivers intending to have pages locked for a
+ * period considerably longer than for normal I/O
+ * (essentially forever) may have registered for a
+ * callback so they may unlock these pages on
+ * request. This is needed to allow this operation
+ * to succeed. Each entry on the callback list is
+ * examined. If the event or address range pertains
+ * the callback is invoked (unless it already is in
+ * progress). The a_contents lock must be dropped
+ * before the callback, so only one callback can
+ * be done at a time. Go to the top and do more
+ * until zero is returned. If zero is returned,
+ * either there were no callbacks for this event
+ * or they were already in progress.
+ */
+ as_setwatch(as);
+ mutex_enter(&as->a_contents);
+ if (as->a_callbacks &&
+ (cb = as_find_callback(as, AS_UNMAP_EVENT,
+ seg->s_base, seg->s_size))) {
+ AS_LOCK_EXIT(as, &as->a_lock);
+ as_execute_callback(as, cb, AS_UNMAP_EVENT);
+ } else {
+ if (AS_ISUNMAPWAIT(as) == 0)
+ cv_broadcast(&as->a_cv);
+ AS_SETUNMAPWAIT(as);
+ AS_LOCK_EXIT(as, &as->a_lock);
+ while (AS_ISUNMAPWAIT(as))
+ cv_wait(&as->a_cv, &as->a_contents);
+ }
+ mutex_exit(&as->a_contents);
+ goto top;
+ } else if (err == IE_RETRY) {
+ as_setwatch(as);
+ AS_LOCK_EXIT(as, &as->a_lock);
+ goto top;
+ } else if (err) {
+ as_setwatch(as);
+ AS_LOCK_EXIT(as, &as->a_lock);
+ return (-1);
+ }
+
+ as->a_size -= ssize;
+ raddr += ssize;
+ }
+ AS_LOCK_EXIT(as, &as->a_lock);
+ return (0);
+}
+
+static int
+as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
+ int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
+{
+ int text = vn_a->flags & MAP_TEXT;
+ uint_t szcvec = map_execseg_pgszcvec(text, addr, size);
+ uint_t szc;
+ uint_t nszc;
+ int error;
+ caddr_t a;
+ caddr_t eaddr;
+ size_t segsize;
+ struct seg *seg;
+ uint_t save_szcvec;
+ size_t pgsz;
+ struct vattr va;
+ u_offset_t eoff;
+ size_t save_size = 0;
+
+ ASSERT(AS_WRITE_HELD(as, &as->a_lock));
+ ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
+ ASSERT(IS_P2ALIGNED(size, PAGESIZE));
+ ASSERT(vn_a->vp != NULL);
+ ASSERT(vn_a->amp == NULL);
+
+again:
+ if (szcvec <= 1) {
+ seg = seg_alloc(as, addr, size);
+ if (seg == NULL) {
+ return (ENOMEM);
+ }
+ vn_a->szc = 0;
+ error = (*crfp)(seg, vn_a);
+ if (error != 0) {
+ seg_free(seg);
+ }
+ return (error);
+ }
+
+ va.va_mask = AT_SIZE;
+ if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred) != 0) {
+ szcvec = 0;
+ goto again;
+ }
+ eoff = vn_a->offset & PAGEMASK;
+ if (eoff >= va.va_size) {
+ szcvec = 0;
+ goto again;
+ }
+ eoff += size;
+ if (btopr(va.va_size) < btopr(eoff)) {
+ save_size = size;
+ size = va.va_size - (vn_a->offset & PAGEMASK);
+ size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
+ szcvec = map_execseg_pgszcvec(text, addr, size);
+ if (szcvec <= 1) {
+ size = save_size;
+ goto again;
+ }
+ }
+
+ eaddr = addr + size;
+ save_szcvec = szcvec;
+ szcvec >>= 1;
+ szc = 0;
+ nszc = 0;
+ while (szcvec) {
+ if ((szcvec & 0x1) == 0) {
+ nszc++;
+ szcvec >>= 1;
+ continue;
+ }
+ nszc++;
+ pgsz = page_get_pagesize(nszc);
+ a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
+ if (a != addr) {
+ ASSERT(a < eaddr);
+ segsize = a - addr;
+ seg = seg_alloc(as, addr, segsize);
+ if (seg == NULL) {
+ return (ENOMEM);
+ }
+ vn_a->szc = szc;
+ error = (*crfp)(seg, vn_a);
+ if (error != 0) {
+ seg_free(seg);
+ return (error);
+ }
+ *segcreated = 1;
+ vn_a->offset += segsize;
+ addr = a;
+ }
+ szc = nszc;
+ szcvec >>= 1;
+ }
+
+ ASSERT(addr < eaddr);
+ szcvec = save_szcvec | 1; /* add 8K pages */
+ while (szcvec) {
+ a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
+ ASSERT(a >= addr);
+ if (a != addr) {
+ segsize = a - addr;
+ seg = seg_alloc(as, addr, segsize);
+ if (seg == NULL) {
+ return (ENOMEM);
+ }
+ vn_a->szc = szc;
+ error = (*crfp)(seg, vn_a);
+ if (error != 0) {
+ seg_free(seg);
+ return (error);
+ }
+ *segcreated = 1;
+ vn_a->offset += segsize;
+ addr = a;
+ }
+ szcvec &= ~(1 << szc);
+ if (szcvec) {
+ szc = highbit(szcvec) - 1;
+ pgsz = page_get_pagesize(szc);
+ }
+ }
+ ASSERT(addr == eaddr);
+
+ if (save_size) {
+ size = save_size - size;
+ goto again;
+ }
+
+ return (0);
+}
+
+int
+as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp)
+{
+ struct seg *seg = NULL;
+ caddr_t raddr; /* rounded down addr */
+ size_t rsize; /* rounded up size */
+ int error;
+ struct proc *p = curproc;
+
+ raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
+ rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
+ (size_t)raddr;
+
+ AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
+
+ /*
+ * check for wrap around
+ */
+ if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
+ AS_LOCK_EXIT(as, &as->a_lock);
+ return (ENOMEM);
+ }
+
+ as->a_updatedir = 1; /* inform /proc */
+ gethrestime(&as->a_updatetime);
+
+ if (as != &kas && as->a_size + rsize > (size_t)p->p_vmem_ctl) {
+ AS_LOCK_EXIT(as, &as->a_lock);
+
+ (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
+ RCA_UNSAFE_ALL);
+
+ return (ENOMEM);
+ }
+
+ if (AS_MAP_VNSEGS_USELPGS(crfp, argsp)) {
+ int unmap = 0;
+ error = as_map_vnsegs(as, raddr, rsize, crfp,
+ (struct segvn_crargs *)argsp, &unmap);
+ if (error != 0) {
+ AS_LOCK_EXIT(as, &as->a_lock);
+ if (unmap) {
+ (void) as_unmap(as, addr, size);
+ }
+ return (error);
+ }
+ } else {
+ seg = seg_alloc(as, addr, size);
+ if (seg == NULL) {
+ AS_LOCK_EXIT(as, &as->a_lock);
+ return (ENOMEM);
+ }
+
+ error = (*crfp)(seg, argsp);
+ if (error != 0) {
+ seg_free(seg);
+ AS_LOCK_EXIT(as, &as->a_lock);
+ return (error);
+ }
+ }
+
+ /*
+ * Add size now so as_unmap will work if as_ctl fails.
+ */
+ as->a_size += rsize;
+
+ as_setwatch(as);
+
+ /*
+ * If the address space is locked,
+ * establish memory locks for the new segment.
+ */
+ mutex_enter(&as->a_contents);
+ if (AS_ISPGLCK(as)) {
+ mutex_exit(&as->a_contents);
+ AS_LOCK_EXIT(as, &as->a_lock);
+ error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
+ if (error != 0)
+ (void) as_unmap(as, addr, size);
+ } else {
+ mutex_exit(&as->a_contents);
+ AS_LOCK_EXIT(as, &as->a_lock);
+ }
+ return (error);
+}
+
+
+/*
+ * Delete all segments in the address space marked with S_PURGE.
+ * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
+ * These segments are deleted as a first step before calls to as_gap(), so
+ * that they don't affect mmap() or shmat().
+ */
+void
+as_purge(struct as *as)
+{
+ struct seg *seg;
+ struct seg *next_seg;
+
+ /*
+	 * the setting of NEEDSPURGE is protected by as_rangelock(), so
+	 * there is no need to grab the a_contents mutex for this check
+ */
+ if ((as->a_flags & AS_NEEDSPURGE) == 0)
+ return;
+
+ AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
+ next_seg = NULL;
+ seg = AS_SEGFIRST(as);
+ while (seg != NULL) {
+ next_seg = AS_SEGNEXT(as, seg);
+ if (seg->s_flags & S_PURGE)
+ SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
+ seg = next_seg;
+ }
+ AS_LOCK_EXIT(as, &as->a_lock);
+
+ mutex_enter(&as->a_contents);
+ as->a_flags &= ~AS_NEEDSPURGE;
+ mutex_exit(&as->a_contents);
+}
+
+/*
+ * Find a hole of at least size minlen within [base, base + len).
+ *
+ * If flags specifies AH_HI, the hole will have the highest possible address
+ * in the range. We use the as->a_lastgap field to figure out where to
+ * start looking for a gap.
+ *
+ * Otherwise, the gap will have the lowest possible address.
+ *
+ * If flags specifies AH_CONTAIN, the hole will contain the address addr.
+ *
+ * If an adequate hole is found, base and len are set to reflect the part of
+ * the hole that is within range, and 0 is returned; otherwise,
+ * -1 is returned.
+ *
+ * NOTE: This routine is not correct when base+len overflows caddr_t.
+ */
+int
+as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
+ caddr_t addr)
+{
+ caddr_t lobound = *basep;
+ caddr_t hibound = lobound + *lenp;
+ struct seg *lseg, *hseg;
+ caddr_t lo, hi;
+ int forward;
+ caddr_t save_base;
+ size_t save_len;
+
+ save_base = *basep;
+ save_len = *lenp;
+ AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
+ if (AS_SEGFIRST(as) == NULL) {
+ if (valid_va_range(basep, lenp, minlen, flags & AH_DIR)) {
+ AS_LOCK_EXIT(as, &as->a_lock);
+ return (0);
+ } else {
+ AS_LOCK_EXIT(as, &as->a_lock);
+ *basep = save_base;
+ *lenp = save_len;
+ return (-1);
+ }
+ }
+
+ /*
+ * Set up to iterate over all the inter-segment holes in the given
+ * direction. lseg is NULL for the lowest-addressed hole and hseg is
+ * NULL for the highest-addressed hole. If moving backwards, we reset
+	 * hseg to denote the highest-addressed segment.
+ */
+ forward = (flags & AH_DIR) == AH_LO;
+ if (forward) {
+ hseg = as_findseg(as, lobound, 1);
+ lseg = AS_SEGPREV(as, hseg);
+ } else {
+
+ /*
+ * If allocating at least as much as the last allocation,
+ * use a_lastgap's base as a better estimate of hibound.
+ */
+ if (as->a_lastgap &&
+ minlen >= as->a_lastgap->s_size &&
+ hibound >= as->a_lastgap->s_base)
+ hibound = as->a_lastgap->s_base;
+
+ hseg = as_findseg(as, hibound, 1);
+ if (hseg->s_base + hseg->s_size < hibound) {
+ lseg = hseg;
+ hseg = NULL;
+ } else {
+ lseg = AS_SEGPREV(as, hseg);
+ }
+ }
+
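+	/*
+	 * Walk the holes between consecutive segments. On each pass, lseg
+	 * bounds the candidate hole from below and hseg bounds it from
+	 * above; a NULL lseg or hseg denotes the corresponding end of the
+	 * address space.
+	 */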
+ for (;;) {
+ /*
+ * Set lo and hi to the hole's boundaries. (We should really
+ * use MAXADDR in place of hibound in the expression below,
+ * but can't express it easily; using hibound in its place is
+ * harmless.)
+ */
+ lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
+ hi = (hseg == NULL) ? hibound : hseg->s_base;
+ /*
+ * If the iteration has moved past the interval from lobound
+ * to hibound it's pointless to continue.
+ */
+ if ((forward && lo > hibound) || (!forward && hi < lobound))
+ break;
+ else if (lo > hibound || hi < lobound)
+ goto cont;
+ /*
+ * Candidate hole lies at least partially within the allowable
+ * range. Restrict it to fall completely within that range,
+ * i.e., to [max(lo, lobound), min(hi, hibound)].
+ */
+ if (lo < lobound)
+ lo = lobound;
+ if (hi > hibound)
+ hi = hibound;
+ /*
+ * Verify that the candidate hole is big enough and meets
+ * hardware constraints.
+ */
+ *basep = lo;
+ *lenp = hi - lo;
+ if (valid_va_range(basep, lenp, minlen,
+ forward ? AH_LO : AH_HI) &&
+ ((flags & AH_CONTAIN) == 0 ||
+ (*basep <= addr && *basep + *lenp > addr))) {
+ if (!forward)
+ as->a_lastgap = hseg;
+ if (hseg != NULL)
+ as->a_lastgaphl = hseg;
+ else
+ as->a_lastgaphl = lseg;
+ AS_LOCK_EXIT(as, &as->a_lock);
+ return (0);
+ }
+ cont:
+ /*
+ * Move to the next hole.
+ */
+ if (forward) {
+ lseg = hseg;
+ if (lseg == NULL)
+ break;
+ hseg = AS_SEGNEXT(as, hseg);
+ } else {
+ hseg = lseg;
+ if (hseg == NULL)
+ break;
+ lseg = AS_SEGPREV(as, lseg);
+ }
+ }
+ *basep = save_base;
+ *lenp = save_len;
+ AS_LOCK_EXIT(as, &as->a_lock);
+ return (-1);
+}
+
+/*
+ * Return the next range within [base, base + len) that is backed
+ * with "real memory". Skip holes and non-seg_vn segments.
+ * We're lazy and only return one segment at a time.
+ */
+int
+as_memory(struct as *as, caddr_t *basep, size_t *lenp)
+{
+ extern struct seg_ops segspt_shmops; /* needs a header file */
+ struct seg *seg;
+ caddr_t addr, eaddr;
+ caddr_t segend;
+
+ AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
+
+ addr = *basep;
+ eaddr = addr + *lenp;
+
+ seg = as_findseg(as, addr, 0);
+ if (seg != NULL)
+ addr = MAX(seg->s_base, addr);
+
+ for (;;) {
+ if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {
+ AS_LOCK_EXIT(as, &as->a_lock);
+ return (EINVAL);
+ }
+
+ if (seg->s_ops == &segvn_ops) {
+ segend = seg->s_base + seg->s_size;
+ break;
+ }
+
+ /*
+ * We do ISM by looking into the private data
+ * to determine the real size of the segment.
+ */
+ if (seg->s_ops == &segspt_shmops) {
+ segend = seg->s_base + spt_realsize(seg);
+ if (addr < segend)
+ break;
+ }
+
+ seg = AS_SEGNEXT(as, seg);
+
+ if (seg != NULL)
+ addr = seg->s_base;
+ }
+
+ *basep = addr;
+
+ if (segend > eaddr)
+ *lenp = eaddr - addr;
+ else
+ *lenp = segend - addr;
+
+ AS_LOCK_EXIT(as, &as->a_lock);
+ return (0);
+}
+
+/*
+ * Swap the pages associated with the address space as out to
+ * secondary storage, returning the number of bytes actually
+ * swapped.
+ *
+ * The value returned is intended to correlate well with the process's
+ * memory requirements. Its usefulness for this purpose depends on
+ * how well the segment-level routines do at returning accurate
+ * information.
+ */
+size_t
+as_swapout(struct as *as)
+{
+ struct seg *seg;
+ size_t swpcnt = 0;
+
+ /*
+ * Kernel-only processes have given up their address
+ * spaces. Of course, we shouldn't be attempting to
+ * swap out such processes in the first place...
+ */
+ if (as == NULL)
+ return (0);
+
+ AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
+
+ /* Prevent XHATs from attaching */
+ mutex_enter(&as->a_contents);
+ AS_SETBUSY(as);
+ mutex_exit(&as->a_contents);
+
+
+ /*
+ * Free all mapping resources associated with the address
+ * space. The segment-level swapout routines capitalize
+	 * on this unmapping by scavenging pages that have become
+ * unmapped here.
+ */
+ hat_swapout(as->a_hat);
+ if (as->a_xhat != NULL)
+ xhat_swapout_all(as);
+
+ mutex_enter(&as->a_contents);
+ AS_CLRBUSY(as);
+ mutex_exit(&as->a_contents);
+
+ /*
+ * Call the swapout routines of all segments in the address
+ * space to do the actual work, accumulating the amount of
+ * space reclaimed.
+ */
+ for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
+ struct seg_ops *ov = seg->s_ops;
+
+ /*
+ * We have to check to see if the seg has
+ * an ops vector because the seg may have
+ * been in the middle of being set up when
+ * the process was picked for swapout.
+ */
+ if ((ov != NULL) && (ov->swapout != NULL))
+ swpcnt += SEGOP_SWAPOUT(seg);
+ }
+ AS_LOCK_EXIT(as, &as->a_lock);
+ return (swpcnt);
+}
+
+/*
+ * Determine whether data from the mappings in interval [addr, addr + size)
+ * are in the primary memory (core) cache.
+ */
+int
+as_incore(struct as *as, caddr_t addr,
+ size_t size, char *vec, size_t *sizep)
+{
+ struct seg *seg;
+ size_t ssize;
+ caddr_t raddr; /* rounded down addr */
+ size_t rsize; /* rounded up size */
+ size_t isize; /* iteration size */
+ int error = 0; /* result, assume success */
+
+ *sizep = 0;
+ raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
+ rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
+ (size_t)raddr;
+
+ if (raddr + rsize < raddr) /* check for wraparound */
+ return (ENOMEM);
+
+ AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
+ seg = as_segat(as, raddr);
+ if (seg == NULL) {
+ AS_LOCK_EXIT(as, &as->a_lock);
+ return (-1);
+ }
+
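+	/*
+	 * Walk the segments covering [raddr, raddr + rsize). ssize is the
+	 * portion of the request that falls within the current segment;
+	 * a hole or a short SEGOP_INCORE() answer ends the walk with -1.
+	 */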
+ for (; rsize != 0; rsize -= ssize, raddr += ssize) {
+ if (raddr >= seg->s_base + seg->s_size) {
+ seg = AS_SEGNEXT(as, seg);
+ if (seg == NULL || raddr != seg->s_base) {
+ error = -1;
+ break;
+ }
+ }
+ if ((raddr + rsize) > (seg->s_base + seg->s_size))
+ ssize = seg->s_base + seg->s_size - raddr;
+ else
+ ssize = rsize;
+ *sizep += isize = SEGOP_INCORE(seg, raddr, ssize, vec);
+ if (isize != ssize) {
+ error = -1;
+ break;
+ }
+ vec += btopr(ssize);
+ }
+ AS_LOCK_EXIT(as, &as->a_lock);
+ return (error);
+}
+
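+/*
+ * Unlock the pages recorded in the given lock bitmap. Each run of set
+ * bits describes a run of pages, offset from "addr" by the run's bit
+ * position relative to "position", that was locked by an earlier
+ * MC_LOCK and is unlocked here via SEGOP_LOCKOP(..., MC_UNLOCK, ...).
+ */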
+static void
+as_segunlock(struct seg *seg, caddr_t addr, int attr,
+ ulong_t *bitmap, size_t position, size_t npages)
+{
+ caddr_t range_start;
+ size_t pos1 = position;
+ size_t pos2;
+ size_t size;
+ size_t end_pos = npages + position;
+
+ while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
+ size = ptob((pos2 - pos1));
+ range_start = (caddr_t)((uintptr_t)addr +
+ ptob(pos1 - position));
+
+ (void) SEGOP_LOCKOP(seg, range_start, size, attr, MC_UNLOCK,
+ (ulong_t *)NULL, (size_t)NULL);
+ pos1 = pos2;
+ }
+}
+
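+/*
+ * Back out a partially completed MC_LOCK request: walk the segments
+ * covering [raddr, raddr + rsize) and unlock the pages marked in
+ * mlock_map for each of them.
+ */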
+static void
+as_unlockerr(struct as *as, int attr, ulong_t *mlock_map,
+ caddr_t raddr, size_t rsize)
+{
+ struct seg *seg = as_segat(as, raddr);
+ size_t ssize;
+
+ while (rsize != 0) {
+ if (raddr >= seg->s_base + seg->s_size)
+ seg = AS_SEGNEXT(as, seg);
+
+ if ((raddr + rsize) > (seg->s_base + seg->s_size))
+ ssize = seg->s_base + seg->s_size - raddr;
+ else
+ ssize = rsize;
+
+ as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize));
+
+ rsize -= ssize;
+ raddr += ssize;
+ }
+}
+
+/*
+ * Cache control operations over the interval [addr, addr + size) in
+ * address space "as".
+ */
+/*ARGSUSED*/
+int
+as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr,
+ uintptr_t arg, ulong_t *lock_map, size_t pos)
+{
+ struct seg *seg; /* working segment */
+ caddr_t raddr; /* rounded down addr */
+ caddr_t initraddr; /* saved initial rounded down addr */
+ size_t rsize; /* rounded up size */
+ size_t initrsize; /* saved initial rounded up size */
+ size_t ssize; /* size of seg */
+ int error = 0; /* result */
+ size_t mlock_size; /* size of bitmap */
+ ulong_t *mlock_map; /* pointer to bitmap used */
+ /* to represent the locked */
+ /* pages. */
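+
+	/*
+	 * When a segment driver returns IE_RETRY we come back here and
+	 * retake a_lock as a writer; the EDEADLK retry path below clears
+	 * error first, so it retakes the lock as a reader.
+	 */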
+retry:
+ if (error == IE_RETRY)
+ AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
+ else
+ AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
+
+ /*
+ * If these are address space lock/unlock operations, loop over
+ * all segments in the address space, as appropriate.
+ */
+ if (func == MC_LOCKAS) {
+ size_t npages, idx;
+ size_t rlen = 0; /* rounded as length */
+
+ idx = pos;
+
+ if (arg & MCL_FUTURE) {
+ mutex_enter(&as->a_contents);
+ AS_SETPGLCK(as);
+ mutex_exit(&as->a_contents);
+ }
+ if ((arg & MCL_CURRENT) == 0) {
+ AS_LOCK_EXIT(as, &as->a_lock);
+ return (0);
+ }
+
+ seg = AS_SEGFIRST(as);
+ if (seg == NULL) {
+ AS_LOCK_EXIT(as, &as->a_lock);
+ return (0);
+ }
+
+ do {
+ raddr = (caddr_t)((uintptr_t)seg->s_base &
+ (uintptr_t)PAGEMASK);
+ rlen += (((uintptr_t)(seg->s_base + seg->s_size) +
+ PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr;
+ } while ((seg = AS_SEGNEXT(as, seg)) != NULL);
+
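+		/*
+		 * Allocate a bitmap with one bit per page in the address
+		 * space; the segment lock operations record which pages
+		 * they lock here so that a failure can be backed out.
+		 */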
+ mlock_size = BT_BITOUL(btopr(rlen));
+ if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
+ sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
+ AS_LOCK_EXIT(as, &as->a_lock);
+ return (EAGAIN);
+ }
+
+ for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
+ error = SEGOP_LOCKOP(seg, seg->s_base,
+ seg->s_size, attr, MC_LOCK, mlock_map, pos);
+ if (error != 0)
+ break;
+ pos += seg_pages(seg);
+ }
+
+ if (error) {
+ for (seg = AS_SEGFIRST(as); seg != NULL;
+ seg = AS_SEGNEXT(as, seg)) {
+
+ raddr = (caddr_t)((uintptr_t)seg->s_base &
+ (uintptr_t)PAGEMASK);
+ npages = seg_pages(seg);
+ as_segunlock(seg, raddr, attr, mlock_map,
+ idx, npages);
+ idx += npages;
+ }
+ }
+
+ kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
+ AS_LOCK_EXIT(as, &as->a_lock);
+ goto lockerr;
+ } else if (func == MC_UNLOCKAS) {
+ mutex_enter(&as->a_contents);
+ AS_CLRPGLCK(as);
+ mutex_exit(&as->a_contents);
+
+ for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
+ error = SEGOP_LOCKOP(seg, seg->s_base,
+ seg->s_size, attr, MC_UNLOCK, NULL, 0);
+ if (error != 0)
+ break;
+ }
+
+ AS_LOCK_EXIT(as, &as->a_lock);
+ goto lockerr;
+ }
+
+ /*
+ * Normalize addresses and sizes.
+ */
+ initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
+ initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
+ (size_t)raddr;
+
+ if (raddr + rsize < raddr) { /* check for wraparound */
+ AS_LOCK_EXIT(as, &as->a_lock);
+ return (ENOMEM);
+ }
+
+ /*
+ * Get initial segment.
+ */
+ if ((seg = as_segat(as, raddr)) == NULL) {
+ AS_LOCK_EXIT(as, &as->a_lock);
+ return (ENOMEM);
+ }
+
+ if (func == MC_LOCK) {
+ mlock_size = BT_BITOUL(btopr(rsize));
+ if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
+ sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
+ AS_LOCK_EXIT(as, &as->a_lock);
+ return (EAGAIN);
+ }
+ }
+
+ /*
+ * Loop over all segments. If a hole in the address range is
+ * discovered, then fail. For each segment, perform the appropriate
+ * control operation.
+ */
+ while (rsize != 0) {
+
+ /*
+ * Make sure there's no hole, calculate the portion
+ * of the next segment to be operated over.
+ */
+ if (raddr >= seg->s_base + seg->s_size) {
+ seg = AS_SEGNEXT(as, seg);
+ if (seg == NULL || raddr != seg->s_base) {
+ if (func == MC_LOCK) {
+ as_unlockerr(as, attr, mlock_map,
+ initraddr, initrsize - rsize);
+ kmem_free(mlock_map,
+ mlock_size * sizeof (ulong_t));
+ }
+ AS_LOCK_EXIT(as, &as->a_lock);
+ return (ENOMEM);
+ }
+ }
+ if ((raddr + rsize) > (seg->s_base + seg->s_size))
+ ssize = seg->s_base + seg->s_size - raddr;
+ else
+ ssize = rsize;
+
+ /*
+ * Dispatch on specific function.
+ */
+ switch (func) {
+
+ /*
+ * Synchronize cached data from mappings with backing
+ * objects.
+ */
+ case MC_SYNC:
+ if (error = SEGOP_SYNC(seg, raddr, ssize,
+ attr, (uint_t)arg)) {
+ AS_LOCK_EXIT(as, &as->a_lock);
+ return (error);
+ }
+ break;
+
+ /*
+ * Lock pages in memory.
+ */
+ case MC_LOCK:
+ if (error = SEGOP_LOCKOP(seg, raddr, ssize,
+ attr, func, mlock_map, pos)) {
+ as_unlockerr(as, attr, mlock_map, initraddr,
+ initrsize - rsize + ssize);
+ kmem_free(mlock_map, mlock_size *
+ sizeof (ulong_t));
+ AS_LOCK_EXIT(as, &as->a_lock);
+ goto lockerr;
+ }
+ break;
+
+ /*
+ * Unlock mapped pages.
+ */
+ case MC_UNLOCK:
+ (void) SEGOP_LOCKOP(seg, raddr, ssize, attr, func,
+ (ulong_t *)NULL, (size_t)NULL);
+ break;
+
+ /*
+ * Store VM advise for mapped pages in segment layer.
+ */
+ case MC_ADVISE:
+ error = SEGOP_ADVISE(seg, raddr, ssize, (uint_t)arg);
+
+ /*
+ * Check for regular errors and special retry error
+ */
+ if (error) {
+ if (error == IE_RETRY) {
+ /*
+ * Need to acquire writers lock, so
+ * have to drop readers lock and start
+ * all over again
+ */
+ AS_LOCK_EXIT(as, &as->a_lock);
+ goto retry;
+ } else if (error == IE_REATTACH) {
+ /*
+ * Find segment for current address
+ * because current segment just got
+ * split or concatenated
+ */
+ seg = as_segat(as, raddr);
+ if (seg == NULL) {
+ AS_LOCK_EXIT(as, &as->a_lock);
+ return (ENOMEM);
+ }
+ } else {
+ /*
+ * Regular error
+ */
+ AS_LOCK_EXIT(as, &as->a_lock);
+ return (error);
+ }
+ }
+ break;
+
+ /*
+ * Can't happen.
+ */
+ default:
+ panic("as_ctl: bad operation %d", func);
+ /*NOTREACHED*/
+ }
+
+ rsize -= ssize;
+ raddr += ssize;
+ }
+
+ if (func == MC_LOCK)
+ kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
+ AS_LOCK_EXIT(as, &as->a_lock);
+ return (0);
+lockerr:
+
+ /*
+ * If the lower levels returned EDEADLK for a segment lockop,
+ * it means that we should retry the operation. Let's wait
+ * a bit also to let the deadlock causing condition clear.
+ * This is part of a gross hack to work around a design flaw
+ * in the ufs/sds logging code and should go away when the
+ * logging code is re-designed to fix the problem. See bug
+ * 4125102 for details of the problem.
+ */
+ if (error == EDEADLK) {
+ delay(deadlk_wait);
+ error = 0;
+ goto retry;
+ }
+ return (error);
+}
+
+/*
+ * Special code for exec to move the stack segment from its interim
+ * place in the old address space to the right place in the new address space.
+ */
+/*ARGSUSED*/
+int
+as_exec(struct as *oas, caddr_t ostka, size_t stksz,
+ struct as *nas, caddr_t nstka, uint_t hatflag)
+{
+ struct seg *stkseg;
+
+ AS_LOCK_ENTER(oas, &oas->a_lock, RW_WRITER);
+ stkseg = as_segat(oas, ostka);
+ stkseg = as_removeseg(oas, stkseg);
+ ASSERT(stkseg != NULL);
+ ASSERT(stkseg->s_base == ostka && stkseg->s_size == stksz);
+ stkseg->s_as = nas;
+ stkseg->s_base = nstka;
+
+ /*
+ * It's ok to lock the address space we are about to exec to.
+ */
+ AS_LOCK_ENTER(nas, &nas->a_lock, RW_WRITER);
+ ASSERT(avl_numnodes(&nas->a_wpage) == 0);
+ nas->a_size += stkseg->s_size;
+ oas->a_size -= stkseg->s_size;
+ (void) as_addseg(nas, stkseg);
+ AS_LOCK_EXIT(nas, &nas->a_lock);
+ AS_LOCK_EXIT(oas, &oas->a_lock);
+ return (0);
+}
+
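+/*
+ * Translate a fault code from as_fault() into an errno value.
+ */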
+static int
+f_decode(faultcode_t fault_err)
+{
+ int error = 0;
+
+ switch (FC_CODE(fault_err)) {
+ case FC_OBJERR:
+ error = FC_ERRNO(fault_err);
+ break;
+ case FC_PROT:
+ error = EACCES;
+ break;
+ default:
+ error = EFAULT;
+ break;
+ }
+ return (error);
+}
+
+/*
+ * lock pages in a given address space. Return shadow list. If
+ * the list is NULL, the MMU mapping is also locked.
+ */
+int
+as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
+ size_t size, enum seg_rw rw)
+{
+ size_t rsize;
+ caddr_t base;
+ caddr_t raddr;
+ faultcode_t fault_err;
+ struct seg *seg;
+ int res;
+ int prefaulted = 0;
+
+ TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START,
+ "as_pagelock_start: addr %p size %ld", addr, size);
+
+ raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
+ rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
+ (size_t)raddr;
+top:
+ /*
+ * if the request crosses two segments let
+ * as_fault handle it.
+ */
+ AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
+ seg = as_findseg(as, addr, 0);
+ if ((seg == NULL) || ((base = seg->s_base) > addr) ||
+ (addr + size) > base + seg->s_size) {
+ AS_LOCK_EXIT(as, &as->a_lock);
+ goto slow;
+ }
+
+ TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START,
+ "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize);
+
+ /*
+ * try to lock pages and pass back shadow list
+ */
+ res = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw);
+
+ TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end");
+ AS_LOCK_EXIT(as, &as->a_lock);
+ if (res == 0) {
+ return (0);
+ } else if (res == ENOTSUP || prefaulted) {
+ /*
+ * (1) segment driver doesn't support PAGELOCK fastpath, or
+ * (2) we've already tried fast path unsuccessfully after
+ * faulting in the addr range below; system might be
+ * thrashing or there may not be enough availrmem.
+ */
+ goto slow;
+ }
+
+ TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_FAULT_START,
+ "as_fault_start: addr %p size %ld", addr, size);
+
+ /*
+ * we might get here because of some COW fault or non
+ * existing page. Let as_fault deal with it. Just load
+ * the page, don't lock the MMU mapping.
+ */
+ fault_err = as_fault(as->a_hat, as, addr, size, F_INVAL, rw);
+ if (fault_err != 0) {
+ return (f_decode(fault_err));
+ }
+
+ prefaulted = 1;
+
+ /*
+ * try fast path again; since we've dropped a_lock,
+ * we need to try the dance from the start to see if
+ * the addr range is still valid.
+ */
+ goto top;
+slow:
+ /*
+ * load the page and lock the MMU mapping.
+ */
+ fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
+ if (fault_err != 0) {
+ return (f_decode(fault_err));
+ }
+ *ppp = NULL;
+
+ TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end");
+ return (0);
+}
+
+/*
+ * unlock pages in a given address range
+ */
+void
+as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
+ enum seg_rw rw)
+{
+ struct seg *seg;
+ size_t rsize;
+ caddr_t raddr;
+
+ TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START,
+ "as_pageunlock_start: addr %p size %ld", addr, size);
+
+ /*
+	 * if the shadow list is NULL, as_pagelock fell back
+	 * to as_fault
+ */
+ if (pp == NULL) {
+ (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
+ return;
+ }
+ raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
+ rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
+ (size_t)raddr;
+ AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
+ seg = as_findseg(as, addr, 0);
+ ASSERT(seg);
+ TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START,
+ "seg_unlock_start: raddr %p rsize %ld", raddr, rsize);
+ SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
+ AS_LOCK_EXIT(as, &as->a_lock);
+ TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end");
+}
+
+/*
+ * reclaim cached pages in a given address range
+ */
+void
+as_pagereclaim(struct as *as, struct page **pp, caddr_t addr,
+ size_t size, enum seg_rw rw)
+{
+ struct seg *seg;
+ size_t rsize;
+ caddr_t raddr;
+
+ ASSERT(AS_READ_HELD(as, &as->a_lock));
+ ASSERT(pp != NULL);
+
+ raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
+ rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
+ (size_t)raddr;
+ seg = as_findseg(as, addr, 0);
+ ASSERT(seg);
+ SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGERECLAIM, rw);
+}
+
+#define MAXPAGEFLIP 4
+#define	MAXPAGEFLIPSIZ	(MAXPAGEFLIP*PAGESIZE)
+
+int
+as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
+ boolean_t wait)
+{
+ struct seg *seg;
+ size_t ssize;
+ caddr_t raddr; /* rounded down addr */
+ size_t rsize; /* rounded up size */
+ int error = 0;
+ size_t pgsz = page_get_pagesize(szc);
+
+setpgsz_top:
+ if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) {
+ return (EINVAL);
+ }
+
+ raddr = addr;
+ rsize = size;
+
+ if (raddr + rsize < raddr) /* check for wraparound */
+ return (ENOMEM);
+
+ AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
+ as_clearwatchprot(as, raddr, rsize);
+ seg = as_segat(as, raddr);
+ if (seg == NULL) {
+ as_setwatch(as);
+ AS_LOCK_EXIT(as, &as->a_lock);
+ return (ENOMEM);
+ }
+
+ for (; rsize != 0; rsize -= ssize, raddr += ssize) {
+ if (raddr >= seg->s_base + seg->s_size) {
+ seg = AS_SEGNEXT(as, seg);
+ if (seg == NULL || raddr != seg->s_base) {
+ error = ENOMEM;
+ break;
+ }
+ }
+ if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
+ ssize = seg->s_base + seg->s_size - raddr;
+ } else {
+ ssize = rsize;
+ }
+
+ error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
+
+ if (error == IE_NOMEM) {
+ error = EAGAIN;
+ break;
+ }
+
+ if (error == IE_RETRY) {
+ AS_LOCK_EXIT(as, &as->a_lock);
+ goto setpgsz_top;
+ }
+
+ if (error == ENOTSUP) {
+ error = EINVAL;
+ break;
+ }
+
+ if (wait && (error == EAGAIN)) {
+ /*
+ * Memory is currently locked. It must be unlocked
+ * before this operation can succeed through a retry.
+ * The possible reasons for locked memory and
+ * corresponding strategies for unlocking are:
+ * (1) Normal I/O
+ * wait for a signal that the I/O operation
+ * has completed and the memory is unlocked.
+ * (2) Asynchronous I/O
+ * The aio subsystem does not unlock pages when
+ * the I/O is completed. Those pages are unlocked
+ * when the application calls aiowait/aioerror.
+ * So, to prevent blocking forever, cv_broadcast()
+ * is done to wake up aio_cleanup_thread.
+ * Subsequently, segvn_reclaim will be called, and
+ * that will do AS_CLRUNMAPWAIT() and wake us up.
+ * (3) Long term page locking:
+ * This is not relevant for as_setpagesize()
+ * because we cannot change the page size for
+ * driver memory. The attempt to do so will
+ * fail with a different error than EAGAIN so
+ * there's no need to trigger as callbacks like
+ * as_unmap, as_setprot or as_free would do.
+ */
+ mutex_enter(&as->a_contents);
+ if (AS_ISUNMAPWAIT(as) == 0) {
+ cv_broadcast(&as->a_cv);
+ }
+ AS_SETUNMAPWAIT(as);
+ AS_LOCK_EXIT(as, &as->a_lock);
+ while (AS_ISUNMAPWAIT(as)) {
+ cv_wait(&as->a_cv, &as->a_contents);
+ }
+ mutex_exit(&as->a_contents);
+ goto setpgsz_top;
+ } else if (error != 0) {
+ break;
+ }
+ }
+ as_setwatch(as);
+ AS_LOCK_EXIT(as, &as->a_lock);
+ return (error);
+}
+
+/*
+ * Setup all of the uninitialized watched pages that we can.
+ */
+void
+as_setwatch(struct as *as)
+{
+ struct watched_page *pwp;
+ struct seg *seg;
+ caddr_t vaddr;
+ uint_t prot;
+ int err, retrycnt;
+
+ if (avl_numnodes(&as->a_wpage) == 0)
+ return;
+
+ ASSERT(AS_WRITE_HELD(as, &as->a_lock));
+
+ for (pwp = avl_first(&as->a_wpage); pwp != NULL;
+ pwp = AVL_NEXT(&as->a_wpage, pwp)) {
+ retrycnt = 0;
+ retry:
+ vaddr = pwp->wp_vaddr;
+ if (pwp->wp_oprot != 0 || /* already set up */
+ (seg = as_segat(as, vaddr)) == NULL ||
+ SEGOP_GETPROT(seg, vaddr, 0, &prot) != 0)
+ continue;
+
+ pwp->wp_oprot = prot;
+ if (pwp->wp_read)
+ prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
+ if (pwp->wp_write)
+ prot &= ~PROT_WRITE;
+ if (pwp->wp_exec)
+ prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
+ if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
+ err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
+ if (err == IE_RETRY) {
+ pwp->wp_oprot = 0;
+ ASSERT(retrycnt == 0);
+ retrycnt++;
+ goto retry;
+ }
+ }
+ pwp->wp_prot = prot;
+ }
+}
+
+/*
+ * Clear all of the watched pages in the address space.
+ */
+void
+as_clearwatch(struct as *as)
+{
+ struct watched_page *pwp;
+ struct seg *seg;
+ caddr_t vaddr;
+ uint_t prot;
+ int err, retrycnt;
+
+ if (avl_numnodes(&as->a_wpage) == 0)
+ return;
+
+ ASSERT(AS_WRITE_HELD(as, &as->a_lock));
+
+ for (pwp = avl_first(&as->a_wpage); pwp != NULL;
+ pwp = AVL_NEXT(&as->a_wpage, pwp)) {
+ retrycnt = 0;
+ retry:
+ vaddr = pwp->wp_vaddr;
+ if (pwp->wp_oprot == 0 || /* not set up */
+ (seg = as_segat(as, vaddr)) == NULL)
+ continue;
+
+ if ((prot = pwp->wp_oprot) != pwp->wp_prot) {
+ err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
+ if (err == IE_RETRY) {
+ ASSERT(retrycnt == 0);
+ retrycnt++;
+ goto retry;
+ }
+ }
+ pwp->wp_oprot = 0;
+ pwp->wp_prot = 0;
+ }
+}
+
+/*
+ * Force a new setup for all the watched pages in the range.
+ */
+static void
+as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
+{
+ struct watched_page *pwp;
+ struct watched_page tpw;
+ caddr_t eaddr = addr + size;
+ caddr_t vaddr;
+ struct seg *seg;
+ int err, retrycnt;
+ uint_t wprot;
+ avl_index_t where;
+
+ if (avl_numnodes(&as->a_wpage) == 0)
+ return;
+
+ ASSERT(AS_WRITE_HELD(as, &as->a_lock));
+
+ tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
+ if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
+ pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
+
+ while (pwp != NULL && pwp->wp_vaddr < eaddr) {
+ retrycnt = 0;
+ vaddr = pwp->wp_vaddr;
+
+ wprot = prot;
+ if (pwp->wp_read)
+ wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
+ if (pwp->wp_write)
+ wprot &= ~PROT_WRITE;
+ if (pwp->wp_exec)
+ wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
+ if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) {
+ retry:
+ seg = as_segat(as, vaddr);
+ if (seg == NULL) {
+ panic("as_setwatchprot: no seg");
+ /*NOTREACHED*/
+ }
+ err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, wprot);
+ if (err == IE_RETRY) {
+ ASSERT(retrycnt == 0);
+ retrycnt++;
+ goto retry;
+ }
+ }
+ pwp->wp_oprot = prot;
+ pwp->wp_prot = wprot;
+
+ pwp = AVL_NEXT(&as->a_wpage, pwp);
+ }
+}
+
+/*
+ * Clear all of the watched pages in the range.
+ */
+static void
+as_clearwatchprot(struct as *as, caddr_t addr, size_t size)
+{
+ caddr_t eaddr = addr + size;
+ struct watched_page *pwp;
+ struct watched_page tpw;
+ uint_t prot;
+ struct seg *seg;
+ int err, retrycnt;
+ avl_index_t where;
+
+ if (avl_numnodes(&as->a_wpage) == 0)
+ return;
+
+ tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
+ if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
+ pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
+
+ ASSERT(AS_WRITE_HELD(as, &as->a_lock));
+
+ while (pwp != NULL && pwp->wp_vaddr < eaddr) {
+ ASSERT(addr >= pwp->wp_vaddr);
+
+ if ((prot = pwp->wp_oprot) != 0) {
+ retrycnt = 0;
+
+ if (prot != pwp->wp_prot) {
+ retry:
+ seg = as_segat(as, pwp->wp_vaddr);
+ if (seg == NULL)
+ continue;
+ err = SEGOP_SETPROT(seg, pwp->wp_vaddr,
+ PAGESIZE, prot);
+ if (err == IE_RETRY) {
+ ASSERT(retrycnt == 0);
+ retrycnt++;
+ goto retry;
+
+ }
+ }
+ pwp->wp_oprot = 0;
+ pwp->wp_prot = 0;
+ }
+
+ pwp = AVL_NEXT(&as->a_wpage, pwp);
+ }
+}
+
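+/*
+ * Post the given signal information to every process whose address
+ * space is "as". The p_as check is repeated under p_lock to close the
+ * race with a process switching address spaces.
+ */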
+void
+as_signal_proc(struct as *as, k_siginfo_t *siginfo)
+{
+ struct proc *p;
+
+ mutex_enter(&pidlock);
+ for (p = practive; p; p = p->p_next) {
+ if (p->p_as == as) {
+ mutex_enter(&p->p_lock);
+ if (p->p_as == as)
+ sigaddq(p, NULL, siginfo, KM_NOSLEEP);
+ mutex_exit(&p->p_lock);
+ }
+ }
+ mutex_exit(&pidlock);
+}
+
+/*
+ * return memory object ID
+ */
+int
+as_getmemid(struct as *as, caddr_t addr, memid_t *memidp)
+{
+ struct seg *seg;
+ int sts;
+
+ AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
+ seg = as_segat(as, addr);
+ if (seg == NULL) {
+ AS_LOCK_EXIT(as, &as->a_lock);
+ return (EFAULT);
+ }
+ /*
+ * catch old drivers which may not support getmemid
+ */
+ if (seg->s_ops->getmemid == NULL) {
+ AS_LOCK_EXIT(as, &as->a_lock);
+ return (ENODEV);
+ }
+
+ sts = SEGOP_GETMEMID(seg, addr, memidp);
+
+ AS_LOCK_EXIT(as, &as->a_lock);
+ return (sts);
+}
diff --git a/usr/src/uts/common/vm/vm_page.c b/usr/src/uts/common/vm/vm_page.c
new file mode 100644
index 0000000000..67b4e58f0f
--- /dev/null
+++ b/usr/src/uts/common/vm/vm_page.c
@@ -0,0 +1,6708 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * VM - physical page management.
+ */
+
+#include <sys/types.h>
+#include <sys/t_lock.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/time.h>
+#include <sys/vnode.h>
+#include <sys/vm.h>
+#include <sys/vtrace.h>
+#include <sys/swap.h>
+#include <sys/cmn_err.h>
+#include <sys/tuneable.h>
+#include <sys/sysmacros.h>
+#include <sys/cpuvar.h>
+#include <sys/callb.h>
+#include <sys/debug.h>
+#include <sys/tnf_probe.h>
+#include <sys/condvar_impl.h>
+#include <sys/mem_config.h>
+#include <sys/mem_cage.h>
+#include <sys/kmem.h>
+#include <sys/atomic.h>
+#include <sys/strlog.h>
+#include <sys/mman.h>
+#include <sys/ontrap.h>
+#include <sys/lgrp.h>
+#include <sys/vfs.h>
+
+#include <vm/hat.h>
+#include <vm/anon.h>
+#include <vm/page.h>
+#include <vm/seg.h>
+#include <vm/pvn.h>
+#include <vm/seg_kmem.h>
+#include <vm/vm_dep.h>
+
+#include <fs/fs_subr.h>
+
+static int nopageage = 0;
+
+static pgcnt_t max_page_get; /* max page_get request size in pages */
+pgcnt_t total_pages = 0; /* total number of pages (used by /proc) */
+
+/*
+ * vnode for all pages which are retired from the VM system,
+ * such as pages with Uncorrectable Errors.
+ */
+struct vnode retired_ppages;
+
+static void page_retired_init(void);
+static void retired_dispose(vnode_t *vp, page_t *pp, int flag,
+ int dn, cred_t *cr);
+static void retired_inactive(vnode_t *vp, cred_t *cr);
+static void page_retired(page_t *pp);
+static void retired_page_removed(page_t *pp);
+void page_unretire_pages(void);
+
+/*
+ * The maximum number of pages that will be unretired in one iteration.
+ * This number is totally arbitrary.
+ */
+#define UNRETIRE_PAGES 256
+
+/*
+ * We limit the number of pages that may be retired to
+ * a percentage of the total physical memory. Note that
+ * the percentage values are stored as 'basis points',
+ * ie, 100 basis points is 1%.
+ */
+#define MAX_PAGES_RETIRED_BPS_DEFAULT 10 /* .1% */
+
+uint64_t max_pages_retired_bps = MAX_PAGES_RETIRED_BPS_DEFAULT;
+
+static int pages_retired_limit_exceeded(void);
+
+/*
+ * operations vector for vnode with retired pages. Only VOP_DISPOSE
+ * and VOP_INACTIVE are intercepted.
+ */
+struct vnodeops retired_vnodeops = {
+ "retired_vnodeops",
+ fs_nosys, /* open */
+ fs_nosys, /* close */
+ fs_nosys, /* read */
+ fs_nosys, /* write */
+ fs_nosys, /* ioctl */
+ fs_nosys, /* setfl */
+ fs_nosys, /* getattr */
+ fs_nosys, /* setattr */
+ fs_nosys, /* access */
+ fs_nosys, /* lookup */
+ fs_nosys, /* create */
+ fs_nosys, /* remove */
+ fs_nosys, /* link */
+ fs_nosys, /* rename */
+ fs_nosys, /* mkdir */
+ fs_nosys, /* rmdir */
+ fs_nosys, /* readdir */
+ fs_nosys, /* symlink */
+ fs_nosys, /* readlink */
+ fs_nosys, /* fsync */
+ retired_inactive,
+ fs_nosys, /* fid */
+ fs_rwlock, /* rwlock */
+ fs_rwunlock, /* rwunlock */
+ fs_nosys, /* seek */
+ fs_nosys, /* cmp */
+ fs_nosys, /* frlock */
+ fs_nosys, /* space */
+ fs_nosys, /* realvp */
+ fs_nosys, /* getpage */
+ fs_nosys, /* putpage */
+ fs_nosys_map,
+ fs_nosys_addmap,
+ fs_nosys, /* delmap */
+ fs_nosys_poll,
+ fs_nosys, /* dump */
+ fs_nosys, /* l_pathconf */
+ fs_nosys, /* pageio */
+ fs_nosys, /* dumpctl */
+ retired_dispose,
+ fs_nosys, /* setsecattr */
+	fs_nosys,		/* getsecattr */
+ fs_nosys, /* shrlock */
+ fs_vnevent_nosupport /* vnevent */
+};
+
+/*
+ * freemem_lock protects all freemem variables:
+ * availrmem. Also this lock protects the globals which track the
+ * availrmem changes for accurate kernel footprint calculation.
+ * See below for an explanation of these
+ * globals.
+ */
+kmutex_t freemem_lock;
+pgcnt_t availrmem;
+pgcnt_t availrmem_initial;
+
+/*
+ * These globals track availrmem changes to get a more accurate
+ * estimate of the kernel size. Historically pp_kernel is used for
+ * kernel size and is based on availrmem. But availrmem is adjusted for
+ * locked pages in the system not just for kernel locked pages.
+ * These new counters will track the pages locked through segvn and
+ * by explicit user locking.
+ *
+ * segvn_pages_locked : This keeps track on a global basis of how many pages
+ * are currently locked because of I/O.
+ *
+ * pages_locked : How many pages are locked because of user-specified
+ * locking through mlock or plock.
+ *
+ * pages_useclaim, pages_claimed : These two variables track the
+ * claim adjustments because of the protection changes on a segvn segment.
+ *
+ * All these globals are protected by the same lock which protects availrmem.
+ */
+pgcnt_t segvn_pages_locked;
+pgcnt_t pages_locked;
+pgcnt_t pages_useclaim;
+pgcnt_t pages_claimed;
+
+
+/*
+ * new_freemem_lock protects freemem, freemem_wait & freemem_cv.
+ */
+static kmutex_t new_freemem_lock;
+static uint_t freemem_wait; /* someone waiting for freemem */
+static kcondvar_t freemem_cv;
+
+/*
+ * The logical page free list is maintained as two lists, the 'free'
+ * and the 'cache' lists.
+ * The free list contains those pages that should be reused first.
+ *
+ * The implementation of the lists is machine dependent.
+ * page_get_freelist(), page_get_cachelist(),
+ * page_list_sub(), and page_list_add()
+ * form the interface to the machine dependent implementation.
+ *
+ * Pages with p_free set are on the cache list.
+ * Pages with p_free and p_age set are on the free list.
+ *
+ * A page may be locked while on either list.
+ */
+
+/*
+ * free list accounting stuff.
+ *
+ *
+ * Spread out the value for the number of pages on the
+ * page free and page cache lists. If there is just one
+ * value, then it must be under just one lock.
+ * The lock contention and cache traffic are a real bother.
+ *
+ * When we acquire and then drop a single pcf lock
+ * we can start in the middle of the array of pcf structures.
+ * If we acquire more than one pcf lock at a time, we need to
+ * start at the front to avoid deadlocking.
+ *
+ * pcf_count holds the number of pages in each pool.
+ *
+ * pcf_block is set when page_create_get_something() has asked the
+ * PSM page freelist and page cachelist routines without specifying
+ * a color and nothing came back. This is used to block anything
+ * else from moving pages from one list to the other while the
+ * lists are searched again. If a page is freed while pcf_block is
+ * set, then pcf_reserve is incremented. pcgs_unblock() takes care
+ * of clearing pcf_block, doing the wakeups, etc.
+ */
+
+#if NCPU <= 4
+#define PAD 1
+#define PCF_FANOUT 4
+static uint_t pcf_mask = PCF_FANOUT - 1;
+#else
+#define PAD 9
+#ifdef sun4v
+#define PCF_FANOUT 32
+#else
+#define PCF_FANOUT 128
+#endif
+static uint_t pcf_mask = PCF_FANOUT - 1;
+#endif
+
+struct pcf {
+ uint_t pcf_touch; /* just to help the cache */
+ uint_t pcf_count; /* page count */
+ kmutex_t pcf_lock; /* protects the structure */
+ uint_t pcf_wait; /* number of waiters */
+ uint_t pcf_block; /* pcgs flag to page_free() */
+ uint_t pcf_reserve; /* pages freed after pcf_block set */
+ uint_t pcf_fill[PAD]; /* to line up on the caches */
+};
+
+static struct pcf pcf[PCF_FANOUT];
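+
+/*
+ * PCF_INDEX() hashes the current CPU id into one of the PCF_FANOUT pcf
+ * buckets so that different CPUs tend to touch different counters and
+ * locks, reducing contention on free-page accounting.
+ */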
+#define PCF_INDEX() ((CPU->cpu_id) & (pcf_mask))
+
+kmutex_t pcgs_lock; /* serializes page_create_get_ */
+kmutex_t pcgs_cagelock; /* serializes NOSLEEP cage allocs */
+kmutex_t pcgs_wait_lock; /* used for delay in pcgs */
+static kcondvar_t pcgs_cv; /* cv for delay in pcgs */
+
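+/*
+ * The largest count that fits in the p_lckcnt field of a page_t
+ * (all bits of the field set).
+ */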
+#define PAGE_LOCK_MAXIMUM \
+ ((1 << (sizeof (((page_t *)0)->p_lckcnt) * NBBY)) - 1)
+
+/*
+ * Control over the verbosity of page retirement. When set to zero, no messages
+ * will be printed. A value of one will trigger messages for retirement
+ * operations, and is intended for processors which don't yet support FMA
+ * (spitfire). Two will cause verbose messages to be printed when retirements
+ * complete, and is intended only for debugging purposes.
+ */
+int page_retire_messages = 0;
+
+#ifdef VM_STATS
+
+/*
+ * No locks, but so what, they are only statistics.
+ */
+
+static struct page_tcnt {
+ int pc_free_cache; /* free's into cache list */
+ int pc_free_dontneed; /* free's with dontneed */
+ int pc_free_pageout; /* free's from pageout */
+ int pc_free_free; /* free's into free list */
+ int pc_free_pages; /* free's into large page free list */
+ int pc_destroy_pages; /* large page destroy's */
+ int pc_get_cache; /* get's from cache list */
+ int pc_get_free; /* get's from free list */
+ int pc_reclaim; /* reclaim's */
+ int pc_abortfree; /* abort's of free pages */
+ int pc_find_hit; /* find's that find page */
+ int pc_find_miss; /* find's that don't find page */
+ int pc_destroy_free; /* # of free pages destroyed */
+#define PC_HASH_CNT (4*PAGE_HASHAVELEN)
+ int pc_find_hashlen[PC_HASH_CNT+1];
+ int pc_addclaim_pages;
+ int pc_subclaim_pages;
+ int pc_free_replacement_page[2];
+ int pc_try_demote_pages[6];
+ int pc_demote_pages[2];
+} pagecnt;
+
+uint_t hashin_count;
+uint_t hashin_not_held;
+uint_t hashin_already;
+
+uint_t hashout_count;
+uint_t hashout_not_held;
+
+uint_t page_create_count;
+uint_t page_create_not_enough;
+uint_t page_create_not_enough_again;
+uint_t page_create_zero;
+uint_t page_create_hashout;
+uint_t page_create_page_lock_failed;
+uint_t page_create_trylock_failed;
+uint_t page_create_found_one;
+uint_t page_create_hashin_failed;
+uint_t page_create_dropped_phm;
+
+uint_t page_create_new;
+uint_t page_create_exists;
+uint_t page_create_putbacks;
+uint_t page_create_overshoot;
+
+uint_t page_reclaim_zero;
+uint_t page_reclaim_zero_locked;
+
+uint_t page_rename_exists;
+uint_t page_rename_count;
+
+uint_t page_lookup_cnt[20];
+uint_t page_lookup_nowait_cnt[10];
+uint_t page_find_cnt;
+uint_t page_exists_cnt;
+uint_t page_exists_forreal_cnt;
+uint_t page_lookup_dev_cnt;
+uint_t get_cachelist_cnt;
+uint_t page_create_cnt[10];
+uint_t alloc_pages[8];
+uint_t page_exphcontg[19];
+uint_t page_create_large_cnt[10];
+
+/*
+ * Search the hash chain at page_hash[index] for a page matching
+ * [vp, off], leaving the result (or NULL) in pp and collecting
+ * hit/miss and hash-chain-length statistics.
+ */
+#define PAGE_HASH_SEARCH(index, pp, vp, off) { \
+ uint_t mylen = 0; \
+ \
+ for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash, mylen++) { \
+ if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
+ break; \
+ } \
+ if ((pp) != NULL) \
+ pagecnt.pc_find_hit++; \
+ else \
+ pagecnt.pc_find_miss++; \
+ if (mylen > PC_HASH_CNT) \
+ mylen = PC_HASH_CNT; \
+ pagecnt.pc_find_hashlen[mylen]++; \
+}
+
+#else /* VM_STATS */
+
+/*
+ * Don't collect statistics
+ */
+#define PAGE_HASH_SEARCH(index, pp, vp, off) { \
+ for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
+ if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
+ break; \
+ } \
+}
+
+#endif /* VM_STATS */
+
+
+
+#ifdef DEBUG
+#define MEMSEG_SEARCH_STATS
+#endif
+
+#ifdef MEMSEG_SEARCH_STATS
+struct memseg_stats {
+ uint_t nsearch;
+ uint_t nlastwon;
+ uint_t nhashwon;
+ uint_t nnotfound;
+} memseg_stats;
+
+#define MEMSEG_STAT_INCR(v) \
+ atomic_add_32(&memseg_stats.v, 1)
+#else
+#define MEMSEG_STAT_INCR(x)
+#endif
+
+struct memseg *memsegs; /* list of memory segments */
+
+
+static void page_init_mem_config(void);
+static int page_do_hashin(page_t *, vnode_t *, u_offset_t);
+static void page_do_hashout(page_t *);
+
+static void page_demote_vp_pages(page_t *);
+
+/*
+ * vm subsystem related initialization
+ */
+void
+vm_init(void)
+{
+ boolean_t callb_vm_cpr(void *, int);
+
+ (void) callb_add(callb_vm_cpr, 0, CB_CL_CPR_VM, "vm");
+ page_init_mem_config();
+
+ /*
+ * initialise the vnode for retired pages
+ */
+ page_retired_init();
+}
+
+/*
+ * This function is called at startup and when memory is added or deleted.
+ */
+void
+init_pages_pp_maximum()
+{
+ static pgcnt_t p_min;
+ static pgcnt_t pages_pp_maximum_startup;
+ static pgcnt_t avrmem_delta;
+ static int init_done;
+ static int user_set; /* true if set in /etc/system */
+
+ if (init_done == 0) {
+
+ /* If the user specified a value, save it */
+ if (pages_pp_maximum != 0) {
+ user_set = 1;
+ pages_pp_maximum_startup = pages_pp_maximum;
+ }
+
+ /*
+		 * The first-time setting of pages_pp_maximum is based
+ * on the value of availrmem just after the start-up
+ * allocations. To preserve this relationship at run
+ * time, use a delta from availrmem_initial.
+ */
+ ASSERT(availrmem_initial >= availrmem);
+ avrmem_delta = availrmem_initial - availrmem;
+
+ /* The allowable floor of pages_pp_maximum */
+ p_min = tune.t_minarmem + 100;
+
+ /* Make sure we don't come through here again. */
+ init_done = 1;
+ }
+ /*
+ * Determine pages_pp_maximum, the number of currently available
+ * pages (availrmem) that can't be `locked'. If not set by
+ * the user, we set it to 4% of the currently available memory
+ * plus 4MB.
+ * But we also insist that it be greater than tune.t_minarmem;
+ * otherwise a process could lock down a lot of memory, get swapped
+ * out, and never have enough to get swapped back in.
+ */
+ if (user_set)
+ pages_pp_maximum = pages_pp_maximum_startup;
+ else
+ pages_pp_maximum = ((availrmem_initial - avrmem_delta) / 25)
+ + btop(4 * 1024 * 1024);
+
+ if (pages_pp_maximum <= p_min) {
+ pages_pp_maximum = p_min;
+ }
+}
+
+void
+set_max_page_get(pgcnt_t target_total_pages)
+{
+ max_page_get = target_total_pages / 2;
+}
+
+static pgcnt_t pending_delete;
+
+/*ARGSUSED*/
+static void
+page_mem_config_post_add(
+ void *arg,
+ pgcnt_t delta_pages)
+{
+ set_max_page_get(total_pages - pending_delete);
+ init_pages_pp_maximum();
+}
+
+/*ARGSUSED*/
+static int
+page_mem_config_pre_del(
+ void *arg,
+ pgcnt_t delta_pages)
+{
+ pgcnt_t nv;
+
+ nv = atomic_add_long_nv(&pending_delete, (spgcnt_t)delta_pages);
+ set_max_page_get(total_pages - nv);
+ return (0);
+}
+
+/*ARGSUSED*/
+static void
+page_mem_config_post_del(
+ void *arg,
+ pgcnt_t delta_pages,
+ int cancelled)
+{
+ pgcnt_t nv;
+
+ nv = atomic_add_long_nv(&pending_delete, -(spgcnt_t)delta_pages);
+ set_max_page_get(total_pages - nv);
+ if (!cancelled)
+ init_pages_pp_maximum();
+}
+
+static kphysm_setup_vector_t page_mem_config_vec = {
+ KPHYSM_SETUP_VECTOR_VERSION,
+ page_mem_config_post_add,
+ page_mem_config_pre_del,
+ page_mem_config_post_del,
+};
+
+static void
+page_init_mem_config(void)
+{
+ int ret;
+
+ ret = kphysm_setup_func_register(&page_mem_config_vec, (void *)NULL);
+ ASSERT(ret == 0);
+}
+
+/*
+ * Evenly spread out the PCF counters for large free pages
+ */
+static void
+page_free_large_ctr(pgcnt_t npages)
+{
+ static struct pcf *p = pcf;
+ pgcnt_t lump;
+
+ freemem += npages;
+
+ lump = roundup(npages, PCF_FANOUT) / PCF_FANOUT;
+
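+	/*
+	 * lump is the share credited to each pcf bucket: npages divided
+	 * evenly across PCF_FANOUT, rounded up. Walk the pcf array,
+	 * crediting each bucket in turn until all npages are accounted
+	 * for, wrapping around if necessary.
+	 */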
+ while (npages > 0) {
+
+ ASSERT(!p->pcf_block);
+
+ if (lump < npages) {
+ p->pcf_count += (uint_t)lump;
+ npages -= lump;
+ } else {
+ p->pcf_count += (uint_t)npages;
+ npages = 0;
+ }
+
+ ASSERT(!p->pcf_wait);
+
+ if (++p > &pcf[PCF_FANOUT - 1])
+ p = pcf;
+ }
+
+ ASSERT(npages == 0);
+}
+
+/*
+ * Add a physical chunk of memory to the system free lists during startup.
+ * Platform specific startup() allocates the memory for the page structs.
+ *
+ * num - number of page structures
+ * base - page number (pfn) to be associated with the first page.
+ *
+ * Since we are doing this during startup (ie. single threaded), we will
+ * use shortcut routines to avoid any locking overhead while putting all
+ * these pages on the freelists.
+ *
+ * NOTE: Any changes performed to page_free() must also be performed to
+ * add_physmem() since this is how we initialize all page_t's at
+ * boot time.
+ */
+void
+add_physmem(
+ page_t *pp,
+ pgcnt_t num,
+ pfn_t pnum)
+{
+ page_t *root = NULL;
+	uint_t	szc = page_num_pagesizes() - 1;	/* largest page size code */
+	pgcnt_t	large = page_get_pagecnt(szc);	/* base pages per large page */
+ pgcnt_t cnt = 0;
+
+ TRACE_2(TR_FAC_VM, TR_PAGE_INIT,
+ "add_physmem:pp %p num %lu", pp, num);
+
+ /*
+ * Arbitrarily limit the max page_get request
+ * to 1/2 of the page structs we have.
+ */
+ total_pages += num;
+ set_max_page_get(total_pages);
+
+ /*
+ * The physical space for the pages array
+ * representing ram pages has already been
+ * allocated. Here we initialize each lock
+ * in the page structure, and put each on
+ * the free list
+ */
+ for (; num; pp = page_next_raw(pp), pnum++, num--) {
+
+ /*
+ * this needs to fill in the page number
+ * and do any other arch specific initialization
+ */
+ add_physmem_cb(pp, pnum);
+
+ /*
+ * Initialize the page lock as unlocked, since nobody
+ * can see or access this page yet.
+ */
+ pp->p_selock = 0;
+
+ /*
+ * Initialize IO lock
+ */
+ page_iolock_init(pp);
+
+ /*
+ * initialize other fields in the page_t
+ */
+ PP_SETFREE(pp);
+ page_clr_all_props(pp);
+ PP_SETAGED(pp);
+ pp->p_offset = (u_offset_t)-1;
+ pp->p_next = pp;
+ pp->p_prev = pp;
+
+ /*
+ * Simple case: System doesn't support large pages.
+ */
+ if (szc == 0) {
+ pp->p_szc = 0;
+ page_free_at_startup(pp);
+ continue;
+ }
+
+ /*
+ * Handle unaligned pages, we collect them up onto
+ * the root page until we have a full large page.
+ */
+ if (!IS_P2ALIGNED(pnum, large)) {
+
+ /*
+ * If not in a large page,
+ * just free as small page.
+ */
+ if (root == NULL) {
+ pp->p_szc = 0;
+ page_free_at_startup(pp);
+ continue;
+ }
+
+ /*
+ * Link a constituent page into the large page.
+ */
+ pp->p_szc = szc;
+ page_list_concat(&root, &pp);
+
+ /*
+ * When large page is fully formed, free it.
+ */
+ if (++cnt == large) {
+ page_free_large_ctr(cnt);
+ page_list_add_pages(root, PG_LIST_ISINIT);
+ root = NULL;
+ cnt = 0;
+ }
+ continue;
+ }
+
+ /*
+ * At this point we have a page number which
+ * is aligned. We assert that we aren't already
+ * in a different large page.
+ */
+ ASSERT(IS_P2ALIGNED(pnum, large));
+ ASSERT(root == NULL && cnt == 0);
+
+ /*
+ * If insufficient number of pages left to form
+ * a large page, just free the small page.
+ */
+ if (num < large) {
+ pp->p_szc = 0;
+ page_free_at_startup(pp);
+ continue;
+ }
+
+ /*
+ * Otherwise start a new large page.
+ */
+ pp->p_szc = szc;
+ cnt++;
+ root = pp;
+ }
+ ASSERT(root == NULL && cnt == 0);
+}
+
+/*
+ * Find a page representing the specified [vp, offset].
+ * If we find the page but it is in transit coming in,
+ * it will have an "exclusive" lock and we wait for
+ * the i/o to complete. A page found on the free list
+ * is always reclaimed and then locked. On success, the page
+ * is locked, its data is valid and it isn't on the free
+ * list, while a NULL is returned if the page doesn't exist.
+ */
+page_t *
+page_lookup(vnode_t *vp, u_offset_t off, se_t se)
+{
+ return (page_lookup_create(vp, off, se, NULL, NULL, 0));
+}
+
+/*
+ * Find a page representing the specified [vp, offset].
+ * We either return the one we found or, if passed in,
+ * create one with identity of [vp, offset] of the
+ * pre-allocated page. If we find an existing page but it is
+ * in transit coming in, it will have an "exclusive" lock
+ * and we wait for the i/o to complete. A page found on
+ * the free list is always reclaimed and then locked.
+ * On success, the page is locked, its data is valid and
+ * it isn't on the free list, while a NULL is returned
+ * if the page doesn't exist and newpp is NULL.
+ */
+page_t *
+page_lookup_create(
+ vnode_t *vp,
+ u_offset_t off,
+ se_t se,
+ page_t *newpp,
+ spgcnt_t *nrelocp,
+ int flags)
+{
+ page_t *pp;
+ kmutex_t *phm;
+ ulong_t index;
+ uint_t hash_locked;
+ uint_t es;
+
+ ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
+ VM_STAT_ADD(page_lookup_cnt[0]);
+ ASSERT(newpp ? PAGE_EXCL(newpp) : 1);
+
+ /*
+ * Acquire the appropriate page hash lock since
+ * we have to search the hash list. Pages that
+ * hash to this list can't change identity while
+ * this lock is held.
+ */
+ hash_locked = 0;
+ index = PAGE_HASH_FUNC(vp, off);
+ phm = NULL;
+top:
+ PAGE_HASH_SEARCH(index, pp, vp, off);
+ if (pp != NULL) {
+ VM_STAT_ADD(page_lookup_cnt[1]);
+ es = (newpp != NULL) ? 1 : 0;
+ es |= flags;
+ if (!hash_locked) {
+ VM_STAT_ADD(page_lookup_cnt[2]);
+ if (!page_try_reclaim_lock(pp, se, es)) {
+ /*
+ * On a miss, acquire the phm. Then
+ * next time, page_lock() will be called,
+ * causing a wait if the page is busy.
+ * just looping with page_trylock() would
+ * get pretty boring.
+ */
+ VM_STAT_ADD(page_lookup_cnt[3]);
+ phm = PAGE_HASH_MUTEX(index);
+ mutex_enter(phm);
+ hash_locked = 1;
+ goto top;
+ }
+ } else {
+ VM_STAT_ADD(page_lookup_cnt[4]);
+ if (!page_lock_es(pp, se, phm, P_RECLAIM, es)) {
+ VM_STAT_ADD(page_lookup_cnt[5]);
+ goto top;
+ }
+ }
+
+ /*
+ * Since `pp' is locked it can not change identity now.
+ * Reconfirm we locked the correct page.
+ *
+ * Both the p_vnode and p_offset *must* be cast volatile
+ * to force a reload of their values: The PAGE_HASH_SEARCH
+ * macro will have stuffed p_vnode and p_offset into
+ * registers before calling page_trylock(); another thread,
+ * actually holding the hash lock, could have changed the
+ * page's identity in memory, but our registers would not
+ * be changed, fooling the reconfirmation. If the hash
+ * lock was held during the search, the casting would
+ * not be needed.
+ */
+ VM_STAT_ADD(page_lookup_cnt[6]);
+ if (((volatile struct vnode *)(pp->p_vnode) != vp) ||
+ ((volatile u_offset_t)(pp->p_offset) != off)) {
+ VM_STAT_ADD(page_lookup_cnt[7]);
+ if (hash_locked) {
+ panic("page_lookup_create: lost page %p",
+ (void *)pp);
+ /*NOTREACHED*/
+ }
+ page_unlock(pp);
+ phm = PAGE_HASH_MUTEX(index);
+ mutex_enter(phm);
+ hash_locked = 1;
+ goto top;
+ }
+
+ /*
+ * If page_trylock() was called, then pp may still be on
+ * the cachelist (can't be on the free list, it would not
+ * have been found in the search). If it is on the
+ * cachelist it must be pulled now. To pull the page from
+ * the cachelist, it must be exclusively locked.
+ *
+ * The other big difference between page_trylock() and
+ * page_lock(), is that page_lock() will pull the
+ * page from whatever free list (the cache list in this
+ * case) the page is on. If page_trylock() was used
+ * above, then we have to do the reclaim ourselves.
+ */
+ if ((!hash_locked) && (PP_ISFREE(pp))) {
+ ASSERT(PP_ISAGED(pp) == 0);
+ VM_STAT_ADD(page_lookup_cnt[8]);
+
+ /*
+			 * page_reclaim will ensure that we
+			 * have this page exclusively
+ */
+
+ if (!page_reclaim(pp, NULL)) {
+ /*
+ * Page_reclaim dropped whatever lock
+ * we held.
+ */
+ VM_STAT_ADD(page_lookup_cnt[9]);
+ phm = PAGE_HASH_MUTEX(index);
+ mutex_enter(phm);
+ hash_locked = 1;
+ goto top;
+ } else if (se == SE_SHARED && newpp == NULL) {
+ VM_STAT_ADD(page_lookup_cnt[10]);
+ page_downgrade(pp);
+ }
+ }
+
+ if (hash_locked) {
+ mutex_exit(phm);
+ }
+
+ if (newpp != NULL && pp->p_szc < newpp->p_szc &&
+ PAGE_EXCL(pp) && nrelocp != NULL) {
+ ASSERT(nrelocp != NULL);
+ (void) page_relocate(&pp, &newpp, 1, 1, nrelocp,
+ NULL);
+ if (*nrelocp > 0) {
+ VM_STAT_COND_ADD(*nrelocp == 1,
+ page_lookup_cnt[11]);
+ VM_STAT_COND_ADD(*nrelocp > 1,
+ page_lookup_cnt[12]);
+ pp = newpp;
+ se = SE_EXCL;
+ } else {
+ if (se == SE_SHARED) {
+ page_downgrade(pp);
+ }
+ VM_STAT_ADD(page_lookup_cnt[13]);
+ }
+ } else if (newpp != NULL && nrelocp != NULL) {
+ if (PAGE_EXCL(pp) && se == SE_SHARED) {
+ page_downgrade(pp);
+ }
+ VM_STAT_COND_ADD(pp->p_szc < newpp->p_szc,
+ page_lookup_cnt[14]);
+ VM_STAT_COND_ADD(pp->p_szc == newpp->p_szc,
+ page_lookup_cnt[15]);
+ VM_STAT_COND_ADD(pp->p_szc > newpp->p_szc,
+ page_lookup_cnt[16]);
+ } else if (newpp != NULL && PAGE_EXCL(pp)) {
+ se = SE_EXCL;
+ }
+ } else if (!hash_locked) {
+ VM_STAT_ADD(page_lookup_cnt[17]);
+ phm = PAGE_HASH_MUTEX(index);
+ mutex_enter(phm);
+ hash_locked = 1;
+ goto top;
+ } else if (newpp != NULL) {
+ /*
+ * If we have a preallocated page then
+ * insert it now and basically behave like
+ * page_create.
+ */
+ VM_STAT_ADD(page_lookup_cnt[18]);
+ /*
+ * Since we hold the page hash mutex and
+ * just searched for this page, page_hashin
+ * had better not fail. If it does, that
+ * means some thread did not follow the
+ * page hash mutex rules. Panic now and
+ * get it over with. As usual, go down
+ * holding all the locks.
+ */
+ ASSERT(MUTEX_HELD(phm));
+ if (!page_hashin(newpp, vp, off, phm)) {
+ ASSERT(MUTEX_HELD(phm));
+ panic("page_lookup_create: hashin failed %p %p %llx %p",
+ (void *)newpp, (void *)vp, off, (void *)phm);
+ /*NOTREACHED*/
+ }
+ ASSERT(MUTEX_HELD(phm));
+ mutex_exit(phm);
+ phm = NULL;
+ page_set_props(newpp, P_REF);
+ page_io_lock(newpp);
+ pp = newpp;
+ se = SE_EXCL;
+ } else {
+ VM_STAT_ADD(page_lookup_cnt[19]);
+ mutex_exit(phm);
+ }
+
+ ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1);
+
+ ASSERT(pp ? ((PP_ISFREE(pp) == 0) && (PP_ISAGED(pp) == 0)) : 1);
+
+ return (pp);
+}
+
+/*
+ * Search the hash list for the page representing the
+ * specified [vp, offset] and return it locked. Skip
+ * free pages and pages that cannot be locked as requested.
+ * Used while attempting to kluster pages.
+ */
+page_t *
+page_lookup_nowait(vnode_t *vp, u_offset_t off, se_t se)
+{
+ page_t *pp;
+ kmutex_t *phm;
+ ulong_t index;
+ uint_t locked;
+
+ ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
+ VM_STAT_ADD(page_lookup_nowait_cnt[0]);
+
+ index = PAGE_HASH_FUNC(vp, off);
+ PAGE_HASH_SEARCH(index, pp, vp, off);
+ locked = 0;
+ if (pp == NULL) {
+top:
+ VM_STAT_ADD(page_lookup_nowait_cnt[1]);
+ locked = 1;
+ phm = PAGE_HASH_MUTEX(index);
+ mutex_enter(phm);
+ PAGE_HASH_SEARCH(index, pp, vp, off);
+ }
+
+ if (pp == NULL || PP_ISFREE(pp)) {
+ VM_STAT_ADD(page_lookup_nowait_cnt[2]);
+ pp = NULL;
+ } else {
+ if (!page_trylock(pp, se)) {
+ VM_STAT_ADD(page_lookup_nowait_cnt[3]);
+ pp = NULL;
+ } else {
+ VM_STAT_ADD(page_lookup_nowait_cnt[4]);
+ /*
+ * See the comment in page_lookup()
+ */
+ if (((volatile struct vnode *)(pp->p_vnode) != vp) ||
+ ((u_offset_t)(pp->p_offset) != off)) {
+ VM_STAT_ADD(page_lookup_nowait_cnt[5]);
+ if (locked) {
+ panic("page_lookup_nowait %p",
+ (void *)pp);
+ /*NOTREACHED*/
+ }
+ page_unlock(pp);
+ goto top;
+ }
+ if (PP_ISFREE(pp)) {
+ VM_STAT_ADD(page_lookup_nowait_cnt[6]);
+ page_unlock(pp);
+ pp = NULL;
+ }
+ }
+ }
+ if (locked) {
+ VM_STAT_ADD(page_lookup_nowait_cnt[7]);
+ mutex_exit(phm);
+ }
+
+ ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1);
+
+ return (pp);
+}
+
+/*
+ * Search the hash list for a page with the specified [vp, off]
+ * that is known to exist and is already locked. This routine
+ * is typically used by segment SOFTUNLOCK routines.
+ */
+page_t *
+page_find(vnode_t *vp, u_offset_t off)
+{
+ page_t *pp;
+ kmutex_t *phm;
+ ulong_t index;
+
+ ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
+ VM_STAT_ADD(page_find_cnt);
+
+ index = PAGE_HASH_FUNC(vp, off);
+ phm = PAGE_HASH_MUTEX(index);
+
+ mutex_enter(phm);
+ PAGE_HASH_SEARCH(index, pp, vp, off);
+ mutex_exit(phm);
+
+ ASSERT(pp != NULL);
+ ASSERT(PAGE_LOCKED(pp) || panicstr);
+ return (pp);
+}
+
+/*
+ * Determine whether a page with the specified [vp, off]
+ * currently exists in the system. Obviously this should
+ * only be considered as a hint since nothing prevents the
+ * page from disappearing or appearing immediately after
+ * the return from this routine. Subsequently, we don't
+ * even bother to lock the list.
+ */
+page_t *
+page_exists(vnode_t *vp, u_offset_t off)
+{
+ page_t *pp;
+ ulong_t index;
+
+ ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
+ VM_STAT_ADD(page_exists_cnt);
+
+ index = PAGE_HASH_FUNC(vp, off);
+ PAGE_HASH_SEARCH(index, pp, vp, off);
+
+ return (pp);
+}
+
+/*
+ * Determine if physically contiguous pages exist for the range
+ * [vp, off] - [vp, off + page_size(szc)). If they exist and ppa is not
+ * NULL, fill the ppa array with these pages locked SHARED, reclaiming
+ * them from the freelist if necessary. Return 1 if contiguous pages
+ * exist and 0 otherwise.
+ *
+ * If the pages exist and are contiguous but we fail to lock them, we
+ * still return 1; in that case the return value is only a hint and the
+ * ppa array is not filled. Callers should initialize ppa[0] to NULL to
+ * distinguish the two cases.
+ *
+ * Returns 0 if the pages don't exist or are not physically contiguous.
+ *
+ * This routine doesn't work for anonymous (swapfs) pages.
+ */
+int
+page_exists_physcontig(vnode_t *vp, u_offset_t off, uint_t szc, page_t *ppa[])
+{
+ pgcnt_t pages;
+ pfn_t pfn;
+ page_t *rootpp;
+ pgcnt_t i;
+ pgcnt_t j;
+ u_offset_t save_off = off;
+ ulong_t index;
+ kmutex_t *phm;
+ page_t *pp;
+ uint_t pszc;
+ int loopcnt = 0;
+
+ ASSERT(szc != 0);
+ ASSERT(vp != NULL);
+ ASSERT(!IS_SWAPFSVP(vp));
+ ASSERT(vp != &kvp);
+
+again:
+ if (++loopcnt > 3) {
+ VM_STAT_ADD(page_exphcontg[0]);
+ return (0);
+ }
+
+ index = PAGE_HASH_FUNC(vp, off);
+ phm = PAGE_HASH_MUTEX(index);
+
+ mutex_enter(phm);
+ PAGE_HASH_SEARCH(index, pp, vp, off);
+ mutex_exit(phm);
+
+ VM_STAT_ADD(page_exphcontg[1]);
+
+ if (pp == NULL) {
+ VM_STAT_ADD(page_exphcontg[2]);
+ return (0);
+ }
+
+ pages = page_get_pagecnt(szc);
+ rootpp = pp;
+ pfn = rootpp->p_pagenum;
+
+ if ((pszc = pp->p_szc) >= szc && ppa != NULL) {
+ VM_STAT_ADD(page_exphcontg[3]);
+ if (!page_trylock(pp, SE_SHARED)) {
+ VM_STAT_ADD(page_exphcontg[4]);
+ return (1);
+ }
+ if (pp->p_szc != pszc || pp->p_vnode != vp ||
+ pp->p_offset != off) {
+ VM_STAT_ADD(page_exphcontg[5]);
+ page_unlock(pp);
+ off = save_off;
+ goto again;
+ }
+ /*
+		 * Since szc was non-zero and the vnode and offset matched
+		 * after we locked the page, it can't become free on us.
+ */
+ ASSERT(!PP_ISFREE(pp));
+ if (!IS_P2ALIGNED(pfn, pages)) {
+ page_unlock(pp);
+ return (0);
+ }
+ ppa[0] = pp;
+ pp++;
+ off += PAGESIZE;
+ pfn++;
+ for (i = 1; i < pages; i++, pp++, off += PAGESIZE, pfn++) {
+ if (!page_trylock(pp, SE_SHARED)) {
+ VM_STAT_ADD(page_exphcontg[6]);
+ pp--;
+ while (i-- > 0) {
+ page_unlock(pp);
+ pp--;
+ }
+ ppa[0] = NULL;
+ return (1);
+ }
+ if (pp->p_szc != pszc) {
+ VM_STAT_ADD(page_exphcontg[7]);
+ page_unlock(pp);
+ pp--;
+ while (i-- > 0) {
+ page_unlock(pp);
+ pp--;
+ }
+ ppa[0] = NULL;
+ off = save_off;
+ goto again;
+ }
+ /*
+			 * szc is the same as for the previously locked pages
+			 * with the right identity. Since this page had the
+			 * correct szc after we locked it, it can't get freed
+			 * or destroyed and therefore must have the expected
+			 * identity.
+ */
+ ASSERT(!PP_ISFREE(pp));
+ if (pp->p_vnode != vp ||
+ pp->p_offset != off) {
+ panic("page_exists_physcontig: "
+ "large page identity doesn't match");
+ }
+ ppa[i] = pp;
+ ASSERT(pp->p_pagenum == pfn);
+ }
+ VM_STAT_ADD(page_exphcontg[8]);
+ ppa[pages] = NULL;
+ return (1);
+ } else if (pszc >= szc) {
+ VM_STAT_ADD(page_exphcontg[9]);
+ if (!IS_P2ALIGNED(pfn, pages)) {
+ return (0);
+ }
+ return (1);
+ }
+
+ if (!IS_P2ALIGNED(pfn, pages)) {
+ VM_STAT_ADD(page_exphcontg[10]);
+ return (0);
+ }
+
+ if (page_numtomemseg_nolock(pfn) !=
+ page_numtomemseg_nolock(pfn + pages - 1)) {
+ VM_STAT_ADD(page_exphcontg[11]);
+ return (0);
+ }
+
+ /*
+	 * We loop over the pages up to 4 times to promote the page size.
+	 * We're extra cautious to promote the page size atomically with
+	 * respect to everybody else. But we could probably optimize this
+	 * into 1 loop if it becomes an issue.
+ */
+
+ for (i = 0; i < pages; i++, pp++, off += PAGESIZE, pfn++) {
+ ASSERT(pp->p_pagenum == pfn);
+ if (!page_trylock(pp, SE_EXCL)) {
+ VM_STAT_ADD(page_exphcontg[12]);
+ break;
+ }
+ if (pp->p_vnode != vp ||
+ pp->p_offset != off) {
+ VM_STAT_ADD(page_exphcontg[13]);
+ page_unlock(pp);
+ break;
+ }
+ if (pp->p_szc >= szc) {
+ ASSERT(i == 0);
+ page_unlock(pp);
+ off = save_off;
+ goto again;
+ }
+ }
+
+ if (i != pages) {
+ VM_STAT_ADD(page_exphcontg[14]);
+ --pp;
+ while (i-- > 0) {
+ page_unlock(pp);
+ --pp;
+ }
+ return (0);
+ }
+
+ pp = rootpp;
+ for (i = 0; i < pages; i++, pp++) {
+ if (PP_ISFREE(pp)) {
+ VM_STAT_ADD(page_exphcontg[15]);
+ ASSERT(!PP_ISAGED(pp));
+ ASSERT(pp->p_szc == 0);
+ if (!page_reclaim(pp, NULL)) {
+ break;
+ }
+ } else {
+ ASSERT(pp->p_szc < szc);
+ VM_STAT_ADD(page_exphcontg[16]);
+ (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
+ }
+ }
+ if (i < pages) {
+ VM_STAT_ADD(page_exphcontg[17]);
+ /*
+		 * page_reclaim() failed because we were out of memory.
+		 * Drop the rest of the locks and return, because this page
+		 * must already have been reallocated anyway.
+ */
+ pp = rootpp;
+ for (j = 0; j < pages; j++, pp++) {
+ if (j != i) {
+ page_unlock(pp);
+ }
+ }
+ return (0);
+ }
+
+ off = save_off;
+ pp = rootpp;
+ for (i = 0; i < pages; i++, pp++, off += PAGESIZE) {
+ ASSERT(PAGE_EXCL(pp));
+ ASSERT(!PP_ISFREE(pp));
+ ASSERT(!hat_page_is_mapped(pp));
+ ASSERT(pp->p_vnode == vp);
+ ASSERT(pp->p_offset == off);
+ pp->p_szc = szc;
+ }
+ pp = rootpp;
+ for (i = 0; i < pages; i++, pp++) {
+ if (ppa == NULL) {
+ page_unlock(pp);
+ } else {
+ ppa[i] = pp;
+ page_downgrade(ppa[i]);
+ }
+ }
+ if (ppa != NULL) {
+ ppa[pages] = NULL;
+ }
+ VM_STAT_ADD(page_exphcontg[18]);
+ ASSERT(vp->v_pages != NULL);
+ return (1);
+}
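+
+/*
+ * A sketch of how a caller distinguishes the two "return 1" cases
+ * described above (pages locked vs. existence hint only), where
+ * pages == page_get_pagecnt(szc):
+ *
+ *	ppa[0] = NULL;
+ *	if (page_exists_physcontig(vp, off, szc, ppa)) {
+ *		if (ppa[0] != NULL) {
+ *			(ppa[0 .. pages - 1] hold the constituent pages,
+ *			 each locked SHARED; ppa[pages] is NULL)
+ *		} else {
+ *			(pages exist and are contiguous but could not be
+ *			 locked; treat the result only as a hint)
+ *		}
+ *	}
+ */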
+
+/*
+ * Determine whether a page with the specified [vp, off]
+ * currently exists in the system and if so return its
+ * size code. Obviously this should only be considered as
+ * a hint since nothing prevents the page from disappearing
+ * or appearing immediately after the return from this routine.
+ */
+int
+page_exists_forreal(vnode_t *vp, u_offset_t off, uint_t *szc)
+{
+ page_t *pp;
+ kmutex_t *phm;
+ ulong_t index;
+ int rc = 0;
+
+ ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
+ ASSERT(szc != NULL);
+ VM_STAT_ADD(page_exists_forreal_cnt);
+
+ index = PAGE_HASH_FUNC(vp, off);
+ phm = PAGE_HASH_MUTEX(index);
+
+ mutex_enter(phm);
+ PAGE_HASH_SEARCH(index, pp, vp, off);
+ if (pp != NULL) {
+ *szc = pp->p_szc;
+ rc = 1;
+ }
+ mutex_exit(phm);
+ return (rc);
+}
+
+/* wakeup threads waiting for pages in page_create_get_something() */
+void
+wakeup_pcgs(void)
+{
+ if (!CV_HAS_WAITERS(&pcgs_cv))
+ return;
+ cv_broadcast(&pcgs_cv);
+}
+
+/*
+ * 'freemem' is used all over the kernel as an indication of how many
+ * pages are free (either on the cache list or on the free page list)
+ * in the system. In very few places is a really accurate 'freemem'
+ * needed. To avoid contention on the lock protecting a single
+ * freemem value, it was spread out into NCPU buckets. set_freemem()
+ * sets freemem to the total of all NCPU buckets. It is called from
+ * clock() on each TICK.
+ */
+void
+set_freemem()
+{
+ struct pcf *p;
+ ulong_t t;
+ uint_t i;
+
+ t = 0;
+ p = pcf;
+ for (i = 0; i < PCF_FANOUT; i++) {
+ t += p->pcf_count;
+ p++;
+ }
+ freemem = t;
+
+ /*
+ * Don't worry about grabbing mutex. It's not that
+ * critical if we miss a tick or two. This is
+ * where we wakeup possible delayers in
+ * page_create_get_something().
+ */
+ wakeup_pcgs();
+}
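+
+/*
+ * Put another way, freemem is only ever an approximation of
+ *
+ *	pcf[0].pcf_count + pcf[1].pcf_count + ...
+ *	    + pcf[PCF_FANOUT - 1].pcf_count
+ *
+ * recomputed here once per tick and opportunistically in a few other
+ * places (e.g. get_freemem() below).
+ */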
+
+ulong_t
+get_freemem()
+{
+ struct pcf *p;
+ ulong_t t;
+ uint_t i;
+
+ t = 0;
+ p = pcf;
+ for (i = 0; i < PCF_FANOUT; i++) {
+ t += p->pcf_count;
+ p++;
+ }
+ /*
+ * We just calculated it, might as well set it.
+ */
+ freemem = t;
+ return (t);
+}
+
+/*
+ * Acquire all of the page cache & free (pcf) locks.
+ */
+void
+pcf_acquire_all()
+{
+ struct pcf *p;
+ uint_t i;
+
+ p = pcf;
+ for (i = 0; i < PCF_FANOUT; i++) {
+ p->pcf_touch = 1;
+ mutex_enter(&p->pcf_lock);
+ p++;
+ }
+}
+
+/*
+ * Release all the pcf_locks.
+ */
+void
+pcf_release_all()
+{
+ struct pcf *p;
+ uint_t i;
+
+ p = pcf;
+ for (i = 0; i < PCF_FANOUT; i++) {
+ mutex_exit(&p->pcf_lock);
+ p++;
+ }
+}
+
+/*
+ * Inform the VM system that we need some pages freed up.
+ * Calls must be symmetric, e.g.:
+ *
+ * page_needfree(100);
+ * wait a bit;
+ * page_needfree(-100);
+ */
+void
+page_needfree(spgcnt_t npages)
+{
+ mutex_enter(&new_freemem_lock);
+ needfree += npages;
+ mutex_exit(&new_freemem_lock);
+}
+
+/*
+ * Throttle for page_create(): try to prevent freemem from dropping
+ * below throttlefree. We can't provide a 100% guarantee because
+ * KM_NOSLEEP allocations, page_reclaim(), and various other things
+ * nibble away at the freelist. However, we can block all PG_WAIT
+ * allocations until memory becomes available. The motivation is
+ * that several things can fall apart when there's no free memory:
+ *
+ * (1) If pageout() needs memory to push a page, the system deadlocks.
+ *
+ * (2) By (broken) specification, timeout(9F) can neither fail nor
+ * block, so it has no choice but to panic the system if it
+ * cannot allocate a callout structure.
+ *
+ * (3) Like timeout(), ddi_set_callback() cannot fail and cannot block;
+ * it panics if it cannot allocate a callback structure.
+ *
+ * (4) Untold numbers of third-party drivers have not yet been hardened
+ * against KM_NOSLEEP and/or allocb() failures; they simply assume
+ * success and panic the system with a data fault on failure.
+ * (The long-term solution to this particular problem is to ship
+ * hostile fault-injecting DEBUG kernels with the DDK.)
+ *
+ * It is theoretically impossible to guarantee success of non-blocking
+ * allocations, but in practice, this throttle is very hard to break.
+ */
+static int
+page_create_throttle(pgcnt_t npages, int flags)
+{
+ ulong_t fm;
+ uint_t i;
+ pgcnt_t tf; /* effective value of throttlefree */
+
+ /*
+ * Never deny pages when:
+ * - it's a thread that cannot block [NOMEMWAIT()]
+ * - the allocation cannot block and must not fail
+	 * - the allocation cannot block and is for pageout (PG_PUSHPAGE)
+ */
+ if (NOMEMWAIT() ||
+ ((flags & (PG_WAIT | PG_PANIC)) == PG_PANIC) ||
+ ((flags & (PG_WAIT | PG_PUSHPAGE)) == PG_PUSHPAGE))
+ return (1);
+
+ /*
+ * If the allocation can't block, we look favorably upon it
+ * unless we're below pageout_reserve. In that case we fail
+ * the allocation because we want to make sure there are a few
+ * pages available for pageout.
+ */
+ if ((flags & PG_WAIT) == 0)
+ return (freemem >= npages + pageout_reserve);
+
+ /* Calculate the effective throttlefree value */
+ tf = throttlefree -
+ ((flags & PG_PUSHPAGE) ? pageout_reserve : 0);
+
+ cv_signal(&proc_pageout->p_cv);
+
+ while (freemem < npages + tf) {
+ pcf_acquire_all();
+ mutex_enter(&new_freemem_lock);
+ fm = 0;
+ for (i = 0; i < PCF_FANOUT; i++) {
+ fm += pcf[i].pcf_count;
+ pcf[i].pcf_wait++;
+ mutex_exit(&pcf[i].pcf_lock);
+ }
+ freemem = fm;
+ needfree += npages;
+ freemem_wait++;
+ cv_wait(&freemem_cv, &new_freemem_lock);
+ freemem_wait--;
+ needfree -= npages;
+ mutex_exit(&new_freemem_lock);
+ }
+ return (1);
+}
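+
+/*
+ * In summary: requests that may neither block nor fail (NOMEMWAIT(),
+ * PG_PANIC or PG_PUSHPAGE without PG_WAIT) always pass the throttle;
+ * other non-blocking requests pass only while
+ * freemem >= npages + pageout_reserve; and PG_WAIT requests block until
+ * freemem >= npages + tf, where tf is throttlefree, reduced by
+ * pageout_reserve for PG_PUSHPAGE requests.
+ */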
+
+/*
+ * page_create_wait() is called to either coalesce pages from the
+ * different pcf buckets or to wait because there simply are not
+ * enough pages to satisfy the caller's request.
+ *
+ * Sadly, this is called from platform/vm/vm_machdep.c
+ */
+int
+page_create_wait(size_t npages, uint_t flags)
+{
+ pgcnt_t total;
+ uint_t i;
+ struct pcf *p;
+
+ /*
+ * Wait until there are enough free pages to satisfy our
+ * entire request.
+ * We set needfree += npages before prodding pageout, to make sure
+ * it does real work when npages > lotsfree > freemem.
+ */
+ VM_STAT_ADD(page_create_not_enough);
+
+ ASSERT(!kcage_on ? !(flags & PG_NORELOC) : 1);
+checkagain:
+ if ((flags & PG_NORELOC) &&
+ kcage_freemem < kcage_throttlefree + npages)
+ (void) kcage_create_throttle(npages, flags);
+
+ if (freemem < npages + throttlefree)
+ if (!page_create_throttle(npages, flags))
+ return (0);
+
+ /*
+ * Since page_create_va() looked at every
+ * bucket, assume we are going to have to wait.
+ * Get all of the pcf locks.
+ */
+ total = 0;
+ p = pcf;
+ for (i = 0; i < PCF_FANOUT; i++) {
+ p->pcf_touch = 1;
+ mutex_enter(&p->pcf_lock);
+ total += p->pcf_count;
+ if (total >= npages) {
+ /*
+			 * Wow! There are enough pages lying around
+ * to satisfy the request. Do the accounting,
+ * drop the locks we acquired, and go back.
+ *
+ * freemem is not protected by any lock. So,
+ * we cannot have any assertion containing
+ * freemem.
+ */
+ freemem -= npages;
+
+ while (p >= pcf) {
+ if (p->pcf_count <= npages) {
+ npages -= p->pcf_count;
+ p->pcf_count = 0;
+ } else {
+ p->pcf_count -= (uint_t)npages;
+ npages = 0;
+ }
+ mutex_exit(&p->pcf_lock);
+ p--;
+ }
+ ASSERT(npages == 0);
+ return (1);
+ }
+ p++;
+ }
+
+ /*
+	 * All of the pcf locks are held and there are not enough pages
+	 * to satisfy the request (total < npages).
+ * Be sure to acquire the new_freemem_lock before dropping
+ * the pcf locks. This prevents dropping wakeups in page_free().
+ * The order is always pcf_lock then new_freemem_lock.
+ *
+ * Since we hold all the pcf locks, it is a good time to set freemem.
+ *
+ * If the caller does not want to wait, return now.
+ * Else turn the pageout daemon loose to find something
+ * and wait till it does.
+ *
+ */
+ freemem = total;
+
+ if ((flags & PG_WAIT) == 0) {
+ pcf_release_all();
+
+ TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_NOMEM,
+ "page_create_nomem:npages %ld freemem %ld", npages, freemem);
+ return (0);
+ }
+
+ ASSERT(proc_pageout != NULL);
+ cv_signal(&proc_pageout->p_cv);
+
+ TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_START,
+ "page_create_sleep_start: freemem %ld needfree %ld",
+ freemem, needfree);
+
+ /*
+ * We are going to wait.
+ * We currently hold all of the pcf_locks,
+ * get the new_freemem_lock (it protects freemem_wait),
+ * before dropping the pcf_locks.
+ */
+ mutex_enter(&new_freemem_lock);
+
+ p = pcf;
+ for (i = 0; i < PCF_FANOUT; i++) {
+ p->pcf_wait++;
+ mutex_exit(&p->pcf_lock);
+ p++;
+ }
+
+ needfree += npages;
+ freemem_wait++;
+
+ cv_wait(&freemem_cv, &new_freemem_lock);
+
+ freemem_wait--;
+ needfree -= npages;
+
+ mutex_exit(&new_freemem_lock);
+
+ TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_END,
+ "page_create_sleep_end: freemem %ld needfree %ld",
+ freemem, needfree);
+
+ VM_STAT_ADD(page_create_not_enough_again);
+ goto checkagain;
+}
+
+/*
+ * A routine to do the opposite of page_create_wait().
+ */
+void
+page_create_putback(spgcnt_t npages)
+{
+ struct pcf *p;
+ pgcnt_t lump;
+ uint_t *which;
+
+ /*
+ * When a contiguous lump is broken up, we have to
+	 * deal with lots of pages (min 64), so let's spread
+ * the wealth around.
+ */
+ lump = roundup(npages, PCF_FANOUT) / PCF_FANOUT;
+ freemem += npages;
+
+ for (p = pcf; (npages > 0) && (p < &pcf[PCF_FANOUT]); p++) {
+ which = &p->pcf_count;
+
+ mutex_enter(&p->pcf_lock);
+
+ if (p->pcf_block) {
+ which = &p->pcf_reserve;
+ }
+
+ if (lump < npages) {
+ *which += (uint_t)lump;
+ npages -= lump;
+ } else {
+ *which += (uint_t)npages;
+ npages = 0;
+ }
+
+ if (p->pcf_wait) {
+ mutex_enter(&new_freemem_lock);
+ /*
+ * Check to see if some other thread
+ * is actually waiting. Another bucket
+ * may have woken it up by now. If there
+ * are no waiters, then set our pcf_wait
+ * count to zero to avoid coming in here
+ * next time.
+ */
+ if (freemem_wait) {
+ if (npages > 1) {
+ cv_broadcast(&freemem_cv);
+ } else {
+ cv_signal(&freemem_cv);
+ }
+ p->pcf_wait--;
+ } else {
+ p->pcf_wait = 0;
+ }
+ mutex_exit(&new_freemem_lock);
+ }
+ mutex_exit(&p->pcf_lock);
+ }
+ ASSERT(npages == 0);
+}
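+
+/*
+ * The usual pairing is to account for pages up front with
+ * page_create_wait() and to give back whatever could not actually be
+ * taken from the freelists with page_create_putback(). A sketch,
+ * essentially what page_alloc_pages() below does:
+ *
+ *	(void) page_create_wait(npgs, PG_WAIT);
+ *	pp = page_get_freelist(NULL, 0, seg, addr, pgsz, 0, lgrp);
+ *	if (pp == NULL) {
+ *		page_create_putback(npgs);
+ *		return (ENOMEM);
+ *	}
+ */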
+
+/*
+ * A helper routine for page_create_get_something.
+ * The indenting got too deep down there.
+ * Unblock the pcf counters. Any pages freed after
+ * pcf_block got set are moved to pcf_count and
+ * wakeups (cv_broadcast() or cv_signal()) are done as needed.
+ */
+static void
+pcgs_unblock(void)
+{
+ int i;
+ struct pcf *p;
+
+ /* Update freemem while we're here. */
+ freemem = 0;
+ p = pcf;
+ for (i = 0; i < PCF_FANOUT; i++) {
+ mutex_enter(&p->pcf_lock);
+ ASSERT(p->pcf_count == 0);
+ p->pcf_count = p->pcf_reserve;
+ p->pcf_block = 0;
+ freemem += p->pcf_count;
+ if (p->pcf_wait) {
+ mutex_enter(&new_freemem_lock);
+ if (freemem_wait) {
+ if (p->pcf_reserve > 1) {
+ cv_broadcast(&freemem_cv);
+ p->pcf_wait = 0;
+ } else {
+ cv_signal(&freemem_cv);
+ p->pcf_wait--;
+ }
+ } else {
+ p->pcf_wait = 0;
+ }
+ mutex_exit(&new_freemem_lock);
+ }
+ p->pcf_reserve = 0;
+ mutex_exit(&p->pcf_lock);
+ p++;
+ }
+}
+
+/*
+ * Called from page_create_va() when both the cache and free lists
+ * have been checked once.
+ *
+ * Either returns a page or panics since the accounting was done
+ * way before we got here.
+ *
+ * We don't come here often, so leave the accounting on permanently.
+ */
+
+#define MAX_PCGS 100
+
+#ifdef DEBUG
+#define PCGS_TRIES 100
+#else /* DEBUG */
+#define PCGS_TRIES 10
+#endif /* DEBUG */
+
+#ifdef VM_STATS
+uint_t pcgs_counts[PCGS_TRIES];
+uint_t pcgs_too_many;
+uint_t pcgs_entered;
+uint_t pcgs_entered_noreloc;
+uint_t pcgs_locked;
+uint_t pcgs_cagelocked;
+#endif /* VM_STATS */
+
+static page_t *
+page_create_get_something(vnode_t *vp, u_offset_t off, struct seg *seg,
+ caddr_t vaddr, uint_t flags)
+{
+ uint_t count;
+ page_t *pp;
+ uint_t locked, i;
+ struct pcf *p;
+ lgrp_t *lgrp;
+ int cagelocked = 0;
+
+ VM_STAT_ADD(pcgs_entered);
+
+ /*
+ * Tap any reserve freelists: if we fail now, we'll die
+ * since the page(s) we're looking for have already been
+ * accounted for.
+ */
+ flags |= PG_PANIC;
+
+ if ((flags & PG_NORELOC) != 0) {
+ VM_STAT_ADD(pcgs_entered_noreloc);
+ /*
+ * Requests for free pages from critical threads
+ * such as pageout still won't throttle here, but
+ * we must try again, to give the cageout thread
+ * another chance to catch up. Since we already
+ * accounted for the pages, we had better get them
+ * this time.
+ *
+ * N.B. All non-critical threads acquire the pcgs_cagelock
+ * to serialize access to the freelists. This implements a
+		 * turnstile-type synchronization to avoid starvation of
+ * critical requests for PG_NORELOC memory by non-critical
+ * threads: all non-critical threads must acquire a 'ticket'
+ * before passing through, which entails making sure
+ * kcage_freemem won't fall below minfree prior to grabbing
+ * pages from the freelists.
+ */
+ if (kcage_create_throttle(1, flags) == KCT_NONCRIT) {
+ mutex_enter(&pcgs_cagelock);
+ cagelocked = 1;
+ VM_STAT_ADD(pcgs_cagelocked);
+ }
+ }
+
+ /*
+ * Time to get serious.
+ * We failed to get a `correctly colored' page from both the
+ * free and cache lists.
+	 * We escalate in stages.
+	 *
+	 * First, try both lists without worrying about color.
+ *
+ * Then, grab all page accounting locks (ie. pcf[]) and
+ * steal any pages that they have and set the pcf_block flag to
+ * stop deletions from the lists. This will help because
+ * a page can get added to the free list while we are looking
+ * at the cache list, then another page could be added to the cache
+ * list allowing the page on the free list to be removed as we
+ * move from looking at the cache list to the free list. This
+ * could happen over and over. We would never find the page
+ * we have accounted for.
+ *
+ * Noreloc pages are a subset of the global (relocatable) page pool.
+ * They are not tracked separately in the pcf bins, so it is
+ * impossible to know when doing pcf accounting if the available
+ * page(s) are noreloc pages or not. When looking for a noreloc page
+ * it is quite easy to end up here even if the global (relocatable)
+ * page pool has plenty of free pages but the noreloc pool is empty.
+ *
+ * When the noreloc pool is empty (or low), additional noreloc pages
+ * are created by converting pages from the global page pool. This
+ * process will stall during pcf accounting if the pcf bins are
+ * already locked. Such is the case when a noreloc allocation is
+ * looping here in page_create_get_something waiting for more noreloc
+ * pages to appear.
+ *
+ * Short of adding a new field to the pcf bins to accurately track
+ * the number of free noreloc pages, we instead do not grab the
+ * pcgs_lock, do not set the pcf blocks and do not timeout when
+ * allocating a noreloc page. This allows noreloc allocations to
+ * loop without blocking global page pool allocations.
+ *
+ * NOTE: the behaviour of page_create_get_something has not changed
+ * for the case of global page pool allocations.
+ */
+
+ flags &= ~PG_MATCH_COLOR;
+ locked = 0;
+#ifndef __sparc
+ /*
+ * page_create_get_something may be called because 4g memory may be
+ * depleted. Set flags to allow for relocation of base page below
+ * 4g if necessary.
+ */
+ if (physmax4g)
+ flags |= (PGI_PGCPSZC0 | PGI_PGCPHIPRI);
+#endif
+
+ lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE);
+
+ for (count = 0; kcage_on || count < MAX_PCGS; count++) {
+ pp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE,
+ flags, lgrp);
+ if (pp == NULL) {
+ pp = page_get_cachelist(vp, off, seg, vaddr,
+ flags, lgrp);
+ }
+ if (pp == NULL) {
+ /*
+ * Serialize. Don't fight with other pcgs().
+ */
+ if (!locked && (!kcage_on || !(flags & PG_NORELOC))) {
+ mutex_enter(&pcgs_lock);
+ VM_STAT_ADD(pcgs_locked);
+ locked = 1;
+ p = pcf;
+ for (i = 0; i < PCF_FANOUT; i++) {
+ mutex_enter(&p->pcf_lock);
+ ASSERT(p->pcf_block == 0);
+ p->pcf_block = 1;
+ p->pcf_reserve = p->pcf_count;
+ p->pcf_count = 0;
+ mutex_exit(&p->pcf_lock);
+ p++;
+ }
+ freemem = 0;
+ }
+
+ if (count) {
+ /*
+ * Since page_free() puts pages on
+ * a list then accounts for it, we
+ * just have to wait for page_free()
+ * to unlock any page it was working
+ * with. The page_lock()-page_reclaim()
+ * path falls in the same boat.
+ *
+ * We don't need to check on the
+ * PG_WAIT flag, we have already
+ * accounted for the page we are
+ * looking for in page_create_va().
+ *
+ * We just wait a moment to let any
+ * locked pages on the lists free up,
+ * then continue around and try again.
+ *
+ * Will be awakened by set_freemem().
+ */
+ mutex_enter(&pcgs_wait_lock);
+ cv_wait(&pcgs_cv, &pcgs_wait_lock);
+ mutex_exit(&pcgs_wait_lock);
+ }
+ } else {
+#ifdef VM_STATS
+ if (count >= PCGS_TRIES) {
+ VM_STAT_ADD(pcgs_too_many);
+ } else {
+ VM_STAT_ADD(pcgs_counts[count]);
+ }
+#endif
+ if (locked) {
+ pcgs_unblock();
+ mutex_exit(&pcgs_lock);
+ }
+ if (cagelocked)
+ mutex_exit(&pcgs_cagelock);
+ return (pp);
+ }
+ }
+ /*
+ * we go down holding the pcf locks.
+ */
+ panic("no %spage found %d",
+ ((flags & PG_NORELOC) ? "non-reloc " : ""), count);
+ /*NOTREACHED*/
+}
+
+/*
+ * Create enough pages for "bytes" worth of data starting at
+ * "off" in "vp".
+ *
+ * Where flag must be one of:
+ *
+ * PG_EXCL: Exclusive create (fail if any page already
+ * exists in the page cache) which does not
+ * wait for memory to become available.
+ *
+ * PG_WAIT: Non-exclusive create which can wait for
+ * memory to become available.
+ *
+ * PG_PHYSCONTIG: Allocate physically contiguous pages.
+ * (Not Supported)
+ *
+ * A doubly linked list of pages is returned to the caller. Each page
+ * on the list has the "exclusive" (p_selock) lock and "iolock" (p_iolock)
+ * lock.
+ *
+ * Unable to change the parameters to page_create() in a minor release,
+ * we renamed page_create() to page_create_va(), changed all known calls
+ * from page_create() to page_create_va(), and created this wrapper.
+ *
+ * Upon a major release, we should break compatibility by deleting this
+ * wrapper, and replacing all the strings "page_create_va", with "page_create".
+ *
+ * NOTE: There is a copy of this interface as page_create_io() in
+ * i86/vm/vm_machdep.c. Any bugs fixed here should be applied
+ * there.
+ */
+page_t *
+page_create(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags)
+{
+ caddr_t random_vaddr;
+ struct seg kseg;
+
+#ifdef DEBUG
+ cmn_err(CE_WARN, "Using deprecated interface page_create: caller %p",
+ (void *)caller());
+#endif
+
+ random_vaddr = (caddr_t)(((uintptr_t)vp >> 7) ^
+ (uintptr_t)(off >> PAGESHIFT));
+ kseg.s_as = &kas;
+
+ return (page_create_va(vp, off, bytes, flags, &kseg, random_vaddr));
+}
+
+#ifdef DEBUG
+uint32_t pg_alloc_pgs_mtbf = 0;
+#endif
+
+/*
+ * Used for large page support. It will attempt to allocate
+ * a large page (or pages) off the freelist.
+ *
+ * Returns non-zero on failure.
+ */
+int
+page_alloc_pages(struct seg *seg, caddr_t addr, page_t **basepp,
+ page_t *ppa[], uint_t szc, int anypgsz)
+{
+ pgcnt_t npgs, curnpgs, totpgs;
+ size_t pgsz;
+ page_t *pplist = NULL, *pp;
+ int err = 0;
+ lgrp_t *lgrp;
+
+ ASSERT(szc != 0 && szc <= (page_num_pagesizes() - 1));
+
+ VM_STAT_ADD(alloc_pages[0]);
+
+#ifdef DEBUG
+ if (pg_alloc_pgs_mtbf && !(gethrtime() % pg_alloc_pgs_mtbf)) {
+ return (ENOMEM);
+ }
+#endif
+
+ pgsz = page_get_pagesize(szc);
+ totpgs = curnpgs = npgs = pgsz >> PAGESHIFT;
+
+ ASSERT(((uintptr_t)addr & (pgsz - 1)) == 0);
+ /*
+	 * Exactly one of basepp and ppa must be NULL,
+	 * and exactly one must be non-NULL.
+ */
+ ASSERT(basepp != NULL || ppa != NULL);
+ ASSERT(basepp == NULL || ppa == NULL);
+
+ (void) page_create_wait(npgs, PG_WAIT);
+
+ while (npgs && szc) {
+ lgrp = lgrp_mem_choose(seg, addr, pgsz);
+ pp = page_get_freelist(NULL, 0, seg, addr, pgsz, 0, lgrp);
+ if (pp != NULL) {
+ VM_STAT_ADD(alloc_pages[1]);
+ page_list_concat(&pplist, &pp);
+ ASSERT(npgs >= curnpgs);
+ npgs -= curnpgs;
+ } else if (anypgsz) {
+ VM_STAT_ADD(alloc_pages[2]);
+ szc--;
+ pgsz = page_get_pagesize(szc);
+ curnpgs = pgsz >> PAGESHIFT;
+ } else {
+ VM_STAT_ADD(alloc_pages[3]);
+ ASSERT(npgs == totpgs);
+ page_create_putback(npgs);
+ return (ENOMEM);
+ }
+ }
+ if (szc == 0) {
+ VM_STAT_ADD(alloc_pages[4]);
+ ASSERT(npgs != 0);
+ page_create_putback(npgs);
+ err = ENOMEM;
+ } else if (basepp != NULL) {
+ ASSERT(npgs == 0);
+ ASSERT(ppa == NULL);
+ *basepp = pplist;
+ }
+
+ npgs = totpgs - npgs;
+ pp = pplist;
+
+ /*
+	 * Clear the free and age bits. Also, if we were passed a ppa, then
+	 * fill it in with all the constituent pages from the large page. But
+	 * if we failed to allocate all the pages, just free what we got.
+ */
+ while (npgs != 0) {
+ ASSERT(PP_ISFREE(pp));
+ ASSERT(PP_ISAGED(pp));
+ if (ppa != NULL || err != 0) {
+ if (err == 0) {
+ VM_STAT_ADD(alloc_pages[5]);
+ PP_CLRFREE(pp);
+ PP_CLRAGED(pp);
+ page_sub(&pplist, pp);
+ *ppa++ = pp;
+ npgs--;
+ } else {
+ VM_STAT_ADD(alloc_pages[6]);
+ ASSERT(pp->p_szc != 0);
+ curnpgs = page_get_pagecnt(pp->p_szc);
+ page_list_break(&pp, &pplist, curnpgs);
+ page_list_add_pages(pp, 0);
+ page_create_putback(curnpgs);
+ ASSERT(npgs >= curnpgs);
+ npgs -= curnpgs;
+ }
+ pp = pplist;
+ } else {
+ VM_STAT_ADD(alloc_pages[7]);
+ PP_CLRFREE(pp);
+ PP_CLRAGED(pp);
+ pp = pp->p_next;
+ npgs--;
+ }
+ }
+ return (err);
+}
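+
+/*
+ * Callers pass exactly one of basepp or ppa (see the ASSERTs above).
+ * With basepp, the allocation comes back as a single page list rooted
+ * at *basepp; with ppa, every constituent PAGESIZE page is returned in
+ * ppa[0 .. page_get_pagecnt(szc) - 1]. For example (sketch; ppa must
+ * have room for page_get_pagecnt(szc) entries):
+ *
+ *	if (page_alloc_pages(seg, addr, NULL, ppa, szc, 1) == 0) {
+ *		(ppa[] now holds the constituent pages)
+ *	}
+ */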
+
+/*
+ * Get a single large page off of the freelists, and set it up for use.
+ * Number of bytes requested must be a supported page size.
+ *
+ * Note that this call may fail even if there is sufficient
+ * memory available or PG_WAIT is set, so the caller must
+ * be willing to fall back on page_create_va(), block and retry,
+ * or fail the requester.
+ */
+page_t *
+page_create_va_large(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags,
+ struct seg *seg, caddr_t vaddr, void *arg)
+{
+ pgcnt_t npages, pcftotal;
+ page_t *pp;
+ page_t *rootpp;
+ lgrp_t *lgrp;
+ uint_t enough;
+ uint_t pcf_index;
+ uint_t i;
+ struct pcf *p;
+ struct pcf *q;
+ lgrp_id_t *lgrpid = (lgrp_id_t *)arg;
+
+ ASSERT(vp != NULL);
+
+ ASSERT((flags & ~(PG_EXCL | PG_WAIT |
+ PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == 0);
+ /* but no others */
+
+ ASSERT((flags & PG_EXCL) == PG_EXCL);
+
+ npages = btop(bytes);
+
+ if (!kcage_on || panicstr) {
+ /*
+ * Cage is OFF, or we are single threaded in
+ * panic, so make everything a RELOC request.
+ */
+ flags &= ~PG_NORELOC;
+ }
+
+ /*
+ * Make sure there's adequate physical memory available.
+ * Note: PG_WAIT is ignored here.
+ */
+ if (freemem <= throttlefree + npages) {
+ VM_STAT_ADD(page_create_large_cnt[1]);
+ return (NULL);
+ }
+
+ /*
+ * If cage is on, dampen draw from cage when available
+ * cage space is low.
+ */
+ if ((flags & (PG_NORELOC | PG_WAIT)) == (PG_NORELOC | PG_WAIT) &&
+ kcage_freemem < kcage_throttlefree + npages) {
+
+ /*
+ * The cage is on, the caller wants PG_NORELOC
+ * pages and available cage memory is very low.
+ * Call kcage_create_throttle() to attempt to
+ * control demand on the cage.
+ */
+ if (kcage_create_throttle(npages, flags) == KCT_FAILURE) {
+ VM_STAT_ADD(page_create_large_cnt[2]);
+ return (NULL);
+ }
+ }
+
+ enough = 0;
+ pcf_index = PCF_INDEX();
+ p = &pcf[pcf_index];
+ p->pcf_touch = 1;
+ q = &pcf[PCF_FANOUT];
+ for (pcftotal = 0, i = 0; i < PCF_FANOUT; i++) {
+ if (p->pcf_count > npages) {
+ /*
+ * a good one to try.
+ */
+ mutex_enter(&p->pcf_lock);
+ if (p->pcf_count > npages) {
+ p->pcf_count -= (uint_t)npages;
+ /*
+ * freemem is not protected by any lock.
+ * Thus, we cannot have any assertion
+ * containing freemem here.
+ */
+ freemem -= npages;
+ enough = 1;
+ mutex_exit(&p->pcf_lock);
+ break;
+ }
+ mutex_exit(&p->pcf_lock);
+ }
+ pcftotal += p->pcf_count;
+ p++;
+ if (p >= q) {
+ p = pcf;
+ }
+ p->pcf_touch = 1;
+ }
+
+ if (!enough) {
+ /* If there isn't enough memory available, give up. */
+ if (pcftotal < npages) {
+ VM_STAT_ADD(page_create_large_cnt[3]);
+ return (NULL);
+ }
+
+ /* try to collect pages from several pcf bins */
+ for (p = pcf, pcftotal = 0, i = 0; i < PCF_FANOUT; i++) {
+ p->pcf_touch = 1;
+ mutex_enter(&p->pcf_lock);
+ pcftotal += p->pcf_count;
+ if (pcftotal >= npages) {
+ /*
+				 * Wow! There are enough pages lying around
+ * to satisfy the request. Do the accounting,
+ * drop the locks we acquired, and go back.
+ *
+ * freemem is not protected by any lock. So,
+ * we cannot have any assertion containing
+ * freemem.
+ */
+ pgcnt_t tpages = npages;
+ freemem -= npages;
+ while (p >= pcf) {
+ if (p->pcf_count <= tpages) {
+ tpages -= p->pcf_count;
+ p->pcf_count = 0;
+ } else {
+ p->pcf_count -= (uint_t)tpages;
+ tpages = 0;
+ }
+ mutex_exit(&p->pcf_lock);
+ p--;
+ }
+ ASSERT(tpages == 0);
+ break;
+ }
+ p++;
+ }
+ if (i == PCF_FANOUT) {
+ /* failed to collect pages - release the locks */
+ while (--p >= pcf) {
+ mutex_exit(&p->pcf_lock);
+ }
+ VM_STAT_ADD(page_create_large_cnt[4]);
+ return (NULL);
+ }
+ }
+
+ /*
+ * This is where this function behaves fundamentally differently
+ * than page_create_va(); since we're intending to map the page
+ * with a single TTE, we have to get it as a physically contiguous
+ * hardware pagesize chunk. If we can't, we fail.
+ */
+ if (lgrpid != NULL && *lgrpid >= 0 && *lgrpid <= lgrp_alloc_max &&
+ LGRP_EXISTS(lgrp_table[*lgrpid]))
+ lgrp = lgrp_table[*lgrpid];
+ else
+ lgrp = lgrp_mem_choose(seg, vaddr, bytes);
+
+ if ((rootpp = page_get_freelist(&kvp, off, seg, vaddr,
+ bytes, flags & ~PG_MATCH_COLOR, lgrp)) == NULL) {
+ page_create_putback(npages);
+ VM_STAT_ADD(page_create_large_cnt[5]);
+ return (NULL);
+ }
+
+ /*
+	 * If we got the page with the wrong mtype, give it back. This is a
+	 * workaround for CR 6249718. When CR 6249718 is fixed we will never
+	 * get inside this "if" and the workaround becomes just a nop.
+ */
+ if (kcage_on && (flags & PG_NORELOC) && !PP_ISNORELOC(rootpp)) {
+ page_list_add_pages(rootpp, 0);
+ page_create_putback(npages);
+ VM_STAT_ADD(page_create_large_cnt[6]);
+ return (NULL);
+ }
+
+ /*
+ * If satisfying this request has left us with too little
+ * memory, start the wheels turning to get some back. The
+ * first clause of the test prevents waking up the pageout
+ * daemon in situations where it would decide that there's
+ * nothing to do.
+ */
+ if (nscan < desscan && freemem < minfree) {
+ TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
+ "pageout_cv_signal:freemem %ld", freemem);
+ cv_signal(&proc_pageout->p_cv);
+ }
+
+ pp = rootpp;
+ while (npages--) {
+ ASSERT(PAGE_EXCL(pp));
+ ASSERT(pp->p_vnode == NULL);
+ ASSERT(!hat_page_is_mapped(pp));
+ PP_CLRFREE(pp);
+ PP_CLRAGED(pp);
+ if (!page_hashin(pp, vp, off, NULL))
+ panic("page_create_large: hashin failed: page %p",
+ (void *)pp);
+ page_io_lock(pp);
+ off += PAGESIZE;
+ pp = pp->p_next;
+ }
+
+ VM_STAT_ADD(page_create_large_cnt[0]);
+ return (rootpp);
+}
+
+page_t *
+page_create_va(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags,
+ struct seg *seg, caddr_t vaddr)
+{
+ page_t *plist = NULL;
+ pgcnt_t npages;
+ pgcnt_t found_on_free = 0;
+ pgcnt_t pages_req;
+ page_t *npp = NULL;
+ uint_t enough;
+ uint_t i;
+ uint_t pcf_index;
+ struct pcf *p;
+ struct pcf *q;
+ lgrp_t *lgrp;
+
+ TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
+ "page_create_start:vp %p off %llx bytes %lu flags %x",
+ vp, off, bytes, flags);
+
+ ASSERT(bytes != 0 && vp != NULL);
+
+ if ((flags & PG_EXCL) == 0 && (flags & PG_WAIT) == 0) {
+ panic("page_create: invalid flags");
+ /*NOTREACHED*/
+ }
+ ASSERT((flags & ~(PG_EXCL | PG_WAIT |
+ PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == 0);
+ /* but no others */
+
+ pages_req = npages = btopr(bytes);
+ /*
+ * Try to see whether request is too large to *ever* be
+ * satisfied, in order to prevent deadlock. We arbitrarily
+ * decide to limit maximum size requests to max_page_get.
+ */
+ if (npages >= max_page_get) {
+ if ((flags & PG_WAIT) == 0) {
+ TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_TOOBIG,
+ "page_create_toobig:vp %p off %llx npages "
+ "%lu max_page_get %lu",
+ vp, off, npages, max_page_get);
+ return (NULL);
+ } else {
+ cmn_err(CE_WARN,
+ "Request for too much kernel memory "
+ "(%lu bytes), will hang forever", bytes);
+ for (;;)
+ delay(1000000000);
+ }
+ }
+
+ if (!kcage_on || panicstr) {
+ /*
+ * Cage is OFF, or we are single threaded in
+ * panic, so make everything a RELOC request.
+ */
+ flags &= ~PG_NORELOC;
+ }
+
+ if (freemem <= throttlefree + npages)
+ if (!page_create_throttle(npages, flags))
+ return (NULL);
+
+ /*
+ * If cage is on, dampen draw from cage when available
+ * cage space is low.
+ */
+ if ((flags & PG_NORELOC) &&
+ kcage_freemem < kcage_throttlefree + npages) {
+
+ /*
+ * The cage is on, the caller wants PG_NORELOC
+ * pages and available cage memory is very low.
+ * Call kcage_create_throttle() to attempt to
+ * control demand on the cage.
+ */
+ if (kcage_create_throttle(npages, flags) == KCT_FAILURE)
+ return (NULL);
+ }
+
+ VM_STAT_ADD(page_create_cnt[0]);
+
+ enough = 0;
+ pcf_index = PCF_INDEX();
+
+ p = &pcf[pcf_index];
+ p->pcf_touch = 1;
+ q = &pcf[PCF_FANOUT];
+ for (i = 0; i < PCF_FANOUT; i++) {
+ if (p->pcf_count > npages) {
+ /*
+ * a good one to try.
+ */
+ mutex_enter(&p->pcf_lock);
+ if (p->pcf_count > npages) {
+ p->pcf_count -= (uint_t)npages;
+ /*
+ * freemem is not protected by any lock.
+ * Thus, we cannot have any assertion
+ * containing freemem here.
+ */
+ freemem -= npages;
+ enough = 1;
+ mutex_exit(&p->pcf_lock);
+ break;
+ }
+ mutex_exit(&p->pcf_lock);
+ }
+ p++;
+ if (p >= q) {
+ p = pcf;
+ }
+ p->pcf_touch = 1;
+ }
+
+ if (!enough) {
+ /*
+ * Have to look harder. If npages is greater than
+		 * one, then we might have to coalesce the counters.
+ *
+ * Go wait. We come back having accounted
+ * for the memory.
+ */
+ VM_STAT_ADD(page_create_cnt[1]);
+ if (!page_create_wait(npages, flags)) {
+ VM_STAT_ADD(page_create_cnt[2]);
+ return (NULL);
+ }
+ }
+
+ TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
+ "page_create_success:vp %p off %llx", vp, off);
+
+ /*
+ * If satisfying this request has left us with too little
+ * memory, start the wheels turning to get some back. The
+ * first clause of the test prevents waking up the pageout
+ * daemon in situations where it would decide that there's
+ * nothing to do.
+ */
+ if (nscan < desscan && freemem < minfree) {
+ TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
+ "pageout_cv_signal:freemem %ld", freemem);
+ cv_signal(&proc_pageout->p_cv);
+ }
+
+ /*
+ * Loop around collecting the requested number of pages.
+ * Most of the time, we have to `create' a new page. With
+ * this in mind, pull the page off the free list before
+ * getting the hash lock. This will minimize the hash
+ * lock hold time, nesting, and the like. If it turns
+ * out we don't need the page, we put it back at the end.
+ */
+ while (npages--) {
+ page_t *pp;
+ kmutex_t *phm = NULL;
+ ulong_t index;
+
+ index = PAGE_HASH_FUNC(vp, off);
+top:
+ ASSERT(phm == NULL);
+ ASSERT(index == PAGE_HASH_FUNC(vp, off));
+ ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
+
+ if (npp == NULL) {
+ /*
+ * Try to get a page from the freelist (ie,
+ * a page with no [vp, off] tag). If that
+ * fails, use the cachelist.
+ *
+ * During the first attempt at both the free
+ * and cache lists we try for the correct color.
+ */
+ /*
+			 * XXXX-how do we deal with virtually indexed
+			 * caches and colors?
+ */
+ VM_STAT_ADD(page_create_cnt[4]);
+ /*
+ * Get lgroup to allocate next page of shared memory
+ * from and use it to specify where to allocate
+ * the physical memory
+ */
+ lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE);
+ npp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE,
+ flags | PG_MATCH_COLOR, lgrp);
+ if (npp == NULL) {
+ npp = page_get_cachelist(vp, off, seg,
+ vaddr, flags | PG_MATCH_COLOR, lgrp);
+ if (npp == NULL) {
+ npp = page_create_get_something(vp,
+ off, seg, vaddr,
+ flags & ~PG_MATCH_COLOR);
+ }
+
+ if (PP_ISAGED(npp) == 0) {
+ /*
+ * Since this page came from the
+ * cachelist, we must destroy the
+ * old vnode association.
+ */
+ page_hashout(npp, NULL);
+ }
+ }
+ }
+
+ /*
+ * We own this page!
+ */
+ ASSERT(PAGE_EXCL(npp));
+ ASSERT(npp->p_vnode == NULL);
+ ASSERT(!hat_page_is_mapped(npp));
+ PP_CLRFREE(npp);
+ PP_CLRAGED(npp);
+
+ /*
+		 * Here we have a page in our hot little mitts and are
+ * just waiting to stuff it on the appropriate lists.
+ * Get the mutex and check to see if it really does
+ * not exist.
+ */
+ phm = PAGE_HASH_MUTEX(index);
+ mutex_enter(phm);
+ PAGE_HASH_SEARCH(index, pp, vp, off);
+ if (pp == NULL) {
+ VM_STAT_ADD(page_create_new);
+ pp = npp;
+ npp = NULL;
+ if (!page_hashin(pp, vp, off, phm)) {
+ /*
+ * Since we hold the page hash mutex and
+ * just searched for this page, page_hashin
+ * had better not fail. If it does, that
+				 * means some thread did not follow the
+ * page hash mutex rules. Panic now and
+ * get it over with. As usual, go down
+ * holding all the locks.
+ */
+ ASSERT(MUTEX_HELD(phm));
+ panic("page_create: "
+ "hashin failed %p %p %llx %p",
+ (void *)pp, (void *)vp, off, (void *)phm);
+ /*NOTREACHED*/
+ }
+ ASSERT(MUTEX_HELD(phm));
+ mutex_exit(phm);
+ phm = NULL;
+
+ /*
+ * Hat layer locking need not be done to set
+ * the following bits since the page is not hashed
+ * and was on the free list (i.e., had no mappings).
+ *
+ * Set the reference bit to protect
+ * against immediate pageout
+ *
+ * XXXmh modify freelist code to set reference
+ * bit so we don't have to do it here.
+ */
+ page_set_props(pp, P_REF);
+ found_on_free++;
+ } else {
+ VM_STAT_ADD(page_create_exists);
+ if (flags & PG_EXCL) {
+ /*
+ * Found an existing page, and the caller
+ * wanted all new pages. Undo all of the work
+ * we have done.
+ */
+ mutex_exit(phm);
+ phm = NULL;
+ while (plist != NULL) {
+ pp = plist;
+ page_sub(&plist, pp);
+ page_io_unlock(pp);
+ /* large pages should not end up here */
+ ASSERT(pp->p_szc == 0);
+ /*LINTED: constant in conditional ctx*/
+ VN_DISPOSE(pp, B_INVAL, 0, kcred);
+ }
+ VM_STAT_ADD(page_create_found_one);
+ goto fail;
+ }
+ ASSERT(flags & PG_WAIT);
+ if (!page_lock(pp, SE_EXCL, phm, P_NO_RECLAIM)) {
+ /*
+ * Start all over again if we blocked trying
+ * to lock the page.
+ */
+ mutex_exit(phm);
+ VM_STAT_ADD(page_create_page_lock_failed);
+ phm = NULL;
+ goto top;
+ }
+ mutex_exit(phm);
+ phm = NULL;
+
+ if (PP_ISFREE(pp)) {
+ ASSERT(PP_ISAGED(pp) == 0);
+ VM_STAT_ADD(pagecnt.pc_get_cache);
+ page_list_sub(pp, PG_CACHE_LIST);
+ PP_CLRFREE(pp);
+ found_on_free++;
+ }
+ }
+
+ /*
+ * Got a page! It is locked. Acquire the i/o
+ * lock since we are going to use the p_next and
+ * p_prev fields to link the requested pages together.
+ */
+ page_io_lock(pp);
+ page_add(&plist, pp);
+ plist = plist->p_next;
+ off += PAGESIZE;
+ vaddr += PAGESIZE;
+ }
+
+ ASSERT((flags & PG_EXCL) ? (found_on_free == pages_req) : 1);
+fail:
+ if (npp != NULL) {
+ /*
+ * Did not need this page after all.
+ * Put it back on the free list.
+ */
+ VM_STAT_ADD(page_create_putbacks);
+ PP_SETFREE(npp);
+ PP_SETAGED(npp);
+ npp->p_offset = (u_offset_t)-1;
+ page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
+ page_unlock(npp);
+
+ }
+
+ ASSERT(pages_req >= found_on_free);
+
+ {
+ uint_t overshoot = (uint_t)(pages_req - found_on_free);
+
+ if (overshoot) {
+ VM_STAT_ADD(page_create_overshoot);
+ p = &pcf[pcf_index];
+ p->pcf_touch = 1;
+ mutex_enter(&p->pcf_lock);
+ if (p->pcf_block) {
+ p->pcf_reserve += overshoot;
+ } else {
+ p->pcf_count += overshoot;
+ if (p->pcf_wait) {
+ mutex_enter(&new_freemem_lock);
+ if (freemem_wait) {
+ cv_signal(&freemem_cv);
+ p->pcf_wait--;
+ } else {
+ p->pcf_wait = 0;
+ }
+ mutex_exit(&new_freemem_lock);
+ }
+ }
+ mutex_exit(&p->pcf_lock);
+ /* freemem is approximate, so this test OK */
+ if (!p->pcf_block)
+ freemem += overshoot;
+ }
+ }
+
+ return (plist);
+}
+
+/*
+ * One or more constituent pages of this large page have been marked
+ * toxic. Simply demote the large page to PAGESIZE pages and let
+ * page_free() handle it. This routine should only be called by the
+ * large page free routines (page_free_pages() and page_destroy_pages()).
+ * All pages are locked SE_EXCL and have already been marked free.
+ */
+static void
+page_free_toxic_pages(page_t *rootpp)
+{
+ page_t *tpp;
+ pgcnt_t i, pgcnt = page_get_pagecnt(rootpp->p_szc);
+ uint_t szc = rootpp->p_szc;
+
+ for (i = 0, tpp = rootpp; i < pgcnt; i++, tpp = tpp->p_next) {
+ ASSERT(tpp->p_szc == szc);
+ ASSERT((PAGE_EXCL(tpp) &&
+ !page_iolock_assert(tpp)) || panicstr);
+ tpp->p_szc = 0;
+ }
+
+ while (rootpp != NULL) {
+ tpp = rootpp;
+ page_sub(&rootpp, tpp);
+ ASSERT(PP_ISFREE(tpp));
+ PP_CLRFREE(tpp);
+ page_free(tpp, 1);
+ }
+}
+
+/*
+ * Put page on the "free" list.
+ * The free list is really two lists maintained by
+ * the PSM of whatever machine we happen to be on.
+ */
+void
+page_free(page_t *pp, int dontneed)
+{
+ struct pcf *p;
+ uint_t pcf_index;
+
+ ASSERT((PAGE_EXCL(pp) &&
+ !page_iolock_assert(pp)) || panicstr);
+
+ if (page_deteriorating(pp)) {
+ volatile int i = 0;
+ char *kaddr;
+ volatile int rb, wb;
+ uint64_t pa;
+ volatile int ue = 0;
+ on_trap_data_t otd;
+
+ if (pp->p_vnode != NULL) {
+ /*
+ * Let page_destroy() do its bean counting and
+ * hash out the page; it will then call back
+ * into page_free() with pp->p_vnode == NULL.
+ */
+ page_destroy(pp, 0);
+ return;
+ }
+
+ if (page_isfailing(pp)) {
+ /*
+ * If we have already exceeded the limit for
+ * pages retired, we will treat this page as
+ * 'toxic' rather than failing. That will ensure
+ * that the page is at least cleaned, and if
+ * a UE is detected, the page will be retired
+ * anyway.
+ */
+ if (pages_retired_limit_exceeded()) {
+ /*
+ * clear the flag and reset to toxic
+ */
+ page_clrtoxic(pp);
+ page_settoxic(pp, PAGE_IS_TOXIC);
+ } else {
+ pa = ptob((uint64_t)page_pptonum(pp));
+ if (page_retire_messages) {
+ cmn_err(CE_NOTE, "Page 0x%08x.%08x "
+ "removed from service",
+ (uint32_t)(pa >> 32), (uint32_t)pa);
+ }
+ goto page_failed;
+ }
+ }
+
+ pagescrub(pp, 0, PAGESIZE);
+
+ /*
+ * We want to determine whether the error that occurred on
+ * this page is transient or persistent, so we get a mapping
+ * to the page and try every possible bit pattern to compare
+ * what we write with what we read back. A smaller number
+ * of bit patterns might suffice, but there's no point in
+ * getting fancy. If this is the hot path on your system,
+ * you've got bigger problems.
+ */
+ kaddr = ppmapin(pp, PROT_READ | PROT_WRITE, (caddr_t)-1);
+ for (wb = 0xff; wb >= 0; wb--) {
+ if (on_trap(&otd, OT_DATA_EC)) {
+ pa = ptob((uint64_t)page_pptonum(pp)) + i;
+ page_settoxic(pp, PAGE_IS_FAILING);
+
+ if (page_retire_messages) {
+ cmn_err(CE_WARN, "Uncorrectable Error "
+ "occurred at PA 0x%08x.%08x while "
+ "attempting to clear previously "
+ "reported error; page removed from "
+ "service", (uint32_t)(pa >> 32),
+ (uint32_t)pa);
+ }
+
+ ue++;
+ break;
+ }
+
+ /*
+ * Write out the bit pattern, flush it to memory, and
+ * read it back while under on_trap() protection.
+ */
+ for (i = 0; i < PAGESIZE; i++)
+ kaddr[i] = wb;
+
+ sync_data_memory(kaddr, PAGESIZE);
+
+ for (i = 0; i < PAGESIZE; i++) {
+ if ((rb = (uchar_t)kaddr[i]) != wb) {
+ page_settoxic(pp, PAGE_IS_FAILING);
+ goto out;
+ }
+ }
+ }
+out:
+ no_trap();
+ ppmapout(kaddr);
+
+ if (wb >= 0 && !ue) {
+ pa = ptob((uint64_t)page_pptonum(pp)) + i;
+ if (page_retire_messages) {
+ cmn_err(CE_WARN, "Data Mismatch occurred at PA "
+ "0x%08x.%08x [ 0x%x != 0x%x ] while "
+ "attempting to clear previously reported "
+ "error; page removed from service",
+ (uint32_t)(pa >> 32), (uint32_t)pa, rb, wb);
+ }
+ }
+page_failed:
+ /*
+ * DR operations change the association between a page_t
+ * and the physical page it represents. Check if the
+ * page is still bad. If it is, then retire it.
+ */
+ if (page_isfaulty(pp) && page_isfailing(pp)) {
+ /*
+ * In the future, it might be useful to have a platform
+ * callback here to tell the hardware to fence off this
+ * page during the next reboot.
+ *
+ * We move the page to the retired_vnode here
+ */
+ (void) page_hashin(pp, &retired_ppages,
+ (u_offset_t)ptob((uint64_t)page_pptonum(pp)), NULL);
+ mutex_enter(&freemem_lock);
+ availrmem--;
+ mutex_exit(&freemem_lock);
+ page_retired(pp);
+ page_downgrade(pp);
+
+ /*
+ * If DR raced with the above page retirement code,
+ * we might have retired a good page. If so, unretire
+ * the page.
+ */
+ if (!page_isfaulty(pp))
+ page_unretire_pages();
+ return;
+ }
+
+ pa = ptob((uint64_t)page_pptonum(pp));
+
+ if (page_retire_messages) {
+ cmn_err(CE_NOTE, "Previously reported error on page "
+ "0x%08x.%08x cleared", (uint32_t)(pa >> 32),
+ (uint32_t)pa);
+ }
+
+ page_clrtoxic(pp);
+ }
+
+ if (PP_ISFREE(pp)) {
+ panic("page_free: page %p is free", (void *)pp);
+ }
+
+ if (pp->p_szc != 0) {
+ if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) ||
+ pp->p_vnode == &kvp) {
+ panic("page_free: anon or kernel "
+ "or no vnode large page %p", (void *)pp);
+ }
+ page_demote_vp_pages(pp);
+ ASSERT(pp->p_szc == 0);
+ }
+
+ /*
+ * The page_struct_lock need not be acquired to examine these
+ * fields since the page has an "exclusive" lock.
+ */
+ if (hat_page_is_mapped(pp) || pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
+ panic("page_free pp=%p, pfn=%lx, lckcnt=%d, cowcnt=%d",
+ pp, page_pptonum(pp), pp->p_lckcnt, pp->p_cowcnt);
+ /*NOTREACHED*/
+ }
+
+ ASSERT(!hat_page_getshare(pp));
+
+ PP_SETFREE(pp);
+ ASSERT(pp->p_vnode == NULL || !IS_VMODSORT(pp->p_vnode) ||
+ !hat_ismod(pp));
+ page_clr_all_props(pp);
+ ASSERT(!hat_page_getshare(pp));
+
+ /*
+ * Now we add the page to the head of the free list.
+ * But if this page is associated with a paged vnode
+ * then we adjust the head forward so that the page is
+ * effectively at the end of the list.
+ */
+ if (pp->p_vnode == NULL) {
+ /*
+ * Page has no identity, put it on the free list.
+ */
+ PP_SETAGED(pp);
+ pp->p_offset = (u_offset_t)-1;
+ page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
+ VM_STAT_ADD(pagecnt.pc_free_free);
+ TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE,
+ "page_free_free:pp %p", pp);
+ } else {
+ PP_CLRAGED(pp);
+
+ if (!dontneed || nopageage) {
+ /* move it to the tail of the list */
+ page_list_add(pp, PG_CACHE_LIST | PG_LIST_TAIL);
+
+ VM_STAT_ADD(pagecnt.pc_free_cache);
+ TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_TAIL,
+ "page_free_cache_tail:pp %p", pp);
+ } else {
+ page_list_add(pp, PG_CACHE_LIST | PG_LIST_HEAD);
+
+ VM_STAT_ADD(pagecnt.pc_free_dontneed);
+ TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_HEAD,
+ "page_free_cache_head:pp %p", pp);
+ }
+ }
+ page_unlock(pp);
+
+ /*
+ * Now do the `freemem' accounting.
+ */
+ pcf_index = PCF_INDEX();
+ p = &pcf[pcf_index];
+ p->pcf_touch = 1;
+
+ mutex_enter(&p->pcf_lock);
+ if (p->pcf_block) {
+ p->pcf_reserve += 1;
+ } else {
+ p->pcf_count += 1;
+ if (p->pcf_wait) {
+ mutex_enter(&new_freemem_lock);
+ /*
+ * Check to see if some other thread
+ * is actually waiting. Another bucket
+ * may have woken it up by now. If there
+ * are no waiters, then set our pcf_wait
+ * count to zero to avoid coming in here
+ * next time. Also, since only one page
+ * was put on the free list, just wake
+ * up one waiter.
+ */
+ if (freemem_wait) {
+ cv_signal(&freemem_cv);
+ p->pcf_wait--;
+ } else {
+ p->pcf_wait = 0;
+ }
+ mutex_exit(&new_freemem_lock);
+ }
+ }
+ mutex_exit(&p->pcf_lock);
+
+ /* freemem is approximate, so this test OK */
+ if (!p->pcf_block)
+ freemem += 1;
+}
+
+/*
+ * Put page on the "free" list during initial startup.
+ * This happens during initial single threaded execution.
+ */
+void
+page_free_at_startup(page_t *pp)
+{
+ struct pcf *p;
+ uint_t pcf_index;
+
+ page_list_add(pp, PG_FREE_LIST | PG_LIST_HEAD | PG_LIST_ISINIT);
+ VM_STAT_ADD(pagecnt.pc_free_free);
+
+ /*
+ * Now do the `freemem' accounting.
+ */
+ pcf_index = PCF_INDEX();
+ p = &pcf[pcf_index];
+ p->pcf_touch = 1;
+
+ ASSERT(p->pcf_block == 0);
+ ASSERT(p->pcf_wait == 0);
+ p->pcf_count += 1;
+
+ /* freemem is approximate, so this is OK */
+ freemem += 1;
+}
+
+void
+page_free_pages(page_t *pp)
+{
+ page_t *tpp, *rootpp = NULL;
+ pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc);
+ pgcnt_t i;
+ uint_t szc = pp->p_szc;
+ int toxic = 0;
+
+ VM_STAT_ADD(pagecnt.pc_free_pages);
+ TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE,
+ "page_free_free:pp %p", pp);
+
+ ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes());
+ if ((page_pptonum(pp) & (pgcnt - 1)) != 0) {
+ panic("page_free_pages: not root page %p", (void *)pp);
+ /*NOTREACHED*/
+ }
+
+ for (i = 0, tpp = pp; i < pgcnt; i++, tpp = page_next(tpp)) {
+ ASSERT((PAGE_EXCL(tpp) &&
+ !page_iolock_assert(tpp)) || panicstr);
+ if (PP_ISFREE(tpp)) {
+ panic("page_free_pages: page %p is free", (void *)tpp);
+ /*NOTREACHED*/
+ }
+ if (hat_page_is_mapped(tpp) || tpp->p_lckcnt != 0 ||
+ tpp->p_cowcnt != 0) {
+ panic("page_free_pages %p", (void *)tpp);
+ /*NOTREACHED*/
+ }
+
+ ASSERT(!hat_page_getshare(tpp));
+ ASSERT(tpp->p_vnode == NULL);
+ ASSERT(tpp->p_szc == szc);
+
+ if (page_deteriorating(tpp))
+ toxic = 1;
+
+ PP_SETFREE(tpp);
+ page_clr_all_props(tpp);
+ PP_SETAGED(tpp);
+ tpp->p_offset = (u_offset_t)-1;
+ ASSERT(tpp->p_next == tpp);
+ ASSERT(tpp->p_prev == tpp);
+ page_list_concat(&rootpp, &tpp);
+ }
+ ASSERT(rootpp == pp);
+
+ if (toxic) {
+ page_free_toxic_pages(rootpp);
+ return;
+ }
+ page_list_add_pages(rootpp, 0);
+ page_create_putback(pgcnt);
+}
+
+int free_pages = 1;
+
+/*
+ * This routine attempts to return pages to the cachelist via page_release().
+ * It does not *have* to be successful in all cases, since the pageout scanner
+ * will catch any pages it misses. It does need to be fast and not introduce
+ * too much overhead.
+ *
+ * If a page isn't found on the unlocked sweep of the page_hash bucket, we
+ * don't lock and retry. This is ok, since the page scanner will eventually
+ * find any page we miss in free_vp_pages().
+ */
+void
+free_vp_pages(vnode_t *vp, u_offset_t off, size_t len)
+{
+ page_t *pp;
+ u_offset_t eoff;
+ extern int swap_in_range(vnode_t *, u_offset_t, size_t);
+
+ eoff = off + len;
+
+ if (free_pages == 0)
+ return;
+ if (swap_in_range(vp, off, len))
+ return;
+
+ for (; off < eoff; off += PAGESIZE) {
+
+ /*
+ * find the page using a fast, but inexact search. It'll be OK
+ * if a few pages slip through the cracks here.
+ */
+ pp = page_exists(vp, off);
+
+ /*
+ * If we didn't find the page (it may not exist), the page
+ * is free, looks still in use (shared), or we can't lock it,
+ * just give up.
+ */
+ if (pp == NULL ||
+ PP_ISFREE(pp) ||
+ page_share_cnt(pp) > 0 ||
+ !page_trylock(pp, SE_EXCL))
+ continue;
+
+ /*
+ * Once we have locked pp, verify that it's still the
+ * correct page and not already free
+ */
+ ASSERT(PAGE_LOCKED_SE(pp, SE_EXCL));
+ if (pp->p_vnode != vp || pp->p_offset != off || PP_ISFREE(pp)) {
+ page_unlock(pp);
+ continue;
+ }
+
+ /*
+ * try to release the page...
+ */
+ (void) page_release(pp, 1);
+ }
+}
+
+/*
+ * Reclaim the given page from the free list.
+ * Returns 1 on success or 0 on failure.
+ *
+ * The page is unlocked if it can't be reclaimed (when freemem == 0).
+ * If `lock' is non-null, it will be dropped and re-acquired if
+ * the routine must wait while freemem is 0.
+ *
+ * As it turns out, boot_getpages() does this. It picks a page,
+ * based on where OBP mapped in some address, gets its pfn, searches
+ * the memsegs, locks the page, then pulls it off the free list!
+ */
+int
+page_reclaim(page_t *pp, kmutex_t *lock)
+{
+ struct pcf *p;
+ uint_t pcf_index;
+ struct cpu *cpup;
+ int enough;
+ uint_t i;
+
+ ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);
+ ASSERT(PAGE_EXCL(pp) && PP_ISFREE(pp));
+ ASSERT(pp->p_szc == 0);
+
+ /*
+ * If `freemem' is 0, we cannot reclaim this page from the
+ * freelist, so release every lock we might hold: the page,
+ * and the `lock' before blocking.
+ *
+ * The only way `freemem' can become 0 while there are pages
+ * marked free (have their p->p_free bit set) is when the
+ * system is low on memory and doing a page_create(). In
+ * order to guarantee that once page_create() starts acquiring
+ * pages it will be able to get all that it needs since `freemem'
+ * was decreased by the requested amount. So, we need to release
+ * this page, and let page_create() have it.
+ *
+ * Since `freemem' being zero is not supposed to happen, just
+ * use the usual hash stuff as a starting point. If that bucket
+ * is empty, then assume the worst, and start at the beginning
+ * of the pcf array. If we always start at the beginning
+ * when acquiring more than one pcf lock, there won't be any
+ * deadlock problems.
+ */
+
+ /* TODO: Do we need to test kcage_freemem if PG_NORELOC(pp)? */
+
+ if (freemem <= throttlefree && !page_create_throttle(1l, 0)) {
+ pcf_acquire_all();
+ goto page_reclaim_nomem;
+ }
+
+ enough = 0;
+ pcf_index = PCF_INDEX();
+ p = &pcf[pcf_index];
+ p->pcf_touch = 1;
+ mutex_enter(&p->pcf_lock);
+ if (p->pcf_count >= 1) {
+ enough = 1;
+ p->pcf_count--;
+ }
+ mutex_exit(&p->pcf_lock);
+
+ if (!enough) {
+ VM_STAT_ADD(page_reclaim_zero);
+ /*
+		 * Check again. It's possible that some other thread
+ * could have been right behind us, and added one
+ * to a list somewhere. Acquire each of the pcf locks
+ * until we find a page.
+ */
+ p = pcf;
+ for (i = 0; i < PCF_FANOUT; i++) {
+ p->pcf_touch = 1;
+ mutex_enter(&p->pcf_lock);
+ if (p->pcf_count >= 1) {
+ p->pcf_count -= 1;
+ enough = 1;
+ break;
+ }
+ p++;
+ }
+
+ if (!enough) {
+page_reclaim_nomem:
+ /*
+ * We really can't have page `pp'.
+ * Time for the no-memory dance with
+ * page_free(). This is just like
+ * page_create_wait(). Plus the added
+ * attraction of releasing whatever mutex
+			 * we were called with in `lock'.
+ * Page_unlock() will wakeup any thread
+ * waiting around for this page.
+ */
+ if (lock) {
+ VM_STAT_ADD(page_reclaim_zero_locked);
+ mutex_exit(lock);
+ }
+ page_unlock(pp);
+
+ /*
+ * get this before we drop all the pcf locks.
+ */
+ mutex_enter(&new_freemem_lock);
+
+ p = pcf;
+ for (i = 0; i < PCF_FANOUT; i++) {
+ p->pcf_wait++;
+ mutex_exit(&p->pcf_lock);
+ p++;
+ }
+
+ freemem_wait++;
+ cv_wait(&freemem_cv, &new_freemem_lock);
+ freemem_wait--;
+
+ mutex_exit(&new_freemem_lock);
+
+ if (lock) {
+ mutex_enter(lock);
+ }
+ return (0);
+ }
+
+ /*
+ * There was a page to be found.
+ * The pcf accounting has been done,
+ * though none of the pcf_wait flags have been set,
+ * drop the locks and continue on.
+ */
+ while (p >= pcf) {
+ mutex_exit(&p->pcf_lock);
+ p--;
+ }
+ }
+
+ /*
+ * freemem is not protected by any lock. Thus, we cannot
+ * have any assertion containing freemem here.
+ */
+ freemem -= 1;
+
+ VM_STAT_ADD(pagecnt.pc_reclaim);
+ if (PP_ISAGED(pp)) {
+ page_list_sub(pp, PG_FREE_LIST);
+ TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_FREE,
+ "page_reclaim_free:pp %p", pp);
+ } else {
+ page_list_sub(pp, PG_CACHE_LIST);
+ TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_CACHE,
+ "page_reclaim_cache:pp %p", pp);
+ }
+
+ /*
+ * clear the p_free & p_age bits since this page is no longer
+	 * on the free list. Notice that there is a brief window where
+	 * a page is marked as free but is not on the list.
+ *
+ * Set the reference bit to protect against immediate pageout.
+ */
+ PP_CLRFREE(pp);
+ PP_CLRAGED(pp);
+ page_set_props(pp, P_REF);
+
+ CPU_STATS_ENTER_K();
+ cpup = CPU; /* get cpup now that CPU cannot change */
+ CPU_STATS_ADDQ(cpup, vm, pgrec, 1);
+ CPU_STATS_ADDQ(cpup, vm, pgfrec, 1);
+ CPU_STATS_EXIT_K();
+
+ return (1);
+}
+
+
+
+/*
+ * Destroy identity of the page and put it back on
+ * the page free list. Assumes that the caller has
+ * acquired the "exclusive" lock on the page.
+ */
+void
+page_destroy(page_t *pp, int dontfree)
+{
+ ASSERT((PAGE_EXCL(pp) &&
+ !page_iolock_assert(pp)) || panicstr);
+
+ if (pp->p_szc != 0) {
+ if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) ||
+ pp->p_vnode == &kvp) {
+ panic("page_destroy: anon or kernel or no vnode "
+ "large page %p", (void *)pp);
+ }
+ page_demote_vp_pages(pp);
+ ASSERT(pp->p_szc == 0);
+ }
+
+ TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy:pp %p", pp);
+
+ /*
+ * Unload translations, if any, then hash out the
+ * page to erase its identity.
+ */
+ (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
+ page_hashout(pp, NULL);
+
+ if (!dontfree) {
+ /*
+ * Acquire the "freemem_lock" for availrmem.
+ * The page_struct_lock need not be acquired for lckcnt
+ * and cowcnt since the page has an "exclusive" lock.
+ */
+ if ((pp->p_lckcnt != 0) || (pp->p_cowcnt != 0)) {
+ mutex_enter(&freemem_lock);
+ if (pp->p_lckcnt != 0) {
+ availrmem++;
+ pp->p_lckcnt = 0;
+ }
+ if (pp->p_cowcnt != 0) {
+ availrmem += pp->p_cowcnt;
+ pp->p_cowcnt = 0;
+ }
+ mutex_exit(&freemem_lock);
+ }
+ /*
+ * Put the page on the "free" list.
+ */
+ page_free(pp, 0);
+ }
+}
+
+void
+page_destroy_pages(page_t *pp)
+{
+
+ page_t *tpp, *rootpp = NULL;
+ pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc);
+ pgcnt_t i, pglcks = 0;
+ uint_t szc = pp->p_szc;
+ int toxic = 0;
+
+ ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes());
+
+ VM_STAT_ADD(pagecnt.pc_destroy_pages);
+
+ TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy_pages:pp %p", pp);
+
+ if ((page_pptonum(pp) & (pgcnt - 1)) != 0) {
+ panic("page_destroy_pages: not root page %p", (void *)pp);
+ /*NOTREACHED*/
+ }
+
+ for (i = 0, tpp = pp; i < pgcnt; i++, tpp = page_next(tpp)) {
+ ASSERT((PAGE_EXCL(tpp) &&
+ !page_iolock_assert(tpp)) || panicstr);
+ (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD);
+ page_hashout(tpp, NULL);
+ ASSERT(tpp->p_offset == (u_offset_t)-1);
+ if (tpp->p_lckcnt != 0) {
+ pglcks++;
+ tpp->p_lckcnt = 0;
+ } else if (tpp->p_cowcnt != 0) {
+ pglcks += tpp->p_cowcnt;
+ tpp->p_cowcnt = 0;
+ }
+ ASSERT(!hat_page_getshare(tpp));
+ ASSERT(tpp->p_vnode == NULL);
+ ASSERT(tpp->p_szc == szc);
+
+ if (page_deteriorating(tpp))
+ toxic = 1;
+
+ PP_SETFREE(tpp);
+ page_clr_all_props(tpp);
+ PP_SETAGED(tpp);
+ ASSERT(tpp->p_next == tpp);
+ ASSERT(tpp->p_prev == tpp);
+ page_list_concat(&rootpp, &tpp);
+ }
+
+ ASSERT(rootpp == pp);
+ if (pglcks != 0) {
+ mutex_enter(&freemem_lock);
+ availrmem += pglcks;
+ mutex_exit(&freemem_lock);
+ }
+
+ if (toxic) {
+ page_free_toxic_pages(rootpp);
+ return;
+ }
+ page_list_add_pages(rootpp, 0);
+ page_create_putback(pgcnt);
+}
+
+/*
+ * Similar to page_destroy(), but destroys pages which are
+ * locked and known to be on the page free list. Since
+ * the page is known to be free and locked, no one can access
+ * it.
+ *
+ * Also, the number of free pages does not change.
+ */
+void
+page_destroy_free(page_t *pp)
+{
+ ASSERT(PAGE_EXCL(pp));
+ ASSERT(PP_ISFREE(pp));
+ ASSERT(pp->p_vnode);
+ ASSERT(hat_page_getattr(pp, P_MOD | P_REF | P_RO) == 0);
+ ASSERT(!hat_page_is_mapped(pp));
+ ASSERT(PP_ISAGED(pp) == 0);
+ ASSERT(pp->p_szc == 0);
+
+ VM_STAT_ADD(pagecnt.pc_destroy_free);
+ page_list_sub(pp, PG_CACHE_LIST);
+
+ page_hashout(pp, NULL);
+ ASSERT(pp->p_vnode == NULL);
+ ASSERT(pp->p_offset == (u_offset_t)-1);
+ ASSERT(pp->p_hash == NULL);
+
+ PP_SETAGED(pp);
+ page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
+ page_unlock(pp);
+
+ mutex_enter(&new_freemem_lock);
+ if (freemem_wait) {
+ cv_signal(&freemem_cv);
+ }
+ mutex_exit(&new_freemem_lock);
+}
+
+/*
+ * Rename the page "opp" to have an identity specified
+ * by [vp, off]. If a page already exists with this name
+ * it is locked and destroyed. Note that the page's
+ * translations are not unloaded during the rename.
+ *
+ * This routine is used by the anon layer to "steal" the
+ * original page and is not unlike destroying a page and
+ * creating a new page using the same page frame.
+ *
+ * XXX -- Could deadlock if caller 1 tries to rename A to B while
+ * caller 2 tries to rename B to A.
+ */
+void
+page_rename(page_t *opp, vnode_t *vp, u_offset_t off)
+{
+ page_t *pp;
+ int olckcnt = 0;
+ int ocowcnt = 0;
+ kmutex_t *phm;
+ ulong_t index;
+
+ ASSERT(PAGE_EXCL(opp) && !page_iolock_assert(opp));
+ ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
+ ASSERT(PP_ISFREE(opp) == 0);
+
+ VM_STAT_ADD(page_rename_count);
+
+ TRACE_3(TR_FAC_VM, TR_PAGE_RENAME,
+ "page rename:pp %p vp %p off %llx", opp, vp, off);
+
+ page_hashout(opp, NULL);
+ PP_CLRAGED(opp);
+
+ /*
+ * Acquire the appropriate page hash lock, since
+ * we're going to rename the page.
+ */
+ index = PAGE_HASH_FUNC(vp, off);
+ phm = PAGE_HASH_MUTEX(index);
+ mutex_enter(phm);
+top:
+ /*
+ * Look for an existing page with this name and destroy it if found.
+ * By holding the page hash lock all the way to the page_hashin()
+ * call, we are assured that no page can be created with this
+ * identity. In the case when the phm lock is dropped to undo any
+ * hat layer mappings, the existing page is held with an "exclusive"
+ * lock, again preventing another page from being created with
+ * this identity.
+ */
+ PAGE_HASH_SEARCH(index, pp, vp, off);
+ if (pp != NULL) {
+ VM_STAT_ADD(page_rename_exists);
+
+ /*
+ * As it turns out, this is one of only two places where
+ * page_lock() needs to hold the passed in lock in the
+ * successful case. In all of the others, the lock could
+ * be dropped as soon as the attempt is made to lock
+		 * the page.  It is tempting to add yet another argument,
+ * PL_KEEP or PL_DROP, to let page_lock know what to do.
+ */
+ if (!page_lock(pp, SE_EXCL, phm, P_RECLAIM)) {
+ /*
+ * Went to sleep because the page could not
+ * be locked. We were woken up when the page
+ * was unlocked, or when the page was destroyed.
+ * In either case, `phm' was dropped while we
+ * slept. Hence we should not just roar through
+ * this loop.
+ */
+ goto top;
+ }
+
+ if (hat_page_is_mapped(pp)) {
+ /*
+ * Unload translations. Since we hold the
+ * exclusive lock on this page, the page
+ * can not be changed while we drop phm.
+ * This is also not a lock protocol violation,
+ * but rather the proper way to do things.
+ */
+ mutex_exit(phm);
+ (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
+ mutex_enter(phm);
+ }
+ page_hashout(pp, phm);
+ }
+ /*
+ * Hash in the page with the new identity.
+ */
+ if (!page_hashin(opp, vp, off, phm)) {
+ /*
+ * We were holding phm while we searched for [vp, off]
+ * and only dropped phm if we found and locked a page.
+		 * If we can't create this page now, then something
+ * is really broken.
+ */
+ panic("page_rename: Can't hash in page: %p", (void *)pp);
+ /*NOTREACHED*/
+ }
+
+ ASSERT(MUTEX_HELD(phm));
+ mutex_exit(phm);
+
+ /*
+ * Now that we have dropped phm, lets get around to finishing up
+ * with pp.
+ */
+ if (pp != NULL) {
+ ASSERT(!hat_page_is_mapped(pp));
+ /* for now large pages should not end up here */
+ ASSERT(pp->p_szc == 0);
+ /*
+ * Save the locks for transfer to the new page and then
+ * clear them so page_free doesn't think they're important.
+ * The page_struct_lock need not be acquired for lckcnt and
+ * cowcnt since the page has an "exclusive" lock.
+ */
+ olckcnt = pp->p_lckcnt;
+ ocowcnt = pp->p_cowcnt;
+ pp->p_lckcnt = pp->p_cowcnt = 0;
+
+ /*
+ * Put the page on the "free" list after we drop
+ * the lock. The less work under the lock the better.
+ */
+ /*LINTED: constant in conditional context*/
+ VN_DISPOSE(pp, B_FREE, 0, kcred);
+ }
+
+ /*
+ * Transfer the lock count from the old page (if any).
+ * The page_struct_lock need not be acquired for lckcnt and
+ * cowcnt since the page has an "exclusive" lock.
+ */
+ opp->p_lckcnt += olckcnt;
+ opp->p_cowcnt += ocowcnt;
+}
+
+/*
+ * low level routine to add page `pp' to the hash and vp chains for [vp, offset]
+ *
+ * Pages are normally inserted at the start of a vnode's v_pages list.
+ * If the vnode is VMODSORT and the page is modified, it goes at the end.
+ * This can happen when a modified page is relocated for DR.
+ *
+ * Returns 1 on success and 0 on failure.
+ */
+static int
+page_do_hashin(page_t *pp, vnode_t *vp, u_offset_t offset)
+{
+ page_t **listp;
+ page_t *tp;
+ ulong_t index;
+
+ ASSERT(PAGE_EXCL(pp));
+ ASSERT(vp != NULL);
+ ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
+
+ /*
+ * Be sure to set these up before the page is inserted on the hash
+ * list. As soon as the page is placed on the list some other
+ * thread might get confused and wonder how this page could
+ * possibly hash to this list.
+ */
+ pp->p_vnode = vp;
+ pp->p_offset = offset;
+
+ /*
+ * record if this page is on a swap vnode
+ */
+ if ((vp->v_flag & VISSWAP) != 0)
+ PP_SETSWAP(pp);
+
+ index = PAGE_HASH_FUNC(vp, offset);
+ ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(index)));
+ listp = &page_hash[index];
+
+ /*
+ * If this page is already hashed in, fail this attempt to add it.
+ */
+ for (tp = *listp; tp != NULL; tp = tp->p_hash) {
+ if (tp->p_vnode == vp && tp->p_offset == offset) {
+ pp->p_vnode = NULL;
+ pp->p_offset = (u_offset_t)(-1);
+ return (0);
+ }
+ }
+ pp->p_hash = *listp;
+ *listp = pp;
+
+ /*
+ * Add the page to the vnode's list of pages
+ */
+ if (vp->v_pages != NULL && IS_VMODSORT(vp) && hat_ismod(pp))
+ listp = &vp->v_pages->p_vpprev->p_vpnext;
+ else
+ listp = &vp->v_pages;
+
+ page_vpadd(listp, pp);
+
+ return (1);
+}
+
+/*
+ * Add page `pp' to both the hash and vp chains for [vp, offset].
+ *
+ * Returns 1 on success and 0 on failure.
+ * If hold is passed in, it is not dropped.
+ */
+int
+page_hashin(page_t *pp, vnode_t *vp, u_offset_t offset, kmutex_t *hold)
+{
+ kmutex_t *phm = NULL;
+ kmutex_t *vphm;
+ int rc;
+
+ ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
+
+ TRACE_3(TR_FAC_VM, TR_PAGE_HASHIN,
+ "page_hashin:pp %p vp %p offset %llx",
+ pp, vp, offset);
+
+ VM_STAT_ADD(hashin_count);
+
+ if (hold != NULL)
+ phm = hold;
+ else {
+ VM_STAT_ADD(hashin_not_held);
+ phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, offset));
+ mutex_enter(phm);
+ }
+
+ vphm = page_vnode_mutex(vp);
+ mutex_enter(vphm);
+ rc = page_do_hashin(pp, vp, offset);
+ mutex_exit(vphm);
+ if (hold == NULL)
+ mutex_exit(phm);
+ if (rc == 0)
+ VM_STAT_ADD(hashin_already);
+ return (rc);
+}
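+
+/*
+ * Illustrative sketch only, not part of the original source: a hypothetical
+ * helper (example_hashin_page is an assumed name) showing the common
+ * page_hashin() calling pattern when the caller does not already hold the
+ * relevant page hash mutex, i.e. hold == NULL.
+ */
+static int
+example_hashin_page(page_t *pp, vnode_t *vp, u_offset_t off)
+{
+	ASSERT(PAGE_EXCL(pp));
+
+	/*
+	 * With hold == NULL, page_hashin() acquires and drops the
+	 * appropriate PAGE_HASH_MUTEX() itself; a return of 0 means a
+	 * page with the [vp, off] identity already exists.
+	 */
+	return (page_hashin(pp, vp, off, NULL));
+}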
+
+/*
+ * Remove page ``pp'' from the hash and vp chains and remove vp association.
+ * All mutexes must be held
+ */
+static void
+page_do_hashout(page_t *pp)
+{
+ page_t **hpp;
+ page_t *hp;
+ vnode_t *vp = pp->p_vnode;
+
+ ASSERT(vp != NULL);
+ ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
+
+ /*
+ * First, take pp off of its hash chain.
+ */
+ hpp = &page_hash[PAGE_HASH_FUNC(vp, pp->p_offset)];
+
+ for (;;) {
+ hp = *hpp;
+ if (hp == pp)
+ break;
+ if (hp == NULL) {
+ panic("page_do_hashout");
+ /*NOTREACHED*/
+ }
+ hpp = &hp->p_hash;
+ }
+ *hpp = pp->p_hash;
+
+ /*
+ * Now remove it from its associated vnode.
+ */
+ if (vp->v_pages)
+ page_vpsub(&vp->v_pages, pp);
+
+ pp->p_hash = NULL;
+ page_clr_all_props(pp);
+ PP_CLRSWAP(pp);
+ pp->p_vnode = NULL;
+ pp->p_offset = (u_offset_t)-1;
+}
+
+/*
+ * Remove page ``pp'' from the hash and vp chains and remove vp association.
+ *
+ * When `phm' is non-NULL it contains the address of the mutex protecting the
+ * hash list pp is on. It is not dropped.
+ */
+void
+page_hashout(page_t *pp, kmutex_t *phm)
+{
+ vnode_t *vp;
+ ulong_t index;
+ kmutex_t *nphm;
+ kmutex_t *vphm;
+ kmutex_t *sep;
+
+ ASSERT(phm != NULL ? MUTEX_HELD(phm) : 1);
+ ASSERT(pp->p_vnode != NULL);
+ ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr);
+ ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(pp->p_vnode)));
+
+ vp = pp->p_vnode;
+
+ TRACE_2(TR_FAC_VM, TR_PAGE_HASHOUT,
+ "page_hashout:pp %p vp %p", pp, vp);
+
+ /* Kernel probe */
+ TNF_PROBE_2(page_unmap, "vm pagefault", /* CSTYLED */,
+ tnf_opaque, vnode, vp,
+ tnf_offset, offset, pp->p_offset);
+
+ VM_STAT_ADD(hashout_count);
+ index = PAGE_HASH_FUNC(vp, pp->p_offset);
+ if (phm == NULL) {
+ VM_STAT_ADD(hashout_not_held);
+ nphm = PAGE_HASH_MUTEX(index);
+ mutex_enter(nphm);
+ }
+ ASSERT(phm ? phm == PAGE_HASH_MUTEX(index) : 1);
+
+ /*
+ * grab page vnode mutex and remove it...
+ */
+ vphm = page_vnode_mutex(vp);
+ mutex_enter(vphm);
+
+ page_do_hashout(pp);
+
+ mutex_exit(vphm);
+ if (phm == NULL)
+ mutex_exit(nphm);
+
+ /*
+ * If the page was retired, update the pages_retired
+ * total and clear the page flag
+ */
+ if (page_isretired(pp)) {
+ retired_page_removed(pp);
+ }
+
+ /*
+ * Wake up processes waiting for this page. The page's
+ * identity has been changed, and is probably not the
+ * desired page any longer.
+ */
+ sep = page_se_mutex(pp);
+ mutex_enter(sep);
+ if (CV_HAS_WAITERS(&pp->p_cv))
+ cv_broadcast(&pp->p_cv);
+ mutex_exit(sep);
+}
+
+/*
+ * Add the page to the front of a linked list of pages
+ * using the p_next & p_prev pointers for the list.
+ * The caller is responsible for protecting the list pointers.
+ */
+void
+page_add(page_t **ppp, page_t *pp)
+{
+ ASSERT(PAGE_EXCL(pp) || (PAGE_SHARED(pp) && page_iolock_assert(pp)));
+
+ page_add_common(ppp, pp);
+}
+
+/*
+ * Common code for page_add() and mach_page_add()
+ */
+void
+page_add_common(page_t **ppp, page_t *pp)
+{
+ if (*ppp == NULL) {
+ pp->p_next = pp->p_prev = pp;
+ } else {
+ pp->p_next = *ppp;
+ pp->p_prev = (*ppp)->p_prev;
+ (*ppp)->p_prev = pp;
+ pp->p_prev->p_next = pp;
+ }
+ *ppp = pp;
+}
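+
+/*
+ * Illustrative sketch only, not part of the original source: a hypothetical
+ * helper (example_count_list is an assumed name) showing how the circular
+ * p_next/p_prev list maintained by page_add_common() and page_sub_common()
+ * is normally walked: start at the head and stop when p_next wraps back
+ * around to it.
+ */
+static pgcnt_t
+example_count_list(page_t *head)
+{
+	page_t *pp = head;
+	pgcnt_t npages = 0;
+
+	if (head == NULL)
+		return (0);
+	do {
+		npages++;
+	} while ((pp = pp->p_next) != head);
+	return (npages);
+}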
+
+
+/*
+ * Remove this page from a linked list of pages
+ * using the p_next & p_prev pointers for the list.
+ *
+ * The caller is responsible for protecting the list pointers.
+ */
+void
+page_sub(page_t **ppp, page_t *pp)
+{
+ ASSERT((PP_ISFREE(pp)) ? 1 :
+ (PAGE_EXCL(pp)) || (PAGE_SHARED(pp) && page_iolock_assert(pp)));
+
+ if (*ppp == NULL || pp == NULL) {
+ panic("page_sub: bad arg(s): pp %p, *ppp %p",
+ (void *)pp, (void *)(*ppp));
+ /*NOTREACHED*/
+ }
+
+ page_sub_common(ppp, pp);
+}
+
+
+/*
+ * Common code for page_sub() and mach_page_sub()
+ */
+void
+page_sub_common(page_t **ppp, page_t *pp)
+{
+ if (*ppp == pp)
+ *ppp = pp->p_next; /* go to next page */
+
+ if (*ppp == pp)
+ *ppp = NULL; /* page list is gone */
+ else {
+ pp->p_prev->p_next = pp->p_next;
+ pp->p_next->p_prev = pp->p_prev;
+ }
+ pp->p_prev = pp->p_next = pp; /* make pp a list of one */
+}
+
+
+/*
+ * Break page list oppp into two lists with npages in the first list.
+ * The tail is returned in nppp.
+ */
+void
+page_list_break(page_t **oppp, page_t **nppp, pgcnt_t npages)
+{
+ page_t *s1pp = *oppp;
+ page_t *s2pp;
+ page_t *e1pp, *e2pp;
+ long n = 0;
+
+ if (s1pp == NULL) {
+ *nppp = NULL;
+ return;
+ }
+ if (npages == 0) {
+ *nppp = s1pp;
+ *oppp = NULL;
+ return;
+ }
+ for (n = 0, s2pp = *oppp; n < npages; n++) {
+ s2pp = s2pp->p_next;
+ }
+ /* Fix head and tail of new lists */
+ e1pp = s2pp->p_prev;
+ e2pp = s1pp->p_prev;
+ s1pp->p_prev = e1pp;
+ e1pp->p_next = s1pp;
+ s2pp->p_prev = e2pp;
+ e2pp->p_next = s2pp;
+
+ /* second list empty */
+ if (s2pp == s1pp) {
+ *oppp = s1pp;
+ *nppp = NULL;
+ } else {
+ *oppp = s1pp;
+ *nppp = s2pp;
+ }
+}
+
+/*
+ * Concatenate page list nppp onto the end of list ppp.
+ */
+void
+page_list_concat(page_t **ppp, page_t **nppp)
+{
+ page_t *s1pp, *s2pp, *e1pp, *e2pp;
+
+ if (*nppp == NULL) {
+ return;
+ }
+ if (*ppp == NULL) {
+ *ppp = *nppp;
+ return;
+ }
+ s1pp = *ppp;
+ e1pp = s1pp->p_prev;
+ s2pp = *nppp;
+ e2pp = s2pp->p_prev;
+ s1pp->p_prev = e2pp;
+ e2pp->p_next = s1pp;
+ e1pp->p_next = s2pp;
+ s2pp->p_prev = e1pp;
+}
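+
+/*
+ * Illustrative sketch only, not part of the original source: a hypothetical
+ * fragment (example_split_and_rejoin is an assumed name) showing
+ * page_list_break() and page_list_concat() used as a pair.  Breaking npages
+ * off the front of a list and then concatenating the tail back restores the
+ * original circular list.
+ */
+static void
+example_split_and_rejoin(page_t **listp, pgcnt_t npages)
+{
+	page_t *tail = NULL;
+
+	/* Move everything after the first npages pages onto tail. */
+	page_list_break(listp, &tail, npages);
+
+	/* ... the first npages pages can be handled in isolation here ... */
+
+	/* Splice the remainder back onto the end of the first list. */
+	page_list_concat(listp, &tail);
+}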
+
+/*
+ * return the next page in the page list
+ */
+page_t *
+page_list_next(page_t *pp)
+{
+ return (pp->p_next);
+}
+
+
+/*
+ * Add the page to the front of the linked list of pages
+ * using p_vpnext/p_vpprev pointers for the list.
+ *
+ * The caller is responsible for protecting the lists.
+ */
+void
+page_vpadd(page_t **ppp, page_t *pp)
+{
+ if (*ppp == NULL) {
+ pp->p_vpnext = pp->p_vpprev = pp;
+ } else {
+ pp->p_vpnext = *ppp;
+ pp->p_vpprev = (*ppp)->p_vpprev;
+ (*ppp)->p_vpprev = pp;
+ pp->p_vpprev->p_vpnext = pp;
+ }
+ *ppp = pp;
+}
+
+/*
+ * Remove this page from the linked list of pages
+ * using p_vpnext/p_vpprev pointers for the list.
+ *
+ * The caller is responsible for protecting the lists.
+ */
+void
+page_vpsub(page_t **ppp, page_t *pp)
+{
+ if (*ppp == NULL || pp == NULL) {
+ panic("page_vpsub: bad arg(s): pp %p, *ppp %p",
+ (void *)pp, (void *)(*ppp));
+ /*NOTREACHED*/
+ }
+
+ if (*ppp == pp)
+ *ppp = pp->p_vpnext; /* go to next page */
+
+ if (*ppp == pp)
+ *ppp = NULL; /* page list is gone */
+ else {
+ pp->p_vpprev->p_vpnext = pp->p_vpnext;
+ pp->p_vpnext->p_vpprev = pp->p_vpprev;
+ }
+ pp->p_vpprev = pp->p_vpnext = pp; /* make pp a list of one */
+}
+
+/*
+ * Lock a physical page into memory "long term". Used to support "lock
+ * in memory" functions. Accepts the page to be locked, and a cow variable
+ * to indicate whether a the lock will travel to the new page during
+ * a potential copy-on-write.
+ */
+int
+page_pp_lock(
+ page_t *pp, /* page to be locked */
+ int cow, /* cow lock */
+ int kernel) /* must succeed -- ignore checking */
+{
+ int r = 0; /* result -- assume failure */
+
+ ASSERT(PAGE_LOCKED(pp));
+
+ page_struct_lock(pp);
+ /*
+ * Acquire the "freemem_lock" for availrmem.
+ */
+ if (cow) {
+ mutex_enter(&freemem_lock);
+ if ((availrmem > pages_pp_maximum) &&
+ (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) {
+ availrmem--;
+ pages_locked++;
+ mutex_exit(&freemem_lock);
+ r = 1;
+ if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
+ cmn_err(CE_WARN,
+ "COW lock limit reached on pfn 0x%lx",
+ page_pptonum(pp));
+ }
+ } else
+ mutex_exit(&freemem_lock);
+ } else {
+ if (pp->p_lckcnt) {
+ if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
+ r = 1;
+ if (++pp->p_lckcnt ==
+ (ushort_t)PAGE_LOCK_MAXIMUM) {
+ cmn_err(CE_WARN, "Page lock limit "
+ "reached on pfn 0x%lx",
+ page_pptonum(pp));
+ }
+ }
+ } else {
+ if (kernel) {
+ /* availrmem accounting done by caller */
+ ++pp->p_lckcnt;
+ r = 1;
+ } else {
+ mutex_enter(&freemem_lock);
+ if (availrmem > pages_pp_maximum) {
+ availrmem--;
+ pages_locked++;
+ ++pp->p_lckcnt;
+ r = 1;
+ }
+ mutex_exit(&freemem_lock);
+ }
+ }
+ }
+ page_struct_unlock(pp);
+ return (r);
+}
+
+/*
+ * Decommit a lock on a physical page frame. Account for cow locks if
+ * appropriate.
+ */
+void
+page_pp_unlock(
+ page_t *pp, /* page to be unlocked */
+ int cow, /* expect cow lock */
+ int kernel) /* this was a kernel lock */
+{
+ ASSERT(PAGE_LOCKED(pp));
+
+ page_struct_lock(pp);
+ /*
+ * Acquire the "freemem_lock" for availrmem.
+	 * If cowcnt or lckcnt is already 0 do nothing; i.e., we
+ * could be called to unlock even if nothing is locked. This could
+ * happen if locked file pages were truncated (removing the lock)
+ * and the file was grown again and new pages faulted in; the new
+ * pages are unlocked but the segment still thinks they're locked.
+ */
+ if (cow) {
+ if (pp->p_cowcnt) {
+ mutex_enter(&freemem_lock);
+ pp->p_cowcnt--;
+ availrmem++;
+ pages_locked--;
+ mutex_exit(&freemem_lock);
+ }
+ } else {
+ if (pp->p_lckcnt && --pp->p_lckcnt == 0) {
+ if (!kernel) {
+ mutex_enter(&freemem_lock);
+ availrmem++;
+ pages_locked--;
+ mutex_exit(&freemem_lock);
+ }
+ }
+ }
+ page_struct_unlock(pp);
+}
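+
+/*
+ * Illustrative sketch only, not part of the original source: a hypothetical
+ * caller (example_pp_lock_pair is an assumed name) pairing page_pp_lock()
+ * and page_pp_unlock() around a long term, non-cow, non-kernel memory lock.
+ */
+static int
+example_pp_lock_pair(page_t *pp)
+{
+	ASSERT(PAGE_LOCKED(pp));
+
+	/*
+	 * page_pp_lock() returns 0 if availrmem is too low or the
+	 * per-page lock count limit has already been reached.
+	 */
+	if (page_pp_lock(pp, 0, 0) == 0)
+		return (0);
+
+	/* ... the page is now accounted as locked in memory ... */
+
+	/* Release the claim with matching cow and kernel arguments. */
+	page_pp_unlock(pp, 0, 0);
+	return (1);
+}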
+
+/*
+ * This routine reserves availrmem for npages;
+ * flags: KM_NOSLEEP or KM_SLEEP
+ * returns 1 on success or 0 on failure
+ */
+int
+page_resv(pgcnt_t npages, uint_t flags)
+{
+ mutex_enter(&freemem_lock);
+ while (availrmem < tune.t_minarmem + npages) {
+ if (flags & KM_NOSLEEP) {
+ mutex_exit(&freemem_lock);
+ return (0);
+ }
+ mutex_exit(&freemem_lock);
+ page_needfree(npages);
+ kmem_reap();
+ delay(hz >> 2);
+ page_needfree(-(spgcnt_t)npages);
+ mutex_enter(&freemem_lock);
+ }
+ availrmem -= npages;
+ mutex_exit(&freemem_lock);
+ return (1);
+}
+
+/*
+ * This routine unreserves availrmem for npages;
+ */
+void
+page_unresv(pgcnt_t npages)
+{
+ mutex_enter(&freemem_lock);
+ availrmem += npages;
+ mutex_exit(&freemem_lock);
+}
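+
+/*
+ * Illustrative sketch only, not part of the original source: a hypothetical
+ * caller (example_resv_pair is an assumed name) showing the usual
+ * page_resv()/page_unresv() pattern for a temporary availrmem reservation.
+ */
+static int
+example_resv_pair(pgcnt_t npages)
+{
+	/*
+	 * KM_NOSLEEP makes page_resv() fail immediately instead of
+	 * reaping and waiting for availrmem to recover.
+	 */
+	if (page_resv(npages, KM_NOSLEEP) == 0)
+		return (0);
+
+	/* ... npages of availrmem are reserved for this caller ... */
+
+	page_unresv(npages);
+	return (1);
+}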
+
+/*
+ * See Statement at the beginning of segvn_lockop() regarding
+ * the way we handle cowcnts and lckcnts.
+ *
+ * Transfer cowcnt on 'opp' to cowcnt on 'npp' if the vpage
+ * that breaks COW has PROT_WRITE.
+ *
+ * Note that we may also break COW in case we are softlocking
+ * on read access during physio;
+ * in this softlock case, the vpage may not have PROT_WRITE.
+ * So, we need to transfer lckcnt on 'opp' to lckcnt on 'npp'
+ * if the vpage doesn't have PROT_WRITE.
+ *
+ * This routine is never called if we are stealing a page
+ * in anon_private.
+ *
+ * The caller subtracted from availrmem for a read only mapping.
+ * If lckcnt is 1, increment availrmem.
+ */
+void
+page_pp_useclaim(
+ page_t *opp, /* original page frame losing lock */
+ page_t *npp, /* new page frame gaining lock */
+ uint_t write_perm) /* set if vpage has PROT_WRITE */
+{
+ int payback = 0;
+
+ ASSERT(PAGE_LOCKED(opp));
+ ASSERT(PAGE_LOCKED(npp));
+
+ page_struct_lock(opp);
+
+ ASSERT(npp->p_cowcnt == 0);
+ ASSERT(npp->p_lckcnt == 0);
+
+ /* Don't use claim if nothing is locked (see page_pp_unlock above) */
+ if ((write_perm && opp->p_cowcnt != 0) ||
+ (!write_perm && opp->p_lckcnt != 0)) {
+
+ if (write_perm) {
+ npp->p_cowcnt++;
+ ASSERT(opp->p_cowcnt != 0);
+ opp->p_cowcnt--;
+ } else {
+
+ ASSERT(opp->p_lckcnt != 0);
+
+ /*
+ * We didn't need availrmem decremented if p_lckcnt on
+ * original page is 1. Here, we are unlocking
+ * read-only copy belonging to original page and
+ * are locking a copy belonging to new page.
+ */
+ if (opp->p_lckcnt == 1)
+ payback = 1;
+
+ npp->p_lckcnt++;
+ opp->p_lckcnt--;
+ }
+ }
+ if (payback) {
+ mutex_enter(&freemem_lock);
+ availrmem++;
+ pages_useclaim--;
+ mutex_exit(&freemem_lock);
+ }
+ page_struct_unlock(opp);
+}
+
+/*
+ * Simple claim adjust functions -- used to support changes in
+ * claims due to changes in access permissions. Used by segvn_setprot().
+ */
+int
+page_addclaim(page_t *pp)
+{
+ int r = 0; /* result */
+
+ ASSERT(PAGE_LOCKED(pp));
+
+ page_struct_lock(pp);
+ ASSERT(pp->p_lckcnt != 0);
+
+ if (pp->p_lckcnt == 1) {
+ if (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
+ --pp->p_lckcnt;
+ r = 1;
+ if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
+ cmn_err(CE_WARN,
+ "COW lock limit reached on pfn 0x%lx",
+ page_pptonum(pp));
+ }
+ }
+ } else {
+ mutex_enter(&freemem_lock);
+ if ((availrmem > pages_pp_maximum) &&
+ (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) {
+ --availrmem;
+ ++pages_claimed;
+ mutex_exit(&freemem_lock);
+ --pp->p_lckcnt;
+ r = 1;
+ if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
+ cmn_err(CE_WARN,
+ "COW lock limit reached on pfn 0x%lx",
+ page_pptonum(pp));
+ }
+ } else
+ mutex_exit(&freemem_lock);
+ }
+ page_struct_unlock(pp);
+ return (r);
+}
+
+int
+page_subclaim(page_t *pp)
+{
+ int r = 0;
+
+ ASSERT(PAGE_LOCKED(pp));
+
+ page_struct_lock(pp);
+ ASSERT(pp->p_cowcnt != 0);
+
+ if (pp->p_lckcnt) {
+ if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
+ r = 1;
+ /*
+ * for availrmem
+ */
+ mutex_enter(&freemem_lock);
+ availrmem++;
+ pages_claimed--;
+ mutex_exit(&freemem_lock);
+
+ pp->p_cowcnt--;
+
+ if (++pp->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
+ cmn_err(CE_WARN,
+ "Page lock limit reached on pfn 0x%lx",
+ page_pptonum(pp));
+ }
+ }
+ } else {
+ r = 1;
+ pp->p_cowcnt--;
+ pp->p_lckcnt++;
+ }
+ page_struct_unlock(pp);
+ return (r);
+}
+
+int
+page_addclaim_pages(page_t **ppa)
+{
+
+ pgcnt_t lckpgs = 0, pg_idx;
+
+ VM_STAT_ADD(pagecnt.pc_addclaim_pages);
+
+ mutex_enter(&page_llock);
+ for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
+
+ ASSERT(PAGE_LOCKED(ppa[pg_idx]));
+ ASSERT(ppa[pg_idx]->p_lckcnt != 0);
+ if (ppa[pg_idx]->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
+ mutex_exit(&page_llock);
+ return (0);
+ }
+ if (ppa[pg_idx]->p_lckcnt > 1)
+ lckpgs++;
+ }
+
+ if (lckpgs != 0) {
+ mutex_enter(&freemem_lock);
+ if (availrmem >= pages_pp_maximum + lckpgs) {
+ availrmem -= lckpgs;
+ pages_claimed += lckpgs;
+ } else {
+ mutex_exit(&freemem_lock);
+ mutex_exit(&page_llock);
+ return (0);
+ }
+ mutex_exit(&freemem_lock);
+ }
+
+ for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
+ ppa[pg_idx]->p_lckcnt--;
+ ppa[pg_idx]->p_cowcnt++;
+ }
+ mutex_exit(&page_llock);
+ return (1);
+}
+
+int
+page_subclaim_pages(page_t **ppa)
+{
+ pgcnt_t ulckpgs = 0, pg_idx;
+
+ VM_STAT_ADD(pagecnt.pc_subclaim_pages);
+
+ mutex_enter(&page_llock);
+ for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
+
+ ASSERT(PAGE_LOCKED(ppa[pg_idx]));
+ ASSERT(ppa[pg_idx]->p_cowcnt != 0);
+ if (ppa[pg_idx]->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
+ mutex_exit(&page_llock);
+ return (0);
+ }
+ if (ppa[pg_idx]->p_lckcnt != 0)
+ ulckpgs++;
+ }
+
+ if (ulckpgs != 0) {
+ mutex_enter(&freemem_lock);
+ availrmem += ulckpgs;
+ pages_claimed -= ulckpgs;
+ mutex_exit(&freemem_lock);
+ }
+
+ for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
+ ppa[pg_idx]->p_cowcnt--;
+ ppa[pg_idx]->p_lckcnt++;
+
+ }
+ mutex_exit(&page_llock);
+ return (1);
+}
+
+page_t *
+page_numtopp(pfn_t pfnum, se_t se)
+{
+ page_t *pp;
+
+retry:
+ pp = page_numtopp_nolock(pfnum);
+ if (pp == NULL) {
+ return ((page_t *)NULL);
+ }
+
+ /*
+ * Acquire the appropriate lock on the page.
+ */
+ while (!page_lock(pp, se, (kmutex_t *)NULL, P_RECLAIM)) {
+ if (page_pptonum(pp) != pfnum)
+ goto retry;
+ continue;
+ }
+
+ if (page_pptonum(pp) != pfnum) {
+ page_unlock(pp);
+ goto retry;
+ }
+
+ return (pp);
+}
+
+page_t *
+page_numtopp_noreclaim(pfn_t pfnum, se_t se)
+{
+ page_t *pp;
+
+retry:
+ pp = page_numtopp_nolock(pfnum);
+ if (pp == NULL) {
+ return ((page_t *)NULL);
+ }
+
+ /*
+ * Acquire the appropriate lock on the page.
+ */
+ while (!page_lock(pp, se, (kmutex_t *)NULL, P_NO_RECLAIM)) {
+ if (page_pptonum(pp) != pfnum)
+ goto retry;
+ continue;
+ }
+
+ if (page_pptonum(pp) != pfnum) {
+ page_unlock(pp);
+ goto retry;
+ }
+
+ return (pp);
+}
+
+/*
+ * This routine is like page_numtopp, but will only return page structs
+ * for pages which are ok for loading into hardware using the page struct.
+ */
+page_t *
+page_numtopp_nowait(pfn_t pfnum, se_t se)
+{
+ page_t *pp;
+
+retry:
+ pp = page_numtopp_nolock(pfnum);
+ if (pp == NULL) {
+ return ((page_t *)NULL);
+ }
+
+ /*
+ * Try to acquire the appropriate lock on the page.
+ */
+ if (PP_ISFREE(pp))
+ pp = NULL;
+ else {
+ if (!page_trylock(pp, se))
+ pp = NULL;
+ else {
+ if (page_pptonum(pp) != pfnum) {
+ page_unlock(pp);
+ goto retry;
+ }
+ if (PP_ISFREE(pp)) {
+ page_unlock(pp);
+ pp = NULL;
+ }
+ }
+ }
+ return (pp);
+}
+
+/*
+ * Returns a count of dirty pages that are in the process
+ * of being written out. If 'cleanit' is set, try to push the page.
+ */
+pgcnt_t
+page_busy(int cleanit)
+{
+ page_t *page0 = page_first();
+ page_t *pp = page0;
+ pgcnt_t nppbusy = 0;
+ u_offset_t off;
+
+ do {
+ vnode_t *vp = pp->p_vnode;
+
+ /*
+ * A page is a candidate for syncing if it is:
+ *
+ * (a) On neither the freelist nor the cachelist
+ * (b) Hashed onto a vnode
+ * (c) Not a kernel page
+ * (d) Dirty
+ * (e) Not part of a swapfile
+		 * (f) A page which belongs to a real vnode, e.g. has a
+		 *     non-null v_vfsp pointer
+ * (g) Backed by a filesystem which doesn't have a
+ * stubbed-out sync operation
+ */
+ if (!PP_ISFREE(pp) && vp != NULL && vp != &kvp &&
+ hat_ismod(pp) && !IS_SWAPVP(vp) && vp->v_vfsp != NULL &&
+ vfs_can_sync(vp->v_vfsp)) {
+ nppbusy++;
+ vfs_syncprogress();
+
+ if (!cleanit)
+ continue;
+ if (!page_trylock(pp, SE_EXCL))
+ continue;
+
+ if (PP_ISFREE(pp) || vp == NULL || IS_SWAPVP(vp) ||
+ pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
+ !(hat_pagesync(pp,
+ HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD)) {
+ page_unlock(pp);
+ continue;
+ }
+ off = pp->p_offset;
+ VN_HOLD(vp);
+ page_unlock(pp);
+ (void) VOP_PUTPAGE(vp, off, PAGESIZE,
+ B_ASYNC | B_FREE, kcred);
+ VN_RELE(vp);
+ }
+ } while ((pp = page_next(pp)) != page0);
+
+ return (nppbusy);
+}
+
+void page_invalidate_pages(void);
+
+/*
+ * callback handler to vm sub-system
+ *
+ * callers make sure no recursive entries to this func.
+ */
+/*ARGSUSED*/
+boolean_t
+callb_vm_cpr(void *arg, int code)
+{
+ if (code == CB_CODE_CPR_CHKPT)
+ page_invalidate_pages();
+ return (B_TRUE);
+}
+
+/*
+ * Invalidate all pages of the system.
+ * It shouldn't be called until all user page activity has stopped.
+ */
+void
+page_invalidate_pages()
+{
+ page_t *pp;
+ page_t *page0;
+ pgcnt_t nbusypages;
+ int retry = 0;
+ const int MAXRETRIES = 4;
+#if defined(__sparc)
+ extern struct vnode prom_ppages;
+#endif /* __sparc */
+
+top:
+ /*
+	 * Flush dirty pages and destroy the clean ones.
+ */
+ nbusypages = 0;
+
+ pp = page0 = page_first();
+ do {
+ struct vnode *vp;
+ u_offset_t offset;
+ int mod;
+
+ /*
+		 * skip the page if it has no vnode, or if it is associated
+		 * with the kernel vnode or prom allocated kernel mem.
+ */
+#if defined(__sparc)
+ if ((vp = pp->p_vnode) == NULL || vp == &kvp ||
+ vp == &prom_ppages)
+#else /* x86 doesn't have prom or prom_ppage */
+ if ((vp = pp->p_vnode) == NULL || vp == &kvp)
+#endif /* __sparc */
+ continue;
+
+ /*
+		 * skip the page if it is already free and invalidated.
+ */
+ if (PP_ISFREE(pp) && PP_ISAGED(pp))
+ continue;
+
+ /*
+		 * skip pages that are already locked or can't be "exclusively"
+		 * locked or are already free.  After we lock the page, check
+		 * the free and age bits again to be sure it hasn't been
+		 * destroyed yet.
+		 * To achieve maximum parallelism, we use page_trylock instead
+		 * of page_lock so that we don't get blocked on individual
+		 * pages while we have thousands of other pages to process.
+ */
+ if (!page_trylock(pp, SE_EXCL)) {
+ nbusypages++;
+ continue;
+ } else if (PP_ISFREE(pp)) {
+ if (!PP_ISAGED(pp)) {
+ page_destroy_free(pp);
+ } else {
+ page_unlock(pp);
+ }
+ continue;
+ }
+ /*
+ * Is this page involved in some I/O? shared?
+ *
+ * The page_struct_lock need not be acquired to
+ * examine these fields since the page has an
+ * "exclusive" lock.
+ */
+ if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
+ page_unlock(pp);
+ continue;
+ }
+
+ if (vp->v_type == VCHR) {
+ panic("vp->v_type == VCHR");
+ /*NOTREACHED*/
+ }
+
+ if (!page_try_demote_pages(pp)) {
+ page_unlock(pp);
+ continue;
+ }
+
+ /*
+ * Check the modified bit. Leave the bits alone in hardware
+ * (they will be modified if we do the putpage).
+ */
+ mod = (hat_pagesync(pp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD)
+ & P_MOD);
+ if (mod) {
+ offset = pp->p_offset;
+ /*
+ * Hold the vnode before releasing the page lock
+ * to prevent it from being freed and re-used by
+ * some other thread.
+ */
+ VN_HOLD(vp);
+ page_unlock(pp);
+ /*
+			 * No error return is checked here.  Callers such as
+			 * cpr deal with the dirty pages at dump time
+			 * if this putpage fails.
+ */
+ (void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_INVAL,
+ kcred);
+ VN_RELE(vp);
+ } else {
+ page_destroy(pp, 0);
+ }
+ } while ((pp = page_next(pp)) != page0);
+ if (nbusypages && retry++ < MAXRETRIES) {
+ delay(1);
+ goto top;
+ }
+}
+
+/*
+ * Replace the page "old" with the page "new" on the page hash and vnode lists
+ *
+ * the replacement must be done in place, i.e. the equivalent sequence:
+ *
+ * vp = old->p_vnode;
+ * off = old->p_offset;
+ * page_do_hashout(old)
+ * page_do_hashin(new, vp, off)
+ *
+ * doesn't work, since
+ * 1) if old is the only page on the vnode, the v_pages list has a window
+ * where it looks empty. This will break file system assumptions.
+ * and
+ * 2) pvn_vplist_dirty() can't deal with pages moving on the v_pages list.
+ */
+static void
+page_do_relocate_hash(page_t *new, page_t *old)
+{
+ page_t **hash_list;
+ vnode_t *vp = old->p_vnode;
+ kmutex_t *sep;
+
+ ASSERT(PAGE_EXCL(old));
+ ASSERT(PAGE_EXCL(new));
+ ASSERT(vp != NULL);
+ ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
+ ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, old->p_offset))));
+
+ /*
+ * First find old page on the page hash list
+ */
+ hash_list = &page_hash[PAGE_HASH_FUNC(vp, old->p_offset)];
+
+ for (;;) {
+ if (*hash_list == old)
+ break;
+ if (*hash_list == NULL) {
+			panic("page_do_relocate_hash");
+ /*NOTREACHED*/
+ }
+ hash_list = &(*hash_list)->p_hash;
+ }
+
+ /*
+ * update new and replace old with new on the page hash list
+ */
+ new->p_vnode = old->p_vnode;
+ new->p_offset = old->p_offset;
+ new->p_hash = old->p_hash;
+ *hash_list = new;
+
+ if ((new->p_vnode->v_flag & VISSWAP) != 0)
+ PP_SETSWAP(new);
+
+ /*
+ * replace old with new on the vnode's page list
+ */
+ if (old->p_vpnext == old) {
+ new->p_vpnext = new;
+ new->p_vpprev = new;
+ } else {
+ new->p_vpnext = old->p_vpnext;
+ new->p_vpprev = old->p_vpprev;
+ new->p_vpnext->p_vpprev = new;
+ new->p_vpprev->p_vpnext = new;
+ }
+ if (vp->v_pages == old)
+ vp->v_pages = new;
+
+ /*
+ * clear out the old page
+ */
+ old->p_hash = NULL;
+ old->p_vpnext = NULL;
+ old->p_vpprev = NULL;
+ old->p_vnode = NULL;
+ PP_CLRSWAP(old);
+ old->p_offset = (u_offset_t)-1;
+ page_clr_all_props(old);
+
+ /*
+ * Wake up processes waiting for this page. The page's
+ * identity has been changed, and is probably not the
+ * desired page any longer.
+ */
+ sep = page_se_mutex(old);
+ mutex_enter(sep);
+ if (CV_HAS_WAITERS(&old->p_cv))
+ cv_broadcast(&old->p_cv);
+ mutex_exit(sep);
+}
+
+/*
+ * This function moves the identity of page "pp_old" to page "pp_new".
+ * Both pages must be locked on entry. "pp_new" is free, has no identity,
+ * and need not be hashed out from anywhere.
+ */
+void
+page_relocate_hash(page_t *pp_new, page_t *pp_old)
+{
+ vnode_t *vp = pp_old->p_vnode;
+ u_offset_t off = pp_old->p_offset;
+ kmutex_t *phm, *vphm;
+
+ /*
+ * Rehash two pages
+ */
+ ASSERT(PAGE_EXCL(pp_old));
+ ASSERT(PAGE_EXCL(pp_new));
+ ASSERT(vp != NULL);
+ ASSERT(pp_new->p_vnode == NULL);
+
+ /*
+ * hashout then hashin while holding the mutexes
+ */
+ phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, off));
+ mutex_enter(phm);
+ vphm = page_vnode_mutex(vp);
+ mutex_enter(vphm);
+
+ page_do_relocate_hash(pp_new, pp_old);
+
+ mutex_exit(vphm);
+ mutex_exit(phm);
+
+ /*
+ * The page_struct_lock need not be acquired for lckcnt and
+ * cowcnt since the page has an "exclusive" lock.
+ */
+ ASSERT(pp_new->p_lckcnt == 0);
+ ASSERT(pp_new->p_cowcnt == 0);
+ pp_new->p_lckcnt = pp_old->p_lckcnt;
+ pp_new->p_cowcnt = pp_old->p_cowcnt;
+ pp_old->p_lckcnt = pp_old->p_cowcnt = 0;
+
+ /* The following comment preserved from page_flip(). */
+ /* XXX - Do we need to protect fsdata? */
+ pp_new->p_fsdata = pp_old->p_fsdata;
+}
+
+/*
+ * Helper routine used to lock all remaining members of a
+ * large page. The caller is responsible for passing in a locked
+ * pp. If pp is a large page, then it succeeds in locking all the
+ * remaining constituent pages or it returns with only the
+ * original page locked.
+ *
+ * Returns 1 on success, 0 on failure.
+ *
+ * If success is returned this routine guarantees p_szc for all constituent
+ * pages of a large page pp belongs to can't change. To achieve this we
+ * recheck szc of pp after locking all constituent pages and retry if szc
+ * changed (it could only decrease). Since hat_page_demote() needs an EXCL
+ * lock on one of constituent pages it can't be running after all constituent
+ * pages are locked. hat_page_demote() with a lock on a constituent page
+ * outside of this large page (i.e. pp belonged to a larger large page) is
+ * already done with all constituent pages of pp since the root's p_szc is
+ * changed last. Therefore no need to synchronize with hat_page_demote() that
+ * locked a constituent page outside of pp's current large page.
+ */
+#ifdef DEBUG
+uint32_t gpg_trylock_mtbf = 0;
+#endif
+
+int
+group_page_trylock(page_t *pp, se_t se)
+{
+ page_t *tpp;
+ pgcnt_t npgs, i, j;
+ uint_t pszc = pp->p_szc;
+
+#ifdef DEBUG
+ if (gpg_trylock_mtbf && !(gethrtime() % gpg_trylock_mtbf)) {
+ return (0);
+ }
+#endif
+
+ if (pp != PP_GROUPLEADER(pp, pszc)) {
+ return (0);
+ }
+
+retry:
+ ASSERT(PAGE_LOCKED_SE(pp, se));
+ ASSERT(!PP_ISFREE(pp));
+ if (pszc == 0) {
+ return (1);
+ }
+ npgs = page_get_pagecnt(pszc);
+ tpp = pp + 1;
+ for (i = 1; i < npgs; i++, tpp++) {
+ if (!page_trylock(tpp, se)) {
+ tpp = pp + 1;
+ for (j = 1; j < i; j++, tpp++) {
+ page_unlock(tpp);
+ }
+ return (0);
+ }
+ }
+ if (pp->p_szc != pszc) {
+ ASSERT(pp->p_szc < pszc);
+ ASSERT(pp->p_vnode != NULL && pp->p_vnode != &kvp &&
+ !IS_SWAPFSVP(pp->p_vnode));
+ tpp = pp + 1;
+ for (i = 1; i < npgs; i++, tpp++) {
+ page_unlock(tpp);
+ }
+ pszc = pp->p_szc;
+ goto retry;
+ }
+ return (1);
+}
+
+void
+group_page_unlock(page_t *pp)
+{
+ page_t *tpp;
+ pgcnt_t npgs, i;
+
+ ASSERT(PAGE_LOCKED(pp));
+ ASSERT(!PP_ISFREE(pp));
+ ASSERT(pp == PP_PAGEROOT(pp));
+ npgs = page_get_pagecnt(pp->p_szc);
+ for (i = 1, tpp = pp + 1; i < npgs; i++, tpp++) {
+ page_unlock(tpp);
+ }
+}
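+
+/*
+ * Illustrative sketch only, not part of the original source: a hypothetical
+ * caller (example_group_lock is an assumed name) pairing group_page_trylock()
+ * and group_page_unlock().  The caller must already hold pp locked with `se'
+ * and pp must be the group leader of its large page.
+ */
+static int
+example_group_lock(page_t *pp, se_t se)
+{
+	if (!group_page_trylock(pp, se))
+		return (0);
+
+	/* ... all constituent pages of pp's large page are now locked ... */
+
+	/* Drop the constituent locks; pp itself remains locked. */
+	group_page_unlock(pp);
+	return (1);
+}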
+
+/*
+ * returns
+ * 0 : on success and *nrelocp is number of relocated PAGESIZE pages
+ * ERANGE : this is not a base page
+ * EBUSY : failure to get locks on the page/pages
+ * ENOMEM : failure to obtain replacement pages
+ * EAGAIN : OBP has not yet completed its boot-time handoff to the kernel
+ *
+ * Return with all constituent members of target and replacement
+ * SE_EXCL locked. It is the caller's responsibility to drop the
+ * locks.
+ */
+int
+do_page_relocate(
+ page_t **target,
+ page_t **replacement,
+ int grouplock,
+ spgcnt_t *nrelocp,
+ lgrp_t *lgrp)
+{
+#ifdef DEBUG
+ page_t *first_repl;
+#endif /* DEBUG */
+ page_t *repl;
+ page_t *targ;
+ page_t *pl = NULL;
+ uint_t ppattr;
+ pfn_t pfn, repl_pfn;
+ uint_t szc;
+ spgcnt_t npgs, i;
+ int repl_contig = 0;
+ uint_t flags = 0;
+ spgcnt_t dofree = 0;
+
+ *nrelocp = 0;
+
+#if defined(__sparc)
+ /*
+ * We need to wait till OBP has completed
+ * its boot-time handoff of its resources to the kernel
+ * before we allow page relocation
+ */
+ if (page_relocate_ready == 0) {
+ return (EAGAIN);
+ }
+#endif
+
+ /*
+ * If this is not a base page,
+ * just return with 0x0 pages relocated.
+ */
+ targ = *target;
+ ASSERT(PAGE_EXCL(targ));
+ ASSERT(!PP_ISFREE(targ));
+ szc = targ->p_szc;
+ ASSERT(szc < mmu_page_sizes);
+ VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]);
+ pfn = targ->p_pagenum;
+ if (pfn != PFN_BASE(pfn, szc)) {
+ VM_STAT_ADD(vmm_vmstats.ppr_relocnoroot[szc]);
+ return (ERANGE);
+ }
+
+ if ((repl = *replacement) != NULL && repl->p_szc >= szc) {
+ repl_pfn = repl->p_pagenum;
+ if (repl_pfn != PFN_BASE(repl_pfn, szc)) {
+ VM_STAT_ADD(vmm_vmstats.ppr_reloc_replnoroot[szc]);
+ return (ERANGE);
+ }
+ repl_contig = 1;
+ }
+
+ /*
+ * We must lock all members of this large page or we cannot
+ * relocate any part of it.
+ */
+ if (grouplock != 0 && !group_page_trylock(targ, SE_EXCL)) {
+ VM_STAT_ADD(vmm_vmstats.ppr_relocnolock[targ->p_szc]);
+ return (EBUSY);
+ }
+
+ /*
+ * reread szc it could have been decreased before
+ * group_page_trylock() was done.
+ */
+ szc = targ->p_szc;
+ ASSERT(szc < mmu_page_sizes);
+ VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]);
+ ASSERT(pfn == PFN_BASE(pfn, szc));
+
+ npgs = page_get_pagecnt(targ->p_szc);
+
+ if (repl == NULL) {
+ dofree = npgs; /* Size of target page in MMU pages */
+ if (!page_create_wait(dofree, 0)) {
+ if (grouplock != 0) {
+ group_page_unlock(targ);
+ }
+ VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]);
+ return (ENOMEM);
+ }
+
+ /*
+ * seg kmem pages require that the target and replacement
+ * page be the same pagesize.
+ */
+ flags = (targ->p_vnode == &kvp) ? PGR_SAMESZC : 0;
+ repl = page_get_replacement_page(targ, lgrp, flags);
+ if (repl == NULL) {
+ if (grouplock != 0) {
+ group_page_unlock(targ);
+ }
+ page_create_putback(dofree);
+ VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]);
+ return (ENOMEM);
+ }
+ }
+#ifdef DEBUG
+ else {
+ ASSERT(PAGE_LOCKED(repl));
+ }
+#endif /* DEBUG */
+
+#if defined(__sparc)
+ /*
+ * Let hat_page_relocate() complete the relocation if it's kernel page
+ */
+ if (targ->p_vnode == &kvp) {
+ *replacement = repl;
+ if (hat_page_relocate(target, replacement, nrelocp) != 0) {
+ if (grouplock != 0) {
+ group_page_unlock(targ);
+ }
+ if (dofree) {
+ *replacement = NULL;
+ page_free_replacement_page(repl);
+ page_create_putback(dofree);
+ }
+ VM_STAT_ADD(vmm_vmstats.ppr_krelocfail[szc]);
+ return (EAGAIN);
+ }
+ VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]);
+ return (0);
+ }
+#else
+#if defined(lint)
+ dofree = dofree;
+#endif
+#endif
+
+#ifdef DEBUG
+ first_repl = repl;
+#endif /* DEBUG */
+
+ for (i = 0; i < npgs; i++) {
+ ASSERT(PAGE_EXCL(targ));
+
+ (void) hat_pageunload(targ, HAT_FORCE_PGUNLOAD);
+
+ ASSERT(hat_page_getshare(targ) == 0);
+ ASSERT(!PP_ISFREE(targ));
+ ASSERT(targ->p_pagenum == (pfn + i));
+ ASSERT(repl_contig == 0 ||
+ repl->p_pagenum == (repl_pfn + i));
+
+ /*
+ * Copy the page contents and attributes then
+ * relocate the page in the page hash.
+ */
+ ppcopy(targ, repl);
+ ppattr = hat_page_getattr(targ, (P_MOD | P_REF | P_RO));
+ page_clr_all_props(repl);
+ page_set_props(repl, ppattr);
+ page_relocate_hash(repl, targ);
+
+ ASSERT(hat_page_getshare(targ) == 0);
+ ASSERT(hat_page_getshare(repl) == 0);
+ /*
+ * Now clear the props on targ, after the
+ * page_relocate_hash(), they no longer
+ * have any meaning.
+ */
+ page_clr_all_props(targ);
+ ASSERT(targ->p_next == targ);
+ ASSERT(targ->p_prev == targ);
+ page_list_concat(&pl, &targ);
+
+ targ++;
+ if (repl_contig != 0) {
+ repl++;
+ } else {
+ repl = repl->p_next;
+ }
+ }
+ /* assert that we have come full circle with repl */
+ ASSERT(repl_contig == 1 || first_repl == repl);
+
+ *target = pl;
+ if (*replacement == NULL) {
+ ASSERT(first_repl == repl);
+ *replacement = repl;
+ }
+ VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]);
+ *nrelocp = npgs;
+ return (0);
+}
+
+/*
+ * On success returns 0 and *nrelocp the number of PAGESIZE pages relocated.
+ */
+int
+page_relocate(
+ page_t **target,
+ page_t **replacement,
+ int grouplock,
+ int freetarget,
+ spgcnt_t *nrelocp,
+ lgrp_t *lgrp)
+{
+ spgcnt_t ret;
+
+ /* do_page_relocate returns 0 on success or errno value */
+ ret = do_page_relocate(target, replacement, grouplock, nrelocp, lgrp);
+
+ if (ret != 0 || freetarget == 0) {
+ return (ret);
+ }
+ if (*nrelocp == 1) {
+ ASSERT(*target != NULL);
+ page_free(*target, 1);
+ } else {
+ page_t *tpp = *target;
+ uint_t szc = tpp->p_szc;
+ pgcnt_t npgs = page_get_pagecnt(szc);
+ ASSERT(npgs > 1);
+ ASSERT(szc != 0);
+ do {
+ ASSERT(PAGE_EXCL(tpp));
+ ASSERT(!hat_page_is_mapped(tpp));
+ ASSERT(tpp->p_szc == szc);
+ PP_SETFREE(tpp);
+ PP_SETAGED(tpp);
+ npgs--;
+ } while ((tpp = tpp->p_next) != *target);
+ ASSERT(npgs == 0);
+ page_list_add_pages(*target, 0);
+ npgs = page_get_pagecnt(szc);
+ page_create_putback(npgs);
+ }
+ return (ret);
+}
+
+/*
+ * it is up to the caller to deal with pcf accounting.
+ */
+void
+page_free_replacement_page(page_t *pplist)
+{
+ page_t *pp;
+
+ while (pplist != NULL) {
+ /*
+		 * pplist is a linked list of replacement pages.
+ */
+ pp = pplist;
+ if (pp->p_szc == 0) {
+ page_sub(&pplist, pp);
+ page_clr_all_props(pp);
+ PP_SETFREE(pp);
+ PP_SETAGED(pp);
+ page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
+ page_unlock(pp);
+ VM_STAT_ADD(pagecnt.pc_free_replacement_page[0]);
+ } else {
+ spgcnt_t curnpgs = page_get_pagecnt(pp->p_szc);
+ page_t *tpp;
+ page_list_break(&pp, &pplist, curnpgs);
+ tpp = pp;
+ do {
+ ASSERT(PAGE_EXCL(tpp));
+ ASSERT(!hat_page_is_mapped(tpp));
+				page_clr_all_props(tpp);
+ PP_SETFREE(tpp);
+ PP_SETAGED(tpp);
+ } while ((tpp = tpp->p_next) != pp);
+ page_list_add_pages(pp, 0);
+ VM_STAT_ADD(pagecnt.pc_free_replacement_page[1]);
+ }
+ }
+}
+
+/*
+ * Relocate target to non-relocatable replacement page.
+ */
+int
+page_relocate_cage(page_t **target, page_t **replacement)
+{
+ page_t *tpp, *rpp;
+ spgcnt_t pgcnt, npgs;
+ int result;
+
+ tpp = *target;
+
+ ASSERT(PAGE_EXCL(tpp));
+ ASSERT(tpp->p_szc == 0);
+
+ pgcnt = btop(page_get_pagesize(tpp->p_szc));
+
+ do {
+ (void) page_create_wait(pgcnt, PG_WAIT | PG_NORELOC);
+ rpp = page_get_replacement_page(tpp, NULL, PGR_NORELOC);
+ if (rpp == NULL) {
+ page_create_putback(pgcnt);
+ kcage_cageout_wakeup();
+ }
+ } while (rpp == NULL);
+
+ ASSERT(PP_ISNORELOC(rpp));
+
+ result = page_relocate(&tpp, &rpp, 0, 1, &npgs, NULL);
+
+ if (result == 0) {
+ *replacement = rpp;
+ if (pgcnt != npgs)
+ panic("page_relocate_cage: partial relocation");
+ }
+
+ return (result);
+}
+
+/*
+ * Release the page lock on a page, place on cachelist
+ * tail if no longer mapped. Caller can let us know if
+ * the page is known to be clean.
+ */
+int
+page_release(page_t *pp, int checkmod)
+{
+ int status;
+
+ ASSERT(PAGE_LOCKED(pp) && !PP_ISFREE(pp) &&
+ (pp->p_vnode != NULL));
+
+ if (!hat_page_is_mapped(pp) && !IS_SWAPVP(pp->p_vnode) &&
+ ((PAGE_SHARED(pp) && page_tryupgrade(pp)) || PAGE_EXCL(pp)) &&
+ pp->p_lckcnt == 0 && pp->p_cowcnt == 0 &&
+ !hat_page_is_mapped(pp)) {
+
+ /*
+ * If page is modified, unlock it
+ *
+ * (p_nrm & P_MOD) bit has the latest stuff because:
+ * (1) We found that this page doesn't have any mappings
+ * _after_ holding SE_EXCL and
+ * (2) We didn't drop SE_EXCL lock after the check in (1)
+ */
+ if (checkmod && hat_ismod(pp)) {
+ page_unlock(pp);
+ status = PGREL_MOD;
+ } else {
+ /*LINTED: constant in conditional context*/
+ VN_DISPOSE(pp, B_FREE, 0, kcred);
+ status = PGREL_CLEAN;
+ }
+ } else {
+ page_unlock(pp);
+ status = PGREL_NOTREL;
+ }
+ return (status);
+}
+
+int
+page_try_demote_pages(page_t *pp)
+{
+ page_t *tpp, *rootpp = pp;
+ pfn_t pfn = page_pptonum(pp);
+ spgcnt_t i, npgs;
+ uint_t szc = pp->p_szc;
+ vnode_t *vp = pp->p_vnode;
+
+ ASSERT(PAGE_EXCL(rootpp));
+
+ VM_STAT_ADD(pagecnt.pc_try_demote_pages[0]);
+
+ if (rootpp->p_szc == 0) {
+ VM_STAT_ADD(pagecnt.pc_try_demote_pages[1]);
+ return (1);
+ }
+
+ if (vp != NULL && !IS_SWAPFSVP(vp) && vp != &kvp) {
+ VM_STAT_ADD(pagecnt.pc_try_demote_pages[2]);
+ page_demote_vp_pages(rootpp);
+ ASSERT(pp->p_szc == 0);
+ return (1);
+ }
+
+ /*
+ * Adjust rootpp if passed in is not the base
+ * constituent page.
+ */
+ npgs = page_get_pagecnt(rootpp->p_szc);
+ ASSERT(npgs > 1);
+ if (!IS_P2ALIGNED(pfn, npgs)) {
+ pfn = P2ALIGN(pfn, npgs);
+ rootpp = page_numtopp_nolock(pfn);
+ VM_STAT_ADD(pagecnt.pc_try_demote_pages[3]);
+ ASSERT(rootpp->p_vnode != NULL);
+ ASSERT(rootpp->p_szc == szc);
+ }
+
+ /*
+ * We can't demote kernel pages since we can't hat_unload()
+ * the mappings.
+ */
+ if (rootpp->p_vnode == &kvp)
+ return (0);
+
+ /*
+ * Attempt to lock all constituent pages except the page passed
+ * in since it's already locked.
+ */
+ for (tpp = rootpp, i = 0; i < npgs; i++, tpp = page_next(tpp)) {
+ ASSERT(!PP_ISFREE(tpp));
+ ASSERT(tpp->p_vnode != NULL);
+
+ if (tpp != pp && !page_trylock(tpp, SE_EXCL))
+ break;
+ ASSERT(tpp->p_szc == rootpp->p_szc);
+ ASSERT(page_pptonum(tpp) == page_pptonum(rootpp) + i);
+ (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD);
+ }
+
+ /*
+ * If we failed to lock them all then unlock what we have locked
+ * so far and bail.
+ */
+ if (i < npgs) {
+ tpp = rootpp;
+ while (i-- > 0) {
+ if (tpp != pp)
+ page_unlock(tpp);
+ tpp = page_next(tpp);
+ }
+ VM_STAT_ADD(pagecnt.pc_try_demote_pages[4]);
+ return (0);
+ }
+
+ /*
+ * XXX probably p_szc clearing and page unlocking can be done within
+	 * one loop but since this is rare code we can play it very safe.
+ */
+ for (tpp = rootpp, i = 0; i < npgs; i++, tpp = page_next(tpp)) {
+ ASSERT(PAGE_EXCL(tpp));
+ tpp->p_szc = 0;
+ }
+
+ /*
+ * Unlock all pages except the page passed in.
+ */
+ for (tpp = rootpp, i = 0; i < npgs; i++, tpp = page_next(tpp)) {
+ ASSERT(!hat_page_is_mapped(tpp));
+ if (tpp != pp)
+ page_unlock(tpp);
+ }
+ VM_STAT_ADD(pagecnt.pc_try_demote_pages[5]);
+ return (1);
+}
+
+/*
+ * Called by page_free() and page_destroy() to demote the page size code
+ * (p_szc) to 0 (since we can't just put a single PAGESIZE page with non zero
+ * p_szc on free list, neither can we just clear p_szc of a single page_t
+ * within a large page since it will break other code that relies on p_szc
+ * being the same for all page_t's of a large page). Anonymous pages should
+ * never end up here because anon_map_getpages() cannot deal with p_szc
+ * changes after a single constituent page is locked. While anonymous and
+ * kernel large pages are demoted or freed an entire large page at a time,
+ * with all constituent pages locked EXCL, for file system pages we have to
+ * be able to demote a large page (i.e. decrease the p_szc of all constituent
+ * pages) with just an EXCL lock on one of the constituent pages. The reason
+ * we can easily deal with anonymous page demotion an entire large page at a
+ * time is that those operations originate at the address space level and
+ * concern the entire large page region, with actual demotion only done when
+ * pages are not shared with any other processes (therefore we can always get
+ * an EXCL lock on all anonymous constituent pages after clearing the segment
+ * page cache). However, file system pages can be truncated or invalidated at
+ * a PAGESIZE level from the file system side and end up in page_free() or
+ * page_destroy() (we also allow only part of the large page to be SOFTLOCKed
+ * and therefore pageout should be able to demote a large page by EXCL locking
+ * any constituent page that is not under SOFTLOCK). In those cases we cannot
+ * rely on being able to lock EXCL all constituent pages.
+ *
+ * To prevent szc changes on file system pages one has to lock all constituent
+ * pages at least SHARED (or call page_szc_lock()). The only subsystem that
+ * doesn't rely on locking all constituent pages (or using page_szc_lock()) to
+ * prevent szc changes is the hat layer, which uses its own page level mlist
+ * locks. hat assumes that szc doesn't change after mlist lock for a page is
+ * taken. Therefore we need to change szc under hat level locks if we only
+ * have an EXCL lock on a single constituent page and hat still references any
+ * of constituent pages. (Note we can't "ignore" hat layer by simply
+ * hat_pageunload() all constituent pages without having EXCL locks on all of
+ * constituent pages). We use hat_page_demote() call to safely demote szc of
+ * all constituent pages under hat locks when we only have an EXCL lock on one
+ * of constituent pages.
+ *
+ * This routine calls page_szc_lock() before calling hat_page_demote() to
+ * allow segvn in one special case not to lock all constituent pages SHARED
+ * before calling hat_memload_array() that relies on p_szc not changing even
+ * before hat level mlist lock is taken. In that case segvn uses
+ * page_szc_lock() to prevent hat_page_demote() changing p_szc values.
+ *
+ * Anonymous or kernel page demotion still has to lock all pages exclusively
+ * and do hat_pageunload() on all constituent pages before demoting the page
+ * therefore there's no need for anonymous or kernel page demotion to use
+ * hat_page_demote() mechanism.
+ *
+ * hat_page_demote() removes all large mappings that map pp and then decreases
+ * p_szc starting from the last constituent page of the large page. Working
+ * from the tail of a large page in decreasing pfn order allows one looking at
+ * the root page to know that hat_page_demote() is done for root's szc area.
+ * e.g. if a root page has szc 1 one knows it only has to lock all constituent
+ * pages within szc 1 area to prevent szc changes because hat_page_demote()
+ * that started on this page when it had szc > 1 is done for this szc 1 area.
+ *
+ * We are guaranteed that all constituent pages of pp's large page belong to
+ * the same vnode with the consecutive offsets increasing in the direction of
+ * the pfn i.e. the identity of constituent pages can't change until their
+ * p_szc is decreased. Therefore it's safe for hat_page_demote() to remove
+ * large mappings to pp even though we don't lock any constituent page except
+ * pp (i.e. we won't unload e.g. kernel locked page).
+ */
+static void
+page_demote_vp_pages(page_t *pp)
+{
+ kmutex_t *mtx;
+
+ ASSERT(PAGE_EXCL(pp));
+ ASSERT(!PP_ISFREE(pp));
+ ASSERT(pp->p_vnode != NULL);
+ ASSERT(!IS_SWAPFSVP(pp->p_vnode));
+ ASSERT(pp->p_vnode != &kvp);
+
+ VM_STAT_ADD(pagecnt.pc_demote_pages[0]);
+
+ mtx = page_szc_lock(pp);
+ if (mtx != NULL) {
+ hat_page_demote(pp);
+ mutex_exit(mtx);
+ }
+ ASSERT(pp->p_szc == 0);
+}
+
+/*
+ * Page retire operation.
+ *
+ * page_retire()
+ * Attempt to retire (throw away) page pp. We cannot do this if
+ * the page is dirty; if the page is clean, we can try. We return 0 on
+ * success, -1 on failure. This routine should be invoked by the platform's
+ * memory error detection code.
+ *
+ * pages_retired_limit_exceeded()
+ * We set a limit on the number of pages which may be retired. This
+ * is set to a percentage of total physical memory. This limit is
+ * enforced here.
+ */
+
+static pgcnt_t retired_pgcnt = 0;
+
+/*
+ * routines to update the count of retired pages
+ */
+static void
+page_retired(page_t *pp)
+{
+ ASSERT(pp);
+
+ page_settoxic(pp, PAGE_IS_RETIRED);
+ atomic_add_long(&retired_pgcnt, 1);
+}
+
+static void
+retired_page_removed(page_t *pp)
+{
+ ASSERT(pp);
+ ASSERT(page_isretired(pp));
+ ASSERT(retired_pgcnt > 0);
+
+ page_clrtoxic(pp);
+ atomic_add_long(&retired_pgcnt, -1);
+}
+
+
+static int
+pages_retired_limit_exceeded()
+{
+ pgcnt_t retired_max;
+
+ /*
+ * If the percentage is zero or is not set correctly,
+ * return TRUE so that pages are not retired.
+ */
+ if (max_pages_retired_bps <= 0 ||
+ max_pages_retired_bps >= 10000)
+ return (1);
+
+ /*
+ * Calculate the maximum number of pages allowed to
+ * be retired as a percentage of total physical memory
+ * (Remember that we are using basis points, hence the 10000.)
+ */
+ retired_max = (physmem * max_pages_retired_bps) / 10000;
+
+ /*
+ * return 'TRUE' if we have already retired more
+ * than the legal limit
+ */
+ return (retired_pgcnt >= retired_max);
+}
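+
+/*
+ * Worked example with assumed numbers, for illustration only: with
+ * physmem = 1048576 pages and max_pages_retired_bps = 10 (ten basis
+ * points, i.e. 0.1%), the limit computed above is
+ * (1048576 * 10) / 10000 = 1048 pages, so the limit is reported as
+ * exceeded once retired_pgcnt reaches 1048.
+ */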
+
+#define PAGE_RETIRE_SELOCK 0
+#define PAGE_RETIRE_NORECLAIM 1
+#define PAGE_RETIRE_LOCKED 2
+#define PAGE_RETIRE_COW 3
+#define PAGE_RETIRE_DIRTY 4
+#define PAGE_RETIRE_LPAGE 5
+#define PAGE_RETIRE_SUCCESS 6
+#define PAGE_RETIRE_LIMIT 7
+#define PAGE_RETIRE_NCODES 8
+
+typedef struct page_retire_op {
+ int pr_count;
+ short pr_unlock;
+ short pr_retval;
+ char *pr_message;
+} page_retire_op_t;
+
+page_retire_op_t page_retire_ops[PAGE_RETIRE_NCODES] = {
+ { 0, 0, -1, "cannot lock page" },
+ { 0, 0, -1, "cannot reclaim cached page" },
+ { 0, 1, -1, "page is locked" },
+ { 0, 1, -1, "copy-on-write page" },
+ { 0, 1, -1, "page is dirty" },
+ { 0, 1, -1, "cannot demote large page" },
+ { 0, 0, 0, "page successfully retired" },
+ { 0, 0, -1, "excess pages retired already" },
+};
+
+static int
+page_retire_done(page_t *pp, int code)
+{
+ page_retire_op_t *prop = &page_retire_ops[code];
+
+ prop->pr_count++;
+
+ if (prop->pr_unlock)
+ page_unlock(pp);
+
+ if (page_retire_messages > 1) {
+ printf("page_retire(%p) pfn 0x%lx %s: %s\n",
+ (void *)pp, page_pptonum(pp),
+ prop->pr_retval == -1 ? "failed" : "succeeded",
+ prop->pr_message);
+ }
+
+ return (prop->pr_retval);
+}
+
+int
+page_retire(page_t *pp, uchar_t flag)
+{
+ uint64_t pa = ptob((uint64_t)page_pptonum(pp));
+
+ ASSERT(flag == PAGE_IS_FAILING || flag == PAGE_IS_TOXIC);
+
+ /*
+ * DR operations change the association between a page_t
+ * and the physical page it represents. Check if the
+ * page is still bad.
+ */
+ if (!page_isfaulty(pp)) {
+ page_clrtoxic(pp);
+ return (page_retire_done(pp, PAGE_RETIRE_SUCCESS));
+ }
+
+ /*
+ * We set the flag here so that even if we fail due
+ * to exceeding the limit for retired pages, the
+ * page will still be checked and either cleared
+ * or retired in page_free().
+ */
+ page_settoxic(pp, flag);
+
+ if (flag == PAGE_IS_TOXIC) {
+ if (page_retire_messages) {
+ cmn_err(CE_NOTE, "Scheduling clearing of error on"
+ " page 0x%08x.%08x",
+ (uint32_t)(pa >> 32), (uint32_t)pa);
+ }
+
+ } else { /* PAGE_IS_FAILING */
+ if (pages_retired_limit_exceeded()) {
+ /*
+ * Return as we have already exceeded the
+ * maximum number of pages allowed to be
+ * retired
+ */
+ return (page_retire_done(pp, PAGE_RETIRE_LIMIT));
+ }
+
+ if (page_retire_messages) {
+ cmn_err(CE_NOTE, "Scheduling removal of "
+ "page 0x%08x.%08x",
+ (uint32_t)(pa >> 32), (uint32_t)pa);
+ }
+ }
+
+ if (PAGE_LOCKED(pp) || !page_trylock(pp, SE_EXCL))
+ return (page_retire_done(pp, PAGE_RETIRE_SELOCK));
+
+ /*
+ * If this is a large page we first try and demote it
+ * to PAGESIZE pages and then dispose of the toxic page.
+ * On failure we will let the page free/destroy
+ * code handle it later since this is a mapped page.
+ * Note that free large pages can always be demoted.
+ *
+ */
+ if (pp->p_szc != 0) {
+ if (PP_ISFREE(pp))
+ (void) page_demote_free_pages(pp);
+ else
+ (void) page_try_demote_pages(pp);
+
+ if (pp->p_szc != 0)
+ return (page_retire_done(pp, PAGE_RETIRE_LPAGE));
+ }
+
+ if (PP_ISFREE(pp)) {
+ if (!page_reclaim(pp, NULL))
+ return (page_retire_done(pp, PAGE_RETIRE_NORECLAIM));
+ /*LINTED: constant in conditional context*/
+ VN_DISPOSE(pp, pp->p_vnode ? B_INVAL : B_FREE, 0, kcred)
+ return (page_retire_done(pp, PAGE_RETIRE_SUCCESS));
+ }
+
+ if (pp->p_lckcnt != 0)
+ return (page_retire_done(pp, PAGE_RETIRE_LOCKED));
+
+ if (pp->p_cowcnt != 0)
+ return (page_retire_done(pp, PAGE_RETIRE_COW));
+
+ /*
+ * Unload all translations to this page. No new translations
+ * can be created while we hold the exclusive lock on the page.
+ */
+ (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
+
+ if (hat_ismod(pp))
+ return (page_retire_done(pp, PAGE_RETIRE_DIRTY));
+
+ /*LINTED: constant in conditional context*/
+ VN_DISPOSE(pp, B_INVAL, 0, kcred);
+
+ return (page_retire_done(pp, PAGE_RETIRE_SUCCESS));
+}
+
+/*
+ * Mark any existing pages for migration in the given range
+ */
+void
+page_mark_migrate(struct seg *seg, caddr_t addr, size_t len,
+ struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
+ u_offset_t vnoff, int rflag)
+{
+ struct anon *ap;
+ vnode_t *curvp;
+ lgrp_t *from;
+ pgcnt_t i;
+ pgcnt_t nlocked;
+ u_offset_t off;
+ pfn_t pfn;
+ size_t pgsz;
+ size_t segpgsz;
+ pgcnt_t pages;
+ uint_t pszc;
+ page_t **ppa;
+ pgcnt_t ppa_nentries;
+ page_t *pp;
+ caddr_t va;
+ ulong_t an_idx;
+ anon_sync_obj_t cookie;
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ /*
+ * Don't do anything if we don't need to do lgroup optimizations
+ * on this system
+ */
+ if (!lgrp_optimizations())
+ return;
+
+ /*
+ * Align address and length to (potentially large) page boundary
+ */
+ segpgsz = page_get_pagesize(seg->s_szc);
+ addr = (caddr_t)P2ALIGN((uintptr_t)addr, segpgsz);
+ if (rflag)
+ len = P2ROUNDUP(len, segpgsz);
+
+ /*
+ * Allocate page array to accommodate the largest page size
+ */
+ pgsz = page_get_pagesize(page_num_pagesizes() - 1);
+ ppa_nentries = btop(pgsz);
+ ppa = kmem_zalloc(ppa_nentries * sizeof (page_t *), KM_SLEEP);
+
+ /*
+ * Do one (large) page at a time
+ */
+ va = addr;
+ while (va < addr + len) {
+ /*
+ * Look up the (root) page for the vnode and offset corresponding
+ * to this virtual address.
+ * Try the anonmap first since there may be copy-on-write
+ * pages, but initialize the vnode pointer and offset using
+ * the vnode arguments just in case there isn't an amp.
+ */
+ curvp = vp;
+ off = vnoff + va - seg->s_base;
+ if (amp) {
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
+ an_idx = anon_index + seg_page(seg, va);
+ anon_array_enter(amp, an_idx, &cookie);
+ ap = anon_get_ptr(amp->ahp, an_idx);
+ if (ap)
+ swap_xlate(ap, &curvp, &off);
+ anon_array_exit(&cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ }
+
+ pp = NULL;
+ if (curvp)
+ pp = page_lookup(curvp, off, SE_SHARED);
+
+ /*
+ * If there isn't a page at this virtual address,
+ * skip to next page
+ */
+ if (pp == NULL) {
+ va += PAGESIZE;
+ continue;
+ }
+
+ /*
+ * Figure out which lgroup this page is in for kstats
+ */
+ pfn = page_pptonum(pp);
+ from = lgrp_pfn_to_lgrp(pfn);
+
+ /*
+ * Get page size, and round up and skip to next page boundary
+ * if unaligned address
+ */
+ pszc = pp->p_szc;
+ pgsz = page_get_pagesize(pszc);
+ pages = btop(pgsz);
+ if (!IS_P2ALIGNED(va, pgsz) ||
+ !IS_P2ALIGNED(pfn, pages) ||
+ pgsz > segpgsz) {
+ pgsz = MIN(pgsz, segpgsz);
+ page_unlock(pp);
+ i = btop(P2END((uintptr_t)va, pgsz) -
+ (uintptr_t)va);
+ va = (caddr_t)P2END((uintptr_t)va, pgsz);
+ lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS, i);
+ continue;
+ }
+
+ /*
+ * Upgrade to exclusive lock on page
+ */
+ if (!page_tryupgrade(pp)) {
+ page_unlock(pp);
+ va += pgsz;
+ lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS,
+ btop(pgsz));
+ continue;
+ }
+
+ /*
+ * Remember pages locked exclusively and how many
+ */
+ ppa[0] = pp;
+ nlocked = 1;
+
+ /*
+ * Lock constituent pages if this is large page
+ */
+ if (pages > 1) {
+ /*
+ * Lock all constituents except root page, since it
+ * should be locked already.
+ */
+ for (i = 1; i < pages; i++) {
+ pp = page_next(pp);
+ if (!page_trylock(pp, SE_EXCL)) {
+ break;
+ }
+ if (PP_ISFREE(pp) ||
+ pp->p_szc != pszc) {
+ /*
+ * hat_page_demote() raced in with us.
+ */
+ ASSERT(!IS_SWAPFSVP(curvp));
+ page_unlock(pp);
+ break;
+ }
+ ppa[nlocked] = pp;
+ nlocked++;
+ }
+ }
+
+ /*
+ * If we couldn't lock all constituent pages,
+ * unlock the pages locked so far and skip to the next page.
+ */
+ if (nlocked != pages) {
+ for (i = 0; i < nlocked; i++)
+ page_unlock(ppa[i]);
+ va += pgsz;
+ lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS,
+ btop(pgsz));
+ continue;
+ }
+
+ /*
+ * hat_page_demote() can no longer happen
+ * since the last cons page had the right p_szc after
+ * all cons pages were locked. All cons pages
+ * should now have the same p_szc.
+ */
+
+ /*
+ * All constituent pages locked successfully, so mark
+ * large page for migration and unload the mappings of
+ * constituent pages, so a fault will occur on any part of the
+ * large page
+ */
+ PP_SETMIGRATE(ppa[0]);
+ for (i = 0; i < nlocked; i++) {
+ pp = ppa[i];
+ (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
+ ASSERT(hat_page_getshare(pp) == 0);
+ page_unlock(pp);
+ }
+ lgrp_stat_add(from->lgrp_id, LGRP_PMM_PGS, nlocked);
+
+ va += pgsz;
+ }
+ kmem_free(ppa, ppa_nentries * sizeof (page_t *));
+}
+
+/*
+ * Migrate any pages that have been marked for migration in the given range
+ */
+void
+page_migrate(
+ struct seg *seg,
+ caddr_t addr,
+ page_t **ppa,
+ pgcnt_t npages)
+{
+ lgrp_t *from;
+ lgrp_t *to;
+ page_t *newpp;
+ page_t *pp;
+ pfn_t pfn;
+ size_t pgsz;
+ spgcnt_t page_cnt;
+ spgcnt_t i;
+ uint_t pszc;
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ while (npages > 0) {
+ pp = *ppa;
+ pszc = pp->p_szc;
+ pgsz = page_get_pagesize(pszc);
+ page_cnt = btop(pgsz);
+
+ /*
+ * Check to see whether this page is marked for migration
+ *
+ * Assume that root page of large page is marked for
+ * migration and none of the other constituent pages
+ * are marked. This really simplifies clearing the
+ * migrate bit by not having to clear it from each
+ * constituent page.
+ *
+ * Note that we don't want to relocate an entire large page
+ * if someone is only using one subpage.
+ */
+ if (npages < page_cnt)
+ break;
+
+ /*
+ * Is it marked for migration?
+ */
+ if (!PP_ISMIGRATE(pp))
+ goto next;
+
+ /*
+ * Determine lgroups that page is being migrated between
+ */
+ pfn = page_pptonum(pp);
+ if (!IS_P2ALIGNED(pfn, page_cnt)) {
+ break;
+ }
+ from = lgrp_pfn_to_lgrp(pfn);
+ to = lgrp_mem_choose(seg, addr, pgsz);
+
+ /*
+ * Check to see whether we are trying to migrate the page to the
+ * lgroup where it is already allocated
+ */
+ if (to == from) {
+ PP_CLRMIGRATE(pp);
+ goto next;
+ }
+
+ /*
+ * Need to get exclusive locks to migrate
+ */
+ for (i = 0; i < page_cnt; i++) {
+ ASSERT(PAGE_LOCKED(ppa[i]));
+ if (page_pptonum(ppa[i]) != pfn + i ||
+ ppa[i]->p_szc != pszc) {
+ break;
+ }
+ if (!page_tryupgrade(ppa[i])) {
+ lgrp_stat_add(from->lgrp_id,
+ LGRP_PM_FAIL_LOCK_PGS,
+ page_cnt);
+ break;
+ }
+ }
+ if (i != page_cnt) {
+ while (--i != -1) {
+ page_downgrade(ppa[i]);
+ }
+ goto next;
+ }
+
+ (void) page_create_wait(page_cnt, PG_WAIT);
+ newpp = page_get_replacement_page(pp, to, PGR_SAMESZC);
+ if (newpp == NULL) {
+ page_create_putback(page_cnt);
+ for (i = 0; i < page_cnt; i++) {
+ page_downgrade(ppa[i]);
+ }
+ lgrp_stat_add(to->lgrp_id, LGRP_PM_FAIL_ALLOC_PGS,
+ page_cnt);
+ goto next;
+ }
+ ASSERT(newpp->p_szc == pszc);
+ /*
+ * Clear migrate bit and relocate page
+ */
+ PP_CLRMIGRATE(pp);
+ if (page_relocate(&pp, &newpp, 0, 1, &page_cnt, to)) {
+ panic("page_migrate: page_relocate failed");
+ }
+ ASSERT(page_cnt * PAGESIZE == pgsz);
+
+ /*
+ * Keep stats for number of pages migrated from and to
+ * each lgroup
+ */
+ lgrp_stat_add(from->lgrp_id, LGRP_PM_SRC_PGS, page_cnt);
+ lgrp_stat_add(to->lgrp_id, LGRP_PM_DEST_PGS, page_cnt);
+ /*
+ * update the page_t array we were passed in and
+ * unlink constituent pages of a large page.
+ */
+ for (i = 0; i < page_cnt; ++i, ++pp) {
+ ASSERT(PAGE_EXCL(newpp));
+ ASSERT(newpp->p_szc == pszc);
+ ppa[i] = newpp;
+ pp = newpp;
+ page_sub(&newpp, pp);
+ page_downgrade(pp);
+ }
+ ASSERT(newpp == NULL);
+next:
+ addr += pgsz;
+ ppa += page_cnt;
+ npages -= page_cnt;
+ }
+}
+
+/*
+ * initialize the vnode for retired pages
+ */
+static void
+page_retired_init(void)
+{
+ vn_setops(&retired_ppages, &retired_vnodeops);
+}
+
+/* ARGSUSED */
+static void
+retired_dispose(vnode_t *vp, page_t *pp, int flag, int dn, cred_t *cr)
+{
+ panic("retired_dispose invoked");
+}
+
+/* ARGSUSED */
+static void
+retired_inactive(vnode_t *vp, cred_t *cr)
+{}
+
+void
+page_unretire_pages(void)
+{
+ page_t *pp;
+ kmutex_t *vphm;
+ vnode_t *vp;
+ page_t *rpages[UNRETIRE_PAGES];
+ pgcnt_t i, npages, rmem;
+ uint64_t pa;
+
+ rmem = 0;
+
+ for (;;) {
+ /*
+ * We do this in 2 steps:
+ *
+ * 1. We walk the retired pages list and collect a list of
+ * pages that have the toxic field cleared.
+ *
+ * 2. We iterate through the page list and unretire each one.
+ *
+ * We have to do it in two steps on account of the mutexes that
+ * we need to acquire.
+ */
+
+ vp = &retired_ppages;
+ vphm = page_vnode_mutex(vp);
+ mutex_enter(vphm);
+
+ if ((pp = vp->v_pages) == NULL) {
+ mutex_exit(vphm);
+ break;
+ }
+
+ i = 0;
+ do {
+ ASSERT(pp != NULL);
+ ASSERT(pp->p_vnode == vp);
+
+ /*
+ * DR operations change the association between a page_t
+ * and the physical page it represents. Check if the
+ * page is still bad. If not, unretire it.
+ */
+ if (!page_isfaulty(pp))
+ rpages[i++] = pp;
+
+ pp = pp->p_vpnext;
+ } while ((pp != vp->v_pages) && (i < UNRETIRE_PAGES));
+
+ mutex_exit(vphm);
+
+ npages = i;
+ for (i = 0; i < npages; i++) {
+ pp = rpages[i];
+ pa = ptob((uint64_t)page_pptonum(pp));
+
+ /*
+ * Need to upgrade the shared lock to an exclusive
+ * lock in order to hash out the page.
+ *
+ * The page could have been retired but the page lock
+ * may not have been downgraded yet. If so, skip this
+ * page. page_free() will call this function after the
+ * lock is downgraded.
+ */
+
+ if (!PAGE_SHARED(pp) || !page_tryupgrade(pp))
+ continue;
+
+ /*
+ * Both page_free() and DR call this function. They
+ * can potentially call this function at the same
+ * time and race with each other.
+ */
+ if (!page_isretired(pp) || page_isfaulty(pp)) {
+ page_downgrade(pp);
+ continue;
+ }
+
+ cmn_err(CE_NOTE,
+ "unretiring retired page 0x%08x.%08x",
+ (uint32_t)(pa >> 32), (uint32_t)pa);
+
+ /*
+ * When a page is removed from the retired pages vnode,
+ * its toxic field is also cleared. So, we do not have
+ * to do that separately here.
+ */
+ page_hashout(pp, (kmutex_t *)NULL);
+
+ /*
+ * This is a good page. So, free it.
+ */
+ pp->p_vnode = NULL;
+ page_free(pp, 1);
+ rmem++;
+ }
+
+ /*
+ * If the rpages array was filled up, then there could be more
+ * retired pages that are not faulty. We need to iterate
+ * again and unretire them. Otherwise, we are done.
+ */
+ if (npages < UNRETIRE_PAGES)
+ break;
+ }
+
+ mutex_enter(&freemem_lock);
+ availrmem += rmem;
+ mutex_exit(&freemem_lock);
+}
+
+ulong_t mem_waiters = 0;
+ulong_t max_count = 20;
+#define MAX_DELAY 0x1ff
+
+/*
+ * Check if enough memory is available to proceed.
+ * Depending on system configuration and how much memory is
+ * reserved for swap, we need to check against two variables.
+ * For example, on systems with little physical swap, availrmem
+ * can be a more reliable indicator of how much memory is available.
+ * On systems with large physical swap, freemem can be a better indicator.
+ * If freemem drops below the threshold level, don't return an error
+ * immediately but wake up pageout to free memory and block.
+ * This is done a number of times. If pageout is not able to free
+ * memory within a certain time, return an error.
+ * The same applies for availrmem, but kmem_reap is used to
+ * free memory.
+ */
+int
+page_mem_avail(pgcnt_t npages)
+{
+ ulong_t count;
+
+#if defined(__i386)
+ if (freemem > desfree + npages &&
+ availrmem > swapfs_reserve + npages &&
+ btop(vmem_size(heap_arena, VMEM_FREE)) > tune.t_minarmem +
+ npages)
+ return (1);
+#else
+ if (freemem > desfree + npages &&
+ availrmem > swapfs_reserve + npages)
+ return (1);
+#endif
+
+ count = max_count;
+ atomic_add_long(&mem_waiters, 1);
+
+ while (freemem < desfree + npages && --count) {
+ cv_signal(&proc_pageout->p_cv);
+ if (delay_sig(hz + (mem_waiters & MAX_DELAY))) {
+ atomic_add_long(&mem_waiters, -1);
+ return (0);
+ }
+ }
+ if (count == 0) {
+ atomic_add_long(&mem_waiters, -1);
+ return (0);
+ }
+
+ count = max_count;
+ while (availrmem < swapfs_reserve + npages && --count) {
+ kmem_reap();
+ if (delay_sig(hz + (mem_waiters & MAX_DELAY))) {
+ atomic_add_long(&mem_waiters, -1);
+ return (0);
+ }
+ }
+ atomic_add_long(&mem_waiters, -1);
+ if (count == 0)
+ return (0);
+
+#if defined(__i386)
+ if (btop(vmem_size(heap_arena, VMEM_FREE)) <
+ tune.t_minarmem + npages)
+ return (0);
+#endif
+ return (1);
+}
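+
+/*
+ * Illustrative sketch (hypothetical caller, not original code): a
+ * routine about to consume npages of memory could gate itself on
+ * page_mem_avail(), failing softly once the pageout/kmem_reap retries
+ * above have been exhausted.
+ *
+ *	if (!page_mem_avail(npages))
+ *		return (EAGAIN);
+ *	... go ahead and allocate the pages ...
+ */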
+
+
+/*
+ * Search the memory segments to locate the desired page. Within a
+ * segment, pages increase linearly with one page structure per
+ * physical page frame (size PAGESIZE). The search begins
+ * with the segment that was accessed last, to take advantage of locality.
+ * If the hint misses, we start from the beginning of the sorted memseg list
+ */
+
+
+/*
+ * Some data structures for pfn to pp lookup.
+ */
+ulong_t mhash_per_slot;
+struct memseg *memseg_hash[N_MEM_SLOTS];
+
+page_t *
+page_numtopp_nolock(pfn_t pfnum)
+{
+ static struct memseg *last_memseg_by_pfnum = NULL;
+ struct memseg *seg;
+ page_t *pp;
+
+ /*
+ * XXX - Since page_numtopp_nolock is called in many places where
+ * the search fails more often than it succeeds, it may be worthwhile
+ * to put a check for pf_is_memory or a pfnum <= max_pfn (set at
+ * boot time).
+ *
+ * if (!pf_is_memory(pfnum) || (pfnum > max_pfn))
+ * return (NULL);
+ */
+
+ MEMSEG_STAT_INCR(nsearch);
+
+ /* Try last winner first */
+ if (((seg = last_memseg_by_pfnum) != NULL) &&
+ (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
+ MEMSEG_STAT_INCR(nlastwon);
+ pp = seg->pages + (pfnum - seg->pages_base);
+ if (pp->p_pagenum == pfnum)
+ return ((page_t *)pp);
+ }
+
+ /* Else Try hash */
+ if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) &&
+ (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
+ MEMSEG_STAT_INCR(nhashwon);
+ last_memseg_by_pfnum = seg;
+ pp = seg->pages + (pfnum - seg->pages_base);
+ if (pp->p_pagenum == pfnum)
+ return ((page_t *)pp);
+ }
+
+ /* Else Brute force */
+ for (seg = memsegs; seg != NULL; seg = seg->next) {
+ if (pfnum >= seg->pages_base && pfnum < seg->pages_end) {
+ last_memseg_by_pfnum = seg;
+ pp = seg->pages + (pfnum - seg->pages_base);
+ return ((page_t *)pp);
+ }
+ }
+ last_memseg_by_pfnum = NULL;
+ MEMSEG_STAT_INCR(nnotfound);
+ return ((page_t *)NULL);
+}
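+
+/*
+ * Illustrative example (hypothetical caller): the hint/hash/linear
+ * search above is transparent to callers; a lookup either returns the
+ * page_t for the pfn or NULL when the pfn is not covered by any
+ * memseg.
+ *
+ *	page_t *pp = page_numtopp_nolock(pfn);
+ *
+ *	if (pp != NULL)
+ *		ASSERT(page_pptonum(pp) == pfn);
+ */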
+
+struct memseg *
+page_numtomemseg_nolock(pfn_t pfnum)
+{
+ struct memseg *seg;
+ page_t *pp;
+
+ /* Try hash */
+ if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) &&
+ (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
+ pp = seg->pages + (pfnum - seg->pages_base);
+ if (pp->p_pagenum == pfnum)
+ return (seg);
+ }
+
+ /* Else Brute force */
+ for (seg = memsegs; seg != NULL; seg = seg->next) {
+ if (pfnum >= seg->pages_base && pfnum < seg->pages_end) {
+ return (seg);
+ }
+ }
+ return ((struct memseg *)NULL);
+}
+
+/*
+ * Given a page and a count, return the page struct that is
+ * n structs away from the current one in the global page
+ * list.
+ *
+ * This function wraps to the first page upon
+ * reaching the end of the memseg list.
+ */
+page_t *
+page_nextn(page_t *pp, ulong_t n)
+{
+ static struct memseg *last_page_next_memseg = NULL;
+ struct memseg *seg;
+ page_t *ppn;
+
+ if (((seg = last_page_next_memseg) == NULL) ||
+ (seg->pages_base == seg->pages_end) ||
+ !(pp >= seg->pages && pp < seg->epages)) {
+
+ for (seg = memsegs; seg; seg = seg->next) {
+ if (pp >= seg->pages && pp < seg->epages)
+ break;
+ }
+
+ if (seg == NULL) {
+ /* Memory delete got in, return something valid. */
+ /* TODO: fix me. */
+ seg = memsegs;
+ pp = seg->pages;
+ }
+ }
+
+ /* check for wraparound - possible if n is large */
+ while ((ppn = (pp + n)) >= seg->epages || ppn < pp) {
+ n -= seg->epages - pp;
+ seg = seg->next;
+ if (seg == NULL)
+ seg = memsegs;
+ pp = seg->pages;
+ }
+ last_page_next_memseg = seg;
+ return (ppn);
+}
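+
+/*
+ * Illustrative sketch (hypothetical loop, not original code): because
+ * page_nextn() wraps, a full walk of every page can be written as a
+ * do/while that stops when it returns to its starting point.
+ *
+ *	page_t *first = page_first();
+ *	page_t *pp = first;
+ *
+ *	do {
+ *		... examine pp ...
+ *		pp = page_next(pp);
+ *	} while (pp != first);
+ */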
+
+/*
+ * Initialize for a loop using page_next_scan_large().
+ */
+page_t *
+page_next_scan_init(void **cookie)
+{
+ ASSERT(cookie != NULL);
+ *cookie = (void *)memsegs;
+ return ((page_t *)memsegs->pages);
+}
+
+/*
+ * Return the next page in a scan of page_t's, assuming we want
+ * to skip over sub-pages within larger page sizes.
+ *
+ * The cookie is used to keep track of the current memseg.
+ */
+page_t *
+page_next_scan_large(
+ page_t *pp,
+ ulong_t *n,
+ void **cookie)
+{
+ struct memseg *seg = (struct memseg *)*cookie;
+ page_t *new_pp;
+ ulong_t cnt;
+ pfn_t pfn;
+
+
+ /*
+ * get the count of page_t's to skip based on the page size
+ */
+ ASSERT(pp != NULL);
+ if (pp->p_szc == 0) {
+ cnt = 1;
+ } else {
+ pfn = page_pptonum(pp);
+ cnt = page_get_pagecnt(pp->p_szc);
+ cnt -= pfn & (cnt - 1);
+ }
+ *n += cnt;
+ new_pp = pp + cnt;
+
+ /*
+ * Catch if we went past the end of the current memory segment. If so,
+ * just move to the next segment with pages.
+ */
+ if (new_pp >= seg->epages) {
+ do {
+ seg = seg->next;
+ if (seg == NULL)
+ seg = memsegs;
+ } while (seg->pages == seg->epages);
+ new_pp = seg->pages;
+ *cookie = (void *)seg;
+ }
+
+ return (new_pp);
+}
+
+
+/*
+ * Returns next page in list. Note: this function wraps
+ * to the first page in the list upon reaching the end
+ * of the list. Callers should be aware of this fact.
+ */
+
+/* We should change this to be a #define */
+
+page_t *
+page_next(page_t *pp)
+{
+ return (page_nextn(pp, 1));
+}
+
+/*
+ * Special for routines processing an array of page_t.
+ */
+page_t *
+page_nextn_raw(page_t *pp, ulong_t n)
+{
+ return (pp+n);
+}
+
+page_t *
+page_first()
+{
+ return ((page_t *)memsegs->pages);
+}
+
+
+/*
+ * This routine is called at boot with the initial memory configuration
+ * and when memory is added or removed.
+ */
+void
+build_pfn_hash()
+{
+ pfn_t cur;
+ pgcnt_t index;
+ struct memseg *pseg;
+ int i;
+
+ /*
+ * Clear memseg_hash array.
+ * Since memory add/delete is designed to operate concurrently
+ * with normal operation, the hash rebuild must be able to run
+ * concurrently with page_numtopp_nolock(). To support this
+ * functionality, assignments to memseg_hash array members must
+ * be done atomically.
+ *
+ * NOTE: bzero() does not currently guarantee this for kernel
+ * threads, and cannot be used here.
+ */
+ for (i = 0; i < N_MEM_SLOTS; i++)
+ memseg_hash[i] = NULL;
+
+ hat_kpm_mseghash_clear(N_MEM_SLOTS);
+
+ /*
+ * Physmax is the last valid pfn.
+ */
+ mhash_per_slot = (physmax + 1) >> MEM_HASH_SHIFT;
+ for (pseg = memsegs; pseg != NULL; pseg = pseg->next) {
+ index = MEMSEG_PFN_HASH(pseg->pages_base);
+ cur = pseg->pages_base;
+ do {
+ if (index >= N_MEM_SLOTS)
+ index = MEMSEG_PFN_HASH(cur);
+
+ if (memseg_hash[index] == NULL ||
+ memseg_hash[index]->pages_base > pseg->pages_base) {
+ memseg_hash[index] = pseg;
+ hat_kpm_mseghash_update(index, pseg);
+ }
+ cur += mhash_per_slot;
+ index++;
+ } while (cur < pseg->pages_end);
+ }
+}
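+
+/*
+ * Worked example (illustrative; the numbers are made up): if
+ * physmax + 1 is 2^20 pfns, mhash_per_slot is 2^20 >> MEM_HASH_SHIFT
+ * pfns per hash slot.  Each slot is seeded above with the memseg that
+ * has the lowest pages_base among those overlapping the slot's pfn
+ * range, so page_numtopp_nolock() can start its search at (or very
+ * near) the right memseg instead of walking the whole memseg list.
+ */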
+
+/*
+ * Return the pagenum for the pp
+ */
+pfn_t
+page_pptonum(page_t *pp)
+{
+ return (pp->p_pagenum);
+}
+
+/*
+ * Interface to the referenced, modified, etc. bits
+ * in the PSM part of the page struct
+ * when no locking is desired.
+ */
+void
+page_set_props(page_t *pp, uint_t flags)
+{
+ ASSERT((flags & ~(P_MOD | P_REF | P_RO)) == 0);
+ pp->p_nrm |= (uchar_t)flags;
+}
+
+void
+page_clr_all_props(page_t *pp)
+{
+ pp->p_nrm = 0;
+}
+
+/*
+ * The following function is called from free_vp_pages()
+ * for an inexact estimate of a newly freed page...
+ */
+ulong_t
+page_share_cnt(page_t *pp)
+{
+ return (hat_page_getshare(pp));
+}
+
+/*
+ * The following functions are used in handling memory
+ * errors.
+ */
+
+int
+page_istoxic(page_t *pp)
+{
+ return ((pp->p_toxic & PAGE_IS_TOXIC) == PAGE_IS_TOXIC);
+}
+
+int
+page_isfailing(page_t *pp)
+{
+ return ((pp->p_toxic & PAGE_IS_FAILING) == PAGE_IS_FAILING);
+}
+
+int
+page_isretired(page_t *pp)
+{
+ return ((pp->p_toxic & PAGE_IS_RETIRED) == PAGE_IS_RETIRED);
+}
+
+int
+page_deteriorating(page_t *pp)
+{
+ return ((pp->p_toxic & (PAGE_IS_TOXIC | PAGE_IS_FAILING)) != 0);
+}
+
+void
+page_settoxic(page_t *pp, uchar_t flag)
+{
+ uchar_t new_flag = 0;
+ while ((new_flag & flag) != flag) {
+ uchar_t old_flag = pp->p_toxic;
+ new_flag = old_flag | flag;
+ (void) cas8(&pp->p_toxic, old_flag, new_flag);
+ new_flag = ((volatile page_t *)pp)->p_toxic;
+ }
+}
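+
+/*
+ * Illustrative note: the loop above is an atomic "p_toxic |= flag"
+ * built from cas8().  A sketch of the same retry pattern, checking the
+ * cas8() return value instead of re-reading p_toxic:
+ *
+ *	uchar_t old;
+ *
+ *	do {
+ *		old = pp->p_toxic;
+ *	} while (cas8(&pp->p_toxic, old, old | flag) != old);
+ */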
+
+void
+page_clrtoxic(page_t *pp)
+{
+ /*
+ * We don't need to worry about atomicity on the
+ * p_toxic flag here as this is only called from
+ * page_free() while holding an exclusive lock on
+ * the page
+ */
+ pp->p_toxic = PAGE_IS_OK;
+}
+
+void
+page_clrtoxic_flag(page_t *pp, uchar_t flag)
+{
+ uchar_t new_flag = ((volatile page_t *)pp)->p_toxic;
+ while ((new_flag & flag) == flag) {
+ uchar_t old_flag = new_flag;
+ new_flag = old_flag & ~flag;
+ (void) cas8(&pp->p_toxic, old_flag, new_flag);
+ new_flag = ((volatile page_t *)pp)->p_toxic;
+ }
+}
+
+int
+page_isfaulty(page_t *pp)
+{
+ return ((pp->p_toxic & PAGE_IS_FAULTY) == PAGE_IS_FAULTY);
+}
+
+/*
+ * The following four functions are called from /proc code
+ * for the /proc/<pid>/xmap interface.
+ */
+int
+page_isshared(page_t *pp)
+{
+ return (hat_page_getshare(pp) > 1);
+}
+
+int
+page_isfree(page_t *pp)
+{
+ return (PP_ISFREE(pp));
+}
+
+int
+page_isref(page_t *pp)
+{
+ return (hat_page_getattr(pp, P_REF));
+}
+
+int
+page_ismod(page_t *pp)
+{
+ return (hat_page_getattr(pp, P_MOD));
+}
diff --git a/usr/src/uts/common/vm/vm_pagelist.c b/usr/src/uts/common/vm/vm_pagelist.c
new file mode 100644
index 0000000000..3d1d773321
--- /dev/null
+++ b/usr/src/uts/common/vm/vm_pagelist.c
@@ -0,0 +1,3726 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * This file contains common functions to access and manage the page lists.
+ * Many of these routines originated from platform dependent modules
+ * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and modified to function in
+ * a platform independent manner.
+ *
+ * vm/vm_dep.h provides for platform specific support.
+ */
+
+#include <sys/types.h>
+#include <sys/debug.h>
+#include <sys/cmn_err.h>
+#include <sys/systm.h>
+#include <sys/atomic.h>
+#include <sys/sysmacros.h>
+#include <vm/as.h>
+#include <vm/page.h>
+#include <vm/seg_kmem.h>
+#include <vm/seg_vn.h>
+#include <sys/memnode.h>
+#include <vm/vm_dep.h>
+#include <sys/lgrp.h>
+#include <sys/mem_config.h>
+#include <sys/callb.h>
+#include <sys/mem_cage.h>
+#include <sys/sdt.h>
+
+extern uint_t vac_colors;
+
+/*
+ * Number of page colors equivalent to the requested color in page_get routines.
+ * If set, keeps large pages intact longer and keeps MPO allocation
+ * from the local mnode in favor of acquiring the 'correct' page color from
+ * a demoted large page or from a remote mnode.
+ */
+int colorequiv;
+
+/*
+ * If set, specifies the percentage of large pages that are free from within
+ * a large page region before attempting to lock those pages for
+ * page_get_contig_pages processing.
+ *
+ * Should be turned on when kpr is available, when page_trylock_contig_pages
+ * can be more selective.
+ */
+
+int ptcpthreshold;
+
+/*
+ * Limit the page_get_contig_pages search based on failure counts in
+ * pgcpfailcnt[]. Use slot 0 (the base page size slot, otherwise unused) to
+ * enable or disable limiting the search. Enabled by default.
+ */
+int pgcpfailcnt[MMU_PAGE_SIZES];
+int pgcplimitsearch = 1;
+
+#ifdef VM_STATS
+struct vmm_vmstats_str vmm_vmstats;
+#endif /* VM_STATS */
+
+#if defined(__sparc)
+#define LPGCREATE 0
+#else
+/* enable page_get_contig_pages */
+#define LPGCREATE 1
+#endif
+
+int pg_contig_disable;
+int pg_lpgcreate_nocage = LPGCREATE;
+
+/*
+ * page_freelist_fill pfn flag to signify no hi pfn requirement.
+ */
+#define PFNNULL 0
+
+/* Flags involved in promotion and demotion routines */
+#define PC_FREE 0x1 /* put page on freelist */
+#define PC_ALLOC 0x2 /* return page for allocation */
+
+/*
+ * Flag for page_demote to be used with PC_FREE to denote that we don't care
+ * what the color is as the color parameter to the function is ignored.
+ */
+#define PC_NO_COLOR (-1)
+
+/*
+ * page counters candidates info
+ * See page_ctrs_cands comment below for more details.
+ * fields are as follows:
+ * pcc_pages_free: # pages which freelist coalesce can create
+ * pcc_color_free_len: number of elements in pcc_color_free array
+ * pcc_color_free: pointer to page free counts per color
+ */
+typedef struct pcc_info {
+ pgcnt_t pcc_pages_free;
+ int pcc_color_free_len;
+ pgcnt_t *pcc_color_free;
+} pcc_info_t;
+
+/*
+ * On big machines it can take a long time to check page_counters
+ * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
+ * updated sum of all elements of the corresponding page_counters arrays.
+ * page_freelist_coalesce() searches page_counters only if an appropriate
+ * element of page_ctrs_cands array is greater than 0.
+ *
+ * An extra dimension is used for page_ctrs_cands to spread the elements
+ * over a few e$ cache lines to avoid serialization during the array
+ * updates.
+ */
+#pragma align 64(page_ctrs_cands)
+
+static pcc_info_t *page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];
+
+/*
+ * Return in val the total number of free pages which can be created
+ * for the given mnode (m) and region size (r)
+ */
+#define PGCTRS_CANDS_GETVALUE(m, r, val) { \
+ int i; \
+ val = 0; \
+ for (i = 0; i < NPC_MUTEX; i++) { \
+ val += page_ctrs_cands[i][(r)][(m)].pcc_pages_free; \
+ } \
+}
+
+/*
+ * Return in val the total number of free pages which can be created
+ * for the given mnode (m), region size (r), and color (c)
+ */
+#define PGCTRS_CANDS_GETVALUECOLOR(m, r, c, val) { \
+ int i; \
+ val = 0; \
+ ASSERT((c) < page_ctrs_cands[0][(r)][(m)].pcc_color_free_len); \
+ for (i = 0; i < NPC_MUTEX; i++) { \
+ val += page_ctrs_cands[i][(r)][(m)].pcc_color_free[(c)]; \
+ } \
+}
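+
+/*
+ * Illustrative use (hypothetical caller): a coalescing routine can
+ * consult the summary before paying the cost of scanning the full
+ * page_counters array for a given mnode and region size r.
+ *
+ *	pgcnt_t pgfree;
+ *
+ *	PGCTRS_CANDS_GETVALUE(mnode, r, pgfree);
+ *	if (pgfree == 0)
+ *		return (NULL);
+ *
+ * When the summary is zero, nothing can be coalesced at size r and the
+ * expensive scan is skipped entirely.
+ */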
+
+/*
+ * We can only allow a single thread to update a counter within the physical
+ * range of the largest supported page size. That is the finest granularity
+ * possible since the counter values are dependent on each other
+ * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
+ * ctr_mutex lock index for a particular physical range.
+ */
+static kmutex_t *ctr_mutex[NPC_MUTEX];
+
+#define PP_CTR_LOCK_INDX(pp) \
+ (((pp)->p_pagenum >> \
+ (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))
+
+/*
+ * Local functions prototypes.
+ */
+
+void page_ctr_add(page_t *, int);
+void page_ctr_add_internal(int, page_t *, int);
+void page_ctr_sub(page_t *, int);
+uint_t page_convert_color(uchar_t, uchar_t, uint_t);
+void page_freelist_lock(int);
+void page_freelist_unlock(int);
+page_t *page_promote(int, pfn_t, uchar_t, int);
+page_t *page_demote(int, pfn_t, uchar_t, uchar_t, int, int);
+page_t *page_freelist_fill(uchar_t, int, int, int, pfn_t);
+page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
+static int page_trylock_cons(page_t *pp, se_t se);
+
+#define PNUM_SIZE(szc) \
+ (hw_page_array[(szc)].hp_size >> hw_page_array[0].hp_shift)
+#define PNUM_SHIFT(szc) \
+ (hw_page_array[(szc)].hp_shift - hw_page_array[0].hp_shift)
+
+/*
+ * The page_counters array below is used to keep track of free contiguous
+ * physical memory. A hw_page_map_t will be allocated per mnode per szc.
+ * This contains an array of counters, the size of the array, a shift value
+ * used to convert a pagenum into a counter array index or vice versa, as
+ * well as a cache of the last successful index to be promoted to a larger
+ * page size. As an optimization, we keep track of the last successful index
+ * to be promoted per page color for the given size region, and this is
+ * allocated dynamically based upon the number of colors for a given
+ * region size.
+ *
+ * Conceptually, the page counters are represented as:
+ *
+ * page_counters[region_size][mnode]
+ *
+ * region_size: size code of a candidate larger page made up
+ * of contiguous free smaller pages.
+ *
+ * page_counters[region_size][mnode].hpm_counters[index]:
+ * represents how many (region_size - 1) pages either
+ * exist or can be created within the given index range.
+ *
+ * Let's look at a sparc example:
+ * If we want to create a free 512k page, we look at region_size 2
+ * for the mnode we want. We calculate the index and look at a specific
+ * hpm_counters location. If we see 8 (FULL_REGION_CNT on sparc) at
+ * this location, it means that 8 64k pages either exist or can be created
+ * from 8K pages in order to make a single free 512k page at the given
+ * index. Note that when a region is full, it will contribute to the
+ * counts in the region above it. Thus we will not know what page
+ * size the free pages that can be promoted into this new free page
+ * will be, unless we look at all regions below the current region.
+ */
+
+/*
+ * Note: hpmctr_t is defined in platform vm_dep.h
+ * hw_page_map_t contains all the information needed for the page_counters
+ * logic. The fields are as follows:
+ *
+ * hpm_counters: dynamically allocated array to hold counter data
+ * hpm_entries: entries in hpm_counters
+ * hpm_shift: shift for pnum/array index conv
+ * hpm_base: PFN mapped to counter index 0
+ * hpm_color_current_len: # of elements in hpm_color_current "array" below
+ * hpm_color_current: last index in counter array for this color at
+ * which we successfully created a large page
+ */
+typedef struct hw_page_map {
+ hpmctr_t *hpm_counters;
+ size_t hpm_entries;
+ int hpm_shift;
+ pfn_t hpm_base;
+ size_t hpm_color_current_len;
+ size_t *hpm_color_current;
+} hw_page_map_t;
+
+/*
+ * Element zero is not used, but is allocated for convenience.
+ */
+static hw_page_map_t *page_counters[MMU_PAGE_SIZES];
+
+/*
+ * The following macros are convenient ways to get access to the individual
+ * elements of the page_counters arrays. They can be used on both
+ * the left side and right side of equations.
+ */
+#define PAGE_COUNTERS(mnode, rg_szc, idx) \
+ (page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])
+
+#define PAGE_COUNTERS_COUNTERS(mnode, rg_szc) \
+ (page_counters[(rg_szc)][(mnode)].hpm_counters)
+
+#define PAGE_COUNTERS_SHIFT(mnode, rg_szc) \
+ (page_counters[(rg_szc)][(mnode)].hpm_shift)
+
+#define PAGE_COUNTERS_ENTRIES(mnode, rg_szc) \
+ (page_counters[(rg_szc)][(mnode)].hpm_entries)
+
+#define PAGE_COUNTERS_BASE(mnode, rg_szc) \
+ (page_counters[(rg_szc)][(mnode)].hpm_base)
+
+#define PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, rg_szc) \
+ (page_counters[(rg_szc)][(mnode)].hpm_color_current_len)
+
+#define PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc) \
+ (page_counters[(rg_szc)][(mnode)].hpm_color_current)
+
+#define PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color) \
+ (page_counters[(rg_szc)][(mnode)].hpm_color_current[(color)])
+
+#define PNUM_TO_IDX(mnode, rg_szc, pnum) \
+ (((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >> \
+ PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))
+
+#define IDX_TO_PNUM(mnode, rg_szc, index) \
+ (PAGE_COUNTERS_BASE((mnode), (rg_szc)) + \
+ ((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))
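+
+/*
+ * Worked example (illustrative numbers): with hpm_base of 0x1000 and
+ * hpm_shift of 3 for some (mnode, rg_szc), pfn 0x1050 maps to counter
+ * index (0x1050 - 0x1000) >> 3 = 0xa, and IDX_TO_PNUM maps index 0xa
+ * back to 0x1000 + (0xa << 3) = 0x1050.  This is the identity that the
+ * ASSERTs in page_ctrs_alloc() and page_ctrs_adjust() verify.
+ */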
+
+/*
+ * Protects the hpm_counters and hpm_color_current memory from changing while
+ * looking at page counters information.
+ * Grab the write lock to modify what these fields point at.
+ * Grab the read lock to prevent any pointers from changing.
+ * The write lock can not be held during memory allocation due to a possible
+ * recursion deadlock with trying to grab the read lock while the
+ * write lock is already held.
+ */
+krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];
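+
+/*
+ * Illustrative sketch (hypothetical reader, not original code) of the
+ * locking protocol described above:
+ *
+ *	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
+ *	... look at PAGE_COUNTERS(mnode, r, idx) and friends ...
+ *	rw_exit(&page_ctrs_rwlock[mnode]);
+ *
+ * Writers (see page_ctrs_adjust()) take RW_WRITER, and must do their
+ * memory allocation before acquiring the lock.
+ */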
+
+/*
+ * page size to page size code
+ */
+int
+page_szc(size_t pagesize)
+{
+ int i = 0;
+
+ while (hw_page_array[i].hp_size) {
+ if (pagesize == hw_page_array[i].hp_size)
+ return (i);
+ i++;
+ }
+ return (-1);
+}
+
+/*
+ * page size to page size code for user supported page sizes
+ */
+int
+page_user_szc(size_t pagesize)
+{
+ int szc = page_szc(pagesize);
+ if (szc != -1)
+ return (SZC_2_USERSZC(szc));
+ return (-1);
+}
+
+/*
+ * Return how many page sizes are available for the user to use. This is
+ * what the hardware supports and not based upon how the OS implements the
+ * support of different page sizes.
+ */
+uint_t
+page_num_user_pagesizes(void)
+{
+ return (mmu_exported_page_sizes);
+}
+
+uint_t
+page_num_pagesizes(void)
+{
+ return (mmu_page_sizes);
+}
+
+/*
+ * returns the count of the number of base pagesize pages associated with szc
+ */
+pgcnt_t
+page_get_pagecnt(uint_t szc)
+{
+ if (szc >= mmu_page_sizes)
+ panic("page_get_pagecnt: out of range %d", szc);
+ return (hw_page_array[szc].hp_pgcnt);
+}
+
+size_t
+page_get_pagesize(uint_t szc)
+{
+ if (szc >= mmu_page_sizes)
+ panic("page_get_pagesize: out of range %d", szc);
+ return (hw_page_array[szc].hp_size);
+}
+
+/*
+ * Return the size of a page based upon the index passed in. An index of
+ * zero refers to the smallest page size in the system, and as index increases
+ * it refers to the next larger supported page size in the system.
+ * Note that szc and userszc may not be the same due to unsupported szc's on
+ * some systems.
+ */
+size_t
+page_get_user_pagesize(uint_t userszc)
+{
+ uint_t szc = USERSZC_2_SZC(userszc);
+
+ if (szc >= mmu_page_sizes)
+ panic("page_get_user_pagesize: out of range %d", szc);
+ return (hw_page_array[szc].hp_size);
+}
+
+uint_t
+page_get_shift(uint_t szc)
+{
+ if (szc >= mmu_page_sizes)
+ panic("page_get_shift: out of range %d", szc);
+ return (hw_page_array[szc].hp_shift);
+}
+
+uint_t
+page_get_pagecolors(uint_t szc)
+{
+ ASSERT(page_colors != 0);
+ return (MAX(page_colors >> PAGE_BSZS_SHIFT(szc), 1));
+}
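+
+/*
+ * Worked example (illustrative numbers only): if page_colors is 64 and
+ * PAGE_BSZS_SHIFT(szc) is 3 for some szc, this returns 64 >> 3 = 8
+ * colors; when the shift is large enough that the quotient would be 0,
+ * the MAX() clamps the result to 1.
+ */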
+
+/*
+ * Called by startup().
+ * Size up the per page size free list counters based on physmax
+ * of each node and max_mem_nodes.
+ */
+size_t
+page_ctrs_sz(void)
+{
+ int r; /* region size */
+ int mnode;
+ uint_t ctrs_sz = 0;
+ int i;
+ pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
+
+ /*
+ * We need to determine how many page colors there are for each
+ * page size in order to allocate memory for any color specific
+ * arrays.
+ */
+ colors_per_szc[0] = page_colors;
+ for (i = 1; i < mmu_page_sizes; i++) {
+ colors_per_szc[i] =
+ page_convert_color(0, i, page_colors - 1) + 1;
+ }
+
+ for (mnode = 0; mnode < max_mem_nodes; mnode++) {
+
+ pgcnt_t r_pgcnt;
+ pfn_t r_base;
+ pgcnt_t r_align;
+
+ if (mem_node_config[mnode].exists == 0)
+ continue;
+
+ /*
+ * determine size needed for page counter arrays with
+ * base aligned to large page size.
+ */
+ for (r = 1; r < mmu_page_sizes; r++) {
+ /* add in space for hpm_counters */
+ r_align = page_get_pagecnt(r);
+ r_base = mem_node_config[mnode].physbase;
+ r_base &= ~(r_align - 1);
+ r_pgcnt = howmany(mem_node_config[mnode].physmax -
+ r_base, r_align);
+ /*
+ * Round up to always allocate on pointer sized
+ * boundaries.
+ */
+ ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
+ sizeof (hpmctr_t *));
+
+ /* add in space for hpm_color_current */
+ ctrs_sz += (colors_per_szc[r] *
+ sizeof (size_t));
+ }
+ }
+
+ for (r = 1; r < mmu_page_sizes; r++) {
+ ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));
+
+ /* add in space for page_ctrs_cands */
+ ctrs_sz += NPC_MUTEX * max_mem_nodes * (sizeof (pcc_info_t));
+ ctrs_sz += NPC_MUTEX * max_mem_nodes * colors_per_szc[r] *
+ sizeof (pgcnt_t);
+ }
+
+ /* ctr_mutex */
+ ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));
+
+ /* size for page list counts */
+ PLCNT_SZ(ctrs_sz);
+
+ /*
+ * Add some slop for roundups. page_ctrs_alloc will round up the start
+ * address of the counters to an ecache_alignsize boundary for every
+ * memory node.
+ */
+ return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
+}
+
+caddr_t
+page_ctrs_alloc(caddr_t alloc_base)
+{
+ int mnode;
+ int r; /* region size */
+ int i;
+ pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
+
+ /*
+ * We need to determine how many page colors there are for each
+ * page size in order to allocate memory for any color specific
+ * arrays.
+ */
+ colors_per_szc[0] = page_colors;
+ for (i = 1; i < mmu_page_sizes; i++) {
+ colors_per_szc[i] =
+ page_convert_color(0, i, page_colors - 1) + 1;
+ }
+
+ for (r = 1; r < mmu_page_sizes; r++) {
+ page_counters[r] = (hw_page_map_t *)alloc_base;
+ alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
+ }
+
+ /* page_ctrs_cands */
+ for (r = 1; r < mmu_page_sizes; r++) {
+ for (i = 0; i < NPC_MUTEX; i++) {
+ page_ctrs_cands[i][r] = (pcc_info_t *)alloc_base;
+ alloc_base += max_mem_nodes * (sizeof (pcc_info_t));
+
+ }
+ }
+
+ /* page_ctrs_cands pcc_color_free array */
+ for (r = 1; r < mmu_page_sizes; r++) {
+ for (i = 0; i < NPC_MUTEX; i++) {
+ for (mnode = 0; mnode < max_mem_nodes; mnode++) {
+ page_ctrs_cands[i][r][mnode].pcc_color_free_len
+ = colors_per_szc[r];
+ page_ctrs_cands[i][r][mnode].pcc_color_free =
+ (pgcnt_t *)alloc_base;
+ alloc_base += colors_per_szc[r] *
+ sizeof (pgcnt_t);
+ }
+ }
+ }
+
+ /* ctr_mutex */
+ for (i = 0; i < NPC_MUTEX; i++) {
+ ctr_mutex[i] = (kmutex_t *)alloc_base;
+ alloc_base += (max_mem_nodes * sizeof (kmutex_t));
+ }
+
+ /* initialize page list counts */
+ PLCNT_INIT(alloc_base);
+
+ for (mnode = 0; mnode < max_mem_nodes; mnode++) {
+
+ pgcnt_t r_pgcnt;
+ pfn_t r_base;
+ pgcnt_t r_align;
+ int r_shift;
+
+ if (mem_node_config[mnode].exists == 0)
+ continue;
+
+ for (r = 1; r < mmu_page_sizes; r++) {
+ /*
+ * The page_counters base has to be aligned to the
+ * page count of page size code r, otherwise the counts
+ * will cross large page boundaries.
+ */
+ r_align = page_get_pagecnt(r);
+ r_base = mem_node_config[mnode].physbase;
+ /* base needs to be aligned - lower to aligned value */
+ r_base &= ~(r_align - 1);
+ r_pgcnt = howmany(mem_node_config[mnode].physmax -
+ r_base, r_align);
+ r_shift = PAGE_BSZS_SHIFT(r);
+
+ PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
+ PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
+ PAGE_COUNTERS_BASE(mnode, r) = r_base;
+ PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, r) =
+ colors_per_szc[r];
+ PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r) =
+ (size_t *)alloc_base;
+ alloc_base += (sizeof (size_t) * colors_per_szc[r]);
+ for (i = 0; i < colors_per_szc[r]; i++) {
+ PAGE_COUNTERS_CURRENT_COLOR(mnode, r, i) = i;
+ }
+ PAGE_COUNTERS_COUNTERS(mnode, r) =
+ (hpmctr_t *)alloc_base;
+ /*
+ * Round up to make alloc_base always be aligned on
+ * a pointer boundary.
+ */
+ alloc_base += P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
+ sizeof (hpmctr_t *));
+
+ /*
+ * Verify that PNUM_TO_IDX and IDX_TO_PNUM
+ * satisfy the identity requirement.
+ * We should be able to go from one to the other
+ * and get consistent values.
+ */
+ ASSERT(PNUM_TO_IDX(mnode, r,
+ (IDX_TO_PNUM(mnode, r, 0))) == 0);
+ ASSERT(IDX_TO_PNUM(mnode, r,
+ (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
+ }
+ /*
+ * Roundup the start address of the page_counters to
+ * cache aligned boundary for every memory node.
+ * page_ctrs_sz() has added some slop for these roundups.
+ */
+ alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
+ L2CACHE_ALIGN);
+ }
+
+ /* Initialize other page counter specific data structures. */
+ for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
+ rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
+ }
+
+ return (alloc_base);
+}
+
+/*
+ * Functions to adjust region counters for each size free list.
+ * Caller is responsible to acquire the ctr_mutex lock if necessary and
+ * thus can be called during startup without locks.
+ */
+/* ARGSUSED */
+void
+page_ctr_add_internal(int mnode, page_t *pp, int flags)
+{
+ ssize_t r; /* region size */
+ ssize_t idx;
+ pfn_t pfnum;
+ int lckidx;
+
+ ASSERT(pp->p_szc < mmu_page_sizes);
+
+ PLCNT_INCR(pp, mnode, pp->p_szc, flags);
+
+ /* no counter update needed for largest page size */
+ if (pp->p_szc >= mmu_page_sizes - 1) {
+ return;
+ }
+
+ r = pp->p_szc + 1;
+ pfnum = pp->p_pagenum;
+ lckidx = PP_CTR_LOCK_INDX(pp);
+
+ /*
+ * Increment the count of free pages for the current
+ * region. Continue looping up in region size, incrementing the
+ * count if the preceding region is full.
+ */
+ while (r < mmu_page_sizes) {
+ idx = PNUM_TO_IDX(mnode, r, pfnum);
+
+ ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
+ ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));
+
+ if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r))
+ break;
+
+ page_ctrs_cands[lckidx][r][mnode].pcc_pages_free++;
+ page_ctrs_cands[lckidx][r][mnode].
+ pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
+ r++;
+ }
+}
+
+void
+page_ctr_add(page_t *pp, int flags)
+{
+ int lckidx = PP_CTR_LOCK_INDX(pp);
+ int mnode = PP_2_MEM_NODE(pp);
+ kmutex_t *lock = &ctr_mutex[lckidx][mnode];
+
+ mutex_enter(lock);
+ page_ctr_add_internal(mnode, pp, flags);
+ mutex_exit(lock);
+}
+
+void
+page_ctr_sub(page_t *pp, int flags)
+{
+ int lckidx;
+ int mnode = PP_2_MEM_NODE(pp);
+ kmutex_t *lock;
+ ssize_t r; /* region size */
+ ssize_t idx;
+ pfn_t pfnum;
+
+ ASSERT(pp->p_szc < mmu_page_sizes);
+
+ PLCNT_DECR(pp, mnode, pp->p_szc, flags);
+
+ /* no counter update needed for largest page size */
+ if (pp->p_szc >= mmu_page_sizes - 1) {
+ return;
+ }
+
+ r = pp->p_szc + 1;
+ pfnum = pp->p_pagenum;
+ lckidx = PP_CTR_LOCK_INDX(pp);
+ lock = &ctr_mutex[lckidx][mnode];
+
+ /*
+ * Decrement the count of free pages for the current
+ * region. Continue looping up in region size, decrementing the
+ * count if the preceding region was full.
+ */
+ mutex_enter(lock);
+ while (r < mmu_page_sizes) {
+ idx = PNUM_TO_IDX(mnode, r, pfnum);
+
+ ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
+ ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);
+
+ if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
+ break;
+ }
+ ASSERT(page_ctrs_cands[lckidx][r][mnode].pcc_pages_free != 0);
+ ASSERT(page_ctrs_cands[lckidx][r][mnode].
+ pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);
+
+ page_ctrs_cands[lckidx][r][mnode].pcc_pages_free--;
+ page_ctrs_cands[lckidx][r][mnode].
+ pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
+ r++;
+ }
+ mutex_exit(lock);
+}
+
+/*
+ * Adjust page counters following a memory attach, since typically the
+ * size of the array needs to change, and the PFN to counter index
+ * mapping needs to change.
+ */
+uint_t
+page_ctrs_adjust(int mnode)
+{
+ pgcnt_t npgs;
+ int r; /* region size */
+ int i;
+ size_t pcsz, old_csz;
+ hpmctr_t *new_ctr, *old_ctr;
+ pfn_t oldbase, newbase;
+ size_t old_npgs;
+ hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
+ size_t size_cache[MMU_PAGE_SIZES];
+ size_t *color_cache[MMU_PAGE_SIZES];
+ size_t *old_color_array;
+ pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
+
+ newbase = mem_node_config[mnode].physbase & ~PC_BASE_ALIGN_MASK;
+ npgs = roundup(mem_node_config[mnode].physmax,
+ PC_BASE_ALIGN) - newbase;
+
+ /*
+ * We need to determine how many page colors there are for each
+ * page size in order to allocate memory for any color specific
+ * arrays.
+ */
+ colors_per_szc[0] = page_colors;
+ for (r = 1; r < mmu_page_sizes; r++) {
+ colors_per_szc[r] =
+ page_convert_color(0, r, page_colors - 1) + 1;
+ }
+
+ /*
+ * Preallocate all of the new hpm_counters arrays as we can't
+ * hold the page_ctrs_rwlock as a writer and allocate memory.
+ * If we can't allocate all of the arrays, undo our work so far
+ * and return failure.
+ */
+ for (r = 1; r < mmu_page_sizes; r++) {
+ pcsz = npgs >> PAGE_BSZS_SHIFT(r);
+
+ ctr_cache[r] = kmem_zalloc(pcsz *
+ sizeof (hpmctr_t), KM_NOSLEEP);
+ if (ctr_cache[r] == NULL) {
+ while (--r >= 1) {
+ kmem_free(ctr_cache[r],
+ size_cache[r] * sizeof (hpmctr_t));
+ }
+ return (ENOMEM);
+ }
+ size_cache[r] = pcsz;
+ }
+ /*
+ * Preallocate all of the new color current arrays as we can't
+ * hold the page_ctrs_rwlock as a writer and allocate memory.
+ * If we can't allocate all of the arrays, undo our work so far
+ * and return failure.
+ */
+ for (r = 1; r < mmu_page_sizes; r++) {
+ color_cache[r] = kmem_zalloc(sizeof (size_t) *
+ colors_per_szc[r], KM_NOSLEEP);
+ if (color_cache[r] == NULL) {
+ while (--r >= 1) {
+ kmem_free(color_cache[r],
+ colors_per_szc[r] * sizeof (size_t));
+ }
+ for (r = 1; r < mmu_page_sizes; r++) {
+ kmem_free(ctr_cache[r],
+ size_cache[r] * sizeof (hpmctr_t));
+ }
+ return (ENOMEM);
+ }
+ }
+
+ /*
+ * Grab the write lock to prevent others from walking these arrays
+ * while we are modifying them.
+ */
+ rw_enter(&page_ctrs_rwlock[mnode], RW_WRITER);
+ page_freelist_lock(mnode);
+ for (r = 1; r < mmu_page_sizes; r++) {
+ PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
+ old_ctr = PAGE_COUNTERS_COUNTERS(mnode, r);
+ old_csz = PAGE_COUNTERS_ENTRIES(mnode, r);
+ oldbase = PAGE_COUNTERS_BASE(mnode, r);
+ old_npgs = old_csz << PAGE_COUNTERS_SHIFT(mnode, r);
+ old_color_array = PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r);
+
+ pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
+ new_ctr = ctr_cache[r];
+ ctr_cache[r] = NULL;
+ if (old_ctr != NULL &&
+ (oldbase + old_npgs > newbase) &&
+ (newbase + npgs > oldbase)) {
+ /*
+ * Map the intersection of the old and new
+ * counters into the new array.
+ */
+ size_t offset;
+ if (newbase > oldbase) {
+ offset = (newbase - oldbase) >>
+ PAGE_COUNTERS_SHIFT(mnode, r);
+ bcopy(old_ctr + offset, new_ctr,
+ MIN(pcsz, (old_csz - offset)) *
+ sizeof (hpmctr_t));
+ } else {
+ offset = (oldbase - newbase) >>
+ PAGE_COUNTERS_SHIFT(mnode, r);
+ bcopy(old_ctr, new_ctr + offset,
+ MIN(pcsz - offset, old_csz) *
+ sizeof (hpmctr_t));
+ }
+ }
+
+ PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
+ PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
+ PAGE_COUNTERS_BASE(mnode, r) = newbase;
+ PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, r) = colors_per_szc[r];
+ PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r) = color_cache[r];
+ color_cache[r] = NULL;
+ /*
+ * For now, just reset on these events as it's probably
+ * not worthwhile to try to optimize this.
+ */
+ for (i = 0; i < colors_per_szc[r]; i++) {
+ PAGE_COUNTERS_CURRENT_COLOR(mnode, r, i) = i;
+ }
+
+ /* cache info for freeing out of the critical path */
+ if ((caddr_t)old_ctr >= kernelheap &&
+ (caddr_t)old_ctr < ekernelheap) {
+ ctr_cache[r] = old_ctr;
+ size_cache[r] = old_csz;
+ }
+ if ((caddr_t)old_color_array >= kernelheap &&
+ (caddr_t)old_color_array < ekernelheap) {
+ color_cache[r] = old_color_array;
+ }
+ /*
+ * Verify that PNUM_TO_IDX and IDX_TO_PNUM
+ * satisfy the identity requirement.
+ * We should be able to go from one to the other
+ * and get consistent values.
+ */
+ ASSERT(PNUM_TO_IDX(mnode, r,
+ (IDX_TO_PNUM(mnode, r, 0))) == 0);
+ ASSERT(IDX_TO_PNUM(mnode, r,
+ (PNUM_TO_IDX(mnode, r, newbase))) == newbase);
+ }
+ page_freelist_unlock(mnode);
+ rw_exit(&page_ctrs_rwlock[mnode]);
+
+ /*
+ * Now that we have dropped the write lock, it is safe to free all
+ * of the memory we have cached above.
+ */
+ for (r = 1; r < mmu_page_sizes; r++) {
+ if (ctr_cache[r] != NULL) {
+ kmem_free(ctr_cache[r],
+ size_cache[r] * sizeof (hpmctr_t));
+ }
+ if (color_cache[r] != NULL) {
+ kmem_free(color_cache[r],
+ colors_per_szc[r] * sizeof (size_t));
+ }
+ }
+ return (0);
+}
+
+/*
+ * color contains a valid color index or bin for cur_szc
+ */
+uint_t
+page_convert_color(uchar_t cur_szc, uchar_t new_szc, uint_t color)
+{
+ uint_t shift;
+
+ if (cur_szc > new_szc) {
+ shift = page_get_shift(cur_szc) - page_get_shift(new_szc);
+ return (color << shift);
+ } else if (cur_szc < new_szc) {
+ shift = page_get_shift(new_szc) - page_get_shift(cur_szc);
+ return (color >> shift);
+ }
+ return (color);
+}
+
+#ifdef DEBUG
+
+/*
+ * confirm pp is a large page corresponding to szc
+ */
+void
+chk_lpg(page_t *pp, uchar_t szc)
+{
+ spgcnt_t npgs = page_get_pagecnt(pp->p_szc);
+ uint_t noreloc;
+
+ if (npgs == 1) {
+ ASSERT(pp->p_szc == 0);
+ ASSERT(pp->p_next == pp);
+ ASSERT(pp->p_prev == pp);
+ return;
+ }
+
+ ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
+ ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
+
+ ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs));
+ ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1));
+ ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1)));
+ ASSERT(pp->p_prev == (pp + (npgs - 1)));
+
+ /*
+ * Check list of pages.
+ */
+ noreloc = PP_ISNORELOC(pp);
+ while (npgs--) {
+ if (npgs != 0) {
+ ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
+ ASSERT(pp->p_next == (pp + 1));
+ }
+ ASSERT(pp->p_szc == szc);
+ ASSERT(PP_ISFREE(pp));
+ ASSERT(PP_ISAGED(pp));
+ ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
+ ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
+ ASSERT(pp->p_vnode == NULL);
+ ASSERT(PP_ISNORELOC(pp) == noreloc);
+
+ pp = pp->p_next;
+ }
+}
+#endif /* DEBUG */
+
+void
+page_freelist_lock(int mnode)
+{
+ int i;
+ for (i = 0; i < NPC_MUTEX; i++) {
+ mutex_enter(FPC_MUTEX(mnode, i));
+ mutex_enter(CPC_MUTEX(mnode, i));
+ }
+}
+
+void
+page_freelist_unlock(int mnode)
+{
+ int i;
+ for (i = 0; i < NPC_MUTEX; i++) {
+ mutex_exit(FPC_MUTEX(mnode, i));
+ mutex_exit(CPC_MUTEX(mnode, i));
+ }
+}
+
+/*
+ * add pp to the specified page list. Defaults to head of the page list
+ * unless PG_LIST_TAIL is specified.
+ */
+void
+page_list_add(page_t *pp, int flags)
+{
+ page_t **ppp;
+ kmutex_t *pcm;
+ uint_t bin, mtype;
+ int mnode;
+
+ ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
+ ASSERT(PP_ISFREE(pp));
+ ASSERT(!hat_page_is_mapped(pp));
+ ASSERT(hat_page_getshare(pp) == 0);
+
+ /*
+ * Large pages should be freed via page_list_add_pages().
+ */
+ ASSERT(pp->p_szc == 0);
+
+ /*
+ * Don't need to lock the freelist first here
+ * because the page isn't on the freelist yet.
+ * This means p_szc can't change on us.
+ */
+
+ bin = PP_2_BIN(pp);
+ mnode = PP_2_MEM_NODE(pp);
+ mtype = PP_2_MTYPE(pp);
+
+ if (flags & PG_LIST_ISINIT) {
+ /*
+ * PG_LIST_ISINIT is set during system startup (i.e. single
+ * threaded); add a page to the free list and add to the
+ * free region counters w/o any locking
+ */
+ ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
+
+ /* inline version of page_add() */
+ if (*ppp != NULL) {
+ pp->p_next = *ppp;
+ pp->p_prev = (*ppp)->p_prev;
+ (*ppp)->p_prev = pp;
+ pp->p_prev->p_next = pp;
+ } else
+ *ppp = pp;
+
+ page_ctr_add_internal(mnode, pp, flags);
+ } else {
+ pcm = PC_BIN_MUTEX(mnode, bin, flags);
+
+ if (flags & PG_FREE_LIST) {
+ ASSERT(PP_ISAGED(pp));
+ ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
+
+ } else {
+ ASSERT(pp->p_vnode);
+ ASSERT((pp->p_offset & PAGEOFFSET) == 0);
+ ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
+ }
+ mutex_enter(pcm);
+ page_add(ppp, pp);
+
+ if (flags & PG_LIST_TAIL)
+ *ppp = (*ppp)->p_next;
+ /*
+ * Add counters before releasing pcm mutex to avoid a race with
+ * page_freelist_coalesce and page_freelist_fill.
+ */
+ page_ctr_add(pp, flags);
+ mutex_exit(pcm);
+ }
+
+
+#if defined(__sparc)
+ if (PP_ISNORELOC(pp)) {
+ kcage_freemem_add(1);
+ }
+#endif
+ /*
+ * It is up to the caller to unlock the page!
+ */
+ ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
+}
+
+
+#ifdef __sparc
+/*
+ * This routine is only used by kcage_init during system startup.
+ * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add
+ * without the overhead of taking locks and updating counters.
+ */
+void
+page_list_noreloc_startup(page_t *pp)
+{
+ page_t **ppp;
+ uint_t bin;
+ int mnode;
+ int mtype;
+ int flags = PG_LIST_ISCAGE;
+
+ /*
+ * If this is a large page on the freelist then
+ * break it up into smaller pages.
+ */
+ if (pp->p_szc != 0)
+ page_boot_demote(pp);
+
+ /*
+ * Get list page is currently on.
+ */
+ bin = PP_2_BIN(pp);
+ mnode = PP_2_MEM_NODE(pp);
+ mtype = PP_2_MTYPE(pp);
+ ASSERT(mtype == MTYPE_RELOC);
+ ASSERT(pp->p_szc == 0);
+
+ if (PP_ISAGED(pp)) {
+ ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
+ flags |= PG_FREE_LIST;
+ } else {
+ ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
+ flags |= PG_CACHE_LIST;
+ }
+
+ ASSERT(*ppp != NULL);
+
+ /*
+ * Delete page from current list.
+ */
+ if (*ppp == pp)
+ *ppp = pp->p_next; /* go to next page */
+ if (*ppp == pp) {
+ *ppp = NULL; /* page list is gone */
+ } else {
+ pp->p_prev->p_next = pp->p_next;
+ pp->p_next->p_prev = pp->p_prev;
+ }
+
+ /* LINTED */
+ PLCNT_DECR(pp, mnode, 0, flags);
+
+ /*
+ * Set no reloc for cage initted pages.
+ */
+ PP_SETNORELOC(pp);
+
+ mtype = PP_2_MTYPE(pp);
+ ASSERT(mtype == MTYPE_NORELOC);
+
+ /*
+ * Get new list for page.
+ */
+ if (PP_ISAGED(pp)) {
+ ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
+ } else {
+ ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
+ }
+
+ /*
+ * Insert page on new list.
+ */
+ if (*ppp == NULL) {
+ *ppp = pp;
+ pp->p_next = pp->p_prev = pp;
+ } else {
+ pp->p_next = *ppp;
+ pp->p_prev = (*ppp)->p_prev;
+ (*ppp)->p_prev = pp;
+ pp->p_prev->p_next = pp;
+ }
+
+ /* LINTED */
+ PLCNT_INCR(pp, mnode, 0, flags);
+
+ /*
+ * Update cage freemem counter
+ */
+ atomic_add_long(&kcage_freemem, 1);
+}
+#else /* __sparc */
+
+/* ARGSUSED */
+void
+page_list_noreloc_startup(page_t *pp)
+{
+ panic("page_list_noreloc_startup: should be here only for sparc");
+}
+#endif
+
+void
+page_list_add_pages(page_t *pp, int flags)
+{
+ kmutex_t *pcm;
+ pgcnt_t pgcnt;
+ uint_t bin, mtype, i;
+ int mnode;
+
+ /* default to freelist/head */
+ ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);
+
+ CHK_LPG(pp, pp->p_szc);
+ VM_STAT_ADD(vmm_vmstats.pc_list_add_pages[pp->p_szc]);
+
+ bin = PP_2_BIN(pp);
+ mnode = PP_2_MEM_NODE(pp);
+ mtype = PP_2_MTYPE(pp);
+
+ if (flags & PG_LIST_ISINIT) {
+ ASSERT(pp->p_szc == mmu_page_sizes - 1);
+ page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
+ ASSERT(!PP_ISNORELOC(pp));
+ PLCNT_INCR(pp, mnode, pp->p_szc, flags);
+ } else {
+
+ ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
+
+ pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
+
+ mutex_enter(pcm);
+ page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
+ page_ctr_add(pp, PG_FREE_LIST);
+ mutex_exit(pcm);
+
+ pgcnt = page_get_pagecnt(pp->p_szc);
+#if defined(__sparc)
+ if (PP_ISNORELOC(pp))
+ kcage_freemem_add(pgcnt);
+#endif
+ for (i = 0; i < pgcnt; i++, pp++)
+ page_unlock(pp);
+ }
+}
+
+/*
+ * During boot, we need to demote a large page to base
+ * pagesize pages for seg_kmem for use in boot_alloc()
+ */
+void
+page_boot_demote(page_t *pp)
+{
+ ASSERT(pp->p_szc != 0);
+ ASSERT(PP_ISFREE(pp));
+ ASSERT(PP_ISAGED(pp));
+
+ (void) page_demote(PP_2_MEM_NODE(pp),
+ PFN_BASE(pp->p_pagenum, pp->p_szc), pp->p_szc, 0, PC_NO_COLOR,
+ PC_FREE);
+
+ ASSERT(PP_ISFREE(pp));
+ ASSERT(PP_ISAGED(pp));
+ ASSERT(pp->p_szc == 0);
+}
+
+/*
+ * Take a particular page off of whatever freelist the page
+ * is claimed to be on.
+ *
+ * NOTE: Only used for PAGESIZE pages.
+ */
+void
+page_list_sub(page_t *pp, int flags)
+{
+ int bin;
+ uint_t mtype;
+ int mnode;
+ kmutex_t *pcm;
+ page_t **ppp;
+
+ ASSERT(PAGE_EXCL(pp));
+ ASSERT(PP_ISFREE(pp));
+
+ /*
+ * The p_szc field can only be changed by page_promote()
+ * and page_demote(). Only free pages can be promoted and
+ * demoted and the free list MUST be locked during these
+ * operations. So to prevent a race in page_list_sub()
+ * between computing which bin of the freelist lock to
+ * grab and actually grabbing the lock, we check again that
+ * the bin we locked is still the correct one. Notice that
+ * the p_szc field could have actually changed on us but
+ * if the bin happens to still be the same we are safe.
+ */
+try_again:
+ bin = PP_2_BIN(pp);
+ mnode = PP_2_MEM_NODE(pp);
+ pcm = PC_BIN_MUTEX(mnode, bin, flags);
+ mutex_enter(pcm);
+ if (PP_2_BIN(pp) != bin) {
+ mutex_exit(pcm);
+ goto try_again;
+ }
+ mtype = PP_2_MTYPE(pp);
+
+ if (flags & PG_FREE_LIST) {
+ ASSERT(PP_ISAGED(pp));
+ ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
+ } else {
+ ASSERT(!PP_ISAGED(pp));
+ ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
+ }
+
+ /*
+ * Common PAGESIZE case.
+ *
+ * Note that we locked the freelist. This prevents
+ * any page promotion/demotion operations. Therefore
+ * the p_szc will not change until we drop pcm mutex.
+ */
+ if (pp->p_szc == 0) {
+ page_sub(ppp, pp);
+ /*
+ * Subtract counters before releasing pcm mutex
+ * to avoid race with page_freelist_coalesce.
+ */
+ page_ctr_sub(pp, flags);
+ mutex_exit(pcm);
+
+#if defined(__sparc)
+ if (PP_ISNORELOC(pp)) {
+ kcage_freemem_sub(1);
+ }
+#endif
+ return;
+ }
+
+ /*
+ * Large pages on the cache list are not supported.
+ */
+ if (flags & PG_CACHE_LIST)
+ panic("page_list_sub: large page on cachelist");
+
+ /*
+ * Slow but rare.
+ *
+ * Somebody wants this particular page which is part
+ * of a large page. In this case we just demote the page
+ * if it's on the freelist.
+ *
+ * We have to drop pcm before locking the entire freelist.
+ * Once we have re-locked the freelist check to make sure
+ * the page hasn't already been demoted or completely
+ * freed.
+ */
+ mutex_exit(pcm);
+ page_freelist_lock(mnode);
+ if (pp->p_szc != 0) {
+ /*
+ * Large page is on freelist.
+ */
+ (void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
+ pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
+ }
+ ASSERT(PP_ISFREE(pp));
+ ASSERT(PP_ISAGED(pp));
+ ASSERT(pp->p_szc == 0);
+
+ /*
+ * Subtract counters before releasing pcm mutex
+ * to avoid race with page_freelist_coalesce.
+ */
+ bin = PP_2_BIN(pp);
+ mtype = PP_2_MTYPE(pp);
+ ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
+
+ page_sub(ppp, pp);
+ page_ctr_sub(pp, flags);
+ page_freelist_unlock(mnode);
+
+#if defined(__sparc)
+ if (PP_ISNORELOC(pp)) {
+ kcage_freemem_sub(1);
+ }
+#endif
+}
+
+void
+page_list_sub_pages(page_t *pp, uint_t szc)
+{
+ kmutex_t *pcm;
+ uint_t bin, mtype;
+ int mnode;
+
+ ASSERT(PAGE_EXCL(pp));
+ ASSERT(PP_ISFREE(pp));
+ ASSERT(PP_ISAGED(pp));
+
+ /*
+ * See comment in page_list_sub().
+ */
+try_again:
+ bin = PP_2_BIN(pp);
+ mnode = PP_2_MEM_NODE(pp);
+ pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
+ mutex_enter(pcm);
+ if (PP_2_BIN(pp) != bin) {
+ mutex_exit(pcm);
+ goto try_again;
+ }
+
+ VM_STAT_ADD(vmm_vmstats.pc_list_sub_pages1[pp->p_szc]);
+
+ /*
+ * If we're called with a page larger than szc or it got
+ * promoted above szc before we locked the freelist then
+ * drop pcm and re-lock entire freelist. If page still larger
+ * than szc then demote it.
+ */
+ if (pp->p_szc > szc) {
+ VM_STAT_ADD(vmm_vmstats.pc_list_sub_pages2[pp->p_szc]);
+ mutex_exit(pcm);
+ pcm = NULL;
+ page_freelist_lock(mnode);
+ if (pp->p_szc > szc) {
+ VM_STAT_ADD(vmm_vmstats.pc_list_sub_pages3[pp->p_szc]);
+ (void) page_demote(mnode,
+ PFN_BASE(pp->p_pagenum, pp->p_szc),
+ pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
+ }
+ bin = PP_2_BIN(pp);
+ }
+ ASSERT(PP_ISFREE(pp));
+ ASSERT(PP_ISAGED(pp));
+ ASSERT(pp->p_szc <= szc);
+ ASSERT(pp == PP_PAGEROOT(pp));
+
+ mtype = PP_2_MTYPE(pp);
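+	/*
+	 * PAGESIZE pages are linked on the freelist via p_next/p_prev;
+	 * large pages are linked via the p_vpnext/p_vpprev fields of
+	 * the root constituent page.
+	 */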
+ if (pp->p_szc != 0) {
+ page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
+ CHK_LPG(pp, pp->p_szc);
+ } else {
+ page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
+ }
+ page_ctr_sub(pp, PG_FREE_LIST);
+
+ if (pcm != NULL) {
+ mutex_exit(pcm);
+ } else {
+ page_freelist_unlock(mnode);
+ }
+
+#if defined(__sparc)
+ if (PP_ISNORELOC(pp)) {
+ pgcnt_t pgcnt;
+
+ pgcnt = page_get_pagecnt(pp->p_szc);
+ kcage_freemem_sub(pgcnt);
+ }
+#endif
+}
+
+/*
+ * Add the page to the front of a linked list of pages
+ * using the p_next & p_prev pointers for the list.
+ * The caller is responsible for protecting the list pointers.
+ */
+void
+mach_page_add(page_t **ppp, page_t *pp)
+{
+ if (*ppp == NULL) {
+ pp->p_next = pp->p_prev = pp;
+ } else {
+ pp->p_next = *ppp;
+ pp->p_prev = (*ppp)->p_prev;
+ (*ppp)->p_prev = pp;
+ pp->p_prev->p_next = pp;
+ }
+ *ppp = pp;
+}
+
+/*
+ * Remove this page from a linked list of pages
+ * using the p_next & p_prev pointers for the list.
+ *
+ * The caller is responsible for protecting the list pointers.
+ */
+void
+mach_page_sub(page_t **ppp, page_t *pp)
+{
+ ASSERT(PP_ISFREE(pp));
+
+ if (*ppp == NULL || pp == NULL)
+ panic("mach_page_sub");
+
+ if (*ppp == pp)
+ *ppp = pp->p_next; /* go to next page */
+
+ if (*ppp == pp)
+ *ppp = NULL; /* page list is gone */
+ else {
+ pp->p_prev->p_next = pp->p_next;
+ pp->p_next->p_prev = pp->p_prev;
+ }
+ pp->p_prev = pp->p_next = pp; /* make pp a list of one */
+}
+
+/*
+ * Routine fsflush uses to gradually coalesce the free list into larger pages.
+ */
+void
+page_promote_size(page_t *pp, uint_t cur_szc)
+{
+ pfn_t pfn;
+ int mnode;
+ int idx;
+ int new_szc = cur_szc + 1;
+ int full = FULL_REGION_CNT(new_szc);
+
+ pfn = page_pptonum(pp);
+ mnode = PFN_2_MEM_NODE(pfn);
+
+ page_freelist_lock(mnode);
+
+ idx = PNUM_TO_IDX(mnode, new_szc, pfn);
+ if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
+ (void) page_promote(mnode, pfn, new_szc, PC_FREE);
+
+ page_freelist_unlock(mnode);
+}
+
+static uint_t page_promote_err;
+static uint_t page_promote_noreloc_err;
+
+/*
+ * Create a single larger page (of szc new_szc) from smaller contiguous pages
+ * for the given mnode starting at pfnum. Pages involved are on the freelist
+ * before the call and may be returned to the caller if requested, otherwise
+ * they will be placed back on the freelist.
+ * If flags is PC_ALLOC, then the large page will be returned to the user in
+ * a state which is consistent with a page being taken off the freelist. If
+ * we failed to lock the new large page, then we will return NULL to the
+ * caller and put the large page on the freelist instead.
+ * If flags is PC_FREE, then the large page will be placed on the freelist,
+ * and NULL will be returned.
+ * The caller is responsible for locking the freelist as well as any other
+ * accounting which needs to be done for a returned page.
+ *
+ * RFE: For performance pass in pp instead of pfnum so
+ * we can avoid excessive calls to page_numtopp_nolock().
+ * This would depend on an assumption that all contiguous
+ * pages are in the same memseg so we can just add/dec
+ * our pp.
+ *
+ * Lock ordering:
+ *
+ * There is a potential but rare deadlock situation
+ * for page promotion and demotion operations. The problem
+ * is there are two paths into the freelist manager and
+ * they have different lock orders:
+ *
+ * page_create()
+ * lock freelist
+ * page_lock(EXCL)
+ * unlock freelist
+ * return
+ * caller drops page_lock
+ *
+ * page_free() and page_reclaim()
+ * caller grabs page_lock(EXCL)
+ *
+ * lock freelist
+ * unlock freelist
+ * drop page_lock
+ *
+ * What prevents a thread in page_create() from deadlocking
+ * with a thread freeing or reclaiming the same page is the
+ * page_trylock() in page_get_freelist(). If the trylock fails
+ * it skips the page.
+ *
+ * The lock ordering for promotion and demotion is the same as
+ * for page_create(). Since the same deadlock could occur during
+ * page promotion and freeing or reclaiming of a page on the
+ * cache list, we might have to fail the operation and undo what
+ * we have done so far. Again this is rare.
+ */
+page_t *
+page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags)
+{
+ page_t *pp, *pplist, *tpp, *start_pp;
+ pgcnt_t new_npgs, npgs;
+ uint_t bin;
+ pgcnt_t tmpnpgs, pages_left;
+ uint_t mtype;
+ uint_t noreloc;
+ uint_t i;
+ int which_list;
+ ulong_t index;
+ kmutex_t *phm;
+
+ /*
+ * General algorithm:
+ * Find the starting page
+ * Walk each page struct removing it from the freelist,
+ * and linking it to all the other pages removed.
+ * Once all pages are off the freelist,
+	 *    walk the list, modifying p_szc to new_szc and whatever
+	 *    other info needs to be done to create a large free page.
+ * According to the flags, either return the page or put it
+ * on the freelist.
+ */
+
+ start_pp = page_numtopp_nolock(pfnum);
+ ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
+ new_npgs = page_get_pagecnt(new_szc);
+ ASSERT(IS_P2ALIGNED(pfnum, new_npgs));
+
+ /*
+ * Loop through smaller pages to confirm that all pages
+ * give the same result for PP_ISNORELOC().
+ * We can check this reliably here as the protocol for setting
+ * P_NORELOC requires pages to be taken off the free list first.
+ */
+ for (i = 0, pp = start_pp; i < new_npgs; i++, pp++) {
+ if (pp == start_pp) {
+ /* First page, set requirement. */
+ noreloc = PP_ISNORELOC(pp);
+ } else if (noreloc != PP_ISNORELOC(pp)) {
+ page_promote_noreloc_err++;
+ page_promote_err++;
+ return (NULL);
+ }
+ }
+
+ pages_left = new_npgs;
+ pplist = NULL;
+ pp = start_pp;
+
+ /* Loop around coalescing the smaller pages into a big page. */
+ while (pages_left) {
+ /*
+ * Remove from the freelist.
+ */
+ ASSERT(PP_ISFREE(pp));
+ bin = PP_2_BIN(pp);
+ ASSERT(mnode == PP_2_MEM_NODE(pp));
+ mtype = PP_2_MTYPE(pp);
+ if (PP_ISAGED(pp)) {
+
+ /*
+ * PG_FREE_LIST
+ */
+ if (pp->p_szc) {
+ page_vpsub(&PAGE_FREELISTS(mnode,
+ pp->p_szc, bin, mtype), pp);
+ } else {
+ mach_page_sub(&PAGE_FREELISTS(mnode, 0,
+ bin, mtype), pp);
+ }
+ which_list = PG_FREE_LIST;
+ } else {
+ ASSERT(pp->p_szc == 0);
+
+ /*
+ * PG_CACHE_LIST
+ *
+ * Since this page comes from the
+ * cachelist, we must destroy the
+ * vnode association.
+ */
+ if (!page_trylock(pp, SE_EXCL)) {
+ goto fail_promote;
+ }
+
+ /*
+ * We need to be careful not to deadlock
+ * with another thread in page_lookup().
+ * The page_lookup() thread could be holding
+ * the same phm that we need if the two
+ * pages happen to hash to the same phm lock.
+ * At this point we have locked the entire
+ * freelist and page_lookup() could be trying
+ * to grab a freelist lock.
+ */
+ index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset);
+ phm = PAGE_HASH_MUTEX(index);
+ if (!mutex_tryenter(phm)) {
+ page_unlock(pp);
+ goto fail_promote;
+ }
+
+ mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp);
+ page_hashout(pp, phm);
+ mutex_exit(phm);
+ PP_SETAGED(pp);
+ page_unlock(pp);
+ which_list = PG_CACHE_LIST;
+ }
+ page_ctr_sub(pp, which_list);
+
+ /*
+ * Concatenate the smaller page(s) onto
+ * the large page list.
+ */
+ tmpnpgs = npgs = page_get_pagecnt(pp->p_szc);
+ pages_left -= npgs;
+ tpp = pp;
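+		/*
+		 * Stamp the new size code on each constituent page
+		 * before concatenating them onto the large page list.
+		 */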
+ while (npgs--) {
+ tpp->p_szc = new_szc;
+ tpp = tpp->p_next;
+ }
+ page_list_concat(&pplist, &pp);
+ pp += tmpnpgs;
+ }
+ CHK_LPG(pplist, new_szc);
+
+ /*
+ * return the page to the user if requested
+ * in the properly locked state.
+ */
+ if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) {
+ return (pplist);
+ }
+
+ /*
+ * Otherwise place the new large page on the freelist
+ */
+ bin = PP_2_BIN(pplist);
+ mnode = PP_2_MEM_NODE(pplist);
+ mtype = PP_2_MTYPE(pplist);
+ page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist);
+
+ page_ctr_add(pplist, PG_FREE_LIST);
+ return (NULL);
+
+fail_promote:
+ /*
+ * A thread must have still been freeing or
+ * reclaiming the page on the cachelist.
+ * To prevent a deadlock undo what we have
+	 * done so far and return failure. This
+ * situation can only happen while promoting
+ * PAGESIZE pages.
+ */
+ page_promote_err++;
+ while (pplist) {
+ pp = pplist;
+ mach_page_sub(&pplist, pp);
+ pp->p_szc = 0;
+ bin = PP_2_BIN(pp);
+ mtype = PP_2_MTYPE(pp);
+ mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp);
+ page_ctr_add(pp, PG_FREE_LIST);
+ }
+ return (NULL);
+
+}
+
+/*
+ * Break up a large page into smaller size pages.
+ * Pages involved are on the freelist before the call and may
+ * be returned to the caller if requested, otherwise they will
+ * be placed back on the freelist.
+ * The caller is responsible for locking the freelist as well as any other
+ * accounting which needs to be done for a returned page.
+ * If flags is not PC_ALLOC, the color argument is ignored, and thus
+ * technically, any value may be passed in but PC_NO_COLOR is the standard
+ * which should be followed for clarity's sake.
+ */
+page_t *
+page_demote(int mnode, pfn_t pfnum, uchar_t cur_szc, uchar_t new_szc,
+ int color, int flags)
+{
+ page_t *pp, *pplist, *npplist;
+ pgcnt_t npgs, n;
+ uint_t bin;
+ uint_t mtype;
+ page_t *ret_pp = NULL;
+
+ ASSERT(cur_szc != 0);
+ ASSERT(new_szc < cur_szc);
+
+ pplist = page_numtopp_nolock(pfnum);
+ ASSERT(pplist != NULL);
+
+ ASSERT(pplist->p_szc == cur_szc);
+
+ bin = PP_2_BIN(pplist);
+ ASSERT(mnode == PP_2_MEM_NODE(pplist));
+ mtype = PP_2_MTYPE(pplist);
+ page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist);
+
+ CHK_LPG(pplist, cur_szc);
+ page_ctr_sub(pplist, PG_FREE_LIST);
+
+ /*
+ * Number of PAGESIZE pages for smaller new_szc
+ * page.
+ */
+ npgs = page_get_pagecnt(new_szc);
+
+ while (pplist) {
+ pp = pplist;
+
+ ASSERT(pp->p_szc == cur_szc);
+
+ /*
+ * We either break it up into PAGESIZE pages or larger.
+ */
+ if (npgs == 1) { /* PAGESIZE case */
+ mach_page_sub(&pplist, pp);
+ ASSERT(pp->p_szc == cur_szc);
+ ASSERT(new_szc == 0);
+ ASSERT(mnode == PP_2_MEM_NODE(pp));
+ pp->p_szc = new_szc;
+ bin = PP_2_BIN(pp);
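+			/*
+			 * Hand the first lockable page of the requested
+			 * color back to the caller (PC_ALLOC); all other
+			 * pages go back on the PAGESIZE freelist.
+			 */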
+ if ((bin == color) && (flags == PC_ALLOC) &&
+ (ret_pp == NULL) &&
+ page_trylock_cons(pp, SE_EXCL)) {
+ ret_pp = pp;
+ } else {
+ mtype = PP_2_MTYPE(pp);
+ mach_page_add(&PAGE_FREELISTS(mnode, 0, bin,
+ mtype), pp);
+ page_ctr_add(pp, PG_FREE_LIST);
+ }
+ } else {
+
+ /*
+ * Break down into smaller lists of pages.
+ */
+ page_list_break(&pplist, &npplist, npgs);
+
+ pp = pplist;
+ n = npgs;
+ while (n--) {
+ ASSERT(pp->p_szc == cur_szc);
+ pp->p_szc = new_szc;
+ pp = pp->p_next;
+ }
+
+ CHK_LPG(pplist, new_szc);
+
+ bin = PP_2_BIN(pplist);
+ ASSERT(mnode == PP_2_MEM_NODE(pp));
+ if ((bin == color) && (flags == PC_ALLOC) &&
+ (ret_pp == NULL) &&
+ page_trylock_cons(pp, SE_EXCL)) {
+ ret_pp = pp;
+ } else {
+ mtype = PP_2_MTYPE(pp);
+ page_vpadd(&PAGE_FREELISTS(mnode, new_szc,
+ bin, mtype), pplist);
+
+ page_ctr_add(pplist, PG_FREE_LIST);
+ }
+ pplist = npplist;
+ }
+ }
+ return (ret_pp);
+}
+
+int mpss_coalesce_disable = 0;
+
+/*
+ * Coalesce free pages into a page of the given szc and color if possible.
+ * Return the pointer to the page created, otherwise, return NULL.
+ */
+static page_t *
+page_freelist_coalesce(int mnode, uchar_t szc, int color)
+{
+ int r; /* region size */
+ int idx, full, i;
+ pfn_t pfnum;
+ size_t len;
+ size_t buckets_to_check;
+ pgcnt_t cands;
+ page_t *ret_pp;
+ int color_stride;
+
+ VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce);
+
+ if (mpss_coalesce_disable) {
+ return (NULL);
+ }
+
+ r = szc;
+ PGCTRS_CANDS_GETVALUECOLOR(mnode, r, color, cands);
+ if (cands == 0) {
+ VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip);
+ return (NULL);
+ }
+ full = FULL_REGION_CNT(r);
+ color_stride = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 :
+ page_colors;
+
+ /* Prevent page_counters dynamic memory from being freed */
+ rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
+ len = PAGE_COUNTERS_ENTRIES(mnode, r);
+ buckets_to_check = len / color_stride;
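+	/*
+	 * Resume the scan one color stride past where the previous
+	 * search for this color left off, wrapping back to the first
+	 * bucket of this color when we run off the end.
+	 */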
+ idx = PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color);
+ ASSERT((idx % color_stride) == color);
+ idx += color_stride;
+ if (idx >= len)
+ idx = color;
+ for (i = 0; i < buckets_to_check; i++) {
+ if (PAGE_COUNTERS(mnode, r, idx) == full) {
+ pfnum = IDX_TO_PNUM(mnode, r, idx);
+ ASSERT(pfnum >= mem_node_config[mnode].physbase &&
+ pfnum < mem_node_config[mnode].physmax);
+ /*
+ * RFE: For performance maybe we can do something less
+ * brutal than locking the entire freelist. So far
+ * this doesn't seem to be a performance problem?
+ */
+ page_freelist_lock(mnode);
+ if (PAGE_COUNTERS(mnode, r, idx) != full) {
+ VM_STAT_ADD(vmm_vmstats.page_ctrs_changed);
+ goto skip_this_one;
+ }
+ ret_pp = page_promote(mnode, pfnum, r, PC_ALLOC);
+ if (ret_pp != NULL) {
+ PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color) =
+ idx;
+ page_freelist_unlock(mnode);
+ rw_exit(&page_ctrs_rwlock[mnode]);
+#if defined(__sparc)
+ if (PP_ISNORELOC(ret_pp)) {
+ pgcnt_t npgs;
+
+ npgs = page_get_pagecnt(ret_pp->p_szc);
+ kcage_freemem_sub(npgs);
+ }
+#endif
+ return (ret_pp);
+ }
+skip_this_one:
+ page_freelist_unlock(mnode);
+ /*
+ * No point looking for another page if we've
+ * already tried all of the ones that
+ * page_ctr_cands indicated. Stash off where we left
+ * off.
+ * Note: this is not exact since we don't hold the
+ * page_freelist_locks before we initially get the
+ * value of cands for performance reasons, but should
+ * be a decent approximation.
+ */
+ if (--cands == 0) {
+ PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color) =
+ idx;
+ break;
+ }
+ }
+ idx += color_stride;
+ if (idx >= len)
+ idx = color;
+ }
+ rw_exit(&page_ctrs_rwlock[mnode]);
+ VM_STAT_ADD(vmm_vmstats.page_ctrs_failed);
+ return (NULL);
+}
+
+/*
+ * For the given mnode, promote as many small pages to large pages as possible.
+ */
+void
+page_freelist_coalesce_all(int mnode)
+{
+ int r; /* region size */
+ int idx, full;
+ pfn_t pfnum;
+ size_t len;
+
+ VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all);
+
+ if (mpss_coalesce_disable) {
+ return;
+ }
+
+ /*
+ * Lock the entire freelist and coalesce what we can.
+ *
+ * Always promote to the largest page possible
+ * first to reduce the number of page promotions.
+ */
+ rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
+ page_freelist_lock(mnode);
+ for (r = mmu_page_sizes - 1; r > 0; r--) {
+ pgcnt_t cands;
+
+ PGCTRS_CANDS_GETVALUE(mnode, r, cands);
+ if (cands == 0) {
+ VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip_all);
+ continue;
+ }
+
+ full = FULL_REGION_CNT(r);
+ len = PAGE_COUNTERS_ENTRIES(mnode, r);
+
+ for (idx = 0; idx < len; idx++) {
+ if (PAGE_COUNTERS(mnode, r, idx) == full) {
+ pfnum = IDX_TO_PNUM(mnode, r, idx);
+ ASSERT(pfnum >=
+ mem_node_config[mnode].physbase &&
+ pfnum <
+ mem_node_config[mnode].physmax);
+ (void) page_promote(mnode, pfnum, r, PC_FREE);
+ }
+ }
+ }
+ page_freelist_unlock(mnode);
+ rw_exit(&page_ctrs_rwlock[mnode]);
+}
+
+/*
+ * This is where all policies for moving pages around
+ * to different page size free lists are implemented.
+ * Returns a pointer to the acquired page on success, NULL on failure.
+ *
+ * So far these are the priorities for this algorithm in descending
+ * order:
+ *
+ * 1) When servicing a request try to do so with a free page
+ * from next size up. Helps defer fragmentation as long
+ * as possible.
+ *
+ * 2) Page coalesce on demand. Only when a freelist
+ * larger than PAGESIZE is empty and step 1
+ * will not work since all larger size lists are
+ * also empty.
+ *
+ * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
+ */
+page_t *
+page_freelist_fill(uchar_t szc, int color, int mnode, int mtype, pfn_t pfnhi)
+{
+ uchar_t nszc = szc + 1;
+ int bin;
+ page_t *pp, *firstpp;
+ page_t *ret_pp = NULL;
+
+ ASSERT(szc < mmu_page_sizes);
+
+ /*
+ * First try to break up a larger page to fill
+ * current size freelist.
+ */
+ while (nszc < mmu_page_sizes) {
+ /*
+ * If page found then demote it.
+ */
+ bin = page_convert_color(szc, nszc, color);
+ if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) {
+ page_freelist_lock(mnode);
+ firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype);
+
+ /*
+ * If pfnhi is not PFNNULL, look for large page below
+ * pfnhi. PFNNULL signifies no pfn requirement.
+ */
+ if (pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) {
+ do {
+ pp = pp->p_vpnext;
+ if (pp == firstpp) {
+ pp = NULL;
+ break;
+ }
+ } while (pp->p_pagenum >= pfnhi);
+ }
+ if (pp) {
+ ASSERT(pp->p_szc == nszc);
+ ret_pp = page_demote(mnode, pp->p_pagenum,
+ pp->p_szc, szc, color, PC_ALLOC);
+ if (ret_pp) {
+ page_freelist_unlock(mnode);
+#if defined(__sparc)
+ if (PP_ISNORELOC(ret_pp)) {
+ pgcnt_t npgs;
+
+ npgs = page_get_pagecnt(
+ ret_pp->p_szc);
+ kcage_freemem_sub(npgs);
+ }
+#endif
+ return (ret_pp);
+ }
+ }
+ page_freelist_unlock(mnode);
+ }
+ nszc++;
+ }
+
+ /*
+ * Ok that didn't work. Time to coalesce.
+ */
+ if (szc != 0) {
+ ret_pp = page_freelist_coalesce(mnode, szc, color);
+ }
+
+ return (ret_pp);
+}
+
+/*
+ * Helper routine used only by the freelist code to lock
+ * a page. If the page is a large page then it succeeds in
+ * locking all the constituent pages or none at all.
+ * Returns 1 on success, 0 on failure.
+ */
+static int
+page_trylock_cons(page_t *pp, se_t se)
+{
+ page_t *tpp, *first_pp = pp;
+
+ /*
+ * Fail if can't lock first or only page.
+ */
+ if (!page_trylock(pp, se)) {
+ return (0);
+ }
+
+ /*
+ * PAGESIZE: common case.
+ */
+ if (pp->p_szc == 0) {
+ return (1);
+ }
+
+ /*
+ * Large page case.
+ */
+ tpp = pp->p_next;
+ while (tpp != pp) {
+ if (!page_trylock(tpp, se)) {
+ /*
+ * On failure unlock what we
+ * have locked so far.
+ */
+ while (first_pp != tpp) {
+ page_unlock(first_pp);
+ first_pp = first_pp->p_next;
+ }
+ return (0);
+ }
+ tpp = tpp->p_next;
+ }
+ return (1);
+}
+
+page_t *
+page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc,
+ uint_t flags)
+{
+ kmutex_t *pcm;
+ int i, fill_tried, fill_marker;
+ page_t *pp, *first_pp;
+ uint_t bin_marker;
+ int colors, cpucolors;
+ uchar_t nszc;
+ uint_t nszc_color_shift;
+ int nwaybins = 0, nwaycnt;
+
+ ASSERT(szc < mmu_page_sizes);
+
+ VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]);
+
+ /* LINTED */
+ MTYPE_START(mnode, mtype, flags);
+	if (mtype < 0) {	/* mnode does not have memory in mtype range */
+ VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]);
+ return (NULL);
+ }
+
+ /*
+ * Set how many physical colors for this page size.
+ */
+ colors = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 :
+ page_colors;
+
+ nszc = MIN(szc + 1, mmu_page_sizes - 1);
+ nszc_color_shift = page_get_shift(nszc) - page_get_shift(szc);
+
+ /* cpu_page_colors is non-zero if a page color may be in > 1 bin */
+ cpucolors = cpu_page_colors;
+
+ /*
+ * adjust cpucolors to possibly check additional 'equivalent' bins
+ * to try to minimize fragmentation of large pages by delaying calls
+ * to page_freelist_fill.
+ */
+ if (colorequiv > 1) {
+ int equivcolors = colors / colorequiv;
+
+ if (equivcolors && (cpucolors == 0 || equivcolors < cpucolors))
+ cpucolors = equivcolors;
+ }
+
+ ASSERT(colors <= page_colors);
+ ASSERT(colors);
+ ASSERT((colors & (colors - 1)) == 0);
+
+ ASSERT(bin < colors);
+
+ /*
+ * Only hold one freelist lock at a time, that way we
+ * can start anywhere and not have to worry about lock
+ * ordering.
+ */
+big_try_again:
+ fill_tried = 0;
+ nwaycnt = 0;
+ for (i = 0; i <= colors; i++) {
+try_again:
+ ASSERT(bin < colors);
+ if (PAGE_FREELISTS(mnode, szc, bin, mtype)) {
+ pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
+ mutex_enter(pcm);
+ pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
+ if (pp != NULL) {
+ /*
+ * These were set before the page
+ * was put on the free list,
+ * they must still be set.
+ */
+ ASSERT(PP_ISFREE(pp));
+ ASSERT(PP_ISAGED(pp));
+ ASSERT(pp->p_vnode == NULL);
+ ASSERT(pp->p_hash == NULL);
+ ASSERT(pp->p_offset == (u_offset_t)-1);
+ ASSERT(pp->p_szc == szc);
+ ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
+
+ /*
+				 * Walk down the freelist bin.
+ * 8k pages are linked on p_next
+ * and p_prev fields. Large pages
+ * are a contiguous group of
+ * constituent pages linked together
+ * on their p_next and p_prev fields.
+ * The large pages are linked together
+				 * on the freelist using p_vpnext and
+ * p_vpprev of the base constituent
+ * page of each large page.
+ */
+ first_pp = pp;
+ while (!page_trylock_cons(pp, SE_EXCL)) {
+ if (szc == 0) {
+ pp = pp->p_next;
+ } else {
+ pp = pp->p_vpnext;
+ }
+
+ ASSERT(PP_ISFREE(pp));
+ ASSERT(PP_ISAGED(pp));
+ ASSERT(pp->p_vnode == NULL);
+ ASSERT(pp->p_hash == NULL);
+ ASSERT(pp->p_offset == (u_offset_t)-1);
+ ASSERT(pp->p_szc == szc);
+ ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
+ mnode);
+
+ if (pp == first_pp) {
+ pp = NULL;
+ break;
+ }
+ }
+
+ if (pp) {
+ ASSERT(mtype == PP_2_MTYPE(pp));
+ ASSERT(pp->p_szc == szc);
+ if (szc == 0) {
+ page_sub(&PAGE_FREELISTS(mnode,
+ szc, bin, mtype), pp);
+ } else {
+ page_vpsub(&PAGE_FREELISTS(
+ mnode, szc, bin, mtype),
+ pp);
+ CHK_LPG(pp, szc);
+ }
+ page_ctr_sub(pp, PG_FREE_LIST);
+
+ if ((PP_ISFREE(pp) == 0) ||
+ (PP_ISAGED(pp) == 0))
+ panic("free page is not. pp %p",
+ (void *)pp);
+ mutex_exit(pcm);
+
+#if defined(__sparc)
+ ASSERT(!kcage_on || PP_ISNORELOC(pp) ||
+ (flags & PG_NORELOC) == 0);
+
+ if (PP_ISNORELOC(pp)) {
+ pgcnt_t npgs;
+
+ npgs = page_get_pagecnt(szc);
+ kcage_freemem_sub(npgs);
+ }
+#endif
+ VM_STAT_ADD(vmm_vmstats.
+ pgmf_allocok[szc]);
+ return (pp);
+ }
+ }
+ mutex_exit(pcm);
+ }
+
+ /*
+ * Wow! The initial bin is empty.
+ * If specific color is needed, check if page color may be
+ * in other bins. cpucolors is:
+		 * 0 if the number of colors for this cpu equals page_colors.
+ * This means that pages with a particular color are in a
+ * single bin.
+		 * -1 if colors of cpus (cheetah+) are heterogeneous. Need to
+ * first determine the colors for the current cpu.
+		 * >0 if colors of all cpus are homogeneous and < page_colors
+ */
+
+ if ((flags & PG_MATCH_COLOR) && (cpucolors != 0)) {
+ if (!nwaybins) {
+ /*
+ * cpucolors is negative if ecache setsizes
+				 * are heterogeneous. Determine colors for this
+ * particular cpu.
+ */
+ if (cpucolors < 0) {
+ cpucolors = CPUSETSIZE() / MMU_PAGESIZE;
+ ASSERT(cpucolors > 0);
+ nwaybins = colors / cpucolors;
+ } else {
+ nwaybins = colors / cpucolors;
+ ASSERT(szc > 0 || nwaybins > 1);
+ }
+ if (nwaybins < 2)
+ cpucolors = 0;
+ }
+
+ if (cpucolors && (nwaycnt + 1 <= nwaybins)) {
+ nwaycnt++;
+ bin = (bin + (colors / nwaybins)) &
+ (colors - 1);
+ if (nwaycnt < nwaybins) {
+ goto try_again;
+ }
+ }
+ /* back to initial color if fall-thru */
+ }
+
+ /*
+		 * The color bins are all empty if a color match is required.
+		 * Try to satisfy
+ * the request by breaking up or coalescing pages from
+ * a different size freelist of the correct color that
+ * satisfies the ORIGINAL color requested. If that
+ * fails then try pages of the same size but different
+ * colors assuming we are not called with
+ * PG_MATCH_COLOR.
+ */
+ if (!fill_tried) {
+ fill_tried = 1;
+ fill_marker = bin >> nszc_color_shift;
+ pp = page_freelist_fill(szc, bin, mnode, mtype,
+ PFNNULL);
+ if (pp != NULL) {
+ return (pp);
+ }
+ }
+
+ if (flags & PG_MATCH_COLOR)
+ break;
+
+ /*
+ * Select next color bin to try.
+ */
+ if (szc == 0) {
+ /*
+ * PAGESIZE page case.
+ */
+ if (i == 0) {
+ bin = (bin + BIN_STEP) & page_colors_mask;
+ bin_marker = bin;
+ } else {
+ bin = (bin + vac_colors) & page_colors_mask;
+ if (bin == bin_marker) {
+ bin = (bin + 1) & page_colors_mask;
+ bin_marker = bin;
+ }
+ }
+ } else {
+ /*
+ * Large page case.
+ */
+ bin = (bin + 1) & (colors - 1);
+ }
+ /*
+ * If bin advanced to the next color bin of the
+ * next larger pagesize, there is a chance the fill
+ * could succeed.
+ */
+ if (fill_marker != (bin >> nszc_color_shift))
+ fill_tried = 0;
+ }
+
+#if defined(__sparc)
+ if (!(flags & (PG_NORELOC | PGI_NOCAGE | PGI_RELOCONLY)) &&
+ (kcage_freemem >= kcage_lotsfree)) {
+ /*
+ * The Cage is ON and with plenty of free mem, and
+ * we're willing to check for a NORELOC page if we
+ * couldn't find a RELOC page, so spin again.
+ */
+ flags |= PG_NORELOC;
+ mtype = MTYPE_NORELOC;
+ goto big_try_again;
+ }
+#else
+ if (flags & PGI_MT_RANGE) {
+ /* cycle through range of mtypes */
+ MTYPE_NEXT(mnode, mtype, flags);
+ if (mtype >= 0)
+ goto big_try_again;
+ }
+#endif
+ VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]);
+
+ return (NULL);
+}
+
+
+/*
+ * Returns the count of free pages for 'pp' with size code 'szc'.
+ * Note: This function does not return an exact value as the page freelist
+ * locks are not held and thus the values in the page_counters may be
+ * changing as we walk through the data.
+ */
+static int
+page_freecnt(int mnode, page_t *pp, uchar_t szc)
+{
+ pgcnt_t pgfree;
+ pgcnt_t cnt;
+ ssize_t r = szc; /* region size */
+ ssize_t idx;
+ int i;
+ int full, range;
+
+ /* Make sure pagenum passed in is aligned properly */
+ ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0);
+ ASSERT(szc > 0);
+
+ /* Prevent page_counters dynamic memory from being freed */
+ rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
+ idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
+ cnt = PAGE_COUNTERS(mnode, r, idx);
+ pgfree = cnt << PNUM_SHIFT(r - 1);
+ range = FULL_REGION_CNT(szc);
+
+ /* Check for completely full region */
+ if (cnt == range) {
+ rw_exit(&page_ctrs_rwlock[mnode]);
+ return (pgfree);
+ }
+
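+	/*
+	 * Walk down through the smaller region sizes, adding in free
+	 * pages that were not already counted as part of a completely
+	 * full larger region.
+	 */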
+ while (--r > 0) {
+ idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
+ full = FULL_REGION_CNT(r);
+ for (i = 0; i < range; i++, idx++) {
+ cnt = PAGE_COUNTERS(mnode, r, idx);
+ /*
+ * If cnt here is full, that means we have already
+ * accounted for these pages earlier.
+ */
+ if (cnt != full) {
+ pgfree += (cnt << PNUM_SHIFT(r - 1));
+ }
+ }
+ range *= full;
+ }
+ rw_exit(&page_ctrs_rwlock[mnode]);
+ return (pgfree);
+}
+
+/*
+ * Called from page_geti_contig_pages to exclusively lock constituent pages
+ * starting from 'spp' for page size code 'szc'.
+ *
+ * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc'
+ * region needs to be greater than or equal to the threshold.
+ */
+static int
+page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags)
+{
+ pgcnt_t pgcnt = PNUM_SIZE(szc);
+ pgcnt_t pgfree, i;
+ page_t *pp;
+
+ VM_STAT_ADD(vmm_vmstats.ptcp[szc]);
+
+
+ if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI))
+ goto skipptcpcheck;
+ /*
+ * check if there are sufficient free pages available before attempting
+ * to trylock. Count is approximate as page counters can change.
+ */
+ pgfree = page_freecnt(mnode, spp, szc);
+
+ /* attempt to trylock if there are sufficient already free pages */
+ if (pgfree < pgcnt/ptcpthreshold) {
+ VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]);
+ return (0);
+ }
+
+skipptcpcheck:
+
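+	/*
+	 * Try to exclusively lock each constituent page, backing out
+	 * the locks taken so far if any page cannot be claimed.
+	 */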
+ for (i = 0; i < pgcnt; i++) {
+ pp = &spp[i];
+ if (!page_trylock(pp, SE_EXCL)) {
+ VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]);
+ while (--i != (pgcnt_t)-1) {
+ pp = &spp[i];
+ ASSERT(PAGE_EXCL(pp));
+ page_unlock(pp);
+ }
+ return (0);
+ }
+ ASSERT(spp[i].p_pagenum == spp->p_pagenum + i);
+ if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) &&
+ !PP_ISFREE(pp)) {
+ VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]);
+ ASSERT(i == 0);
+ page_unlock(pp);
+ return (0);
+ }
+ if (PP_ISNORELOC(pp)) {
+ VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]);
+ while (i != (pgcnt_t)-1) {
+ pp = &spp[i];
+ ASSERT(PAGE_EXCL(pp));
+ page_unlock(pp);
+ i--;
+ }
+ return (0);
+ }
+ }
+ VM_STAT_ADD(vmm_vmstats.ptcpok[szc]);
+ return (1);
+}
+
+/*
+ * Claim large page pointed to by 'pp'. 'pp' is the starting set
+ * of 'szc' constituent pages that had been locked exclusively previously.
+ * Will attempt to relocate constituent pages in use.
+ */
+static page_t *
+page_claim_contig_pages(page_t *pp, uchar_t szc, int flags)
+{
+ spgcnt_t pgcnt, npgs, i;
+ page_t *targpp, *rpp, *hpp;
+ page_t *replpp = NULL;
+ page_t *pplist = NULL;
+
+ ASSERT(pp != NULL);
+
+ pgcnt = page_get_pagecnt(szc);
+ while (pgcnt) {
+ ASSERT(PAGE_EXCL(pp));
+ ASSERT(!PP_ISNORELOC(pp));
+ if (PP_ISFREE(pp)) {
+ /*
+ * If this is a PG_FREE_LIST page then its
+ * size code can change underneath us due to
+			 * page promotion or demotion. As an optimization
+ * use page_list_sub_pages() instead of
+ * page_list_sub().
+ */
+ if (PP_ISAGED(pp)) {
+ page_list_sub_pages(pp, szc);
+ if (pp->p_szc == szc) {
+ return (pp);
+ }
+ ASSERT(pp->p_szc < szc);
+ npgs = page_get_pagecnt(pp->p_szc);
+ hpp = pp;
+ for (i = 0; i < npgs; i++, pp++) {
+ pp->p_szc = szc;
+ }
+ page_list_concat(&pplist, &hpp);
+ pgcnt -= npgs;
+ continue;
+ }
+ ASSERT(!PP_ISAGED(pp));
+ ASSERT(pp->p_szc == 0);
+ page_list_sub(pp, PG_CACHE_LIST);
+ page_hashout(pp, NULL);
+ PP_SETAGED(pp);
+ pp->p_szc = szc;
+ page_list_concat(&pplist, &pp);
+ pp++;
+ pgcnt--;
+ continue;
+ }
+ npgs = page_get_pagecnt(pp->p_szc);
+
+ /*
+ * page_create_wait freemem accounting done by caller of
+ * page_get_freelist and not necessary to call it prior to
+ * calling page_get_replacement_page.
+ *
+ * page_get_replacement_page can call page_get_contig_pages
+ * to acquire a large page (szc > 0); the replacement must be
+ * smaller than the contig page size to avoid looping or
+ * szc == 0 and PGI_PGCPSZC0 is set.
+ */
+ if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) {
+ replpp = page_get_replacement_page(pp, NULL, 0);
+ if (replpp) {
+ npgs = page_get_pagecnt(pp->p_szc);
+ ASSERT(npgs <= pgcnt);
+ targpp = pp;
+ }
+ }
+
+ /*
+ * If replacement is NULL or do_page_relocate fails, fail
+ * coalescing of pages.
+ */
+ if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0,
+ &npgs, NULL) != 0)) {
+ /*
+ * Unlock un-processed target list
+ */
+ while (pgcnt--) {
+ ASSERT(PAGE_EXCL(pp));
+ page_unlock(pp);
+ pp++;
+ }
+ /*
+ * Free the processed target list.
+ */
+ while (pplist) {
+ pp = pplist;
+ page_sub(&pplist, pp);
+ ASSERT(PAGE_EXCL(pp));
+ ASSERT(pp->p_szc == szc);
+ ASSERT(PP_ISFREE(pp));
+ ASSERT(PP_ISAGED(pp));
+ pp->p_szc = 0;
+ page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
+ page_unlock(pp);
+ }
+
+ if (replpp != NULL)
+ page_free_replacement_page(replpp);
+
+ return (NULL);
+ }
+ ASSERT(pp == targpp);
+
+ /* LINTED */
+ ASSERT(hpp = pp); /* That's right, it's an assignment */
+
+ pp += npgs;
+ pgcnt -= npgs;
+
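+		/*
+		 * The relocated target pages are now free; stamp them with
+		 * the new size code. Their replacement pages hold the
+		 * relocated contents, so just drop their locks.
+		 */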
+ while (npgs--) {
+ ASSERT(PAGE_EXCL(targpp));
+ ASSERT(!PP_ISFREE(targpp));
+ ASSERT(!PP_ISNORELOC(targpp));
+ PP_SETFREE(targpp);
+ ASSERT(PP_ISAGED(targpp));
+ ASSERT(targpp->p_szc < szc || (szc == 0 &&
+ (flags & PGI_PGCPSZC0)));
+ targpp->p_szc = szc;
+ targpp = targpp->p_next;
+
+ rpp = replpp;
+ ASSERT(rpp != NULL);
+ page_sub(&replpp, rpp);
+ ASSERT(PAGE_EXCL(rpp));
+ ASSERT(!PP_ISFREE(rpp));
+ page_unlock(rpp);
+ }
+ ASSERT(targpp == hpp);
+ ASSERT(replpp == NULL);
+ page_list_concat(&pplist, &targpp);
+ }
+ CHK_LPG(pplist, szc);
+ return (pplist);
+}
+
+/*
+ * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code
+ * of 0 means nothing left after trim.
+ */
+
+int
+trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi)
+{
+ pfn_t kcagepfn;
+ int decr;
+ int rc = 0;
+
+ if (PP_ISNORELOC(mseg->pages)) {
+ if (PP_ISNORELOC(mseg->epages - 1) == 0) {
+
+ /* lower part of this mseg inside kernel cage */
+ decr = kcage_current_pfn(&kcagepfn);
+
+ /* kernel cage may have transitioned past mseg */
+ if (kcagepfn >= mseg->pages_base &&
+ kcagepfn < mseg->pages_end) {
+ ASSERT(decr == 0);
+ *lo = kcagepfn;
+ *hi = MIN(pfnhi,
+ (mseg->pages_end - 1));
+ rc = 1;
+ }
+ }
+ /* else entire mseg in the cage */
+ } else {
+ if (PP_ISNORELOC(mseg->epages - 1)) {
+
+ /* upper part of this mseg inside kernel cage */
+ decr = kcage_current_pfn(&kcagepfn);
+
+ /* kernel cage may have transitioned past mseg */
+ if (kcagepfn >= mseg->pages_base &&
+ kcagepfn < mseg->pages_end) {
+ ASSERT(decr);
+ *hi = kcagepfn;
+ *lo = MAX(pfnlo, mseg->pages_base);
+ rc = 1;
+ }
+ } else {
+ /* entire mseg outside of kernel cage */
+ *lo = MAX(pfnlo, mseg->pages_base);
+ *hi = MIN(pfnhi, (mseg->pages_end - 1));
+ rc = 1;
+ }
+ }
+ return (rc);
+}
+
+/*
+ * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to "claim" a
+ * page with size code 'szc'. Claiming such a page requires acquiring
+ * exclusive locks on all constituent pages (page_trylock_contig_pages),
+ * relocating pages in use and concatenating these constituent pages into a
+ * large page.
+ *
+ * The page lists do not have such a large page and page_freelist_fill has
+ * already failed to demote larger pages and/or coalesce smaller free pages.
+ *
+ * 'flags' may specify PG_MATCH_COLOR, which limits the search to large
+ * pages with the same color as 'bin'.
+ *
+ * 'pfnflag' specifies the subset of the pfn range to search.
+ */
+
+
+static page_t *
+page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
+ pfn_t pfnlo, pfn_t pfnhi, int pfnflag)
+{
+ struct memseg *mseg;
+ pgcnt_t szcpgcnt = page_get_pagecnt(szc);
+ pgcnt_t szcpgmask = szcpgcnt - 1;
+ pfn_t randpfn;
+ page_t *pp, *randpp, *endpp;
+ uint_t colors;
+ pfn_t hi, lo;
+ uint_t skip;
+
+ ASSERT(szc != 0 || (flags & PGI_PGCPSZC0));
+
+ if ((pfnhi - pfnlo) + 1 < szcpgcnt)
+ return (NULL);
+
+ ASSERT(szc < mmu_page_sizes);
+
+ colors = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 :
+ page_colors;
+
+ ASSERT(bin < colors);
+
+ /*
+ * trim the pfn range to search based on pfnflag. pfnflag is set
+ * when there have been previous page_get_contig_page failures to
+ * limit the search.
+ *
+ * The high bit in pfnflag specifies the number of 'slots' in the
+ * pfn range and the remainder of pfnflag specifies which slot.
+ * For example, a value of 1010b would mean the second slot of
+ * the pfn range that has been divided into 8 slots.
+ */
+ if (pfnflag > 1) {
+ int slots = 1 << (highbit(pfnflag) - 1);
+ int slotid = pfnflag & (slots - 1);
+ pgcnt_t szcpages;
+ int slotlen;
+
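+
+		/*
+		 * Align the pfn range to large page boundaries before
+		 * slicing it into slots and selecting the slot to search.
+		 */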
+ pfnlo = P2ROUNDUP(pfnlo, szcpgcnt);
+ pfnhi = pfnhi & ~(szcpgcnt - 1);
+
+ szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt;
+ slotlen = howmany(szcpages, slots);
+ pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt);
+ ASSERT(pfnlo < pfnhi);
+ if (pfnhi > pfnlo + (slotlen * szcpgcnt))
+ pfnhi = pfnlo + (slotlen * szcpgcnt);
+ }
+
+ memsegs_lock(0);
+
+ /*
+ * loop through memsegs to look for contig page candidates
+ */
+
+ for (mseg = memsegs; mseg != NULL; mseg = mseg->next) {
+ if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) {
+ /* no overlap */
+ continue;
+ }
+
+ if (mseg->pages_end - mseg->pages_base < szcpgcnt)
+ /* mseg too small */
+ continue;
+
+ /* trim off kernel cage pages from pfn range */
+ if (kcage_on) {
+ if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0)
+ continue;
+ } else {
+ lo = MAX(pfnlo, mseg->pages_base);
+ hi = MIN(pfnhi, (mseg->pages_end - 1));
+ }
+
+ /* round to szcpgcnt boundaries */
+ lo = P2ROUNDUP(lo, szcpgcnt);
+ hi = hi & ~(szcpgcnt - 1);
+
+ if (hi <= lo)
+ continue;
+
+ /*
+ * set lo to point to the pfn for the desired bin. Large
+ * page sizes may only have a single page color
+ */
+ if ((colors > 1) && (flags & PG_MATCH_COLOR)) {
+ uint_t lobin;
+
+ /*
+ * factor in colorequiv to check additional
+ * 'equivalent' bins.
+ */
+ if (colorequiv > 1 && colors > colorequiv)
+ colors = colors / colorequiv;
+
+ /* determine bin that lo currently points to */
+ lobin = (lo & ((szcpgcnt * colors) - 1)) / szcpgcnt;
+
+ /*
+ * set lo to point at appropriate color and set skip
+ * to arrive at the next szc page of the same color.
+ */
+ lo += ((bin - lobin) & (colors - 1)) * szcpgcnt;
+
+ skip = colors * szcpgcnt;
+ } else {
+ /* check all pages starting from lo */
+ skip = szcpgcnt;
+ }
+ if (hi <= lo)
+ /* mseg cannot satisfy color request */
+ continue;
+
+ /* randomly choose a point between lo and hi to begin search */
+
+ randpfn = (pfn_t)GETTICK();
+ randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1);
+ randpp = mseg->pages + (randpfn - mseg->pages_base);
+
+ ASSERT(randpp->p_pagenum == randpfn);
+
+ pp = randpp;
+ endpp = mseg->pages + (hi - mseg->pages_base);
+
+ ASSERT(randpp + szcpgcnt <= endpp);
+
+ do {
+ ASSERT(!(pp->p_pagenum & szcpgmask));
+ ASSERT((flags & PG_MATCH_COLOR) == 0 ||
+ colorequiv > 1 ||
+ PP_2_BIN(pp) == bin);
+ if (page_trylock_contig_pages(mnode, pp, szc, flags)) {
+ /* pages unlocked by page_claim on failure */
+ if (page_claim_contig_pages(pp, szc, flags)) {
+ memsegs_unlock(0);
+ return (pp);
+ }
+ }
+
+ pp += skip;
+ if (pp >= endpp) {
+ /* start from the beginning */
+ pp = mseg->pages + (lo - mseg->pages_base);
+ ASSERT(pp->p_pagenum == lo);
+ ASSERT(pp + szcpgcnt <= endpp);
+ }
+ } while (pp != randpp);
+ }
+ memsegs_unlock(0);
+ return (NULL);
+}
+
+
+/*
+ * controlling routine that searches through physical memory in an attempt to
+ * claim a large page based on the input parameters when no such page is
+ * available on the page free lists.
+ *
+ * calls page_geti_contig_pages with an initial pfn range from the mnode
+ * and mtype. page_geti_contig_pages will trim off the parts of the pfn range
+ * that overlaps with the kernel cage or does not match the requested page
+ * color if PG_MATCH_COLOR is set. Since this search is very expensive,
+ * page_geti_contig_pages may further limit the search range based on
+ * previous failure counts (pgcpfailcnt[]).
+ *
+ * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base
+ * pagesize page that satisfies mtype.
+ */
+page_t *
+page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc,
+ uint_t flags)
+{
+ pfn_t pfnlo, pfnhi; /* contig pages pfn range */
+ page_t *pp;
+ int pfnflag = 0; /* no limit on search if 0 */
+
+ VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]);
+
+ /* LINTED */
+ MTYPE_START(mnode, mtype, flags);
+ if (mtype < 0) { /* mnode does not have memory in mtype range */
+ VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]);
+ return (NULL);
+ }
+
+ ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
+
+ /* do not limit search and ignore color if hi pri */
+
+ if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0))
+ pfnflag = pgcpfailcnt[szc];
+
+ /* remove color match to improve chances */
+
+ if (flags & PGI_PGCPHIPRI || pfnflag)
+ flags &= ~PG_MATCH_COLOR;
+
+ do {
+ /* get pfn range based on mnode and mtype */
+ MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi);
+
+ ASSERT(pfnhi >= pfnlo);
+
+ pp = page_geti_contig_pages(mnode, bin, szc, flags,
+ pfnlo, pfnhi, pfnflag);
+
+ if (pp != NULL) {
+ pfnflag = pgcpfailcnt[szc];
+ if (pfnflag) {
+ /* double the search size */
+ pgcpfailcnt[szc] = pfnflag >> 1;
+ }
+ VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]);
+ return (pp);
+ }
+ /* LINTED */
+ } while ((flags & PGI_MT_RANGE) &&
+ (MTYPE_NEXT(mnode, mtype, flags) >= 0));
+
+ VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]);
+ return (NULL);
+}
+
+
+/*
+ * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair.
+ *
+ * Does its own locking and accounting.
+ * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
+ * pages of the proper color even if there are pages of a different color.
+ *
+ * Finds a page, removes it, THEN locks it.
+ */
+
+/*ARGSUSED*/
+page_t *
+page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg,
+ caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp)
+{
+ struct as *as = seg->s_as;
+ page_t *pp = NULL;
+ ulong_t bin;
+ uchar_t szc;
+ int mnode;
+ int mtype;
+ page_t *(*page_get_func)(int, uint_t, int, uchar_t, uint_t);
+ lgrp_mnode_cookie_t lgrp_cookie;
+
+ page_get_func = page_get_mnode_freelist;
+
+ /*
+ * If we aren't passed a specific lgroup, or passed a freed lgrp
+ * assume we wish to allocate near to the current thread's home.
+ */
+ if (!LGRP_EXISTS(lgrp))
+ lgrp = lgrp_home_lgrp();
+
+ if (kcage_on) {
+ if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC &&
+ kcage_freemem < kcage_throttlefree + btop(size) &&
+ curthread != kcage_cageout_thread) {
+ /*
+ * Set a "reserve" of kcage_throttlefree pages for
+ * PG_PANIC and cageout thread allocations.
+ *
+ * Everybody else has to serialize in
+ * page_create_get_something() to get a cage page, so
+ * that we don't deadlock cageout!
+ */
+ return (NULL);
+ }
+ } else {
+ flags &= ~PG_NORELOC;
+ flags |= PGI_NOCAGE;
+ }
+
+ /* LINTED */
+ MTYPE_INIT(mtype, vp, vaddr, flags);
+
+ /*
+ * Convert size to page size code.
+ */
+ if ((szc = page_szc(size)) == (uchar_t)-1)
+ panic("page_get_freelist: illegal page size request");
+ ASSERT(szc < mmu_page_sizes);
+
+ VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]);
+
+ /* LINTED */
+ AS_2_BIN(as, seg, vp, vaddr, bin);
+
+ /* bin is for base pagesize color - convert if larger pagesize. */
+ if (szc)
+ bin = page_convert_color(0, szc, bin);
+
+ /*
+ * Try to get a local page first, but try remote if we can't
+ * get a page of the right color.
+ */
+pgretry:
+ LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
+ while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
+ pp = page_get_func(mnode, bin, mtype, szc, flags);
+ if (pp != NULL) {
+ VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]);
+ DTRACE_PROBE4(page__get,
+ lgrp_t *, lgrp,
+ int, mnode,
+ ulong_t, bin,
+ uint_t, flags);
+ return (pp);
+ }
+ }
+ ASSERT(pp == NULL);
+
+ /*
+ * for non-SZC0 PAGESIZE requests, check cachelist before checking
+ * remote free lists. Caller expected to call page_get_cachelist which
+ * will check local cache lists and remote free lists.
+ */
+ if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) {
+ VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred);
+ return (NULL);
+ }
+
+ ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
+
+ lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
+
+ /*
+ * Try to get a non-local freelist page.
+ */
+ LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
+ while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
+ pp = page_get_func(mnode, bin, mtype, szc, flags);
+ if (pp != NULL) {
+ DTRACE_PROBE4(page__get,
+ lgrp_t *, lgrp,
+ int, mnode,
+ ulong_t, bin,
+ uint_t, flags);
+ VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]);
+ return (pp);
+ }
+ }
+
+ ASSERT(pp == NULL);
+
+ /*
+	 * When the cage is off, chances are page_get_contig_pages() will fail
+	 * to lock a large page chunk, so it is not called by default in that
+	 * case. This can be changed via /etc/system.
+ *
+ * page_get_contig_pages() also called to acquire a base pagesize page
+ * for page_create_get_something().
+ */
+ if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) &&
+ (kcage_on || pg_lpgcreate_nocage || szc == 0) &&
+ (page_get_func != page_get_contig_pages)) {
+
+ VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]);
+ page_get_func = page_get_contig_pages;
+ goto pgretry;
+ }
+
+ if (pgcplimitsearch && page_get_func == page_get_contig_pages)
+ pgcpfailcnt[szc]++;
+
+ VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]);
+ return (NULL);
+}
+
+/*
+ * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair.
+ *
+ * Does its own locking.
+ * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
+ * pages of the proper color even if there are pages of a different color.
+ * Otherwise, scan the bins for ones with pages. For each bin with pages,
+ * try to lock one of them. If no page can be locked, try the
+ * next bin. Return NULL if a page cannot be found and locked.
+ *
+ * Finds a page, tries to lock it, then removes it.
+ */
+
+/*ARGSUSED*/
+page_t *
+page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg,
+ caddr_t vaddr, uint_t flags, struct lgrp *lgrp)
+{
+ page_t *pp;
+ struct as *as = seg->s_as;
+ ulong_t bin;
+ /*LINTED*/
+ int mnode;
+ int mtype;
+ lgrp_mnode_cookie_t lgrp_cookie;
+
+ /*
+	 * If we aren't passed a specific lgroup, or passed a freed lgrp,
+ * assume we wish to allocate near to the current thread's home.
+ */
+ if (!LGRP_EXISTS(lgrp))
+ lgrp = lgrp_home_lgrp();
+
+ if (!kcage_on) {
+ flags &= ~PG_NORELOC;
+ flags |= PGI_NOCAGE;
+ }
+
+ if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC &&
+ kcage_freemem <= kcage_throttlefree) {
+ /*
+ * Reserve kcage_throttlefree pages for critical kernel
+ * threads.
+ *
+ * Everybody else has to go to page_create_get_something()
+ * to get a cage page, so we don't deadlock cageout.
+ */
+ return (NULL);
+ }
+
+ /* LINTED */
+ AS_2_BIN(as, seg, vp, vaddr, bin);
+
+ ASSERT(bin <= page_colors_mask);
+
+ /* LINTED */
+ MTYPE_INIT(mtype, vp, vaddr, flags);
+
+ VM_STAT_ADD(vmm_vmstats.pgc_alloc);
+
+ /*
+ * Try local cachelists first
+ */
+ LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
+ while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
+ pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
+ if (pp != NULL) {
+ VM_STAT_ADD(vmm_vmstats.pgc_allocok);
+ DTRACE_PROBE4(page__get,
+ lgrp_t *, lgrp,
+ int, mnode,
+ ulong_t, bin,
+ uint_t, flags);
+ return (pp);
+ }
+ }
+
+ lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
+
+ /*
+ * Try freelists/cachelists that are farther away
+ * This is our only chance to allocate remote pages for PAGESIZE
+ * requests.
+ */
+ LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
+ while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
+ pp = page_get_mnode_freelist(mnode, bin, mtype,
+ 0, flags);
+ if (pp != NULL) {
+ VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred);
+ DTRACE_PROBE4(page__get,
+ lgrp_t *, lgrp,
+ int, mnode,
+ ulong_t, bin,
+ uint_t, flags);
+ return (pp);
+ }
+ pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
+ if (pp != NULL) {
+ VM_STAT_ADD(vmm_vmstats.pgc_allocokrem);
+ DTRACE_PROBE4(page__get,
+ lgrp_t *, lgrp,
+ int, mnode,
+ ulong_t, bin,
+ uint_t, flags);
+ return (pp);
+ }
+ }
+
+ VM_STAT_ADD(vmm_vmstats.pgc_allocfailed);
+ return (NULL);
+}
+
+page_t *
+page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype)
+{
+ kmutex_t *pcm;
+ int i;
+ page_t *pp;
+ page_t *first_pp;
+ uint_t bin_marker;
+ int nwaybins, nwaycnt;
+ int cpucolors;
+
+ VM_STAT_ADD(vmm_vmstats.pgmc_alloc);
+
+ /* LINTED */
+ MTYPE_START(mnode, mtype, flags);
+ if (mtype < 0) { /* mnode does not have memory in mtype range */
+ VM_STAT_ADD(vmm_vmstats.pgmc_allocempty);
+ return (NULL);
+ }
+
+ nwaybins = 0;
+ cpucolors = cpu_page_colors;
+ /*
+ * adjust cpucolors to possibly check additional 'equivalent' bins
+ * to try to minimize fragmentation of large pages by delaying calls
+ * to page_freelist_fill.
+ */
+ if (colorequiv > 1) {
+ int equivcolors = page_colors / colorequiv;
+
+ if (equivcolors && (cpucolors == 0 || equivcolors < cpucolors))
+ cpucolors = equivcolors;
+ }
+
+ /*
+ * Only hold one cachelist lock at a time, that way we
+ * can start anywhere and not have to worry about lock
+ * ordering.
+ */
+
+big_try_again:
+ nwaycnt = 0;
+ for (i = 0; i <= page_colors; i++) {
+ if (PAGE_CACHELISTS(mnode, bin, mtype)) {
+ pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
+ mutex_enter(pcm);
+ pp = PAGE_CACHELISTS(mnode, bin, mtype);
+ if (pp != NULL) {
+ first_pp = pp;
+ ASSERT(pp->p_vnode);
+ ASSERT(PP_ISAGED(pp) == 0);
+ ASSERT(pp->p_szc == 0);
+ ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
+ while (!page_trylock(pp, SE_EXCL)) {
+ pp = pp->p_next;
+ ASSERT(pp->p_szc == 0);
+ if (pp == first_pp) {
+ /*
+ * We have searched the
+ * complete list!
+ * And all of them (might
+ * only be one) are locked.
+ * This can happen since
+ * these pages can also be
+ * found via the hash list.
+ * When found via the hash
+ * list, they are locked
+ * first, then removed.
+ * We give up to let the
+ * other thread run.
+ */
+ pp = NULL;
+ break;
+ }
+ ASSERT(pp->p_vnode);
+ ASSERT(PP_ISFREE(pp));
+ ASSERT(PP_ISAGED(pp) == 0);
+ ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
+ mnode);
+ }
+
+ if (pp) {
+ page_t **ppp;
+ /*
+ * Found and locked a page.
+ * Pull it off the list.
+ */
+ ASSERT(mtype == PP_2_MTYPE(pp));
+ ppp = &PAGE_CACHELISTS(mnode, bin,
+ mtype);
+ page_sub(ppp, pp);
+ /*
+ * Subtract counters before releasing
+ * pcm mutex to avoid a race with
+ * page_freelist_coalesce and
+ * page_freelist_fill.
+ */
+ page_ctr_sub(pp, PG_CACHE_LIST);
+ mutex_exit(pcm);
+ ASSERT(pp->p_vnode);
+ ASSERT(PP_ISAGED(pp) == 0);
+#if defined(__sparc)
+ ASSERT(!kcage_on ||
+ (flags & PG_NORELOC) == 0 ||
+ PP_ISNORELOC(pp));
+ if (PP_ISNORELOC(pp)) {
+ kcage_freemem_sub(1);
+ }
+#endif
+ VM_STAT_ADD(vmm_vmstats.
+ pgmc_allocok);
+ return (pp);
+ }
+ }
+ mutex_exit(pcm);
+ }
+
+ /*
+ * Wow! The initial bin is empty or no page in the bin could
+ * be locked.
+ *
+ * If specific color is needed, check if page color may be in
+ * other bins.
+ */
+ if ((flags & PG_MATCH_COLOR) && (cpucolors != 0)) {
+ if (!nwaybins) {
+ if (cpucolors < 0) {
+ cpucolors = CPUSETSIZE() / MMU_PAGESIZE;
+ ASSERT(cpucolors > 0);
+ nwaybins = page_colors / cpucolors;
+ if (nwaybins < 2)
+ cpucolors = 0;
+ } else {
+ nwaybins = page_colors / cpucolors;
+ ASSERT(nwaybins > 1);
+ }
+ }
+
+ if (++nwaycnt >= nwaybins) {
+ break;
+ }
+ bin = (bin + (page_colors / nwaybins)) &
+ page_colors_mask;
+ continue;
+ }
+
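+
+		/*
+		 * Select the next color bin to try: take one big BIN_STEP
+		 * hop on the first pass, then advance by vac_colors,
+		 * stepping past the marker if the walk wraps onto it.
+		 */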
+ if (i == 0) {
+ bin = (bin + BIN_STEP) & page_colors_mask;
+ bin_marker = bin;
+ } else {
+ bin = (bin + vac_colors) & page_colors_mask;
+ if (bin == bin_marker) {
+ bin = (bin + 1) & page_colors_mask;
+ bin_marker = bin;
+ }
+ }
+ }
+
+#if defined(__sparc)
+ if (!(flags & (PG_NORELOC | PGI_NOCAGE | PGI_RELOCONLY)) &&
+ (kcage_freemem >= kcage_lotsfree)) {
+ /*
+ * The Cage is ON and with plenty of free mem, and
+ * we're willing to check for a NORELOC page if we
+ * couldn't find a RELOC page, so spin again.
+ */
+ flags |= PG_NORELOC;
+ mtype = MTYPE_NORELOC;
+ goto big_try_again;
+ }
+#else
+ if (flags & PGI_MT_RANGE) {
+ MTYPE_NEXT(mnode, mtype, flags);
+ if (mtype >= 0)
+ goto big_try_again;
+ }
+#endif
+ VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed);
+ return (NULL);
+}
+
+#ifdef DEBUG
+#define REPL_PAGE_STATS
+#endif /* DEBUG */
+
+#ifdef REPL_PAGE_STATS
+struct repl_page_stats {
+ uint_t ngets;
+ uint_t ngets_noreloc;
+ uint_t npgr_noreloc;
+ uint_t nnopage_first;
+ uint_t nnopage;
+ uint_t nhashout;
+ uint_t nnofree;
+ uint_t nnext_pp;
+} repl_page_stats;
+#define REPL_STAT_INCR(v) atomic_add_32(&repl_page_stats.v, 1)
+#else /* REPL_PAGE_STATS */
+#define REPL_STAT_INCR(v)
+#endif /* REPL_PAGE_STATS */
+
+int pgrppgcp;
+
+/*
+ * The freemem accounting must be done by the caller.
+ * First we try to get a replacement page of the same size as like_pp;
+ * if that is not possible, then we just get a set of discontiguous
+ * PAGESIZE pages.
+ */
+page_t *
+page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp,
+ uint_t pgrflags)
+{
+ page_t *like_pp;
+ page_t *pp, *pplist;
+ page_t *pl = NULL;
+ ulong_t bin;
+ int mnode, page_mnode;
+ int szc;
+ spgcnt_t npgs, pg_cnt;
+ pfn_t pfnum;
+ int mtype;
+ int flags = 0;
+ lgrp_mnode_cookie_t lgrp_cookie;
+
+
+ REPL_STAT_INCR(ngets);
+ like_pp = orig_like_pp;
+ ASSERT(PAGE_EXCL(like_pp));
+
+ szc = like_pp->p_szc;
+ npgs = page_get_pagecnt(szc);
+ /*
+ * Now we reset like_pp to the base page_t.
+ * That way, we won't walk past the end of this 'szc' page.
+ */
+ pfnum = PFN_BASE(like_pp->p_pagenum, szc);
+ like_pp = page_numtopp_nolock(pfnum);
+ ASSERT(like_pp->p_szc == szc);
+
+ if (PP_ISNORELOC(like_pp)) {
+ ASSERT(kcage_on);
+ REPL_STAT_INCR(ngets_noreloc);
+ flags = PGI_RELOCONLY;
+ } else if (pgrflags & PGR_NORELOC) {
+ ASSERT(kcage_on);
+ REPL_STAT_INCR(npgr_noreloc);
+ flags = PG_NORELOC;
+ }
+
+ /*
+ * Kernel pages must always be replaced with the same size
+ * pages, since we cannot properly handle demotion of kernel
+ * pages.
+ */
+ if (like_pp->p_vnode == &kvp)
+ pgrflags |= PGR_SAMESZC;
+
+ /* LINTED */
+ MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode);
+
+ while (npgs) {
+ pplist = NULL;
+ for (;;) {
+ pg_cnt = page_get_pagecnt(szc);
+ bin = PP_2_BIN(like_pp);
+ ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
+ ASSERT(pg_cnt <= npgs);
+
+ /*
+ * If an lgroup was specified, try to get the
+ * page from that lgroup.
+ */
+ if (LGRP_EXISTS(lgrp)) {
+ /* Try the lgroup's freelists first */
+ LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
+ LGRP_SRCH_LOCAL);
+ while ((pplist == NULL) &&
+ (mnode = lgrp_memnode_choose(&lgrp_cookie))
+ != -1) {
+ pplist = page_get_mnode_freelist(
+ mnode, bin, mtype, szc,
+ flags);
+ }
+
+ /*
+				 * Now try its cachelists if this is a
+ * small page. Don't need to do it for
+ * larger ones since page_freelist_coalesce()
+ * already failed.
+ */
+ if (pplist != NULL || szc != 0)
+ break;
+
+				/* Now try its cachelists */
+ LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
+ LGRP_SRCH_LOCAL);
+
+ while ((pplist == NULL) &&
+ (mnode = lgrp_memnode_choose(&lgrp_cookie))
+ != -1) {
+ pplist = page_get_mnode_cachelist(
+ bin, flags, mnode, mtype);
+ }
+ if (pplist != NULL) {
+ page_hashout(pplist, NULL);
+ PP_SETAGED(pplist);
+ REPL_STAT_INCR(nhashout);
+ break;
+ }
+ /* Done looking in this lgroup. Bail out. */
+ break;
+ }
+
+ ASSERT(!LGRP_EXISTS(lgrp));
+ /*
+ * No lgroup was specified, so just try to get the
+ * page as close to like_pp's mnode as possible.
+ * First try the local freelist...
+ */
+ mnode = PP_2_MEM_NODE(like_pp);
+ pplist = page_get_mnode_freelist(mnode, bin,
+ mtype, szc, flags);
+ if (pplist != NULL)
+ break;
+
+ REPL_STAT_INCR(nnofree);
+
+ /*
+ * ...then the local cachelist. Don't need to do it for
+			 * larger pages because page_freelist_coalesce() already
+ * failed there anyway.
+ */
+ if (szc == 0) {
+ pplist = page_get_mnode_cachelist(bin, flags,
+ mnode, mtype);
+ if (pplist != NULL) {
+ page_hashout(pplist, NULL);
+ PP_SETAGED(pplist);
+ REPL_STAT_INCR(nhashout);
+ break;
+ }
+ }
+
+ /* Now try remote freelists */
+ page_mnode = mnode;
+ lgrp =
+ lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
+ LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
+ LGRP_SRCH_HIER);
+ while (pplist == NULL &&
+ (mnode = lgrp_memnode_choose(&lgrp_cookie))
+ != -1) {
+ /*
+ * Skip local mnode.
+ */
+ if ((mnode == page_mnode) ||
+ (mem_node_config[mnode].exists == 0))
+ continue;
+
+ pplist = page_get_mnode_freelist(mnode,
+ bin, mtype, szc, flags);
+ }
+
+ if (pplist != NULL)
+ break;
+
+
+ /* Now try remote cachelists */
+ LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
+ LGRP_SRCH_HIER);
+ while (pplist == NULL && szc == 0) {
+ mnode = lgrp_memnode_choose(&lgrp_cookie);
+ if (mnode == -1)
+ break;
+ /*
+ * Skip local mnode.
+ */
+ if ((mnode == page_mnode) ||
+ (mem_node_config[mnode].exists == 0))
+ continue;
+
+ pplist = page_get_mnode_cachelist(bin,
+ flags, mnode, mtype);
+
+ if (pplist != NULL) {
+ page_hashout(pplist, NULL);
+ PP_SETAGED(pplist);
+ REPL_STAT_INCR(nhashout);
+ break;
+ }
+ }
+
+ /*
+ * Break out of while loop under the following cases:
+ * - If we successfully got a page.
+ * - If pgrflags specified only returning a specific
+ * page size and we could not find that page size.
+ * - If we could not satisfy the request with PAGESIZE
+ * or larger pages.
+ */
+ if (pplist != NULL || szc == 0)
+ break;
+
+ if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
+ /* try to find contig page */
+
+ LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
+ LGRP_SRCH_HIER);
+
+ while ((pplist == NULL) &&
+ (mnode =
+ lgrp_memnode_choose(&lgrp_cookie))
+ != -1) {
+ pplist = page_get_contig_pages(
+ mnode, bin, mtype, szc,
+ flags | PGI_PGCPHIPRI);
+ }
+ break;
+ }
+
+ /*
+ * The correct thing to do here is try the next
+ * page size down using szc--. Due to a bug
+ * with the processing of HAT_RELOAD_SHARE
+ * where the sfmmu_ttecnt arrays of all
+ * hats sharing an ISM segment don't get updated,
+ * using intermediate size pages for relocation
+ * can lead to continuous page faults.
+ */
+ szc = 0;
+ }
+
+ if (pplist != NULL) {
+ DTRACE_PROBE4(page__get,
+ lgrp_t *, lgrp,
+ int, mnode,
+ ulong_t, bin,
+ uint_t, flags);
+
+ while (pplist != NULL && pg_cnt--) {
+ ASSERT(pplist != NULL);
+ pp = pplist;
+ page_sub(&pplist, pp);
+ PP_CLRFREE(pp);
+ PP_CLRAGED(pp);
+ page_list_concat(&pl, &pp);
+ npgs--;
+ like_pp = like_pp + 1;
+ REPL_STAT_INCR(nnext_pp);
+ }
+ ASSERT(pg_cnt == 0);
+ } else {
+ break;
+ }
+ }
+
+ if (npgs) {
+ /*
+ * We were unable to allocate the necessary number
+ * of pages.
+ * We need to free up any pl.
+ */
+ REPL_STAT_INCR(nnopage);
+ page_free_replacement_page(pl);
+ return (NULL);
+ } else {
+ return (pl);
+ }
+}
+
+/*
+ * demote a free large page to its constituent pages
+ */
+void
+page_demote_free_pages(page_t *pp)
+{
+
+ int mnode;
+
+ ASSERT(pp != NULL);
+ ASSERT(PAGE_LOCKED(pp));
+ ASSERT(PP_ISFREE(pp));
+ ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
+
+ mnode = PP_2_MEM_NODE(pp);
+ page_freelist_lock(mnode);
+ if (pp->p_szc != 0) {
+ (void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
+ pp->p_szc), pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
+ }
+ page_freelist_unlock(mnode);
+ ASSERT(pp->p_szc == 0);
+}
diff --git a/usr/src/uts/common/vm/vm_pvn.c b/usr/src/uts/common/vm/vm_pvn.c
new file mode 100644
index 0000000000..fcafb5f803
--- /dev/null
+++ b/usr/src/uts/common/vm/vm_pvn.c
@@ -0,0 +1,1147 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * VM - paged vnode.
+ *
+ * This file supplies vm support for the vnode operations that deal with pages.
+ */
+#include <sys/types.h>
+#include <sys/t_lock.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/time.h>
+#include <sys/buf.h>
+#include <sys/vnode.h>
+#include <sys/uio.h>
+#include <sys/vmmeter.h>
+#include <sys/vmsystm.h>
+#include <sys/mman.h>
+#include <sys/vfs.h>
+#include <sys/cred.h>
+#include <sys/user.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/debug.h>
+#include <sys/cpuvar.h>
+#include <sys/vtrace.h>
+#include <sys/tnf_probe.h>
+
+#include <vm/hat.h>
+#include <vm/as.h>
+#include <vm/seg.h>
+#include <vm/rm.h>
+#include <vm/pvn.h>
+#include <vm/page.h>
+#include <vm/seg_map.h>
+#include <vm/seg_kmem.h>
+#include <sys/fs/swapnode.h>
+
+int pvn_nofodklust = 0;
+int pvn_write_noklust = 0;
+
+uint_t pvn_vmodsort_supported = 0; /* set if HAT supports VMODSORT */
+uint_t pvn_vmodsort_disable = 0; /* set in /etc/system to disable HAT */
+ /* support for vmodsort for testing */
+
+static struct kmem_cache *marker_cache = NULL;
+
+/*
+ * Find the largest contiguous block which contains `addr' for file offset
+ * `off' in it while staying within the file system block sizes (`vp_off'
+ * and `vp_len') and the address space limits, for which no pages currently
+ * exist and which map to consecutive file offsets.
+ */
+page_t *
+pvn_read_kluster(
+ struct vnode *vp,
+ u_offset_t off,
+ struct seg *seg,
+ caddr_t addr,
+ u_offset_t *offp, /* return values */
+ size_t *lenp, /* return values */
+ u_offset_t vp_off,
+ size_t vp_len,
+ int isra)
+{
+ ssize_t deltaf, deltab;
+ page_t *pp;
+ page_t *plist = NULL;
+ spgcnt_t pagesavail;
+ u_offset_t vp_end;
+
+ ASSERT(off >= vp_off && off < vp_off + vp_len);
+
+ /*
+ * We only want to do klustering/read ahead if there
+	 * are more than minfree pages currently available.
+ */
+ pagesavail = freemem - minfree;
+
+ if (pagesavail <= 0)
+ if (isra)
+ return ((page_t *)NULL); /* ra case - give up */
+ else
+ pagesavail = 1; /* must return a page */
+
+ /* We calculate in pages instead of bytes due to 32-bit overflows */
+ if (pagesavail < (spgcnt_t)btopr(vp_len)) {
+ /*
+ * Don't have enough free memory for the
+ * max request, try sizing down vp request.
+ */
+ deltab = (ssize_t)(off - vp_off);
+ vp_len -= deltab;
+ vp_off += deltab;
+ if (pagesavail < btopr(vp_len)) {
+ /*
+ * Still not enough memory, just settle for
+ * pagesavail which is at least 1.
+ */
+ vp_len = ptob(pagesavail);
+ }
+ }
+
+ vp_end = vp_off + vp_len;
+ ASSERT(off >= vp_off && off < vp_end);
+
+ if (isra && SEGOP_KLUSTER(seg, addr, 0))
+ return ((page_t *)NULL); /* segment driver says no */
+
+ if ((plist = page_create_va(vp, off,
+ PAGESIZE, PG_EXCL | PG_WAIT, seg, addr)) == NULL)
+ return ((page_t *)NULL);
+
+ if (vp_len <= PAGESIZE || pvn_nofodklust) {
+ *offp = off;
+ *lenp = MIN(vp_len, PAGESIZE);
+ } else {
+ /*
+ * Scan back from front by incrementing "deltab" and
+ * comparing "off" with "vp_off + deltab" to avoid
+ * "signed" versus "unsigned" conversion problems.
+ */
+ for (deltab = PAGESIZE; off >= vp_off + deltab;
+ deltab += PAGESIZE) {
+ /*
+ * Call back to the segment driver to verify that
+ * the klustering/read ahead operation makes sense.
+ */
+ if (SEGOP_KLUSTER(seg, addr, -deltab))
+ break; /* page not eligible */
+ if ((pp = page_create_va(vp, off - deltab,
+ PAGESIZE, PG_EXCL, seg, addr - deltab))
+ == NULL)
+ break; /* already have the page */
+ /*
+ * Add page to front of page list.
+ */
+ page_add(&plist, pp);
+ }
+ deltab -= PAGESIZE;
+
+ /* scan forward from front */
+ for (deltaf = PAGESIZE; off + deltaf < vp_end;
+ deltaf += PAGESIZE) {
+ /*
+ * Call back to the segment driver to verify that
+ * the klustering/read ahead operation makes sense.
+ */
+ if (SEGOP_KLUSTER(seg, addr, deltaf))
+ break; /* page not file extension */
+ if ((pp = page_create_va(vp, off + deltaf,
+ PAGESIZE, PG_EXCL, seg, addr + deltaf))
+ == NULL)
+ break; /* already have page */
+
+ /*
+ * Add page to end of page list.
+ */
+ page_add(&plist, pp);
+ plist = plist->p_next;
+ }
+ *offp = off = off - deltab;
+ *lenp = deltab + deltaf;
+ ASSERT(off >= vp_off);
+
+ /*
+ * If we ended up getting more than was actually
+ * requested, retract the returned length to only
+ * reflect what was requested. This might happen
+ * if we were allowed to kluster pages across a
+ * span of (say) 5 frags, and frag size is less
+ * than PAGESIZE. We need a whole number of
+ * pages to contain those frags, but the returned
+ * size should only allow the returned range to
+ * extend as far as the end of the frags.
+ */
+ if ((vp_off + vp_len) < (off + *lenp)) {
+ ASSERT(vp_end > off);
+ *lenp = vp_end - off;
+ }
+ }
+ TRACE_3(TR_FAC_VM, TR_PVN_READ_KLUSTER,
+ "pvn_read_kluster:seg %p addr %x isra %x",
+ seg, addr, isra);
+ return (plist);
+}
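+
+/*
+ * Editor's illustrative sketch (not part of the original source): a file
+ * system getapage routine typically drives pvn_read_kluster() together
+ * with pageio_setup()/biowait() and pvn_read_done()/pvn_plist_init()
+ * roughly as follows.  "blk_off", "blk_len" and "devvp" are hypothetical
+ * stand-ins for the file system's block map results and device vnode.
+ *
+ *	pp = pvn_read_kluster(vp, off, seg, addr, &io_off, &io_len,
+ *	    blk_off, blk_len, 0);
+ *	if (pp != NULL) {
+ *		bp = pageio_setup(pp, io_len, devvp, B_READ);
+ *		... fill in bp, issue the i/o, err = biowait(bp) ...
+ *		pageio_done(bp);
+ *		if (err)
+ *			pvn_read_done(pp, B_ERROR);
+ *		else if (pl != NULL)
+ *			pvn_plist_init(pp, pl, plsz, io_off, io_len, rw);
+ *	}
+ */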
+
+/*
+ * Handle pages for this vnode on either side of the page "pp"
+ * which has been locked by the caller. This routine will also
+ * do klustering in the range [vp_off, vp_off + vp_len] up
+ * until a page is not found. The offset and length
+ * of pages included is returned in "*offp" and "*lenp".
+ *
+ * Returns a list of dirty locked pages all ready to be
+ * written back.
+ */
+page_t *
+pvn_write_kluster(
+ struct vnode *vp,
+ page_t *pp,
+ u_offset_t *offp, /* return values */
+ size_t *lenp, /* return values */
+ u_offset_t vp_off,
+ size_t vp_len,
+ int flags)
+{
+ u_offset_t off;
+ page_t *dirty;
+ size_t deltab, deltaf;
+ se_t se;
+ u_offset_t vp_end;
+
+ off = pp->p_offset;
+
+ /*
+	 * Klustering should not be done if we are invalidating
+ * pages since we could destroy pages that belong to
+ * some other process if this is a swap vnode.
+ */
+ if (pvn_write_noklust || ((flags & B_INVAL) && IS_SWAPVP(vp))) {
+ *offp = off;
+ *lenp = PAGESIZE;
+ return (pp);
+ }
+
+ if (flags & (B_FREE | B_INVAL))
+ se = SE_EXCL;
+ else
+ se = SE_SHARED;
+
+ dirty = pp;
+ /*
+ * Scan backwards looking for pages to kluster by incrementing
+ * "deltab" and comparing "off" with "vp_off + deltab" to
+ * avoid "signed" versus "unsigned" conversion problems.
+ */
+ for (deltab = PAGESIZE; off >= vp_off + deltab; deltab += PAGESIZE) {
+ pp = page_lookup_nowait(vp, off - deltab, se);
+ if (pp == NULL)
+ break; /* page not found */
+ if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
+ break;
+ page_add(&dirty, pp);
+ }
+ deltab -= PAGESIZE;
+
+ vp_end = vp_off + vp_len;
+ /* now scan forwards looking for pages to kluster */
+ for (deltaf = PAGESIZE; off + deltaf < vp_end; deltaf += PAGESIZE) {
+ pp = page_lookup_nowait(vp, off + deltaf, se);
+ if (pp == NULL)
+ break; /* page not found */
+ if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
+ break;
+ page_add(&dirty, pp);
+ dirty = dirty->p_next;
+ }
+
+ *offp = off - deltab;
+ *lenp = deltab + deltaf;
+ return (dirty);
+}
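+
+/*
+ * Editor's illustrative sketch (not part of the original source): a file
+ * system putapage routine, handed one dirty locked page, typically grows
+ * it into a kluster and pushes the whole list, e.g. for the synchronous
+ * case ("blk_off", "blk_len" and "devvp" are hypothetical stand-ins):
+ *
+ *	pp = pvn_write_kluster(vp, pp, &io_off, &io_len,
+ *	    blk_off, blk_len, flags);
+ *	bp = pageio_setup(pp, io_len, devvp, B_WRITE | flags);
+ *	... issue the i/o ...
+ *	if ((flags & B_ASYNC) == 0) {
+ *		err = biowait(bp);
+ *		pageio_done(bp);
+ *		pvn_write_done(pp, ((err) ? B_ERROR : 0) | B_WRITE | flags);
+ *	}
+ */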
+
+/*
+ * Generic entry point used to release the "shared/exclusive" lock
+ * and the "p_iolock" on pages after i/o is complete.
+ */
+void
+pvn_io_done(page_t *plist)
+{
+ page_t *pp;
+
+ while (plist != NULL) {
+ pp = plist;
+ page_sub(&plist, pp);
+ page_io_unlock(pp);
+ page_unlock(pp);
+ }
+}
+
+/*
+ * Entry point to be used by file system getpage subr's and
+ * other such routines which either want to unlock pages (B_ASYNC
+ * request) or destroy a list of pages if an error occurred.
+ */
+void
+pvn_read_done(page_t *plist, int flags)
+{
+ page_t *pp;
+
+ while (plist != NULL) {
+ pp = plist;
+ page_sub(&plist, pp);
+ page_io_unlock(pp);
+ if (flags & B_ERROR) {
+ /*LINTED: constant in conditional context*/
+ VN_DISPOSE(pp, B_INVAL, 0, kcred);
+ } else {
+ (void) page_release(pp, 0);
+ }
+ }
+}
+
+/*
+ * Automagic pageout.
+ * When memory gets tight, start freeing pages popping out of the
+ * write queue.
+ */
+int write_free = 1;
+pgcnt_t pages_before_pager = 200; /* LMXXX */
+
+/*
+ * Routine to be called when page-out's complete.
+ * The caller, typically VOP_PUTPAGE, has to explicitly call this routine
+ * after waiting for i/o to complete (biowait) to free the list of
+ * pages associated with the buffer. These pages must be locked
+ * before i/o is initiated.
+ *
+ * If a write error occurs, the pages are marked as modified
+ * so the write will be re-tried later.
+ */
+
+void
+pvn_write_done(page_t *plist, int flags)
+{
+ int dfree = 0;
+ int pgrec = 0;
+ int pgout = 0;
+ int pgpgout = 0;
+ int anonpgout = 0;
+ int anonfree = 0;
+ int fspgout = 0;
+ int fsfree = 0;
+ int execpgout = 0;
+ int execfree = 0;
+ page_t *pp;
+ struct cpu *cpup;
+ struct vnode *vp = NULL; /* for probe */
+ uint_t ppattr;
+
+ ASSERT((flags & B_READ) == 0);
+
+ /*
+ * If we are about to start paging anyway, start freeing pages.
+ */
+ if (write_free && freemem < lotsfree + pages_before_pager &&
+ (flags & B_ERROR) == 0) {
+ flags |= B_FREE;
+ }
+
+ /*
+ * Handle each page involved in the i/o operation.
+ */
+ while (plist != NULL) {
+ pp = plist;
+ ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp));
+ page_sub(&plist, pp);
+
+ /* Kernel probe support */
+ if (vp == NULL)
+ vp = pp->p_vnode;
+
+ if (flags & B_ERROR) {
+ /*
+ * Write operation failed. We don't want
+ * to destroy (or free) the page unless B_FORCE
+ * is set. We set the mod bit again and release
+ * all locks on the page so that it will get written
+ * back again later when things are hopefully
+ * better again.
+ * If B_INVAL and B_FORCE is set we really have
+ * to destroy the page.
+ */
+ if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) {
+ page_io_unlock(pp);
+ /*LINTED: constant in conditional context*/
+ VN_DISPOSE(pp, B_INVAL, 0, kcred);
+ } else {
+ hat_setmod(pp);
+ page_io_unlock(pp);
+ page_unlock(pp);
+ }
+ } else if (flags & B_INVAL) {
+ /*
+ * XXX - Failed writes with B_INVAL set are
+ * not handled appropriately.
+ */
+ page_io_unlock(pp);
+ /*LINTED: constant in conditional context*/
+ VN_DISPOSE(pp, B_INVAL, 0, kcred);
+		} else if (flags & B_FREE || !hat_page_is_mapped(pp)) {
+ /*
+ * Update statistics for pages being paged out
+ */
+ if (pp->p_vnode) {
+ if (IS_SWAPFSVP(pp->p_vnode)) {
+ anonpgout++;
+ } else {
+ if (pp->p_vnode->v_flag & VVMEXEC) {
+ execpgout++;
+ } else {
+ fspgout++;
+ }
+ }
+ }
+ page_io_unlock(pp);
+ pgout = 1;
+ pgpgout++;
+ TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT,
+ "page_ws_out:pp %p", pp);
+
+ /*
+ * The page_struct_lock need not be acquired to
+ * examine "p_lckcnt" and "p_cowcnt" since we'll
+ * have an "exclusive" lock if the upgrade succeeds.
+ */
+ if (page_tryupgrade(pp) &&
+ pp->p_lckcnt == 0 && pp->p_cowcnt == 0) {
+ /*
+ * Check if someone has reclaimed the
+ * page. If ref and mod are not set, no
+ * one is using it so we can free it.
+ * The rest of the system is careful
+ * to use the NOSYNC flag to unload
+ * translations set up for i/o w/o
+ * affecting ref and mod bits.
+ *
+ * Obtain a copy of the real hardware
+ * mod bit using hat_pagesync(pp, HAT_DONTZERO)
+ * to avoid having to flush the cache.
+ */
+ ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
+ HAT_SYNC_STOPON_MOD);
+ ck_refmod:
+ if (!(ppattr & (P_REF | P_MOD))) {
+ if (hat_page_is_mapped(pp)) {
+ /*
+ * Doesn't look like the page
+ * was modified so now we
+ * really have to unload the
+ * translations. Meanwhile
+ * another CPU could've
+ * modified it so we have to
+ * check again. We don't loop
+ * forever here because now
+ * the translations are gone
+ * and no one can get a new one
+ * since we have the "exclusive"
+ * lock on the page.
+ */
+ (void) hat_pageunload(pp,
+ HAT_FORCE_PGUNLOAD);
+ ppattr = hat_page_getattr(pp,
+ P_REF | P_MOD);
+ goto ck_refmod;
+ }
+ /*
+ * Update statistics for pages being
+ * freed
+ */
+ if (pp->p_vnode) {
+ if (IS_SWAPFSVP(pp->p_vnode)) {
+ anonfree++;
+ } else {
+ if (pp->p_vnode->v_flag
+ & VVMEXEC) {
+ execfree++;
+ } else {
+ fsfree++;
+ }
+ }
+ }
+ /*LINTED: constant in conditional ctx*/
+ VN_DISPOSE(pp, B_FREE,
+ (flags & B_DONTNEED), kcred);
+ dfree++;
+ } else {
+ page_unlock(pp);
+ pgrec++;
+ TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE,
+ "page_ws_free:pp %p", pp);
+ }
+ } else {
+ /*
+ * Page is either `locked' in memory
+ * or was reclaimed and now has a
+ * "shared" lock, so release it.
+ */
+ page_unlock(pp);
+ }
+ } else {
+ /*
+ * Neither B_FREE nor B_INVAL nor B_ERROR.
+ * Just release locks.
+ */
+ page_io_unlock(pp);
+ page_unlock(pp);
+ }
+ }
+
+ CPU_STATS_ENTER_K();
+ cpup = CPU; /* get cpup now that CPU cannot change */
+ CPU_STATS_ADDQ(cpup, vm, dfree, dfree);
+ CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec);
+ CPU_STATS_ADDQ(cpup, vm, pgout, pgout);
+ CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout);
+ CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout);
+ CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree);
+ CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout);
+ CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree);
+ CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout);
+ CPU_STATS_ADDQ(cpup, vm, execfree, execfree);
+ CPU_STATS_EXIT_K();
+
+ /* Kernel probe */
+ TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */,
+ tnf_opaque, vnode, vp,
+ tnf_ulong, pages_pageout, pgpgout,
+ tnf_ulong, pages_freed, dfree,
+ tnf_ulong, pages_reclaimed, pgrec);
+}
+
+/*
+ * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI,
+ * B_TRUNC, B_FORCE}. B_DELWRI indicates that this page is part of a kluster
+ * operation and is only to be considered if it doesn't involve any
+ * waiting here. B_TRUNC indicates that the file is being truncated
+ * and so no i/o needs to be done. B_FORCE indicates that the page
+ * must be destroyed so don't try writing it out.
+ *
+ * The caller must ensure that the page is locked. Returns 1, if
+ * the page should be written back (the "iolock" is held in this
+ * case), or 0 if the page has been dealt with or has been
+ * unlocked.
+ */
+int
+pvn_getdirty(page_t *pp, int flags)
+{
+ ASSERT((flags & (B_INVAL | B_FREE)) ?
+ PAGE_EXCL(pp) : PAGE_SHARED(pp));
+ ASSERT(PP_ISFREE(pp) == 0);
+
+ /*
+ * If trying to invalidate or free a logically `locked' page,
+ * forget it. Don't need page_struct_lock to check p_lckcnt and
+ * p_cowcnt as the page is exclusively locked.
+ */
+ if ((flags & (B_INVAL | B_FREE)) && !(flags & (B_TRUNC|B_FORCE)) &&
+ (pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) {
+ page_unlock(pp);
+ return (0);
+ }
+
+ /*
+ * Now acquire the i/o lock so we can add it to the dirty
+ * list (if necessary). We avoid blocking on the i/o lock
+ * in the following cases:
+ *
+ * If B_DELWRI is set, which implies that this request is
+	 * due to a klustering operation.
+ *
+ * If this is an async (B_ASYNC) operation and we are not doing
+ * invalidation (B_INVAL) [The current i/o or fsflush will ensure
+	 * that the page is written out].
+ */
+ if ((flags & B_DELWRI) || ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC)) {
+ if (!page_io_trylock(pp)) {
+ page_unlock(pp);
+ return (0);
+ }
+ } else {
+ page_io_lock(pp);
+ }
+
+ /*
+ * If we want to free or invalidate the page then
+ * we need to unload it so that anyone who wants
+ * it will have to take a minor fault to get it.
+ * Otherwise, we're just writing the page back so we
+	 * need to sync up the hardware and software mod bit to
+ * detect any future modifications. We clear the
+ * software mod bit when we put the page on the dirty
+ * list.
+ */
+ if (flags & (B_INVAL | B_FREE)) {
+ (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
+ } else {
+ (void) hat_pagesync(pp, HAT_SYNC_ZERORM);
+ }
+
+ if (!hat_ismod(pp) || (flags & B_TRUNC)) {
+ /*
+ * Don't need to add it to the
+ * list after all.
+ */
+ page_io_unlock(pp);
+ if (flags & B_INVAL) {
+ /*LINTED: constant in conditional context*/
+ VN_DISPOSE(pp, B_INVAL, 0, kcred);
+ } else if (flags & B_FREE) {
+ /*LINTED: constant in conditional context*/
+ VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred);
+ } else {
+ /*
+		 * This is an advisory path for the callers
+ * of VOP_PUTPAGE() who prefer freeing the
+ * page _only_ if no one else is accessing it.
+ * E.g. segmap_release()
+ *
+ * The above hat_ismod() check is useless because:
+ * (1) we may not be holding SE_EXCL lock;
+ * (2) we've not unloaded _all_ translations
+ *
+ * Let page_release() do the heavy-lifting.
+ */
+ (void) page_release(pp, 1);
+ }
+ return (0);
+ }
+
+ /*
+ * Page is dirty, get it ready for the write back
+ * and add page to the dirty list.
+ */
+ hat_clrrefmod(pp);
+
+ /*
+ * If we're going to free the page when we're done
+ * then we can let others try to use it starting now.
+ * We'll detect the fact that they used it when the
+ * i/o is done and avoid freeing the page.
+ */
+ if (flags & B_FREE)
+ page_downgrade(pp);
+
+
+ TRACE_1(TR_FAC_VM, TR_PVN_GETDIRTY, "pvn_getdirty:pp %p", pp);
+
+ return (1);
+}
+
+
+/*ARGSUSED*/
+static int
+marker_constructor(void *buf, void *cdrarg, int kmflags)
+{
+ page_t *mark = buf;
+ bzero(mark, sizeof (page_t));
+ return (0);
+}
+
+void
+pvn_init()
+{
+ if (pvn_vmodsort_disable == 0)
+ pvn_vmodsort_supported = hat_supported(HAT_VMODSORT, NULL);
+ marker_cache = kmem_cache_create("marker_cache",
+ sizeof (page_t), 0, marker_constructor,
+ NULL, NULL, NULL, NULL, 0);
+}
+
+
+/*
+ * Process a vnode's page list for all pages whose offset is >= off.
+ * Pages are to either be free'd, invalidated, or written back to disk.
+ *
+ * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE
+ * is specified, otherwise they are "shared" locked.
+ *
+ * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC}
+ *
+ * Special marker page_t's are inserted in the list in order
+ * to keep track of where we are in the list when locks are dropped.
+ *
+ * Note the list is circular and insertions can happen only at the
+ * head and tail of the list. The algorithm ensures visiting all pages
+ * on the list in the following way:
+ *
+ * Drop two marker pages at the end of the list.
+ *
+ * Move one marker page backwards towards the start of the list until
+ * it is at the list head, processing the pages passed along the way.
+ *
+ * Due to race conditions when the vphm mutex is dropped, additional pages
+ * can be added to either end of the list, so we'll continue to move
+ * the marker and process pages until it is up against the end marker.
+ *
+ * There is one special exit condition. If we are processing a VMODSORT
+ * vnode and only writing back modified pages, we can stop as soon as
+ * we run into an unmodified page. This makes fsync(3) operations fast.
+ */
+int
+pvn_vplist_dirty(
+ vnode_t *vp,
+ u_offset_t off,
+ int (*putapage)(vnode_t *, page_t *, u_offset_t *,
+ size_t *, int, cred_t *),
+ int flags,
+ cred_t *cred)
+{
+ page_t *pp;
+ page_t *mark; /* marker page that moves toward head */
+ page_t *end; /* marker page at end of list */
+ int err = 0;
+ int error;
+ kmutex_t *vphm;
+ se_t se;
+ page_t **where_to_move;
+
+ ASSERT(vp->v_type != VCHR);
+
+ if (vp->v_pages == NULL)
+ return (0);
+
+
+ /*
+ * Serialize vplist_dirty operations on this vnode by setting VVMLOCK.
+ *
+ * Don't block on VVMLOCK if B_ASYNC is set. This prevents sync()
+ * from getting blocked while flushing pages to a dead NFS server.
+ */
+ mutex_enter(&vp->v_lock);
+ if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) {
+ mutex_exit(&vp->v_lock);
+ return (EAGAIN);
+ }
+
+ while (vp->v_flag & VVMLOCK)
+ cv_wait(&vp->v_cv, &vp->v_lock);
+
+ if (vp->v_pages == NULL) {
+ mutex_exit(&vp->v_lock);
+ return (0);
+ }
+
+ vp->v_flag |= VVMLOCK;
+ mutex_exit(&vp->v_lock);
+
+
+ /*
+ * Set up the marker pages used to walk the list
+ */
+ end = kmem_cache_alloc(marker_cache, KM_SLEEP);
+ end->p_vnode = vp;
+ end->p_offset = (u_offset_t)-2;
+ mark = kmem_cache_alloc(marker_cache, KM_SLEEP);
+ mark->p_vnode = vp;
+ mark->p_offset = (u_offset_t)-1;
+
+ /*
+ * Grab the lock protecting the vnode's page list
+ * note that this lock is dropped at times in the loop.
+ */
+ vphm = page_vnode_mutex(vp);
+ mutex_enter(vphm);
+ if (vp->v_pages == NULL)
+ goto leave;
+
+ /*
+ * insert the markers and loop through the list of pages
+ */
+ page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark);
+ page_vpadd(&mark->p_vpnext, end);
+ for (;;) {
+
+ /*
+ * If only doing an async write back, then we can
+ * stop as soon as we get to start of the list.
+ */
+ if (flags == B_ASYNC && vp->v_pages == mark)
+ break;
+
+ /*
+ * otherwise stop when we've gone through all the pages
+ */
+ if (mark->p_vpprev == end)
+ break;
+
+ pp = mark->p_vpprev;
+ if (vp->v_pages == pp)
+ where_to_move = &vp->v_pages;
+ else
+ where_to_move = &pp->p_vpprev->p_vpnext;
+
+ ASSERT(pp->p_vnode == vp);
+
+ /*
+ * Skip this page if the offset is out of the desired range.
+ * Just move the marker and continue.
+ */
+ if (pp->p_offset < off) {
+ page_vpsub(&vp->v_pages, mark);
+ page_vpadd(where_to_move, mark);
+ continue;
+ }
+
+ /*
+ * If just flushing dirty pages to disk and this vnode
+ * is using a sorted list of pages, we can stop processing
+		 * as soon as we find an unmodified page, since all the
+		 * modified pages are visited first.
+ */
+ if (IS_VMODSORT(vp) &&
+ !(flags & (B_INVAL | B_FREE | B_TRUNC)) &&
+ !hat_ismod(pp)) {
+#ifdef DEBUG
+ /*
+ * For debug kernels examine what should be all the
+ * remaining clean pages, asserting that they are
+ * not modified.
+ */
+ page_t *chk = pp;
+ int attr;
+
+ page_vpsub(&vp->v_pages, mark);
+ page_vpadd(where_to_move, mark);
+ do {
+ chk = chk->p_vpprev;
+ ASSERT(chk != end);
+ if (chk == mark)
+ continue;
+ attr = hat_page_getattr(chk, P_MOD | P_REF);
+ if ((attr & P_MOD) == 0)
+ continue;
+ panic("v_pages list not all clean: "
+ "page_t*=%p vnode=%p off=%lx "
+ "attr=0x%x last clean page_t*=%p\n",
+ (void *)chk, (void *)chk->p_vnode,
+ (long)chk->p_offset, attr, (void *)pp);
+ } while (chk != vp->v_pages);
+#endif
+ break;
+ }
+
+ /*
+ * If we are supposed to invalidate or free this
+ * page, then we need an exclusive lock.
+ */
+ se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;
+
+ /*
+ * We must acquire the page lock for all synchronous
+ * operations (invalidate, free and write).
+ */
+ if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) {
+ /*
+ * If the page_lock() drops the mutex
+ * we must retry the loop.
+ */
+ if (!page_lock(pp, se, vphm, P_NO_RECLAIM))
+ continue;
+
+ /*
+ * It's ok to move the marker page now.
+ */
+ page_vpsub(&vp->v_pages, mark);
+ page_vpadd(where_to_move, mark);
+ } else {
+
+ /*
+ * update the marker page for all remaining cases
+ */
+ page_vpsub(&vp->v_pages, mark);
+ page_vpadd(where_to_move, mark);
+
+ /*
+			 * For write backs, if we can't lock the page, it's
+ * invalid or in the process of being destroyed. Skip
+ * it, assuming someone else is writing it.
+ */
+ if (!page_trylock(pp, se))
+ continue;
+ }
+
+ ASSERT(pp->p_vnode == vp);
+
+ /*
+ * Successfully locked the page, now figure out what to
+ * do with it. Free pages are easily dealt with, invalidate
+ * if desired or just go on to the next page.
+ */
+ if (PP_ISFREE(pp)) {
+ if ((flags & B_INVAL) == 0) {
+ page_unlock(pp);
+ continue;
+ }
+
+ /*
+ * Invalidate (destroy) the page.
+ */
+ mutex_exit(vphm);
+ page_destroy_free(pp);
+ mutex_enter(vphm);
+ continue;
+ }
+
+ /*
+		 * pvn_getdirty() figures out what to do with a dirty page.
+ * If the page is dirty, the putapage() routine will write it
+ * and will kluster any other adjacent dirty pages it can.
+ *
+ * pvn_getdirty() and `(*putapage)' unlock the page.
+ */
+ mutex_exit(vphm);
+ if (pvn_getdirty(pp, flags)) {
+ error = (*putapage)(vp, pp, NULL, NULL, flags, cred);
+ if (!err)
+ err = error;
+ }
+ mutex_enter(vphm);
+ }
+ page_vpsub(&vp->v_pages, mark);
+ page_vpsub(&vp->v_pages, end);
+
+leave:
+ /*
+ * Release v_pages mutex, also VVMLOCK and wakeup blocked thrds
+ */
+ mutex_exit(vphm);
+ kmem_cache_free(marker_cache, mark);
+ kmem_cache_free(marker_cache, end);
+ mutex_enter(&vp->v_lock);
+ vp->v_flag &= ~VVMLOCK;
+ cv_broadcast(&vp->v_cv);
+ mutex_exit(&vp->v_lock);
+ return (err);
+}
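+
+/*
+ * Editor's illustrative sketch (not part of the original source):
+ * pvn_vplist_dirty() is normally reached from a file system putpage
+ * routine when the whole file is being flushed (len == 0), with the
+ * file system's own per-page push routine (here the hypothetical
+ * "fs_putapage", see the sketch following pvn_write_kluster()) passed
+ * in as the callback:
+ *
+ *	if (len == 0)
+ *		err = pvn_vplist_dirty(vp, off, fs_putapage, flags, cr);
+ */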
+
+/*
+ * Zero out zbytes worth of data. Caller should be aware that this
+ * routine may enter back into the fs layer (xxx_getpage). Locks
+ * that the xxx_getpage routine may need should not be held while
+ * calling this.
+ */
+void
+pvn_vpzero(struct vnode *vp, u_offset_t vplen, size_t zbytes)
+{
+ caddr_t addr;
+
+ ASSERT(vp->v_type != VCHR);
+
+ if (vp->v_pages == NULL)
+ return;
+
+ /*
+ * zbytes may be zero but there still may be some portion of
+ * a page which needs clearing (since zbytes is a function
+ * of filesystem block size, not pagesize.)
+ */
+ if (zbytes == 0 && (PAGESIZE - (vplen & PAGEOFFSET)) == 0)
+ return;
+
+ /*
+ * We get the last page and handle the partial
+ * zeroing via kernel mappings. This will make the page
+ * dirty so that we know that when this page is written
+ * back, the zeroed information will go out with it. If
+ * the page is not currently in memory, then the kzero
+	 * operation will cause it to be brought in. We use kzero
+ * instead of bzero so that if the page cannot be read in
+ * for any reason, the system will not panic. We need
+ * to zero out a minimum of the fs given zbytes, but we
+ * might also have to do more to get the entire last page.
+ */
+
+ if ((zbytes + (vplen & MAXBOFFSET)) > MAXBSIZE)
+ panic("pvn_vptrunc zbytes");
+ addr = segmap_getmapflt(segkmap, vp, vplen,
+ MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)), 1, S_WRITE);
+ (void) kzero(addr + (vplen & MAXBOFFSET),
+ MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)));
+ (void) segmap_release(segkmap, addr, SM_WRITE | SM_ASYNC);
+}
+
+/*
+ * Handles common work of the VOP_GETPAGE routines when more than
+ * one page must be returned by calling a file system specific operation
+ * to do most of the work. Must be called with the vp already locked
+ * by the VOP_GETPAGE routine.
+ */
+int
+pvn_getpages(
+ int (*getpage)(vnode_t *, u_offset_t, size_t, uint_t *, page_t *[],
+ size_t, struct seg *, caddr_t, enum seg_rw, cred_t *),
+ struct vnode *vp,
+ u_offset_t off,
+ size_t len,
+ uint_t *protp,
+ page_t *pl[],
+ size_t plsz,
+ struct seg *seg,
+ caddr_t addr,
+ enum seg_rw rw,
+ struct cred *cred)
+{
+ page_t **ppp;
+ u_offset_t o, eoff;
+ size_t sz, xlen;
+ int err;
+
+	ASSERT(plsz >= len);		/* ensure that we have enough space */
+
+ /*
+ * Loop one page at a time and let getapage function fill
+ * in the next page in array. We only allow one page to be
+ * returned at a time (except for the last page) so that we
+ * don't have any problems with duplicates and other such
+ * painful problems. This is a very simple minded algorithm,
+ * but it does the job correctly. We hope that the cost of a
+	 * but it does the job correctly. We hope that a
+	 * getapage call for a resident page that we might have been
+	 * able to get from an earlier call doesn't cost too much.
+ ppp = pl;
+ sz = PAGESIZE;
+ eoff = off + len;
+ xlen = len;
+ for (o = off; o < eoff; o += PAGESIZE, addr += PAGESIZE,
+ xlen -= PAGESIZE) {
+ if (o + PAGESIZE >= eoff) {
+ /*
+			 * Last time through - allow all of
+ * what's left of the pl[] array to be used.
+ */
+ sz = plsz - (o - off);
+ }
+ err = (*getpage)(vp, o, xlen, protp, ppp, sz, seg, addr,
+ rw, cred);
+ if (err) {
+ /*
+ * Release any pages we already got.
+ */
+ if (o > off && pl != NULL) {
+ for (ppp = pl; *ppp != NULL; *ppp++ = NULL)
+ (void) page_release(*ppp, 1);
+ }
+ break;
+ }
+ if (pl != NULL)
+ ppp++;
+ }
+ return (err);
+}
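+
+/*
+ * Editor's illustrative sketch (not part of the original source): a
+ * VOP_GETPAGE handler that is asked for more than one page commonly
+ * just forwards the request here, supplying its single-page routine
+ * (the hypothetical "fs_getapage"):
+ *
+ *	return (pvn_getpages(fs_getapage, vp, off, len, protp, pl, plsz,
+ *	    seg, addr, rw, cr));
+ */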
+
+/*
+ * Initialize the page list array.
+ */
+void
+pvn_plist_init(page_t *pp, page_t *pl[], size_t plsz,
+ u_offset_t off, size_t io_len, enum seg_rw rw)
+{
+ ssize_t sz;
+ page_t *ppcur, **ppp;
+
+ if (plsz >= io_len) {
+ /*
+ * Everything fits, set up to load
+ * all the pages.
+ */
+ sz = io_len;
+ } else {
+ /*
+ * Set up to load plsz worth
+ * starting at the needed page.
+ */
+ while (pp->p_offset != off) {
+ /* XXX - Do we need this assert? */
+ ASSERT(pp->p_next->p_offset !=
+ pp->p_offset);
+ /*
+ * Remove page from the i/o list,
+ * release the i/o and the page lock.
+ */
+ ppcur = pp;
+ page_sub(&pp, ppcur);
+ page_io_unlock(ppcur);
+ (void) page_release(ppcur, 1);
+ }
+ sz = plsz;
+ }
+
+ /*
+ * Initialize the page list array.
+ */
+ ppp = pl;
+ do {
+ ppcur = pp;
+ *ppp++ = ppcur;
+ page_sub(&pp, ppcur);
+ page_io_unlock(ppcur);
+ if (rw != S_CREATE)
+ page_downgrade(ppcur);
+ sz -= PAGESIZE;
+ } while (sz > 0 && pp != NULL);
+ *ppp = NULL; /* terminate list */
+
+ /*
+ * Now free the remaining pages that weren't
+ * loaded in the page list.
+ */
+ while (pp != NULL) {
+ ppcur = pp;
+ page_sub(&pp, ppcur);
+ page_io_unlock(ppcur);
+ (void) page_release(ppcur, 1);
+ }
+}
diff --git a/usr/src/uts/common/vm/vm_rm.c b/usr/src/uts/common/vm/vm_rm.c
new file mode 100644
index 0000000000..36cd5f0375
--- /dev/null
+++ b/usr/src/uts/common/vm/vm_rm.c
@@ -0,0 +1,189 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/t_lock.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/mman.h>
+#include <sys/sysmacros.h>
+#include <sys/errno.h>
+#include <sys/signal.h>
+#include <sys/user.h>
+#include <sys/proc.h>
+#include <sys/cmn_err.h>
+#include <sys/debug.h>
+
+#include <vm/hat.h>
+#include <vm/as.h>
+#include <vm/seg_vn.h>
+#include <vm/rm.h>
+#include <vm/seg.h>
+#include <vm/page.h>
+
+/*
+ * Yield the size of an address space.
+ *
+ * The size can only be used as a hint since we cannot guarantee it
+ * will stay the same size unless the as->a_lock is held by the caller.
+ */
+size_t
+rm_assize(struct as *as)
+{
+ size_t size = 0;
+ struct seg *seg;
+ struct segvn_data *svd;
+ extern struct seg_ops segdev_ops; /* needs a header file */
+
+ ASSERT(as != NULL && AS_READ_HELD(as, &as->a_lock));
+
+ if (as == &kas)
+ return (0);
+
+ for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
+ if (seg->s_ops == &segdev_ops &&
+ ((SEGOP_GETTYPE(seg, seg->s_base) &
+ (MAP_SHARED | MAP_PRIVATE)) == 0)) {
+ /*
+ * Don't include mappings of /dev/null. These just
+ * reserve address space ranges and have no memory.
+ * We cheat by knowing that these segments come
+ * from segdev and have no mapping type.
+ */
+ /* EMPTY */;
+ } else if (seg->s_ops == &segvn_ops &&
+ (svd = (struct segvn_data *)seg->s_data) != NULL &&
+ (svd->vp == NULL || svd->vp->v_type != VREG) &&
+ (svd->flags & MAP_NORESERVE)) {
+ /*
+ * Don't include MAP_NORESERVE pages in the
+ * address range unless their mappings have
+ * actually materialized. We cheat by knowing
+ * that segvn is the only segment driver that
+ * supports MAP_NORESERVE and that the actual
+ * number of bytes reserved is in the segment's
+ * private data structure.
+ */
+ size += svd->swresv;
+ } else {
+ caddr_t addr = seg->s_base;
+ size_t segsize = seg->s_size;
+ vnode_t *vp;
+ vattr_t vattr;
+
+ /*
+ * If the segment is mapped beyond the end of the
+ * underlying mapped file, if any, then limit the
+ * segment's size contribution to the file size.
+ */
+ vattr.va_mask = AT_SIZE;
+ if (seg->s_ops == &segvn_ops &&
+ SEGOP_GETVP(seg, addr, &vp) == 0 &&
+ vp != NULL && vp->v_type == VREG &&
+ VOP_GETATTR(vp, &vattr, ATTR_HINT, CRED()) == 0) {
+ u_offset_t filesize = vattr.va_size;
+ u_offset_t offset = SEGOP_GETOFFSET(seg, addr);
+
+ if (filesize < offset)
+ filesize = 0;
+ else
+ filesize -= offset;
+ filesize = P2ROUNDUP_TYPED(filesize, PAGESIZE,
+ u_offset_t);
+ if ((u_offset_t)segsize > filesize)
+ segsize = filesize;
+ }
+ size += segsize;
+ }
+ }
+
+ return (size);
+}
+
+/*
+ * Yield the memory claim requirement for an address space.
+ *
+ * This is currently implemented as the number of active hardware
+ * translations that have page structures. Therefore, it can
+ * underestimate the traditional resident set size, eg, if the
+ * physical page is present and the hardware translation is missing;
+ * and it can overestimate the rss, eg, if there are active
+ * translations to a frame buffer with page structs.
+ * Also, it does not take sharing and XHATs into account.
+ */
+size_t
+rm_asrss(as)
+ register struct as *as;
+{
+ if (as != (struct as *)NULL && as != &kas)
+ return ((size_t)btop(hat_get_mapped_size(as->a_hat)));
+ else
+ return (0);
+}
+
+/*
+ * Return a 16-bit binary fraction representing the percent of total memory
+ * used by this address space. Binary point is to right of high-order bit.
+ * Defined as the ratio of a_rss for the process to total physical memory.
+ * This assumes 2s-complement arithmetic and that shorts and longs are
+ * 16 bits and 32 bits, respectively.
+ */
+ushort_t
+rm_pctmemory(struct as *as)
+{
+ /* This can't overflow */
+ ulong_t num = (ulong_t)rm_asrss(as) << (PAGESHIFT-1);
+ int shift = 16 - PAGESHIFT;
+ ulong_t total = total_pages;
+
+ if (shift < 0) {
+ num >>= (-shift);
+ shift = 0;
+ }
+ while (shift > 0 && (num & 0x80000000) == 0) {
+ shift--;
+ num <<= 1;
+ }
+ if (shift > 0)
+ total >>= shift;
+
+ return (num / total);
+}
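+
+/*
+ * Editor's note (not part of the original source), a worked example of
+ * the fraction returned above: with the binary point immediately to the
+ * right of bit 15, 0x8000 represents 100% and 0x2000 represents 25%.
+ * So for an address space whose rss is one quarter of total_pages, the
+ * normalization above yields approximately (rss << 15) / total_pages,
+ * i.e. 0x2000.
+ */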
diff --git a/usr/src/uts/common/vm/vm_seg.c b/usr/src/uts/common/vm/vm_seg.c
new file mode 100644
index 0000000000..50cc21cdf7
--- /dev/null
+++ b/usr/src/uts/common/vm/vm_seg.c
@@ -0,0 +1,952 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * VM - segment management.
+ */
+
+#include <sys/types.h>
+#include <sys/inttypes.h>
+#include <sys/t_lock.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kmem.h>
+#include <sys/vmsystm.h>
+#include <sys/debug.h>
+#include <sys/cmn_err.h>
+#include <sys/callb.h>
+#include <sys/mem_config.h>
+
+#include <vm/hat.h>
+#include <vm/as.h>
+#include <vm/seg.h>
+#include <vm/seg_kmem.h>
+
+/*
+ * kstats for segment advise
+ */
+segadvstat_t segadvstat = {
+ { "MADV_FREE_hit", KSTAT_DATA_ULONG },
+ { "MADV_FREE_miss", KSTAT_DATA_ULONG },
+};
+
+kstat_named_t *segadvstat_ptr = (kstat_named_t *)&segadvstat;
+uint_t segadvstat_ndata = sizeof (segadvstat) / sizeof (kstat_named_t);
+
+/* #define PDEBUG */
+#if defined(PDEBUG) || defined(lint) || defined(__lint)
+int pdebug = 0;
+#else
+#define pdebug 0
+#endif /* PDEBUG */
+
+#define PPRINTF if (pdebug) printf
+#define PPRINT(x) PPRINTF(x)
+#define PPRINT1(x, a) PPRINTF(x, a)
+#define PPRINT2(x, a, b) PPRINTF(x, a, b)
+#define PPRINT3(x, a, b, c) PPRINTF(x, a, b, c)
+#define PPRINT4(x, a, b, c, d) PPRINTF(x, a, b, c, d)
+#define PPRINT5(x, a, b, c, d, e) PPRINTF(x, a, b, c, d, e)
+
+#define P_HASHMASK (p_hashsize - 1)
+#define P_BASESHIFT 6
+
+/*
+ * entry in the segment page cache
+ */
+struct seg_pcache {
+ struct seg_pcache *p_hnext; /* list for hashed blocks */
+ struct seg_pcache *p_hprev;
+ int p_active; /* active count */
+ int p_ref; /* ref bit */
+ size_t p_len; /* segment length */
+ caddr_t p_addr; /* base address */
+ struct seg *p_seg; /* segment */
+ struct page **p_pp; /* pp shadow list */
+ enum seg_rw p_rw; /* rw */
+ uint_t p_flags; /* bit flags */
+ int (*p_callback)(struct seg *, caddr_t, size_t,
+ struct page **, enum seg_rw);
+};
+
+struct seg_phash {
+ struct seg_pcache *p_hnext; /* list for hashed blocks */
+ struct seg_pcache *p_hprev;
+ int p_qlen; /* Q length */
+ kmutex_t p_hmutex; /* protects hash bucket */
+};
+
+static int seg_preap_time = 20; /* reclaim every 20 secs */
+static int seg_pmaxqlen = 5; /* max Q length in hash list */
+static int seg_ppcount = 5; /* max # of purges per reclaim interval */
+static int seg_plazy = 1; /* if 1, pages are cached after pageunlock */
+static pgcnt_t seg_pwindow; /* max # of pages that can be cached */
+static pgcnt_t seg_plocked; /* # of pages which are cached by pagelock */
+static pgcnt_t seg_plocked_window; /* # pages from window */
+int seg_preapahead;
+
+static uint_t seg_pdisable = 0; /* if not 0, caching temporarily disabled */
+
+static int seg_pupdate_active = 1; /* background reclaim thread */
+static clock_t seg_preap_interval; /* reap interval in ticks */
+
+static kmutex_t seg_pcache; /* protects the whole pagelock cache */
+static kmutex_t seg_pmem; /* protects window counter */
+static ksema_t seg_psaync_sem; /* sema for reclaim thread */
+static struct seg_phash *p_hashtab;
+static int p_hashsize = 0;
+
+#define p_hash(seg) \
+ (P_HASHMASK & \
+ ((uintptr_t)(seg) >> P_BASESHIFT))
+
+#define p_match(pcp, seg, addr, len, rw) \
+ (((pcp)->p_seg == (seg) && \
+ (pcp)->p_addr == (addr) && \
+ (pcp)->p_rw == (rw) && \
+ (pcp)->p_len == (len)) ? 1 : 0)
+
+#define p_match_pp(pcp, seg, addr, len, pp, rw) \
+ (((pcp)->p_seg == (seg) && \
+ (pcp)->p_addr == (addr) && \
+ (pcp)->p_pp == (pp) && \
+ (pcp)->p_rw == (rw) && \
+ (pcp)->p_len == (len)) ? 1 : 0)
+
+
+/*
+ * lookup an address range in pagelock cache. Return shadow list
+ * and bump up active count.
+ */
+struct page **
+seg_plookup(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw)
+{
+ struct seg_pcache *pcp;
+ struct seg_phash *hp;
+
+ /*
+ * Skip pagelock cache, while DR is in progress or
+ * seg_pcache is off.
+ */
+ if (seg_pdisable || seg_plazy == 0) {
+ return (NULL);
+ }
+
+ hp = &p_hashtab[p_hash(seg)];
+ mutex_enter(&hp->p_hmutex);
+ for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
+ pcp = pcp->p_hnext) {
+ if (p_match(pcp, seg, addr, len, rw)) {
+ pcp->p_active++;
+ mutex_exit(&hp->p_hmutex);
+
+ PPRINT5("seg_plookup hit: seg %p, addr %p, "
+ "len %lx, count %d, pplist %p \n",
+ (void *)seg, (void *)addr, len, pcp->p_active,
+ (void *)pcp->p_pp);
+
+ return (pcp->p_pp);
+ }
+ }
+ mutex_exit(&hp->p_hmutex);
+
+ PPRINT("seg_plookup miss:\n");
+
+ return (NULL);
+}
+
+/*
+ * mark address range inactive. If the cache is off or the address
+ * range is not in the cache we call the segment driver to reclaim
+ * the pages. Otherwise just decrement active count and set ref bit.
+ */
+void
+seg_pinactive(struct seg *seg, caddr_t addr, size_t len, struct page **pp,
+ enum seg_rw rw, int (*callback)(struct seg *, caddr_t, size_t,
+ struct page **, enum seg_rw))
+{
+ struct seg_pcache *pcp;
+ struct seg_phash *hp;
+
+ if (seg_plazy == 0) {
+ (void) (*callback)(seg, addr, len, pp, rw);
+ return;
+ }
+ hp = &p_hashtab[p_hash(seg)];
+ mutex_enter(&hp->p_hmutex);
+ for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
+ pcp = pcp->p_hnext) {
+ if (p_match_pp(pcp, seg, addr, len, pp, rw)) {
+ pcp->p_active--;
+ ASSERT(pcp->p_active >= 0);
+ if (pcp->p_active == 0 && seg_pdisable) {
+ int npages;
+
+ ASSERT(callback == pcp->p_callback);
+ /* free the entry */
+ hp->p_qlen--;
+ pcp->p_hprev->p_hnext = pcp->p_hnext;
+ pcp->p_hnext->p_hprev = pcp->p_hprev;
+ mutex_exit(&hp->p_hmutex);
+ npages = pcp->p_len >> PAGESHIFT;
+ mutex_enter(&seg_pmem);
+ seg_plocked -= npages;
+ if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) {
+ seg_plocked_window -= npages;
+ }
+ mutex_exit(&seg_pmem);
+ kmem_free(pcp, sizeof (struct seg_pcache));
+ goto out;
+ }
+ pcp->p_ref = 1;
+ mutex_exit(&hp->p_hmutex);
+ return;
+ }
+ }
+ mutex_exit(&hp->p_hmutex);
+out:
+ (void) (*callback)(seg, addr, len, pp, rw);
+}
+
+/*
+ * The seg_pinsert_check() is used by segment drivers to predict whether
+ * a call to seg_pinsert will fail and thereby avoid wasteful pre-processing.
+ */
+
+int
+seg_pinsert_check(struct seg *seg, size_t len, uint_t flags)
+{
+ struct seg_phash *hp;
+
+ if (seg_plazy == 0) {
+ return (SEGP_FAIL);
+ }
+ if (seg_pdisable != 0) {
+ return (SEGP_FAIL);
+ }
+ ASSERT((len & PAGEOFFSET) == 0);
+ hp = &p_hashtab[p_hash(seg)];
+ if (hp->p_qlen > seg_pmaxqlen && (flags & SEGP_FORCE_WIRED) == 0) {
+ return (SEGP_FAIL);
+ }
+ /*
+ * If the SEGP_FORCE_WIRED flag is set,
+ * we skip the check for seg_pwindow.
+ */
+ if ((flags & SEGP_FORCE_WIRED) == 0) {
+ pgcnt_t npages;
+
+ npages = len >> PAGESHIFT;
+ if ((seg_plocked_window + npages) > seg_pwindow) {
+ return (SEGP_FAIL);
+ }
+ }
+ return (SEGP_SUCCESS);
+}
+
+
+/*
+ * insert address range with shadow list into pagelock cache. If
+ * the cache is off or caching is temporarily disabled or the allowed
+ * 'window' is exceeded - return SEGP_FAIL. Otherwise return
+ * SEGP_SUCCESS.
+ */
+int
+seg_pinsert(struct seg *seg, caddr_t addr, size_t len, struct page **pp,
+ enum seg_rw rw, uint_t flags, int (*callback)(struct seg *, caddr_t,
+ size_t, struct page **, enum seg_rw))
+{
+ struct seg_pcache *pcp;
+ struct seg_phash *hp;
+ pgcnt_t npages;
+
+ if (seg_plazy == 0) {
+ return (SEGP_FAIL);
+ }
+ if (seg_pdisable != 0) {
+ return (SEGP_FAIL);
+ }
+ ASSERT((len & PAGEOFFSET) == 0);
+ hp = &p_hashtab[p_hash(seg)];
+ if (hp->p_qlen > seg_pmaxqlen && (flags & SEGP_FORCE_WIRED) == 0) {
+ return (SEGP_FAIL);
+ }
+ npages = len >> PAGESHIFT;
+ mutex_enter(&seg_pmem);
+ /*
+ * If the SEGP_FORCE_WIRED flag is set,
+ * we skip the check for seg_pwindow.
+ */
+ if ((flags & SEGP_FORCE_WIRED) == 0) {
+ seg_plocked_window += npages;
+ if (seg_plocked_window > seg_pwindow) {
+ seg_plocked_window -= npages;
+ mutex_exit(&seg_pmem);
+ return (SEGP_FAIL);
+ }
+ }
+ seg_plocked += npages;
+ mutex_exit(&seg_pmem);
+
+ pcp = kmem_alloc(sizeof (struct seg_pcache), KM_SLEEP);
+ pcp->p_seg = seg;
+ pcp->p_addr = addr;
+ pcp->p_len = len;
+ pcp->p_pp = pp;
+ pcp->p_rw = rw;
+ pcp->p_callback = callback;
+ pcp->p_active = 1;
+ pcp->p_flags = flags;
+
+ PPRINT4("seg_pinsert: seg %p, addr %p, len %lx, pplist %p\n",
+ (void *)seg, (void *)addr, len, (void *)pp);
+
+ hp = &p_hashtab[p_hash(seg)];
+ mutex_enter(&hp->p_hmutex);
+ hp->p_qlen++;
+ pcp->p_hnext = hp->p_hnext;
+ pcp->p_hprev = (struct seg_pcache *)hp;
+ hp->p_hnext->p_hprev = pcp;
+ hp->p_hnext = pcp;
+ mutex_exit(&hp->p_hmutex);
+ return (SEGP_SUCCESS);
+}
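+
+/*
+ * Editor's illustrative sketch (not part of the original source): a
+ * segment driver's pagelock operation typically uses the cache like
+ * this.  On lock it consults the cache first and only builds a new
+ * shadow list on a miss; on unlock it hands the list back through
+ * seg_pinactive().  "drv_reclaim" is a hypothetical name for the
+ * driver's callback that unlocks the pages and frees the shadow list.
+ *
+ *	L_PAGELOCK:
+ *		pplist = seg_plookup(seg, addr, len, rw);
+ *		if (pplist != NULL) { *ppp = pplist; return (0); }
+ *		if (seg_pinsert_check(seg, len, 0) == SEGP_FAIL)
+ *			... fall back to the uncached path ...
+ *		... page_lock() the range and build pplist ...
+ *		(void) seg_pinsert(seg, addr, len, pplist, rw, 0,
+ *		    drv_reclaim);
+ *
+ *	L_PAGEUNLOCK:
+ *		seg_pinactive(seg, addr, len, *ppp, rw, drv_reclaim);
+ */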
+
+/*
+ * purge all entries from the pagelock cache if not active
+ * and not recently used. Drop all locks and call through
+ * the address space into the segment driver to reclaim
+ * the pages. This makes sure we get the address space
+ * and segment driver locking right.
+ */
+static void
+seg_ppurge_all(int force)
+{
+ struct seg_pcache *delcallb_list = NULL;
+ struct seg_pcache *pcp;
+ struct seg_phash *hp;
+ int purge_count = 0;
+ pgcnt_t npages = 0;
+ pgcnt_t npages_window = 0;
+
+ /*
+	 * if the cache is off or empty, return
+ */
+ if (seg_plazy == 0 || seg_plocked == 0) {
+ return;
+ }
+ for (hp = p_hashtab; hp < &p_hashtab[p_hashsize]; hp++) {
+ mutex_enter(&hp->p_hmutex);
+ pcp = hp->p_hnext;
+
+ /*
+ * While 'force' is set, seg_pasync_thread is not
+ * throttled. This is to speedup flushing of seg_pcache
+ * in preparation for DR.
+ *
+		 * In the normal case, when 'force' is not set, we throttle
+		 * seg_pasync_thread so that we don't spend all the
+		 * time purging the cache.
+ */
+ while ((pcp != (struct seg_pcache *)hp) &&
+ (force || (purge_count <= seg_ppcount))) {
+
+ /*
+ * purge entries which are not active and
+ * have not been used recently and
+ * have the SEGP_ASYNC_FLUSH flag.
+ *
+ * In the 'force' case, we ignore the
+ * SEGP_ASYNC_FLUSH flag.
+ */
+ if (!(pcp->p_flags & SEGP_ASYNC_FLUSH))
+ pcp->p_ref = 1;
+ if (force)
+ pcp->p_ref = 0;
+ if (!pcp->p_ref && !pcp->p_active) {
+ struct as *as = pcp->p_seg->s_as;
+
+ /*
+ * try to get the readers lock on the address
+ * space before taking out the cache element.
+ * This ensures as_pagereclaim() can actually
+ * call through the address space and free
+ * the pages. If we don't get the lock, just
+ * skip this entry. The pages will be reclaimed
+ * by the segment driver at unmap time.
+ */
+ if (AS_LOCK_TRYENTER(as, &as->a_lock,
+ RW_READER)) {
+ hp->p_qlen--;
+ pcp->p_hprev->p_hnext = pcp->p_hnext;
+ pcp->p_hnext->p_hprev = pcp->p_hprev;
+ pcp->p_hprev = delcallb_list;
+ delcallb_list = pcp;
+ purge_count++;
+ }
+ } else {
+ pcp->p_ref = 0;
+ }
+ pcp = pcp->p_hnext;
+ }
+ mutex_exit(&hp->p_hmutex);
+ if (!force && purge_count > seg_ppcount)
+ break;
+ }
+
+ /*
+ * run the delayed callback list. We don't want to hold the
+ * cache lock during a call through the address space.
+ */
+ while (delcallb_list != NULL) {
+ struct as *as;
+
+ pcp = delcallb_list;
+ delcallb_list = pcp->p_hprev;
+ as = pcp->p_seg->s_as;
+
+ PPRINT4("seg_ppurge_all: purge seg %p, addr %p, len %lx, "
+ "pplist %p\n", (void *)pcp->p_seg, (void *)pcp->p_addr,
+ pcp->p_len, (void *)pcp->p_pp);
+
+ as_pagereclaim(as, pcp->p_pp, pcp->p_addr,
+ pcp->p_len, pcp->p_rw);
+ AS_LOCK_EXIT(as, &as->a_lock);
+ npages += pcp->p_len >> PAGESHIFT;
+ if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) {
+ npages_window += pcp->p_len >> PAGESHIFT;
+ }
+ kmem_free(pcp, sizeof (struct seg_pcache));
+ }
+ mutex_enter(&seg_pmem);
+ seg_plocked -= npages;
+ seg_plocked_window -= npages_window;
+ mutex_exit(&seg_pmem);
+}
+
+/*
+ * Remove cached pages for segment(s) entries from the hashtable.
+ * The segments are identified by a given client's callback
+ * function.
+ * This is useful for multiple segs cached on behalf of a
+ * dummy segment (ISM/DISM) with a common callback function.
+ * The client's callback function may return status indicating
+ * that the last seg's entry has been purged. In such a case
+ * seg_ppurge_seg() stops searching the hashtable and exits.
+ * Otherwise all hashtable entries are scanned.
+ */
+void
+seg_ppurge_seg(int (*callback)(struct seg *, caddr_t, size_t,
+ struct page **, enum seg_rw))
+{
+ struct seg_pcache *pcp, *npcp;
+ struct seg_phash *hp;
+ pgcnt_t npages = 0;
+ pgcnt_t npages_window = 0;
+ int done = 0;
+
+ /*
+	 * if the cache is off or empty, return
+ */
+ if (seg_plazy == 0 || seg_plocked == 0) {
+ return;
+ }
+ mutex_enter(&seg_pcache);
+ seg_pdisable++;
+ mutex_exit(&seg_pcache);
+
+ for (hp = p_hashtab; hp < &p_hashtab[p_hashsize]; hp++) {
+
+ mutex_enter(&hp->p_hmutex);
+ pcp = hp->p_hnext;
+ while (pcp != (struct seg_pcache *)hp) {
+
+ /*
+ * purge entries which are not active
+ */
+ npcp = pcp->p_hnext;
+ if (!pcp->p_active && pcp->p_callback == callback) {
+ hp->p_qlen--;
+ pcp->p_hprev->p_hnext = pcp->p_hnext;
+ pcp->p_hnext->p_hprev = pcp->p_hprev;
+
+ if ((*pcp->p_callback)(pcp->p_seg, pcp->p_addr,
+ pcp->p_len, pcp->p_pp, pcp->p_rw)) {
+ done = 1;
+ }
+
+ npages += pcp->p_len >> PAGESHIFT;
+ if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) {
+ npages_window +=
+ pcp->p_len >> PAGESHIFT;
+ }
+ kmem_free(pcp, sizeof (struct seg_pcache));
+ }
+ pcp = npcp;
+ if (done)
+ break;
+ }
+ mutex_exit(&hp->p_hmutex);
+ if (done)
+ break;
+ }
+
+ mutex_enter(&seg_pcache);
+ seg_pdisable--;
+ mutex_exit(&seg_pcache);
+
+ mutex_enter(&seg_pmem);
+ seg_plocked -= npages;
+ seg_plocked_window -= npages_window;
+ mutex_exit(&seg_pmem);
+}
+
+/*
+ * purge all entries for a given segment. Since we
+ * callback into the segment driver directly for page
+ * reclaim the caller needs to hold the right locks.
+ */
+void
+seg_ppurge(struct seg *seg)
+{
+ struct seg_pcache *delcallb_list = NULL;
+ struct seg_pcache *pcp;
+ struct seg_phash *hp;
+ pgcnt_t npages = 0;
+ pgcnt_t npages_window = 0;
+
+ if (seg_plazy == 0) {
+ return;
+ }
+ hp = &p_hashtab[p_hash(seg)];
+ mutex_enter(&hp->p_hmutex);
+ pcp = hp->p_hnext;
+ while (pcp != (struct seg_pcache *)hp) {
+ if (pcp->p_seg == seg) {
+ if (pcp->p_active) {
+ break;
+ }
+ hp->p_qlen--;
+ pcp->p_hprev->p_hnext = pcp->p_hnext;
+ pcp->p_hnext->p_hprev = pcp->p_hprev;
+ pcp->p_hprev = delcallb_list;
+ delcallb_list = pcp;
+ }
+ pcp = pcp->p_hnext;
+ }
+ mutex_exit(&hp->p_hmutex);
+ while (delcallb_list != NULL) {
+ pcp = delcallb_list;
+ delcallb_list = pcp->p_hprev;
+
+ PPRINT4("seg_ppurge: purge seg %p, addr %p, len %lx, "
+ "pplist %p\n", (void *)seg, (void *)pcp->p_addr,
+ pcp->p_len, (void *)pcp->p_pp);
+
+ ASSERT(seg == pcp->p_seg);
+ (void) (*pcp->p_callback)(seg, pcp->p_addr,
+ pcp->p_len, pcp->p_pp, pcp->p_rw);
+ npages += pcp->p_len >> PAGESHIFT;
+ if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) {
+ npages_window += pcp->p_len >> PAGESHIFT;
+ }
+ kmem_free(pcp, sizeof (struct seg_pcache));
+ }
+ mutex_enter(&seg_pmem);
+ seg_plocked -= npages;
+ seg_plocked_window -= npages_window;
+ mutex_exit(&seg_pmem);
+}
+
+static void seg_pinit_mem_config(void);
+
+/*
+ * setup the pagelock cache
+ */
+static void
+seg_pinit(void)
+{
+ struct seg_phash *hp;
+ int i;
+ uint_t physmegs;
+
+ sema_init(&seg_psaync_sem, 0, NULL, SEMA_DEFAULT, NULL);
+
+ mutex_enter(&seg_pcache);
+ if (p_hashtab == NULL) {
+ physmegs = physmem >> (20 - PAGESHIFT);
+
+ /* If p_hashsize was not set in /etc/system ... */
+ if (p_hashsize == 0) {
+ /*
+ * Choose p_hashsize based on physmem.
+ */
+ if (physmegs < 64) {
+ p_hashsize = 64;
+ } else if (physmegs < 1024) {
+ p_hashsize = 1024;
+ } else if (physmegs < 10 * 1024) {
+ p_hashsize = 8192;
+ } else if (physmegs < 20 * 1024) {
+ p_hashsize = 2 * 8192;
+ seg_pmaxqlen = 16;
+ } else {
+ p_hashsize = 128 * 1024;
+ seg_pmaxqlen = 128;
+ }
+ }
+
+ p_hashtab = kmem_zalloc(
+ p_hashsize * sizeof (struct seg_phash), KM_SLEEP);
+ for (i = 0; i < p_hashsize; i++) {
+ hp = (struct seg_phash *)&p_hashtab[i];
+ hp->p_hnext = (struct seg_pcache *)hp;
+ hp->p_hprev = (struct seg_pcache *)hp;
+ mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
+ }
+ if (seg_pwindow == 0) {
+ if (physmegs < 24) {
+ /* don't use cache */
+ seg_plazy = 0;
+ } else if (physmegs < 64) {
+ seg_pwindow = physmem >> 5; /* 3% of memory */
+ } else if (physmegs < 10 * 1024) {
+ seg_pwindow = physmem >> 3; /* 12% of memory */
+ } else {
+ seg_pwindow = physmem >> 1;
+ }
+ }
+ }
+ mutex_exit(&seg_pcache);
+
+ seg_pinit_mem_config();
+}
+
+/*
+ * called by pageout if memory is low
+ */
+void
+seg_preap(void)
+{
+ /*
+	 * if the cache is off or empty, return
+ */
+ if (seg_plocked == 0 || seg_plazy == 0) {
+ return;
+ }
+ sema_v(&seg_psaync_sem);
+}
+
+static void seg_pupdate(void *);
+
+/*
+ * run as a background thread and reclaim pagelock
+ * pages which have not been used recently
+ */
+void
+seg_pasync_thread(void)
+{
+ callb_cpr_t cpr_info;
+ kmutex_t pasync_lock; /* just for CPR stuff */
+
+ mutex_init(&pasync_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ CALLB_CPR_INIT(&cpr_info, &pasync_lock,
+ callb_generic_cpr, "seg_pasync");
+
+ if (seg_preap_interval == 0) {
+ seg_preap_interval = seg_preap_time * hz;
+ } else {
+ seg_preap_interval *= hz;
+ }
+ if (seg_plazy && seg_pupdate_active) {
+ (void) timeout(seg_pupdate, NULL, seg_preap_interval);
+ }
+
+ for (;;) {
+ mutex_enter(&pasync_lock);
+ CALLB_CPR_SAFE_BEGIN(&cpr_info);
+ mutex_exit(&pasync_lock);
+ sema_p(&seg_psaync_sem);
+ mutex_enter(&pasync_lock);
+ CALLB_CPR_SAFE_END(&cpr_info, &pasync_lock);
+ mutex_exit(&pasync_lock);
+
+ seg_ppurge_all(0);
+ }
+}
+
+static void
+seg_pupdate(void *dummy)
+{
+ sema_v(&seg_psaync_sem);
+
+ if (seg_plazy && seg_pupdate_active) {
+ (void) timeout(seg_pupdate, dummy, seg_preap_interval);
+ }
+}
+
+static struct kmem_cache *seg_cache;
+
+/*
+ * Initialize segment management data structures.
+ */
+void
+seg_init(void)
+{
+ kstat_t *ksp;
+
+ seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg),
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+ ksp = kstat_create("unix", 0, "segadvstat", "vm", KSTAT_TYPE_NAMED,
+ segadvstat_ndata, KSTAT_FLAG_VIRTUAL);
+ if (ksp) {
+ ksp->ks_data = (void *)segadvstat_ptr;
+ kstat_install(ksp);
+ }
+
+ seg_pinit();
+}
+
+/*
+ * Allocate a segment to cover [base, base+size]
+ * and attach it to the specified address space.
+ */
+struct seg *
+seg_alloc(struct as *as, caddr_t base, size_t size)
+{
+ struct seg *new;
+ caddr_t segbase;
+ size_t segsize;
+
+ segbase = (caddr_t)((uintptr_t)base & (uintptr_t)PAGEMASK);
+ segsize = (((uintptr_t)(base + size) + PAGEOFFSET) & PAGEMASK) -
+ (uintptr_t)segbase;
+
+ if (!valid_va_range(&segbase, &segsize, segsize, AH_LO))
+ return ((struct seg *)NULL); /* bad virtual addr range */
+
+ if (as != &kas &&
+ valid_usr_range(segbase, segsize, 0, as,
+ as->a_userlimit) != RANGE_OKAY)
+ return ((struct seg *)NULL); /* bad virtual addr range */
+
+ new = kmem_cache_alloc(seg_cache, KM_SLEEP);
+ new->s_ops = NULL;
+ new->s_data = NULL;
+ new->s_szc = 0;
+ new->s_flags = 0;
+ if (seg_attach(as, segbase, segsize, new) < 0) {
+ kmem_cache_free(seg_cache, new);
+ return ((struct seg *)NULL);
+ }
+ /* caller must fill in ops, data */
+ return (new);
+}
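+
+/*
+ * A minimal sketch of a caller (the segfoo names are hypothetical; per
+ * the comment above, the caller supplies the ops and private data):
+ *
+ *	struct seg *seg;
+ *
+ *	if ((seg = seg_alloc(as, addr, len)) == NULL)
+ *		return (ENOMEM);		(bad range or overlap)
+ *	seg->s_ops = &segfoo_ops;		(driver ops vector)
+ *	seg->s_data = segfoo_create(seg);	(driver private data)
+ *
+ * If the driver's setup fails at this point, seg_free() (not
+ * seg_unmap()) undoes the attach, as described below.
+ */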
+
+/*
+ * Attach a segment to the address space. Used by seg_alloc()
+ * and for kernel startup to attach to static segments.
+ */
+int
+seg_attach(struct as *as, caddr_t base, size_t size, struct seg *seg)
+{
+ seg->s_as = as;
+ seg->s_base = base;
+ seg->s_size = size;
+
+ /*
+	 * as_addseg() will add the segment at the appropriate point
+ * in the list. It will return -1 if there is overlap with
+ * an already existing segment.
+ */
+ return (as_addseg(as, seg));
+}
+
+/*
+ * Unmap a segment and free it from its associated address space.
+ * This should be called by anybody who's finished with a whole segment's
+ * mapping.  Just calls SEGOP_UNMAP() on the whole mapping.  It is the
+ * responsibility of the segment driver to unlink the segment
+ * from the address space, and to free public and private data structures
+ * associated with the segment. (This is typically done by a call to
+ * seg_free()).
+ */
+void
+seg_unmap(struct seg *seg)
+{
+#ifdef DEBUG
+ int ret;
+#endif /* DEBUG */
+
+ ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ /* Shouldn't have called seg_unmap if mapping isn't yet established */
+ ASSERT(seg->s_data != NULL);
+
+ /* Unmap the whole mapping */
+#ifdef DEBUG
+ ret = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
+ ASSERT(ret == 0);
+#else
+ SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
+#endif /* DEBUG */
+}
+
+/*
+ * Free the segment from its associated as. This should only be called
+ * if a mapping to the segment has not yet been established (e.g., if
+ * an error occurs in the middle of doing an as_map when the segment
+ * has already been partially set up) or if it has already been deleted
+ * (e.g., from a segment driver unmap routine if the unmap applies to the
+ * entire segment). If the mapping is currently set up then seg_unmap() should
+ * be called instead.
+ */
+void
+seg_free(struct seg *seg)
+{
+ register struct as *as = seg->s_as;
+ struct seg *tseg = as_removeseg(as, seg);
+
+ ASSERT(tseg == seg);
+
+ /*
+ * If the segment private data field is NULL,
+ * then segment driver is not attached yet.
+ */
+ if (seg->s_data != NULL)
+ SEGOP_FREE(seg);
+
+ kmem_cache_free(seg_cache, seg);
+}
+
+/*ARGSUSED*/
+static void
+seg_p_mem_config_post_add(
+ void *arg,
+ pgcnt_t delta_pages)
+{
+ /* Nothing to do. */
+}
+
+/*
+ * Attempt to purge seg_pcache. May need to return before this has
+ * completed to allow other pre_del callbacks to unlock pages. This is
+ * ok because:
+ * 1) The seg_pdisable flag has been set so at least we won't
+ *	cache any more locks, and the locks we couldn't purge
+ * will not be held if they do get released by a subsequent
+ * pre-delete callback.
+ *
+ * 2) The rest of the memory delete thread processing does not
+ * depend on the changes made in this pre-delete callback. No
+ * panics will result, the worst that will happen is that the
+ * DR code will timeout and cancel the delete.
+ */
+/*ARGSUSED*/
+static int
+seg_p_mem_config_pre_del(
+ void *arg,
+ pgcnt_t delta_pages)
+{
+ pgcnt_t old_plocked;
+ int stall_count = 0;
+
+ mutex_enter(&seg_pcache);
+ seg_pdisable++;
+ ASSERT(seg_pdisable != 0);
+ mutex_exit(&seg_pcache);
+
+ /*
+ * Attempt to empty the cache. Terminate if seg_plocked does not
+ * diminish with SEGP_STALL_THRESHOLD consecutive attempts.
+ */
+ while (seg_plocked != 0) {
+ old_plocked = seg_plocked;
+ seg_ppurge_all(1);
+ if (seg_plocked == old_plocked) {
+ if (stall_count++ > SEGP_STALL_THRESHOLD) {
+ cmn_err(CE_NOTE, "!Pre-delete couldn't purge"
+ " pagelock cache - continuing");
+ break;
+ }
+ } else
+ stall_count = 0;
+ if (seg_plocked != 0)
+ delay(hz/SEGP_PREDEL_DELAY_FACTOR);
+ }
+ return (0);
+}
+
+/*ARGSUSED*/
+static void
+seg_p_mem_config_post_del(
+ void *arg,
+ pgcnt_t delta_pages,
+ int cancelled)
+{
+ mutex_enter(&seg_pcache);
+ ASSERT(seg_pdisable != 0);
+ seg_pdisable--;
+ mutex_exit(&seg_pcache);
+}
+
+static kphysm_setup_vector_t seg_p_mem_config_vec = {
+ KPHYSM_SETUP_VECTOR_VERSION,
+ seg_p_mem_config_post_add,
+ seg_p_mem_config_pre_del,
+ seg_p_mem_config_post_del,
+};
+
+static void
+seg_pinit_mem_config(void)
+{
+ int ret;
+
+ ret = kphysm_setup_func_register(&seg_p_mem_config_vec, (void *)NULL);
+ /*
+	 * Want to catch this in the debug kernel.  At run time, if the
+	 * callbacks don't get run, all will still be OK; the disable just
+	 * makes it more likely that the pages can be collected.
+ */
+ ASSERT(ret == 0);
+}
diff --git a/usr/src/uts/common/vm/vm_swap.c b/usr/src/uts/common/vm/vm_swap.c
new file mode 100644
index 0000000000..d7028b6f29
--- /dev/null
+++ b/usr/src/uts/common/vm/vm_swap.c
@@ -0,0 +1,1590 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * Each physical swap area has an associated bitmap representing
+ * its physical storage. The bitmap records which swap slots are
+ * currently allocated or freed. Allocation is done by searching
+ * through the bitmap for the first free slot. Thus, there's
+ * no linear relation between offset within the swap device and the
+ * address (within its segment(s)) of the page that the slot backs;
+ * instead, it's an arbitrary one-to-one mapping.
+ *
+ * Associated with each swap area is a swapinfo structure. These
+ * structures are linked into a linear list that determines the
+ * ordering of swap areas in the logical swap device. Each contains a
+ * pointer to the corresponding bitmap, the area's size, and its
+ * associated vnode.
+ */
+
+#include <sys/types.h>
+#include <sys/inttypes.h>
+#include <sys/param.h>
+#include <sys/t_lock.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/kmem.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/pathname.h>
+#include <sys/cmn_err.h>
+#include <sys/vtrace.h>
+#include <sys/swap.h>
+#include <sys/dumphdr.h>
+#include <sys/debug.h>
+#include <sys/fs/snode.h>
+#include <sys/fs/swapnode.h>
+#include <sys/policy.h>
+#include <sys/zone.h>
+
+#include <vm/as.h>
+#include <vm/seg.h>
+#include <vm/page.h>
+#include <vm/seg_vn.h>
+#include <vm/hat.h>
+#include <vm/anon.h>
+#include <vm/seg_map.h>
+
+/*
+ * To balance the load among multiple swap areas, we don't allow
+ * more than swap_maxcontig allocations to be satisfied from a
+ * single swap area before moving on to the next swap area. This
+ * effectively "interleaves" allocations among the many swap areas.
+ */
+int swap_maxcontig; /* set by anon_init() to 1 Mb */
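+
+/*
+ * For example, assuming 4K pages, the 1 Mb default set by anon_init()
+ * is 256 pages; once 256 pages worth of allocations have been satisfied
+ * from one swap area, swap_phys_alloc() resets si_allocs and advances
+ * silast to the next area, interleaving the load.
+ */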
+
+#define MINIROOTSIZE 12000 /* ~6 Meg XXX */
+
+/*
+ * XXX - this lock is a kludge. It serializes some aspects of swapadd() and
+ * swapdel() (namely VOP_OPEN, VOP_CLOSE, VN_RELE). It protects against
+ * somebody swapadd'ing and getting swap slots from a vnode, while someone
+ * else is in the process of closing or rele'ing it.
+ */
+static kmutex_t swap_lock;
+
+kmutex_t swapinfo_lock;
+
+/*
+ * protected by the swapinfo_lock
+ */
+struct swapinfo *swapinfo;
+
+static struct swapinfo *silast;
+static int nswapfiles;
+
+static u_offset_t swap_getoff(struct swapinfo *);
+static int swapadd(struct vnode *, ulong_t, ulong_t, char *);
+static int swapdel(struct vnode *, ulong_t);
+static int swapslot_free(struct vnode *, u_offset_t, struct swapinfo *);
+
+/*
+ * swap device bitmap allocation macros
+ */
+#define MAPSHIFT 5
+#define NBBW (NBPW * NBBY) /* number of bits per word */
+#define TESTBIT(map, i) (((map)[(i) >> MAPSHIFT] & (1 << (i) % NBBW)))
+#define SETBIT(map, i) (((map)[(i) >> MAPSHIFT] |= (1 << (i) % NBBW)))
+#define CLEARBIT(map, i) (((map)[(i) >> MAPSHIFT] &= ~(1 << (i) % NBBW)))
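+
+/*
+ * The macros assume 32-bit map words (MAPSHIFT == 5, i.e. NBBW == 32),
+ * so slot i lives in word i >> 5 at bit i % 32.  For example, slot 70
+ * (70 >> 5 == 2, 70 % 32 == 6):
+ *
+ *	SETBIT(map, 70)		does	map[2] |=  (1 << 6)
+ *	TESTBIT(map, 70)	tests	map[2] &   (1 << 6)
+ *	CLEARBIT(map, 70)	does	map[2] &= ~(1 << 6)
+ */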
+
+int swap_debug = 0; /* set for debug printf's */
+int swap_verify = 0; /* set to verify slots when freeing and allocating */
+
+uint_t swapalloc_maxcontig;
+
+/*
+ * Allocate a range of up to *lenp contiguous slots (pages) from a physical
+ * swap device.  Flags are one of:
+ * SA_NOT	Must have a slot from a physical swap device other than
+ *		the one containing input (*vpp, *offp).
+ * Fewer slots than requested may be returned.  *lenp allocated slots are
+ * returned starting at *offp on *vpp.
+ * Returns 1 for a successful allocation, 0 if no slots could be allocated.
+ */
+int
+swap_phys_alloc(
+ struct vnode **vpp,
+ u_offset_t *offp,
+ size_t *lenp,
+ uint_t flags)
+{
+ struct swapinfo *sip;
+ offset_t soff, noff;
+ size_t len;
+
+ mutex_enter(&swapinfo_lock);
+ sip = silast;
+
+ /* Find a desirable physical device and allocate from it. */
+ do {
+ if (sip == NULL)
+ break;
+ if (!(sip->si_flags & ST_INDEL) &&
+ (spgcnt_t)sip->si_nfpgs > 0) {
+ /* Caller wants other than specified swap device */
+ if (flags & SA_NOT) {
+ if (*vpp != sip->si_vp ||
+ *offp < sip->si_soff ||
+ *offp >= sip->si_eoff)
+ goto found;
+ /* Caller is loose, will take anything */
+ } else
+ goto found;
+ } else if (sip->si_nfpgs == 0)
+ sip->si_allocs = 0;
+ if ((sip = sip->si_next) == NULL)
+ sip = swapinfo;
+ } while (sip != silast);
+ mutex_exit(&swapinfo_lock);
+ return (0);
+found:
+ soff = swap_getoff(sip);
+ sip->si_nfpgs--;
+ if (soff == -1)
+ panic("swap_alloc: swap_getoff failed!");
+
+ for (len = PAGESIZE; len < *lenp; len += PAGESIZE) {
+ if (sip->si_nfpgs == 0)
+ break;
+ if (swapalloc_maxcontig && len >= swapalloc_maxcontig)
+ break;
+ noff = swap_getoff(sip);
+ if (noff == -1) {
+ break;
+ } else if (noff != soff + len) {
+ CLEARBIT(sip->si_swapslots, btop(noff - sip->si_soff));
+ break;
+ }
+ sip->si_nfpgs--;
+ }
+ *vpp = sip->si_vp;
+ *offp = soff;
+ *lenp = len;
+ ASSERT((spgcnt_t)sip->si_nfpgs >= 0);
+ sip->si_allocs += btop(len);
+ if (sip->si_allocs >= swap_maxcontig) {
+ sip->si_allocs = 0;
+ if ((silast = sip->si_next) == NULL)
+ silast = swapinfo;
+ }
+ TRACE_2(TR_FAC_VM, TR_SWAP_ALLOC,
+ "swap_alloc:sip %p offset %lx", sip, soff);
+ mutex_exit(&swapinfo_lock);
+ return (1);
+}
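+
+/*
+ * A minimal sketch of the calling convention (cf. swap_newphysname()
+ * below):
+ *
+ *	struct vnode *pvp;
+ *	u_offset_t poff;
+ *	size_t plen = 4 * PAGESIZE;	(ask for a four page kluster)
+ *
+ *	if (!swap_phys_alloc(&pvp, &poff, &plen, 0))
+ *		... no physical swap slots available ...
+ *
+ * On success plen may have shrunk but is at least PAGESIZE, and the
+ * slots [poff, poff + plen) on pvp are now marked allocated.
+ */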
+
+int swap_backsearch = 0;
+
+/*
+ * Get a free offset on swap device sip.
+ * Return >=0 offset if succeeded, -1 for failure.
+ */
+static u_offset_t
+swap_getoff(struct swapinfo *sip)
+{
+ uint_t *sp, *ep;
+ size_t aoff, boff, poff, slotnumber;
+
+ ASSERT(MUTEX_HELD(&swapinfo_lock));
+
+ sip->si_alloccnt++;
+ for (sp = &sip->si_swapslots[sip->si_hint >> MAPSHIFT],
+ ep = &sip->si_swapslots[sip->si_mapsize / NBPW]; sp < ep; sp++) {
+ if (*sp != (uint_t)0xffffffff)
+ goto foundentry;
+ else
+ sip->si_checkcnt++;
+ }
+ SWAP_PRINT(SW_ALLOC,
+ "swap_getoff: couldn't find slot from hint %ld to end\n",
+ sip->si_hint, 0, 0, 0, 0);
+ /*
+ * Go backwards? Check for faster method XXX
+ */
+ if (swap_backsearch) {
+ for (sp = &sip->si_swapslots[sip->si_hint >> MAPSHIFT],
+ ep = sip->si_swapslots; sp > ep; sp--) {
+ if (*sp != (uint_t)0xffffffff)
+ goto foundentry;
+ else
+ sip->si_checkcnt++;
+ }
+ } else {
+ for (sp = sip->si_swapslots,
+ ep = &sip->si_swapslots[sip->si_hint >> MAPSHIFT];
+ sp < ep; sp++) {
+ if (*sp != (uint_t)0xffffffff)
+ goto foundentry;
+ else
+ sip->si_checkcnt++;
+ }
+ }
+ if (*sp == 0xffffffff) {
+ cmn_err(CE_WARN, "No free swap slots!");
+ return ((u_offset_t)-1);
+ }
+
+foundentry:
+ /*
+ * aoff is the page number offset (in bytes) of the si_swapslots
+ * array element containing a free page
+ *
+ * boff is the page number offset of the free page
+ * (i.e. cleared bit) in si_swapslots[aoff].
+ */
+ aoff = ((char *)sp - (char *)sip->si_swapslots) * NBBY;
+
+ for (boff = (sip->si_hint % NBBW); boff < NBBW; boff++) {
+ if (!TESTBIT(sip->si_swapslots, aoff + boff))
+ goto foundslot;
+ else
+ sip->si_checkcnt++;
+ }
+ for (boff = 0; boff < (sip->si_hint % NBBW); boff++) {
+ if (!TESTBIT(sip->si_swapslots, aoff + boff))
+ goto foundslot;
+ else
+ sip->si_checkcnt++;
+ }
+ panic("swap_getoff: didn't find slot in word hint %ld", sip->si_hint);
+
+foundslot:
+ /*
+ * Return the offset of the free page in swap device.
+	 * Convert the page number to a byte offset and add the starting
+	 * offset of the swap device.
+ */
+ slotnumber = aoff + boff;
+ SWAP_PRINT(SW_ALLOC, "swap_getoff: allocating slot %ld\n",
+ slotnumber, 0, 0, 0, 0);
+ poff = ptob(slotnumber);
+ if (poff + sip->si_soff >= sip->si_eoff)
+ printf("ptob(aoff(%ld) + boff(%ld))(%ld) >= eoff(%ld)\n",
+ aoff, boff, ptob(slotnumber), (long)sip->si_eoff);
+ ASSERT(poff < sip->si_eoff);
+ /*
+ * We could verify here that the slot isn't already allocated
+ * by looking through all the anon slots.
+ */
+ SETBIT(sip->si_swapslots, slotnumber);
+ sip->si_hint = slotnumber + 1; /* hint = next slot */
+ return (poff + sip->si_soff);
+}
+
+/*
+ * Free a swap page.
+ */
+void
+swap_phys_free(struct vnode *vp, u_offset_t off, size_t len)
+{
+ struct swapinfo *sip;
+ ssize_t pagenumber, npage;
+
+ mutex_enter(&swapinfo_lock);
+ sip = swapinfo;
+
+ do {
+ if (sip->si_vp == vp &&
+ sip->si_soff <= off && off < sip->si_eoff) {
+ for (pagenumber = btop(off - sip->si_soff),
+ npage = btop(len) + pagenumber;
+ pagenumber < npage; pagenumber++) {
+ SWAP_PRINT(SW_ALLOC,
+ "swap_phys_free: freeing slot %ld on "
+ "sip %p\n",
+ pagenumber, sip, 0, 0, 0);
+ if (!TESTBIT(sip->si_swapslots, pagenumber)) {
+ panic(
+ "swap_phys_free: freeing free slot "
+ "%p,%lx\n", (void *)vp,
+ ptob(pagenumber) + sip->si_soff);
+ }
+ CLEARBIT(sip->si_swapslots, pagenumber);
+ sip->si_nfpgs++;
+ }
+ ASSERT(sip->si_nfpgs <= sip->si_npgs);
+ mutex_exit(&swapinfo_lock);
+ return;
+ }
+ } while ((sip = sip->si_next) != NULL);
+ panic("swap_phys_free");
+ /*NOTREACHED*/
+}
+
+/*
+ * Return the anon struct corresponding to the given
+ * <vnode, off> if it is part of the virtual swap device.
+ * Returns the anon struct if found, otherwise NULL.
+ */
+struct anon *
+swap_anon(struct vnode *vp, u_offset_t off)
+{
+ struct anon *ap;
+
+ ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(vp, off)]));
+
+ for (ap = anon_hash[ANON_HASH(vp, off)]; ap != NULL; ap = ap->an_hash) {
+ if (ap->an_vp == vp && ap->an_off == off)
+ return (ap);
+ }
+ return (NULL);
+}
+
+
+/*
+ * Determine if the vp offset range overlaps a swap device.
+ */
+int
+swap_in_range(struct vnode *vp, u_offset_t offset, size_t len)
+{
+ struct swapinfo *sip;
+ u_offset_t eoff;
+
+ eoff = offset + len;
+ ASSERT(eoff > offset);
+
+ mutex_enter(&swapinfo_lock);
+ sip = swapinfo;
+ if (vp && sip) {
+ do {
+ if (vp != sip->si_vp || eoff <= sip->si_soff ||
+ offset >= sip->si_eoff)
+ continue;
+ mutex_exit(&swapinfo_lock);
+ return (1);
+ } while ((sip = sip->si_next) != NULL);
+ }
+ mutex_exit(&swapinfo_lock);
+ return (0);
+}
+
+/*
+ * See if name is one of our swap files
+ * even though lookupname failed.
+ * This can be used by swapdel to delete
+ * swap resources on remote machines
+ * where the link has gone down.
+ */
+static struct vnode *
+swapdel_byname(
+ char *name, /* pathname to delete */
+ ulong_t lowblk) /* Low block number of area to delete */
+{
+ struct swapinfo **sipp, *osip;
+ u_offset_t soff;
+
+ /*
+ * Find the swap file entry for the file to
+ * be deleted. Skip any entries that are in
+ * transition.
+ */
+
+ soff = ptob(btopr(lowblk << SCTRSHFT)); /* must be page aligned */
+
+ mutex_enter(&swapinfo_lock);
+ for (sipp = &swapinfo; (osip = *sipp) != NULL; sipp = &osip->si_next) {
+ if ((strcmp(osip->si_pname, name) == 0) &&
+ (osip->si_soff == soff) && (osip->si_flags == 0)) {
+ struct vnode *vp = osip->si_vp;
+
+ VN_HOLD(vp);
+ mutex_exit(&swapinfo_lock);
+ return (vp);
+ }
+ }
+ mutex_exit(&swapinfo_lock);
+ return (NULL);
+}
+
+
+/*
+ * New system call to manipulate swap files.
+ */
+int
+swapctl(int sc_cmd, void *sc_arg, int *rv)
+{
+ struct swapinfo *sip, *csip, *tsip;
+ int error = 0;
+ struct swapent st, *ust;
+ struct swapres sr;
+ struct vnode *vp;
+ int cnt = 0;
+ int tmp_nswapfiles;
+ int nswap;
+ int length, nlen;
+ int gplen = 0, plen;
+ char *swapname;
+ char *pname;
+ char *tpname;
+ struct anoninfo ai;
+ spgcnt_t avail;
+ int global = INGLOBALZONE(curproc);
+
+ /*
+	 * When running in a zone we want to hide the details of the swap
+	 * devices: we report only a single swap device named "swap" whose
+	 * size equals the sum of the sizes of all real swap devices on the
+	 * system.
+ */
+ switch (sc_cmd) {
+ case SC_GETNSWP:
+ if (global)
+ *rv = nswapfiles;
+ else
+ *rv = 1;
+ return (0);
+
+ case SC_AINFO:
+ /*
+ * Return anoninfo information with these changes:
+ * ani_max = maximum amount of swap space
+ * (including potentially available physical memory)
+ * ani_free = amount of unallocated anonymous memory
+ * (some of which might be reserved and including
+ * potentially available physical memory)
+ * ani_resv = amount of claimed (reserved) anonymous memory
+ */
+ avail = MAX((spgcnt_t)(availrmem - swapfs_minfree), 0);
+ ai.ani_max = (k_anoninfo.ani_max +
+ k_anoninfo.ani_mem_resv) +avail;
+
+ ai.ani_free = k_anoninfo.ani_free + avail;
+
+ ai.ani_resv = k_anoninfo.ani_phys_resv +
+ k_anoninfo.ani_mem_resv;
+
+ if (copyout(&ai, sc_arg, sizeof (struct anoninfo)) != 0)
+ return (EFAULT);
+ return (0);
+
+ case SC_LIST:
+ if (copyin(sc_arg, &length, sizeof (int)) != 0)
+ return (EFAULT);
+ if (!global) {
+ struct swapent st;
+ char *swappath = "swap";
+
+ if (length < 1)
+ return (ENOMEM);
+ ust = (swapent_t *)((swaptbl_t *)sc_arg)->swt_ent;
+ if (copyin(ust, &st, sizeof (swapent_t)) != 0)
+ return (EFAULT);
+ st.ste_start = PAGESIZE >> SCTRSHFT;
+ st.ste_length = (off_t)0;
+ st.ste_pages = 0;
+ st.ste_free = 0;
+ st.ste_flags = 0;
+ mutex_enter(&swapinfo_lock);
+ for (sip = swapinfo, nswap = 0;
+ sip != NULL && nswap < nswapfiles;
+ sip = sip->si_next, nswap++) {
+ st.ste_length +=
+ (sip->si_eoff - sip->si_soff) >> SCTRSHFT;
+ st.ste_pages += sip->si_npgs;
+ st.ste_free += sip->si_nfpgs;
+ }
+ mutex_exit(&swapinfo_lock);
+ if (copyout(&st, ust, sizeof (swapent_t)) != 0 ||
+ copyout(swappath, st.ste_path,
+ strlen(swappath) + 1) != 0) {
+ return (EFAULT);
+ }
+ *rv = 1;
+ return (0);
+ }
+beginning:
+ tmp_nswapfiles = nswapfiles;
+ /* Return an error if not enough space for the whole table. */
+ if (length < tmp_nswapfiles)
+ return (ENOMEM);
+ /*
+ * Get memory to hold the swap entries and their names. We'll
+ * copy the real entries into these and then copy these out.
+ * Allocating the pathname memory is only a guess so we may
+ * find that we need more and have to do it again.
+ * All this is because we have to hold the anon lock while
+ * traversing the swapinfo list, and we can't be doing copyouts
+ * and/or kmem_alloc()s during this.
+ */
+ csip = kmem_zalloc(tmp_nswapfiles * sizeof (struct swapinfo),
+ KM_SLEEP);
+retry:
+ nlen = tmp_nswapfiles * (gplen += 100);
+ pname = kmem_zalloc(nlen, KM_SLEEP);
+
+ mutex_enter(&swapinfo_lock);
+
+ if (tmp_nswapfiles != nswapfiles) {
+ mutex_exit(&swapinfo_lock);
+ kmem_free(pname, nlen);
+ kmem_free(csip,
+ tmp_nswapfiles * sizeof (struct swapinfo));
+ gplen = 0;
+ goto beginning;
+ }
+ for (sip = swapinfo, tsip = csip, tpname = pname, nswap = 0;
+ sip && nswap < tmp_nswapfiles;
+ sip = sip->si_next, tsip++, tpname += plen, nswap++) {
+ plen = sip->si_pnamelen;
+ if (tpname + plen - pname > nlen) {
+ mutex_exit(&swapinfo_lock);
+ kmem_free(pname, nlen);
+ goto retry;
+ }
+ *tsip = *sip;
+ tsip->si_pname = tpname;
+ (void) strcpy(tsip->si_pname, sip->si_pname);
+ }
+ mutex_exit(&swapinfo_lock);
+
+ if (sip) {
+ error = ENOMEM;
+ goto lout;
+ }
+ ust = (swapent_t *)((swaptbl_t *)sc_arg)->swt_ent;
+ for (tsip = csip, cnt = 0; cnt < nswap; tsip++, ust++, cnt++) {
+ if (copyin(ust, &st, sizeof (swapent_t)) != 0) {
+ error = EFAULT;
+ goto lout;
+ }
+ st.ste_flags = tsip->si_flags;
+ st.ste_length =
+ (tsip->si_eoff - tsip->si_soff) >> SCTRSHFT;
+ st.ste_start = tsip->si_soff >> SCTRSHFT;
+ st.ste_pages = tsip->si_npgs;
+ st.ste_free = tsip->si_nfpgs;
+ if (copyout(&st, ust, sizeof (swapent_t)) != 0) {
+ error = EFAULT;
+ goto lout;
+ }
+ if (!tsip->si_pnamelen)
+ continue;
+ if (copyout(tsip->si_pname, st.ste_path,
+ tsip->si_pnamelen) != 0) {
+ error = EFAULT;
+ goto lout;
+ }
+ }
+ *rv = nswap;
+lout:
+ kmem_free(csip, tmp_nswapfiles * sizeof (struct swapinfo));
+ kmem_free(pname, nlen);
+ return (error);
+
+ case SC_ADD:
+ case SC_REMOVE:
+ break;
+ default:
+ return (EINVAL);
+ }
+ if ((error = secpolicy_swapctl(CRED())) != 0)
+ return (error);
+
+ if (copyin(sc_arg, &sr, sizeof (swapres_t)))
+ return (EFAULT);
+
+ /* Allocate the space to read in pathname */
+ if ((swapname = kmem_alloc(MAXPATHLEN, KM_NOSLEEP)) == NULL)
+ return (ENOMEM);
+
+ error = copyinstr(sr.sr_name, swapname, MAXPATHLEN, 0);
+ if (error)
+ goto out;
+
+ error = lookupname(swapname, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
+ if (error) {
+ if (sc_cmd == SC_ADD)
+ goto out;
+ /* see if we match by name */
+ vp = swapdel_byname(swapname, (size_t)sr.sr_start);
+ if (vp == NULL)
+ goto out;
+ }
+
+ if (vp->v_flag & (VNOMAP | VNOSWAP)) {
+ VN_RELE(vp);
+ error = ENOSYS;
+ goto out;
+ }
+ switch (vp->v_type) {
+ case VBLK:
+ break;
+
+ case VREG:
+ if (vp->v_vfsp && vn_is_readonly(vp))
+ error = EROFS;
+ else
+ error = VOP_ACCESS(vp, VREAD|VWRITE, 0, CRED());
+ break;
+
+ case VDIR:
+ error = EISDIR;
+ break;
+ default:
+ error = ENOSYS;
+ break;
+ }
+ if (error == 0) {
+ if (sc_cmd == SC_REMOVE)
+ error = swapdel(vp, sr.sr_start);
+ else
+ error = swapadd(vp, sr.sr_start,
+ sr.sr_length, swapname);
+ }
+ VN_RELE(vp);
+out:
+ kmem_free(swapname, MAXPATHLEN);
+ return (error);
+}
+
+#if defined(_LP64) && defined(_SYSCALL32)
+
+int
+swapctl32(int sc_cmd, void *sc_arg, int *rv)
+{
+ struct swapinfo *sip, *csip, *tsip;
+ int error = 0;
+ struct swapent32 st, *ust;
+ struct swapres32 sr;
+ struct vnode *vp;
+ int cnt = 0;
+ int tmp_nswapfiles;
+ int nswap;
+ int length, nlen;
+ int gplen = 0, plen;
+ char *swapname;
+ char *pname;
+ char *tpname;
+ struct anoninfo32 ai;
+ size_t s;
+ spgcnt_t avail;
+
+ switch (sc_cmd) {
+ case SC_GETNSWP:
+ *rv = nswapfiles;
+ return (0);
+
+ case SC_AINFO:
+ /*
+ * Return anoninfo information with these changes:
+ * ani_max = maximum amount of swap space
+ * (including potentially available physical memory)
+ * ani_free = amount of unallocated anonymous memory
+ * (some of which might be reserved and including
+ * potentially available physical memory)
+ * ani_resv = amount of claimed (reserved) anonymous memory
+ */
+ avail = MAX((spgcnt_t)(availrmem - swapfs_minfree), 0);
+ s = (k_anoninfo.ani_max + k_anoninfo.ani_mem_resv) + avail;
+ if (s > UINT32_MAX)
+ return (EOVERFLOW);
+ ai.ani_max = s;
+
+ s = k_anoninfo.ani_free + avail;
+ if (s > UINT32_MAX)
+ return (EOVERFLOW);
+ ai.ani_free = s;
+
+ s = k_anoninfo.ani_phys_resv + k_anoninfo.ani_mem_resv;
+ if (s > UINT32_MAX)
+ return (EOVERFLOW);
+ ai.ani_resv = s;
+
+ if (copyout(&ai, sc_arg, sizeof (ai)) != 0)
+ return (EFAULT);
+ return (0);
+
+ case SC_LIST:
+ if (copyin(sc_arg, &length, sizeof (int32_t)) != 0)
+ return (EFAULT);
+beginning:
+ tmp_nswapfiles = nswapfiles;
+ /* Return an error if not enough space for the whole table. */
+ if (length < tmp_nswapfiles)
+ return (ENOMEM);
+ /*
+ * Get memory to hold the swap entries and their names. We'll
+ * copy the real entries into these and then copy these out.
+ * Allocating the pathname memory is only a guess so we may
+ * find that we need more and have to do it again.
+ * All this is because we have to hold the anon lock while
+ * traversing the swapinfo list, and we can't be doing copyouts
+ * and/or kmem_alloc()s during this.
+ */
+ csip = kmem_zalloc(tmp_nswapfiles * sizeof (*csip), KM_SLEEP);
+retry:
+ nlen = tmp_nswapfiles * (gplen += 100);
+ pname = kmem_zalloc(nlen, KM_SLEEP);
+
+ mutex_enter(&swapinfo_lock);
+
+ if (tmp_nswapfiles != nswapfiles) {
+ mutex_exit(&swapinfo_lock);
+ kmem_free(pname, nlen);
+ kmem_free(csip, tmp_nswapfiles * sizeof (*csip));
+ gplen = 0;
+ goto beginning;
+ }
+ for (sip = swapinfo, tsip = csip, tpname = pname, nswap = 0;
+ (sip != NULL) && (nswap < tmp_nswapfiles);
+ sip = sip->si_next, tsip++, tpname += plen, nswap++) {
+ plen = sip->si_pnamelen;
+ if (tpname + plen - pname > nlen) {
+ mutex_exit(&swapinfo_lock);
+ kmem_free(pname, nlen);
+ goto retry;
+ }
+ *tsip = *sip;
+ tsip->si_pname = tpname;
+ (void) strcpy(tsip->si_pname, sip->si_pname);
+ }
+ mutex_exit(&swapinfo_lock);
+
+ if (sip != NULL) {
+ error = ENOMEM;
+ goto lout;
+ }
+ ust = (swapent32_t *)((swaptbl32_t *)sc_arg)->swt_ent;
+ for (tsip = csip, cnt = 0; cnt < nswap; tsip++, ust++, cnt++) {
+ if (copyin(ust, &st, sizeof (*ust)) != 0) {
+ error = EFAULT;
+ goto lout;
+ }
+ st.ste_flags = tsip->si_flags;
+ st.ste_length =
+ (tsip->si_eoff - tsip->si_soff) >> SCTRSHFT;
+ st.ste_start = tsip->si_soff >> SCTRSHFT;
+ st.ste_pages = tsip->si_npgs;
+ st.ste_free = tsip->si_nfpgs;
+ if (copyout(&st, ust, sizeof (st)) != 0) {
+ error = EFAULT;
+ goto lout;
+ }
+ if (!tsip->si_pnamelen)
+ continue;
+ if (copyout(tsip->si_pname,
+ (caddr_t)(uintptr_t)st.ste_path,
+ tsip->si_pnamelen) != 0) {
+ error = EFAULT;
+ goto lout;
+ }
+ }
+ *rv = nswap;
+lout:
+ kmem_free(csip, tmp_nswapfiles * sizeof (*csip));
+ kmem_free(pname, nlen);
+ return (error);
+
+ case SC_ADD:
+ case SC_REMOVE:
+ break;
+ default:
+ return (EINVAL);
+ }
+ if ((error = secpolicy_swapctl(CRED())) != 0)
+ return (error);
+
+ if (copyin(sc_arg, &sr, sizeof (sr)))
+ return (EFAULT);
+
+ /* Allocate the space to read in pathname */
+ if ((swapname = kmem_alloc(MAXPATHLEN, KM_NOSLEEP)) == NULL)
+ return (ENOMEM);
+
+ error = copyinstr((caddr_t)(uintptr_t)sr.sr_name,
+ swapname, MAXPATHLEN, NULL);
+ if (error)
+ goto out;
+
+ error = lookupname(swapname, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
+ if (error) {
+ if (sc_cmd == SC_ADD)
+ goto out;
+ /* see if we match by name */
+ vp = swapdel_byname(swapname, (uint_t)sr.sr_start);
+ if (vp == NULL)
+ goto out;
+ }
+
+ if (vp->v_flag & (VNOMAP | VNOSWAP)) {
+ VN_RELE(vp);
+ error = ENOSYS;
+ goto out;
+ }
+ switch (vp->v_type) {
+ case VBLK:
+ break;
+
+ case VREG:
+ if (vp->v_vfsp && vn_is_readonly(vp))
+ error = EROFS;
+ else
+ error = VOP_ACCESS(vp, VREAD|VWRITE, 0, CRED());
+ break;
+
+ case VDIR:
+ error = EISDIR;
+ break;
+ default:
+ error = ENOSYS;
+ break;
+ }
+ if (error == 0) {
+ if (sc_cmd == SC_REMOVE)
+ error = swapdel(vp, sr.sr_start);
+ else
+ error = swapadd(vp, sr.sr_start, sr.sr_length,
+ swapname);
+ }
+ VN_RELE(vp);
+out:
+ kmem_free(swapname, MAXPATHLEN);
+ return (error);
+}
+
+#endif /* _LP64 && _SYSCALL32 */
+
+/*
+ * Add a new swap file.
+ */
+int
+swapadd(struct vnode *vp, ulong_t lowblk, ulong_t nblks, char *swapname)
+{
+ struct swapinfo **sipp, *nsip = NULL, *esip = NULL;
+ struct vnode *cvp;
+ struct vattr vattr;
+ pgcnt_t pages;
+ u_offset_t soff, eoff;
+ int error;
+ ssize_t i, start, end;
+ ushort_t wasswap;
+ ulong_t startblk;
+ size_t returned_mem;
+
+ SWAP_PRINT(SW_CTL, "swapadd: vp %p lowblk %ld nblks %ld swapname %s\n",
+ vp, lowblk, nblks, swapname, 0);
+ /*
+ * Get the real vnode. (If vp is not a specnode it just returns vp, so
+ * it does the right thing, but having this code know about specnodes
+ * violates the spirit of having it be independent of vnode type.)
+ */
+ cvp = common_specvp(vp);
+
+ /*
+	 * OR in VISSWAP so the file system has a chance to deny swap-ons
+	 * during open.
+ */
+ mutex_enter(&cvp->v_lock);
+ wasswap = cvp->v_flag & VISSWAP;
+ cvp->v_flag |= VISSWAP;
+ mutex_exit(&cvp->v_lock);
+
+ mutex_enter(&swap_lock);
+ if (error = VOP_OPEN(&cvp, FREAD|FWRITE, CRED())) {
+ mutex_exit(&swap_lock);
+ /* restore state of v_flag */
+ if (!wasswap) {
+ mutex_enter(&cvp->v_lock);
+ cvp->v_flag &= ~VISSWAP;
+ mutex_exit(&cvp->v_lock);
+ }
+ return (error);
+ }
+ mutex_exit(&swap_lock);
+
+ /*
+ * Get partition size. Return error if empty partition,
+ * or if request does not fit within the partition.
+ * If this is the first swap device, we can reduce
+ * the size of the swap area to match what is
+ * available. This can happen if the system was built
+ * on a machine with a different size swap partition.
+ */
+ vattr.va_mask = AT_SIZE;
+ if (error = VOP_GETATTR(cvp, &vattr, ATTR_COMM, CRED()))
+ goto out;
+
+ /*
+ * Specfs returns a va_size of MAXOFFSET_T (UNKNOWN_SIZE) when the
+ * size of the device can't be determined.
+ */
+ if ((vattr.va_size == 0) || (vattr.va_size == MAXOFFSET_T)) {
+ error = EINVAL;
+ goto out;
+ }
+
+#ifdef _ILP32
+ /*
+	 * There is no support for large swap in the 32-bit OS; if the size
+	 * of the swap device is bigger than MAXOFF32_T then the size used by
+	 * swapfs must be limited.  This limitation is imposed by the swap
+	 * subsystem itself; a D_64BIT driver as the target of the swap
+	 * operation should be able to field the I/O.
+ */
+ if (vattr.va_size > MAXOFF32_T) {
+ cmn_err(CE_NOTE,
+ "!swap device %s truncated from 0x%llx to 0x%x bytes",
+ swapname, vattr.va_size, MAXOFF32_T);
+ vattr.va_size = MAXOFF32_T;
+ }
+#endif /* _ILP32 */
+
+ /* Fail if file not writeable (try to set size to current size) */
+ vattr.va_mask = AT_SIZE;
+ if (error = VOP_SETATTR(cvp, &vattr, 0, CRED(), NULL))
+ goto out;
+
+ /* Fail if fs does not support VOP_PAGEIO */
+ error = VOP_PAGEIO(cvp, (page_t *)NULL, (u_offset_t)0, 0, 0, CRED());
+
+ if (error == ENOSYS)
+ goto out;
+ else
+ error = 0;
+ /*
+	 * If swapping on the root filesystem, don't put swap blocks that
+ * correspond to the miniroot filesystem on the swap free list.
+ */
+ if (cvp == rootdir)
+ startblk = roundup(MINIROOTSIZE<<SCTRSHFT, klustsize)>>SCTRSHFT;
+ else /* Skip 1st page (disk label) */
+ startblk = (ulong_t)(lowblk ? lowblk : 1);
+
+ soff = startblk << SCTRSHFT;
+ if (soff >= vattr.va_size) {
+ error = EINVAL;
+ goto out;
+ }
+
+ /*
+ * If user specified 0 blks, use the size of the device
+ */
+ eoff = nblks ? soff + (nblks - (startblk - lowblk) << SCTRSHFT) :
+ vattr.va_size;
+
+ SWAP_PRINT(SW_CTL, "swapadd: va_size %ld soff %ld eoff %ld\n",
+ vattr.va_size, soff, eoff, 0, 0);
+
+ if (eoff > vattr.va_size) {
+ error = EINVAL;
+ goto out;
+ }
+
+ /*
+ * The starting and ending offsets must be page aligned.
+ * Round soff up to next page boundary, round eoff
+ * down to previous page boundary.
+ */
+ soff = ptob(btopr(soff));
+ eoff = ptob(btop(eoff));
+ if (soff >= eoff) {
+ SWAP_PRINT(SW_CTL, "swapadd: soff %ld >= eoff %ld\n",
+ soff, eoff, 0, 0, 0);
+ error = EINVAL;
+ goto out;
+ }
+
+ pages = btop(eoff - soff);
+
+ /* Allocate and partially set up the new swapinfo */
+ nsip = kmem_zalloc(sizeof (struct swapinfo), KM_SLEEP);
+ nsip->si_vp = cvp;
+
+ nsip->si_soff = soff;
+ nsip->si_eoff = eoff;
+ nsip->si_hint = 0;
+ nsip->si_checkcnt = nsip->si_alloccnt = 0;
+
+ nsip->si_pnamelen = (int)strlen(swapname) + 1;
+ nsip->si_pname = (char *)kmem_zalloc(nsip->si_pnamelen, KM_SLEEP);
+ bcopy(swapname, nsip->si_pname, nsip->si_pnamelen - 1);
+ SWAP_PRINT(SW_CTL, "swapadd: allocating swapinfo for %s, %ld pages\n",
+ swapname, pages, 0, 0, 0);
+ /*
+ * Size of swapslots map in bytes
+ */
+ nsip->si_mapsize = P2ROUNDUP(pages, NBBW) / NBBY;
+ nsip->si_swapslots = kmem_zalloc(nsip->si_mapsize, KM_SLEEP);
+
+ /*
+	 * Permanently set the bits that can't ever be allocated, i.e. those
+	 * from the ending offset up to the rounded-up end of the swapslots
+	 * bit map.
+ */
+ start = pages;
+ end = P2ROUNDUP(pages, NBBW);
+ for (i = start; i < end; i++) {
+ SWAP_PRINT(SW_CTL, "swapadd: set bit for page %ld\n", i,
+ 0, 0, 0, 0);
+ SETBIT(nsip->si_swapslots, i);
+ }
+ nsip->si_npgs = nsip->si_nfpgs = pages;
+ /*
+	 * Now check to see if we can add it.  We wait until now to check
+	 * because we need the swapinfo_lock and we don't want to sleep with
+	 * it held (e.g., during kmem_alloc()) while we're setting up the
+	 * swapinfo.
+ */
+ mutex_enter(&swapinfo_lock);
+ for (sipp = &swapinfo; (esip = *sipp) != NULL; sipp = &esip->si_next) {
+ if (esip->si_vp == cvp) {
+ if (esip->si_soff == soff && esip->si_npgs == pages &&
+ (esip->si_flags & ST_DOINGDEL)) {
+ /*
+ * We are adding a device that we are in the
+ * middle of deleting. Just clear the
+ * ST_DOINGDEL flag to signal this and
+ * the deletion routine will eventually notice
+ * it and add it back.
+ */
+ esip->si_flags &= ~ST_DOINGDEL;
+ mutex_exit(&swapinfo_lock);
+ goto out;
+ }
+ /* disallow overlapping swap files */
+ if ((soff < esip->si_eoff) && (eoff > esip->si_soff)) {
+ error = EEXIST;
+ mutex_exit(&swapinfo_lock);
+ goto out;
+ }
+ }
+ }
+
+ nswapfiles++;
+
+ /*
+ * add new swap device to list and shift allocations to it
+ * before updating the anoninfo counters
+ */
+ *sipp = nsip;
+ silast = nsip;
+
+ /*
+ * Update the total amount of reservable swap space
+ * accounting properly for swap space from physical memory
+ */
+ /* New swap device soaks up currently reserved memory swap */
+ mutex_enter(&anoninfo_lock);
+
+ ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
+ ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
+
+ k_anoninfo.ani_max += pages;
+ ANI_ADD(pages);
+ if (k_anoninfo.ani_mem_resv > k_anoninfo.ani_locked_swap) {
+ returned_mem = MIN(k_anoninfo.ani_mem_resv -
+ k_anoninfo.ani_locked_swap,
+ k_anoninfo.ani_max - k_anoninfo.ani_phys_resv);
+
+ ANI_ADD(-returned_mem);
+ k_anoninfo.ani_free -= returned_mem;
+ k_anoninfo.ani_mem_resv -= returned_mem;
+ k_anoninfo.ani_phys_resv += returned_mem;
+
+ mutex_enter(&freemem_lock);
+ availrmem += returned_mem;
+ mutex_exit(&freemem_lock);
+ }
+ /*
+ * At boot time, to permit booting small memory machines using
+ * only physical memory as swap space, we allowed a dangerously
+ * large amount of memory to be used as swap space; now that
+ * more physical backing store is available bump down the amount
+ * we can get from memory to a safer size.
+ */
+ if (swapfs_minfree < swapfs_desfree) {
+ mutex_enter(&freemem_lock);
+ if (availrmem > swapfs_desfree || !k_anoninfo.ani_mem_resv)
+ swapfs_minfree = swapfs_desfree;
+ mutex_exit(&freemem_lock);
+ }
+
+ SWAP_PRINT(SW_CTL, "swapadd: ani_max %ld ani_free %ld\n",
+	    k_anoninfo.ani_max, k_anoninfo.ani_free, 0, 0, 0);
+
+ mutex_exit(&anoninfo_lock);
+
+ mutex_exit(&swapinfo_lock);
+
+ /* Initialize the dump device */
+ mutex_enter(&dump_lock);
+ if (dumpvp == NULL)
+ (void) dumpinit(vp, swapname, 0);
+ mutex_exit(&dump_lock);
+
+ VN_HOLD(cvp);
+out:
+ if (error || esip) {
+ SWAP_PRINT(SW_CTL, "swapadd: error (%d)\n", error, 0, 0, 0, 0);
+
+ if (!wasswap) {
+ mutex_enter(&cvp->v_lock);
+ cvp->v_flag &= ~VISSWAP;
+ mutex_exit(&cvp->v_lock);
+ }
+ if (nsip) {
+ kmem_free(nsip->si_swapslots, (size_t)nsip->si_mapsize);
+ kmem_free(nsip->si_pname, nsip->si_pnamelen);
+ kmem_free(nsip, sizeof (*nsip));
+ }
+ mutex_enter(&swap_lock);
+ (void) VOP_CLOSE(cvp, FREAD|FWRITE, 1, (offset_t)0, CRED());
+ mutex_exit(&swap_lock);
+ }
+ return (error);
+}
+
+/*
+ * Delete a swap file.
+ */
+static int
+swapdel(
+ struct vnode *vp,
+ ulong_t lowblk) /* Low block number of area to delete. */
+{
+ struct swapinfo **sipp, *osip = NULL;
+ struct vnode *cvp;
+ u_offset_t soff;
+ int error = 0;
+ u_offset_t toff = 0;
+ struct vnode *tvp = NULL;
+ spgcnt_t pages;
+ struct anon **app, *ap;
+ kmutex_t *ahm;
+ pgcnt_t adjust_swap = 0;
+
+ /* Find the swap file entry for the file to be deleted */
+ cvp = common_specvp(vp);
+
+
+ lowblk = lowblk ? lowblk : 1; /* Skip first page (disk label) */
+ soff = ptob(btopr(lowblk << SCTRSHFT)); /* must be page aligned */
+
+ mutex_enter(&swapinfo_lock);
+ for (sipp = &swapinfo; (osip = *sipp) != NULL; sipp = &osip->si_next) {
+ if ((osip->si_vp == cvp) &&
+ (osip->si_soff == soff) && (osip->si_flags == 0))
+ break;
+ }
+
+ /* If the file was not found, error. */
+ if (osip == NULL) {
+ error = EINVAL;
+ mutex_exit(&swapinfo_lock);
+ goto out;
+ }
+
+ pages = osip->si_npgs;
+
+ /*
+ * Do not delete if we will be low on swap pages.
+ */
+ mutex_enter(&anoninfo_lock);
+
+ ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
+ ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
+
+ mutex_enter(&freemem_lock);
+ if (((k_anoninfo.ani_max - k_anoninfo.ani_phys_resv) +
+ MAX((spgcnt_t)(availrmem - swapfs_minfree), 0)) < pages) {
+ mutex_exit(&freemem_lock);
+ mutex_exit(&anoninfo_lock);
+ error = ENOMEM;
+ cmn_err(CE_WARN, "swapdel - too few free pages");
+ mutex_exit(&swapinfo_lock);
+ goto out;
+ }
+ mutex_exit(&freemem_lock);
+
+ k_anoninfo.ani_max -= pages;
+
+ /* If needed, reserve memory swap to replace old device */
+ if (k_anoninfo.ani_phys_resv > k_anoninfo.ani_max) {
+ adjust_swap = k_anoninfo.ani_phys_resv - k_anoninfo.ani_max;
+ k_anoninfo.ani_phys_resv -= adjust_swap;
+ k_anoninfo.ani_mem_resv += adjust_swap;
+ mutex_enter(&freemem_lock);
+ availrmem -= adjust_swap;
+ mutex_exit(&freemem_lock);
+ ANI_ADD(adjust_swap);
+ }
+ ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
+ ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
+ mutex_exit(&anoninfo_lock);
+
+ ANI_ADD(-pages);
+
+ /*
+ * Set the delete flag. This prevents anyone from allocating more
+ * pages from this file. Also set ST_DOINGDEL. Someone who wants to
+ * add the file back while we're deleting it will signify by clearing
+ * this flag.
+ */
+ osip->si_flags |= ST_INDEL|ST_DOINGDEL;
+ mutex_exit(&swapinfo_lock);
+
+ /*
+ * Free all the allocated physical slots for this file. We do this
+ * by walking through the entire anon hash array, because we need
+ * to update all the anon slots that have physical swap slots on
+ * this file, and this is the only way to find them all. We go back
+ * to the beginning of a bucket after each slot is freed because the
+ * anonhash_lock is not held during the free and thus the hash table
+ * may change under us.
+ */
+ for (app = anon_hash; app < &anon_hash[ANON_HASH_SIZE]; app++) {
+ ahm = &anonhash_lock[(app-anon_hash) & (AH_LOCK_SIZE - 1)];
+ mutex_enter(ahm);
+top:
+ for (ap = *app; ap != NULL; ap = ap->an_hash) {
+ if (ap->an_pvp == cvp &&
+ ap->an_poff >= osip->si_soff &&
+ ap->an_poff < osip->si_eoff) {
+ ASSERT(TESTBIT(osip->si_swapslots,
+ btop((size_t)(ap->an_poff -
+ osip->si_soff))));
+ tvp = ap->an_vp;
+ toff = ap->an_off;
+ VN_HOLD(tvp);
+ mutex_exit(ahm);
+
+ error = swapslot_free(tvp, toff, osip);
+
+ VN_RELE(tvp);
+ mutex_enter(ahm);
+ if (!error && (osip->si_flags & ST_DOINGDEL)) {
+ goto top;
+ } else {
+ if (error) {
+ cmn_err(CE_WARN,
+ "swapslot_free failed %d",
+ error);
+ }
+
+ /*
+ * Add device back before making it
+ * visible.
+ */
+ mutex_enter(&swapinfo_lock);
+ osip->si_flags &=
+ ~(ST_INDEL | ST_DOINGDEL);
+ mutex_exit(&swapinfo_lock);
+
+ /*
+ * Update the anon space available
+ */
+ mutex_enter(&anoninfo_lock);
+
+ k_anoninfo.ani_phys_resv += adjust_swap;
+ k_anoninfo.ani_mem_resv -= adjust_swap;
+ k_anoninfo.ani_max += pages;
+
+ mutex_enter(&freemem_lock);
+ availrmem += adjust_swap;
+ mutex_exit(&freemem_lock);
+
+ mutex_exit(&anoninfo_lock);
+
+ ANI_ADD(pages);
+
+ mutex_exit(ahm);
+ goto out;
+ }
+ }
+ }
+ mutex_exit(ahm);
+ }
+
+ /* All done, they'd better all be free! */
+ mutex_enter(&swapinfo_lock);
+ ASSERT(osip->si_nfpgs == osip->si_npgs);
+
+ /* Now remove it from the swapinfo list */
+ for (sipp = &swapinfo; *sipp != NULL; sipp = &(*sipp)->si_next) {
+ if (*sipp == osip)
+ break;
+ }
+ ASSERT(*sipp);
+ *sipp = osip->si_next;
+ if (silast == osip)
+ if ((silast = osip->si_next) == NULL)
+ silast = swapinfo;
+ nswapfiles--;
+ mutex_exit(&swapinfo_lock);
+
+ kmem_free(osip->si_swapslots, osip->si_mapsize);
+ kmem_free(osip->si_pname, osip->si_pnamelen);
+ kmem_free(osip, sizeof (*osip));
+
+ mutex_enter(&dump_lock);
+ if (cvp == dumpvp)
+ dumpfini();
+ mutex_exit(&dump_lock);
+
+ /* Release the vnode */
+
+ mutex_enter(&swap_lock);
+ (void) VOP_CLOSE(cvp, FREAD|FWRITE, 1, (offset_t)0, CRED());
+ mutex_enter(&cvp->v_lock);
+ cvp->v_flag &= ~VISSWAP;
+ mutex_exit(&cvp->v_lock);
+ VN_RELE(cvp);
+ mutex_exit(&swap_lock);
+out:
+ return (error);
+}
+
+/*
+ * Free up a physical swap slot on swapinfo sip, currently in use by the
+ * anonymous page whose name is (vp, off).
+ */
+static int
+swapslot_free(
+ struct vnode *vp,
+ u_offset_t off,
+ struct swapinfo *sip)
+{
+ struct page *pl[2], *pp;
+ struct anon *ap = NULL;
+ int error = 0;
+ kmutex_t *ahm;
+
+ /*
+ * Get the page for the old swap slot and i/o lock it.
+ * Users of the physical slot will synchronize on the i/o lock.
+ */
+ if (error = VOP_GETPAGE(vp, (offset_t)off, ptob(1), NULL,
+ pl, ptob(1), segkmap, NULL, S_READ, CRED())) {
+ /*
+ * Anon slot went away (EIDRM) or vp was truncated (EFAULT)
+		 * while we were getting the page.  Thus the physical slot
+		 * must be free, so we have succeeded.
+ */
+ if (error == EIDRM || error == EFAULT)
+ error = 0;
+ return (error);
+ }
+ pp = pl[0];
+ page_io_lock(pp);
+
+ ahm = &anonhash_lock[AH_LOCK(vp, off)];
+ mutex_enter(ahm);
+ /*
+ * Get the anon slot; anon struct cannot vanish while we hold
+ * SE_SHARED lock on the physical page since anon_decref() blocks
+ * in page_lookup() before it can proceed further to remove
+ * anon struct from anon_hash table.
+ */
+ if ((ap = swap_anon(vp, off)) == NULL) {
+ panic("swapslot_free(%p, %llx, %p), page: %p, null anon",
+ vp, off, sip, pp);
+ }
+ /*
+ * Free the physical slot. It may have been freed up and replaced with
+ * another one while we were getting the page so we have to re-verify
+	 * that this is really the one we want.  If we do free the slot we
+	 * have to mark the page modified, as its backing store is now gone.
+ */
+ if (ap->an_pvp == sip->si_vp && ap->an_poff >= sip->si_soff &&
+ ap->an_poff < sip->si_eoff) {
+ swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE);
+ ap->an_pvp = NULL;
+ ap->an_poff = NULL;
+ mutex_exit(ahm);
+ hat_setmod(pp);
+ } else {
+ mutex_exit(ahm);
+ }
+out:
+ /* Release the page locks */
+ page_unlock(pp);
+ page_io_unlock(pp);
+ return (error);
+}
+
+/*
+ * Get contig physical backing store for vp, in the range
+ * [*offp, *offp + *lenp).  May back a subrange of this, but must
+ * always include the requested offset or fail. Returns the offsets
+ * backed as [*offp, *offp + *lenp) and the physical offsets used to
+ * back them from *pvpp in the range [*pstartp, *pstartp + *lenp).
+ * Returns 0 for success
+ *		SE_NOANON -- no anon slot for the requested page
+ * SE_NOSWAP -- no physical swap space available
+ */
+int
+swap_newphysname(
+ struct vnode *vp,
+ u_offset_t offset,
+ u_offset_t *offp,
+ size_t *lenp,
+ struct vnode **pvpp,
+ u_offset_t *poffp)
+{
+ struct anon *ap = NULL; /* anon slot for vp, off */
+ int error = 0;
+ struct vnode *pvp;
+ u_offset_t poff, pstart, prem;
+ size_t plen;
+ u_offset_t off, start;
+ kmutex_t *ahm;
+
+ ASSERT(*offp <= offset && offset < *offp + *lenp);
+
+ /* Get new physical swap slots. */
+ plen = *lenp;
+ if (!swap_phys_alloc(&pvp, &pstart, &plen, 0)) {
+ /*
+ * No swap available so return error unless requested
+ * offset is already backed in which case return that.
+ */
+ ahm = &anonhash_lock[AH_LOCK(vp, offset)];
+ mutex_enter(ahm);
+ if ((ap = swap_anon(vp, offset)) == NULL) {
+ error = SE_NOANON;
+ mutex_exit(ahm);
+ return (error);
+ }
+ error = (ap->an_pvp ? 0 : SE_NOSWAP);
+ *offp = offset;
+ *lenp = PAGESIZE;
+ *pvpp = ap->an_pvp;
+ *poffp = ap->an_poff;
+ mutex_exit(ahm);
+ return (error);
+ }
+
+ /*
+ * We got plen (<= *lenp) contig slots. Use these to back a
+ * subrange of [*offp, *offp + *lenp) which includes offset.
+ * For now we just put offset at the end of the kluster.
+ * Clearly there are other possible choices - which is best?
+ */
+ start = MAX(*offp,
+ (offset + PAGESIZE > plen) ? (offset + PAGESIZE - plen) : 0);
+ ASSERT(start + plen <= *offp + *lenp);
+
+ for (off = start, poff = pstart; poff < pstart + plen;
+ off += PAGESIZE, poff += PAGESIZE) {
+ ahm = &anonhash_lock[AH_LOCK(vp, off)];
+ mutex_enter(ahm);
+ if ((ap = swap_anon(vp, off)) != NULL) {
+ /* Free old slot if any, and assign new one */
+ if (ap->an_pvp)
+ swap_phys_free(ap->an_pvp, ap->an_poff,
+ PAGESIZE);
+ ap->an_pvp = pvp;
+ ap->an_poff = poff;
+ } else { /* No anon slot for a klustered page, quit. */
+ prem = (pstart + plen) - poff;
+ /* Already did requested page, do partial kluster */
+ if (off > offset) {
+ plen = poff - pstart;
+ error = 0;
+ /* Fail on requested page, error */
+ } else if (off == offset) {
+ error = SE_NOANON;
+ /* Fail on prior page, fail on requested page, error */
+ } else if ((ap = swap_anon(vp, offset)) == NULL) {
+ error = SE_NOANON;
+ /* Fail on prior page, got requested page, do only it */
+ } else {
+ /* Free old slot if any, and assign new one */
+ if (ap->an_pvp)
+ swap_phys_free(ap->an_pvp, ap->an_poff,
+ PAGESIZE);
+ ap->an_pvp = pvp;
+ ap->an_poff = poff;
+ /* One page kluster */
+ start = offset;
+ plen = PAGESIZE;
+ pstart = poff;
+ poff += PAGESIZE;
+ prem -= PAGESIZE;
+ }
+ /* Free unassigned slots */
+ swap_phys_free(pvp, poff, prem);
+ mutex_exit(ahm);
+ break;
+ }
+ mutex_exit(ahm);
+ }
+ ASSERT(*offp <= start && start + plen <= *offp + *lenp);
+ ASSERT(start <= offset && offset < start + plen);
+ *offp = start;
+ *lenp = plen;
+ *pvpp = pvp;
+ *poffp = pstart;
+ return (error);
+}
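+
+/*
+ * A worked example of the klustering above, assuming 4K pages: with
+ * *offp == 0, *lenp == 8 pages and offset == 0x6000, if only four
+ * contiguous slots are available (plen == 0x4000), start becomes
+ * MAX(0, 0x7000 - 0x4000) == 0x3000 and the backed range returned is
+ * [0x3000, 0x7000), with the requested page at the end of the kluster,
+ * provided every page in that range has an anon slot (otherwise the
+ * loop above shrinks or fails the kluster as described).
+ */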
+
+
+/*
+ * Get the physical swap backing store location for a given anonymous page
+ * named (vp, off). The backing store name is returned in (*pvpp, *poffp).
+ * Returns 0 success
+ * EIDRM -- no anon slot (page is not allocated)
+ */
+int
+swap_getphysname(
+ struct vnode *vp,
+ u_offset_t off,
+ struct vnode **pvpp,
+ u_offset_t *poffp)
+{
+ struct anon *ap;
+ int error = 0;
+ kmutex_t *ahm;
+
+ ahm = &anonhash_lock[AH_LOCK(vp, off)];
+ mutex_enter(ahm);
+
+ /* Get anon slot for vp, off */
+ ap = swap_anon(vp, off);
+ if (ap == NULL) {
+ error = EIDRM;
+ goto out;
+ }
+ *pvpp = ap->an_pvp;
+ *poffp = ap->an_poff;
+out:
+ mutex_exit(ahm);
+ return (error);
+}
diff --git a/usr/src/uts/common/vm/vpage.h b/usr/src/uts/common/vm/vpage.h
new file mode 100644
index 0000000000..68dfb1adb0
--- /dev/null
+++ b/usr/src/uts/common/vm/vpage.h
@@ -0,0 +1,86 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 1998 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+#ifndef _VM_VPAGE_H
+#define _VM_VPAGE_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * VM - Information per virtual page.
+ */
+struct vpage {
+ uchar_t nvp_prot; /* see <sys/mman.h> prot flags */
+ uchar_t nvp_advice; /* pplock & <sys/mman.h> madvise flags */
+};
+
+/*
+ * This was changed from a bitfield to flags/macros in order
+ * to conserve space (uchar_t bitfields are not ANSI). This could
+ * have been condensed to a uchar_t, but at the expense of complexity.
+ * We've stolen a bit from the top of nvp_advice to store pplock in.
+ *
+ * WARNING: VPP_SETADVICE(vpp, x) evaluates vpp twice, and VPP_PLOCK(vpp)
+ * returns a positive integer when the lock is held, not necessarily (1).
+ */
+#define VP_ADVICE_MASK (0x07)
+#define VP_PPLOCK_MASK (0x80) /* physical page locked by me */
+#define VP_PPLOCK_SHIFT (0x07) /* offset of lock hiding inside nvp_advice */
+
+#define VPP_PROT(vpp) ((vpp)->nvp_prot)
+#define VPP_ADVICE(vpp) ((vpp)->nvp_advice & VP_ADVICE_MASK)
+#define VPP_ISPPLOCK(vpp) \
+ ((uchar_t)((vpp)->nvp_advice & VP_PPLOCK_MASK))
+
+#define VPP_SETPROT(vpp, x) ((vpp)->nvp_prot = (x))
+#define VPP_SETADVICE(vpp, x) \
+ ((vpp)->nvp_advice = ((vpp)->nvp_advice & ~VP_ADVICE_MASK) | \
+ ((x) & VP_ADVICE_MASK))
+#define VPP_SETPPLOCK(vpp) ((vpp)->nvp_advice |= VP_PPLOCK_MASK)
+#define VPP_CLRPPLOCK(vpp) ((vpp)->nvp_advice &= ~VP_PPLOCK_MASK)
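+
+/*
+ * For example, since VPP_ISPPLOCK() yields the raw masked bit rather
+ * than 0 or 1, test it for truth, never for equality with 1:
+ *
+ *	if (VPP_ISPPLOCK(vpp))		correct
+ *	if (VPP_ISPPLOCK(vpp) == 1)	always false
+ *
+ * and because VPP_SETADVICE(vpp, x) evaluates vpp twice, vpp must not
+ * have side effects (e.g. don't pass vpp++).
+ */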
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _VM_VPAGE_H */
diff --git a/usr/src/uts/common/vm/xhat.c b/usr/src/uts/common/vm/xhat.c
new file mode 100644
index 0000000000..255ca1eb67
--- /dev/null
+++ b/usr/src/uts/common/vm/xhat.c
@@ -0,0 +1,555 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+
+#include <sys/types.h>
+#include <sys/cmn_err.h>
+#include <sys/mman.h>
+#include <sys/systm.h>
+#include <vm/xhat.h>
+#include <vm/page.h>
+#include <vm/as.h>
+
+int xhat_debug = 0;
+
+krwlock_t xhat_provider_rwlock;
+xhat_provider_t *xhat_provider = NULL;
+
+void
+xhat_init()
+{
+ rw_init(&xhat_provider_rwlock, NULL, RW_DEFAULT, NULL);
+}
+
+
+
+int
+xhat_provider_register(xhat_provider_t *provider)
+{
+	/* "_cache" plus its terminating NUL is 7 bytes */
+ char cache_name[XHAT_CACHE_NAMELEN + 7];
+
+
+ if (provider->xhat_provider_version != XHAT_PROVIDER_VERSION) {
+ cmn_err(CE_WARN, "XHAT provider version mismatch");
+ return (-1);
+ }
+
+ if ((XHAT_POPS(provider)->xhat_alloc == NULL) ||
+ (XHAT_POPS(provider)->xhat_free == NULL)) {
+ cmn_err(CE_WARN, "Malformed XHAT provider");
+ return (-1);
+ }
+
+ /* Allocate kmem_cache which will manage xhat blocks */
+ provider->xblkcache->free_blks = NULL;
+ (void) strncpy(cache_name, provider->xhat_provider_name,
+ XHAT_CACHE_NAMELEN);
+ (void) strcat(cache_name, "_cache");
+ provider->xblkcache->cache = kmem_cache_create(cache_name,
+ provider->xhat_provider_blk_size, 0, NULL, NULL,
+ provider->xblkcache->reclaim,
+ (void *)provider, NULL, 0);
+ if (provider->xblkcache->cache == NULL) {
+ cmn_err(CE_WARN, "Failed to allocate cache for %s",
+ provider->xhat_provider_name);
+ return (-1);
+ }
+
+ mutex_init(&provider->xblkcache->lock, NULL, MUTEX_DEFAULT, NULL);
+
+
+ /* Insert provider in the global list */
+ rw_enter(&xhat_provider_rwlock, RW_WRITER);
+ provider->next = xhat_provider;
+ provider->prev = NULL;
+ if (xhat_provider)
+ xhat_provider->prev = provider;
+ xhat_provider = provider;
+ xhat_provider->xhat_provider_refcnt = 0;
+ rw_exit(&xhat_provider_rwlock);
+ return (0);
+}
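+
+/*
+ * A registration sketch for a hypothetical provider "xfoo", showing
+ * only the fields checked or used above:
+ *
+ *	xfoo_provider.xhat_provider_version = XHAT_PROVIDER_VERSION;
+ *	xfoo_provider.xhat_provider_name = "xfoo";
+ *	xfoo_provider.xhat_provider_blk_size = sizeof (xfoo_blk_t);
+ *	xfoo_provider.xblkcache->reclaim = xfoo_reclaim;
+ *	(the ops vector must supply non-NULL xhat_alloc and xhat_free)
+ *
+ *	if (xhat_provider_register(&xfoo_provider) != 0)
+ *		... version mismatch, missing ops, or cache creation failed ...
+ */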
+
+
+
+int
+xhat_provider_unregister(xhat_provider_t *provider)
+{
+ if (provider->xhat_provider_version != XHAT_PROVIDER_VERSION)
+ return (-1);
+
+ rw_enter(&xhat_provider_rwlock, RW_WRITER);
+
+ if (provider->xhat_provider_refcnt) {
+ rw_exit(&xhat_provider_rwlock);
+ return (-1);
+ }
+
+ if (provider->next)
+ provider->next->prev = provider->prev;
+ if (provider->prev)
+ provider->prev->next = provider->next;
+ else
+ xhat_provider = provider->next;
+ provider->prev = NULL;
+ provider->next = NULL;
+ rw_exit(&xhat_provider_rwlock);
+
+ /* Free all xblks that are sitting on free_blks list */
+ provider->xblkcache->reclaim(provider);
+
+ kmem_cache_destroy(provider->xblkcache->cache);
+
+ return (0);
+}
+
+
+
+/* Attaches an XHAT to the address space */
+int
+xhat_attach_xhat(xhat_provider_t *provider, struct as *as,
+ struct xhat **xhatp, void *arg)
+{
+ struct xhat *xh;
+
+
+
+ xh = XHAT_POPS(provider)->xhat_alloc(arg);
+ if (xh == NULL) {
+ *xhatp = NULL;
+ return (XH_PRVDR);
+ }
+
+ mutex_init(&xh->xhat_lock, NULL, MUTEX_DEFAULT, NULL);
+ xh->xhat_provider = provider;
+
+ rw_enter(&xhat_provider_rwlock, RW_WRITER);
+ provider->xhat_provider_refcnt++;
+ rw_exit(&xhat_provider_rwlock);
+
+ mutex_enter(&as->a_contents);
+
+ /* Is address space busy (being freed, dup'd or swapped)? */
+ if (AS_ISBUSY(as)) {
+ mutex_exit(&as->a_contents);
+ XHAT_POPS(provider)->xhat_free(xh);
+
+ rw_enter(&xhat_provider_rwlock, RW_WRITER);
+ provider->xhat_provider_refcnt--;
+ rw_exit(&xhat_provider_rwlock);
+
+ *xhatp = NULL;
+ return (XH_ASBUSY);
+ }
+
+ xh->xhat_as = as;
+ xh->xhat_refcnt = 0;
+ xh->holder = NULL;
+ xh->arg = arg;
+ xh->next = (struct xhat *)as->a_xhat;
+ if (xh->next)
+ xh->next->prev = xh;
+ as->a_xhat = xh;
+ mutex_exit(&as->a_contents);
+ *xhatp = xh;
+ return (0);
+}
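+
+/*
+ * Sketch of how a caller handles the attach return values (the
+ * xfoo_provider name is hypothetical):
+ *
+ *	struct xhat *xh;
+ *
+ *	switch (xhat_attach_xhat(&xfoo_provider, as, &xh, arg)) {
+ *	case 0:		attached; xh holds a reference on the provider
+ *	case XH_PRVDR:	the provider's xhat_alloc() failed; xh is NULL
+ *	case XH_ASBUSY:	as is being freed, dup'd or swapped; xh is NULL
+ *	}
+ */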
+
+
+int
+xhat_detach_xhat(xhat_provider_t *provider, struct as *as)
+{
+ struct xhat *xh;
+
+
+ mutex_enter(&as->a_contents);
+
+ for (xh = (struct xhat *)as->a_xhat; xh != NULL; xh = xh->next)
+ if (xh->xhat_provider == provider) {
+
+
+ if (xh->holder != NULL) {
+ /*
+ * The address space is being freed,
+ * dup'd or swapped out.
+				 * If we are the thread which is doing one
+ * of those operations, we can go ahead
+ * and free up the XHAT.
+ * Otherwise, return.
+ */
+ if (xh->holder != curthread) {
+ mutex_exit(&as->a_contents);
+ return (XH_ASBUSY);
+ } else
+ xhat_hat_rele(xh);
+ }
+
+ if (xh->xhat_refcnt > 0) {
+ /*
+ * There are still "users" of the XHAT.
+ * This may be either because the caller
+ * forgot to free something up (which is a bug)
+ * or because xhat_op_all() is in progress.
+ * Since we are not allowing any of
+ * xhat_op_all's ops to call xhat_detach_xhat(),
+				 * this can only be some other thread.  It
+ * may want to wait a bit and retry.
+ */
+
+
+ /* Restore the hold on the XHAT */
+ if (xh->holder == curthread)
+ xhat_hat_hold(xh);
+
+ mutex_exit(&as->a_contents);
+ return (XH_XHHELD);
+ }
+
+ rw_enter(&xhat_provider_rwlock, RW_WRITER);
+ provider->xhat_provider_refcnt--;
+ rw_exit(&xhat_provider_rwlock);
+
+ if (xh->next)
+ xh->next->prev = xh->prev;
+ if (xh->prev)
+ xh->prev->next = xh->next;
+ else
+ as->a_xhat = (void *) xh->next;
+ mutex_exit(&as->a_contents);
+
+ XHAT_POPS(provider)->xhat_free(xh);
+
+ return (0);
+ }
+ mutex_exit(&as->a_contents);
+ return (XH_NOTATTCHD);
+}
+
+void
+xhat_hat_hold(struct xhat *xhat)
+{
+ mutex_enter(&xhat->xhat_lock);
+ xhat->xhat_refcnt++;
+ mutex_exit(&xhat->xhat_lock);
+}
+
+void
+xhat_hat_rele(struct xhat *xhat)
+{
+ mutex_enter(&xhat->xhat_lock);
+ xhat->xhat_refcnt--;
+ ASSERT(xhat->xhat_refcnt >= 0);
+ mutex_exit(&xhat->xhat_lock);
+}
+
+
+int
+xhat_hat_holders(struct xhat *xhat)
+{
+ return (xhat->xhat_refcnt);
+}
+
+
+/*
+ * Assumes that address space is already locked
+ * and that AS_FREE is set for as->a_flags.
+ */
+void
+xhat_free_start_all(struct as *as)
+{
+ struct xhat *xh, *xh_nxt;
+
+
+ ASSERT(AS_ISBUSY(as));
+
+ mutex_enter(&as->a_contents);
+ xh = (struct xhat *)as->a_xhat;
+
+ /*
+ * Simply calling xhat_hat_hold() won't work because we will
+ * not be able to succeed in xhat_detach_xhat(), which may
+ * get called from here. We need to know _who_ the holder is.
+ */
+ if (xh != NULL) {
+ xhat_hat_hold(xh);
+ ASSERT(xh->holder == NULL);
+ xh->holder = curthread;
+ }
+
+ while (xh != NULL) {
+
+ xh_nxt = xh->next;
+ if (xh_nxt != NULL) {
+ ASSERT(xh_nxt->holder == NULL);
+ xhat_hat_hold(xh_nxt);
+ xh_nxt->holder = curthread;
+ }
+
+ mutex_exit(&as->a_contents);
+
+ XHAT_FREE_START(xh);
+
+ mutex_enter(&as->a_contents);
+
+ xh = xh_nxt;
+ }
+
+ mutex_exit(&as->a_contents);
+}
+
+
+
+/*
+ * Assumes that address space is already locked.
+ * Since xhat_free_start_all() must have been called
+ * earlier, the holder of every XHAT is set to curthread.
+ * Also, since AS_BUSY is set for as->a_flags, no new
+ * XHATs could have been added.
+ */
+void
+xhat_free_end_all(struct as *as)
+{
+
+ struct xhat *xh, *xh_nxt;
+
+ ASSERT(AS_ISBUSY(as));
+
+ mutex_enter(&as->a_contents);
+ xh = (struct xhat *)as->a_xhat;
+
+
+ while (xh != NULL) {
+
+ ASSERT(xh->holder == curthread);
+
+ xh_nxt = xh->next;
+
+ mutex_exit(&as->a_contents);
+
+ XHAT_FREE_END(xh);
+
+ mutex_enter(&as->a_contents);
+
+ xh = xh_nxt;
+ }
+
+ mutex_exit(&as->a_contents);
+}
+
+
+/* Assumes that address space is already locked */
+
+/* ARGSUSED */
+int
+xhat_dup_all(struct as *as, struct as *newas, caddr_t addr, size_t len,
+ uint_t flag)
+{
+ /* This is not supported. Should we return some sort of error? */
+
+ ASSERT(AS_ISBUSY(as));
+
+ return (0);
+}
+
+
+/* Assumes that address space is already locked */
+void
+xhat_swapout_all(struct as *as)
+{
+ struct xhat *xh, *xh_nxt;
+
+
+ ASSERT(AS_ISBUSY(as));
+
+ mutex_enter(&as->a_contents);
+ xh = (struct xhat *)as->a_xhat;
+
+ if (xh != NULL) {
+ xhat_hat_hold(xh);
+ ASSERT(xh->holder == NULL);
+ xh->holder = curthread;
+ }
+
+
+ while (xh != NULL) {
+
+ xh_nxt = xh->next;
+ if (xh_nxt != NULL) {
+ ASSERT(xh_nxt->holder == NULL);
+ xhat_hat_hold(xh_nxt);
+ xh_nxt->holder = curthread;
+ }
+
+ mutex_exit(&as->a_contents);
+
+ XHAT_SWAPOUT(xh);
+
+ mutex_enter(&as->a_contents);
+
+ /*
+ * If the xh is still there (i.e. swapout did not
+ * destroy it), clear the holder field.
+ * xh_nxt->prev couldn't have been changed in xhat_attach_xhat()
+ * because AS_BUSY is set. xhat_detach_xhat() also couldn't
+ * have modified it because (holder != NULL).
+ * If there is only one XHAT, just see if a_xhat still
+ * points to us.
+ */
+ if (((xh_nxt != NULL) && (xh_nxt->prev == xh)) ||
+ ((as->a_xhat != NULL) && (as->a_xhat == xh))) {
+ xhat_hat_rele(xh);
+ xh->holder = NULL;
+ }
+
+ xh = xh_nxt;
+ }
+
+ mutex_exit(&as->a_contents);
+}
+
+
+
+
+/*
+ * In the following routines, the appropriate xhat_op
+ * should never attempt to call xhat_detach_xhat(): it will
+ * never succeed since the XHAT is held.
+ */
+
+
+#define XHAT_UNLOAD_CALLBACK_OP (0)
+#define XHAT_SETATTR_OP (1)
+#define XHAT_CLRATTR_OP (2)
+#define XHAT_CHGATTR_OP (3)
+#define XHAT_CHGPROT_OP (4)
+#define XHAT_UNSHARE_OP (5)
+
+
+static void
+xhat_op_all(int op, struct as *as, caddr_t addr,
+ size_t len, uint_t flags, void *ptr)
+{
+ struct xhat *xh, *xh_nxt;
+
+ mutex_enter(&as->a_contents);
+ xh = (struct xhat *)as->a_xhat;
+
+ while (xh != NULL) {
+
+ xhat_hat_hold(xh);
+
+ xh_nxt = xh->next;
+ if (xh_nxt != NULL)
+ xhat_hat_hold(xh_nxt);
+
+ mutex_exit(&as->a_contents);
+
+ switch (op) {
+ case XHAT_UNLOAD_CALLBACK_OP:
+ XHAT_UNLOAD_CALLBACK(xh, addr,
+ len, flags, (hat_callback_t *)ptr);
+ break;
+ case XHAT_SETATTR_OP:
+ XHAT_SETATTR(xh, addr, len, flags);
+ break;
+ case XHAT_CLRATTR_OP:
+ XHAT_CLRATTR(xh, addr, len, flags);
+ break;
+ case XHAT_CHGATTR_OP:
+ XHAT_CHGATTR(xh, addr, len, flags);
+ break;
+ case XHAT_CHGPROT_OP:
+ XHAT_CHGPROT(xh, addr, len, flags);
+ break;
+ case XHAT_UNSHARE_OP:
+ XHAT_UNSHARE(xh, addr, len);
+ break;
+ default:
+ panic("Unknown op %d in xhat_op_all", op);
+ }
+
+ mutex_enter(&as->a_contents);
+
+ /*
+ * Both pointers are still valid because both
+ * XHATs are held.
+ */
+ xhat_hat_rele(xh);
+ if (xh_nxt != NULL)
+ xhat_hat_rele(xh_nxt);
+ xh = xh_nxt;
+ }
+
+ mutex_exit(&as->a_contents);
+}
+
+
+
+void
+xhat_unload_callback_all(struct as *as, caddr_t addr, size_t len, uint_t flags,
+ hat_callback_t *callback)
+{
+ xhat_op_all(XHAT_UNLOAD_CALLBACK_OP, as, addr, len, flags, callback);
+}
+
+
+void
+xhat_setattr_all(struct as *as, caddr_t addr, size_t len, uint_t attr)
+{
+ xhat_op_all(XHAT_SETATTR_OP, as, addr, len, attr, NULL);
+}
+
+
+
+void
+xhat_clrattr_all(struct as *as, caddr_t addr, size_t len, uint_t attr)
+{
+ xhat_op_all(XHAT_CLRATTR_OP, as, addr, len, attr, NULL);
+}
+
+
+void
+xhat_chgattr_all(struct as *as, caddr_t addr, size_t len, uint_t attr)
+{
+ xhat_op_all(XHAT_CHGATTR_OP, as, addr, len, attr, NULL);
+}
+
+
+void
+xhat_chgprot_all(struct as *as, caddr_t addr, size_t len, uint_t prot)
+{
+ xhat_op_all(XHAT_CHGPROT_OP, as, addr, len, prot, NULL);
+}
+
+
+void
+xhat_unshare_all(struct as *as, caddr_t addr, size_t len)
+{
+ xhat_op_all(XHAT_UNSHARE_OP, as, addr, len, 0, NULL);
+}
diff --git a/usr/src/uts/common/vm/xhat.h b/usr/src/uts/common/vm/xhat.h
new file mode 100644
index 0000000000..808262f2c9
--- /dev/null
+++ b/usr/src/uts/common/vm/xhat.h
@@ -0,0 +1,208 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _VM_XHAT_H
+#define _VM_XHAT_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef _ASM
+
+#include <sys/types.h>
+#include <vm/page.h>
+#include <sys/kmem.h>
+
+struct xhat;
+struct xhat_hme_blk;
+
+struct xhat_ops {
+ struct xhat *(*xhat_alloc)(void *);
+ void (*xhat_free)(struct xhat *);
+ void (*xhat_free_start)(struct xhat *);
+ void (*xhat_free_end)(struct xhat *);
+ int (*xhat_dup)(struct xhat *, struct xhat *, caddr_t,
+ size_t, uint_t);
+ void (*xhat_swapin)(struct xhat *);
+ void (*xhat_swapout)(struct xhat *);
+ void (*xhat_memload)(struct xhat *, caddr_t, struct page *,
+ uint_t, uint_t);
+ void (*xhat_memload_array)(struct xhat *, caddr_t, size_t,
+ struct page **, uint_t, uint_t);
+ void (*xhat_devload)(struct xhat *, caddr_t, size_t, pfn_t,
+ uint_t, int);
+ void (*xhat_unload)(struct xhat *, caddr_t, size_t, uint_t);
+ void (*xhat_unload_callback)(struct xhat *, caddr_t, size_t,
+ uint_t, hat_callback_t *);
+ void (*xhat_setattr)(struct xhat *, caddr_t, size_t, uint_t);
+ void (*xhat_clrattr)(struct xhat *, caddr_t, size_t, uint_t);
+ void (*xhat_chgattr)(struct xhat *, caddr_t, size_t, uint_t);
+ void (*xhat_unshare)(struct xhat *, caddr_t, size_t);
+ void (*xhat_chgprot)(struct xhat *, caddr_t, size_t, uint_t);
+ int (*xhat_pageunload)(struct xhat *, struct page *, uint_t,
+ void *);
+};
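+
+/*
+ * Illustrative sketch (guarded out; XHAT_EXAMPLE_CODE and the my_xhat
+ * names are hypothetical): xhat_alloc and xhat_free are invoked
+ * unconditionally by xhat_attach_xhat()/xhat_detach_xhat(), while the
+ * dispatch macros below skip ops that are left NULL.
+ */
+#ifdef XHAT_EXAMPLE_CODE
+static struct xhat *my_xhat_alloc(void *);
+static void my_xhat_free(struct xhat *);
+
+static struct xhat_ops my_xhat_ops = {
+	my_xhat_alloc,		/* xhat_alloc (required) */
+	my_xhat_free,		/* xhat_free (required) */
+	NULL,			/* xhat_free_start */
+	NULL,			/* xhat_free_end */
+	NULL,			/* xhat_dup */
+	NULL,			/* xhat_swapin */
+	NULL,			/* xhat_swapout */
+	NULL,			/* xhat_memload */
+	NULL,			/* xhat_memload_array */
+	NULL,			/* xhat_devload */
+	NULL,			/* xhat_unload */
+	NULL,			/* xhat_unload_callback */
+	NULL,			/* xhat_setattr */
+	NULL,			/* xhat_clrattr */
+	NULL,			/* xhat_chgattr */
+	NULL,			/* xhat_unshare */
+	NULL,			/* xhat_chgprot */
+	NULL			/* xhat_pageunload */
+};
+#endif	/* XHAT_EXAMPLE_CODE */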
+
+
+#define XHAT_POPS(_p) (_p)->xhat_provider_ops
+#define XHAT_PROPS(_h) XHAT_POPS(((struct xhat *)(_h))->xhat_provider)
+#define XHAT_HOPS(hat, func, args) \
+ { \
+ if (XHAT_PROPS(hat)-> /* */ func) \
+ XHAT_PROPS(hat)-> /* */ func /* */ args; \
+ }
+
+#define XHAT_FREE_START(a) \
+ XHAT_HOPS(a, xhat_free_start, ((struct xhat *)(a)))
+#define XHAT_FREE_END(a) \
+ XHAT_HOPS(a, xhat_free_end, ((struct xhat *)(a)))
+#define XHAT_DUP(a, b, c, d, e) \
+ ((XHAT_PROPS(a)->xhat_dup == NULL) ? (0) : \
+ XHAT_PROPS(a)->xhat_dup((struct xhat *)(a), \
+ (struct xhat *)(b), c, d, e))
+#define XHAT_SWAPIN(a) \
+ XHAT_HOPS(a, xhat_swapin, ((struct xhat *)(a)))
+#define XHAT_SWAPOUT(a) \
+ XHAT_HOPS(a, xhat_swapout, ((struct xhat *)(a)))
+#define XHAT_MEMLOAD(a, b, c, d, e) \
+ XHAT_HOPS(a, xhat_memload, ((struct xhat *)(a), b, c, d, e))
+#define XHAT_MEMLOAD_ARRAY(a, b, c, d, e, f) \
+ XHAT_HOPS(a, xhat_memload_array, ((struct xhat *)(a), b, c, d, e, f))
+#define XHAT_DEVLOAD(a, b, c, d, e, f) \
+ XHAT_HOPS(a, xhat_devload, ((struct xhat *)(a), b, c, d, e, f))
+#define XHAT_UNLOAD(a, b, c, d) \
+ XHAT_HOPS(a, xhat_unload, ((struct xhat *)(a), b, c, d))
+#define XHAT_UNLOAD_CALLBACK(a, b, c, d, e) \
+ XHAT_HOPS(a, xhat_unload_callback, ((struct xhat *)(a), b, c, d, e))
+#define XHAT_SETATTR(a, b, c, d) \
+ XHAT_HOPS(a, xhat_setattr, ((struct xhat *)(a), b, c, d))
+#define XHAT_CLRATTR(a, b, c, d) \
+ XHAT_HOPS(a, xhat_clrattr, ((struct xhat *)(a), b, c, d))
+#define XHAT_CHGATTR(a, b, c, d) \
+ XHAT_HOPS(a, xhat_chgattr, ((struct xhat *)(a), b, c, d))
+#define XHAT_UNSHARE(a, b, c) \
+ XHAT_HOPS(a, xhat_unshare, ((struct xhat *)(a), b, c))
+#define XHAT_CHGPROT(a, b, c, d) \
+ XHAT_HOPS(a, xhat_chgprot, ((struct xhat *)(a), b, c, d))
+#define XHAT_PAGEUNLOAD(a, b, c, d) \
+ ((XHAT_PROPS(a)->xhat_pageunload == NULL) ? (0) : \
+ XHAT_PROPS(a)->xhat_pageunload((struct xhat *)(a), b, c, d))
+
+
+
+#define XHAT_PROVIDER_VERSION 1
+
+/*
+ * The provider name will have "_cache" appended to it
+ * when the kmem cache is initialized.
+ * The resulting string must be shorter than
+ * KMEM_CACHE_NAMELEN characters.
+ */
+#define XHAT_CACHE_NAMELEN 24
+
+typedef struct xblk_cache {
+ kmutex_t lock;
+ kmem_cache_t *cache;
+ void *free_blks;
+ void (*reclaim)(void *);
+} xblk_cache_t;
+
+typedef struct xhat_provider {
+ int xhat_provider_version;
+ int xhat_provider_refcnt;
+ struct xhat_provider *next;
+ struct xhat_provider *prev;
+ char xhat_provider_name[XHAT_CACHE_NAMELEN];
+ xblk_cache_t *xblkcache;
+ struct xhat_ops *xhat_provider_ops;
+ int xhat_provider_blk_size;
+} xhat_provider_t;
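+
+/*
+ * Illustrative sketch (guarded out, continuing the hypothetical
+ * my_xhat example above): with xhat_provider_name "my_xhat" the
+ * resulting kmem cache is named "my_xhat_cache".  The provider
+ * would presumably be handed to xhat_provider_register() before
+ * any attach.
+ */
+#ifdef XHAT_EXAMPLE_CODE
+static xhat_provider_t my_xhat_provider = {
+	XHAT_PROVIDER_VERSION,	/* xhat_provider_version */
+	0,			/* xhat_provider_refcnt */
+	NULL,			/* next */
+	NULL,			/* prev */
+	"my_xhat",		/* xhat_provider_name */
+	NULL,			/* xblkcache */
+	&my_xhat_ops,		/* xhat_provider_ops */
+	0			/* xhat_provider_blk_size (provider-specific) */
+};
+#endif	/* XHAT_EXAMPLE_CODE */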
+
+/*
+ * The xhat structure is protected by xhat_lock.
+ * A particular xhat implementation is an extension of the
+ * xhat structure and may contain its own lock(s) to
+ * protect those additional fields.
+ * The xhat structure is never allocated directly.
+ * Instead, its allocation is provided by the hat implementation.
+ * The xhat provider ops xhat_alloc/xhat_free are used to
+ * allocate and free an implementation-dependent xhat structure.
+ */
+struct xhat {
+ xhat_provider_t *xhat_provider;
+ struct as *xhat_as;
+ void *arg;
+ struct xhat *prev;
+ struct xhat *next;
+ kmutex_t xhat_lock;
+ int xhat_refcnt;
+ kthread_t *holder;
+};
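+
+/*
+ * Illustrative sketch (guarded out, continuing the hypothetical
+ * my_xhat example): a provider typically embeds struct xhat as the
+ * first member of its own, larger structure and implements
+ * xhat_alloc/xhat_free to allocate and free that structure.
+ */
+#ifdef XHAT_EXAMPLE_CODE
+struct my_xhat {
+	struct xhat	mx_xhat;	/* framework-visible part, first member */
+	void		*mx_private;	/* provider-specific state */
+};
+
+static struct xhat *
+my_xhat_alloc(void *arg)
+{
+	struct my_xhat *mx = kmem_zalloc(sizeof (struct my_xhat), KM_SLEEP);
+
+	mx->mx_private = arg;	/* the framework also records arg in xhat.arg */
+	return (&mx->mx_xhat);	/* same address as mx, since it is first */
+}
+
+static void
+my_xhat_free(struct xhat *xh)
+{
+	kmem_free(xh, sizeof (struct my_xhat));
+}
+#endif	/* XHAT_EXAMPLE_CODE */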
+
+
+/* Error codes */
+#define XH_PRVDR (1) /* Provider-specific error */
+#define XH_ASBUSY (2) /* Address space is busy */
+#define XH_XHHELD (3) /* XHAT is being held */
+#define XH_NOTATTCHD (4) /* Provider is not attached to as */
+
+
+int xhat_provider_register(xhat_provider_t *);
+int xhat_provider_unregister(xhat_provider_t *);
+void xhat_init(void);
+int xhat_attach_xhat(xhat_provider_t *, struct as *, struct xhat **,
+ void *);
+int xhat_detach_xhat(xhat_provider_t *, struct as *);
+pfn_t xhat_insert_xhatblk(page_t *, struct xhat *, void **);
+int xhat_delete_xhatblk(void *, int);
+void xhat_hat_hold(struct xhat *);
+void xhat_hat_rele(struct xhat *);
+int xhat_hat_holders(struct xhat *);
+
+void xhat_free_start_all(struct as *);
+void xhat_free_end_all(struct as *);
+int xhat_dup_all(struct as *, struct as *, caddr_t, size_t, uint_t);
+void xhat_swapout_all(struct as *);
+void xhat_unload_callback_all(struct as *, caddr_t, size_t, uint_t,
+ hat_callback_t *);
+void xhat_setattr_all(struct as *, caddr_t, size_t, uint_t);
+void xhat_clrattr_all(struct as *, caddr_t, size_t, uint_t);
+void xhat_chgattr_all(struct as *, caddr_t, size_t, uint_t);
+void xhat_chgprot_all(struct as *, caddr_t, size_t, uint_t);
+void xhat_unshare_all(struct as *, caddr_t, size_t);
+
+
+#endif /* _ASM */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _VM_XHAT_H */