author		stevel@tonic-gate <none@none>	2005-06-14 00:00:00 -0700
committer	stevel@tonic-gate <none@none>	2005-06-14 00:00:00 -0700
commit		7c478bd95313f5f23a4c958a745db2134aa03244 (patch)
tree		c871e58545497667cbb4b0a4f2daf204743e1fe7 /usr/src/uts/common/vm/page_lock.c
download	illumos-joyent-7c478bd95313f5f23a4c958a745db2134aa03244.tar.gz

OpenSolaris Launch
Diffstat (limited to 'usr/src/uts/common/vm/page_lock.c')
-rw-r--r--	usr/src/uts/common/vm/page_lock.c	861
1 file changed, 861 insertions, 0 deletions
diff --git a/usr/src/uts/common/vm/page_lock.c b/usr/src/uts/common/vm/page_lock.c
new file mode 100644
index 0000000000..9a2d12dd8e
--- /dev/null
+++ b/usr/src/uts/common/vm/page_lock.c
@@ -0,0 +1,861 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * VM - page locking primitives
+ */
+#include <sys/param.h>
+#include <sys/t_lock.h>
+#include <sys/vtrace.h>
+#include <sys/debug.h>
+#include <sys/cmn_err.h>
+#include <sys/vnode.h>
+#include <sys/bitmap.h>
+#include <sys/lockstat.h>
+#include <sys/condvar_impl.h>
+#include <vm/page.h>
+#include <vm/seg_enum.h>
+#include <vm/vm_dep.h>
+
+/*
+ * This global mutex is for logical page locking.
+ * The following fields in the page structure are protected
+ * by this lock:
+ *
+ * p_lckcnt
+ * p_cowcnt
+ */
+kmutex_t page_llock;
+
+/*
+ * This is a global lock for the logical page free list. The
+ * logical free list, in this implementation, is maintained as two
+ * separate physical lists - the cache list and the free list.
+ */
+kmutex_t page_freelock;
+
+/*
+ * The hash table, page_hash[], the p_selock fields, and the
+ * list of pages associated with vnodes are protected by arrays of mutexes.
+ *
+ * Unless the hashes are changed radically, the table sizes must be
+ * a power of two. Also, we typically need more mutexes for the
+ * vnodes since these locks are occasionally held for long periods.
+ * And since there seem to be two special vnodes (kvp and swapvp),
+ * we make room for private mutexes for them.
+ *
+ * The pse_mutex[] array holds the mutexes to protect the p_selock
+ * fields of all page_t structures.
+ *
+ * PAGE_SE_MUTEX(pp) returns the address of the appropriate mutex
+ * when given a pointer to a page_t.
+ *
+ * PSE_TABLE_SIZE must be a power of two. One could argue that we
+ * should go to the trouble of setting it up at run time and base it
+ * on memory size rather than the number of compile time CPUs.
+ *
+ * XX64 We should be using physmem size to calculate PSE_TABLE_SIZE,
+ * PSE_SHIFT, PIO_SHIFT.
+ *
+ * These might break in a 64-bit world.
+ */
+#define PSE_SHIFT 7 /* log2(PSE_TABLE_SIZE) */
+
+#define PSE_TABLE_SIZE 128 /* number of mutexes to have */
+
+#define PIO_SHIFT PSE_SHIFT /* next power of 2 bigger than page_t */
+#define PIO_TABLE_SIZE PSE_TABLE_SIZE /* number of io mutexes to have */
+
+pad_mutex_t ph_mutex[PH_TABLE_SIZE];
+pad_mutex_t pse_mutex[PSE_TABLE_SIZE];
+kmutex_t pio_mutex[PIO_TABLE_SIZE];
+
+#define PAGE_SE_MUTEX(pp) \
+ &pse_mutex[((((uintptr_t)(pp) >> PSE_SHIFT) ^ \
+ ((uintptr_t)(pp) >> (PSE_SHIFT << 1))) & \
+ (PSE_TABLE_SIZE - 1))].pad_mutex
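+
+/*
+ * Worked example (illustrative only, not from the original source): for a
+ * hypothetical page_t at address 0x12345680, PAGE_SE_MUTEX computes
+ *
+ *	((0x12345680 >> 7) ^ (0x12345680 >> 14)) & (128 - 1)
+ *	    = (0x2468ad ^ 0x48d1) & 0x7f
+ *	    = 0x24207c & 0x7f
+ *	    = 124
+ *
+ * so that page's p_selock is protected by pse_mutex[124].pad_mutex.
+ * XORing two different shifts of the address spreads page_t structures
+ * that are near each other in memory across the table.
+ */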
+
+#define PAGE_IO_MUTEX(pp) \
+ &pio_mutex[(((uintptr_t)pp) >> PIO_SHIFT) & (PIO_TABLE_SIZE - 1)]
+
+#define PSZC_MTX_TABLE_SIZE 128
+#define PSZC_MTX_TABLE_SHIFT 7
+
+static pad_mutex_t pszc_mutex[PSZC_MTX_TABLE_SIZE];
+
+#define PAGE_SZC_MUTEX(_pp) \
+ &pszc_mutex[((((uintptr_t)(_pp) >> PSZC_MTX_TABLE_SHIFT) ^ \
+ ((uintptr_t)(_pp) >> (PSZC_MTX_TABLE_SHIFT << 1)) ^ \
+ ((uintptr_t)(_pp) >> (3 * PSZC_MTX_TABLE_SHIFT))) & \
+ (PSZC_MTX_TABLE_SIZE - 1))].pad_mutex
+
+/*
+ * The vph_mutex[] array holds the mutexes to protect the vnode chains,
+ * (i.e., the list of pages anchored by v_pages and connected via p_vpprev
+ * and p_vpnext).
+ *
+ * The page_vnode_mutex(vp) function returns the address of the appropriate
+ * mutex from this array given a pointer to a vnode. It is complicated
+ * by the fact that the kernel's vnode and the swapfs vnode are referenced
+ * frequently enough to warrant their own mutexes.
+ *
+ * The VP_HASH_FUNC returns the index into the vph_mutex array given
+ * an address of a vnode.
+ */
+
+/*
+ * XX64 VPH_TABLE_SIZE and VP_HASH_FUNC might break in a 64-bit world.
+ * Need to review again.
+ */
+#define VPH_TABLE_SIZE (2 << VP_SHIFT)
+
+#define VP_HASH_FUNC(vp) \
+ ((((uintptr_t)(vp) >> 6) + \
+ ((uintptr_t)(vp) >> 8) + \
+ ((uintptr_t)(vp) >> 10) + \
+ ((uintptr_t)(vp) >> 12)) \
+ & (VPH_TABLE_SIZE - 1))
+
+extern struct vnode kvp;
+
+kmutex_t vph_mutex[VPH_TABLE_SIZE + 2];
+
+/*
+ * Initialize the locks used by the Virtual Memory Management system.
+ * Currently a no-op: the locks above are statically allocated and need
+ * no run-time initialization.
+ */
+void
+page_lock_init()
+{
+}
+
+/*
+ * At present we only use page ownership to aid debugging, so it's
+ * OK if the owner field isn't exact. In the 32-bit world two thread ids
+ * can map to the same owner because we just 'or' in 0x80000000 and
+ * then clear the second highest bit, so that (for example) 0x2faced00
+ * and 0xafaced00 both map to 0xafaced00.
+ * In the 64-bit world, p_selock may not be large enough to hold a full
+ * thread pointer. If we ever need precise ownership (e.g. if we implement
+ * priority inheritance for page locks) then p_selock should become a
+ * uintptr_t and SE_WRITER should be -((uintptr_t)curthread >> 2).
+ */
+#define SE_WRITER (((selock_t)(ulong_t)curthread | INT_MIN) & ~SE_EWANTED)
+#define SE_READER 1
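+
+/*
+ * A reader's guide to the resulting p_selock states (derived from the
+ * definitions above and the code below):
+ *
+ *	0			unlocked
+ *	n * SE_READER (n > 0)	held shared by n readers
+ *	SE_WRITER		held exclusively (negative; approximately
+ *				encodes the owning thread, see above)
+ *
+ * The SE_EWANTED bit may additionally be set in p_selock to record that
+ * a would-be writer is waiting.
+ */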
+
+/*
+ * A page that is deleted must be marked as such using the
+ * page_lock_delete() function. The page must be exclusively locked.
+ * The SE_DELETED marker is put in p_selock when this function is called.
+ * SE_DELETED must be distinct from any SE_WRITER value.
+ */
+#define SE_DELETED (1 | INT_MIN)
+
+#ifdef VM_STATS
+uint_t vph_kvp_count;
+uint_t vph_swapfsvp_count;
+uint_t vph_other;
+#endif /* VM_STATS */
+
+#ifdef VM_STATS
+uint_t page_lock_count;
+uint_t page_lock_miss;
+uint_t page_lock_miss_lock;
+uint_t page_lock_reclaim;
+uint_t page_lock_bad_reclaim;
+uint_t page_lock_same_page;
+uint_t page_lock_upgrade;
+uint_t page_lock_upgrade_failed;
+uint_t page_lock_deleted;
+
+uint_t page_trylock_locked;
+uint_t page_trylock_missed;
+
+uint_t page_try_reclaim_upgrade;
+#endif /* VM_STATS */
+
+
+/*
+ * Acquire the "shared/exclusive" lock on a page.
+ *
+ * Returns 1 on success and locks the page appropriately.
+ * 0 on failure and does not lock the page.
+ *
+ * If `lock' is non-NULL, it will be dropped and reacquired in the
+ * failure case. This routine can block, and if it does
+ * it will always return a failure since the page identity [vp, off]
+ * or state may have changed.
+ */
+
+int
+page_lock(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim)
+{
+ return (page_lock_es(pp, se, lock, reclaim, 0));
+}
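+
+/*
+ * Illustrative sketch (hypothetical caller, not from the original
+ * source) of how the `lock' parameter is intended to be used: find a
+ * page under its hash-chain mutex, then lock it, retrying the lookup
+ * when page_lock() had to block (it returns failure after dropping and
+ * reacquiring `phm', since the page identity may have changed):
+ *
+ *	top:
+ *		mutex_enter(phm);
+ *		pp = <find page on the hash chain>;
+ *		if (pp != NULL && !page_lock(pp, SE_SHARED, phm, P_RECLAIM)) {
+ *			mutex_exit(phm);
+ *			goto top;
+ *		}
+ *		mutex_exit(phm);
+ */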
+
+/*
+ * With the addition of reader-writer lock semantics to page_lock_es,
+ * callers wanting an exclusive (writer) lock may keep a stream of
+ * shared-lock (reader) requests from starving them by setting the es
+ * parameter to SE_EXCL_WANTED.  In this case, when an exclusive lock
+ * cannot be acquired, p_selock's SE_EWANTED bit is set.
+ * This bit, along with the se and es parameters, are used to decide
+ * if the requested lock should be granted:
+ *
+ * Lock wanted	SE_EXCL_WANTED	p_selock/SE_EWANTED	Action
+ * ----------	--------------	-------------------	---------
+ * SE_EXCL	no		dont-care/1		deny lock
+ * SE_EXCL	any (see note)	unlocked/any		grant lock, clear SE_EWANTED
+ * SE_EXCL	yes		any lock/any		deny, set SE_EWANTED
+ * SE_EXCL	no		any lock/any		deny
+ * SE_SHARED	not applicable	shared/0		grant
+ * SE_SHARED	not applicable	unlocked/0		grant
+ * SE_SHARED	not applicable	shared/1		deny
+ * SE_SHARED	not applicable	unlocked/1		deny
+ * SE_SHARED	not applicable	excl/any		deny
+ *
+ * Note: the code grants an exclusive lock to the caller and clears
+ * SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED
+ * bit's value. This was deemed acceptable as we are not concerned about
+ * exclusive-lock starvation. If this ever becomes an issue, a priority or
+ * fifo mechanism should also be implemented.
+ */
+int
+page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
+{
+ int retval;
+ kmutex_t *pse = PAGE_SE_MUTEX(pp);
+ int upgraded;
+ int reclaim_it;
+
+ ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);
+
+ VM_STAT_ADD(page_lock_count);
+
+ upgraded = 0;
+ reclaim_it = 0;
+
+ mutex_enter(pse);
+
+ /*
+ * Current uses of 'es':
+ * es == 1			page_lookup_create will attempt page relocation
+ * es == SE_EXCL_WANTED	caller wants SE_EWANTED set (e.g. the delete
+ *				memory thread); this prevents reader-starvation
+ *				of waiting writer thread(s).
+ */
+
+ ASSERT(((es & SE_EXCL_WANTED) == 0) ||
+ ((es == SE_EXCL_WANTED) && (se == SE_EXCL)));
+
+ if (se == SE_SHARED && es == 1 && pp->p_selock == 0) {
+ se = SE_EXCL;
+ }
+
+ if ((reclaim == P_RECLAIM) && (PP_ISFREE(pp))) {
+
+ reclaim_it = 1;
+ if (se == SE_SHARED) {
+ /*
+ * This is an interesting situation.
+ *
+ * Remember that p_free can only change if
+ * p_selock < 0.
+ * p_free does not depend on our holding `pse'.
+ * And, since we hold `pse', p_selock cannot change.
+ * So, if p_free changes on us, the page is already
+ * exclusively held, and we would fail to get p_selock
+ * regardless.
+ *
+ * We want to avoid getting the share
+ * lock on a free page that needs to be reclaimed.
+ * It is possible that some other thread has the share
+ * lock and has left the free page on the cache list.
+ * pvn_vplist_dirty() does this for brief periods.
+ * If p_selock currently holds SE_EXCL, we will fail
+ * to acquire p_selock anyway.  Blocking is the
+ * right thing to do.
+ * If we need to reclaim this page, we must get
+ * exclusive access to it, so force the upgrade now.
+ * Again, if the page is not free we will fail to
+ * acquire p_selock and block.
+ */
+ upgraded = 1;
+ se = SE_EXCL;
+ VM_STAT_ADD(page_lock_upgrade);
+ }
+ }
+
+ if (se == SE_EXCL) {
+ if ((es != SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) {
+ /*
+ * if the caller wants a writer lock (but did not
+ * specify exclusive access), and there is a pending
+ * writer that wants exclusive access, return failure
+ */
+ retval = 0;
+ } else if ((pp->p_selock & ~SE_EWANTED) == 0) {
+ /* no reader/writer lock held */
+ THREAD_KPRI_REQUEST();
+ /* this clears our setting of the SE_EWANTED bit */
+ pp->p_selock = SE_WRITER;
+ retval = 1;
+ } else {
+ /* page is locked */
+ if (es == SE_EXCL_WANTED) {
+ /* set the SE_EWANTED bit */
+ pp->p_selock |= SE_EWANTED;
+ }
+ retval = 0;
+ }
+ } else {
+ retval = 0;
+ if (pp->p_selock >= 0) {
+ /* readers are not allowed when excl wanted */
+ if (!(pp->p_selock & SE_EWANTED)) {
+ pp->p_selock += SE_READER;
+ retval = 1;
+ }
+ }
+ }
+
+ if (retval == 0) {
+ if ((pp->p_selock & ~SE_EWANTED) == SE_DELETED) {
+ VM_STAT_ADD(page_lock_deleted);
+ mutex_exit(pse);
+ return (retval);
+ }
+
+#ifdef VM_STATS
+ VM_STAT_ADD(page_lock_miss);
+ if (upgraded) {
+ VM_STAT_ADD(page_lock_upgrade_failed);
+ }
+#endif
+ if (lock) {
+ VM_STAT_ADD(page_lock_miss_lock);
+ mutex_exit(lock);
+ }
+
+ /*
+ * Now, wait for the page to be unlocked and
+ * release the lock protecting p_cv and p_selock.
+ */
+ cv_wait(&pp->p_cv, pse);
+ mutex_exit(pse);
+
+ /*
+ * The page identity may have changed while we were
+ * blocked. If we are willing to depend on "pp"
+ * still pointing to a valid page structure (i.e.,
+ * assuming page structures are not dynamically allocated
+ * or freed), we could try to lock the page if its
+ * identity hasn't changed.
+ *
+ * This needs to be measured: since we come back from
+ * cv_wait holding pse (the expensive part of this
+ * operation), we might as well try the cheap part.
+ * Though we would also have to confirm that dropping
+ * `lock' did not cause any grief to the callers.
+ */
+ if (lock) {
+ mutex_enter(lock);
+ }
+ } else {
+ /*
+ * We have the page lock.
+ * If we needed to reclaim the page, and the page
+ * needed reclaiming (i.e., it was free), then we
+ * have the page exclusively locked. We may need
+ * to downgrade the page.
+ */
+ ASSERT((upgraded) ?
+ ((PP_ISFREE(pp)) && PAGE_EXCL(pp)) : 1);
+ mutex_exit(pse);
+
+ /*
+ * We now hold this page's lock, either shared or
+ * exclusive. This will prevent its identity from changing.
+ * The page, however, may or may not be free. If the caller
+ * requested, and it is free, go reclaim it from the
+ * free list. If the page can't be reclaimed, return failure
+ * so that the caller can start all over again.
+ *
+ * NOTE: page_reclaim() releases the page lock (p_selock)
+ * if it can't be reclaimed.
+ */
+ if (reclaim_it) {
+ if (!page_reclaim(pp, lock)) {
+ VM_STAT_ADD(page_lock_bad_reclaim);
+ retval = 0;
+ } else {
+ VM_STAT_ADD(page_lock_reclaim);
+ if (upgraded) {
+ page_downgrade(pp);
+ }
+ }
+ }
+ }
+ return (retval);
+}
+
+/*
+ * Clear the SE_EWANTED bit from p_selock. This function allows
+ * callers of page_lock_es and page_try_reclaim_lock to clear
+ * their setting of this bit if they decide they no longer wish
+ * to gain exclusive access to the page. Currently only
+ * delete_memory_thread uses this when the delete memory
+ * operation is cancelled.
+ */
+void
+page_lock_clr_exclwanted(page_t *pp)
+{
+ kmutex_t *pse = PAGE_SE_MUTEX(pp);
+
+ mutex_enter(pse);
+ pp->p_selock &= ~SE_EWANTED;
+ if (CV_HAS_WAITERS(&pp->p_cv))
+ cv_broadcast(&pp->p_cv);
+ mutex_exit(pse);
+}
+
+/*
+ * Read the comments inside of page_lock_es() carefully.
+ *
+ * SE_EXCL callers specifying es == SE_EXCL_WANTED will cause the
+ * SE_EWANTED bit of p_selock to be set when the lock cannot be obtained.
+ * This is used by threads subject to reader-starvation (e.g. memory delete).
+ *
+ * When a thread using SE_EXCL_WANTED does not obtain the SE_EXCL lock,
+ * it is expected that it will retry at a later time. Threads that will
+ * not retry the lock *must* call page_lock_clr_exclwanted to clear the
+ * SE_EWANTED bit. (When a thread using SE_EXCL_WANTED obtains the lock,
+ * the bit is cleared.)
+ */
+int
+page_try_reclaim_lock(page_t *pp, se_t se, int es)
+{
+ kmutex_t *pse = PAGE_SE_MUTEX(pp);
+ selock_t old;
+
+ mutex_enter(pse);
+
+ old = pp->p_selock;
+
+ ASSERT(((es & SE_EXCL_WANTED) == 0) ||
+ ((es == SE_EXCL_WANTED) && (se == SE_EXCL)));
+
+ if (se == SE_SHARED && es == 1 && old == 0) {
+ se = SE_EXCL;
+ }
+
+ if (se == SE_SHARED) {
+ if (!PP_ISFREE(pp)) {
+ if (old >= 0) {
+ /* readers are not allowed when excl wanted */
+ if (!(old & SE_EWANTED)) {
+ pp->p_selock = old + SE_READER;
+ mutex_exit(pse);
+ return (1);
+ }
+ }
+ mutex_exit(pse);
+ return (0);
+ }
+ /*
+ * The page is free, so we really want SE_EXCL (below)
+ */
+ VM_STAT_ADD(page_try_reclaim_upgrade);
+ }
+
+ /*
+ * The caller wants a writer lock. We try for it only if
+ * SE_EWANTED is not set, or if the caller specified
+ * SE_EXCL_WANTED.
+ */
+ if (!(old & SE_EWANTED) || (es == SE_EXCL_WANTED)) {
+ if ((old & ~SE_EWANTED) == 0) {
+ /* no reader/writer lock held */
+ THREAD_KPRI_REQUEST();
+ /* this clears out our setting of the SE_EWANTED bit */
+ pp->p_selock = SE_WRITER;
+ mutex_exit(pse);
+ return (1);
+ }
+ }
+ if (es == SE_EXCL_WANTED) {
+ /* page is locked, set the SE_EWANTED bit */
+ pp->p_selock |= SE_EWANTED;
+ }
+ mutex_exit(pse);
+ return (0);
+}
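+
+/*
+ * Illustrative sketch of the SE_EXCL_WANTED protocol described above
+ * (a hypothetical caller in the style of the memory-delete thread;
+ * `cancelled' is an assumed flag, not part of this file):
+ *
+ *	while (!page_try_reclaim_lock(pp, SE_EXCL, SE_EXCL_WANTED)) {
+ *		if (cancelled) {
+ *			page_lock_clr_exclwanted(pp);
+ *			return;
+ *		}
+ *		delay(hz);	back off; readers are now being denied
+ *	}
+ *	now held SE_EXCL; our SE_EWANTED setting was cleared by the grant
+ */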
+
+/*
+ * Acquire a page's "shared/exclusive" lock, but never block.
+ * Returns 1 on success, 0 on failure.
+ */
+int
+page_trylock(page_t *pp, se_t se)
+{
+ kmutex_t *pse = PAGE_SE_MUTEX(pp);
+
+ mutex_enter(pse);
+ if (pp->p_selock & SE_EWANTED) {
+ /* fail if a thread wants exclusive access */
+ mutex_exit(pse);
+ return (0);
+ }
+
+ if (se == SE_EXCL) {
+ if (pp->p_selock == 0) {
+ THREAD_KPRI_REQUEST();
+ pp->p_selock = SE_WRITER;
+ mutex_exit(pse);
+ return (1);
+ }
+ } else {
+ if (pp->p_selock >= 0) {
+ pp->p_selock += SE_READER;
+ mutex_exit(pse);
+ return (1);
+ }
+ }
+ mutex_exit(pse);
+ return (0);
+}
+
+/*
+ * Release the page's "shared/exclusive" lock and wake up anyone
+ * who might be waiting for it.
+ */
+void
+page_unlock(page_t *pp)
+{
+ kmutex_t *pse = PAGE_SE_MUTEX(pp);
+ selock_t old;
+
+ mutex_enter(pse);
+ old = pp->p_selock;
+ if ((old & ~SE_EWANTED) == SE_READER) {
+ pp->p_selock = old & ~SE_READER;
+ if (CV_HAS_WAITERS(&pp->p_cv))
+ cv_broadcast(&pp->p_cv);
+ } else if ((old & ~SE_EWANTED) == SE_DELETED) {
+ panic("page_unlock: page %p is deleted", pp);
+ } else if (old < 0) {
+ THREAD_KPRI_RELEASE();
+ pp->p_selock &= SE_EWANTED;
+ if (CV_HAS_WAITERS(&pp->p_cv))
+ cv_broadcast(&pp->p_cv);
+ } else if ((old & ~SE_EWANTED) > SE_READER) {
+ pp->p_selock = old - SE_READER;
+ } else {
+ panic("page_unlock: page %p is not locked", pp);
+ }
+ mutex_exit(pse);
+}
+
+/*
+ * Try to upgrade the lock on the page from a "shared" to an
+ * "exclusive" lock. Since this upgrade operation is done while
+ * holding the mutex protecting this page, no one else can acquire this page's
+ * lock and change the page. Thus, it is safe to drop the "shared"
+ * lock and attempt to acquire the "exclusive" lock.
+ *
+ * Returns 1 on success, 0 on failure.
+ */
+int
+page_tryupgrade(page_t *pp)
+{
+ kmutex_t *pse = PAGE_SE_MUTEX(pp);
+
+ mutex_enter(pse);
+ if (!(pp->p_selock & SE_EWANTED)) {
+ /* no threads want exclusive access, try upgrade */
+ if (pp->p_selock == SE_READER) {
+ THREAD_KPRI_REQUEST();
+ /* convert to exclusive lock */
+ pp->p_selock = SE_WRITER;
+ mutex_exit(pse);
+ return (1);
+ }
+ }
+ mutex_exit(pse);
+ return (0);
+}
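+
+/*
+ * A common pattern (hypothetical caller, sketched here for clarity):
+ * take the shared lock first, upgrade only when modification turns out
+ * to be necessary, and on upgrade failure fall back to dropping the
+ * lock and reacquiring it exclusively, revalidating afterwards:
+ *
+ *	if (!page_tryupgrade(pp)) {
+ *		page_unlock(pp);
+ *		if (!page_lock(pp, SE_EXCL, NULL, P_NO_RECLAIM))
+ *			return (0);
+ *		the page may have changed while unlocked; recheck it
+ *	}
+ */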
+
+/*
+ * Downgrade the "exclusive" lock on the page to a "shared" lock
+ * while holding the mutex protecting this page's p_selock field.
+ */
+void
+page_downgrade(page_t *pp)
+{
+ kmutex_t *pse = PAGE_SE_MUTEX(pp);
+ int excl_waiting;
+
+ ASSERT((pp->p_selock & ~SE_EWANTED) != SE_DELETED);
+ ASSERT(PAGE_EXCL(pp));
+
+ mutex_enter(pse);
+ excl_waiting = pp->p_selock & SE_EWANTED;
+ THREAD_KPRI_RELEASE();
+ pp->p_selock = SE_READER | excl_waiting;
+ if (CV_HAS_WAITERS(&pp->p_cv))
+ cv_broadcast(&pp->p_cv);
+ mutex_exit(pse);
+}
+
+void
+page_lock_delete(page_t *pp)
+{
+ kmutex_t *pse = PAGE_SE_MUTEX(pp);
+
+ ASSERT(PAGE_EXCL(pp));
+ ASSERT(pp->p_vnode == NULL);
+ ASSERT(pp->p_offset == (u_offset_t)-1);
+ ASSERT(!PP_ISFREE(pp));
+
+ mutex_enter(pse);
+ THREAD_KPRI_RELEASE();
+ pp->p_selock = SE_DELETED;
+ if (CV_HAS_WAITERS(&pp->p_cv))
+ cv_broadcast(&pp->p_cv);
+ mutex_exit(pse);
+}
+
+/*
+ * Implement the io lock for pages
+ */
+void
+page_iolock_init(page_t *pp)
+{
+ pp->p_iolock_state = 0;
+ cv_init(&pp->p_io_cv, NULL, CV_DEFAULT, NULL);
+}
+
+/*
+ * Acquire the i/o lock on a page.
+ */
+void
+page_io_lock(page_t *pp)
+{
+ kmutex_t *pio;
+
+ pio = PAGE_IO_MUTEX(pp);
+ mutex_enter(pio);
+ while (pp->p_iolock_state & PAGE_IO_INUSE) {
+ cv_wait(&(pp->p_io_cv), pio);
+ }
+ pp->p_iolock_state |= PAGE_IO_INUSE;
+ mutex_exit(pio);
+}
+
+/*
+ * Release the i/o lock on a page.
+ */
+void
+page_io_unlock(page_t *pp)
+{
+ kmutex_t *pio;
+
+ pio = PAGE_IO_MUTEX(pp);
+ mutex_enter(pio);
+ cv_signal(&pp->p_io_cv);
+ pp->p_iolock_state &= ~PAGE_IO_INUSE;
+ mutex_exit(pio);
+}
+
+/*
+ * Try to acquire the i/o lock on a page without blocking.
+ * Returns 1 on success, 0 on failure.
+ */
+int
+page_io_trylock(page_t *pp)
+{
+ kmutex_t *pio;
+
+ if (pp->p_iolock_state & PAGE_IO_INUSE)
+ return (0);
+
+ pio = PAGE_IO_MUTEX(pp);
+ mutex_enter(pio);
+
+ if (pp->p_iolock_state & PAGE_IO_INUSE) {
+ mutex_exit(pio);
+ return (0);
+ }
+ pp->p_iolock_state |= PAGE_IO_INUSE;
+ mutex_exit(pio);
+
+ return (1);
+}
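+
+/*
+ * Illustrative sketch: the i/o lock serializes i/o on a page while the
+ * shared/exclusive lock protects its identity, so a hypothetical
+ * pageout-style caller would typically hold both:
+ *
+ *	if (page_trylock(pp, SE_EXCL)) {
+ *		page_io_lock(pp);
+ *		... start i/o on pp ...
+ *		page_io_unlock(pp);
+ *		page_unlock(pp);
+ *	}
+ */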
+
+/*
+ * Report whether the i/o lock on a page is held, for use in ASSERT()s.
+ * Returns nonzero if held, 0 if not.
+ */
+int
+page_iolock_assert(page_t *pp)
+{
+ return (pp->p_iolock_state & PAGE_IO_INUSE);
+}
+
+/*
+ * Wrapper exported to kernel routines that are built
+ * platform-independent (the macro is platform-dependent;
+ * the size of vph_mutex[] is based on NCPU).
+ *
+ * Note that you can stress test this by setting the variable
+ * page_vnode_mutex_stress to something other than zero in a DEBUG
+ * kernel, from a debugger, after the kernel has been loaded but before
+ * it is running; setting it once the kernel is already running may not
+ * work correctly.
+ */
+#ifdef DEBUG
+static int page_vnode_mutex_stress = 0;
+#endif
+
+kmutex_t *
+page_vnode_mutex(vnode_t *vp)
+{
+ if (vp == &kvp)
+ return (&vph_mutex[VPH_TABLE_SIZE + 0]);
+#ifdef DEBUG
+ if (page_vnode_mutex_stress != 0)
+ return (&vph_mutex[0]);
+#endif
+
+ return (&vph_mutex[VP_HASH_FUNC(vp)]);
+}
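+
+/*
+ * Illustrative sketch (hypothetical caller): a walker of a vnode's page
+ * list holds the chain mutex while following the circular v_pages list:
+ *
+ *	kmutex_t *vphm = page_vnode_mutex(vp);
+ *
+ *	mutex_enter(vphm);
+ *	if ((pp = vp->v_pages) != NULL) {
+ *		do {
+ *			examine pp; do not block while vphm is held
+ *		} while ((pp = pp->p_vpnext) != vp->v_pages);
+ *	}
+ *	mutex_exit(vphm);
+ */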
+
+kmutex_t *
+page_se_mutex(page_t *pp)
+{
+ return (PAGE_SE_MUTEX(pp));
+}
+
+#ifdef VM_STATS
+uint_t pszclck_stat[4];
+#endif
+/*
+ * Find, take and return a mutex held by hat_page_demote().
+ * Called by page_demote_vp_pages() before hat_page_demote() call and by
+ * routines that want to block hat_page_demote() but can't do it
+ * via locking all constituent pages.
+ *
+ * Return NULL if p_szc is 0.
+ *
+ * It should only be used for pages that can be demoted by hat_page_demote(),
+ * i.e., non-swapfs file system pages.  The logic here is lifted from
+ * sfmmu_mlspl_enter() except there's no need to worry about p_szc increasing
+ * since the page is locked and not free.
+ *
+ * The hash of the root page is used to find the lock.
+ * To find the root in the presence of hat_page_demote() changing the
+ * location of the root, this routine relies on the fact that
+ * hat_page_demote() changes the root last.
+ *
+ * If NULL is returned, pp's p_szc is guaranteed to be 0.  If non-NULL is
+ * returned, pp's p_szc may be any value.
+ */
+kmutex_t *
+page_szc_lock(page_t *pp)
+{
+ kmutex_t *mtx;
+ page_t *rootpp;
+ uint_t szc;
+ uint_t rszc;
+ uint_t pszc = pp->p_szc;
+
+ ASSERT(pp != NULL);
+ ASSERT(PAGE_LOCKED(pp));
+ ASSERT(!PP_ISFREE(pp));
+ ASSERT(pp->p_vnode != NULL);
+ ASSERT(!IS_SWAPFSVP(pp->p_vnode));
+ ASSERT(pp->p_vnode != &kvp);
+
+again:
+ if (pszc == 0) {
+ VM_STAT_ADD(pszclck_stat[0]);
+ return (NULL);
+ }
+
+ /* The lock lives in the root page */
+
+ rootpp = PP_GROUPLEADER(pp, pszc);
+ mtx = PAGE_SZC_MUTEX(rootpp);
+ mutex_enter(mtx);
+
+ /*
+ * Since p_szc can only decrease if pp == rootpp, rootpp will
+ * always be the same, i.e. we have the right root regardless
+ * of rootpp->p_szc.
+ * If the location of pp's root didn't change after we took
+ * the lock, we have the right root; return the mutex hashed
+ * off of it.
+ */
+ if (pp == rootpp || (rszc = rootpp->p_szc) == pszc) {
+ VM_STAT_ADD(pszclck_stat[1]);
+ return (mtx);
+ }
+
+ /*
+ * The root location changed because the page was demoted;
+ * locate the new root.
+ */
+ if (rszc < pszc) {
+ szc = pp->p_szc;
+ ASSERT(szc < pszc);
+ mutex_exit(mtx);
+ pszc = szc;
+ VM_STAT_ADD(pszclck_stat[2]);
+ goto again;
+ }
+
+ VM_STAT_ADD(pszclck_stat[3]);
+ /*
+ * The current hat_page_demote() is not done yet;
+ * wait for it to finish.
+ */
+ mutex_exit(mtx);
+ rootpp = PP_GROUPLEADER(rootpp, rszc);
+ mtx = PAGE_SZC_MUTEX(rootpp);
+ mutex_enter(mtx);
+ mutex_exit(mtx);
+ ASSERT(rootpp->p_szc < rszc);
+ goto again;
+}
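+
+/*
+ * Illustrative use (hypothetical caller): hold the returned mutex to
+ * keep hat_page_demote() from changing p_szc, remembering that a NULL
+ * return means p_szc was (and, since the page is locked and not free,
+ * stays) 0:
+ *
+ *	kmutex_t *szc_mtx = page_szc_lock(pp);
+ *
+ *	... pp->p_szc is now stable against hat_page_demote() ...
+ *	if (szc_mtx != NULL)
+ *		mutex_exit(szc_mtx);
+ */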
+
+int
+page_szc_lock_assert(page_t *pp)
+{
+ page_t *rootpp = PP_PAGEROOT(pp);
+ kmutex_t *mtx = PAGE_SZC_MUTEX(rootpp);
+
+ return (MUTEX_HELD(mtx));
+}