author     praks <none@none>  2006-04-19 18:07:13 -0700
committer  praks <none@none>  2006-04-19 18:07:13 -0700
commit     a5652762e5f7bf683d19f18542e5e39df63bad79 (patch)
tree       113821225c18b190514811f3e27a638333bc2dcd /usr/src/uts/common/vm/vpm.c
parent     4ab777b1b0f310e59b52a57c79efa0571506942a (diff)
6256083 Need a lightweight file page mapping mechanism to substitute segmap
6387639 segkpm segment set to incorrect size for amd64
Diffstat (limited to 'usr/src/uts/common/vm/vpm.c')
-rw-r--r--  usr/src/uts/common/vm/vpm.c  | 1141
1 file changed, 1141 insertions, 0 deletions
diff --git a/usr/src/uts/common/vm/vpm.c b/usr/src/uts/common/vm/vpm.c
new file mode 100644
index 0000000000..1f4f2fdf58
--- /dev/null
+++ b/usr/src/uts/common/vm/vpm.c
@@ -0,0 +1,1141 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * VM - generic vnode page mapping interfaces.
+ *
+ * Mechanism to provide temporary mappings to vnode pages.
+ * The typical use would be to copy/access file data.
+ */
+
+#include <sys/types.h>
+#include <sys/t_lock.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/buf.h>
+#include <sys/systm.h>
+#include <sys/vnode.h>
+#include <sys/mman.h>
+#include <sys/errno.h>
+#include <sys/cred.h>
+#include <sys/kmem.h>
+#include <sys/vtrace.h>
+#include <sys/cmn_err.h>
+#include <sys/debug.h>
+#include <sys/thread.h>
+#include <sys/dumphdr.h>
+#include <sys/bitmap.h>
+#include <sys/lgrp.h>
+
+#include <vm/seg_kmem.h>
+#include <vm/hat.h>
+#include <vm/as.h>
+#include <vm/seg.h>
+#include <vm/seg_kpm.h>
+#include <vm/seg_map.h>
+#include <vm/page.h>
+#include <vm/pvn.h>
+#include <vm/rm.h>
+#include <vm/vpm.h>
+
+/*
+ * Needs to be enabled by each platform.
+ */
+int vpm_enable = 0;
+
+#ifdef SEGKPM_SUPPORT
+
+
+int vpm_cache_enable = 1;
+long vpm_cache_percent = 12;
+long vpm_cache_size;
+int vpm_nfreelist = 0;
+int vpmd_freemsk = 0;
+
+#define VPM_S_PAD 64
+union vpm_cpu {
+ struct {
+ int vcpu_free_ndx;
+ ulong_t vcpu_hits;
+ ulong_t vcpu_misses;
+ } vcpu;
+ char vpm_pad[VPM_S_PAD];
+};
+static union vpm_cpu *vpmd_cpu;
+
+#define vfree_ndx vcpu.vcpu_free_ndx
+
+int vpm_cachemode = VPMCACHE_LRU;
+
+#define PPMTX(pp) (&(pp)->p_ilock)
+
+static struct vpmap *vpmd_vpmap; /* list of vpmap structs preallocated */
+static struct vpmfree *vpmd_free;
+#define VPMAPMTX(vpm) (&vpm->vpm_mtx)
+#define VPMAP2VMF(vpm) (&vpmd_free[(vpm - vpmd_vpmap) & vpmd_freemsk])
+#define VPMAP2VMF_NDX(vpm) (ushort_t)((vpm - vpmd_vpmap) & vpmd_freemsk)
+#define VPMP(id) (&vpmd_vpmap[id - 1])
+#define VPMID(vpm) (uint_t)((vpm - vpmd_vpmap) + 1)
+
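+/*
+ * Note that the vpmap ids handed out by VPMID() are 1-based, so a page_t
+ * whose p_vpmref is 0 has no vpmap currently associated with it.
+ */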
+
+#ifdef DEBUG
+
+struct vpm_debug {
+ int vpmd_steals;
+ int vpmd_contend;
+ int vpmd_prevpagelocked;
+ int vpmd_getpagefailed;
+ int vpmd_zerostart;
+ int vpmd_emptyfreelist;
+ int vpmd_nofreevpms;
+} vpm_debug;
+
+#define VPM_DEBUG(x) ((vpm_debug.x)++)
+
+int steals;
+int steals_mtbf = 7;
+int contend;
+int contend_mtbf = 127;
+
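+/*
+ * VPM_MTBF(v, f) evaluates to false once every (f + 1) calls (f must be
+ * of the form 2^n - 1).  DEBUG kernels use it below to periodically force
+ * the vpmap steal and contention handling paths so they get exercised.
+ */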
+#define VPM_MTBF(v, f) (((++(v)) & (f)) != (f))
+
+#else /* DEBUG */
+
+#define VPM_MTBF(v, f) (1)
+#define VPM_DEBUG(x) /* nothing */
+
+#endif
+
+/*
+ * The vpm cache.
+ *
+ * The main purpose of having a cache here is to speed up page_lookup()
+ * operations and also provide an LRU (default) behaviour of file pages. The
+ * page_lookup() operation tends to be expensive if a page has to be
+ * reclaimed from the system page cache ("cachelist"). Once we speed up the
+ * page_lookup()->page_reclaim() path then there should be no need for
+ * this cache. The system page cache (cachelist) should effectively serve the
+ * purpose of caching file pages.
+ *
+ * This cache is very similar to segmap's smap cache. Each page in the
+ * cache is tracked by the structure vpmap_t. But unlike segmap, there is no
+ * hash table. The page_t has a reference to the vpmap_t when cached. For a
+ * given vnode and offset, the page is found by means of a page_lookup()
+ * operation. Any page which has a mapping (i.e., when cached) will not be in
+ * the system 'cachelist'. Hence the page_lookup() will not have to do a
+ * page_reclaim(). That is how the cache serves to speed up page_lookup()
+ * operations.
+ *
+ * This cache can be disabled by setting vpm_cache_enable = 0 in /etc/system.
+ */
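+
+/*
+ * An illustrative sketch (not the actual code; see get_vpmap() and
+ * get_free_vpmap() below for the real locking) of how a page maps to
+ * its vpmap slot:
+ *
+ *	pp = page_lookup(vp, off, SE_SHARED);
+ *	if (pp->p_vpmref != 0)
+ *		vpm = VPMP(pp->p_vpmref);	(cache hit)
+ *	else
+ *		vpm = get_free_vpmap(pp);	(cache miss, claim a slot)
+ */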
+
+void
+vpm_init()
+{
+ long npages;
+ struct vpmap *vpm;
+ struct vpmfree *vpmflp;
+ int i, ndx;
+ extern void prefetch_smap_w(void *);
+
+ if (!vpm_cache_enable) {
+ return;
+ }
+
+ /*
+ * Set the size of the cache.
+ */
+ vpm_cache_size = mmu_ptob((physmem * vpm_cache_percent)/100);
+ if (vpm_cache_size < VPMAP_MINCACHE) {
+ vpm_cache_size = VPMAP_MINCACHE;
+ }
+
+ /*
+ * Number of freelists.
+ */
+ if (vpm_nfreelist == 0) {
+ vpm_nfreelist = max_ncpus;
+ } else if (vpm_nfreelist < 0 || vpm_nfreelist > 2 * max_ncpus) {
+		cmn_err(CE_WARN, "vpm_init: invalid vpm_nfreelist %d, "
+		    "using %d", vpm_nfreelist, 2 * max_ncpus);
+ vpm_nfreelist = 2 * max_ncpus;
+ }
+
+ /*
+ * Round it up to the next power of 2
+ */
+ if (vpm_nfreelist & (vpm_nfreelist - 1)) {
+ vpm_nfreelist = 1 << (highbit(vpm_nfreelist));
+ }
+ vpmd_freemsk = vpm_nfreelist - 1;
+
+ /*
+ * Use a per cpu rotor index to spread the allocations evenly
+ * across the available vpm freelists.
+ */
+ vpmd_cpu = kmem_zalloc(sizeof (union vpm_cpu) * max_ncpus, KM_SLEEP);
+ ndx = 0;
+ for (i = 0; i < max_ncpus; i++) {
+
+ vpmd_cpu[i].vfree_ndx = ndx;
+ ndx = (ndx + 1) & vpmd_freemsk;
+ }
+
+ /*
+ * Allocate and initialize the freelist.
+ */
+ vpmd_free = kmem_zalloc(vpm_nfreelist * sizeof (struct vpmfree),
+ KM_SLEEP);
+ for (i = 0; i < vpm_nfreelist; i++) {
+
+ vpmflp = &vpmd_free[i];
+ /*
+ * Set up initial queue pointers. They will get flipped
+ * back and forth.
+ */
+ vpmflp->vpm_allocq = &vpmflp->vpm_freeq[VPMALLOCQ];
+ vpmflp->vpm_releq = &vpmflp->vpm_freeq[VPMRELEQ];
+ }
+
+ npages = mmu_btop(vpm_cache_size);
+
+
+ /*
+ * Allocate and initialize the vpmap structs.
+ */
+ vpmd_vpmap = kmem_zalloc(sizeof (struct vpmap) * npages, KM_SLEEP);
+ for (vpm = vpmd_vpmap; vpm <= &vpmd_vpmap[npages - 1]; vpm++) {
+ struct vpmfree *vpmflp;
+ union vpm_freeq *releq;
+ struct vpmap *vpmapf;
+
+ /*
+		 * Use prefetch as we have to walk through a large number of
+		 * these data structures. We just use the smap's prefetch
+		 * routine as it does the same. This should work fine
+		 * for x64 (this needs to be modified when enabled on sparc).
+ */
+ prefetch_smap_w((void *)vpm);
+
+ vpm->vpm_free_ndx = VPMAP2VMF_NDX(vpm);
+
+ vpmflp = VPMAP2VMF(vpm);
+ releq = vpmflp->vpm_releq;
+
+ vpmapf = releq->vpmq_free;
+ if (vpmapf == NULL) {
+ releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm;
+ } else {
+ vpm->vpm_next = vpmapf;
+ vpm->vpm_prev = vpmapf->vpm_prev;
+ vpmapf->vpm_prev = vpm;
+ vpm->vpm_prev->vpm_next = vpm;
+ releq->vpmq_free = vpm->vpm_next;
+ }
+
+ /*
+ * Indicate that the vpmap is on the releq at start
+ */
+ vpm->vpm_ndxflg = VPMRELEQ;
+ }
+}
+
+
+/*
+ * unhooks vpm from the freelist if it is still on the freelist.
+ */
+#define VPMAP_RMFREELIST(vpm) \
+ { \
+ if (vpm->vpm_next != NULL) { \
+ union vpm_freeq *freeq; \
+ struct vpmfree *vpmflp; \
+ vpmflp = &vpmd_free[vpm->vpm_free_ndx]; \
+ freeq = &vpmflp->vpm_freeq[vpm->vpm_ndxflg]; \
+ mutex_enter(&freeq->vpmq_mtx); \
+ if (freeq->vpmq_free != vpm) { \
+ vpm->vpm_prev->vpm_next = vpm->vpm_next; \
+ vpm->vpm_next->vpm_prev = vpm->vpm_prev; \
+ } else if (vpm == vpm->vpm_next) { \
+ freeq->vpmq_free = NULL; \
+ } else { \
+ freeq->vpmq_free = vpm->vpm_next; \
+ vpm->vpm_prev->vpm_next = vpm->vpm_next; \
+ vpm->vpm_next->vpm_prev = vpm->vpm_prev; \
+ } \
+ mutex_exit(&freeq->vpmq_mtx); \
+ vpm->vpm_next = vpm->vpm_prev = NULL; \
+ } \
+ }
+
+static int
+get_freelndx(int mode)
+{
+ int ndx;
+
+ ndx = vpmd_cpu[CPU->cpu_seqid].vfree_ndx & vpmd_freemsk;
+ switch (mode) {
+
+ case VPMCACHE_LRU:
+ default:
+ vpmd_cpu[CPU->cpu_seqid].vfree_ndx++;
+ break;
+ }
+ return (ndx);
+}
+
+
+/*
+ * Find one vpmap structure from the free lists and use it for the newpage.
+ * The previous page it cached is dissociated and released. The page_t's
+ * p_vpmref is cleared only when the vpm it points to is locked (or, on
+ * AMD64, when the page is exclusively locked in page_unload(), since the
+ * p_vpmref is treated as a mapping there).
+ *
+ * The page's p_vpmref is set when the page is locked (at least SHARED
+ * locked).
+ */
+static struct vpmap *
+get_free_vpmap(page_t *newpage)
+{
+ struct vpmfree *vpmflp;
+ kmutex_t *vmtx;
+ struct vpmap *vpm, *first;
+ union vpm_freeq *allocq, *releq;
+ page_t *pp = NULL;
+ int end_ndx, page_locked = 0;
+ int free_ndx;
+
+ /*
+ * get the freelist bin index.
+ */
+ free_ndx = get_freelndx(vpm_cachemode);
+
+ end_ndx = free_ndx;
+ vpmflp = &vpmd_free[free_ndx];
+
+retry_queue:
+ allocq = vpmflp->vpm_allocq;
+ mutex_enter(&allocq->vpmq_mtx);
+
+ if ((vpm = allocq->vpmq_free) == NULL) {
+
+skip_queue:
+ /*
+ * The alloc list is empty or this queue is being skipped;
+ * first see if the allocq toggled.
+ */
+ if (vpmflp->vpm_allocq != allocq) {
+ /* queue changed */
+ mutex_exit(&allocq->vpmq_mtx);
+ goto retry_queue;
+ }
+ releq = vpmflp->vpm_releq;
+ if (!mutex_tryenter(&releq->vpmq_mtx)) {
+ /* cannot get releq; a free vpmap may be there now */
+ mutex_exit(&allocq->vpmq_mtx);
+
+ /*
+ * This loop could spin forever if this thread has
+ * higher priority than the thread that is holding
+ * releq->vpmq_mtx. In order to force the other thread
+ * to run, we'll lock/unlock the mutex which is safe
+ * since we just unlocked the allocq mutex.
+ */
+ mutex_enter(&releq->vpmq_mtx);
+ mutex_exit(&releq->vpmq_mtx);
+ goto retry_queue;
+ }
+ if (releq->vpmq_free == NULL) {
+ VPM_DEBUG(vpmd_emptyfreelist);
+ /*
+ * This freelist is empty.
+ * This should not happen unless clients
+ * are failing to release the vpmap after
+ * accessing the data. Before resorting
+			 * to sleeping, try the next freelist.
+ */
+ free_ndx = (free_ndx + 1) & vpmd_freemsk;
+ if (free_ndx != end_ndx) {
+ mutex_exit(&releq->vpmq_mtx);
+ mutex_exit(&allocq->vpmq_mtx);
+ vpmflp = &vpmd_free[free_ndx];
+ goto retry_queue;
+ }
+ /*
+ * Tried all freelists.
+ * wait on this list and hope something gets freed.
+ */
+ vpmflp->vpm_want++;
+ mutex_exit(&vpmflp->vpm_freeq[1].vpmq_mtx);
+ cv_wait(&vpmflp->vpm_free_cv,
+ &vpmflp->vpm_freeq[0].vpmq_mtx);
+ vpmflp->vpm_want--;
+ mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx);
+ vpmflp = &vpmd_free[free_ndx];
+ VPM_DEBUG(vpmd_nofreevpms);
+ goto retry_queue;
+ } else {
+ /*
+ * Something on the rele queue; flip the alloc
+ * and rele queues and retry.
+ */
+ vpmflp->vpm_allocq = releq;
+ vpmflp->vpm_releq = allocq;
+ mutex_exit(&allocq->vpmq_mtx);
+ mutex_exit(&releq->vpmq_mtx);
+ if (page_locked) {
+ delay(hz >> 2);
+ page_locked = 0;
+ }
+ goto retry_queue;
+ }
+ } else {
+ int gotnewvpm;
+ kmutex_t *pmtx;
+ uint_t vpmref;
+
+ /*
+ * Fastpath the case we get the vpmap mutex
+ * on the first try.
+ */
+ first = vpm;
+next_vpmap:
+ vmtx = VPMAPMTX(vpm);
+ if (!mutex_tryenter(vmtx)) {
+ /*
+ * Another thread is trying to reclaim this slot.
+ * Skip to the next queue or vpmap.
+ */
+ if ((vpm = vpm->vpm_next) == first) {
+ goto skip_queue;
+ } else {
+ goto next_vpmap;
+ }
+ }
+
+ /*
+ * Assign this vpm to the newpage.
+ */
+ pmtx = PPMTX(newpage);
+ gotnewvpm = 0;
+ mutex_enter(pmtx);
+
+ /*
+ * Check if some other thread already assigned a vpm to
+ * this page.
+ */
+ if ((vpmref = newpage->p_vpmref) == 0) {
+ newpage->p_vpmref = VPMID(vpm);
+ gotnewvpm = 1;
+ } else {
+ VPM_DEBUG(vpmd_contend);
+ mutex_exit(vmtx);
+ }
+ mutex_exit(pmtx);
+
+ if (gotnewvpm) {
+
+ /*
+ * At this point, we've selected the vpm. Remove vpm
+ * from its freelist. If vpm is the first one in
+ * the freelist, update the head of the freelist.
+ */
+ if (first == vpm) {
+ ASSERT(first == allocq->vpmq_free);
+ allocq->vpmq_free = vpm->vpm_next;
+ }
+
+ /*
+ * If the head of the freelist still points to vpm,
+ * then there are no more free vpmaps in that list.
+ */
+ if (allocq->vpmq_free == vpm)
+ /*
+ * Took the last one
+ */
+ allocq->vpmq_free = NULL;
+ else {
+ vpm->vpm_prev->vpm_next = vpm->vpm_next;
+ vpm->vpm_next->vpm_prev = vpm->vpm_prev;
+ }
+ mutex_exit(&allocq->vpmq_mtx);
+ vpm->vpm_prev = vpm->vpm_next = NULL;
+
+ /*
+ * Disassociate the previous page. On x64 systems
+ * p_vpmref is used as a mapping reference to the page.
+ */
+ if ((pp = vpm->vpm_pp) != NULL &&
+ vpm->vpm_vp == pp->p_vnode &&
+ vpm->vpm_off == pp->p_offset) {
+
+ pmtx = PPMTX(pp);
+ if (page_trylock(pp, SE_SHARED)) {
+ /*
+ * Now verify that it is the correct
+				 * page. If not, someone else stole it,
+ * so just unlock it and leave.
+ */
+ mutex_enter(pmtx);
+ if (PP_ISFREE(pp) ||
+ vpm->vpm_vp != pp->p_vnode ||
+ vpm->vpm_off != pp->p_offset ||
+ pp->p_vpmref != VPMID(vpm)) {
+ mutex_exit(pmtx);
+
+ page_unlock(pp);
+ } else {
+ /*
+ * Release the page.
+ */
+ pp->p_vpmref = 0;
+ mutex_exit(pmtx);
+ hat_kpm_mapout(pp, 0,
+ hat_kpm_page2va(pp, 1));
+ (void) page_release(pp, 1);
+ }
+ } else {
+ /*
+ * If the page cannot be locked, just
+ * clear the p_vpmref and go.
+ */
+ mutex_enter(pmtx);
+ if (pp->p_vpmref == VPMID(vpm)) {
+ pp->p_vpmref = 0;
+ }
+ mutex_exit(pmtx);
+ VPM_DEBUG(vpmd_prevpagelocked);
+ }
+ }
+
+ /*
+ * Setup vpm to point to the new page.
+ */
+ vpm->vpm_pp = newpage;
+ vpm->vpm_vp = newpage->p_vnode;
+ vpm->vpm_off = newpage->p_offset;
+
+ } else {
+ int steal = !VPM_MTBF(steals, steals_mtbf);
+ /*
+			 * Page already has a vpm assigned; just use that.
+ * Grab the vpm mutex and verify that it is still
+ * the correct one. The pp->p_vpmref should not change
+ * once we have the vpm mutex and the page lock.
+ */
+ mutex_exit(&allocq->vpmq_mtx);
+ vpm = VPMP(vpmref);
+ vmtx = VPMAPMTX(vpm);
+ mutex_enter(vmtx);
+ if ((steal && vpm->vpm_refcnt == 0) ||
+ vpm->vpm_pp != newpage) {
+ /*
+				 * The vpm got stolen; clear the p_vpmref
+				 * and retry.
+ */
+ pmtx = PPMTX(newpage);
+ mutex_enter(pmtx);
+ if (newpage->p_vpmref == vpmref) {
+ newpage->p_vpmref = 0;
+ }
+ mutex_exit(pmtx);
+
+ mutex_exit(vmtx);
+ VPM_DEBUG(vpmd_steals);
+ goto retry_queue;
+ } else if (vpm->vpm_refcnt == 0) {
+ /*
+ * Remove it from the free list if it
+ * exists there.
+ */
+ VPMAP_RMFREELIST(vpm);
+ }
+ }
+ return (vpm);
+ }
+}
+
+static void
+free_vpmap(struct vpmap *vpm)
+{
+ struct vpmfree *vpmflp;
+ struct vpmap *vpmfreelist;
+ union vpm_freeq *releq;
+
+ ASSERT(MUTEX_HELD(VPMAPMTX(vpm)));
+
+ if (vpm->vpm_refcnt != 0) {
+ panic("free_vpmap");
+ /*NOTREACHED*/
+ }
+
+ vpmflp = &vpmd_free[vpm->vpm_free_ndx];
+ /*
+	 * Add to the tail of the release queue.
+ * Note that vpm_releq and vpm_allocq could toggle
+ * before we get the lock. This does not affect
+ * correctness as the 2 queues are only maintained
+ * to reduce lock pressure.
+ */
+ releq = vpmflp->vpm_releq;
+ if (releq == &vpmflp->vpm_freeq[0]) {
+ vpm->vpm_ndxflg = 0;
+ } else {
+ vpm->vpm_ndxflg = 1;
+ }
+ mutex_enter(&releq->vpmq_mtx);
+ vpmfreelist = releq->vpmq_free;
+ if (vpmfreelist == 0) {
+ int want;
+
+ releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm;
+ /*
+ * Both queue mutexes are held to set vpm_want;
+ * snapshot the value before dropping releq mutex.
+ * If vpm_want appears after the releq mutex is dropped,
+ * then the vpmap just freed is already gone.
+ */
+ want = vpmflp->vpm_want;
+ mutex_exit(&releq->vpmq_mtx);
+ /*
+		 * See if there was a waiter before dropping the releq mutex,
+		 * then recheck after obtaining the vpm_freeq[0] mutex, as
+		 * another thread may have already signaled.
+ */
+ if (want) {
+ mutex_enter(&vpmflp->vpm_freeq[0].vpmq_mtx);
+ if (vpmflp->vpm_want)
+ cv_signal(&vpmflp->vpm_free_cv);
+ mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx);
+ }
+ } else {
+ vpm->vpm_next = vpmfreelist;
+ vpm->vpm_prev = vpmfreelist->vpm_prev;
+ vpmfreelist->vpm_prev = vpm;
+ vpm->vpm_prev->vpm_next = vpm;
+ mutex_exit(&releq->vpmq_mtx);
+ }
+}
+
+/*
+ * Get the vpmap for the page.
+ * The refcnt of this vpm is incremented.
+ */
+static struct vpmap *
+get_vpmap(page_t *pp)
+{
+ struct vpmap *vpm = NULL;
+ kmutex_t *vmtx;
+ kmutex_t *pmtx;
+ unsigned int refid;
+
+ ASSERT((pp != NULL) && PAGE_LOCKED(pp));
+
+ if (VPM_MTBF(contend, contend_mtbf) && (refid = pp->p_vpmref) != 0) {
+ vpm = VPMP(refid);
+ vmtx = VPMAPMTX(vpm);
+ mutex_enter(vmtx);
+ /*
+ * Since we have the page lock and the vpm mutex, the
+ * pp->p_vpmref cannot change.
+ */
+ if (vpm->vpm_pp != pp) {
+ pmtx = PPMTX(pp);
+
+ /*
+ * Clear the p_vpmref as it is incorrect.
+ * This can happen if the page was stolen.
+ * On x64 this should not happen as p_vpmref
+ * is treated as a mapping on the page. So
+ * if the page is stolen, the mapping would have
+ * been cleared in page_unload().
+ */
+ mutex_enter(pmtx);
+ if (pp->p_vpmref == refid)
+ pp->p_vpmref = 0;
+ mutex_exit(pmtx);
+
+ mutex_exit(vmtx);
+ vpm = NULL;
+ } else if (vpm->vpm_refcnt == 0) {
+ /*
+ * Got the vpm, remove it from the free
+ * list if it exists there.
+ */
+ VPMAP_RMFREELIST(vpm);
+ }
+ }
+ if (vpm == NULL) {
+ /*
+ * get_free_vpmap() returns with the vpmap mutex held.
+ */
+ vpm = get_free_vpmap(pp);
+ vmtx = VPMAPMTX(vpm);
+ vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_misses++;
+ } else {
+ vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_hits++;
+ }
+
+ vpm->vpm_refcnt++;
+ mutex_exit(vmtx);
+
+ return (vpm);
+}
+
+/* END --- vpm cache ---- */
+
+/*
+ * The vnode page mapping (vpm) interface routines.
+ */
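+
+/*
+ * A minimal usage sketch (hypothetical caller such as a file system
+ * read/write path; vp, off, len, uio and rw stand in for the caller's
+ * own state, and the range is assumed to fit within a single page):
+ *
+ *	vmap_t vml[MINVMAPS];
+ *	int newpage = 0, err;
+ *
+ *	err = vpm_map_pages(vp, off, len, 1, vml, MINVMAPS, &newpage, rw);
+ *	if (err == 0) {
+ *		err = uiomove(vml[0].vs_addr + (off & PAGEOFFSET), len,
+ *		    (rw == S_WRITE) ? UIO_WRITE : UIO_READ, uio);
+ *		vpm_unmap_pages(vml, rw);
+ *	}
+ *
+ * vpm_data_copy() below packages essentially this map/uiomove/unmap
+ * sequence (plus the zero-fill handling for newly created pages).
+ */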
+
+/*
+ * Find or create the pages starting from baseoff for the specified
+ * length 'len'.
+ */
+static int
+vpm_pagecreate(
+ struct vnode *vp,
+ u_offset_t baseoff,
+ size_t len,
+ vmap_t vml[],
+ int nseg,
+ int *newpage)
+{
+
+ page_t *pp = NULL;
+ caddr_t base;
+ u_offset_t off = baseoff;
+ int i;
+ ASSERT(nseg >= MINVMAPS && nseg < MAXVMAPS);
+
+ for (i = 0; len > 0; len -= MIN(len, PAGESIZE), i++) {
+ struct vpmap *vpm;
+
+
+ if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {
+
+ base = segkpm_create_va(off);
+
+ /*
+			 * The seg pointer passed in is just advisory. Just
+ * pass segkmap for now like segmap does with
+ * segmap_kpm enabled.
+ */
+ if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT,
+ segkmap, base)) == NULL) {
+				panic("vpm_pagecreate: "
+				    "page_create failed");
+ /*NOTREACHED*/
+ }
+ if (newpage != NULL)
+ *newpage = 1;
+
+ page_io_unlock(pp);
+ }
+
+ /*
+ * Get the vpm for this page_t.
+ */
+ if (vpm_cache_enable) {
+ vpm = get_vpmap(pp);
+ vml[i].vs_data = (void *)&vpm->vpm_pp;
+ } else {
+ vml[i].vs_data = (void *)pp;
+ pp->p_vpmref = 0;
+ }
+
+ vml[i].vs_addr = hat_kpm_mapin(pp, 0);
+ vml[i].vs_len = PAGESIZE;
+
+ off += PAGESIZE;
+ }
+ vml[i].vs_data = NULL;
+ vml[i].vs_addr = (caddr_t)NULL;
+ return (0);
+}
+
+
+/*
+ * Returns vpm mappings of pages in the range [off, off+len], where
+ * len is rounded up to the PAGESIZE boundary. The list of pages and
+ * the page addresses are returned in the SGL vml (vmap_t) array passed in.
+ * The nseg is the number of vmap_t entries in the array.
+ *
+ * The maximum len currently allowed is MAXBSIZE; therefore, it will
+ * either fetch/create one or two pages depending on the PAGESIZE.
+ *
+ * The segmap's SM_LOCKPROTO usage is not supported by these interfaces.
+ * For such cases, use the seg_map interfaces.
+ */
+int
+vpm_map_pages(
+ struct vnode *vp,
+ u_offset_t off,
+ size_t len,
+ int fetchpage,
+ vmap_t *vml,
+ int nseg,
+ int *newpage,
+ enum seg_rw rw)
+{
+ extern struct vnode *common_specvp();
+ u_offset_t baseoff;
+ uint_t prot;
+ caddr_t base;
+ page_t *pp, *pplist[MAXVMAPS];
+ struct vpmap *vpm;
+ int i, error = 0;
+
+ ASSERT(nseg >= MINVMAPS && nseg < MAXVMAPS);
+ baseoff = off & (offset_t)PAGEMASK;
+ vml[0].vs_data = NULL;
+ vml[0].vs_addr = (caddr_t)NULL;
+ /*
+	 * For now, let's restrict it to MAXBSIZE. XXX - We can allow
+	 * len longer than MAXBSIZE, but there should be a limit
+ * which should be determined by how many pages the VOP_GETPAGE()
+ * can fetch.
+ */
+ if (off + len > baseoff + MAXBSIZE) {
+ panic("vpm_map_pages bad len");
+ /*NOTREACHED*/
+ }
+
+ /*
+ * If this is a block device we have to be sure to use the
+ * "common" block device vnode for the mapping.
+ */
+ if (vp->v_type == VBLK)
+ vp = common_specvp(vp);
+
+
+ if (!fetchpage)
+ return (vpm_pagecreate(vp, baseoff, len, vml, nseg, newpage));
+
+ for (i = 0; len > 0; len -= MIN(len, PAGESIZE), i++,
+ pplist[i] = NULL) {
+
+ pp = page_lookup(vp, baseoff, SE_SHARED);
+
+ /*
+ * If we did not find the page or if this page was not
+ * in our cache, then let VOP_GETPAGE get all the pages.
+		 * We need to call VOP_GETPAGE so that filesystems can do some
+ * (un)necessary tracking for sequential access.
+ */
+
+ if (pp == NULL || (vpm_cache_enable && pp->p_vpmref == 0) ||
+ (rw == S_WRITE && hat_page_getattr(pp, P_MOD | P_REF)
+ != (P_MOD | P_REF))) {
+ if (pp != NULL) {
+ page_unlock(pp);
+ }
+
+ /*
+ * Pass a dummy address as it will be required
+ * by page_create_va(). We pass segkmap as the seg
+			 * as some file systems (UFS) check it.
+ */
+ base = segkpm_create_va(baseoff);
+
+ error = VOP_GETPAGE(vp, baseoff, len, &prot, &pplist[i],
+ roundup(len, PAGESIZE), segkmap, base, rw, CRED());
+ if (error) {
+ VPM_DEBUG(vpmd_getpagefailed);
+ pplist[i] = NULL;
+ }
+ break;
+ } else {
+ pplist[i] = pp;
+ baseoff += PAGESIZE;
+ }
+ }
+
+ if (error) {
+ for (i = 0; pplist[i] != NULL; i++) {
+ page_unlock(pplist[i]);
+ pplist[i] = NULL;
+ }
+ vml[0].vs_addr = NULL;
+ vml[0].vs_data = NULL;
+ return (FC_MAKE_ERR(error));
+ }
+
+ /*
+ * Get the vpm's for pages.
+ */
+ for (i = 0; pplist[i] != NULL; i++) {
+ if (vpm_cache_enable) {
+ vpm = get_vpmap(pplist[i]);
+ vml[i].vs_data = (void *)&(vpm->vpm_pp);
+ } else {
+ vml[i].vs_data = (void *)pplist[i];
+ pplist[i]->p_vpmref = 0;
+ }
+
+ vml[i].vs_addr = hat_kpm_mapin(pplist[i], 0);
+ vml[i].vs_len = PAGESIZE;
+ }
+
+ vml[i].vs_data = NULL;
+ vml[i].vs_addr = (caddr_t)NULL;
+
+ return (0);
+}
+
+/*
+ * Release the vpm mappings on the pages and unlock them.
+ */
+void
+vpm_unmap_pages(vmap_t vml[], enum seg_rw rw)
+{
+ int i;
+ struct vpmap *vpm;
+ kmutex_t *mtx;
+ page_t *pp;
+
+ for (i = 0; vml[i].vs_data != NULL; i++) {
+ ASSERT(IS_KPM_ADDR(vml[i].vs_addr));
+
+ if (vpm_cache_enable) {
+ pp = *(((page_t **)vml[i].vs_data));
+ } else {
+ pp = (page_t *)vml[i].vs_data;
+ }
+
+ /*
+		 * Mark the page as modified or referenced, because vpm pages
+		 * do not take the faults that would normally set these bits.
+ */
+ if (rw == S_WRITE) {
+ hat_setrefmod(pp);
+ } else {
+ ASSERT(rw == S_READ);
+ hat_setref(pp);
+ }
+
+ if (vpm_cache_enable) {
+ page_unlock(pp);
+ vpm = (struct vpmap *)((char *)vml[i].vs_data
+ - offsetof(struct vpmap, vpm_pp));
+ mtx = VPMAPMTX(vpm);
+ mutex_enter(mtx);
+
+ if (--vpm->vpm_refcnt == 0) {
+ free_vpmap(vpm);
+ }
+ mutex_exit(mtx);
+ } else {
+ hat_kpm_mapout(pp, 0, vml[i].vs_addr);
+ (void) page_release(pp, 1);
+ }
+ vml[i].vs_data = NULL;
+ vml[i].vs_addr = NULL;
+ }
+}
+
+/*
+ * Given the vp, off and the uio structure, this routine will do
+ * the copy (uiomove). If the last page created is partially written,
+ * the rest of the page is zeroed out. It also zeros the beginning of
+ * the first page up to the start offset if requested (zerostart).
+ * If pages are to be fetched, it will call the filesystem's getpage
+ * function (VOP_GETPAGE) to get them, otherwise they will be created if
+ * not already present in the page cache.
+ */
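+/*
+ * A worked example with illustrative numbers (assuming PAGESIZE == 4096):
+ * a write with off == 100 and len == 300 onto a newly created page copies
+ * into bytes [100, 400) of the page; with zerostart set, bytes [0, 100)
+ * are zeroed first, and after the uiomove the untouched remainder of the
+ * page, bytes [400, 4096), is zeroed as well.
+ */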
+int
+vpm_data_copy(struct vnode *vp,
+ u_offset_t off,
+ size_t len,
+ struct uio *uio,
+ int fetchpage,
+ int *newpage,
+ int zerostart,
+ enum seg_rw rw)
+{
+ int error;
+ struct vmap vml[MINVMAPS];
+ enum uio_rw uiorw;
+ int npages = 0;
+
+ uiorw = (rw == S_WRITE) ? UIO_WRITE : UIO_READ;
+ /*
+ * 'off' will be the offset where the I/O starts.
+ * We get the pages starting at the (off & PAGEMASK)
+ * page boundary.
+ */
+ error = vpm_map_pages(vp, off, (uint_t)len,
+ fetchpage, vml, MINVMAPS, &npages, rw);
+
+ if (newpage != NULL)
+ *newpage = npages;
+ if (!error) {
+ int i, pn, slen = len;
+ int pon = off & PAGEOFFSET;
+
+ /*
+ * Clear from the beginning of the page to start offset
+ * if requested.
+ */
+ if (!fetchpage && zerostart) {
+ (void) kzero(vml[0].vs_addr, (uint_t)pon);
+ VPM_DEBUG(vpmd_zerostart);
+ }
+
+ for (i = 0; !error && slen > 0 &&
+ vml[i].vs_addr != NULL; i++) {
+ pn = (int)MIN(slen, (PAGESIZE - pon));
+ error = uiomove(vml[i].vs_addr + pon,
+ (long)pn, uiorw, uio);
+ slen -= pn;
+ pon = 0;
+ }
+
+ /*
+ * When new pages are created, zero out part of the
+ * page we did not copy to.
+ */
+ if (!fetchpage && npages &&
+ uio->uio_loffset < roundup(off + len, PAGESIZE)) {
+ int nzero;
+
+ pon = (uio->uio_loffset & PAGEOFFSET);
+ nzero = PAGESIZE - pon;
+ i = (uio->uio_loffset - (off & PAGEMASK)) / PAGESIZE;
+ (void) kzero(vml[i].vs_addr + pon, (uint_t)nzero);
+ }
+ vpm_unmap_pages(vml, rw);
+ }
+ return (error);
+}
+
+/*
+ * Called to flush pages of the given vnode covering the
+ * [off, off+len] range.
+ */
+int
+vpm_sync_pages(struct vnode *vp,
+ u_offset_t off,
+ size_t len,
+ uint_t flags)
+{
+ extern struct vnode *common_specvp();
+ int bflags = 0;
+ int error = 0;
+ size_t psize = roundup(len, PAGESIZE);
+
+ /*
+ * If this is a block device we have to be sure to use the
+ * "common" block device vnode for the mapping.
+ */
+ if (vp->v_type == VBLK)
+ vp = common_specvp(vp);
+
+ if ((flags & ~SM_DONTNEED) != 0) {
+ if (flags & SM_ASYNC)
+ bflags |= B_ASYNC;
+ if (flags & SM_INVAL)
+ bflags |= B_INVAL;
+ if (flags & SM_DESTROY)
+ bflags |= (B_INVAL|B_TRUNC);
+ if (flags & SM_FREE)
+ bflags |= B_FREE;
+ if (flags & SM_DONTNEED)
+ bflags |= B_DONTNEED;
+
+ error = VOP_PUTPAGE(vp, off, psize, bflags, CRED());
+ }
+
+ return (error);
+}
+
+
+#else /* SEGKPM_SUPPORT */
+
+/* vpm stubs */
+void
+vpm_init()
+{
+}
+
+/*ARGSUSED*/
+int
+vpm_pagecreate(
+ struct vnode *vp,
+ u_offset_t baseoff,
+ size_t len,
+ vmap_t vml[],
+ int nseg,
+ int *newpage)
+{
+ return (0);
+}
+
+/*ARGSUSED*/
+int
+vpm_map_pages(
+ struct vnode *vp,
+ u_offset_t off,
+ size_t len,
+ int fetchpage,
+ vmap_t vml[],
+ int nseg,
+ int *newpage,
+ enum seg_rw rw)
+{
+ return (0);
+}
+
+/*ARGSUSED*/
+int
+vpm_data_copy(struct vnode *vp,
+ u_offset_t off,
+ size_t len,
+ struct uio *uio,
+ int fetchpage,
+ int *newpage,
+ int zerostart,
+ enum seg_rw rw)
+{
+ return (0);
+}
+
+/*ARGSUSED*/
+void
+vpm_unmap_pages(vmap_t vml[], enum seg_rw rw)
+{
+}
+
+/*ARGSUSED*/
+int
+vpm_sync_pages(struct vnode *vp,
+ u_offset_t off,
+ size_t len,
+ uint_t flags)
+{
+ return (0);
+}
+#endif /* SEGKPM_SUPPORT */