author | praks <none@none> | 2006-04-19 18:07:13 -0700
committer | praks <none@none> | 2006-04-19 18:07:13 -0700
commit | a5652762e5f7bf683d19f18542e5e39df63bad79 (patch)
tree | 113821225c18b190514811f3e27a638333bc2dcd /usr/src/uts/common/vm/vpm.c
parent | 4ab777b1b0f310e59b52a57c79efa0571506942a (diff)
download | illumos-joyent-a5652762e5f7bf683d19f18542e5e39df63bad79.tar.gz
6256083 Need a lightweight file page mapping mechanism to substitute segmap
6387639 segkpm segment set to incorrect size for amd64
Diffstat (limited to 'usr/src/uts/common/vm/vpm.c')
-rw-r--r-- | usr/src/uts/common/vm/vpm.c | 1141
1 file changed, 1141 insertions, 0 deletions
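The new file below adds the vnode page mapping (vpm) interfaces, which let file systems copy file data through temporary kernel mappings without going through segmap. As a hedged orientation sketch (not part of this commit): a read path would call vpm_data_copy() when the platform sets vpm_enable, and otherwise fall back to the existing seg_map interfaces. The helper name my_fs_read_chunk() and the header list are invented for illustration.

```c
/*
 * Illustrative only -- not part of this commit.  A file system read
 * helper that copies at most one MAXBSIZE chunk of file data into the
 * caller's uio.  With vpm_enable set, vpm_data_copy() maps the pages,
 * performs the uiomove() and releases the mappings in a single call.
 */
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <vm/seg_map.h>
#include <vm/vpm.h>

static int
my_fs_read_chunk(struct vnode *vp, struct uio *uio)
{
	u_offset_t off = uio->uio_loffset;
	size_t n = MIN(MAXBSIZE - (off & MAXBOFFSET), uio->uio_resid);
	int error;

	if (vpm_enable) {
		/* fetchpage != 0: let VOP_GETPAGE() bring the pages in */
		error = vpm_data_copy(vp, off, n, uio, 1, NULL, 0, S_READ);
	} else {
		caddr_t base = segmap_getmapflt(segkmap, vp, off, n,
		    1, S_READ);

		error = uiomove(base + (off & MAXBOFFSET), n, UIO_READ, uio);
		(void) segmap_release(segkmap, base, 0);
	}
	return (error);
}
```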
diff --git a/usr/src/uts/common/vm/vpm.c b/usr/src/uts/common/vm/vpm.c new file mode 100644 index 0000000000..1f4f2fdf58 --- /dev/null +++ b/usr/src/uts/common/vm/vpm.c @@ -0,0 +1,1141 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * VM - generic vnode page mapping interfaces. + * + * Mechanism to provide temporary mappings to vnode pages. + * The typical use would be to copy/access file data. + */ + +#include <sys/types.h> +#include <sys/t_lock.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/buf.h> +#include <sys/systm.h> +#include <sys/vnode.h> +#include <sys/mman.h> +#include <sys/errno.h> +#include <sys/cred.h> +#include <sys/kmem.h> +#include <sys/vtrace.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/thread.h> +#include <sys/dumphdr.h> +#include <sys/bitmap.h> +#include <sys/lgrp.h> + +#include <vm/seg_kmem.h> +#include <vm/hat.h> +#include <vm/as.h> +#include <vm/seg.h> +#include <vm/seg_kpm.h> +#include <vm/seg_map.h> +#include <vm/page.h> +#include <vm/pvn.h> +#include <vm/rm.h> +#include <vm/vpm.h> + +/* + * Needs to be enabled by each platform. + */ +int vpm_enable = 0; + +#ifdef SEGKPM_SUPPORT + + +int vpm_cache_enable = 1; +long vpm_cache_percent = 12; +long vpm_cache_size; +int vpm_nfreelist = 0; +int vpmd_freemsk = 0; + +#define VPM_S_PAD 64 +union vpm_cpu { + struct { + int vcpu_free_ndx; + ulong_t vcpu_hits; + ulong_t vcpu_misses; + } vcpu; + char vpm_pad[VPM_S_PAD]; +}; +static union vpm_cpu *vpmd_cpu; + +#define vfree_ndx vcpu.vcpu_free_ndx + +int vpm_cachemode = VPMCACHE_LRU; + +#define PPMTX(pp) (&(pp)->p_ilock) + +static struct vpmap *vpmd_vpmap; /* list of vpmap structs preallocated */ +static struct vpmfree *vpmd_free; +#define VPMAPMTX(vpm) (&vpm->vpm_mtx) +#define VPMAP2VMF(vpm) (&vpmd_free[(vpm - vpmd_vpmap) & vpmd_freemsk]) +#define VPMAP2VMF_NDX(vpm) (ushort_t)((vpm - vpmd_vpmap) & vpmd_freemsk) +#define VPMP(id) (&vpmd_vpmap[id - 1]) +#define VPMID(vpm) (uint_t)((vpm - vpmd_vpmap) + 1) + + +#ifdef DEBUG + +struct vpm_debug { + int vpmd_steals; + int vpmd_contend; + int vpmd_prevpagelocked; + int vpmd_getpagefailed; + int vpmd_zerostart; + int vpmd_emptyfreelist; + int vpmd_nofreevpms; +} vpm_debug; + +#define VPM_DEBUG(x) ((vpm_debug.x)++) + +int steals; +int steals_mtbf = 7; +int contend; +int contend_mtbf = 127; + +#define VPM_MTBF(v, f) (((++(v)) & (f)) != (f)) + +#else /* DEBUG */ + +#define VPM_MTBF(v, f) (1) +#define VPM_DEBUG(x) /* nothing */ + +#endif + +/* + * The vpm cache. 
+ * + * The main purpose of having a cache here is to speed up page_lookup() + * operations and also provide an LRU(default) behaviour of file pages. The + * page_lookup() operation tends to be expensive if a page has to be + * reclaimed from the system page cache("cachelist"). Once we speed up the + * page_lookup()->page_reclaim() path then there there should be no need for + * this cache. The system page cache(cachelist) should effectively serve the + * purpose of caching file pages. + * + * This cache is very similar to segmap's smap cache. Each page in the + * cache is tracked by the structure vpmap_t. But unlike segmap, there is no + * hash table. The page_t has a reference to the vpmap_t when cached. For a + * given vnode, offset the page is found by means of a page_lookup() operation. + * Any page which has a mapping(i.e when cached) will not be in the + * system 'cachelist'. Hence the page_lookup() will not have to do a + * page_reclaim(). That is how the cache serves to speed up page_lookup() + * operations. + * + * This cache can be disabled by setting vpm_cache_enable = 0 in /etc/system. + */ + +void +vpm_init() +{ + long npages; + struct vpmap *vpm; + struct vpmfree *vpmflp; + int i, ndx; + extern void prefetch_smap_w(void *); + + if (!vpm_cache_enable) { + return; + } + + /* + * Set the size of the cache. + */ + vpm_cache_size = mmu_ptob((physmem * vpm_cache_percent)/100); + if (vpm_cache_size < VPMAP_MINCACHE) { + vpm_cache_size = VPMAP_MINCACHE; + } + + /* + * Number of freelists. + */ + if (vpm_nfreelist == 0) { + vpm_nfreelist = max_ncpus; + } else if (vpm_nfreelist < 0 || vpm_nfreelist > 2 * max_ncpus) { + cmn_err(CE_WARN, "vpmap create : number of freelist " + "vpm_nfreelist %d using %d", vpm_nfreelist, max_ncpus); + vpm_nfreelist = 2 * max_ncpus; + } + + /* + * Round it up to the next power of 2 + */ + if (vpm_nfreelist & (vpm_nfreelist - 1)) { + vpm_nfreelist = 1 << (highbit(vpm_nfreelist)); + } + vpmd_freemsk = vpm_nfreelist - 1; + + /* + * Use a per cpu rotor index to spread the allocations evenly + * across the available vpm freelists. + */ + vpmd_cpu = kmem_zalloc(sizeof (union vpm_cpu) * max_ncpus, KM_SLEEP); + ndx = 0; + for (i = 0; i < max_ncpus; i++) { + + vpmd_cpu[i].vfree_ndx = ndx; + ndx = (ndx + 1) & vpmd_freemsk; + } + + /* + * Allocate and initialize the freelist. + */ + vpmd_free = kmem_zalloc(vpm_nfreelist * sizeof (struct vpmfree), + KM_SLEEP); + for (i = 0; i < vpm_nfreelist; i++) { + + vpmflp = &vpmd_free[i]; + /* + * Set up initial queue pointers. They will get flipped + * back and forth. + */ + vpmflp->vpm_allocq = &vpmflp->vpm_freeq[VPMALLOCQ]; + vpmflp->vpm_releq = &vpmflp->vpm_freeq[VPMRELEQ]; + } + + npages = mmu_btop(vpm_cache_size); + + + /* + * Allocate and initialize the vpmap structs. + */ + vpmd_vpmap = kmem_zalloc(sizeof (struct vpmap) * npages, KM_SLEEP); + for (vpm = vpmd_vpmap; vpm <= &vpmd_vpmap[npages - 1]; vpm++) { + struct vpmfree *vpmflp; + union vpm_freeq *releq; + struct vpmap *vpmapf; + + /* + * Use prefetch as we have to walk thru a large number of + * these data structures. We just use the smap's prefetch + * routine as it does the same. This should work fine + * for x64(this needs to be modifed when enabled on sparc). 
+ */ + prefetch_smap_w((void *)vpm); + + vpm->vpm_free_ndx = VPMAP2VMF_NDX(vpm); + + vpmflp = VPMAP2VMF(vpm); + releq = vpmflp->vpm_releq; + + vpmapf = releq->vpmq_free; + if (vpmapf == NULL) { + releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm; + } else { + vpm->vpm_next = vpmapf; + vpm->vpm_prev = vpmapf->vpm_prev; + vpmapf->vpm_prev = vpm; + vpm->vpm_prev->vpm_next = vpm; + releq->vpmq_free = vpm->vpm_next; + } + + /* + * Indicate that the vpmap is on the releq at start + */ + vpm->vpm_ndxflg = VPMRELEQ; + } +} + + +/* + * unhooks vpm from the freelist if it is still on the freelist. + */ +#define VPMAP_RMFREELIST(vpm) \ + { \ + if (vpm->vpm_next != NULL) { \ + union vpm_freeq *freeq; \ + struct vpmfree *vpmflp; \ + vpmflp = &vpmd_free[vpm->vpm_free_ndx]; \ + freeq = &vpmflp->vpm_freeq[vpm->vpm_ndxflg]; \ + mutex_enter(&freeq->vpmq_mtx); \ + if (freeq->vpmq_free != vpm) { \ + vpm->vpm_prev->vpm_next = vpm->vpm_next; \ + vpm->vpm_next->vpm_prev = vpm->vpm_prev; \ + } else if (vpm == vpm->vpm_next) { \ + freeq->vpmq_free = NULL; \ + } else { \ + freeq->vpmq_free = vpm->vpm_next; \ + vpm->vpm_prev->vpm_next = vpm->vpm_next; \ + vpm->vpm_next->vpm_prev = vpm->vpm_prev; \ + } \ + mutex_exit(&freeq->vpmq_mtx); \ + vpm->vpm_next = vpm->vpm_prev = NULL; \ + } \ + } + +static int +get_freelndx(int mode) +{ + int ndx; + + ndx = vpmd_cpu[CPU->cpu_seqid].vfree_ndx & vpmd_freemsk; + switch (mode) { + + case VPMCACHE_LRU: + default: + vpmd_cpu[CPU->cpu_seqid].vfree_ndx++; + break; + } + return (ndx); +} + + +/* + * Find one vpmap structure from the free lists and use it for the newpage. + * The previous page it cached is dissociated and released. The page_t's + * p_vpmref is cleared only when the vpm it is pointing to is locked(or + * for AMD64 when the page is exclusively locked in page_unload. That is + * because the p_vpmref is treated as mapping). + * + * The page's p_vpmref is set when the page is + * locked(at least SHARED locked). + */ +static struct vpmap * +get_free_vpmap(page_t *newpage) +{ + struct vpmfree *vpmflp; + kmutex_t *vmtx; + struct vpmap *vpm, *first; + union vpm_freeq *allocq, *releq; + page_t *pp = NULL; + int end_ndx, page_locked = 0; + int free_ndx; + + /* + * get the freelist bin index. + */ + free_ndx = get_freelndx(vpm_cachemode); + + end_ndx = free_ndx; + vpmflp = &vpmd_free[free_ndx]; + +retry_queue: + allocq = vpmflp->vpm_allocq; + mutex_enter(&allocq->vpmq_mtx); + + if ((vpm = allocq->vpmq_free) == NULL) { + +skip_queue: + /* + * The alloc list is empty or this queue is being skipped; + * first see if the allocq toggled. + */ + if (vpmflp->vpm_allocq != allocq) { + /* queue changed */ + mutex_exit(&allocq->vpmq_mtx); + goto retry_queue; + } + releq = vpmflp->vpm_releq; + if (!mutex_tryenter(&releq->vpmq_mtx)) { + /* cannot get releq; a free vpmap may be there now */ + mutex_exit(&allocq->vpmq_mtx); + + /* + * This loop could spin forever if this thread has + * higher priority than the thread that is holding + * releq->vpmq_mtx. In order to force the other thread + * to run, we'll lock/unlock the mutex which is safe + * since we just unlocked the allocq mutex. + */ + mutex_enter(&releq->vpmq_mtx); + mutex_exit(&releq->vpmq_mtx); + goto retry_queue; + } + if (releq->vpmq_free == NULL) { + VPM_DEBUG(vpmd_emptyfreelist); + /* + * This freelist is empty. + * This should not happen unless clients + * are failing to release the vpmap after + * accessing the data. Before resorting + * to sleeping, try the next list of the same color. 
+ */ + free_ndx = (free_ndx + 1) & vpmd_freemsk; + if (free_ndx != end_ndx) { + mutex_exit(&releq->vpmq_mtx); + mutex_exit(&allocq->vpmq_mtx); + vpmflp = &vpmd_free[free_ndx]; + goto retry_queue; + } + /* + * Tried all freelists. + * wait on this list and hope something gets freed. + */ + vpmflp->vpm_want++; + mutex_exit(&vpmflp->vpm_freeq[1].vpmq_mtx); + cv_wait(&vpmflp->vpm_free_cv, + &vpmflp->vpm_freeq[0].vpmq_mtx); + vpmflp->vpm_want--; + mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx); + vpmflp = &vpmd_free[free_ndx]; + VPM_DEBUG(vpmd_nofreevpms); + goto retry_queue; + } else { + /* + * Something on the rele queue; flip the alloc + * and rele queues and retry. + */ + vpmflp->vpm_allocq = releq; + vpmflp->vpm_releq = allocq; + mutex_exit(&allocq->vpmq_mtx); + mutex_exit(&releq->vpmq_mtx); + if (page_locked) { + delay(hz >> 2); + page_locked = 0; + } + goto retry_queue; + } + } else { + int gotnewvpm; + kmutex_t *pmtx; + uint_t vpmref; + + /* + * Fastpath the case we get the vpmap mutex + * on the first try. + */ + first = vpm; +next_vpmap: + vmtx = VPMAPMTX(vpm); + if (!mutex_tryenter(vmtx)) { + /* + * Another thread is trying to reclaim this slot. + * Skip to the next queue or vpmap. + */ + if ((vpm = vpm->vpm_next) == first) { + goto skip_queue; + } else { + goto next_vpmap; + } + } + + /* + * Assign this vpm to the newpage. + */ + pmtx = PPMTX(newpage); + gotnewvpm = 0; + mutex_enter(pmtx); + + /* + * Check if some other thread already assigned a vpm to + * this page. + */ + if ((vpmref = newpage->p_vpmref) == 0) { + newpage->p_vpmref = VPMID(vpm); + gotnewvpm = 1; + } else { + VPM_DEBUG(vpmd_contend); + mutex_exit(vmtx); + } + mutex_exit(pmtx); + + if (gotnewvpm) { + + /* + * At this point, we've selected the vpm. Remove vpm + * from its freelist. If vpm is the first one in + * the freelist, update the head of the freelist. + */ + if (first == vpm) { + ASSERT(first == allocq->vpmq_free); + allocq->vpmq_free = vpm->vpm_next; + } + + /* + * If the head of the freelist still points to vpm, + * then there are no more free vpmaps in that list. + */ + if (allocq->vpmq_free == vpm) + /* + * Took the last one + */ + allocq->vpmq_free = NULL; + else { + vpm->vpm_prev->vpm_next = vpm->vpm_next; + vpm->vpm_next->vpm_prev = vpm->vpm_prev; + } + mutex_exit(&allocq->vpmq_mtx); + vpm->vpm_prev = vpm->vpm_next = NULL; + + /* + * Disassociate the previous page. On x64 systems + * p_vpmref is used as a mapping reference to the page. + */ + if ((pp = vpm->vpm_pp) != NULL && + vpm->vpm_vp == pp->p_vnode && + vpm->vpm_off == pp->p_offset) { + + pmtx = PPMTX(pp); + if (page_trylock(pp, SE_SHARED)) { + /* + * Now verify that it is the correct + * page. If not someone else stole it, + * so just unlock it and leave. + */ + mutex_enter(pmtx); + if (PP_ISFREE(pp) || + vpm->vpm_vp != pp->p_vnode || + vpm->vpm_off != pp->p_offset || + pp->p_vpmref != VPMID(vpm)) { + mutex_exit(pmtx); + + page_unlock(pp); + } else { + /* + * Release the page. + */ + pp->p_vpmref = 0; + mutex_exit(pmtx); + hat_kpm_mapout(pp, 0, + hat_kpm_page2va(pp, 1)); + (void) page_release(pp, 1); + } + } else { + /* + * If the page cannot be locked, just + * clear the p_vpmref and go. + */ + mutex_enter(pmtx); + if (pp->p_vpmref == VPMID(vpm)) { + pp->p_vpmref = 0; + } + mutex_exit(pmtx); + VPM_DEBUG(vpmd_prevpagelocked); + } + } + + /* + * Setup vpm to point to the new page. 
+ */ + vpm->vpm_pp = newpage; + vpm->vpm_vp = newpage->p_vnode; + vpm->vpm_off = newpage->p_offset; + + } else { + int steal = !VPM_MTBF(steals, steals_mtbf); + /* + * Page already has a vpm assigned just use that. + * Grab the vpm mutex and verify that it is still + * the correct one. The pp->p_vpmref should not change + * once we have the vpm mutex and the page lock. + */ + mutex_exit(&allocq->vpmq_mtx); + vpm = VPMP(vpmref); + vmtx = VPMAPMTX(vpm); + mutex_enter(vmtx); + if ((steal && vpm->vpm_refcnt == 0) || + vpm->vpm_pp != newpage) { + /* + * The vpm got stolen, retry. + * clear the p_vpmref. + */ + pmtx = PPMTX(newpage); + mutex_enter(pmtx); + if (newpage->p_vpmref == vpmref) { + newpage->p_vpmref = 0; + } + mutex_exit(pmtx); + + mutex_exit(vmtx); + VPM_DEBUG(vpmd_steals); + goto retry_queue; + } else if (vpm->vpm_refcnt == 0) { + /* + * Remove it from the free list if it + * exists there. + */ + VPMAP_RMFREELIST(vpm); + } + } + return (vpm); + } +} + +static void +free_vpmap(struct vpmap *vpm) +{ + struct vpmfree *vpmflp; + struct vpmap *vpmfreelist; + union vpm_freeq *releq; + + ASSERT(MUTEX_HELD(VPMAPMTX(vpm))); + + if (vpm->vpm_refcnt != 0) { + panic("free_vpmap"); + /*NOTREACHED*/ + } + + vpmflp = &vpmd_free[vpm->vpm_free_ndx]; + /* + * Add to the tail of the release queue + * Note that vpm_releq and vpm_allocq could toggle + * before we get the lock. This does not affect + * correctness as the 2 queues are only maintained + * to reduce lock pressure. + */ + releq = vpmflp->vpm_releq; + if (releq == &vpmflp->vpm_freeq[0]) { + vpm->vpm_ndxflg = 0; + } else { + vpm->vpm_ndxflg = 1; + } + mutex_enter(&releq->vpmq_mtx); + vpmfreelist = releq->vpmq_free; + if (vpmfreelist == 0) { + int want; + + releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm; + /* + * Both queue mutexes are held to set vpm_want; + * snapshot the value before dropping releq mutex. + * If vpm_want appears after the releq mutex is dropped, + * then the vpmap just freed is already gone. + */ + want = vpmflp->vpm_want; + mutex_exit(&releq->vpmq_mtx); + /* + * See if there was a waiter before dropping the releq mutex + * then recheck after obtaining vpm_freeq[0] mutex as + * the another thread may have already signaled. + */ + if (want) { + mutex_enter(&vpmflp->vpm_freeq[0].vpmq_mtx); + if (vpmflp->vpm_want) + cv_signal(&vpmflp->vpm_free_cv); + mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx); + } + } else { + vpm->vpm_next = vpmfreelist; + vpm->vpm_prev = vpmfreelist->vpm_prev; + vpmfreelist->vpm_prev = vpm; + vpm->vpm_prev->vpm_next = vpm; + mutex_exit(&releq->vpmq_mtx); + } +} + +/* + * Get the vpmap for the page. + * The refcnt of this vpm is incremented. + */ +static struct vpmap * +get_vpmap(page_t *pp) +{ + struct vpmap *vpm = NULL; + kmutex_t *vmtx; + kmutex_t *pmtx; + unsigned int refid; + + ASSERT((pp != NULL) && PAGE_LOCKED(pp)); + + if (VPM_MTBF(contend, contend_mtbf) && (refid = pp->p_vpmref) != 0) { + vpm = VPMP(refid); + vmtx = VPMAPMTX(vpm); + mutex_enter(vmtx); + /* + * Since we have the page lock and the vpm mutex, the + * pp->p_vpmref cannot change. + */ + if (vpm->vpm_pp != pp) { + pmtx = PPMTX(pp); + + /* + * Clear the p_vpmref as it is incorrect. + * This can happen if the page was stolen. + * On x64 this should not happen as p_vpmref + * is treated as a mapping on the page. So + * if the page is stolen, the mapping would have + * been cleared in page_unload(). 
+ */ + mutex_enter(pmtx); + if (pp->p_vpmref == refid) + pp->p_vpmref = 0; + mutex_exit(pmtx); + + mutex_exit(vmtx); + vpm = NULL; + } else if (vpm->vpm_refcnt == 0) { + /* + * Got the vpm, remove it from the free + * list if it exists there. + */ + VPMAP_RMFREELIST(vpm); + } + } + if (vpm == NULL) { + /* + * get_free_vpmap() returns with the vpmap mutex held. + */ + vpm = get_free_vpmap(pp); + vmtx = VPMAPMTX(vpm); + vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_misses++; + } else { + vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_hits++; + } + + vpm->vpm_refcnt++; + mutex_exit(vmtx); + + return (vpm); +} + +/* END --- vpm cache ---- */ + +/* + * The vnode page mapping(vpm) interface routines. + */ + +/* + * Find or create the pages starting form baseoff for specified + * length 'len'. + */ +static int +vpm_pagecreate( + struct vnode *vp, + u_offset_t baseoff, + size_t len, + vmap_t vml[], + int nseg, + int *newpage) +{ + + page_t *pp = NULL; + caddr_t base; + u_offset_t off = baseoff; + int i; + ASSERT(nseg >= MINVMAPS && nseg < MAXVMAPS); + + for (i = 0; len > 0; len -= MIN(len, PAGESIZE), i++) { + struct vpmap *vpm; + + + if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) { + + base = segkpm_create_va(off); + + /* + * the seg pointer passed in is just advisor. Just + * pass segkmap for now like segmap does with + * segmap_kpm enabled. + */ + if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT, + segkmap, base)) == NULL) { + panic("segmap_pagecreate_vpm: " + "page_create failed"); + /*NOTREACHED*/ + } + if (newpage != NULL) + *newpage = 1; + + page_io_unlock(pp); + } + + /* + * Get the vpm for this page_t. + */ + if (vpm_cache_enable) { + vpm = get_vpmap(pp); + vml[i].vs_data = (void *)&vpm->vpm_pp; + } else { + vml[i].vs_data = (void *)pp; + pp->p_vpmref = 0; + } + + vml[i].vs_addr = hat_kpm_mapin(pp, 0); + vml[i].vs_len = PAGESIZE; + + off += PAGESIZE; + } + vml[i].vs_data = NULL; + vml[i].vs_addr = (caddr_t)NULL; + return (0); +} + + +/* + * Returns vpm mappings of pages in the range [off, off+len], where + * len is rounded up to the PAGESIZE boundary. The list of pages and + * the page addresses are returned in the SGL vml (vmap_t) array passed in. + * The nseg is the number of vmap_t entries in the array. + * + * Currently max len allowed is MAXBSIZE therefore, it will either + * fetch/create one or two pages depending on what is the PAGESIZE. + * + * The segmap's SM_LOCKPROTO usage is not supported by these interfaces. + * For such cases, use the seg_map interfaces. + */ +int +vpm_map_pages( + struct vnode *vp, + u_offset_t off, + size_t len, + int fetchpage, + vmap_t *vml, + int nseg, + int *newpage, + enum seg_rw rw) +{ + extern struct vnode *common_specvp(); + u_offset_t baseoff; + uint_t prot; + caddr_t base; + page_t *pp, *pplist[MAXVMAPS]; + struct vpmap *vpm; + int i, error = 0; + + ASSERT(nseg >= MINVMAPS && nseg < MAXVMAPS); + baseoff = off & (offset_t)PAGEMASK; + vml[0].vs_data = NULL; + vml[0].vs_addr = (caddr_t)NULL; + /* + * For now, lets restrict it to MAXBSIZE. XXX - We can allow + * len longer then MAXBSIZE, but there should be a limit + * which should be determined by how many pages the VOP_GETPAGE() + * can fetch. + */ + if (off + len > baseoff + MAXBSIZE) { + panic("vpm_map_pages bad len"); + /*NOTREACHED*/ + } + + /* + * If this is a block device we have to be sure to use the + * "common" block device vnode for the mapping. 
+ */ + if (vp->v_type == VBLK) + vp = common_specvp(vp); + + + if (!fetchpage) + return (vpm_pagecreate(vp, baseoff, len, vml, nseg, newpage)); + + for (i = 0; len > 0; len -= MIN(len, PAGESIZE), i++, + pplist[i] = NULL) { + + pp = page_lookup(vp, baseoff, SE_SHARED); + + /* + * If we did not find the page or if this page was not + * in our cache, then let VOP_GETPAGE get all the pages. + * We need to call VOP_GETPAGE so that filesytems can do some + * (un)necessary tracking for sequential access. + */ + + if (pp == NULL || (vpm_cache_enable && pp->p_vpmref == 0) || + (rw == S_WRITE && hat_page_getattr(pp, P_MOD | P_REF) + != (P_MOD | P_REF))) { + if (pp != NULL) { + page_unlock(pp); + } + + /* + * Pass a dummy address as it will be required + * by page_create_va(). We pass segkmap as the seg + * as some file systems(UFS) check it. + */ + base = segkpm_create_va(baseoff); + + error = VOP_GETPAGE(vp, baseoff, len, &prot, &pplist[i], + roundup(len, PAGESIZE), segkmap, base, rw, CRED()); + if (error) { + VPM_DEBUG(vpmd_getpagefailed); + pplist[i] = NULL; + } + break; + } else { + pplist[i] = pp; + baseoff += PAGESIZE; + } + } + + if (error) { + for (i = 0; pplist[i] != NULL; i++) { + page_unlock(pplist[i]); + pplist[i] = NULL; + } + vml[0].vs_addr = NULL; + vml[0].vs_data = NULL; + return (FC_MAKE_ERR(error)); + } + + /* + * Get the vpm's for pages. + */ + for (i = 0; pplist[i] != NULL; i++) { + if (vpm_cache_enable) { + vpm = get_vpmap(pplist[i]); + vml[i].vs_data = (void *)&(vpm->vpm_pp); + } else { + vml[i].vs_data = (void *)pplist[i]; + pplist[i]->p_vpmref = 0; + } + + vml[i].vs_addr = hat_kpm_mapin(pplist[i], 0); + vml[i].vs_len = PAGESIZE; + } + + vml[i].vs_data = NULL; + vml[i].vs_addr = (caddr_t)NULL; + + return (0); +} + +/* + * Release the vpm mappings on the pages and unlock them. + */ +void +vpm_unmap_pages(vmap_t vml[], enum seg_rw rw) +{ + int i; + struct vpmap *vpm; + kmutex_t *mtx; + page_t *pp; + + for (i = 0; vml[i].vs_data != NULL; i++) { + ASSERT(IS_KPM_ADDR(vml[i].vs_addr)); + + if (vpm_cache_enable) { + pp = *(((page_t **)vml[i].vs_data)); + } else { + pp = (page_t *)vml[i].vs_data; + } + + /* + * Mark page as being modified or referenced, bacause vpm pages + * would not cause faults where it would be set normally. + */ + if (rw == S_WRITE) { + hat_setrefmod(pp); + } else { + ASSERT(rw == S_READ); + hat_setref(pp); + } + + if (vpm_cache_enable) { + page_unlock(pp); + vpm = (struct vpmap *)((char *)vml[i].vs_data + - offsetof(struct vpmap, vpm_pp)); + mtx = VPMAPMTX(vpm); + mutex_enter(mtx); + + if (--vpm->vpm_refcnt == 0) { + free_vpmap(vpm); + } + mutex_exit(mtx); + } else { + hat_kpm_mapout(pp, 0, vml[i].vs_addr); + (void) page_release(pp, 1); + } + vml[i].vs_data = NULL; + vml[i].vs_addr = NULL; + } +} + +/* + * Given the vp, off and the uio structure, this routine will do the + * the copy (uiomove). If the last page created is partially written, + * the rest of the page is zeroed out. It also zeros the beginning of + * the first page till the start offset if requested(zerostart). + * If pages are to be fetched, it will call the filesystem's getpage + * function (VOP_GETPAGE) to get them, otherwise they will be created if + * not already present in the page cache. + */ +int +vpm_data_copy(struct vnode *vp, + u_offset_t off, + size_t len, + struct uio *uio, + int fetchpage, + int *newpage, + int zerostart, + enum seg_rw rw) +{ + int error; + struct vmap vml[MINVMAPS]; + enum uio_rw uiorw; + int npages = 0; + + uiorw = (rw == S_WRITE) ? 
UIO_WRITE : UIO_READ; + /* + * 'off' will be the offset where the I/O starts. + * We get the pages starting at the (off & PAGEMASK) + * page boundary. + */ + error = vpm_map_pages(vp, off, (uint_t)len, + fetchpage, vml, MINVMAPS, &npages, rw); + + if (newpage != NULL) + *newpage = npages; + if (!error) { + int i, pn, slen = len; + int pon = off & PAGEOFFSET; + + /* + * Clear from the beginning of the page to start offset + * if requested. + */ + if (!fetchpage && zerostart) { + (void) kzero(vml[0].vs_addr, (uint_t)pon); + VPM_DEBUG(vpmd_zerostart); + } + + for (i = 0; !error && slen > 0 && + vml[i].vs_addr != NULL; i++) { + pn = (int)MIN(slen, (PAGESIZE - pon)); + error = uiomove(vml[i].vs_addr + pon, + (long)pn, uiorw, uio); + slen -= pn; + pon = 0; + } + + /* + * When new pages are created, zero out part of the + * page we did not copy to. + */ + if (!fetchpage && npages && + uio->uio_loffset < roundup(off + len, PAGESIZE)) { + int nzero; + + pon = (uio->uio_loffset & PAGEOFFSET); + nzero = PAGESIZE - pon; + i = (uio->uio_loffset - (off & PAGEMASK)) / PAGESIZE; + (void) kzero(vml[i].vs_addr + pon, (uint_t)nzero); + } + vpm_unmap_pages(vml, rw); + } + return (error); +} + +/* + * called to flush pages for the given vnode covering + * [off, off+len] range. + */ +int +vpm_sync_pages(struct vnode *vp, + u_offset_t off, + size_t len, + uint_t flags) +{ + extern struct vnode *common_specvp(); + int bflags = 0; + int error = 0; + size_t psize = roundup(len, PAGESIZE); + + /* + * If this is a block device we have to be sure to use the + * "common" block device vnode for the mapping. + */ + if (vp->v_type == VBLK) + vp = common_specvp(vp); + + if ((flags & ~SM_DONTNEED) != 0) { + if (flags & SM_ASYNC) + bflags |= B_ASYNC; + if (flags & SM_INVAL) + bflags |= B_INVAL; + if (flags & SM_DESTROY) + bflags |= (B_INVAL|B_TRUNC); + if (flags & SM_FREE) + bflags |= B_FREE; + if (flags & SM_DONTNEED) + bflags |= B_DONTNEED; + + error = VOP_PUTPAGE(vp, off, psize, bflags, CRED()); + } + + return (error); +} + + +#else /* SEGKPM_SUPPORT */ + +/* vpm stubs */ +void +vpm_init() +{ +} + +/*ARGSUSED*/ +int +vpm_pagecreate( + struct vnode *vp, + u_offset_t baseoff, + size_t len, + vmap_t vml[], + int nseg, + int *newpage) +{ + return (0); +} + +/*ARGSUSED*/ +int +vpm_map_pages( + struct vnode *vp, + u_offset_t off, + size_t len, + int fetchpage, + vmap_t vml[], + int nseg, + int *newpage, + enum seg_rw rw) +{ + return (0); +} + +/*ARGSUSED*/ +int +vpm_data_copy(struct vnode *vp, + u_offset_t off, + size_t len, + struct uio *uio, + int fetchpage, + int *newpage, + int zerostart, + enum seg_rw rw) +{ + return (0); +} + +/*ARGSUSED*/ +void +vpm_unmap_pages(vmap_t vml[], enum seg_rw rw) +{ +} +/*ARGSUSED*/ +int +vpm_sync_pages(struct vnode *vp, + u_offset_t off, + size_t len, + uint_t flags) +{ + return (0); +} +#endif /* SEGKPM_SUPPORT */ |
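A caller that needs the kernel addresses of the pages themselves, rather than a uiomove() into a uio, can use vpm_map_pages()/vpm_unmap_pages() from the file above directly. A minimal sketch under that assumption follows; the helper and checksum names are invented for illustration, and the offset is assumed to be page aligned.

```c
/*
 * Illustrative only -- not part of this commit.  Checksum a
 * page-aligned range of a file in place using the mappings returned
 * by vpm_map_pages().  'len' must not exceed MAXBSIZE, the limit
 * these interfaces enforce.  my_checksum() is a hypothetical routine.
 */
#include <sys/vnode.h>
#include <vm/vpm.h>

extern uint32_t my_checksum(caddr_t addr, size_t len);

static int
my_fs_sum_range(struct vnode *vp, u_offset_t off, size_t len,
    uint32_t *sump)
{
	vmap_t vml[MINVMAPS];
	int error, i;

	/* fetchpage != 0: existing file data is read in via VOP_GETPAGE() */
	error = vpm_map_pages(vp, off, len, 1, vml, MINVMAPS, NULL, S_READ);
	if (error)
		return (error);

	/* the returned list is terminated by a NULL vs_addr entry */
	for (i = 0; vml[i].vs_addr != NULL; i++)
		*sump += my_checksum(vml[i].vs_addr, vml[i].vs_len);

	/* unlocks the pages, marks them referenced and frees the mappings */
	vpm_unmap_pages(vml, S_READ);
	return (0);
}
```

As in vpm_data_copy(), the pages remain SE_SHARED locked between vpm_map_pages() and vpm_unmap_pages(), so the window between the two calls should be kept short.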