author	aguzovsk <none@none>	2008-05-22 22:23:49 -0700
committer	aguzovsk <none@none>	2008-05-22 22:23:49 -0700
commit	a98e9dbfaecb09c4032dc0076786bd835f25eab3 (patch)
tree	2fe194d578bd61221d093cd798fcef4757ca76e3
parent	963390b497bb4a7103b44028e033f1dbb5c81964 (diff)
download	illumos-joyent-a98e9dbfaecb09c4032dc0076786bd835f25eab3.tar.gz
6423097 segvn_pagelock() may perform very poorly
6526804 DR delete_memory_thread, AIO, and segvn deadlock
6557794 segspt_dismpagelock() and segspt_shmadvise(MADV_FREE) may deadlock
6557813 seg_ppurge_seg() shouldn't flush all unrelated ISM/DISM segments
6557891 softlocks/pagelocks of anon pages should not decrement availrmem for memory swapped pages
6559612 multiple softlocks on a DISM segment should decrement availrmem just once
6562291 page_mem_avail() is stuck due to availrmem overaccounting and lack of seg_preap() calls
6596555 locked anonymous pages should not have assigned disk swap slots
6639424 hat_sfmmu.c:hat_pagesync() doesn't handle well HAT_SYNC_STOPON_REF and HAT_SYNC_STOPON_MOD flags
6639425 optimize checkpage() optimizations
6662927 page_llock contention during I/O
-rw-r--r--	usr/src/uts/common/fs/swapfs/swap_vnops.c	56
-rw-r--r--	usr/src/uts/common/io/dump.c	13
-rw-r--r--	usr/src/uts/common/os/kstat_fr.c	64
-rw-r--r--	usr/src/uts/common/os/mem_cage.c	5
-rw-r--r--	usr/src/uts/common/os/schedctl.c	1
-rw-r--r--	usr/src/uts/common/os/shm.c	12
-rw-r--r--	usr/src/uts/common/os/vm_pageout.c	9
-rw-r--r--	usr/src/uts/common/vm/anon.h	22
-rw-r--r--	usr/src/uts/common/vm/as.h	9
-rw-r--r--	usr/src/uts/common/vm/seg.h	59
-rw-r--r--	usr/src/uts/common/vm/seg_enum.h	10
-rw-r--r--	usr/src/uts/common/vm/seg_kmem.c	5
-rw-r--r--	usr/src/uts/common/vm/seg_spt.c	196
-rw-r--r--	usr/src/uts/common/vm/seg_spt.h	5
-rw-r--r--	usr/src/uts/common/vm/seg_vn.c	1364
-rw-r--r--	usr/src/uts/common/vm/seg_vn.h	6
-rw-r--r--	usr/src/uts/common/vm/vm_anon.c	110
-rw-r--r--	usr/src/uts/common/vm/vm_as.c	434
-rw-r--r--	usr/src/uts/common/vm/vm_page.c	68
-rw-r--r--	usr/src/uts/common/vm/vm_seg.c	1654
-rw-r--r--	usr/src/uts/sfmmu/vm/hat_sfmmu.c	35
21 files changed, 2862 insertions(+), 1275 deletions(-)
diff --git a/usr/src/uts/common/fs/swapfs/swap_vnops.c b/usr/src/uts/common/fs/swapfs/swap_vnops.c
index 4e69206084..ec4ad4f3b2 100644
--- a/usr/src/uts/common/fs/swapfs/swap_vnops.c
+++ b/usr/src/uts/common/fs/swapfs/swap_vnops.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -167,7 +167,7 @@ swap_getapage(
int upgrade = 0;
SWAPFS_PRINT(SWAP_VOPS, "swap_getapage: vp %p, off %llx, len %lx\n",
- vp, off, len, 0, 0);
+ vp, off, len, 0, 0);
/*
* Until there is a call-back mechanism to cause SEGKP
@@ -247,8 +247,10 @@ again:
mutex_enter(ahm);
ap = swap_anon(vp, off);
- if (ap == NULL)
- panic("swap_getapage: null anon");
+ if (ap == NULL) {
+ panic("swap_getapage:"
+ " null anon");
+ }
if (ap->an_pvp == pvp &&
ap->an_poff == poff) {
@@ -298,7 +300,7 @@ again:
pvn_plist_init(pp, pl, plsz, off, PAGESIZE, rw);
}
TRACE_3(TR_FAC_SWAPFS, TR_SWAPFS_GETAPAGE,
- "swapfs getapage:pp %p vp %p off %llx", pp, vp, off);
+ "swapfs getapage:pp %p vp %p off %llx", pp, vp, off);
return (err);
}
@@ -340,7 +342,7 @@ swap_getconpage(
ASSERT(nreloc != NULL);
ASSERT(!SEG_IS_SEGKP(seg)); /* XXX for now not supported */
SWAPFS_PRINT(SWAP_VOPS, "swap_getconpage: vp %p, off %llx, len %lx\n",
- vp, off, len, 0, 0);
+ vp, off, len, 0, 0);
/*
* If we are not using a preallocated page then we know one already
@@ -384,7 +386,7 @@ swap_getconpage(
pl[1] = NULL;
if (page_pptonum(pp) &
(page_get_pagecnt(conpp->p_szc) - 1))
- cmn_err(CE_PANIC, "swap_getconpage: no root");
+ cmn_err(CE_PANIC, "swap_getconpage: no root");
}
return (err);
}
@@ -415,9 +417,27 @@ swap_getconpage(
"swap_getconpage: swap_getphysname failed!");
}
- if (pvp) {
- err = VOP_PAGEIO(pvp, pp, poff, PAGESIZE, B_READ, cr,
- NULL);
+ if (pvp != NULL) {
+ err = VOP_PAGEIO(pvp, pp, poff, PAGESIZE, B_READ,
+ cr, NULL);
+ if (err == 0) {
+ struct anon *ap;
+ kmutex_t *ahm;
+
+ ahm = &anonhash_lock[AH_LOCK(vp, off)];
+ mutex_enter(ahm);
+ ap = swap_anon(vp, off);
+ if (ap == NULL)
+ panic("swap_getconpage: null anon");
+ if (ap->an_pvp != pvp || ap->an_poff != poff)
+ panic("swap_getconpage: bad anon");
+
+ swap_phys_free(pvp, poff, PAGESIZE);
+ ap->an_pvp = NULL;
+ ap->an_poff = NULL;
+ hat_setmod(pp);
+ mutex_exit(ahm);
+ }
} else {
pagezero(pp, 0, PAGESIZE);
}
@@ -435,7 +455,7 @@ swap_getconpage(
ASSERT(pp->p_prev == pp);
TRACE_3(TR_FAC_SWAPFS, TR_SWAPFS_GETAPAGE,
- "swapfs getconpage:pp %p vp %p off %llx", pp, vp, off);
+ "swapfs getconpage:pp %p vp %p off %llx", pp, vp, off);
pl[0] = pp;
pl[1] = NULL;
@@ -552,7 +572,7 @@ swap_putpage(
pp = page_lookup(vp, io_off, SE_EXCL);
else
pp = page_lookup_nowait(vp, io_off,
- (flags & B_FREE) ? SE_EXCL : SE_SHARED);
+ (flags & B_FREE) ? SE_EXCL : SE_SHARED);
if (pp == NULL || pvn_getdirty(pp, flags) == 0)
io_len = PAGESIZE;
@@ -628,8 +648,8 @@ swap_putapage(
}
SWAPFS_PRINT(SWAP_PUTP,
- "swap_putapage: pp %p, vp %p, off %llx, flags %x\n",
- pp, vp, pp->p_offset, flags, 0);
+ "swap_putapage: pp %p, vp %p, off %llx, flags %x\n",
+ pp, vp, pp->p_offset, flags, 0);
ASSERT(PAGE_LOCKED(pp));
@@ -683,7 +703,7 @@ swap_putapage(
doff = off;
dlen = PAGESIZE;
if (err = swap_newphysname(vp, off, &doff, &dlen,
- &pvp, &poff)) {
+ &pvp, &poff)) {
swap_otherfail++;
swap_otherpages += btop(klsz);
hat_setmod(pp);
@@ -715,7 +735,7 @@ swap_putapage(
}
err = VOP_PAGEIO(klvp, pplist, klstart, klsz,
- B_WRITE | flags, cr, NULL);
+ B_WRITE | flags, cr, NULL);
if ((flags & B_ASYNC) == 0)
pvn_write_done(pp, ((err) ? B_ERROR : 0) | B_WRITE | flags);
@@ -727,8 +747,8 @@ swap_putapage(
}
out:
TRACE_4(TR_FAC_SWAPFS, TR_SWAPFS_PUTAPAGE,
- "swapfs putapage:vp %p klvp %p, klstart %lx, klsz %lx",
- vp, klvp, klstart, klsz);
+ "swapfs putapage:vp %p klvp %p, klstart %lx, klsz %lx",
+ vp, klvp, klstart, klsz);
if (err && err != ENOMEM)
cmn_err(CE_WARN, "swapfs_putapage: err %d\n", err);
if (lenp)
diff --git a/usr/src/uts/common/io/dump.c b/usr/src/uts/common/io/dump.c
index 6498463087..0af0b8660b 100644
--- a/usr/src/uts/common/io/dump.c
+++ b/usr/src/uts/common/io/dump.c
@@ -116,13 +116,12 @@ dump_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred, int *rvalp)
* of these counters.
*/
dumpsize_in_pages = (physinstalled - obp_pages -
- availrmem -
- anon_segkp_pages_locked -
- k_anoninfo.ani_mem_resv -
- segvn_pages_locked -
- pages_locked -
- pages_claimed -
- pages_useclaim);
+ availrmem -
+ anon_segkp_pages_locked -
+ k_anoninfo.ani_mem_resv -
+ pages_locked -
+ pages_claimed -
+ pages_useclaim);
/*
* Protect against vm vagaries.
diff --git a/usr/src/uts/common/os/kstat_fr.c b/usr/src/uts/common/os/kstat_fr.c
index c97b1621cb..b7e84aef21 100644
--- a/usr/src/uts/common/os/kstat_fr.c
+++ b/usr/src/uts/common/os/kstat_fr.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
@@ -494,7 +494,7 @@ kstat_init(void)
*/
kstat_chain_id = 0;
ksp = kstat_create("unix", 0, "kstat_headers", "kstat", KSTAT_TYPE_RAW,
- 0, KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_VAR_SIZE);
+ 0, KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_VAR_SIZE);
if (ksp) {
ksp->ks_lock = &kstat_chain_lock;
ksp->ks_update = header_kstat_update;
@@ -505,35 +505,35 @@ kstat_init(void)
}
ksp = kstat_create("unix", 0, "kstat_types", "kstat",
- KSTAT_TYPE_NAMED, KSTAT_NUM_TYPES, 0);
+ KSTAT_TYPE_NAMED, KSTAT_NUM_TYPES, 0);
if (ksp) {
int i;
kstat_named_t *kn = KSTAT_NAMED_PTR(ksp);
for (i = 0; i < KSTAT_NUM_TYPES; i++) {
kstat_named_init(&kn[i], kstat_data_type[i].name,
- KSTAT_DATA_ULONG);
+ KSTAT_DATA_ULONG);
kn[i].value.ul = i;
}
kstat_install(ksp);
}
ksp = kstat_create("unix", 0, "sysinfo", "misc", KSTAT_TYPE_RAW,
- sizeof (sysinfo_t), KSTAT_FLAG_VIRTUAL);
+ sizeof (sysinfo_t), KSTAT_FLAG_VIRTUAL);
if (ksp) {
ksp->ks_data = (void *) &sysinfo;
kstat_install(ksp);
}
ksp = kstat_create("unix", 0, "vminfo", "vm", KSTAT_TYPE_RAW,
- sizeof (vminfo_t), KSTAT_FLAG_VIRTUAL);
+ sizeof (vminfo_t), KSTAT_FLAG_VIRTUAL);
if (ksp) {
ksp->ks_data = (void *) &vminfo;
kstat_install(ksp);
}
ksp = kstat_create("unix", 0, "segmap", "vm", KSTAT_TYPE_NAMED,
- segmapcnt_ndata, KSTAT_FLAG_VIRTUAL);
+ segmapcnt_ndata, KSTAT_FLAG_VIRTUAL);
if (ksp) {
ksp->ks_data = (void *) segmapcnt_ptr;
ksp->ks_update = segmap_kstat_update;
@@ -541,7 +541,7 @@ kstat_init(void)
}
ksp = kstat_create("unix", 0, "biostats", "misc", KSTAT_TYPE_NAMED,
- biostats_ndata, KSTAT_FLAG_VIRTUAL);
+ biostats_ndata, KSTAT_FLAG_VIRTUAL);
if (ksp) {
ksp->ks_data = (void *) biostats_ptr;
kstat_install(ksp);
@@ -549,7 +549,7 @@ kstat_init(void)
#ifdef VAC
ksp = kstat_create("unix", 0, "flushmeter", "hat", KSTAT_TYPE_RAW,
- sizeof (struct flushmeter), KSTAT_FLAG_VIRTUAL);
+ sizeof (struct flushmeter), KSTAT_FLAG_VIRTUAL);
if (ksp) {
ksp->ks_data = (void *) &flush_cnt;
kstat_install(ksp);
@@ -557,15 +557,15 @@ kstat_init(void)
#endif /* VAC */
ksp = kstat_create("unix", 0, "var", "misc", KSTAT_TYPE_RAW,
- sizeof (struct var), KSTAT_FLAG_VIRTUAL);
+ sizeof (struct var), KSTAT_FLAG_VIRTUAL);
if (ksp) {
ksp->ks_data = (void *) &v;
kstat_install(ksp);
}
ksp = kstat_create("unix", 0, "system_misc", "misc", KSTAT_TYPE_NAMED,
- sizeof (system_misc_kstat) / sizeof (kstat_named_t),
- KSTAT_FLAG_VIRTUAL);
+ sizeof (system_misc_kstat) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
if (ksp) {
ksp->ks_data = (void *) &system_misc_kstat;
ksp->ks_update = system_misc_kstat_update;
@@ -573,8 +573,8 @@ kstat_init(void)
}
ksp = kstat_create("unix", 0, "system_pages", "pages", KSTAT_TYPE_NAMED,
- sizeof (system_pages_kstat) / sizeof (kstat_named_t),
- KSTAT_FLAG_VIRTUAL);
+ sizeof (system_pages_kstat) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
if (ksp) {
ksp->ks_data = (void *) &system_pages_kstat;
ksp->ks_update = system_pages_kstat_update;
@@ -911,9 +911,9 @@ system_pages_kstat_update(kstat_t *ksp, int rw)
* user explicit page locking.
*/
system_pages_kstat.pp_kernel.value.ul = (ulong_t)(physinstalled -
- obp_pages - availrmem - k_anoninfo.ani_mem_resv -
- anon_segkp_pages_locked - segvn_pages_locked -
- pages_locked - pages_claimed - pages_useclaim);
+ obp_pages - availrmem - k_anoninfo.ani_mem_resv -
+ anon_segkp_pages_locked - pages_locked -
+ pages_claimed - pages_useclaim);
return (0);
}
@@ -923,7 +923,7 @@ kstat_create(const char *ks_module, int ks_instance, const char *ks_name,
const char *ks_class, uchar_t ks_type, uint_t ks_ndata, uchar_t ks_flags)
{
return (kstat_create_zone(ks_module, ks_instance, ks_name, ks_class,
- ks_type, ks_ndata, ks_flags, ALL_ZONES));
+ ks_type, ks_ndata, ks_flags, ALL_ZONES));
}
/*
@@ -966,8 +966,8 @@ kstat_create_zone(const char *ks_module, int ks_instance, const char *ks_name,
*/
if (ks_type >= KSTAT_NUM_TYPES) {
cmn_err(CE_WARN, "kstat_create('%s', %d, '%s'): "
- "invalid kstat type %d",
- ks_module, ks_instance, ks_name, ks_type);
+ "invalid kstat type %d",
+ ks_module, ks_instance, ks_name, ks_type);
return (NULL);
}
@@ -978,8 +978,8 @@ kstat_create_zone(const char *ks_module, int ks_instance, const char *ks_name,
if ((ks_flags & KSTAT_FLAG_PERSISTENT) &&
(ks_flags & KSTAT_FLAG_VIRTUAL)) {
cmn_err(CE_WARN, "kstat_create('%s', %d, '%s'): "
- "cannot create persistent virtual kstat",
- ks_module, ks_instance, ks_name);
+ "cannot create persistent virtual kstat",
+ ks_module, ks_instance, ks_name);
return (NULL);
}
@@ -990,8 +990,8 @@ kstat_create_zone(const char *ks_module, int ks_instance, const char *ks_name,
if ((ks_flags & KSTAT_FLAG_VAR_SIZE) &&
!(ks_flags & KSTAT_FLAG_VIRTUAL)) {
cmn_err(CE_WARN, "kstat_create('%s', %d, '%s'): "
- "cannot create variable-size physical kstat",
- ks_module, ks_instance, ks_name);
+ "cannot create variable-size physical kstat",
+ ks_module, ks_instance, ks_name);
return (NULL);
}
@@ -1001,10 +1001,10 @@ kstat_create_zone(const char *ks_module, int ks_instance, const char *ks_name,
if (ks_ndata < kstat_data_type[ks_type].min_ndata ||
ks_ndata > kstat_data_type[ks_type].max_ndata) {
cmn_err(CE_WARN, "kstat_create('%s', %d, '%s'): "
- "ks_ndata=%d out of range [%d, %d]",
- ks_module, ks_instance, ks_name, (int)ks_ndata,
- kstat_data_type[ks_type].min_ndata,
- kstat_data_type[ks_type].max_ndata);
+ "ks_ndata=%d out of range [%d, %d]",
+ ks_module, ks_instance, ks_name, (int)ks_ndata,
+ kstat_data_type[ks_type].min_ndata,
+ kstat_data_type[ks_type].max_ndata);
return (NULL);
}
@@ -1036,8 +1036,8 @@ kstat_create_zone(const char *ks_module, int ks_instance, const char *ks_name,
*/
kstat_rele(ksp);
cmn_err(CE_WARN, "kstat_create('%s', %d, '%s'): "
- "invalid reactivation of dormant kstat",
- ks_module, ks_instance, ks_name);
+ "invalid reactivation of dormant kstat",
+ ks_module, ks_instance, ks_name);
return (NULL);
}
/*
@@ -1056,8 +1056,8 @@ kstat_create_zone(const char *ks_module, int ks_instance, const char *ks_name,
e = kstat_alloc(ks_flags & KSTAT_FLAG_VIRTUAL ? 0 : ks_data_size);
if (e == NULL) {
cmn_err(CE_NOTE, "kstat_create('%s', %d, '%s'): "
- "insufficient kernel memory",
- ks_module, ks_instance, ks_name);
+ "insufficient kernel memory",
+ ks_module, ks_instance, ks_name);
return (NULL);
}
diff --git a/usr/src/uts/common/os/mem_cage.c b/usr/src/uts/common/os/mem_cage.c
index 8ff50400d2..f6f8c285dd 100644
--- a/usr/src/uts/common/os/mem_cage.c
+++ b/usr/src/uts/common/os/mem_cage.c
@@ -1271,6 +1271,11 @@ kcage_create_throttle(pgcnt_t npages, int flags)
}
}
}
+
+ if (NOMEMWAIT() && freemem < minfree) {
+ return (KCT_CRIT);
+ }
+
}
return (KCT_NONCRIT);
}
diff --git a/usr/src/uts/common/os/schedctl.c b/usr/src/uts/common/os/schedctl.c
index 4a5ccc9944..752c2535c4 100644
--- a/usr/src/uts/common/os/schedctl.c
+++ b/usr/src/uts/common/os/schedctl.c
@@ -676,6 +676,7 @@ schedctl_freepage(struct anon_map *amp, caddr_t kaddr)
* we have to free everything rather than letting as_free
* do the work.
*/
+ anonmap_purge(amp);
anon_free(amp->ahp, 0, PAGESIZE);
ANON_LOCK_EXIT(&amp->a_rwlock);
anonmap_free(amp);
diff --git a/usr/src/uts/common/os/shm.c b/usr/src/uts/common/os/shm.c
index 6407f022d9..c6ee57b758 100644
--- a/usr/src/uts/common/os/shm.c
+++ b/usr/src/uts/common/os/shm.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -718,9 +718,8 @@ shmctl(int shmid, int cmd, void *arg)
if (error = shmem_lock(sp, sp->shm_amp)) {
ANON_LOCK_ENTER(&sp->shm_amp->a_rwlock,
RW_WRITER);
- cmn_err(CE_NOTE,
- "shmctl - couldn't lock %ld pages into "
- "memory", sp->shm_amp->size);
+ cmn_err(CE_NOTE, "shmctl - couldn't lock %ld"
+ " pages into memory", sp->shm_amp->size);
ANON_LOCK_EXIT(&sp->shm_amp->a_rwlock);
error = ENOMEM;
sp->shm_lkcnt--;
@@ -1253,13 +1252,14 @@ shm_rm_amp(kshmid_t *sp)
* Free up the anon_map.
*/
lgrp_shm_policy_fini(amp, NULL);
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
+ anonmap_purge(amp);
if (amp->a_szc != 0) {
- ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
anon_shmap_free_pages(amp, 0, amp->size);
- ANON_LOCK_EXIT(&amp->a_rwlock);
} else {
anon_free(amp->ahp, 0, amp->size);
}
+ ANON_LOCK_EXIT(&amp->a_rwlock);
anon_unresv_zone(amp->swresv, zone);
anonmap_free(amp);
}
diff --git a/usr/src/uts/common/os/vm_pageout.c b/usr/src/uts/common/os/vm_pageout.c
index 7a2bb48887..2a521fdb5d 100644
--- a/usr/src/uts/common/os/vm_pageout.c
+++ b/usr/src/uts/common/os/vm_pageout.c
@@ -531,7 +531,7 @@ schedpaging(void *arg)
if (freemem < lotsfree + needfree + kmem_reapahead)
kmem_reap();
- if (freemem < lotsfree + needfree + seg_preapahead)
+ if (freemem < lotsfree + needfree)
seg_preap();
if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
@@ -957,9 +957,10 @@ checkpage(struct page *pp, int whichhand)
*
* NOTE: These optimizations assume that reads are atomic.
*/
-top:
- if ((PP_ISKAS(pp)) || (PP_ISFREE(pp)) ||
- hat_page_checkshare(pp, po_share) || PAGE_LOCKED(pp)) {
+
+ if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) ||
+ pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
+ hat_page_checkshare(pp, po_share)) {
return (-1);
}
diff --git a/usr/src/uts/common/vm/anon.h b/usr/src/uts/common/vm/anon.h
index dcd062031e..13672d5c0b 100644
--- a/usr/src/uts/common/vm/anon.h
+++ b/usr/src/uts/common/vm/anon.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -139,7 +139,6 @@ extern struct anon **anon_hash;
* Declaration for the Global counters to accurately
* track the kernel foot print in memory.
*/
-extern pgcnt_t segvn_pages_locked;
extern pgcnt_t pages_locked;
extern pgcnt_t pages_claimed;
extern pgcnt_t pages_useclaim;
@@ -278,7 +277,7 @@ struct kshmid;
* 0 (base page size) or page_num_pagesizes() - 1, while MAP_PRIVATE
* the amp->szc could be anything in [0, page_num_pagesizes() - 1].
*/
-struct anon_map {
+typedef struct anon_map {
krwlock_t a_rwlock; /* protect anon_map and anon array */
size_t size; /* size in bytes mapped by the anon array */
struct anon_hdr *ahp; /* anon array header pointer, containing */
@@ -288,7 +287,13 @@ struct anon_map {
ushort_t a_szc; /* max szc among shared processes */
void *locality; /* lgroup locality info */
struct kshmid *a_sp; /* kshmid if amp backs sysV, or NULL */
-};
+ int a_purgewait; /* somebody waits for slocks to go away */
+ kcondvar_t a_purgecv; /* cv for waiting for slocks to go away */
+ kmutex_t a_purgemtx; /* mutex for anonmap_purge() */
+ spgcnt_t a_softlockcnt; /* number of pages locked in pcache */
+ kmutex_t a_pmtx; /* protects amp's pcache list */
+ pcache_link_t a_phead; /* head of amp's pcache list */
+} amp_t;
#ifdef _KERNEL
@@ -303,6 +308,9 @@ struct anon_map {
#define ANON_LOCK_ENTER(lock, type) rw_enter((lock), (type))
#define ANON_LOCK_EXIT(lock) rw_exit((lock))
+#define ANON_LOCK_HELD(lock) RW_LOCK_HELD((lock))
+#define ANON_READ_HELD(lock) RW_READ_HELD((lock))
+#define ANON_WRITE_HELD(lock) RW_WRITE_HELD((lock))
#define ANON_ARRAY_HASH(amp, idx)\
((((idx) + ((idx) >> ANON_ARRAY_SHIFT) +\
@@ -334,9 +342,9 @@ typedef struct anon_sync_obj {
/*
* Swap slots currently available for reservation
*/
-#define CURRENT_TOTAL_AVAILABLE_SWAP \
+#define CURRENT_TOTAL_AVAILABLE_SWAP \
((k_anoninfo.ani_max - k_anoninfo.ani_phys_resv) + \
- MAX((spgcnt_t)(availrmem - swapfs_minfree), 0))
+ MAX((spgcnt_t)(availrmem - swapfs_minfree), 0))
struct k_anoninfo {
pgcnt_t ani_max; /* total reservable slots on phys */
@@ -392,6 +400,8 @@ extern int anon_resvmem(size_t, boolean_t, zone_t *, int);
extern void anon_unresvmem(size_t, zone_t *);
extern struct anon_map *anonmap_alloc(size_t, size_t, int);
extern void anonmap_free(struct anon_map *);
+extern void anonmap_purge(struct anon_map *);
+extern void anon_swap_free(struct anon *, struct page *);
extern void anon_decref(struct anon *);
extern int non_anon(struct anon_hdr *, ulong_t, u_offset_t *, size_t *);
extern pgcnt_t anon_pages(struct anon_hdr *, ulong_t, pgcnt_t);
diff --git a/usr/src/uts/common/vm/as.h b/usr/src/uts/common/vm/as.h
index 826ad4dbb9..381bdbaedc 100644
--- a/usr/src/uts/common/vm/as.h
+++ b/usr/src/uts/common/vm/as.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -131,23 +131,26 @@ struct as {
#define AS_CLAIMGAP 0x40
#define AS_UNMAPWAIT 0x20
#define AS_NEEDSPURGE 0x10 /* mostly for seg_nf, see as_purge() */
+#define AS_NOUNMAPWAIT 0x02
#define AS_BUSY 0x01 /* needed by XHAT framework */
#define AS_ISPGLCK(as) ((as)->a_flags & AS_PAGLCK)
#define AS_ISCLAIMGAP(as) ((as)->a_flags & AS_CLAIMGAP)
#define AS_ISUNMAPWAIT(as) ((as)->a_flags & AS_UNMAPWAIT)
#define AS_ISBUSY(as) ((as)->a_flags & AS_BUSY)
-
+#define AS_ISNOUNMAPWAIT(as) ((as)->a_flags & AS_NOUNMAPWAIT)
#define AS_SETPGLCK(as) ((as)->a_flags |= AS_PAGLCK)
#define AS_SETCLAIMGAP(as) ((as)->a_flags |= AS_CLAIMGAP)
#define AS_SETUNMAPWAIT(as) ((as)->a_flags |= AS_UNMAPWAIT)
#define AS_SETBUSY(as) ((as)->a_flags |= AS_BUSY)
+#define AS_SETNOUNMAPWAIT(as) ((as)->a_flags |= AS_NOUNMAPWAIT)
#define AS_CLRPGLCK(as) ((as)->a_flags &= ~AS_PAGLCK)
#define AS_CLRCLAIMGAP(as) ((as)->a_flags &= ~AS_CLAIMGAP)
#define AS_CLRUNMAPWAIT(as) ((as)->a_flags &= ~AS_UNMAPWAIT)
#define AS_CLRBUSY(as) ((as)->a_flags &= ~AS_BUSY)
+#define AS_CLRNOUNMAPWAIT(as) ((as)->a_flags &= ~AS_NOUNMAPWAIT)
#define AS_TYPE_64BIT(as) \
(((as)->a_userlimit > (caddr_t)UINT32_MAX) ? 1 : 0)
@@ -281,8 +284,6 @@ int as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
size_t size, enum seg_rw rw);
void as_pageunlock(struct as *as, struct page **pp, caddr_t addr,
size_t size, enum seg_rw rw);
-void as_pagereclaim(struct as *as, struct page **pp, caddr_t addr,
- size_t size, enum seg_rw rw);
int as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
boolean_t wait);
int as_set_default_lpsize(struct as *as, caddr_t addr, size_t size);
diff --git a/usr/src/uts/common/vm/seg.h b/usr/src/uts/common/vm/seg.h
index 3ec037f13b..2e1e6a77de 100644
--- a/usr/src/uts/common/vm/seg.h
+++ b/usr/src/uts/common/vm/seg.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -55,6 +55,8 @@ extern "C" {
* VM - Segments.
*/
+struct anon_map;
+
/*
* kstat statistics for segment advise
*/
@@ -93,7 +95,12 @@ typedef struct memid { u_longlong_t val[2]; } memid_t;
* write locked.
*/
-struct seg {
+typedef struct pcache_link {
+ struct pcache_link *p_lnext;
+ struct pcache_link *p_lprev;
+} pcache_link_t;
+
+typedef struct seg {
caddr_t s_base; /* base virtual address */
size_t s_size; /* size in bytes */
uint_t s_szc; /* max page size code */
@@ -102,7 +109,9 @@ struct seg {
avl_node_t s_tree; /* AVL tree links to segs in this as */
struct seg_ops *s_ops; /* ops vector: see below */
void *s_data; /* private data for instance */
-};
+ kmutex_t s_pmtx; /* protects seg's pcache list */
+ pcache_link_t s_phead; /* head of seg's pcache list */
+} seg_t;
#define S_PURGE (0x01) /* seg should be purged in as_gap() */
@@ -136,6 +145,7 @@ struct seg_ops {
};
#ifdef _KERNEL
+
/*
* Generic segment operations
*/
@@ -149,28 +159,41 @@ extern void seg_free(struct seg *seg);
/*
* functions for pagelock cache support
*/
-extern void seg_ppurge(struct seg *seg);
-extern void seg_ppurge_seg(int (*callback)());
-extern void seg_pinactive(struct seg *seg, caddr_t addr, size_t len,
- struct page **pp, enum seg_rw rw, int (*callback)());
-extern int seg_pinsert_check(struct seg *seg, size_t len, uint_t flags);
-extern int seg_pinsert(struct seg *seg, caddr_t addr, size_t len,
- struct page **pp, enum seg_rw rw, uint_t flags,
- int (*callback)());
-extern struct page **seg_plookup(struct seg *seg, caddr_t addr,
- size_t len, enum seg_rw rw);
+typedef int (*seg_preclaim_cbfunc_t)(void *, caddr_t, size_t,
+ struct page **, enum seg_rw, int);
+
+extern struct page **seg_plookup(struct seg *seg, struct anon_map *amp,
+ caddr_t addr, size_t len, enum seg_rw rw, uint_t flags);
+extern void seg_pinactive(struct seg *seg, struct anon_map *amp,
+ caddr_t addr, size_t len, struct page **pp, enum seg_rw rw,
+ uint_t flags, seg_preclaim_cbfunc_t callback);
+
+extern void seg_ppurge(struct seg *seg, struct anon_map *amp,
+ uint_t flags);
+extern void seg_ppurge_wiredpp(struct page **pp);
+
+extern int seg_pinsert_check(struct seg *seg, struct anon_map *amp,
+ caddr_t addr, size_t len, uint_t flags);
+extern int seg_pinsert(struct seg *seg, struct anon_map *amp,
+ caddr_t addr, size_t len, size_t wlen, struct page **pp, enum seg_rw rw,
+ uint_t flags, seg_preclaim_cbfunc_t callback);
+
extern void seg_pasync_thread(void);
extern void seg_preap(void);
extern int seg_p_disable(void);
extern void seg_p_enable(void);
-extern int seg_preapahead;
-extern segadvstat_t segadvstat;
+extern segadvstat_t segadvstat;
+
/*
- * Flags for pagelock cache support
+ * Flags for pagelock cache support.
+ * Flags argument is passed as uint_t to pcache routines. upper 16 bits of
+ * the flags argument are reserved for alignment page shift when SEGP_PSHIFT
+ * is set.
*/
-#define SEGP_ASYNC_FLUSH 0x1 /* flushed by async thread */
-#define SEGP_FORCE_WIRED 0x2 /* skip check against seg_pwindow */
+#define SEGP_FORCE_WIRED 0x1 /* skip check against seg_pwindow */
+#define SEGP_AMP 0x2 /* anon map's pcache entry */
+#define SEGP_PSHIFT 0x4 /* addr pgsz shift for hash function */
/*
* Return values for seg_pinsert and seg_pinsert_check functions.
diff --git a/usr/src/uts/common/vm/seg_enum.h b/usr/src/uts/common/vm/seg_enum.h
index 88ebd65bee..5c37e2b178 100644
--- a/usr/src/uts/common/vm/seg_enum.h
+++ b/usr/src/uts/common/vm/seg_enum.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
@@ -62,8 +61,7 @@ enum fault_type {
*/
enum lock_type {
L_PAGELOCK, /* lock pages */
- L_PAGEUNLOCK, /* unlock pages */
- L_PAGERECLAIM /* reclaim pages */
+ L_PAGEUNLOCK /* unlock pages */
};
/*
diff --git a/usr/src/uts/common/vm/seg_kmem.c b/usr/src/uts/common/vm/seg_kmem.c
index de09e4448b..1c2464b36c 100644
--- a/usr/src/uts/common/vm/seg_kmem.c
+++ b/usr/src/uts/common/vm/seg_kmem.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -674,9 +674,6 @@ segkmem_pagelock(struct seg *seg, caddr_t addr, size_t len,
BT_TEST(segkp_bitmap, btop((uintptr_t)(addr - seg->s_base))))
return (SEGOP_PAGELOCK(segkp, addr, len, ppp, type, rw));
- if (type == L_PAGERECLAIM)
- return (ENOTSUP);
-
npages = btopr(len);
nb = sizeof (page_t *) * npages;
diff --git a/usr/src/uts/common/vm/seg_spt.c b/usr/src/uts/common/vm/seg_spt.c
index eaf97ac1a8..ff8f3749ef 100644
--- a/usr/src/uts/common/vm/seg_spt.c
+++ b/usr/src/uts/common/vm/seg_spt.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -174,8 +174,8 @@ struct seg_ops segspt_shmops = {
};
static void segspt_purge(struct seg *seg);
-static int segspt_reclaim(struct seg *, caddr_t, size_t, struct page **,
- enum seg_rw);
+static int segspt_reclaim(void *, caddr_t, size_t, struct page **,
+ enum seg_rw, int);
static int spt_anon_getpages(struct seg *seg, caddr_t addr, size_t len,
page_t **ppa);
@@ -833,6 +833,7 @@ segspt_dismpagelock(struct seg *seg, caddr_t addr, size_t len,
uint_t szc;
ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+ ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK);
/*
* We want to lock/unlock the entire ISM segment. Therefore,
@@ -857,8 +858,8 @@ segspt_dismpagelock(struct seg *seg, caddr_t addr, size_t len,
if (type == L_PAGEUNLOCK) {
ASSERT(sptd->spt_ppa != NULL);
- seg_pinactive(seg, seg->s_base, sptd->spt_amp->size,
- sptd->spt_ppa, sptd->spt_prot, segspt_reclaim);
+ seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size,
+ sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
/*
* If someone is blocked while unmapping, we purge
@@ -868,17 +869,16 @@ segspt_dismpagelock(struct seg *seg, caddr_t addr, size_t len,
* raw async i/o is still in progress or where a thread
* exits on data fault in a multithreaded application.
*/
- if (AS_ISUNMAPWAIT(seg->s_as) && (shmd->shm_softlockcnt > 0)) {
+ if ((sptd->spt_flags & DISM_PPA_CHANGED) ||
+ (AS_ISUNMAPWAIT(seg->s_as) &&
+ shmd->shm_softlockcnt > 0)) {
segspt_purge(seg);
}
return (0);
- } else if (type == L_PAGERECLAIM) {
- ASSERT(sptd->spt_ppa != NULL);
- (void) segspt_reclaim(seg, seg->s_base, sptd->spt_amp->size,
- sptd->spt_ppa, sptd->spt_prot);
- return (0);
}
+ /* The L_PAGELOCK case ... */
+
if (sptd->spt_flags & DISM_PPA_CHANGED) {
segspt_purge(seg);
/*
@@ -893,17 +893,17 @@ segspt_dismpagelock(struct seg *seg, caddr_t addr, size_t len,
* First try to find pages in segment page cache, without
* holding the segment lock.
*/
- pplist = seg_plookup(seg, seg->s_base, sptd->spt_amp->size,
- sptd->spt_prot);
+ pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
+ S_WRITE, SEGP_FORCE_WIRED);
if (pplist != NULL) {
ASSERT(sptd->spt_ppa != NULL);
ASSERT(sptd->spt_ppa == pplist);
ppa = sptd->spt_ppa;
for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
if (ppa[an_idx] == NULL) {
- seg_pinactive(seg, seg->s_base,
+ seg_pinactive(seg, NULL, seg->s_base,
sptd->spt_amp->size, ppa,
- sptd->spt_prot, segspt_reclaim);
+ S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
*ppp = NULL;
return (ENOTSUP);
}
@@ -923,13 +923,12 @@ segspt_dismpagelock(struct seg *seg, caddr_t addr, size_t len,
return (0);
}
- /* The L_PAGELOCK case... */
mutex_enter(&sptd->spt_lock);
/*
* try to find pages in segment page cache with mutex
*/
- pplist = seg_plookup(seg, seg->s_base, sptd->spt_amp->size,
- sptd->spt_prot);
+ pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
+ S_WRITE, SEGP_FORCE_WIRED);
if (pplist != NULL) {
ASSERT(sptd->spt_ppa != NULL);
ASSERT(sptd->spt_ppa == pplist);
@@ -937,9 +936,9 @@ segspt_dismpagelock(struct seg *seg, caddr_t addr, size_t len,
for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
if (ppa[an_idx] == NULL) {
mutex_exit(&sptd->spt_lock);
- seg_pinactive(seg, seg->s_base,
+ seg_pinactive(seg, NULL, seg->s_base,
sptd->spt_amp->size, ppa,
- sptd->spt_prot, segspt_reclaim);
+ S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
*ppp = NULL;
return (ENOTSUP);
}
@@ -959,8 +958,8 @@ segspt_dismpagelock(struct seg *seg, caddr_t addr, size_t len,
*ppp = &(sptd->spt_ppa[pg_idx]);
return (0);
}
- if (seg_pinsert_check(seg, sptd->spt_amp->size, SEGP_FORCE_WIRED) ==
- SEGP_FAIL) {
+ if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size,
+ SEGP_FORCE_WIRED) == SEGP_FAIL) {
mutex_exit(&sptd->spt_lock);
*ppp = NULL;
return (ENOTSUP);
@@ -1038,16 +1037,18 @@ segspt_dismpagelock(struct seg *seg, caddr_t addr, size_t len,
}
ANON_LOCK_EXIT(&amp->a_rwlock);
- mutex_enter(&freemem_lock);
- if (availrmem < tune.t_minarmem + claim_availrmem) {
+ if (claim_availrmem) {
+ mutex_enter(&freemem_lock);
+ if (availrmem < tune.t_minarmem + claim_availrmem) {
+ mutex_exit(&freemem_lock);
+ ret = ENOTSUP;
+ claim_availrmem = 0;
+ goto insert_fail;
+ } else {
+ availrmem -= claim_availrmem;
+ }
mutex_exit(&freemem_lock);
- ret = FC_MAKE_ERR(ENOMEM);
- claim_availrmem = 0;
- goto insert_fail;
- } else {
- availrmem -= claim_availrmem;
}
- mutex_exit(&freemem_lock);
sptd->spt_ppa = pl;
} else {
@@ -1059,8 +1060,8 @@ segspt_dismpagelock(struct seg *seg, caddr_t addr, size_t len,
ASSERT(pl != NULL);
- ret = seg_pinsert(seg, seg->s_base, sptd->spt_amp->size,
- pl, sptd->spt_prot, SEGP_FORCE_WIRED | SEGP_ASYNC_FLUSH,
+ ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size,
+ sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED,
segspt_reclaim);
if (ret == SEGP_FAIL) {
/*
@@ -1089,8 +1090,9 @@ segspt_dismpagelock(struct seg *seg, caddr_t addr, size_t len,
for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
if (ppa[an_idx] == NULL) {
mutex_exit(&sptd->spt_lock);
- seg_pinactive(seg, seg->s_base, sptd->spt_amp->size,
- pl, sptd->spt_prot, segspt_reclaim);
+ seg_pinactive(seg, NULL, seg->s_base,
+ sptd->spt_amp->size,
+ pl, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
*ppp = NULL;
return (ENOTSUP);
}
@@ -1113,7 +1115,7 @@ segspt_dismpagelock(struct seg *seg, caddr_t addr, size_t len,
* to the requested addr, i.e. pg_idx.
*/
*ppp = &(sptd->spt_ppa[pg_idx]);
- return (ret);
+ return (0);
insert_fail:
/*
@@ -1125,9 +1127,11 @@ insert_fail:
mutex_exit(&sptd->spt_lock);
if (pl_built) {
- mutex_enter(&freemem_lock);
- availrmem += claim_availrmem;
- mutex_exit(&freemem_lock);
+ if (claim_availrmem) {
+ mutex_enter(&freemem_lock);
+ availrmem += claim_availrmem;
+ mutex_exit(&freemem_lock);
+ }
/*
* We created pl and we need to destroy it.
@@ -1184,6 +1188,8 @@ segspt_shmpagelock(struct seg *seg, caddr_t addr, size_t len,
u_offset_t off;
ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+ ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK);
+
/*
* We want to lock/unlock the entire ISM segment. Therefore,
@@ -1213,8 +1219,8 @@ segspt_shmpagelock(struct seg *seg, caddr_t addr, size_t len,
ASSERT(sptd->spt_ppa != NULL);
- seg_pinactive(seg, seg->s_base, sptd->spt_amp->size,
- sptd->spt_ppa, sptd->spt_prot, segspt_reclaim);
+ seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size,
+ sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
/*
* If someone is blocked while unmapping, we purge
@@ -1228,20 +1234,16 @@ segspt_shmpagelock(struct seg *seg, caddr_t addr, size_t len,
segspt_purge(seg);
}
return (0);
- } else if (type == L_PAGERECLAIM) {
- ASSERT(sptd->spt_ppa != NULL);
-
- (void) segspt_reclaim(seg, seg->s_base, sptd->spt_amp->size,
- sptd->spt_ppa, sptd->spt_prot);
- return (0);
}
+ /* The L_PAGELOCK case... */
+
/*
* First try to find pages in segment page cache, without
* holding the segment lock.
*/
- pplist = seg_plookup(seg, seg->s_base, sptd->spt_amp->size,
- sptd->spt_prot);
+ pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
+ S_WRITE, SEGP_FORCE_WIRED);
if (pplist != NULL) {
ASSERT(sptd->spt_ppa == pplist);
ASSERT(sptd->spt_ppa[page_index]);
@@ -1254,14 +1256,13 @@ segspt_shmpagelock(struct seg *seg, caddr_t addr, size_t len,
return (0);
}
- /* The L_PAGELOCK case... */
mutex_enter(&sptd->spt_lock);
/*
* try to find pages in segment page cache
*/
- pplist = seg_plookup(seg, seg->s_base, sptd->spt_amp->size,
- sptd->spt_prot);
+ pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
+ S_WRITE, SEGP_FORCE_WIRED);
if (pplist != NULL) {
ASSERT(sptd->spt_ppa == pplist);
/*
@@ -1274,8 +1275,8 @@ segspt_shmpagelock(struct seg *seg, caddr_t addr, size_t len,
return (0);
}
- if (seg_pinsert_check(seg, sptd->spt_amp->size, SEGP_FORCE_WIRED) ==
- SEGP_FAIL) {
+ if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size,
+ SEGP_FORCE_WIRED) == SEGP_FAIL) {
mutex_exit(&sptd->spt_lock);
*ppp = NULL;
return (ENOTSUP);
@@ -1338,8 +1339,9 @@ segspt_shmpagelock(struct seg *seg, caddr_t addr, size_t len,
ASSERT(pl != NULL);
- ret = seg_pinsert(seg, seg->s_base, sptd->spt_amp->size,
- pl, sptd->spt_prot, SEGP_FORCE_WIRED, segspt_reclaim);
+ ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size,
+ sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED,
+ segspt_reclaim);
if (ret == SEGP_FAIL) {
/*
* seg_pinsert failed. We return
@@ -1375,7 +1377,7 @@ segspt_shmpagelock(struct seg *seg, caddr_t addr, size_t len,
* to the requested addr, i.e. page_index.
*/
*ppp = &(sptd->spt_ppa[page_index]);
- return (ret);
+ return (0);
insert_fail:
/*
@@ -1419,13 +1421,14 @@ insert_fail:
static void
segspt_purge(struct seg *seg)
{
- seg_ppurge(seg);
+ seg_ppurge(seg, NULL, SEGP_FORCE_WIRED);
}
static int
-segspt_reclaim(struct seg *seg, caddr_t addr, size_t len, struct page **pplist,
- enum seg_rw rw)
+segspt_reclaim(void *ptag, caddr_t addr, size_t len, struct page **pplist,
+ enum seg_rw rw, int async)
{
+ struct seg *seg = (struct seg *)ptag;
struct shm_data *shmd = (struct shm_data *)seg->s_data;
struct seg *sptseg;
struct spt_data *sptd;
@@ -1442,6 +1445,8 @@ segspt_reclaim(struct seg *seg, caddr_t addr, size_t len, struct page **pplist,
ASSERT(sptd->spt_pcachecnt != 0);
ASSERT(sptd->spt_ppa == pplist);
ASSERT(npages == btopr(sptd->spt_amp->size));
+ ASSERT(async || AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
/*
* Acquire the lock on the dummy seg and destroy the
* ppa array IF this is the last pcachecnt.
@@ -1462,7 +1467,7 @@ segspt_reclaim(struct seg *seg, caddr_t addr, size_t len, struct page **pplist,
free_availrmem++;
page_unlock(pplist[i]);
}
- if (sptd->spt_flags & SHM_PAGEABLE) {
+ if ((sptd->spt_flags & SHM_PAGEABLE) && free_availrmem) {
mutex_enter(&freemem_lock);
availrmem += free_availrmem;
mutex_exit(&freemem_lock);
@@ -1482,14 +1487,41 @@ segspt_reclaim(struct seg *seg, caddr_t addr, size_t len, struct page **pplist,
done = 1;
}
mutex_exit(&sptd->spt_lock);
+
+ /*
+ * If we are pcache async thread or called via seg_ppurge_wiredpp() we
+ * may not hold AS lock (in this case async argument is not 0). This
+ * means if softlockcnt drops to 0 after the decrement below address
+ * space may get freed. We can't allow it since after softlock
+ * derement to 0 we still need to access as structure for possible
+ * wakeup of unmap waiters. To prevent the disappearance of as we take
+ * this segment's shm_segfree_syncmtx. segspt_shmfree() also takes
+ * this mutex as a barrier to make sure this routine completes before
+ * segment is freed.
+ *
+ * The second complication we have to deal with in async case is a
+ * possibility of missed wake up of unmap wait thread. When we don't
+ * hold as lock here we may take a_contents lock before unmap wait
+ * thread that was first to see softlockcnt was still not 0. As a
+ * result we'll fail to wake up an unmap wait thread. To avoid this
+ * race we set nounmapwait flag in as structure if we drop softlockcnt
+ * to 0 if async is not 0. unmapwait thread
+ * will not block if this flag is set.
+ */
+ if (async)
+ mutex_enter(&shmd->shm_segfree_syncmtx);
+
/*
* Now decrement softlockcnt.
*/
+ ASSERT(shmd->shm_softlockcnt > 0);
atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), -1);
if (shmd->shm_softlockcnt <= 0) {
- if (AS_ISUNMAPWAIT(seg->s_as)) {
+ if (async || AS_ISUNMAPWAIT(seg->s_as)) {
mutex_enter(&seg->s_as->a_contents);
+ if (async)
+ AS_SETNOUNMAPWAIT(seg->s_as);
if (AS_ISUNMAPWAIT(seg->s_as)) {
AS_CLRUNMAPWAIT(seg->s_as);
cv_broadcast(&seg->s_as->a_cv);
@@ -1497,6 +1529,10 @@ segspt_reclaim(struct seg *seg, caddr_t addr, size_t len, struct page **pplist,
mutex_exit(&seg->s_as->a_contents);
}
}
+
+ if (async)
+ mutex_exit(&shmd->shm_segfree_syncmtx);
+
return (done);
}
@@ -1604,6 +1640,7 @@ segspt_softunlock(struct seg *seg, caddr_t sptseg_addr,
softlock_decrement:
npages = btopr(len);
+ ASSERT(shmd->shm_softlockcnt >= npages);
atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), -npages);
if (shmd->shm_softlockcnt == 0) {
/*
@@ -1646,6 +1683,8 @@ segspt_shmattach(struct seg *seg, caddr_t *argsp)
(void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, shm_amp, 0,
NULL, 0, seg->s_size);
+ mutex_init(&shmd->shm_segfree_syncmtx, NULL, MUTEX_DEFAULT, NULL);
+
seg->s_data = (void *)shmd;
seg->s_ops = &segspt_shmops;
seg->s_szc = shmd->shm_sptseg->s_szc;
@@ -1741,6 +1780,15 @@ segspt_shmfree(struct seg *seg)
kmem_free(shmd->shm_vpage, btopr(shm_amp->size));
shmd->shm_vpage = NULL;
}
+
+ /*
+ * Take shm_segfree_syncmtx lock to let segspt_reclaim() finish if it's
+ * still working with this segment without holding as lock.
+ */
+ ASSERT(shmd->shm_softlockcnt == 0);
+ mutex_enter(&shmd->shm_segfree_syncmtx);
+ mutex_destroy(&shmd->shm_segfree_syncmtx);
+
kmem_free(shmd, sizeof (*shmd));
}
@@ -1834,14 +1882,6 @@ segspt_dismfault(struct hat *hat, struct seg *seg, caddr_t addr,
case F_SOFTLOCK:
- mutex_enter(&freemem_lock);
- if (availrmem < tune.t_minarmem + npages) {
- mutex_exit(&freemem_lock);
- return (FC_MAKE_ERR(ENOMEM));
- } else {
- availrmem -= npages;
- }
- mutex_exit(&freemem_lock);
atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages);
/*
* Fall through to the F_INVAL case to load up the hat layer
@@ -1858,9 +1898,6 @@ segspt_dismfault(struct hat *hat, struct seg *seg, caddr_t addr,
err = spt_anon_getpages(sptseg, segspt_addr, size, ppa);
if (err != 0) {
if (type == F_SOFTLOCK) {
- mutex_enter(&freemem_lock);
- availrmem += npages;
- mutex_exit(&freemem_lock);
atomic_add_long((ulong_t *)(
&(shmd->shm_softlockcnt)), -npages);
}
@@ -1934,10 +1971,6 @@ dism_err:
case F_SOFTUNLOCK:
- mutex_enter(&freemem_lock);
- availrmem += npages;
- mutex_exit(&freemem_lock);
-
/*
* This is a bit ugly, we pass in the real seg pointer,
* but the segspt_addr is the virtual address within the
@@ -2616,6 +2649,7 @@ segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len,
int kernel;
anon_sync_obj_t cookie;
rctl_qty_t unlocked = 0;
+ page_t **ppa;
amp = sptd->spt_amp;
mutex_enter(&sptd->spt_lock);
@@ -2661,12 +2695,15 @@ segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len,
}
}
ANON_LOCK_EXIT(&amp->a_rwlock);
- if (sptd->spt_ppa != NULL)
+ if ((ppa = sptd->spt_ppa) != NULL)
sptd->spt_flags |= DISM_PPA_CHANGED;
mutex_exit(&sptd->spt_lock);
rctl_decr_locked_mem(NULL, proj, unlocked, 0);
mutex_exit(&sp->shm_mlock);
+
+ if (ppa != NULL)
+ seg_ppurge_wiredpp(ppa);
}
return (sts);
}
@@ -2748,6 +2785,7 @@ segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
ushort_t gen;
clock_t end_lbolt;
int writer;
+ page_t **ppa;
ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
@@ -2759,7 +2797,7 @@ segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
pg_idx = seg_page(seg, addr);
mutex_enter(&sptd->spt_lock);
- if (sptd->spt_ppa == NULL) {
+ if ((ppa = sptd->spt_ppa) == NULL) {
mutex_exit(&sptd->spt_lock);
ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
anon_disclaim(amp, pg_idx, len);
@@ -2775,7 +2813,7 @@ segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
/*
* Purge all DISM cached pages
*/
- seg_ppurge_seg(segspt_reclaim);
+ seg_ppurge_wiredpp(ppa);
/*
* Drop the AS_LOCK so that other threads can grab it
diff --git a/usr/src/uts/common/vm/seg_spt.h b/usr/src/uts/common/vm/seg_spt.h
index ff14c2dc2d..ebc2ebf465 100644
--- a/usr/src/uts/common/vm/seg_spt.h
+++ b/usr/src/uts/common/vm/seg_spt.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -74,7 +74,7 @@ typedef struct spt_data {
typedef struct shm_data {
struct as *shm_sptas;
struct anon_map *shm_amp;
- size_t shm_softlockcnt; /* # outstanding lock operations */
+ spgcnt_t shm_softlockcnt; /* # outstanding lock operations */
struct seg *shm_sptseg; /* pointer to spt segment */
char *shm_vpage; /* indicating locked pages */
spgcnt_t shm_lckpgs; /* # of locked pages per attached seg */
@@ -82,6 +82,7 @@ typedef struct shm_data {
* Memory allocation policy after shmat()
*/
lgrp_mem_policy_info_t shm_policy_info;
+ kmutex_t shm_segfree_syncmtx; /* barrier lock for segspt_shmfree() */
} shm_data_t;
#define DISM_PG_LOCKED 0x1 /* DISM page is locked */
diff --git a/usr/src/uts/common/vm/seg_vn.c b/usr/src/uts/common/vm/seg_vn.c
index 9b198a48d3..f8ca679fcf 100644
--- a/usr/src/uts/common/vm/seg_vn.c
+++ b/usr/src/uts/common/vm/seg_vn.c
@@ -162,6 +162,11 @@ caddr_t stack_noexec_argsp = (caddr_t)&stack_noexec_crargs; /* noexec stack */
size_t segvn_comb_thrshld = UINT_MAX; /* patchable -- see 1196681 */
+size_t segvn_pglock_comb_thrshld = (1UL << 16); /* 64K */
+size_t segvn_pglock_comb_balign = (1UL << 16); /* 64K */
+uint_t segvn_pglock_comb_bshift;
+size_t segvn_pglock_comb_palign;
+
static int segvn_concat(struct seg *, struct seg *, int);
static int segvn_extend_prev(struct seg *, struct seg *,
struct segvn_crargs *, size_t);
@@ -180,13 +185,15 @@ static faultcode_t segvn_fault_anonpages(struct hat *, struct seg *, caddr_t,
caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
static faultcode_t segvn_faultpage(struct hat *, struct seg *, caddr_t,
u_offset_t, struct vpage *, page_t **, uint_t,
- enum fault_type, enum seg_rw, int, int);
+ enum fault_type, enum seg_rw, int);
static void segvn_vpage(struct seg *);
static size_t segvn_count_swap_by_vpages(struct seg *);
static void segvn_purge(struct seg *seg);
-static int segvn_reclaim(struct seg *, caddr_t, size_t, struct page **,
- enum seg_rw);
+static int segvn_reclaim(void *, caddr_t, size_t, struct page **,
+ enum seg_rw, int);
+static int shamp_reclaim(void *, caddr_t, size_t, struct page **,
+ enum seg_rw, int);
static int sameprot(struct seg *, caddr_t, size_t);
@@ -199,9 +206,6 @@ static int segvn_claim_pages(struct seg *, struct vpage *, u_offset_t,
static void segvn_hat_rgn_unload_callback(caddr_t, caddr_t, caddr_t,
size_t, void *, u_offset_t);
-static int segvn_slock_anonpages(page_t *, int);
-static void segvn_sunlock_anonpages(page_t *, int);
-
static struct kmem_cache *segvn_cache;
static struct kmem_cache **segvn_szc_cache;
@@ -212,7 +216,7 @@ static struct segvnvmstats_str {
ulong_t fullszcpages[10];
ulong_t relocatepages[3];
ulong_t fltanpages[17];
- ulong_t pagelock[3];
+ ulong_t pagelock[2];
ulong_t demoterange[3];
} segvnvmstats;
#endif /* VM_STATS */
@@ -240,7 +244,7 @@ segvn_cache_constructor(void *buf, void *cdrarg, int kmflags)
struct segvn_data *svd = buf;
rw_init(&svd->lock, NULL, RW_DEFAULT, NULL);
- mutex_init(&svd->segp_slock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&svd->segfree_syncmtx, NULL, MUTEX_DEFAULT, NULL);
svd->svn_trnext = svd->svn_trprev = NULL;
return (0);
}
@@ -252,7 +256,7 @@ segvn_cache_destructor(void *buf, void *cdrarg)
struct segvn_data *svd = buf;
rw_destroy(&svd->lock);
- mutex_destroy(&svd->segp_slock);
+ mutex_destroy(&svd->segfree_syncmtx);
}
/*ARGSUSED*/
@@ -467,6 +471,13 @@ segvn_init(void)
NULL, 0, &p0, TS_RUN, minclsyspri);
}
#endif
+
+ if (!ISP2(segvn_pglock_comb_balign) ||
+ segvn_pglock_comb_balign < PAGESIZE) {
+ segvn_pglock_comb_balign = 1UL << 16; /* 64K */
+ }
+ segvn_pglock_comb_bshift = highbit(segvn_pglock_comb_balign) - 1;
+ segvn_pglock_comb_palign = btop(segvn_pglock_comb_balign);
}
#define SEGVN_PAGEIO ((void *)0x1)
@@ -786,6 +797,8 @@ segvn_create(struct seg *seg, void *argsp)
svd->pageadvice = 0;
svd->flags = (ushort_t)a->flags;
svd->softlockcnt = 0;
+ svd->softlockcnt_sbase = 0;
+ svd->softlockcnt_send = 0;
svd->rcookie = HAT_INVALID_REGION_COOKIE;
svd->pageswap = 0;
@@ -991,7 +1004,7 @@ segvn_concat(struct seg *seg1, struct seg *seg2, int amp_cat)
(!svd1->pageprot && !svd2->pageprot && incompat(prot)) ||
incompat(type) || incompat(cred) || incompat(flags) ||
seg1->s_szc != seg2->s_szc || incompat(policy_info.mem_policy) ||
- (svd2->softlockcnt > 0))
+ (svd2->softlockcnt > 0) || svd1->softlockcnt_send > 0)
return (-1);
#undef incompat
@@ -1232,7 +1245,7 @@ segvn_extend_prev(seg1, seg2, a, swresv)
if (svd1->vp != a->vp || svd1->maxprot != a->maxprot ||
(!svd1->pageprot && (svd1->prot != a->prot)) ||
svd1->type != a->type || svd1->flags != a->flags ||
- seg1->s_szc != a->szc)
+ seg1->s_szc != a->szc || svd1->softlockcnt_send > 0)
return (-1);
/* vp == NULL implies zfod, offset doesn't matter */
@@ -1353,7 +1366,7 @@ segvn_extend_next(
if (svd2->vp != a->vp || svd2->maxprot != a->maxprot ||
(!svd2->pageprot && (svd2->prot != a->prot)) ||
svd2->type != a->type || svd2->flags != a->flags ||
- seg2->s_szc != a->szc)
+ seg2->s_szc != a->szc || svd2->softlockcnt_sbase > 0)
return (-1);
/* vp == NULL implies zfod, offset doesn't matter */
if (svd2->vp != NULL &&
@@ -1498,6 +1511,8 @@ segvn_dup(struct seg *seg, struct seg *newseg)
newsvd->pageswap = svd->pageswap;
newsvd->flags = svd->flags;
newsvd->softlockcnt = 0;
+ newsvd->softlockcnt_sbase = 0;
+ newsvd->softlockcnt_send = 0;
newsvd->policy_info = svd->policy_info;
newsvd->rcookie = HAT_INVALID_REGION_COOKIE;
@@ -1797,6 +1812,15 @@ segvn_unmap(struct seg *seg, caddr_t addr, size_t len)
retry:
if (svd->softlockcnt > 0) {
ASSERT(svd->tr_state == SEGVN_TR_OFF);
+
+ /*
+ * If this is shared segment non 0 softlockcnt
+ * means locked pages are still in use.
+ */
+ if (svd->type == MAP_SHARED) {
+ return (EAGAIN);
+ }
+
/*
* since we do have the writers lock nobody can fill
* the cache during the purge. The flush either succeeds
@@ -1946,6 +1970,16 @@ retry:
ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) {
/*
+ * Shared anon map is no longer in use. Before
+ * freeing its pages purge all entries from
+ * pcache that belong to this amp.
+ */
+ if (svd->type == MAP_SHARED) {
+ ASSERT(amp->refcnt == 1);
+ ASSERT(svd->softlockcnt == 0);
+ anonmap_purge(amp);
+ }
+ /*
* Free up now unused parts of anon_map array.
*/
if (amp->a_szc == seg->s_szc) {
@@ -2040,6 +2074,18 @@ retry:
* Free up now unused parts of anon_map array.
*/
ulong_t an_idx = svd->anon_index + npages;
+
+ /*
+ * Shared anon map is no longer in use. Before
+ * freeing its pages purge all entries from
+ * pcache that belong to this amp.
+ */
+ if (svd->type == MAP_SHARED) {
+ ASSERT(amp->refcnt == 1);
+ ASSERT(svd->softlockcnt == 0);
+ anonmap_purge(amp);
+ }
+
if (amp->a_szc == seg->s_szc) {
if (seg->s_szc != 0) {
anon_free_pages(amp->ahp,
@@ -2123,6 +2169,8 @@ retry:
nsvd->offset = svd->offset + (uintptr_t)(nseg->s_base - seg->s_base);
nsvd->swresv = 0;
nsvd->softlockcnt = 0;
+ nsvd->softlockcnt_sbase = 0;
+ nsvd->softlockcnt_send = 0;
ASSERT(nsvd->rcookie == HAT_INVALID_REGION_COOKIE);
if (svd->vp != NULL) {
@@ -2173,6 +2221,18 @@ retry:
* Free up now unused parts of anon_map array.
*/
ulong_t an_idx = svd->anon_index + opages;
+
+ /*
+ * Shared anon map is no longer in use. Before
+ * freeing its pages purge all entries from
+ * pcache that belong to this amp.
+ */
+ if (svd->type == MAP_SHARED) {
+ ASSERT(amp->refcnt == 1);
+ ASSERT(svd->softlockcnt == 0);
+ anonmap_purge(amp);
+ }
+
if (amp->a_szc == seg->s_szc) {
if (seg->s_szc != 0) {
anon_free_pages(amp->ahp, an_idx, len,
@@ -2316,6 +2376,15 @@ segvn_free(struct seg *seg)
seg->s_size);
}
} else {
+
+ /*
+ * Shared anon map is no longer in use. Before
+ * freeing its pages purge all entries from
+ * pcache that belong to this amp.
+ */
+ ASSERT(svd->softlockcnt == 0);
+ anonmap_purge(amp);
+
/*
* Shared - anon_free the entire
* anon_map's worth of stuff and
@@ -2380,153 +2449,17 @@ segvn_free(struct seg *seg)
svd->pageswap = 0;
svd->cred = NULL;
- seg->s_data = NULL;
- kmem_cache_free(segvn_cache, svd);
-}
-
-#ifdef DEBUG
-uint32_t segvn_slock_mtbf = 0;
-#endif
-
-ulong_t segvn_lpglck_limit = 0;
-
-/*
- * Support routines used by segvn_pagelock() and softlock faults for anonymous
- * pages to implement availrmem accounting in a way that makes sure the
- * same memory is accounted just once for all softlock/pagelock purposes.
- * This prevents a bug when availrmem is quickly incorrectly exhausted from
- * several pagelocks to different parts of the same large page since each
- * pagelock has to decrement availrmem by the size of the entire large
- * page. Note those pages are not COW shared until softunlock/pageunlock so
- * we don't need to use cow style accounting here. We also need to make sure
- * the entire large page is accounted even if softlock range is less than the
- * entire large page because large anon pages can't be demoted when any of
- * constituent pages is locked. The caller calls this routine for every page_t
- * it locks. The very first page in the range may not be the root page of a
- * large page. For all other pages it's guaranteed we are going to visit the
- * root of a particular large page before any other constituent page as we are
- * locking sequential pages belonging to the same anon map. So we do all the
- * locking when the root is encountered except for the very first page. Since
- * softlocking is not supported (except S_READ_NOCOW special case) for vmpss
- * segments and since vnode pages can be demoted without locking all
- * constituent pages vnode pages don't come here. Unlocking relies on the
- * fact that pagesize can't change whenever any of constituent large pages is
- * locked at least SE_SHARED. This allows unlocking code to find the right
- * root and decrement availrmem by the same amount it was incremented when the
- * page was locked.
- */
-static int
-segvn_slock_anonpages(page_t *pp, int first)
-{
- pgcnt_t pages;
- pfn_t pfn;
- uchar_t szc = pp->p_szc;
-
- ASSERT(PAGE_LOCKED(pp));
- ASSERT(pp->p_vnode != NULL);
- ASSERT(IS_SWAPFSVP(pp->p_vnode));
-
- /*
- * pagesize won't change as long as any constituent page is locked.
- */
- pages = page_get_pagecnt(pp->p_szc);
- pfn = page_pptonum(pp);
-
- if (!first) {
- if (!IS_P2ALIGNED(pfn, pages)) {
-#ifdef DEBUG
- pp = &pp[-(spgcnt_t)(pfn & (pages - 1))];
- pfn = page_pptonum(pp);
- ASSERT(IS_P2ALIGNED(pfn, pages));
- ASSERT(pp->p_szc == szc);
- ASSERT(pp->p_vnode != NULL);
- ASSERT(IS_SWAPFSVP(pp->p_vnode));
- ASSERT(pp->p_slckcnt != 0);
-#endif /* DEBUG */
- return (1);
- }
- } else if (!IS_P2ALIGNED(pfn, pages)) {
- pp = &pp[-(spgcnt_t)(pfn & (pages - 1))];
-#ifdef DEBUG
- pfn = page_pptonum(pp);
- ASSERT(IS_P2ALIGNED(pfn, pages));
- ASSERT(pp->p_szc == szc);
- ASSERT(pp->p_vnode != NULL);
- ASSERT(IS_SWAPFSVP(pp->p_vnode));
-#endif /* DEBUG */
- }
-
-#ifdef DEBUG
- if (segvn_slock_mtbf && !(gethrtime() % segvn_slock_mtbf)) {
- return (0);
- }
-#endif /* DEBUG */
-
- /*
- * pp is a root page.
- * We haven't locked this large page yet.
- */
- page_struct_lock(pp);
- if (pp->p_slckcnt != 0) {
- if (pp->p_slckcnt < PAGE_SLOCK_MAXIMUM) {
- pp->p_slckcnt++;
- page_struct_unlock(pp);
- return (1);
- }
- page_struct_unlock(pp);
- segvn_lpglck_limit++;
- return (0);
- }
- mutex_enter(&freemem_lock);
- if (availrmem < tune.t_minarmem + pages) {
- mutex_exit(&freemem_lock);
- page_struct_unlock(pp);
- return (0);
- }
- pp->p_slckcnt++;
- availrmem -= pages;
- mutex_exit(&freemem_lock);
- page_struct_unlock(pp);
- return (1);
-}
-
-static void
-segvn_sunlock_anonpages(page_t *pp, int first)
-{
- pgcnt_t pages;
- pfn_t pfn;
-
- ASSERT(PAGE_LOCKED(pp));
- ASSERT(pp->p_vnode != NULL);
- ASSERT(IS_SWAPFSVP(pp->p_vnode));
-
/*
- * pagesize won't change as long as any constituent page is locked.
+ * Take segfree_syncmtx lock to let segvn_reclaim() finish if it's
+ * still working with this segment without holding as lock (in case
+ * it's called by pcache async thread).
*/
- pages = page_get_pagecnt(pp->p_szc);
- pfn = page_pptonum(pp);
+ ASSERT(svd->softlockcnt == 0);
+ mutex_enter(&svd->segfree_syncmtx);
+ mutex_exit(&svd->segfree_syncmtx);
- if (!first) {
- if (!IS_P2ALIGNED(pfn, pages)) {
- return;
- }
- } else if (!IS_P2ALIGNED(pfn, pages)) {
- pp = &pp[-(spgcnt_t)(pfn & (pages - 1))];
-#ifdef DEBUG
- pfn = page_pptonum(pp);
- ASSERT(IS_P2ALIGNED(pfn, pages));
-#endif /* DEBUG */
- }
- ASSERT(pp->p_vnode != NULL);
- ASSERT(IS_SWAPFSVP(pp->p_vnode));
- ASSERT(pp->p_slckcnt != 0);
- page_struct_lock(pp);
- if (--pp->p_slckcnt == 0) {
- mutex_enter(&freemem_lock);
- availrmem += pages;
- mutex_exit(&freemem_lock);
- }
- page_struct_unlock(pp);
+ seg->s_data = NULL;
+ kmem_cache_free(segvn_cache, svd);
}
/*
@@ -2601,19 +2534,10 @@ segvn_softunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw)
}
TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT,
"segvn_fault:pp %p vp %p offset %llx", pp, vp, offset);
- if (svd->vp == NULL) {
- segvn_sunlock_anonpages(pp, adr == addr);
- }
page_unlock(pp);
}
- mutex_enter(&freemem_lock); /* for availrmem */
- if (svd->vp != NULL) {
- availrmem += btop(len);
- }
- segvn_pages_locked -= btop(len);
- svd->softlockcnt -= btop(len);
- mutex_exit(&freemem_lock);
- if (svd->softlockcnt == 0) {
+ ASSERT(svd->softlockcnt >= btop(len));
+ if (!atomic_add_long_nv((ulong_t *)&svd->softlockcnt, -btop(len))) {
/*
* All SOFTLOCKS are gone. Wakeup any waiting
* unmappers so they can try again to unmap.
@@ -2691,8 +2615,7 @@ segvn_faultpage(
uint_t vpprot, /* access allowed to object pages */
enum fault_type type, /* type of fault */
enum seg_rw rw, /* type of access at fault */
- int brkcow, /* we may need to break cow */
- int first) /* first page for this fault if 1 */
+ int brkcow) /* we may need to break cow */
{
struct segvn_data *svd = (struct segvn_data *)seg->s_data;
page_t *pp, **ppp;
@@ -2749,17 +2672,8 @@ segvn_faultpage(
prot = svd->prot;
}
- if (type == F_SOFTLOCK && svd->vp != NULL) {
- mutex_enter(&freemem_lock);
- if (availrmem <= tune.t_minarmem) {
- mutex_exit(&freemem_lock);
- return (FC_MAKE_ERR(ENOMEM)); /* out of real memory */
- } else {
- availrmem--;
- svd->softlockcnt++;
- segvn_pages_locked++;
- }
- mutex_exit(&freemem_lock);
+ if (type == F_SOFTLOCK) {
+ atomic_add_long((ulong_t *)&svd->softlockcnt, 1);
}
/*
@@ -2809,19 +2723,6 @@ segvn_faultpage(
if (lgrp_optimizations())
page_migrate(seg, addr, &pp, 1);
- if (type == F_SOFTLOCK) {
- if (!segvn_slock_anonpages(pp, first)) {
- page_unlock(pp);
- err = ENOMEM;
- goto out;
- } else {
- mutex_enter(&freemem_lock);
- svd->softlockcnt++;
- segvn_pages_locked++;
- mutex_exit(&freemem_lock);
- }
- }
-
if (enable_mbit_wa) {
if (rw == S_WRITE)
hat_setmod(pp);
@@ -2981,23 +2882,6 @@ segvn_faultpage(
if (lgrp_optimizations())
page_migrate(seg, addr, &opp, 1);
- if (type == F_SOFTLOCK && svd->vp == NULL) {
-
- ASSERT(opp->p_szc == 0 ||
- (svd->type == MAP_SHARED &&
- amp != NULL && amp->a_szc != 0));
-
- if (!segvn_slock_anonpages(opp, first)) {
- page_unlock(opp);
- err = ENOMEM;
- goto out;
- } else {
- mutex_enter(&freemem_lock);
- svd->softlockcnt++;
- segvn_pages_locked++;
- mutex_exit(&freemem_lock);
- }
- }
if (IS_VMODSORT(opp->p_vnode) || enable_mbit_wa) {
if (rw == S_WRITE)
hat_setmod(opp);
@@ -3124,18 +3008,6 @@ segvn_faultpage(
page_migrate(seg, addr, &pp, 1);
ASSERT(pp->p_szc == 0);
- if (type == F_SOFTLOCK && svd->vp == NULL) {
- if (!segvn_slock_anonpages(pp, first)) {
- page_unlock(pp);
- err = ENOMEM;
- goto out;
- } else {
- mutex_enter(&freemem_lock);
- svd->softlockcnt++;
- segvn_pages_locked++;
- mutex_exit(&freemem_lock);
- }
- }
ASSERT(!IS_VMODSORT(pp->p_vnode));
if (enable_mbit_wa) {
@@ -3158,12 +3030,8 @@ out:
if (anon_lock)
anon_array_exit(&cookie);
- if (type == F_SOFTLOCK && svd->vp != NULL) {
- mutex_enter(&freemem_lock);
- availrmem++;
- segvn_pages_locked--;
- svd->softlockcnt--;
- mutex_exit(&freemem_lock);
+ if (type == F_SOFTLOCK) {
+ atomic_add_long((ulong_t *)&svd->softlockcnt, -1);
}
return (FC_MAKE_ERR(err));
}
@@ -3819,13 +3687,10 @@ out:
int segvn_anypgsz = 0;
-#define SEGVN_RESTORE_SOFTLOCK(type, pages) \
- if ((type) == F_SOFTLOCK) { \
- mutex_enter(&freemem_lock); \
- availrmem += (pages); \
- segvn_pages_locked -= (pages); \
- svd->softlockcnt -= (pages); \
- mutex_exit(&freemem_lock); \
+#define SEGVN_RESTORE_SOFTLOCK_VP(type, pages) \
+ if ((type) == F_SOFTLOCK) { \
+ atomic_add_long((ulong_t *)&(svd)->softlockcnt, \
+ -(pages)); \
}
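
The renamed macro now backs out only the per-segment softlockcnt with one atomic operation; the old version also took freemem_lock to repair availrmem and the global segvn_pages_locked count. A small C11 sketch of this per-object, lock-free accounting style; seg_counts_t and the helper names are invented for the example.

#include <stdatomic.h>
#include <stdio.h>

/* Per-segment counter updated without a lock, like svd->softlockcnt. */
typedef struct {
	atomic_long softlockcnt;
} seg_counts_t;

static void
softlock_charge(seg_counts_t *sc, long pages)
{
	/* Fault path: account the pages being F_SOFTLOCK'd. */
	atomic_fetch_add(&sc->softlockcnt, pages);
}

static void
softlock_restore(seg_counts_t *sc, long pages)
{
	/* Error path: back the charge out, as the macro above does. */
	atomic_fetch_sub(&sc->softlockcnt, pages);
}

int
main(void)
{
	seg_counts_t sc;

	atomic_init(&sc.softlockcnt, 0);
	softlock_charge(&sc, 8);
	softlock_restore(&sc, 8);
	printf("softlockcnt = %ld\n", (long)atomic_load(&sc.softlockcnt));
	return (0);
}
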
#define SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot) \
@@ -4032,17 +3897,8 @@ segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr,
}
}
if (type == F_SOFTLOCK) {
- mutex_enter(&freemem_lock);
- if (availrmem < tune.t_minarmem + pages) {
- mutex_exit(&freemem_lock);
- err = FC_MAKE_ERR(ENOMEM);
- goto out;
- } else {
- availrmem -= pages;
- segvn_pages_locked += pages;
- svd->softlockcnt += pages;
- }
- mutex_exit(&freemem_lock);
+ atomic_add_long((ulong_t *)&svd->softlockcnt,
+ pages);
}
pplist = NULL;
@@ -4123,7 +3979,7 @@ segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr,
page_free_replacement_page(pplist);
page_create_putback(pages);
}
- SEGVN_RESTORE_SOFTLOCK(type, pages);
+ SEGVN_RESTORE_SOFTLOCK_VP(type, pages);
if (a + pgsz <= eaddr) {
SEGVN_VMSTAT_FLTVNPAGES(19);
err = FC_MAKE_ERR(ierr);
@@ -4179,7 +4035,7 @@ segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr,
page_free_replacement_page(pplist);
page_create_putback(pages);
}
- SEGVN_RESTORE_SOFTLOCK(type, pages);
+ SEGVN_RESTORE_SOFTLOCK_VP(type, pages);
if (szc < seg->s_szc) {
SEGVN_VMSTAT_FLTVNPAGES(26);
/*
@@ -4226,7 +4082,7 @@ segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr,
SEGVN_VMSTAT_FLTVNPAGES(28);
anon_array_exit(&an_cookie);
ANON_LOCK_EXIT(&amp->a_rwlock);
- SEGVN_RESTORE_SOFTLOCK(type, pages);
+ SEGVN_RESTORE_SOFTLOCK_VP(type, pages);
err = FC_MAKE_ERR(ierr);
goto out;
}
@@ -4724,9 +4580,7 @@ segvn_fault_anonpages(struct hat *hat, struct seg *seg, caddr_t lpgaddr,
ulong_t i;
int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD;
anon_sync_obj_t cookie;
- int first = 1;
int adjszc_chk;
- int purged = 0;
int pgflags = (svd->tr_state == SEGVN_TR_ON) ? PG_LOCAL : 0;
ASSERT(szc != 0);
@@ -4794,18 +4648,9 @@ segvn_fault_anonpages(struct hat *hat, struct seg *seg, caddr_t lpgaddr,
lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr,
pgsz);
}
- if (type == F_SOFTLOCK && svd->vp != NULL) {
- mutex_enter(&freemem_lock);
- if (availrmem < tune.t_minarmem + pages) {
- mutex_exit(&freemem_lock);
- err = FC_MAKE_ERR(ENOMEM);
- goto error;
- } else {
- availrmem -= pages;
- segvn_pages_locked += pages;
- svd->softlockcnt += pages;
- }
- mutex_exit(&freemem_lock);
+ if (type == F_SOFTLOCK) {
+ atomic_add_long((ulong_t *)&svd->softlockcnt,
+ pages);
}
anon_array_enter(amp, aindx, &cookie);
ppa_szc = (uint_t)-1;
@@ -4815,13 +4660,10 @@ segvn_fault_anonpages(struct hat *hat, struct seg *seg, caddr_t lpgaddr,
if (ierr != 0) {
anon_array_exit(&cookie);
VM_STAT_ADD(segvnvmstats.fltanpages[4]);
- if (type == F_SOFTLOCK && svd->vp != NULL) {
- VM_STAT_ADD(segvnvmstats.fltanpages[5]);
- mutex_enter(&freemem_lock);
- availrmem += pages;
- segvn_pages_locked -= pages;
- svd->softlockcnt -= pages;
- mutex_exit(&freemem_lock);
+ if (type == F_SOFTLOCK) {
+ atomic_add_long(
+ (ulong_t *)&svd->softlockcnt,
+ -pages);
}
if (ierr > 0) {
VM_STAT_ADD(segvnvmstats.fltanpages[6]);
@@ -4845,41 +4687,6 @@ segvn_fault_anonpages(struct hat *hat, struct seg *seg, caddr_t lpgaddr,
page_migrate(seg, a, ppa, pages);
ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
- if (type == F_SOFTLOCK && svd->vp == NULL) {
- /*
- * If all pages in ppa array belong to the same
- * large page call segvn_slock_anonpages()
- * just for ppa[0].
- */
- for (i = 0; i < pages; i++) {
- if (!segvn_slock_anonpages(ppa[i],
- i == 0 && first)) {
- ulong_t j;
- for (j = 0; j < i; j++) {
- segvn_sunlock_anonpages(
- ppa[j], j == 0 &&
- first);
- page_unlock(ppa[j]);
- }
- for (j = i; j < pages; j++) {
- page_unlock(ppa[j]);
- }
- anon_array_exit(&cookie);
- err = FC_MAKE_ERR(ENOMEM);
- goto error;
- }
- if (i == 0 && ppa[0]->p_szc >= szc) {
- ASSERT(!(page_pptonum(ppa[0]) &
- (pages - 1)));
- break;
- }
- }
- first = 0;
- mutex_enter(&freemem_lock);
- svd->softlockcnt += pages;
- segvn_pages_locked += pages;
- mutex_exit(&freemem_lock);
- }
if (segtype == MAP_SHARED) {
vpprot |= PROT_WRITE;
@@ -4920,17 +4727,6 @@ segvn_fault_anonpages(struct hat *hat, struct seg *seg, caddr_t lpgaddr,
* have relocated locked pages.
*/
ASSERT(ierr == -1 || ierr == -2);
- /*
- * For the very first relocation failure try to purge this
- * segment's cache so that the relocator can obtain an
- * exclusive lock on pages we want to relocate.
- */
- if (!purged && ierr == -1 && ppa_szc != (uint_t)-1 &&
- svd->softlockcnt != 0) {
- purged = 1;
- segvn_purge(seg);
- continue;
- }
if (segvn_anypgsz) {
ASSERT(ierr == -2 || szc != 0);
@@ -5613,7 +5409,7 @@ slow:
*/
for (a = addr; a < addr + len; a += PAGESIZE, off += PAGESIZE) {
err = segvn_faultpage(hat, seg, a, off, vpage, plp, vpprot,
- type, rw, brkcow, a == addr);
+ type, rw, brkcow);
if (err) {
if (amp != NULL)
ANON_LOCK_EXIT(&amp->a_rwlock);
@@ -5826,6 +5622,16 @@ segvn_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
*/
if (svd->softlockcnt > 0) {
ASSERT(svd->tr_state == SEGVN_TR_OFF);
+
+ /*
+ * If this is a shared segment, a non-zero softlockcnt
+ * means locked pages are still in use.
+ */
+ if (svd->type == MAP_SHARED) {
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (EAGAIN);
+ }
+
/*
* Since we do have the segvn writers lock nobody can fill
* the cache with entries belonging to this seg during
@@ -6084,15 +5890,17 @@ segvn_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
if ((VPP_PROT(svp) ^ prot) &
PROT_WRITE) {
if (prot & PROT_WRITE) {
- if (!page_addclaim(pp)) {
- page_unlock(pp);
- break;
- }
+ if (!page_addclaim(
+ pp)) {
+ page_unlock(pp);
+ break;
+ }
} else {
- if (!page_subclaim(pp)) {
- page_unlock(pp);
- break;
- }
+ if (!page_subclaim(
+ pp)) {
+ page_unlock(pp);
+ break;
+ }
}
}
page_unlock(pp);
@@ -6257,6 +6065,15 @@ segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
*/
if (svd->softlockcnt > 0) {
ASSERT(svd->tr_state == SEGVN_TR_OFF);
+
+ /*
+ * If this is a shared segment, a non-zero softlockcnt
+ * means locked pages are still in use.
+ */
+ if (svd->type == MAP_SHARED) {
+ return (EAGAIN);
+ }
+
/*
* Since we do have the segvn writers lock nobody can fill
* the cache with entries belonging to this seg during
@@ -6339,6 +6156,13 @@ segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
}
nsvd = (struct segvn_data *)nseg->s_data;
if (nsvd->softlockcnt > 0) {
+ /*
+ * If this is a shared segment, a non-zero softlockcnt
+ * means locked pages are still in use.
+ */
+ if (nsvd->type == MAP_SHARED) {
+ return (EAGAIN);
+ }
segvn_purge(nseg);
if (nsvd->softlockcnt > 0) {
return (EAGAIN);
@@ -6698,6 +6522,8 @@ segvn_split_seg(struct seg *seg, caddr_t addr)
}
ASSERT(svd->softlockcnt == 0);
+ ASSERT(svd->softlockcnt_sbase == 0);
+ ASSERT(svd->softlockcnt_send == 0);
crhold(svd->cred);
if (svd->vpage != NULL) {
@@ -7336,11 +7162,20 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
if (svd->softlockcnt > 0) {
/*
+ * If this is a shared segment, a non-zero softlockcnt
+ * means locked pages are still in use.
+ */
+ if (svd->type == MAP_SHARED) {
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (EAGAIN);
+ }
+
+ /*
* flush all pages from seg cache
* otherwise we may deadlock in swap_putpage
* for B_INVAL page (4175402).
*
- * Even if we grab segvn WRITER's lock or segp_slock
+ * Even if we grab segvn WRITER's lock
* here, there might be another thread which could've
* successfully performed lookup/insert just before
* we acquired the lock here. So, grabbing either
@@ -7354,6 +7189,18 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
return (EAGAIN);
}
+ } else if (svd->type == MAP_SHARED && svd->amp != NULL &&
+ svd->amp->a_softlockcnt > 0) {
+ /*
+ * Try to purge this amp's entries from pcache. It will
+ * succeed only if other segments that share the amp have no
+ * outstanding softlock's.
+ */
+ segvn_purge(seg);
+ if (svd->amp->a_softlockcnt > 0 || svd->softlockcnt > 0) {
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (EAGAIN);
+ }
}
vpp = svd->vpage;
@@ -7904,8 +7751,10 @@ segvn_lockop(struct seg *seg, caddr_t addr, size_t len,
vp = svd->vp;
off = offset;
}
- anon_array_exit(&cookie);
- ANON_LOCK_EXIT(&amp->a_rwlock);
+ if (op != MC_LOCK || ap == NULL) {
+ anon_array_exit(&cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ }
} else {
vp = svd->vp;
off = offset;
@@ -7933,6 +7782,11 @@ segvn_lockop(struct seg *seg, caddr_t addr, size_t len,
(uint_t *)NULL, pl, PAGESIZE, seg, addr,
S_OTHER, svd->cred, NULL);
+ if (error && ap != NULL) {
+ anon_array_exit(&cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ }
+
/*
* If the error is EDEADLK then we must bounce
* up and drop all vm subsystem locks and then
@@ -8004,6 +7858,13 @@ segvn_lockop(struct seg *seg, caddr_t addr, size_t len,
ASSERT(!VPP_ISPPLOCK(vpp));
ret = page_pp_lock(pp, claim, 0);
+ if (ap != NULL) {
+ if (ap->an_pvp != NULL) {
+ anon_swap_free(ap, pp);
+ }
+ anon_array_exit(&cookie);
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+ }
if (ret == 0) {
/* locking page failed */
page_unlock(pp);
@@ -8146,6 +8007,14 @@ segvn_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
*/
if (svd->softlockcnt > 0) {
/*
+ * If this is a shared segment, a non-zero softlockcnt
+ * means locked pages are still in use.
+ */
+ if (svd->type == MAP_SHARED) {
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (EAGAIN);
+ }
+ /*
* Since we do have the segvn writers lock
* nobody can fill the cache with entries
* belonging to this seg during the purge.
@@ -8164,6 +8033,14 @@ segvn_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
return (EAGAIN);
}
+ } else if (svd->type == MAP_SHARED && svd->amp != NULL &&
+ svd->amp->a_softlockcnt > 0) {
+ /*
+ * Try to purge this amp's entries from pcache. It
+ * will succeed only if other segments that share the
+ * amp have no outstanding softlock's.
+ */
+ segvn_purge(seg);
}
}
@@ -8182,6 +8059,8 @@ segvn_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
return (0);
}
+ segvn_purge(seg);
+
page = seg_page(seg, addr);
ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
anon_disclaim(amp, svd->anon_index + page, len);
@@ -8623,59 +8502,289 @@ segvn_dump(struct seg *seg)
ANON_LOCK_EXIT(&amp->a_rwlock);
}
+#ifdef DEBUG
+static uint32_t segvn_pglock_mtbf = 0;
+#endif
+
+#define PCACHE_SHWLIST ((page_t *)-2)
+#define NOPCACHE_SHWLIST ((page_t *)-1)
+
/*
- * lock/unlock anon pages over a given range. Return shadow list
+ * Lock/Unlock anon pages over a given range. Return shadow list. This routine
+ * uses global segment pcache to cache shadow lists (i.e. pp arrays) of pages
+ * to avoid the overhead of per page locking, unlocking for subsequent IOs to
+ * the same parts of the segment. Currently shadow list creation is only
+ * supported for pure anon segments. MAP_PRIVATE segment pcache entries are
+ * tagged with segment pointer, starting virtual address and length. This
+ * approach for MAP_SHARED segments may add many pcache entries for the same
+ * set of pages and lead to long hash chains that decrease pcache lookup
+ * performance. To avoid this issue for shared segments shared anon map and
+ * starting anon index are used for pcache entry tagging. This allows all
+ * segments to share pcache entries for the same anon range and reduces pcache
+ * chain's length as well as memory overhead from duplicate shadow lists and
+ * pcache entries.
+ *
+ * softlockcnt field in segvn_data structure counts the number of F_SOFTLOCK'd
+ * pages via segvn_fault() and pagelock'd pages via this routine. But pagelock
+ * part of softlockcnt accounting is done differently for private and shared
+ * segments. In private segment case softlock is only incremented when a new
+ * shadow list is created but not when an existing one is found via
+ * seg_plookup(). pcache entries have reference count incremented/decremented
+ * by each seg_plookup()/seg_pinactive() operation. Only entries that have 0
+ * reference count can be purged (and purging is needed before segment can be
+ * freed). When a private segment pcache entry is purged segvn_reclaim() will
+ * decrement softlockcnt. Since in private segment case each of its pcache
+ * entries only belongs to this segment we can expect that when
+ * segvn_pagelock(L_PAGEUNLOCK) was called for all outstanding IOs in this
+ * segment purge will succeed and softlockcnt will drop to 0. In shared
+ * segment case reference count in pcache entry counts active locks from many
+ * different segments so we can't expect segment purging to succeed even when
+ * segvn_pagelock(L_PAGEUNLOCK) was called for all outstanding IOs in this
+ * segment. To be able to determine when there're no pending pagelocks in
+ * shared segment case we don't rely on purging to make softlockcnt drop to 0
+ * but instead softlockcnt is incremented and decremented for every
+ * segvn_pagelock(L_PAGELOCK/L_PAGEUNLOCK) call regardless if a new shadow
+ * list was created or an existing one was found. When softlockcnt drops to 0
+ * this segment no longer has any claims for pcached shadow lists and the
+ * segment can be freed even if there're still active pcache entries
+ * shared by this segment's anon map. Shared segment pcache entries belong to
+ * anon map and are typically removed when anon map is freed after all
+ * processes destroy the segments that use this anon map.
*/
static int
segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp,
enum lock_type type, enum seg_rw rw)
{
struct segvn_data *svd = (struct segvn_data *)seg->s_data;
- size_t np, adjustpages = 0, npages = (len >> PAGESHIFT);
+ size_t np;
+ pgcnt_t adjustpages;
+ pgcnt_t npages;
ulong_t anon_index;
- uint_t protchk;
+ uint_t protchk = (rw == S_READ) ? PROT_READ : PROT_WRITE;
uint_t error;
struct anon_map *amp;
+ pgcnt_t anpgcnt;
struct page **pplist, **pl, *pp;
caddr_t a;
size_t page;
caddr_t lpgaddr, lpgeaddr;
- pgcnt_t szc0_npages = 0;
+ anon_sync_obj_t cookie;
+ int anlock;
+ struct anon_map *pamp;
+ caddr_t paddr;
+ seg_preclaim_cbfunc_t preclaim_callback;
+ size_t pgsz;
+ int use_pcache;
+ size_t wlen;
+ uint_t pflags = 0;
+ int sftlck_sbase = 0;
+ int sftlck_send = 0;
+
+#ifdef DEBUG
+ if (type == L_PAGELOCK && segvn_pglock_mtbf) {
+ hrtime_t ts = gethrtime();
+ if ((ts % segvn_pglock_mtbf) == 0) {
+ return (ENOTSUP);
+ }
+ if ((ts % segvn_pglock_mtbf) == 1) {
+ return (EFAULT);
+ }
+ }
+#endif
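
segvn_pglock_mtbf is a DEBUG-only knob that makes the pagelock fast path fail artificially so the ENOTSUP and EFAULT fallback paths get regular exercise. A userland sketch of the same mean-time-between-failures trick, with clock_gettime(CLOCK_MONOTONIC) standing in for gethrtime(); the names and the value 5 are only examples.

#include <errno.h>
#include <stdio.h>
#include <time.h>

static unsigned long pglock_mtbf = 5;	/* 0 disables injection */

/* Return 0, or an injected errno chosen from the timestamp residue. */
static int
maybe_inject_failure(void)
{
	struct timespec ts;
	unsigned long long now;

	if (pglock_mtbf == 0)
		return (0);
	clock_gettime(CLOCK_MONOTONIC, &ts);
	now = (unsigned long long)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
	if (now % pglock_mtbf == 0)
		return (ENOTSUP);	/* push the caller onto the slow path */
	if (now % pglock_mtbf == 1)
		return (EFAULT);	/* make the caller fault the pages in */
	return (0);
}

int
main(void)
{
	for (int i = 0; i < 10; i++)
		printf("call %d -> %d\n", i, maybe_inject_failure());
	return (0);
}
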
TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_START,
"segvn_pagelock: start seg %p addr %p", seg, addr);
ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
- if (seg->s_szc != 0 && (type == L_PAGELOCK || type == L_PAGEUNLOCK)) {
+ ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK);
+
+ SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
+
+ /*
+ * for now we only support pagelock to anon memory. We would have to
+ * check protections for vnode objects and call into the vnode driver.
+ * That's too much for a fast path. Let the fault entry point handle
+ * it.
+ */
+ if (svd->vp != NULL) {
+ if (type == L_PAGELOCK) {
+ error = ENOTSUP;
+ goto out;
+ }
+ panic("segvn_pagelock(L_PAGEUNLOCK): vp != NULL");
+ }
+ if ((amp = svd->amp) == NULL) {
+ if (type == L_PAGELOCK) {
+ error = EFAULT;
+ goto out;
+ }
+ panic("segvn_pagelock(L_PAGEUNLOCK): amp == NULL");
+ }
+ if (rw != S_READ && rw != S_WRITE) {
+ if (type == L_PAGELOCK) {
+ error = ENOTSUP;
+ goto out;
+ }
+ panic("segvn_pagelock(L_PAGEUNLOCK): bad rw");
+ }
+
+ if (seg->s_szc != 0) {
/*
* We are adjusting the pagelock region to the large page size
* boundary because the unlocked part of a large page cannot
* be freed anyway unless all constituent pages of a large
- * page are locked. Therefore this adjustment allows us to
- * decrement availrmem by the right value (note we don't want
- * to just decrement availrem by the large page size without
- * adjusting addr and len because then we may end up
- * decrementing availrmem by large page size for every
- * constituent page locked by a new as_pagelock call).
- * as_pageunlock caller must always match as_pagelock call's
- * addr and len.
+ * page are locked. Bigger regions reduce pcache chain length
+ * and improve lookup performance. The tradeoff is that the
+ * very first segvn_pagelock() call for a given page is more
+ * expensive if only 1 page_t is needed for IO. This is only
+ * an issue if pcache entry doesn't get reused by several
+ * subsequent calls. We optimize here for the case when pcache
+ * is heavily used by repeated IOs to the same address range.
*
* Note segment's page size cannot change while we are holding
* as lock. And then it cannot change while softlockcnt is
* not 0. This will allow us to correctly recalculate large
- * page size region for the matching pageunlock/reclaim call.
+ * page size region for the matching pageunlock/reclaim call
+ * since as_pageunlock() caller must always match
+ * as_pagelock() call's addr and len.
*
- * for pageunlock *ppp points to the pointer of page_t that
+ * For pageunlock *ppp points to the pointer of page_t that
* corresponds to the real unadjusted start address. Similar
* for pagelock *ppp must point to the pointer of page_t that
* corresponds to the real unadjusted start address.
*/
- size_t pgsz = page_get_pagesize(seg->s_szc);
+ pgsz = page_get_pagesize(seg->s_szc);
CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr);
- adjustpages = ((uintptr_t)(addr - lpgaddr)) >> PAGESHIFT;
+ adjustpages = btop((uintptr_t)(addr - lpgaddr));
+ } else if (len < segvn_pglock_comb_thrshld) {
+ lpgaddr = addr;
+ lpgeaddr = addr + len;
+ adjustpages = 0;
+ pgsz = PAGESIZE;
+ } else {
+ /*
+ * Align the address range of large enough requests to allow
+ * combining of different shadow lists into 1 to reduce memory
+ * overhead from potentially overlapping large shadow lists
+ * (worst case is we have a 1MB IO into buffers with start
+ * addresses separated by 4K). Alignment is only possible if
+ * padded chunks have sufficient access permissions. Note
+ * permissions won't change between L_PAGELOCK and
+ * L_PAGEUNLOCK calls since non 0 softlockcnt will force
+ * segvn_setprot() to wait until softlockcnt drops to 0. This
+ * allows us to determine in L_PAGEUNLOCK the same range we
+ * computed in L_PAGELOCK.
+ *
+ * If alignment is limited by segment ends set
+ * sftlck_sbase/sftlck_send flags. In L_PAGELOCK case when
+ * these flags are set bump softlockcnt_sbase/softlockcnt_send
+ * per segment counters. In L_PAGEUNLOCK case decrease
+ * softlockcnt_sbase/softlockcnt_send counters if
+ * sftlck_sbase/sftlck_send flags are set. When
+ * softlockcnt_sbase/softlockcnt_send are non 0
+ * segvn_concat()/segvn_extend_prev()/segvn_extend_next()
+ * won't merge the segments. This restriction combined with
+ * restriction on segment unmapping and splitting for segments
+ * that have non 0 softlockcnt allows L_PAGEUNLOCK to
+ * correctly determine the same range that was previously
+ * locked by matching L_PAGELOCK.
+ */
+ pflags = SEGP_PSHIFT | (segvn_pglock_comb_bshift << 16);
+ pgsz = PAGESIZE;
+ if (svd->type == MAP_PRIVATE) {
+ lpgaddr = (caddr_t)P2ALIGN((uintptr_t)addr,
+ segvn_pglock_comb_balign);
+ if (lpgaddr < seg->s_base) {
+ lpgaddr = seg->s_base;
+ sftlck_sbase = 1;
+ }
+ } else {
+ ulong_t aix = svd->anon_index + seg_page(seg, addr);
+ ulong_t aaix = P2ALIGN(aix, segvn_pglock_comb_palign);
+ if (aaix < svd->anon_index) {
+ lpgaddr = seg->s_base;
+ sftlck_sbase = 1;
+ } else {
+ lpgaddr = addr - ptob(aix - aaix);
+ ASSERT(lpgaddr >= seg->s_base);
+ }
+ }
+ if (svd->pageprot && lpgaddr != addr) {
+ struct vpage *vp = &svd->vpage[seg_page(seg, lpgaddr)];
+ struct vpage *evp = &svd->vpage[seg_page(seg, addr)];
+ while (vp < evp) {
+ if ((VPP_PROT(vp) & protchk) == 0) {
+ break;
+ }
+ vp++;
+ }
+ if (vp < evp) {
+ lpgaddr = addr;
+ pflags = 0;
+ }
+ }
+ lpgeaddr = addr + len;
+ if (pflags) {
+ if (svd->type == MAP_PRIVATE) {
+ lpgeaddr = (caddr_t)P2ROUNDUP(
+ (uintptr_t)lpgeaddr,
+ segvn_pglock_comb_balign);
+ } else {
+ ulong_t aix = svd->anon_index +
+ seg_page(seg, lpgeaddr);
+ ulong_t aaix = P2ROUNDUP(aix,
+ segvn_pglock_comb_palign);
+ if (aaix < aix) {
+ lpgeaddr = 0;
+ } else {
+ lpgeaddr += ptob(aaix - aix);
+ }
+ }
+ if (lpgeaddr == 0 ||
+ lpgeaddr > seg->s_base + seg->s_size) {
+ lpgeaddr = seg->s_base + seg->s_size;
+ sftlck_send = 1;
+ }
+ }
+ if (svd->pageprot && lpgeaddr != addr + len) {
+ struct vpage *vp;
+ struct vpage *evp;
+
+ vp = &svd->vpage[seg_page(seg, addr + len)];
+ evp = &svd->vpage[seg_page(seg, lpgeaddr)];
+
+ while (vp < evp) {
+ if ((VPP_PROT(vp) & protchk) == 0) {
+ break;
+ }
+ vp++;
+ }
+ if (vp < evp) {
+ lpgeaddr = addr + len;
+ }
+ }
+ adjustpages = btop((uintptr_t)(addr - lpgaddr));
+ }
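
The combining logic above widens a large request to an aligned window, then clamps the window to the segment and records in sftlck_sbase/sftlck_send when a clamp happened. A sketch of that arithmetic for the private-mapping case (shared mappings align on the anon index instead); the 512K combine window and the sample addresses are arbitrary example values, not kernel defaults.

#include <stdint.h>
#include <stdio.h>

#define	P2ALIGN(x, a)	((x) & -(a))
#define	P2ROUNDUP(x, a)	(-(-(x) & -(a)))

int
main(void)
{
	uintptr_t seg_base = 0x10003000, seg_size = 0x00200000;
	uintptr_t addr = 0x10011000, len = 0x00100000;	/* a 1MB IO */
	uintptr_t balign = 0x80000;			/* 512K combine window */
	uintptr_t lpgaddr, lpgeaddr;
	int sftlck_sbase = 0, sftlck_send = 0;

	/* Widen the request so overlapping IOs land on one shadow list ... */
	lpgaddr = P2ALIGN(addr, balign);
	lpgeaddr = P2ROUNDUP(addr + len, balign);

	/* ... but never beyond the segment; remember when an end was hit. */
	if (lpgaddr < seg_base) {
		lpgaddr = seg_base;
		sftlck_sbase = 1;
	}
	if (lpgeaddr > seg_base + seg_size) {
		lpgeaddr = seg_base + seg_size;
		sftlck_send = 1;
	}

	printf("locked window [%#lx, %#lx) sbase=%d send=%d\n",
	    (unsigned long)lpgaddr, (unsigned long)lpgeaddr,
	    sftlck_sbase, sftlck_send);
	return (0);
}
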
+
+ /*
+ * For MAP_SHARED segments we create pcache entries tagged by amp and
+ * anon index so that we can share pcache entries with other segments
+ * that map this amp. For private segments pcache entries are tagged
+ * with segment and virtual address.
+ */
+ if (svd->type == MAP_SHARED) {
+ pamp = amp;
+ paddr = (caddr_t)((lpgaddr - seg->s_base) +
+ ptob(svd->anon_index));
+ preclaim_callback = shamp_reclaim;
+ } else {
+ pamp = NULL;
+ paddr = lpgaddr;
+ preclaim_callback = segvn_reclaim;
}
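
The block just above picks the pcache tag: shared mappings are keyed by (amp, anon offset) so every process mapping the same anon range shares one entry, while private mappings are keyed by (seg, virtual address). A standalone sketch of that tag computation with cut-down stand-in structures; struct amp, struct seg and pcache_tag() here are illustrative and not the kernel types.

#include <stdint.h>
#include <stdio.h>

#define	PAGESHIFT	12
#define	PAGESIZE	(1UL << PAGESHIFT)
#define	ptob(x)		((uintptr_t)(x) << PAGESHIFT)

struct amp { int dummy; };
struct seg {
	uintptr_t s_base;
	struct amp *amp;
	unsigned long anon_index;
	int shared;
};

/*
 * Shared mappings tag by (amp, anon byte offset); private mappings tag
 * by (seg, virtual address), mirroring the pamp/paddr selection above.
 */
static void
pcache_tag(struct seg *seg, uintptr_t addr, void **tagp, uintptr_t *offp)
{
	if (seg->shared) {
		*tagp = seg->amp;
		*offp = (addr - seg->s_base) + ptob(seg->anon_index);
	} else {
		*tagp = seg;
		*offp = addr;
	}
}

int
main(void)
{
	struct amp shared_amp = { 0 };
	struct seg a = { 0x10000000, &shared_amp, 256, 1 };
	struct seg b = { 0x7f000000, &shared_amp, 256, 1 };
	void *tag;
	uintptr_t off;

	/* Two segments mapping the same anon range produce the same tag. */
	pcache_tag(&a, a.s_base + 8 * PAGESIZE, &tag, &off);
	printf("seg a: tag %p off %#lx\n", tag, (unsigned long)off);
	pcache_tag(&b, b.s_base + 8 * PAGESIZE, &tag, &off);
	printf("seg b: tag %p off %#lx\n", tag, (unsigned long)off);
	return (0);
}
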
if (type == L_PAGEUNLOCK) {
+ VM_STAT_ADD(segvnvmstats.pagelock[0]);
/*
* update hat ref bits for /proc. We need to make sure
@@ -8694,13 +8803,50 @@ segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp,
}
}
}
- SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
- if (seg->s_szc != 0) {
- VM_STAT_ADD(segvnvmstats.pagelock[0]);
- seg_pinactive(seg, lpgaddr, lpgeaddr - lpgaddr,
- *ppp - adjustpages, rw, segvn_reclaim);
+
+ /*
+ * Check the shadow list entry after the last page used in
+ * this IO request. If it's NOPCACHE_SHWLIST the shadow list
+ * was not inserted into pcache and is not large page
+ * adjusted. In this case call reclaim callback directly and
+ * don't adjust the shadow list start and size for large
+ * pages.
+ */
+ npages = btop(len);
+ if ((*ppp)[npages] == NOPCACHE_SHWLIST) {
+ void *ptag;
+ if (pamp != NULL) {
+ ASSERT(svd->type == MAP_SHARED);
+ ptag = (void *)pamp;
+ paddr = (caddr_t)((addr - seg->s_base) +
+ ptob(svd->anon_index));
+ } else {
+ ptag = (void *)seg;
+ paddr = addr;
+ }
+ (*preclaim_callback)(ptag, paddr, len, *ppp, rw, 0);
} else {
- seg_pinactive(seg, addr, len, *ppp, rw, segvn_reclaim);
+ ASSERT((*ppp)[npages] == PCACHE_SHWLIST ||
+ IS_SWAPFSVP((*ppp)[npages]->p_vnode));
+ len = lpgeaddr - lpgaddr;
+ npages = btop(len);
+ seg_pinactive(seg, pamp, paddr, len,
+ *ppp - adjustpages, rw, pflags, preclaim_callback);
+ }
+
+ if (pamp != NULL) {
+ ASSERT(svd->type == MAP_SHARED);
+ ASSERT(svd->softlockcnt >= npages);
+ atomic_add_long((ulong_t *)&svd->softlockcnt, -npages);
+ }
+
+ if (sftlck_sbase) {
+ ASSERT(svd->softlockcnt_sbase > 0);
+ atomic_add_long((ulong_t *)&svd->softlockcnt_sbase, -1);
+ }
+ if (sftlck_send) {
+ ASSERT(svd->softlockcnt_send > 0);
+ atomic_add_long((ulong_t *)&svd->softlockcnt_send, -1);
}
/*
@@ -8711,77 +8857,97 @@ segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp,
* raw async i/o is still in progress or where a thread
* exits on data fault in a multithreaded application.
*/
- if (AS_ISUNMAPWAIT(seg->s_as) && (svd->softlockcnt > 0)) {
- /*
- * Even if we grab segvn WRITER's lock or segp_slock
- * here, there might be another thread which could've
- * successfully performed lookup/insert just before
- * we acquired the lock here. So, grabbing either
- * lock here is of not much use. Until we devise
- * a strategy at upper layers to solve the
- * synchronization issues completely, we expect
- * applications to handle this appropriately.
- */
- segvn_purge(seg);
+ if (AS_ISUNMAPWAIT(seg->s_as)) {
+ if (svd->softlockcnt == 0) {
+ mutex_enter(&seg->s_as->a_contents);
+ if (AS_ISUNMAPWAIT(seg->s_as)) {
+ AS_CLRUNMAPWAIT(seg->s_as);
+ cv_broadcast(&seg->s_as->a_cv);
+ }
+ mutex_exit(&seg->s_as->a_contents);
+ } else if (pamp == NULL) {
+ /*
+ * softlockcnt is not 0 and this is a
+ * MAP_PRIVATE segment. Try to purge its
+ * pcache entries to reduce softlockcnt.
+ * If it drops to 0 segvn_reclaim()
+ * will wake up a thread waiting on
+ * unmapwait flag.
+ *
+ * We don't purge MAP_SHARED segments with non
+ * 0 softlockcnt since IO is still in progress
+ * for such segments.
+ */
+ ASSERT(svd->type == MAP_PRIVATE);
+ segvn_purge(seg);
+ }
}
SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END,
"segvn_pagelock: unlock seg %p addr %p", seg, addr);
return (0);
- } else if (type == L_PAGERECLAIM) {
- VM_STAT_COND_ADD(seg->s_szc != 0, segvnvmstats.pagelock[1]);
- SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
- (void) segvn_reclaim(seg, addr, len, *ppp, rw);
- SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
- TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END,
- "segvn_pagelock: reclaim seg %p addr %p", seg, addr);
- return (0);
}
- if (seg->s_szc != 0) {
- VM_STAT_ADD(segvnvmstats.pagelock[2]);
- addr = lpgaddr;
- len = lpgeaddr - lpgaddr;
- npages = (len >> PAGESHIFT);
- }
+ /* The L_PAGELOCK case ... */
- /*
- * for now we only support pagelock to anon memory. We've to check
- * protections for vnode objects and call into the vnode driver.
- * That's too much for a fast path. Let the fault entry point handle it.
- */
- if (svd->vp != NULL) {
- TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END,
- "segvn_pagelock: mapped vnode seg %p addr %p", seg, addr);
- *ppp = NULL;
- return (ENOTSUP);
- }
+ VM_STAT_ADD(segvnvmstats.pagelock[1]);
/*
- * if anonmap is not yet created, let the fault entry point populate it
- * with anon ptrs.
+ * For MAP_SHARED segments we have to check protections before
+ * seg_plookup() since pcache entries may be shared by many segments
+ * with potentially different page protections.
*/
- if ((amp = svd->amp) == NULL) {
- TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END,
- "segvn_pagelock: anonmap null seg %p addr %p", seg, addr);
- *ppp = NULL;
- return (EFAULT);
- }
+ if (pamp != NULL) {
+ ASSERT(svd->type == MAP_SHARED);
+ if (svd->pageprot == 0) {
+ if ((svd->prot & protchk) == 0) {
+ error = EACCES;
+ goto out;
+ }
+ } else {
+ /*
+ * check page protections
+ */
+ caddr_t ea;
- SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
+ if (seg->s_szc) {
+ a = lpgaddr;
+ ea = lpgeaddr;
+ } else {
+ a = addr;
+ ea = addr + len;
+ }
+ for (; a < ea; a += pgsz) {
+ struct vpage *vp;
- /*
- * we acquire segp_slock to prevent duplicate entries
- * in seg_pcache
- */
- mutex_enter(&svd->segp_slock);
+ ASSERT(seg->s_szc == 0 ||
+ sameprot(seg, a, pgsz));
+ vp = &svd->vpage[seg_page(seg, a)];
+ if ((VPP_PROT(vp) & protchk) == 0) {
+ error = EACCES;
+ goto out;
+ }
+ }
+ }
+ }
/*
* try to find pages in segment page cache
*/
- pplist = seg_plookup(seg, addr, len, rw);
+ pplist = seg_plookup(seg, pamp, paddr, lpgeaddr - lpgaddr, rw, pflags);
if (pplist != NULL) {
- mutex_exit(&svd->segp_slock);
+ if (pamp != NULL) {
+ npages = btop((uintptr_t)(lpgeaddr - lpgaddr));
+ ASSERT(svd->type == MAP_SHARED);
+ atomic_add_long((ulong_t *)&svd->softlockcnt,
+ npages);
+ }
+ if (sftlck_sbase) {
+ atomic_add_long((ulong_t *)&svd->softlockcnt_sbase, 1);
+ }
+ if (sftlck_send) {
+ atomic_add_long((ulong_t *)&svd->softlockcnt_send, 1);
+ }
SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
*ppp = pplist + adjustpages;
TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_HIT_END,
@@ -8789,145 +8955,211 @@ segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp,
return (0);
}
- if (rw == S_READ) {
- protchk = PROT_READ;
- } else {
- protchk = PROT_WRITE;
- }
-
- if (svd->pageprot == 0) {
- if ((svd->prot & protchk) == 0) {
- mutex_exit(&svd->segp_slock);
- error = EFAULT;
- goto out;
- }
- } else {
- /*
- * check page protections
- */
- for (a = addr; a < addr + len; a += PAGESIZE) {
- struct vpage *vp;
-
- vp = &svd->vpage[seg_page(seg, a)];
- if ((VPP_PROT(vp) & protchk) == 0) {
- mutex_exit(&svd->segp_slock);
- error = EFAULT;
+ /*
+ * For MAP_SHARED segments we already verified above that segment
+ * protections allow this pagelock operation.
+ */
+ if (pamp == NULL) {
+ ASSERT(svd->type == MAP_PRIVATE);
+ if (svd->pageprot == 0) {
+ if ((svd->prot & protchk) == 0) {
+ error = EACCES;
goto out;
}
+ if (svd->prot & PROT_WRITE) {
+ wlen = lpgeaddr - lpgaddr;
+ } else {
+ wlen = 0;
+ ASSERT(rw == S_READ);
+ }
+ } else {
+ int wcont = 1;
+ /*
+ * check page protections
+ */
+ for (a = lpgaddr, wlen = 0; a < lpgeaddr; a += pgsz) {
+ struct vpage *vp;
+
+ ASSERT(seg->s_szc == 0 ||
+ sameprot(seg, a, pgsz));
+ vp = &svd->vpage[seg_page(seg, a)];
+ if ((VPP_PROT(vp) & protchk) == 0) {
+ error = EACCES;
+ goto out;
+ }
+ if (wcont && (VPP_PROT(vp) & PROT_WRITE)) {
+ wlen += pgsz;
+ } else {
+ wcont = 0;
+ ASSERT(rw == S_READ);
+ }
+ }
}
+ ASSERT(rw == S_READ || wlen == lpgeaddr - lpgaddr);
+ ASSERT(rw == S_WRITE || wlen <= lpgeaddr - lpgaddr);
}
/*
- * Avoid per page overhead of segvn_slock_anonpages() for small
- * pages. For large pages segvn_slock_anonpages() only does real
- * work once per large page. The tradeoff is that we may decrement
- * availrmem more than once for the same page but this is ok
- * for small pages.
+ * Only build large page adjusted shadow list if we expect to insert
+ * it into pcache. For large enough pages it's a big overhead to
+ * create a shadow list of the entire large page. But this overhead
+ * should be amortized over repeated pcache hits on subsequent reuse
+ * of this shadow list (IO into any range within this shadow list will
+ * find it in pcache since we large page align the request for pcache
+ * lookups). pcache performance is improved with bigger shadow lists
+ * as it reduces the time to pcache the entire big segment and reduces
+ * pcache chain length.
*/
- if (seg->s_szc == 0) {
- mutex_enter(&freemem_lock);
- if (availrmem < tune.t_minarmem + npages) {
- mutex_exit(&freemem_lock);
- mutex_exit(&svd->segp_slock);
- error = ENOMEM;
- goto out;
- }
- availrmem -= npages;
- mutex_exit(&freemem_lock);
+ if (seg_pinsert_check(seg, pamp, paddr,
+ lpgeaddr - lpgaddr, pflags) == SEGP_SUCCESS) {
+ addr = lpgaddr;
+ len = lpgeaddr - lpgaddr;
+ use_pcache = 1;
+ } else {
+ use_pcache = 0;
+ /*
+ * Since this entry will not be inserted into the pcache, we
+ * will not do any adjustments to the starting address or
+ * size of the memory to be locked.
+ */
+ adjustpages = 0;
}
+ npages = btop(len);
- pplist = kmem_alloc(sizeof (page_t *) * npages, KM_SLEEP);
+ pplist = kmem_alloc(sizeof (page_t *) * (npages + 1), KM_SLEEP);
pl = pplist;
*ppp = pplist + adjustpages;
+ /*
+ * If use_pcache is 0 this shadow list is not large page adjusted.
+ * Record this info in the last entry of shadow array so that
+ * L_PAGEUNLOCK can determine if it should large page adjust the
+ * address range to find the real range that was locked.
+ */
+ pl[npages] = use_pcache ? PCACHE_SHWLIST : NOPCACHE_SHWLIST;
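
The shadow list is allocated with one extra slot whose value tells L_PAGEUNLOCK whether the list was large-page adjusted and eligible for pcache. A minimal sketch of this tail-sentinel idea; page_t is left opaque and the helper names are invented for the example.

#include <stdio.h>
#include <stdlib.h>

typedef struct page page_t;	/* opaque for the purpose of the sketch */

#define	PCACHE_SHWLIST		((page_t *)-2)	/* adjusted, may be cached */
#define	NOPCACHE_SHWLIST	((page_t *)-1)	/* raw list, never cached */

static page_t **
build_shadow_list(size_t npages, int use_pcache)
{
	/* One extra slot holds the sentinel consulted at unlock time. */
	page_t **pl = calloc(npages + 1, sizeof (page_t *));

	if (pl != NULL)
		pl[npages] = use_pcache ? PCACHE_SHWLIST : NOPCACHE_SHWLIST;
	return (pl);
}

static void
unlock_shadow_list(page_t **pl, size_t npages)
{
	if (pl[npages] == NOPCACHE_SHWLIST)
		printf("unlock exactly the range that was requested\n");
	else
		printf("recompute the large-page aligned range first\n");
	free(pl);
}

int
main(void)
{
	page_t **pl = build_shadow_list(16, 1);

	if (pl != NULL)
		unlock_shadow_list(pl, 16);
	return (0);
}
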
page = seg_page(seg, addr);
anon_index = svd->anon_index + page;
+ anlock = 0;
ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
+ ASSERT(amp->a_szc >= seg->s_szc);
+ anpgcnt = page_get_pagecnt(amp->a_szc);
for (a = addr; a < addr + len; a += PAGESIZE, anon_index++) {
struct anon *ap;
struct vnode *vp;
u_offset_t off;
- anon_sync_obj_t cookie;
- anon_array_enter(amp, anon_index, &cookie);
+ /*
+ * Lock and unlock anon array only once per large page.
+ * anon_array_enter() locks the root anon slot according to
+ * a_szc which can't change while anon map is locked. We lock
+ * anon the first time through this loop and each time we
+ * reach anon index that corresponds to a root of a large
+ * page.
+ */
+ if (a == addr || P2PHASE(anon_index, anpgcnt) == 0) {
+ ASSERT(anlock == 0);
+ anon_array_enter(amp, anon_index, &cookie);
+ anlock = 1;
+ }
ap = anon_get_ptr(amp->ahp, anon_index);
- if (ap == NULL) {
+
+ /*
+ * We must never use seg_pcache for COW pages
+ * because we might end up with original page still
+ * lying in seg_pcache even after private page is
+ * created. This leads to data corruption as
+ * aio_write refers to the page still in cache
+ * while all other accesses refer to the private
+ * page.
+ */
+ if (ap == NULL || ap->an_refcnt != 1) {
+ struct vpage *vpage;
+
+ if (seg->s_szc) {
+ error = EFAULT;
+ break;
+ }
+ if (svd->vpage != NULL) {
+ vpage = &svd->vpage[seg_page(seg, a)];
+ } else {
+ vpage = NULL;
+ }
+ ASSERT(anlock);
anon_array_exit(&cookie);
- break;
- } else {
- /*
- * We must never use seg_pcache for COW pages
- * because we might end up with original page still
- * lying in seg_pcache even after private page is
- * created. This leads to data corruption as
- * aio_write refers to the page still in cache
- * while all other accesses refer to the private
- * page.
- */
- if (ap->an_refcnt != 1) {
- anon_array_exit(&cookie);
+ anlock = 0;
+ pp = NULL;
+ error = segvn_faultpage(seg->s_as->a_hat, seg, a, 0,
+ vpage, &pp, 0, F_INVAL, rw, 1);
+ if (error) {
+ error = fc_decode(error);
+ break;
+ }
+ anon_array_enter(amp, anon_index, &cookie);
+ anlock = 1;
+ ap = anon_get_ptr(amp->ahp, anon_index);
+ if (ap == NULL || ap->an_refcnt != 1) {
+ error = EFAULT;
break;
}
}
swap_xlate(ap, &vp, &off);
- anon_array_exit(&cookie);
-
pp = page_lookup_nowait(vp, off, SE_SHARED);
if (pp == NULL) {
+ error = EFAULT;
break;
}
- if (seg->s_szc != 0 || pp->p_szc != 0) {
- if (!segvn_slock_anonpages(pp, a == addr)) {
- page_unlock(pp);
- break;
- }
- } else {
- szc0_npages++;
+ if (ap->an_pvp != NULL) {
+ anon_swap_free(ap, pp);
+ }
+ /*
+ * Unlock anon if this is the last slot in a large page.
+ */
+ if (P2PHASE(anon_index, anpgcnt) == anpgcnt - 1) {
+ ASSERT(anlock);
+ anon_array_exit(&cookie);
+ anlock = 0;
}
*pplist++ = pp;
}
+ if (anlock) { /* Ensure the lock is dropped */
+ anon_array_exit(&cookie);
+ }
ANON_LOCK_EXIT(&amp->a_rwlock);
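
The loop above takes the anon array lock only on the first iteration and at each large-page root, and drops it at the last constituent slot, instead of locking and unlocking per page. A tiny sketch of that lock-once-per-large-page iteration pattern using P2PHASE; it prints instead of taking real locks.

#include <stdio.h>

#define	P2PHASE(x, align)	((x) & ((align) - 1))

int
main(void)
{
	unsigned long start = 6, npages = 12;
	unsigned long lpgcnt = 4;	/* constituent pages per large page */
	int locked = 0;

	for (unsigned long i = start; i < start + npages; i++) {
		/* Lock at the first index and at every large-page root. */
		if (i == start || P2PHASE(i, lpgcnt) == 0) {
			printf("lock root slot for index %lu\n", i);
			locked = 1;
		}
		printf("  handle index %lu\n", i);
		/* Unlock at the last slot of each large page. */
		if (P2PHASE(i, lpgcnt) == lpgcnt - 1) {
			printf("unlock root slot at index %lu\n", i);
			locked = 0;
		}
	}
	if (locked)	/* the loop may end in the middle of a large page */
		printf("unlock trailing root slot\n");
	return (0);
}
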
- ASSERT(npages >= szc0_npages);
-
if (a >= addr + len) {
- mutex_enter(&freemem_lock);
- if (seg->s_szc == 0 && npages != szc0_npages) {
- ASSERT(svd->type == MAP_SHARED && amp->a_szc > 0);
- availrmem += (npages - szc0_npages);
- }
- svd->softlockcnt += npages;
- segvn_pages_locked += npages;
- mutex_exit(&freemem_lock);
- (void) seg_pinsert(seg, addr, len, pl, rw, SEGP_ASYNC_FLUSH,
- segvn_reclaim);
- mutex_exit(&svd->segp_slock);
+ atomic_add_long((ulong_t *)&svd->softlockcnt, npages);
+ if (pamp != NULL) {
+ ASSERT(svd->type == MAP_SHARED);
+ atomic_add_long((ulong_t *)&pamp->a_softlockcnt,
+ npages);
+ wlen = len;
+ }
+ if (sftlck_sbase) {
+ atomic_add_long((ulong_t *)&svd->softlockcnt_sbase, 1);
+ }
+ if (sftlck_send) {
+ atomic_add_long((ulong_t *)&svd->softlockcnt_send, 1);
+ }
+ if (use_pcache) {
+ (void) seg_pinsert(seg, pamp, paddr, len, wlen, pl,
+ rw, pflags, preclaim_callback);
+ }
SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_FILL_END,
"segvn_pagelock: cache fill seg %p addr %p", seg, addr);
return (0);
}
- mutex_exit(&svd->segp_slock);
- if (seg->s_szc == 0) {
- mutex_enter(&freemem_lock);
- availrmem += npages;
- mutex_exit(&freemem_lock);
- }
- error = EFAULT;
pplist = pl;
np = ((uintptr_t)(a - addr)) >> PAGESHIFT;
while (np > (uint_t)0) {
ASSERT(PAGE_LOCKED(*pplist));
- if (seg->s_szc != 0 || (*pplist)->p_szc != 0) {
- segvn_sunlock_anonpages(*pplist, pplist == pl);
- }
page_unlock(*pplist);
np--;
pplist++;
}
- kmem_free(pl, sizeof (page_t *) * npages);
+ kmem_free(pl, sizeof (page_t *) * (npages + 1));
out:
SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
*ppp = NULL;
@@ -8942,34 +9174,55 @@ out:
static void
segvn_purge(struct seg *seg)
{
- seg_ppurge(seg);
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+
+ /*
+ * pcache is only used by pure anon segments.
+ */
+ if (svd->amp == NULL || svd->vp != NULL) {
+ return;
+ }
+
+ /*
+ * For MAP_SHARED segments a non-zero softlockcnt means
+ * active IO is still in progress via this segment. So we only
+ * purge MAP_SHARED segments when their softlockcnt is 0.
+ */
+ if (svd->type == MAP_PRIVATE) {
+ if (svd->softlockcnt) {
+ seg_ppurge(seg, NULL, 0);
+ }
+ } else if (svd->softlockcnt == 0 && svd->amp->a_softlockcnt != 0) {
+ seg_ppurge(seg, svd->amp, 0);
+ }
}
+/*
+ * If async argument is not 0 we are called from pcache async thread and don't
+ * hold AS lock.
+ */
+
+/*ARGSUSED*/
static int
-segvn_reclaim(struct seg *seg, caddr_t addr, size_t len, struct page **pplist,
- enum seg_rw rw)
+segvn_reclaim(void *ptag, caddr_t addr, size_t len, struct page **pplist,
+ enum seg_rw rw, int async)
{
+ struct seg *seg = (struct seg *)ptag;
struct segvn_data *svd = (struct segvn_data *)seg->s_data;
pgcnt_t np, npages;
struct page **pl;
- pgcnt_t szc0_npages = 0;
-
-#ifdef lint
- addr = addr;
-#endif
- npages = np = (len >> PAGESHIFT);
+ npages = np = btop(len);
ASSERT(npages);
- pl = pplist;
- if (seg->s_szc != 0) {
- size_t pgsz = page_get_pagesize(seg->s_szc);
- if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) {
- panic("segvn_reclaim: unaligned addr or len");
- /*NOTREACHED*/
- }
- }
ASSERT(svd->vp == NULL && svd->amp != NULL);
+ ASSERT(svd->softlockcnt >= npages);
+ ASSERT(async || AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+
+ pl = pplist;
+
+ ASSERT(pl[np] == NOPCACHE_SHWLIST || pl[np] == PCACHE_SHWLIST);
+ ASSERT(!async || pl[np] == PCACHE_SHWLIST);
while (np > (uint_t)0) {
if (rw == S_WRITE) {
@@ -8977,27 +9230,41 @@ segvn_reclaim(struct seg *seg, caddr_t addr, size_t len, struct page **pplist,
} else {
hat_setref(*pplist);
}
- if (seg->s_szc != 0 || (*pplist)->p_szc != 0) {
- segvn_sunlock_anonpages(*pplist, pplist == pl);
- } else {
- szc0_npages++;
- }
page_unlock(*pplist);
np--;
pplist++;
}
- kmem_free(pl, sizeof (page_t *) * npages);
- mutex_enter(&freemem_lock);
- segvn_pages_locked -= npages;
- svd->softlockcnt -= npages;
- if (szc0_npages != 0) {
- availrmem += szc0_npages;
+ kmem_free(pl, sizeof (page_t *) * (npages + 1));
+
+ /*
+ * If we are pcache async thread we don't hold AS lock. This means if
+ * softlockcnt drops to 0 after the decrement below address space may
+ * get freed. We can't allow it since after the softlock decrement to 0 we
+ * still need to access as structure for possible wakeup of unmap
+ * waiters. To prevent the disappearance of as we take this segment's
+ * segfree_syncmtx. segvn_free() also takes this mutex as a barrier to
+ * make sure this routine completes before segment is freed.
+ *
+ * The second complication we have to deal with in async case is a
+ * possibility of missed wake up of unmap wait thread. When we don't
+ * hold as lock here we may take a_contents lock before unmap wait
+ * thread that was first to see softlockcnt was still not 0. As a
+ * result we'll fail to wake up an unmap wait thread. To avoid this
+ * race we set nounmapwait flag in as structure if we drop softlockcnt
+ * to 0 when we were called by pcache async thread. unmapwait thread
+ * will not block if this flag is set.
+ */
+ if (async) {
+ mutex_enter(&svd->segfree_syncmtx);
}
- mutex_exit(&freemem_lock);
- if (svd->softlockcnt <= 0) {
- if (AS_ISUNMAPWAIT(seg->s_as)) {
+
+ if (!atomic_add_long_nv((ulong_t *)&svd->softlockcnt, -npages)) {
+ if (async || AS_ISUNMAPWAIT(seg->s_as)) {
mutex_enter(&seg->s_as->a_contents);
+ if (async) {
+ AS_SETNOUNMAPWAIT(seg->s_as);
+ }
if (AS_ISUNMAPWAIT(seg->s_as)) {
AS_CLRUNMAPWAIT(seg->s_as);
cv_broadcast(&seg->s_as->a_cv);
@@ -9005,8 +9272,59 @@ segvn_reclaim(struct seg *seg, caddr_t addr, size_t len, struct page **pplist,
mutex_exit(&seg->s_as->a_contents);
}
}
+
+ if (async) {
+ mutex_exit(&svd->segfree_syncmtx);
+ }
return (0);
}
+
+/*ARGSUSED*/
+static int
+shamp_reclaim(void *ptag, caddr_t addr, size_t len, struct page **pplist,
+ enum seg_rw rw, int async)
+{
+ amp_t *amp = (amp_t *)ptag;
+ pgcnt_t np, npages;
+ struct page **pl;
+
+ npages = np = btop(len);
+ ASSERT(npages);
+ ASSERT(amp->a_softlockcnt >= npages);
+
+ pl = pplist;
+
+ ASSERT(pl[np] == NOPCACHE_SHWLIST || pl[np] == PCACHE_SHWLIST);
+ ASSERT(!async || pl[np] == PCACHE_SHWLIST);
+
+ while (np > (uint_t)0) {
+ if (rw == S_WRITE) {
+ hat_setrefmod(*pplist);
+ } else {
+ hat_setref(*pplist);
+ }
+ page_unlock(*pplist);
+ np--;
+ pplist++;
+ }
+
+ kmem_free(pl, sizeof (page_t *) * (npages + 1));
+
+ /*
+ * If somebody sleeps in anonmap_purge() wake them up if a_softlockcnt
+ * drops to 0. anon map can't be freed until a_softlockcnt drops to 0
+ * and anonmap_purge() acquires a_purgemtx.
+ */
+ mutex_enter(&amp->a_purgemtx);
+ if (!atomic_add_long_nv((ulong_t *)&amp->a_softlockcnt, -npages) &&
+ amp->a_purgewait) {
+ amp->a_purgewait = 0;
+ cv_broadcast(&amp->a_purgecv);
+ }
+ mutex_exit(&amp->a_purgemtx);
+ return (0);
+}
+
/*
* get a memory ID for an addr in a given segment
*
diff --git a/usr/src/uts/common/vm/seg_vn.h b/usr/src/uts/common/vm/seg_vn.h
index 43838fdc24..66acf8cb61 100644
--- a/usr/src/uts/common/vm/seg_vn.h
+++ b/usr/src/uts/common/vm/seg_vn.h
@@ -86,7 +86,7 @@ typedef struct segvn_crargs {
*/
typedef struct segvn_data {
krwlock_t lock; /* protect segvn_data and vpage array */
- kmutex_t segp_slock; /* serialize insertions into seg_pcache */
+ kmutex_t segfree_syncmtx; /* barrier lock for segvn_free() */
uchar_t pageprot; /* true if per page protections present */
uchar_t prot; /* current segment prot if pageprot == 0 */
uchar_t maxprot; /* maximum segment protections */
@@ -101,7 +101,7 @@ typedef struct segvn_data {
uchar_t advice; /* madvise flags for segment */
uchar_t pageadvice; /* true if per page advice set */
ushort_t flags; /* flags - from sys/mman.h */
- ssize_t softlockcnt; /* # of pages SOFTLOCKED in seg */
+ spgcnt_t softlockcnt; /* # of pages SOFTLOCKED in seg */
lgrp_mem_policy_info_t policy_info; /* memory allocation policy */
hat_region_cookie_t rcookie; /* region for hat calls */
lgrp_mem_policy_info_t tr_policy_info; /* memory allocation for TR */
@@ -110,6 +110,8 @@ typedef struct segvn_data {
struct segvn_data *svn_trprev; /* textrepl list prev link */
int tr_state; /* TR (text replication) state */
uchar_t pageswap; /* true if per page swap accounting is set */
+ spgcnt_t softlockcnt_sbase; /* # of softlocks for seg start addr */
+ spgcnt_t softlockcnt_send; /* # of softlocks for seg end addr */
} segvn_data_t;
#ifdef _KERNEL
diff --git a/usr/src/uts/common/vm/vm_anon.c b/usr/src/uts/common/vm/vm_anon.c
index 8196243d42..3f3502036d 100644
--- a/usr/src/uts/common/vm/vm_anon.c
+++ b/usr/src/uts/common/vm/vm_anon.c
@@ -106,6 +106,7 @@
#include <sys/sysmacros.h>
#include <sys/bitmap.h>
#include <sys/vmsystm.h>
+#include <sys/tuneable.h>
#include <sys/debug.h>
#include <sys/fs/swapnode.h>
#include <sys/tnf_probe.h>
@@ -156,7 +157,6 @@ static struct anonvmstats_str {
} anonvmstats;
#endif /* VM_STATS */
-
/*ARGSUSED*/
static int
anonmap_cache_constructor(void *buf, void *cdrarg, int kmflags)
@@ -164,6 +164,9 @@ anonmap_cache_constructor(void *buf, void *cdrarg, int kmflags)
struct anon_map *amp = buf;
rw_init(&amp->a_rwlock, NULL, RW_DEFAULT, NULL);
+ cv_init(&amp->a_purgecv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&amp->a_pmtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&amp->a_purgemtx, NULL, MUTEX_DEFAULT, NULL);
return (0);
}
@@ -174,6 +177,9 @@ anonmap_cache_destructor(void *buf, void *cdrarg)
struct anon_map *amp = buf;
rw_destroy(&amp->a_rwlock);
+ cv_destroy(&amp->a_purgecv);
+ mutex_destroy(&amp->a_pmtx);
+ mutex_destroy(&amp->a_purgemtx);
}
kmutex_t anonhash_lock[AH_LOCK_SIZE];
@@ -870,6 +876,7 @@ anon_unresvmem(size_t size, zone_t *zone)
mutex_enter(&anoninfo_lock);
ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
+
/*
* If some of this reservation belonged to swapfs
* give it back to availrmem.
@@ -944,6 +951,48 @@ anon_alloc(struct vnode *vp, anoff_t off)
}
/*
+ * Called for pages locked in memory via softlock/pagelock/mlock to make sure
+ * such pages don't consume any physical swap resources needed for swapping
+ * unlocked pages.
+ */
+void
+anon_swap_free(struct anon *ap, page_t *pp)
+{
+ kmutex_t *ahm;
+
+ ASSERT(ap != NULL);
+ ASSERT(pp != NULL);
+ ASSERT(PAGE_LOCKED(pp));
+ ASSERT(pp->p_vnode != NULL);
+ ASSERT(IS_SWAPFSVP(pp->p_vnode));
+ ASSERT(ap->an_refcnt != 0);
+ ASSERT(pp->p_vnode == ap->an_vp);
+ ASSERT(pp->p_offset == ap->an_off);
+
+ if (ap->an_pvp == NULL)
+ return;
+
+ page_io_lock(pp);
+ ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)];
+ mutex_enter(ahm);
+
+ ASSERT(ap->an_refcnt != 0);
+ ASSERT(pp->p_vnode == ap->an_vp);
+ ASSERT(pp->p_offset == ap->an_off);
+
+ if (ap->an_pvp != NULL) {
+ swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE);
+ ap->an_pvp = NULL;
+ ap->an_poff = 0;
+ mutex_exit(ahm);
+ hat_setmod(pp);
+ } else {
+ mutex_exit(ahm);
+ }
+ page_io_unlock(pp);
+}
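
anon_swap_free() tests an_pvp once without the anon hash mutex to stay cheap in the common case, then rechecks under the lock before freeing the physical swap slot. A generic userland sketch of that check, lock, recheck pattern; struct anon_like and release_backing() are invented names, and the unlocked first test is the same shortcut the kernel code takes.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct anon_like {
	pthread_mutex_t lock;	/* stands in for the anon hash mutex */
	void *backing;		/* stands in for an_pvp (swap slot owner) */
};

static void
release_backing(struct anon_like *ap)
{
	/* Cheap unlocked test: most callers find nothing to free. */
	if (ap->backing == NULL)
		return;

	pthread_mutex_lock(&ap->lock);
	/* Recheck: another thread may have freed it since the first test. */
	if (ap->backing != NULL) {
		free(ap->backing);	/* the sketch's "swap_phys_free" */
		ap->backing = NULL;
	}
	pthread_mutex_unlock(&ap->lock);
}

int
main(void)
{
	struct anon_like ap = { PTHREAD_MUTEX_INITIALIZER, malloc(64) };

	release_backing(&ap);	/* frees the slot */
	release_backing(&ap);	/* second call takes the early return */
	printf("backing is %s\n", ap.backing == NULL ? "gone" : "present");
	return (0);
}
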
+
+/*
* Decrement the reference count of an anon page.
* If reference count goes to zero, free it and
* its associated page (if any).
@@ -3154,7 +3203,7 @@ anon_shmap_free_pages(struct anon_map *amp, ulong_t sidx, size_t len)
ulong_t sidx_aligned;
ulong_t eidx_aligned;
- ASSERT(RW_WRITE_HELD(&amp->a_rwlock));
+ ASSERT(ANON_WRITE_HELD(&amp->a_rwlock));
ASSERT(amp->refcnt <= 1);
ASSERT(amp->a_szc > 0);
ASSERT(eidx <= ahp->size);
@@ -3205,6 +3254,53 @@ anon_shmap_free_pages(struct anon_map *amp, ulong_t sidx, size_t len)
}
/*
+ * This routine should be called with amp's writer lock when there're no other
+ * users of amp. All pcache entries of this amp must have been already
+ * inactivated. We must not drop a_rwlock here to prevent new users from
+ * attaching to this amp.
+ */
+void
+anonmap_purge(struct anon_map *amp)
+{
+ ASSERT(ANON_WRITE_HELD(&amp->a_rwlock));
+ ASSERT(amp->refcnt <= 1);
+
+ if (amp->a_softlockcnt != 0) {
+ seg_ppurge(NULL, amp, 0);
+ }
+
+ /*
+ * Since all pcache entries were already inactive before this routine
+ * was called seg_ppurge() couldn't return while there're still
+ * entries that can be found via the list anchored at a_phead. So we
+ * can assert this list is empty now. a_softlockcnt may be still non 0
+ * if asynchronous thread that manages pcache already removed pcache
+ * entries but hasn't unlocked the pages yet. If a_softlockcnt is non
+ * 0 we just wait on a_purgecv for shamp_reclaim() to finish. Even if
+ * a_softlockcnt is 0 we grab a_purgemtx to avoid freeing anon map
+ * before shamp_reclaim() is done with it. a_purgemtx also taken by
+ * shamp_reclaim() while a_softlockcnt was still not 0 acts as a
+ * barrier that prevents anonmap_purge() from completing while
+ * shamp_reclaim() may still be referencing this amp.
+ */
+ ASSERT(amp->a_phead.p_lnext == &amp->a_phead);
+ ASSERT(amp->a_phead.p_lprev == &amp->a_phead);
+
+ mutex_enter(&amp->a_purgemtx);
+ while (amp->a_softlockcnt != 0) {
+ ASSERT(amp->a_phead.p_lnext == &amp->a_phead);
+ ASSERT(amp->a_phead.p_lprev == &amp->a_phead);
+ amp->a_purgewait = 1;
+ cv_wait(&amp->a_purgecv, &amp->a_purgemtx);
+ }
+ mutex_exit(&amp->a_purgemtx);
+
+ ASSERT(amp->a_phead.p_lnext == &amp->a_phead);
+ ASSERT(amp->a_phead.p_lprev == &amp->a_phead);
+ ASSERT(amp->a_softlockcnt == 0);
+}
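
anonmap_purge() sleeps on a_purgecv until the asynchronous shamp_reclaim() drops a_softlockcnt to zero, and the reclaim side broadcasts only when a_purgewait says a purger is actually waiting. A compact pthread sketch of that handshake; the variable names are modeled on the kernel fields but the code is only an illustration.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t purgemtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t purgecv = PTHREAD_COND_INITIALIZER;
static long softlockcnt = 1;
static int purgewait;

/* Async reclaim: drop the count and wake a purger if one announced itself. */
static void *
reclaim(void *arg)
{
	(void) arg;
	usleep(10000);
	pthread_mutex_lock(&purgemtx);
	if (--softlockcnt == 0 && purgewait) {
		purgewait = 0;
		pthread_cond_broadcast(&purgecv);
	}
	pthread_mutex_unlock(&purgemtx);
	return (NULL);
}

/* Purge: block until every outstanding locked page has been released. */
static void
purge_wait(void)
{
	pthread_mutex_lock(&purgemtx);
	while (softlockcnt != 0) {
		purgewait = 1;
		pthread_cond_wait(&purgecv, &purgemtx);
	}
	pthread_mutex_unlock(&purgemtx);
}

int
main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, reclaim, NULL);
	purge_wait();
	pthread_join(t, NULL);
	printf("safe to free the anon map now\n");
	return (0);
}
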
+
+/*
* Allocate and initialize an anon_map structure for seg
* associating the given swap reservation with the new anon_map.
*/
@@ -3232,14 +3328,22 @@ anonmap_alloc(size_t size, size_t swresv, int flags)
amp->locality = 0;
amp->a_szc = 0;
amp->a_sp = NULL;
+ amp->a_softlockcnt = 0;
+ amp->a_purgewait = 0;
+ amp->a_phead.p_lnext = &amp->a_phead;
+ amp->a_phead.p_lprev = &amp->a_phead;
+
return (amp);
}
void
anonmap_free(struct anon_map *amp)
{
- ASSERT(amp->ahp);
+ ASSERT(amp->ahp != NULL);
ASSERT(amp->refcnt == 0);
+ ASSERT(amp->a_softlockcnt == 0);
+ ASSERT(amp->a_phead.p_lnext == &amp->a_phead);
+ ASSERT(amp->a_phead.p_lprev == &amp->a_phead);
lgrp_shm_policy_fini(amp, NULL);
anon_release(amp->ahp, btopr(amp->size));
diff --git a/usr/src/uts/common/vm/vm_as.c b/usr/src/uts/common/vm/vm_as.c
index 5a34aa2803..31fb56aa41 100644
--- a/usr/src/uts/common/vm/vm_as.c
+++ b/usr/src/uts/common/vm/vm_as.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -715,12 +715,13 @@ top:
int err;
next = AS_SEGNEXT(as, seg);
+retry:
err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
if (err == EAGAIN) {
mutex_enter(&as->a_contents);
if (as->a_callbacks) {
AS_LOCK_EXIT(as, &as->a_lock);
- } else {
+ } else if (!AS_ISNOUNMAPWAIT(as)) {
/*
* Memory is currently locked. Wait for a
* cv_signal that it has been unlocked, then
@@ -732,6 +733,20 @@ top:
AS_LOCK_EXIT(as, &as->a_lock);
while (AS_ISUNMAPWAIT(as))
cv_wait(&as->a_cv, &as->a_contents);
+ } else {
+ /*
+ * We may have raced with
+ * segvn_reclaim()/segspt_reclaim(). In this
+ * case clear the nounmapwait flag and retry since
+ * softlockcnt in this segment may be already
+ * 0. We don't drop as writer lock so our
+ * number of retries without sleeping should
+ * be very small. See segvn_reclaim() for
+ * more comments.
+ */
+ AS_CLRNOUNMAPWAIT(as);
+ mutex_exit(&as->a_contents);
+ goto retry;
}
mutex_exit(&as->a_contents);
goto top;
@@ -1193,6 +1208,7 @@ setprot_top:
ssize = seg->s_base + seg->s_size - raddr;
else
ssize = rsize;
+retry:
error = SEGOP_SETPROT(seg, raddr, ssize, prot);
if (error == IE_NOMEM) {
@@ -1254,13 +1270,27 @@ setprot_top:
seg->s_base, seg->s_size))) {
AS_LOCK_EXIT(as, &as->a_lock);
as_execute_callback(as, cb, AS_SETPROT_EVENT);
- } else {
+ } else if (!AS_ISNOUNMAPWAIT(as)) {
if (AS_ISUNMAPWAIT(as) == 0)
cv_broadcast(&as->a_cv);
AS_SETUNMAPWAIT(as);
AS_LOCK_EXIT(as, &as->a_lock);
while (AS_ISUNMAPWAIT(as))
cv_wait(&as->a_cv, &as->a_contents);
+ } else {
+ /*
+ * We may have raced with
+ * segvn_reclaim()/segspt_reclaim(). In this
+ * case clear the nounmapwait flag and retry since
+ * softlockcnt in this segment may be already
+ * 0. We don't drop as writer lock so our
+ * number of retries without sleeping should
+ * be very small. See segvn_reclaim() for
+ * more comments.
+ */
+ AS_CLRNOUNMAPWAIT(as);
+ mutex_exit(&as->a_contents);
+ goto retry;
}
mutex_exit(&as->a_contents);
goto setprot_top;
@@ -1385,6 +1415,7 @@ top:
*/
seg_next = AS_SEGNEXT(as, seg);
+retry:
err = SEGOP_UNMAP(seg, raddr, ssize);
if (err == EAGAIN) {
/*
@@ -1419,25 +1450,37 @@ top:
* either there were no callbacks for this event
* or they were already in progress.
*/
- as_setwatch(as);
mutex_enter(&as->a_contents);
if (as->a_callbacks &&
(cb = as_find_callback(as, AS_UNMAP_EVENT,
seg->s_base, seg->s_size))) {
AS_LOCK_EXIT(as, &as->a_lock);
as_execute_callback(as, cb, AS_UNMAP_EVENT);
- } else {
+ } else if (!AS_ISNOUNMAPWAIT(as)) {
if (AS_ISUNMAPWAIT(as) == 0)
cv_broadcast(&as->a_cv);
AS_SETUNMAPWAIT(as);
AS_LOCK_EXIT(as, &as->a_lock);
while (AS_ISUNMAPWAIT(as))
cv_wait(&as->a_cv, &as->a_contents);
+ } else {
+ /*
+ * We may have raced with
+ * segvn_reclaim()/segspt_reclaim(). In this
+ * case clear the nounmapwait flag and retry since
+ * softlockcnt in this segment may be already
+ * 0. We don't drop as writer lock so our
+ * number of retries without sleeping should
+ * be very small. See segvn_reclaim() for
+ * more comments.
+ */
+ AS_CLRNOUNMAPWAIT(as);
+ mutex_exit(&as->a_contents);
+ goto retry;
}
mutex_exit(&as->a_contents);
goto top;
} else if (err == IE_RETRY) {
- as_setwatch(as);
AS_LOCK_EXIT(as, &as->a_lock);
goto top;
} else if (err) {
@@ -2539,6 +2582,167 @@ fc_decode(faultcode_t fault_err)
}
/*
+ * Pagelock pages from a range that spans more than 1 segment. Obtain shadow
+ * lists from each segment and copy them to one contiguous shadow list (plist)
+ * as expected by the caller. Save pointers to per segment shadow lists at
+ * the tail of plist so that they can be used during as_pageunlock().
+ */
+static int
+as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp,
+ caddr_t addr, size_t size, enum seg_rw rw)
+{
+ caddr_t sv_addr = addr;
+ size_t sv_size = size;
+ struct seg *sv_seg = seg;
+ ulong_t segcnt = 1;
+ ulong_t cnt;
+ size_t ssize;
+ pgcnt_t npages = btop(size);
+ page_t **plist;
+ page_t **pl;
+ int error;
+ caddr_t eaddr;
+ faultcode_t fault_err = 0;
+ pgcnt_t pl_off;
+ extern struct seg_ops segspt_shmops;
+
+ ASSERT(AS_LOCK_HELD(as, &as->a_lock));
+ ASSERT(seg != NULL);
+ ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
+ ASSERT(addr + size > seg->s_base + seg->s_size);
+ ASSERT(IS_P2ALIGNED(size, PAGESIZE));
+ ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
+
+ /*
+ * Count the number of segments covered by the range we are about to
+ * lock. The segment count is used to size the shadow list we return
+ * to the caller.
+ */
+ for (; size != 0; size -= ssize, addr += ssize) {
+ if (addr >= seg->s_base + seg->s_size) {
+
+ seg = AS_SEGNEXT(as, seg);
+ if (seg == NULL || addr != seg->s_base) {
+ AS_LOCK_EXIT(as, &as->a_lock);
+ return (EFAULT);
+ }
+ /*
+ * Do a quick check if subsequent segments
+ * will most likely support pagelock.
+ */
+ if (seg->s_ops == &segvn_ops) {
+ vnode_t *vp;
+
+ if (SEGOP_GETVP(seg, addr, &vp) != 0 ||
+ vp != NULL) {
+ AS_LOCK_EXIT(as, &as->a_lock);
+ goto slow;
+ }
+ } else if (seg->s_ops != &segspt_shmops) {
+ AS_LOCK_EXIT(as, &as->a_lock);
+ goto slow;
+ }
+ segcnt++;
+ }
+ if (addr + size > seg->s_base + seg->s_size) {
+ ssize = seg->s_base + seg->s_size - addr;
+ } else {
+ ssize = size;
+ }
+ }
+ ASSERT(segcnt > 1);
+
+ plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);
+
+ addr = sv_addr;
+ size = sv_size;
+ seg = sv_seg;
+
+ for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) {
+ if (addr >= seg->s_base + seg->s_size) {
+ seg = AS_SEGNEXT(as, seg);
+ ASSERT(seg != NULL && addr == seg->s_base);
+ cnt++;
+ ASSERT(cnt < segcnt);
+ }
+ if (addr + size > seg->s_base + seg->s_size) {
+ ssize = seg->s_base + seg->s_size - addr;
+ } else {
+ ssize = size;
+ }
+ pl = &plist[npages + cnt];
+ error = SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
+ L_PAGELOCK, rw);
+ if (error) {
+ break;
+ }
+ ASSERT(plist[npages + cnt] != NULL);
+ ASSERT(pl_off + btop(ssize) <= npages);
+ bcopy(plist[npages + cnt], &plist[pl_off],
+ btop(ssize) * sizeof (page_t *));
+ pl_off += btop(ssize);
+ }
+
+ if (size == 0) {
+ AS_LOCK_EXIT(as, &as->a_lock);
+ ASSERT(cnt == segcnt - 1);
+ *ppp = plist;
+ return (0);
+ }
+
+ /*
+ * One of the pagelock calls failed. The error type is in the error
+ * variable. Unlock what we've locked so far and retry with F_SOFTLOCK
+ * if the error type is either EFAULT or ENOTSUP. Otherwise just return
+ * the error to the caller.
+ */
+
+ eaddr = addr;
+ seg = sv_seg;
+
+ for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) {
+ if (addr >= seg->s_base + seg->s_size) {
+ seg = AS_SEGNEXT(as, seg);
+ ASSERT(seg != NULL && addr == seg->s_base);
+ cnt++;
+ ASSERT(cnt < segcnt);
+ }
+ if (eaddr > seg->s_base + seg->s_size) {
+ ssize = seg->s_base + seg->s_size - addr;
+ } else {
+ ssize = eaddr - addr;
+ }
+ pl = &plist[npages + cnt];
+ ASSERT(*pl != NULL);
+ (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
+ L_PAGEUNLOCK, rw);
+ }
+
+ AS_LOCK_EXIT(as, &as->a_lock);
+
+ kmem_free(plist, (npages + segcnt) * sizeof (page_t *));
+
+ if (error != ENOTSUP && error != EFAULT) {
+ return (error);
+ }
+
+slow:
+ /*
+ * If we are here because pagelock failed due to the need to cow fault
+	 * in the pages we want to lock, F_SOFTLOCK will do this job, and in
+	 * the next as_pagelock() call for this address range pagelock will
+ * hopefully succeed.
+ */
+ fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw);
+ if (fault_err != 0) {
+ return (fc_decode(fault_err));
+ }
+ *ppp = NULL;
+
+ return (0);
+}
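
An illustrative user-level sketch of the shadow-list layout built by as_pagelock_segs(), using hypothetical page counts: one array with room for npages merged page pointers, followed by segcnt slots that save each per-segment shadow list pointer for use at unlock time.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct page { int id; } page_t;	/* stand-in for the kernel page_t */

int
main(void)
{
	size_t segnpgs[] = { 4, 2, 3 };		/* 3 hypothetical segments */
	size_t segcnt = 3, npages = 9, i, s, off;
	page_t pages[9];
	page_t *seglist[3][4];			/* per-segment shadow lists */
	page_t **plist;

	for (i = 0; i < npages; i++)
		pages[i].id = (int)i;
	for (s = 0, i = 0; s < segcnt; s++)
		for (off = 0; off < segnpgs[s]; off++)
			seglist[s][off] = &pages[i++];

	/* npages merged entries up front, segcnt saved list pointers behind */
	plist = calloc(npages + segcnt, sizeof (page_t *));
	for (s = 0, off = 0; s < segcnt; s++) {
		plist[npages + s] = (page_t *)seglist[s]; /* saved for unlock */
		memcpy(&plist[off], seglist[s],
		    segnpgs[s] * sizeof (page_t *));
		off += segnpgs[s];
	}

	for (i = 0; i < npages; i++)
		printf("plist[%zu] -> page %d\n", i, plist[i]->id);
	printf("%zu per-segment list pointers saved at the tail\n", segcnt);
	free(plist);
	return (0);
}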
+
+/*
* lock pages in a given address space. Return shadow list. If
* the list is NULL, the MMU mapping is also locked.
*/
@@ -2547,12 +2751,10 @@ as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
size_t size, enum seg_rw rw)
{
size_t rsize;
- caddr_t base;
caddr_t raddr;
faultcode_t fault_err;
struct seg *seg;
- int res;
- int prefaulted = 0;
+ int err;
TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START,
"as_pagelock_start: addr %p size %ld", addr, size);
@@ -2560,17 +2762,25 @@ as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
(size_t)raddr;
-top:
+
/*
* if the request crosses two segments let
* as_fault handle it.
*/
AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
- seg = as_findseg(as, addr, 0);
- if ((seg == NULL) || ((base = seg->s_base) > addr) ||
- (addr + size) > base + seg->s_size) {
+
+ seg = as_segat(as, raddr);
+ if (seg == NULL) {
+ AS_LOCK_EXIT(as, &as->a_lock);
+ return (EFAULT);
+ }
+ ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
+ if (raddr + rsize > seg->s_base + seg->s_size) {
+ return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
+ }
+ if (raddr + rsize <= raddr) {
AS_LOCK_EXIT(as, &as->a_lock);
- goto slow;
+ return (EFAULT);
}
TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START,
@@ -2579,46 +2789,22 @@ top:
/*
* try to lock pages and pass back shadow list
*/
- res = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw);
+ err = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw);
TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end");
- AS_LOCK_EXIT(as, &as->a_lock);
- if (res == 0) {
- return (0);
- } else if (res == ENOTSUP || prefaulted) {
- /*
- * (1) segment driver doesn't support PAGELOCK fastpath, or
- * (2) we've already tried fast path unsuccessfully after
- * faulting in the addr range below; system might be
- * thrashing or there may not be enough availrmem.
- */
- goto slow;
- }
- TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_FAULT_START,
- "as_fault_start: addr %p size %ld", addr, size);
+ AS_LOCK_EXIT(as, &as->a_lock);
- /*
- * we might get here because of some COW fault or non
- * existing page. Let as_fault deal with it. Just load
- * the page, don't lock the MMU mapping.
- */
- fault_err = as_fault(as->a_hat, as, addr, size, F_INVAL, rw);
- if (fault_err != 0) {
- return (fc_decode(fault_err));
+ if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
+ return (err);
}
- prefaulted = 1;
-
/*
- * try fast path again; since we've dropped a_lock,
- * we need to try the dance from the start to see if
- * the addr range is still valid.
- */
- goto top;
-slow:
- /*
- * load the page and lock the MMU mapping.
+ * Use F_SOFTLOCK to lock the pages because pagelock failed either due
+	 * to no pagelock support for this segment or because pages need to be
+	 * cow faulted in. If a fault is needed F_SOFTLOCK will do this job for
+	 * this as_pagelock() call and in the next as_pagelock() call for the
+	 * same address range the pagelock call will hopefully succeed.
*/
fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
if (fault_err != 0) {
@@ -2631,6 +2817,52 @@ slow:
}
/*
+ * unlock pages locked by as_pagelock_segs(). Retrieve per segment shadow
+ * lists from the end of plist and call pageunlock interface for each segment.
+ * Drop as lock and free plist.
+ */
+static void
+as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size,
+ struct page **plist, enum seg_rw rw)
+{
+ ulong_t cnt;
+ caddr_t eaddr = addr + size;
+ pgcnt_t npages = btop(size);
+ size_t ssize;
+ page_t **pl;
+
+ ASSERT(AS_LOCK_HELD(as, &as->a_lock));
+ ASSERT(seg != NULL);
+ ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
+ ASSERT(addr + size > seg->s_base + seg->s_size);
+ ASSERT(IS_P2ALIGNED(size, PAGESIZE));
+ ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
+ ASSERT(plist != NULL);
+
+ for (cnt = 0; addr < eaddr; addr += ssize) {
+ if (addr >= seg->s_base + seg->s_size) {
+ seg = AS_SEGNEXT(as, seg);
+ ASSERT(seg != NULL && addr == seg->s_base);
+ cnt++;
+ }
+ if (eaddr > seg->s_base + seg->s_size) {
+ ssize = seg->s_base + seg->s_size - addr;
+ } else {
+ ssize = eaddr - addr;
+ }
+ pl = &plist[npages + cnt];
+ ASSERT(*pl != NULL);
+ (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
+ L_PAGEUNLOCK, rw);
+ }
+ ASSERT(cnt > 0);
+ AS_LOCK_EXIT(as, &as->a_lock);
+
+ cnt++;
+ kmem_free(plist, (npages + cnt) * sizeof (page_t *));
+}
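
A small sketch of the per-iteration clamping both walkers above rely on, with made-up segment bounds: each step covers the remainder of the current segment or the remainder of the request, whichever ends first.

#include <stdio.h>

typedef struct seg { unsigned long s_base, s_size; } seg_t;

int
main(void)
{
	/* two hypothetical adjacent segments: [0x1000,0x4000) [0x4000,0x9000) */
	seg_t segs[] = { { 0x1000, 0x3000 }, { 0x4000, 0x5000 } };
	unsigned long addr = 0x2000, eaddr = 0x6000, ssize;
	int i = 0;

	for (; addr < eaddr; addr += ssize) {
		if (addr >= segs[i].s_base + segs[i].s_size)
			i++;		/* crossed into the next segment */
		if (eaddr > segs[i].s_base + segs[i].s_size)
			ssize = segs[i].s_base + segs[i].s_size - addr;
		else
			ssize = eaddr - addr;
		printf("seg %d: operate on [%#lx, %#lx)\n",
		    i, addr, addr + ssize);
	}
	return (0);
}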
+
+/*
* unlock pages in a given address range
*/
void
@@ -2652,44 +2884,29 @@ as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
(void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
return;
}
+
raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
(size_t)raddr;
+
AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
- seg = as_findseg(as, addr, 0);
- ASSERT(seg);
+ seg = as_segat(as, raddr);
+ ASSERT(seg != NULL);
+
TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START,
"seg_unlock_start: raddr %p rsize %ld", raddr, rsize);
- SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
+
+ ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
+ if (raddr + rsize <= seg->s_base + seg->s_size) {
+ SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
+ } else {
+ as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
+ return;
+ }
AS_LOCK_EXIT(as, &as->a_lock);
TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end");
}
-/*
- * reclaim cached pages in a given address range
- */
-void
-as_pagereclaim(struct as *as, struct page **pp, caddr_t addr,
- size_t size, enum seg_rw rw)
-{
- struct seg *seg;
- size_t rsize;
- caddr_t raddr;
-
- ASSERT(AS_READ_HELD(as, &as->a_lock));
- ASSERT(pp != NULL);
-
- raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
- rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
- (size_t)raddr;
- seg = as_findseg(as, addr, 0);
- ASSERT(seg);
- SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGERECLAIM, rw);
-}
-
-#define MAXPAGEFLIP 4
-#define MAXPAGEFLIPSIZ MAXPAGEFLIP*PAGESIZE
-
int
as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
boolean_t wait)
@@ -2735,6 +2952,7 @@ setpgsz_top:
ssize = rsize;
}
+retry:
error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
if (error == IE_NOMEM) {
@@ -2778,13 +2996,29 @@ setpgsz_top:
* as_unmap, as_setprot or as_free would do.
*/
mutex_enter(&as->a_contents);
- if (AS_ISUNMAPWAIT(as) == 0) {
- cv_broadcast(&as->a_cv);
- }
- AS_SETUNMAPWAIT(as);
- AS_LOCK_EXIT(as, &as->a_lock);
- while (AS_ISUNMAPWAIT(as)) {
- cv_wait(&as->a_cv, &as->a_contents);
+ if (!AS_ISNOUNMAPWAIT(as)) {
+ if (AS_ISUNMAPWAIT(as) == 0) {
+ cv_broadcast(&as->a_cv);
+ }
+ AS_SETUNMAPWAIT(as);
+ AS_LOCK_EXIT(as, &as->a_lock);
+ while (AS_ISUNMAPWAIT(as)) {
+ cv_wait(&as->a_cv, &as->a_contents);
+ }
+ } else {
+ /*
+ * We may have raced with
+ * segvn_reclaim()/segspt_reclaim(). In this
+			 * case clear the nounmapwait flag and retry since
+			 * softlockcnt in this segment may already be
+			 * 0. We don't drop the as writer lock so our
+ * number of retries without sleeping should
+ * be very small. See segvn_reclaim() for
+ * more comments.
+ */
+ AS_CLRNOUNMAPWAIT(as);
+ mutex_exit(&as->a_contents);
+ goto retry;
}
mutex_exit(&as->a_contents);
goto setpgsz_top;
@@ -2809,6 +3043,8 @@ as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
size_t ssize;
int error;
+ ASSERT(AS_WRITE_HELD(as, &as->a_lock));
+
seg = as_segat(as, raddr);
if (seg == NULL) {
panic("as_iset3_default_lpsize: no seg");
@@ -2864,6 +3100,8 @@ as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc,
int error;
int retry;
+ ASSERT(AS_WRITE_HELD(as, &as->a_lock));
+
for (;;) {
error = as_iset3_default_lpsize(as, addr, size, szc, &retry);
if (error == EINVAL && retry) {
@@ -3150,16 +3388,30 @@ again:
error = EINVAL;
} else if (error == EAGAIN) {
mutex_enter(&as->a_contents);
- if (AS_ISUNMAPWAIT(as) == 0) {
- cv_broadcast(&as->a_cv);
- }
- AS_SETUNMAPWAIT(as);
- AS_LOCK_EXIT(as, &as->a_lock);
- while (AS_ISUNMAPWAIT(as)) {
- cv_wait(&as->a_cv, &as->a_contents);
+ if (!AS_ISNOUNMAPWAIT(as)) {
+ if (AS_ISUNMAPWAIT(as) == 0) {
+ cv_broadcast(&as->a_cv);
+ }
+ AS_SETUNMAPWAIT(as);
+ AS_LOCK_EXIT(as, &as->a_lock);
+ while (AS_ISUNMAPWAIT(as)) {
+ cv_wait(&as->a_cv, &as->a_contents);
+ }
+ mutex_exit(&as->a_contents);
+ AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
+ } else {
+ /*
+ * We may have raced with
+ * segvn_reclaim()/segspt_reclaim(). In this case
+			 * clear the nounmapwait flag and retry since softlockcnt
+			 * in this segment may already be 0. We don't drop the as
+ * writer lock so our number of retries without
+ * sleeping should be very small. See segvn_reclaim()
+ * for more comments.
+ */
+ AS_CLRNOUNMAPWAIT(as);
+ mutex_exit(&as->a_contents);
}
- mutex_exit(&as->a_contents);
- AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
goto again;
}
diff --git a/usr/src/uts/common/vm/vm_page.c b/usr/src/uts/common/vm/vm_page.c
index 070f60b3e6..0890319e28 100644
--- a/usr/src/uts/common/vm/vm_page.c
+++ b/usr/src/uts/common/vm/vm_page.c
@@ -106,9 +106,6 @@ pgcnt_t availrmem_initial;
* These new counters will track the pages locked through segvn and
* by explicit user locking.
*
- * segvn_pages_locked : This keeps track on a global basis how many pages
- * are currently locked because of I/O.
- *
* pages_locked : How many pages are locked because of user specified
* locking through mlock or plock.
*
@@ -117,10 +114,9 @@ pgcnt_t availrmem_initial;
*
* All these globals are protected by the same lock which protects availrmem.
*/
-pgcnt_t segvn_pages_locked;
-pgcnt_t pages_locked;
-pgcnt_t pages_useclaim;
-pgcnt_t pages_claimed;
+pgcnt_t pages_locked = 0;
+pgcnt_t pages_useclaim = 0;
+pgcnt_t pages_claimed = 0;
/*
@@ -5878,7 +5874,6 @@ page_reclaim_mem(pgcnt_t npages, pgcnt_t epages, int adjust)
deficit = tune.t_minarmem + npages + epages - availrmem;
mutex_exit(&freemem_lock);
page_needfree(deficit);
- seg_preap();
kmem_reap();
delay(hz);
page_needfree(-(spgcnt_t)deficit);
@@ -6285,7 +6280,7 @@ kcondvar_t pc_cv;
static kmutex_t pc_thread_mutex;
static clock_t pc_thread_shortwait;
static clock_t pc_thread_longwait;
-static int pc_thread_ism_retry;
+static int pc_thread_retry;
struct page_capture_callback pc_cb[PC_NUM_CALLBACKS];
@@ -6782,17 +6777,13 @@ page_capture_pre_checks(page_t *pp, uint_t flags)
ASSERT(pp != NULL);
- /* only physmem currently has restrictions */
- if (!(flags & CAPTURE_PHYSMEM)) {
- return (0);
- }
-
#if defined(__sparc)
if (pp->p_vnode == &prom_ppages) {
return (EPERM);
}
- if (PP_ISNORELOC(pp) && !(flags & CAPTURE_GET_CAGE)) {
+ if (PP_ISNORELOC(pp) && !(flags & CAPTURE_GET_CAGE) &&
+ (flags & CAPTURE_PHYSMEM)) {
return (ENOENT);
}
@@ -6805,6 +6796,11 @@ page_capture_pre_checks(page_t *pp, uint_t flags)
}
#endif /* __sparc */
+ /* only physmem currently has the restrictions checked below */
+ if (!(flags & CAPTURE_PHYSMEM)) {
+ return (0);
+ }
+
if (availrmem < swapfs_minfree) {
/*
* We won't try to capture this page as we are
@@ -7187,7 +7183,7 @@ page_capture_init()
pc_thread_shortwait = 23 * hz;
pc_thread_longwait = 1201 * hz;
- pc_thread_ism_retry = 3;
+ pc_thread_retry = 3;
mutex_init(&pc_thread_mutex, NULL, MUTEX_DEFAULT, NULL);
cv_init(&pc_cv, NULL, CV_DEFAULT, NULL);
pc_thread_id = thread_create(NULL, 0, page_capture_thread, NULL, 0, &p0,
@@ -7358,7 +7354,6 @@ do_aio_cleanup(void)
static void
page_capture_handle_outstanding(void)
{
- extern size_t spt_used;
int ntry;
if (!page_retire_pend_count()) {
@@ -7380,34 +7375,23 @@ page_capture_handle_outstanding(void)
* we reap prior to attempting to capture.
*/
kmem_reap();
- /*
- * When ISM is in use, we need to disable and
- * purge the seg_pcache, and initiate aio
- * cleanup in order to release page locks and
- * subsquently retire pages in need of
- * retirement.
- */
- if (spt_used) {
- /* disable and purge seg_pcache */
- (void) seg_p_disable();
- for (ntry = 0; ntry < pc_thread_ism_retry; ntry++) {
- if (!page_retire_pend_count())
- break;
- if (do_aio_cleanup()) {
- /*
- * allow the apps cleanup threads
- * to run
- */
- delay(pc_thread_shortwait);
- }
- page_capture_async();
+
+ /* disable and purge seg_pcache */
+ (void) seg_p_disable();
+ for (ntry = 0; ntry < pc_thread_retry; ntry++) {
+ if (!page_retire_pend_count())
+ break;
+ if (do_aio_cleanup()) {
+ /*
+ * allow the apps cleanup threads
+ * to run
+ */
+ delay(pc_thread_shortwait);
}
- /* reenable seg_pcache */
- seg_p_enable();
- } else {
- seg_preap();
page_capture_async();
}
+ /* reenable seg_pcache */
+ seg_p_enable();
}
}
diff --git a/usr/src/uts/common/vm/vm_seg.c b/usr/src/uts/common/vm/vm_seg.c
index 0023f2cc58..2791f7b29b 100644
--- a/usr/src/uts/common/vm/vm_seg.c
+++ b/usr/src/uts/common/vm/vm_seg.c
@@ -48,8 +48,11 @@
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kmem.h>
+#include <sys/sysmacros.h>
#include <sys/vmsystm.h>
+#include <sys/tuneable.h>
#include <sys/debug.h>
+#include <sys/fs/swapnode.h>
#include <sys/cmn_err.h>
#include <sys/callb.h>
#include <sys/mem_config.h>
@@ -61,6 +64,8 @@
#include <vm/seg_kmem.h>
#include <vm/seg_spt.h>
#include <vm/seg_vn.h>
+#include <vm/anon.h>
+
/*
* kstats for segment advise
*/
@@ -72,472 +77,1188 @@ segadvstat_t segadvstat = {
kstat_named_t *segadvstat_ptr = (kstat_named_t *)&segadvstat;
uint_t segadvstat_ndata = sizeof (segadvstat) / sizeof (kstat_named_t);
-/* #define PDEBUG */
-#if defined(PDEBUG) || defined(lint) || defined(__lint)
-int pdebug = 0;
-#else
-#define pdebug 0
-#endif /* PDEBUG */
-
-#define PPRINTF if (pdebug) printf
-#define PPRINT(x) PPRINTF(x)
-#define PPRINT1(x, a) PPRINTF(x, a)
-#define PPRINT2(x, a, b) PPRINTF(x, a, b)
-#define PPRINT3(x, a, b, c) PPRINTF(x, a, b, c)
-#define PPRINT4(x, a, b, c, d) PPRINTF(x, a, b, c, d)
-#define PPRINT5(x, a, b, c, d, e) PPRINTF(x, a, b, c, d, e)
-
-#define P_HASHMASK (p_hashsize - 1)
-#define P_BASESHIFT 6
-
/*
* entry in the segment page cache
*/
struct seg_pcache {
- struct seg_pcache *p_hnext; /* list for hashed blocks */
- struct seg_pcache *p_hprev;
- int p_active; /* active count */
- int p_ref; /* ref bit */
- size_t p_len; /* segment length */
- caddr_t p_addr; /* base address */
- struct seg *p_seg; /* segment */
- struct page **p_pp; /* pp shadow list */
- enum seg_rw p_rw; /* rw */
- uint_t p_flags; /* bit flags */
- int (*p_callback)(struct seg *, caddr_t, size_t,
- struct page **, enum seg_rw);
+ struct seg_pcache *p_hnext; /* list for hashed blocks */
+ struct seg_pcache *p_hprev;
+ pcache_link_t p_plink; /* per segment/amp list */
+ void *p_htag0; /* segment/amp pointer */
+ caddr_t p_addr; /* base address/anon_idx */
+ size_t p_len; /* total bytes */
+	size_t		p_wlen;		/* writable bytes at p_addr */
+ struct page **p_pp; /* pp shadow list */
+ seg_preclaim_cbfunc_t p_callback; /* reclaim callback function */
+ clock_t p_lbolt; /* lbolt from last use */
+ struct seg_phash *p_hashp; /* our pcache hash bucket */
+ uint_t p_active; /* active count */
+ uchar_t p_write; /* true if S_WRITE */
+ uchar_t p_ref; /* reference byte */
+ ushort_t p_flags; /* bit flags */
};
struct seg_phash {
- struct seg_pcache *p_hnext; /* list for hashed blocks */
- struct seg_pcache *p_hprev;
- int p_qlen; /* Q length */
- kmutex_t p_hmutex; /* protects hash bucket */
+ struct seg_pcache *p_hnext; /* list for hashed blocks */
+ struct seg_pcache *p_hprev;
+ kmutex_t p_hmutex; /* protects hash bucket */
+ pcache_link_t p_halink[2]; /* active bucket linkages */
+};
+
+struct seg_phash_wired {
+ struct seg_pcache *p_hnext; /* list for hashed blocks */
+ struct seg_pcache *p_hprev;
+ kmutex_t p_hmutex; /* protects hash bucket */
};
-static int seg_preap_time = 20; /* reclaim every 20 secs */
-static int seg_pmaxqlen = 5; /* max Q length in hash list */
-static int seg_ppcount = 5; /* max # of purges per reclaim interval */
-static int seg_plazy = 1; /* if 1, pages are cached after pageunlock */
-static pgcnt_t seg_pwindow; /* max # of pages that can be cached */
-static pgcnt_t seg_plocked; /* # of pages which are cached by pagelock */
-static pgcnt_t seg_plocked_window; /* # pages from window */
-int seg_preapahead;
+/*
+ * A parameter to control a maximum number of bytes that can be
+ * purged from pcache at a time.
+ */
+#define P_MAX_APURGE_BYTES (1024 * 1024 * 1024)
+
+/*
+ * log2(fraction of pcache to reclaim at a time).
+ */
+#define P_SHRINK_SHFT (5)
+
+/*
+ * The following variables can be tuned via /etc/system.
+ */
+
+int segpcache_enabled = 1; /* if 1, shadow lists are cached */
+pgcnt_t segpcache_maxwindow = 0; /* max # of pages that can be cached */
+ulong_t segpcache_hashsize_win = 0; /* # of non wired buckets */
+ulong_t segpcache_hashsize_wired = 0; /* # of wired buckets */
+int segpcache_reap_sec = 1; /* reap check rate in secs */
+clock_t segpcache_reap_ticks = 0; /* reap interval in ticks */
+int segpcache_pcp_maxage_sec = 1; /* pcp max age in secs */
+clock_t segpcache_pcp_maxage_ticks = 0; /* pcp max age in ticks */
+int segpcache_shrink_shift = P_SHRINK_SHFT; /* log2 reap fraction */
+pgcnt_t segpcache_maxapurge_bytes = P_MAX_APURGE_BYTES; /* max purge bytes */
+
+static kmutex_t seg_pcache_mtx; /* protects seg_pdisabled counter */
+static kmutex_t seg_pasync_mtx; /* protects async thread scheduling */
+static kcondvar_t seg_pasync_cv;
+
+#pragma align 64(pctrl1)
+#pragma align 64(pctrl2)
+#pragma align 64(pctrl3)
+
+/*
+ * Keep frequently used variables together in one cache line.
+ */
+static struct p_ctrl1 {
+ uint_t p_disabled; /* if not 0, caching temporarily off */
+ pgcnt_t p_maxwin; /* max # of pages that can be cached */
+ size_t p_hashwin_sz; /* # of non wired buckets */
+ struct seg_phash *p_htabwin; /* hash table for non wired entries */
+ size_t p_hashwired_sz; /* # of wired buckets */
+ struct seg_phash_wired *p_htabwired; /* hash table for wired entries */
+ kmem_cache_t *p_kmcache; /* kmem cache for seg_pcache structs */
+#ifdef _LP64
+ ulong_t pad[1];
+#endif /* _LP64 */
+} pctrl1;
+
+static struct p_ctrl2 {
+ kmutex_t p_mem_mtx; /* protects window counter and p_halinks */
+ pgcnt_t p_locked_win; /* # pages from window */
+ pgcnt_t p_locked; /* # of pages cached by pagelock */
+ uchar_t p_ahcur; /* current active links for insert/delete */
+ uchar_t p_athr_on; /* async reclaim thread is running. */
+ pcache_link_t p_ahhead[2]; /* active buckets linkages */
+} pctrl2;
+
+static struct p_ctrl3 {
+ clock_t p_pcp_maxage; /* max pcp age in ticks */
+ ulong_t p_athr_empty_ahb; /* athread walk stats */
+ ulong_t p_athr_full_ahb; /* athread walk stats */
+ pgcnt_t p_maxapurge_npages; /* max pages to purge at a time */
+ int p_shrink_shft; /* reap shift factor */
+#ifdef _LP64
+ ulong_t pad[3];
+#endif /* _LP64 */
+} pctrl3;
+
+#define seg_pdisabled pctrl1.p_disabled
+#define seg_pmaxwindow pctrl1.p_maxwin
+#define seg_phashsize_win pctrl1.p_hashwin_sz
+#define seg_phashtab_win pctrl1.p_htabwin
+#define seg_phashsize_wired pctrl1.p_hashwired_sz
+#define seg_phashtab_wired pctrl1.p_htabwired
+#define seg_pkmcache pctrl1.p_kmcache
+#define seg_pmem_mtx pctrl2.p_mem_mtx
+#define seg_plocked_window pctrl2.p_locked_win
+#define seg_plocked pctrl2.p_locked
+#define seg_pahcur pctrl2.p_ahcur
+#define seg_pathr_on pctrl2.p_athr_on
+#define seg_pahhead pctrl2.p_ahhead
+#define seg_pmax_pcpage pctrl3.p_pcp_maxage
+#define seg_pathr_empty_ahb pctrl3.p_athr_empty_ahb
+#define seg_pathr_full_ahb pctrl3.p_athr_full_ahb
+#define seg_pshrink_shift pctrl3.p_shrink_shft
+#define seg_pmaxapurge_npages pctrl3.p_maxapurge_npages
+
+#define P_HASHWIN_MASK (seg_phashsize_win - 1)
+#define P_HASHWIRED_MASK (seg_phashsize_wired - 1)
+#define P_BASESHIFT (6)
+
+kthread_t *seg_pasync_thr;
+
+extern struct seg_ops segvn_ops;
+extern struct seg_ops segspt_shmops;
+
+#define IS_PFLAGS_WIRED(flags) ((flags) & SEGP_FORCE_WIRED)
+#define IS_PCP_WIRED(pcp) IS_PFLAGS_WIRED((pcp)->p_flags)
-static uint_t seg_pdisable = 0; /* if not 0, caching temporarily disabled */
+#define LBOLT_DELTA(t) ((ulong_t)(lbolt - (t)))
-static int seg_pupdate_active = 1; /* background reclaim thread */
-static clock_t seg_preap_interval; /* reap interval in ticks */
+#define PCP_AGE(pcp) LBOLT_DELTA((pcp)->p_lbolt)
-static kmutex_t seg_pcache; /* protects the whole pagelock cache */
-static kmutex_t seg_pmem; /* protects window counter */
-static ksema_t seg_pasync_sem; /* sema for reclaim thread */
-static struct seg_phash *p_hashtab;
-static int p_hashsize = 0;
+/*
+ * htag0 argument can be a seg or amp pointer.
+ */
+#define P_HASHBP(seg, htag0, addr, flags) \
+ (IS_PFLAGS_WIRED((flags)) ? \
+ ((struct seg_phash *)&seg_phashtab_wired[P_HASHWIRED_MASK & \
+ ((uintptr_t)(htag0) >> P_BASESHIFT)]) : \
+ (&seg_phashtab_win[P_HASHWIN_MASK & \
+ (((uintptr_t)(htag0) >> 3) ^ \
+ ((uintptr_t)(addr) >> ((flags & SEGP_PSHIFT) ? \
+ (flags >> 16) : page_get_shift((seg)->s_szc))))]))
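
A user-level sketch of the two bucket-selection policies P_HASHBP encodes, with made-up table sizes and a fixed page shift: wired entries hash on the seg/amp tag alone, while non-wired entries also mix in the address so shadow lists from one segment spread across buckets.

#include <stdio.h>
#include <stdint.h>

#define	WIRED_BUCKETS	64	/* hypothetical seg_phashsize_wired */
#define	WIN_BUCKETS	256	/* hypothetical seg_phashsize_win */
#define	BASESHIFT	6
#define	PAGESHIFT	12	/* assumed base page shift */

static size_t
hash_wired(uintptr_t tag)
{
	return ((WIRED_BUCKETS - 1) & (tag >> BASESHIFT));
}

static size_t
hash_win(uintptr_t tag, uintptr_t addr)
{
	return ((WIN_BUCKETS - 1) & ((tag >> 3) ^ (addr >> PAGESHIFT)));
}

int
main(void)
{
	uintptr_t tag = (uintptr_t)0xdeadbee0;	/* fake seg/amp pointer */
	uintptr_t addr;

	printf("wired bucket: %zu\n", hash_wired(tag));
	for (addr = 0x10000; addr < 0x14000; addr += 0x1000)
		printf("win bucket for addr %#lx: %zu\n",
		    (unsigned long)addr, hash_win(tag, addr));
	return (0);
}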
-#define p_hash(seg) \
- (P_HASHMASK & \
- ((uintptr_t)(seg) >> P_BASESHIFT))
+/*
+ * htag0 argument can be a seg or amp pointer.
+ */
+#define P_MATCH(pcp, htag0, addr, len) \
+ ((pcp)->p_htag0 == (htag0) && \
+ (pcp)->p_addr == (addr) && \
+ (pcp)->p_len >= (len))
-#define p_match(pcp, seg, addr, len, rw) \
- (((pcp)->p_seg == (seg) && \
- (pcp)->p_addr == (addr) && \
- (pcp)->p_rw == (rw) && \
- (pcp)->p_len == (len)) ? 1 : 0)
+#define P_MATCH_PP(pcp, htag0, addr, len, pp) \
+ ((pcp)->p_pp == (pp) && \
+ (pcp)->p_htag0 == (htag0) && \
+ (pcp)->p_addr == (addr) && \
+ (pcp)->p_len >= (len))
-#define p_match_pp(pcp, seg, addr, len, pp, rw) \
- (((pcp)->p_seg == (seg) && \
- (pcp)->p_addr == (addr) && \
- (pcp)->p_pp == (pp) && \
- (pcp)->p_rw == (rw) && \
- (pcp)->p_len == (len)) ? 1 : 0)
+#define plink2pcache(pl) ((struct seg_pcache *)((uintptr_t)(pl) - \
+ offsetof(struct seg_pcache, p_plink)))
+#define hlink2phash(hl, l) ((struct seg_phash *)((uintptr_t)(hl) - \
+ offsetof(struct seg_phash, p_halink[l])))
/*
- * lookup an address range in pagelock cache. Return shadow list
- * and bump up active count.
+ * seg_padd_abuck()/seg_premove_abuck() link and unlink hash buckets from
+ * active hash bucket lists. We maintain active bucket lists to reduce the
+ * overhead of finding active buckets during asynchronous purging since there
+ * can be 10s of millions of buckets on a large system but only a small subset
+ * of them in actual use.
+ *
+ * There are 2 active bucket lists. The current active list (as per
+ * seg_pahcur) is used by seg_pinsert()/seg_pinactive()/seg_ppurge() to add
+ * and delete buckets. The other list is used by the asynchronous purge
+ * thread. This allows the purge thread to walk its active list without
+ * holding seg_pmem_mtx for a long time. When the asynchronous thread is done
+ * with its list it switches to the current active list and makes the list it
+ * just finished processing the new current active list.
+ *
+ * seg_padd_abuck() only adds the bucket to the current list if the bucket is
+ * not yet on any list. seg_premove_abuck() may remove the bucket from either
+ * list. If the bucket is on the current list it will always be removed.
+ * Otherwise the bucket is only removed if the asynchronous purge thread is
+ * not currently running or seg_premove_abuck() is called by the asynchronous
+ * purge thread itself. A given bucket can only be on one of the active lists
+ * at a time. Both routines must be called with the per bucket lock held; they
+ * use seg_pmem_mtx to protect list updates. seg_padd_abuck() must be called
+ * after the first entry is added to the bucket chain and seg_premove_abuck()
+ * must be called after the last pcp entry is deleted from its chain. Holding
+ * the per bucket lock avoids a potential race condition where
+ * seg_premove_abuck() removes a bucket after pcp entries were added to its
+ * list following the caller's check that the bucket had no entries (this race
+ * would cause the loss of an active bucket from the active lists).
+ *
+ * Both lists are circular doubly linked lists anchored at seg_pahhead heads.
+ * New entries are added to the end of the list since LRU is used as the
+ * purging policy.
+ */
+static void
+seg_padd_abuck(struct seg_phash *hp)
+{
+ int lix;
+
+ ASSERT(MUTEX_HELD(&hp->p_hmutex));
+ ASSERT((struct seg_phash *)hp->p_hnext != hp);
+ ASSERT((struct seg_phash *)hp->p_hprev != hp);
+ ASSERT(hp->p_hnext == hp->p_hprev);
+ ASSERT(!IS_PCP_WIRED(hp->p_hnext));
+ ASSERT(hp->p_hnext->p_hnext == (struct seg_pcache *)hp);
+ ASSERT(hp->p_hprev->p_hprev == (struct seg_pcache *)hp);
+ ASSERT(hp >= seg_phashtab_win &&
+ hp < &seg_phashtab_win[seg_phashsize_win]);
+
+ /*
+ * This bucket can already be on one of active lists
+ * since seg_premove_abuck() may have failed to remove it
+ * before.
+ */
+ mutex_enter(&seg_pmem_mtx);
+ lix = seg_pahcur;
+ ASSERT(lix >= 0 && lix <= 1);
+ if (hp->p_halink[lix].p_lnext != NULL) {
+ ASSERT(hp->p_halink[lix].p_lprev != NULL);
+ ASSERT(hp->p_halink[!lix].p_lnext == NULL);
+ ASSERT(hp->p_halink[!lix].p_lprev == NULL);
+ mutex_exit(&seg_pmem_mtx);
+ return;
+ }
+ ASSERT(hp->p_halink[lix].p_lprev == NULL);
+
+ /*
+	 * If this bucket is still on list !lix the async thread can't yet
+	 * remove it since we hold the per bucket lock here. In this case just
+	 * return since the async thread will eventually find and process this
+	 * bucket.
+ */
+ if (hp->p_halink[!lix].p_lnext != NULL) {
+ ASSERT(hp->p_halink[!lix].p_lprev != NULL);
+ mutex_exit(&seg_pmem_mtx);
+ return;
+ }
+ ASSERT(hp->p_halink[!lix].p_lprev == NULL);
+ /*
+ * This bucket is not on any active bucket list yet.
+ * Add the bucket to the tail of current active list.
+ */
+ hp->p_halink[lix].p_lnext = &seg_pahhead[lix];
+ hp->p_halink[lix].p_lprev = seg_pahhead[lix].p_lprev;
+ seg_pahhead[lix].p_lprev->p_lnext = &hp->p_halink[lix];
+ seg_pahhead[lix].p_lprev = &hp->p_halink[lix];
+ mutex_exit(&seg_pmem_mtx);
+}
+
+static void
+seg_premove_abuck(struct seg_phash *hp, int athr)
+{
+ int lix;
+
+ ASSERT(MUTEX_HELD(&hp->p_hmutex));
+ ASSERT((struct seg_phash *)hp->p_hnext == hp);
+ ASSERT((struct seg_phash *)hp->p_hprev == hp);
+ ASSERT(hp >= seg_phashtab_win &&
+ hp < &seg_phashtab_win[seg_phashsize_win]);
+
+ if (athr) {
+ ASSERT(seg_pathr_on);
+ ASSERT(seg_pahcur <= 1);
+ /*
+		 * We are called by the asynchronous thread that found this
+		 * bucket on the not currently active (i.e. !seg_pahcur) list.
+		 * Remove it from there. The per bucket lock we are holding
+		 * makes sure seg_pinsert() can't sneak in and add pcp entries
+		 * to this bucket right before we remove the bucket from its
+		 * list.
+ */
+ lix = !seg_pahcur;
+ ASSERT(hp->p_halink[lix].p_lnext != NULL);
+ ASSERT(hp->p_halink[lix].p_lprev != NULL);
+ ASSERT(hp->p_halink[!lix].p_lnext == NULL);
+ ASSERT(hp->p_halink[!lix].p_lprev == NULL);
+ hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
+ hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
+ hp->p_halink[lix].p_lnext = NULL;
+ hp->p_halink[lix].p_lprev = NULL;
+ return;
+ }
+
+ mutex_enter(&seg_pmem_mtx);
+ lix = seg_pahcur;
+ ASSERT(lix >= 0 && lix <= 1);
+
+ /*
+ * If the bucket is on currently active list just remove it from
+ * there.
+ */
+ if (hp->p_halink[lix].p_lnext != NULL) {
+ ASSERT(hp->p_halink[lix].p_lprev != NULL);
+ ASSERT(hp->p_halink[!lix].p_lnext == NULL);
+ ASSERT(hp->p_halink[!lix].p_lprev == NULL);
+ hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
+ hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
+ hp->p_halink[lix].p_lnext = NULL;
+ hp->p_halink[lix].p_lprev = NULL;
+ mutex_exit(&seg_pmem_mtx);
+ return;
+ }
+ ASSERT(hp->p_halink[lix].p_lprev == NULL);
+
+ /*
+	 * If the asynchronous thread is not running we can remove the bucket
+	 * from the not currently active list. The bucket must be on this list
+	 * since we already checked that it's not on the other list and the
+	 * bucket from which we just deleted the last pcp entry must still be
+	 * on one of the active bucket lists.
+ */
+ lix = !lix;
+ ASSERT(hp->p_halink[lix].p_lnext != NULL);
+ ASSERT(hp->p_halink[lix].p_lprev != NULL);
+
+ if (!seg_pathr_on) {
+ hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
+ hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
+ hp->p_halink[lix].p_lnext = NULL;
+ hp->p_halink[lix].p_lprev = NULL;
+ }
+ mutex_exit(&seg_pmem_mtx);
+}
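
A condensed, single-threaded sketch of the double active-list scheme the two routines above implement (illustrative names, no locking): buckets live on circular doubly linked lists anchored at two heads, inserters use the 'current' index, and the purger drains the other list and then makes the drained list current.

#include <stdio.h>

typedef struct link {
	struct link *next, *prev;
	int id;
} link_t;

static link_t heads[2] = {
	{ &heads[0], &heads[0], -1 },	/* list 0 anchor */
	{ &heads[1], &heads[1], -1 }	/* list 1 anchor */
};
static int cur = 0;			/* like seg_pahcur */

static void
add_tail(link_t *l)			/* like seg_padd_abuck() */
{
	link_t *h = &heads[cur];

	if (l->next != NULL)
		return;			/* already on one of the lists */
	l->next = h;
	l->prev = h->prev;
	h->prev->next = l;
	h->prev = l;
}

static void
drain_other(void)			/* like one async purge pass */
{
	int lix = !cur;			/* walk the non-current list */
	link_t *h = &heads[lix], *l;

	while ((l = h->next) != h) {
		h->next = l->next;
		l->next->prev = h;
		l->next = l->prev = NULL;
		printf("purged bucket %d from list %d\n", l->id, lix);
	}
	cur = lix;	/* drained list becomes current for inserters */
}

int
main(void)
{
	link_t b1 = { NULL, NULL, 1 }, b2 = { NULL, NULL, 2 };

	add_tail(&b1);
	add_tail(&b2);		/* both buckets land on current list 0 */
	drain_other();		/* drains empty list 1, cur becomes 1 */
	drain_other();		/* drains list 0 holding buckets 1 and 2 */
	return (0);
}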
+
+/*
+ * Check if the bucket pointed to by hp already has a pcp entry that matches
+ * the request (htag0, addr and len). Set *found to 1 if a match is found and
+ * to 0 otherwise. Also delete matching entries that cover a smaller address
+ * range but start at the same address as the addr argument. Return the list
+ * of deleted entries, if any. This is an internal helper function called from
+ * seg_pinsert() only for non wired shadow lists. The caller already holds the
+ * per seg/amp list lock.
+ */
+static struct seg_pcache *
+seg_plookup_checkdup(struct seg_phash *hp, void *htag0,
+ caddr_t addr, size_t len, int *found)
+{
+ struct seg_pcache *pcp;
+ struct seg_pcache *delcallb_list = NULL;
+
+ ASSERT(MUTEX_HELD(&hp->p_hmutex));
+
+ *found = 0;
+ for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
+ pcp = pcp->p_hnext) {
+ ASSERT(pcp->p_hashp == hp);
+ if (pcp->p_htag0 == htag0 && pcp->p_addr == addr) {
+ ASSERT(!IS_PCP_WIRED(pcp));
+ if (pcp->p_len < len) {
+ pcache_link_t *plinkp;
+ if (pcp->p_active) {
+ continue;
+ }
+ plinkp = &pcp->p_plink;
+ plinkp->p_lprev->p_lnext = plinkp->p_lnext;
+ plinkp->p_lnext->p_lprev = plinkp->p_lprev;
+ pcp->p_hprev->p_hnext = pcp->p_hnext;
+ pcp->p_hnext->p_hprev = pcp->p_hprev;
+ pcp->p_hprev = delcallb_list;
+ delcallb_list = pcp;
+ } else {
+ *found = 1;
+ break;
+ }
+ }
+ }
+ return (delcallb_list);
+}
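
A simplified sketch of the decision the helper above makes per chain entry (assuming every entry already matches htag0 and addr): smaller inactive entries are flagged for deletion, smaller active ones are skipped, and a big-enough entry makes the caller back off.

#include <stdio.h>

struct ent { size_t len; int active; };	/* pared-down cache entry */

static int
checkdup(struct ent *e, int n, size_t len)
{
	int i, found = 0;

	for (i = 0; i < n; i++) {
		if (e[i].len < len) {
			if (e[i].active)
				continue;	/* still in use; leave it */
			printf("delete smaller duplicate (%zu bytes)\n",
			    e[i].len);
		} else {
			found = 1;	/* big enough; caller won't insert */
			break;
		}
	}
	return (found);
}

int
main(void)
{
	struct ent chain[] = { { 4096, 0 }, { 8192, 1 }, { 16384, 0 } };

	printf("found = %d\n", checkdup(chain, 3, 16384));
	return (0);
}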
+
+/*
+ * lookup an address range in pagelock cache. Return shadow list and bump up
+ * active count. If amp is not NULL use amp as a lookup tag otherwise use seg
+ * as a lookup tag.
*/
struct page **
-seg_plookup(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw)
+seg_plookup(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
+ enum seg_rw rw, uint_t flags)
{
struct seg_pcache *pcp;
struct seg_phash *hp;
+ void *htag0;
+
+ ASSERT(seg != NULL);
+ ASSERT(rw == S_READ || rw == S_WRITE);
/*
* Skip pagelock cache, while DR is in progress or
* seg_pcache is off.
*/
- if (seg_pdisable || seg_plazy == 0) {
+ if (seg_pdisabled) {
return (NULL);
}
+ ASSERT(seg_phashsize_win != 0);
- hp = &p_hashtab[p_hash(seg)];
+ htag0 = (amp == NULL ? (void *)seg : (void *)amp);
+ hp = P_HASHBP(seg, htag0, addr, flags);
mutex_enter(&hp->p_hmutex);
for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
pcp = pcp->p_hnext) {
- if (p_match(pcp, seg, addr, len, rw)) {
+ ASSERT(pcp->p_hashp == hp);
+ if (P_MATCH(pcp, htag0, addr, len)) {
+ ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
+ /*
+ * If this request wants to write pages
+ * but write permissions starting from
+			 * addr don't cover the entire length len,
+			 * return lookup failure back to the caller.
+			 * It will check protections and fail this
+			 * pagelock operation with an EACCES error.
+ */
+ if (rw == S_WRITE && pcp->p_wlen < len) {
+ break;
+ }
+ if (pcp->p_active == UINT_MAX) {
+ break;
+ }
pcp->p_active++;
+ if (rw == S_WRITE && !pcp->p_write) {
+ pcp->p_write = 1;
+ }
mutex_exit(&hp->p_hmutex);
-
- PPRINT5("seg_plookup hit: seg %p, addr %p, "
- "len %lx, count %d, pplist %p \n",
- (void *)seg, (void *)addr, len, pcp->p_active,
- (void *)pcp->p_pp);
-
return (pcp->p_pp);
}
}
mutex_exit(&hp->p_hmutex);
-
- PPRINT("seg_plookup miss:\n");
-
return (NULL);
}
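
A sketch of the hit test seg_plookup() applies per entry, with hypothetical lengths: P_MATCH requires the cached list to cover at least len bytes, and a write request additionally needs the writable prefix p_wlen to cover len.

#include <stdio.h>

struct pce { size_t p_len, p_wlen; unsigned p_active; };

static int
lookup_hit(struct pce *pcp, size_t len, int write)
{
	if (pcp->p_len < len)
		return (0);	/* cached list too short: no P_MATCH */
	if (write && pcp->p_wlen < len)
		return (0);	/* writable prefix too short: miss */
	pcp->p_active++;	/* hit: bump the active count */
	return (1);
}

int
main(void)
{
	struct pce e = { 65536, 8192, 0 };	/* 64K cached, 8K writable */

	printf("read 64K:  %d\n", lookup_hit(&e, 65536, 0));	/* hit */
	printf("write 64K: %d\n", lookup_hit(&e, 65536, 1));	/* miss */
	printf("write 8K:  %d\n", lookup_hit(&e, 8192, 1));	/* hit */
	printf("active = %u\n", e.p_active);			/* 2 */
	return (0);
}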
/*
- * mark address range inactive. If the cache is off or the address
- * range is not in the cache we call the segment driver to reclaim
- * the pages. Otherwise just decrement active count and set ref bit.
+ * mark address range inactive. If the cache is off, the address range is
+ * not in the cache, or another shadow list that covers a bigger range is
+ * found, we call the segment driver to reclaim the pages. Otherwise just
+ * decrement the active count and set the ref bit. If amp is not NULL use amp
+ * as a lookup tag, otherwise use seg as a lookup tag.
*/
void
-seg_pinactive(struct seg *seg, caddr_t addr, size_t len, struct page **pp,
- enum seg_rw rw, int (*callback)(struct seg *, caddr_t, size_t,
- struct page **, enum seg_rw))
+seg_pinactive(struct seg *seg, struct anon_map *amp, caddr_t addr,
+ size_t len, struct page **pp, enum seg_rw rw, uint_t flags,
+ seg_preclaim_cbfunc_t callback)
{
struct seg_pcache *pcp;
struct seg_phash *hp;
+ kmutex_t *pmtx = NULL;
+ pcache_link_t *pheadp;
+ void *htag0;
+ pgcnt_t npages = 0;
+ int keep = 0;
- if (seg_plazy == 0) {
- (void) (*callback)(seg, addr, len, pp, rw);
- return;
+ ASSERT(seg != NULL);
+ ASSERT(rw == S_READ || rw == S_WRITE);
+
+ htag0 = (amp == NULL ? (void *)seg : (void *)amp);
+
+ /*
+ * Skip lookup if pcache is not configured.
+ */
+ if (seg_phashsize_win == 0) {
+ goto out;
+ }
+
+ /*
+ * Grab per seg/amp lock before hash lock if we are going to remove
+ * inactive entry from pcache.
+ */
+ if (!IS_PFLAGS_WIRED(flags) && seg_pdisabled) {
+ if (amp == NULL) {
+ pheadp = &seg->s_phead;
+ pmtx = &seg->s_pmtx;
+ } else {
+ pheadp = &amp->a_phead;
+ pmtx = &amp->a_pmtx;
+ }
+ mutex_enter(pmtx);
}
- hp = &p_hashtab[p_hash(seg)];
+
+ hp = P_HASHBP(seg, htag0, addr, flags);
mutex_enter(&hp->p_hmutex);
+again:
for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
pcp = pcp->p_hnext) {
- if (p_match_pp(pcp, seg, addr, len, pp, rw)) {
+ ASSERT(pcp->p_hashp == hp);
+ if (P_MATCH_PP(pcp, htag0, addr, len, pp)) {
+ ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
+ ASSERT(pcp->p_active);
+ if (keep) {
+ /*
+ * Don't remove this pcp entry
+ * if we didn't find duplicate
+				 * shadow lists on the second search.
+				 * Somebody removed those duplicates
+				 * since we dropped the hash lock after
+				 * the first search.
+ */
+ ASSERT(pmtx != NULL);
+ ASSERT(!IS_PFLAGS_WIRED(flags));
+ mutex_exit(pmtx);
+ pmtx = NULL;
+ }
pcp->p_active--;
- ASSERT(pcp->p_active >= 0);
- if (pcp->p_active == 0 && seg_pdisable) {
- int npages;
+ if (pcp->p_active == 0 && (pmtx != NULL ||
+ (seg_pdisabled && IS_PFLAGS_WIRED(flags)))) {
+
+ /*
+ * This entry is no longer active. Remove it
+ * now either because pcaching is temporarily
+				 * disabled or there are other pcp entries that
+ * can match this pagelock request (i.e. this
+ * entry is a duplicate).
+ */
ASSERT(callback == pcp->p_callback);
- /* free the entry */
- hp->p_qlen--;
+ if (pmtx != NULL) {
+ pcache_link_t *plinkp = &pcp->p_plink;
+ ASSERT(!IS_PCP_WIRED(pcp));
+ ASSERT(pheadp->p_lnext != pheadp);
+ ASSERT(pheadp->p_lprev != pheadp);
+ plinkp->p_lprev->p_lnext =
+ plinkp->p_lnext;
+ plinkp->p_lnext->p_lprev =
+ plinkp->p_lprev;
+ }
pcp->p_hprev->p_hnext = pcp->p_hnext;
pcp->p_hnext->p_hprev = pcp->p_hprev;
+ if (!IS_PCP_WIRED(pcp) &&
+ hp->p_hnext == (struct seg_pcache *)hp) {
+ /*
+ * We removed the last entry from this
+ * bucket. Now remove the bucket from
+ * its active list.
+ */
+ seg_premove_abuck(hp, 0);
+ }
mutex_exit(&hp->p_hmutex);
- npages = pcp->p_len >> PAGESHIFT;
- mutex_enter(&seg_pmem);
- seg_plocked -= npages;
- if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) {
- seg_plocked_window -= npages;
+ if (pmtx != NULL) {
+ mutex_exit(pmtx);
}
- mutex_exit(&seg_pmem);
- kmem_free(pcp, sizeof (struct seg_pcache));
+ len = pcp->p_len;
+ npages = btop(len);
+ if (rw != S_WRITE && pcp->p_write) {
+ rw = S_WRITE;
+ }
+ kmem_cache_free(seg_pkmcache, pcp);
goto out;
+ } else {
+ /*
+ * We found a matching pcp entry but will not
+ * free it right away even if it's no longer
+ * active.
+ */
+ if (!pcp->p_active && !IS_PCP_WIRED(pcp)) {
+ /*
+ * Set the reference bit and mark the
+ * time of last access to this pcp
+ * so that asynchronous thread doesn't
+ * free it immediately since
+ * it may be reactivated very soon.
+ */
+ pcp->p_lbolt = lbolt;
+ pcp->p_ref = 1;
+ }
+ mutex_exit(&hp->p_hmutex);
+ if (pmtx != NULL) {
+ mutex_exit(pmtx);
+ }
+ return;
+ }
+ } else if (!IS_PFLAGS_WIRED(flags) &&
+ P_MATCH(pcp, htag0, addr, len)) {
+ /*
+ * This is a duplicate pcp entry. This situation may
+ * happen if a bigger shadow list that covers our
+ * range was added while our entry was still active.
+ * Now we can free our pcp entry if it becomes
+ * inactive.
+ */
+ if (!pcp->p_active) {
+ /*
+ * Mark this entry as referenced just in case
+ * we'll free our own pcp entry soon.
+ */
+ pcp->p_lbolt = lbolt;
+ pcp->p_ref = 1;
+ }
+ if (pmtx != NULL) {
+ /*
+ * we are already holding pmtx and found a
+ * duplicate. Don't keep our own pcp entry.
+ */
+ keep = 0;
+ continue;
+ }
+ /*
+ * We have to use mutex_tryenter to attempt to lock
+			 * the seg/amp list lock since we already hold the hash
+			 * lock and the seg/amp list lock is above the hash
+			 * lock in the lock order. If mutex_tryenter fails,
+			 * drop the hash lock, retake both locks in the correct
+			 * order and re-search this hash chain.
+ */
+ ASSERT(keep == 0);
+ if (amp == NULL) {
+ pheadp = &seg->s_phead;
+ pmtx = &seg->s_pmtx;
+ } else {
+ pheadp = &amp->a_phead;
+ pmtx = &amp->a_pmtx;
+ }
+ if (!mutex_tryenter(pmtx)) {
+ mutex_exit(&hp->p_hmutex);
+ mutex_enter(pmtx);
+ mutex_enter(&hp->p_hmutex);
+ /*
+				 * If we don't find a bigger shadow list on
+				 * the second search (it may happen since we
+				 * dropped the bucket lock) keep the entry that
+ * matches our own shadow list.
+ */
+ keep = 1;
+ goto again;
}
- pcp->p_ref = 1;
- mutex_exit(&hp->p_hmutex);
- return;
}
}
mutex_exit(&hp->p_hmutex);
+ if (pmtx != NULL) {
+ mutex_exit(pmtx);
+ }
out:
- (void) (*callback)(seg, addr, len, pp, rw);
+ (*callback)(htag0, addr, len, pp, rw, 0);
+ if (npages) {
+ mutex_enter(&seg_pmem_mtx);
+ ASSERT(seg_plocked >= npages);
+ seg_plocked -= npages;
+ if (!IS_PFLAGS_WIRED(flags)) {
+ ASSERT(seg_plocked_window >= npages);
+ seg_plocked_window -= npages;
+ }
+ mutex_exit(&seg_pmem_mtx);
+ }
+
}
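
A pthreads sketch of the lock-ordering dance used above when the per seg/amp list lock is wanted while the hash bucket lock is already held: try the higher-order lock; on failure drop the lower one, take both in order, and signal the caller to re-search the chain.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t list_mtx = PTHREAD_MUTEX_INITIALIZER;	/* higher */
static pthread_mutex_t hash_mtx = PTHREAD_MUTEX_INITIALIZER;	/* lower */

/* Called with hash_mtx held; returns 1 if the caller must re-search. */
static int
take_list_lock(void)
{
	if (pthread_mutex_trylock(&list_mtx) == 0)
		return (0);			/* got it without dropping */
	pthread_mutex_unlock(&hash_mtx);	/* back off ... */
	pthread_mutex_lock(&list_mtx);		/* ... and retake in order */
	pthread_mutex_lock(&hash_mtx);
	return (1);				/* chain may have changed */
}

int
main(void)
{
	pthread_mutex_lock(&hash_mtx);
	printf("re-search needed: %d\n", take_list_lock());
	pthread_mutex_unlock(&hash_mtx);
	pthread_mutex_unlock(&list_mtx);
	return (0);
}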
+#ifdef DEBUG
+static uint32_t p_insert_chk_mtbf = 0;
+#endif
+
/*
* The seg_pinsert_check() is used by segment drivers to predict whether
* a call to seg_pinsert will fail and thereby avoid wasteful pre-processing.
*/
-
+/*ARGSUSED*/
int
-seg_pinsert_check(struct seg *seg, size_t len, uint_t flags)
+seg_pinsert_check(struct seg *seg, struct anon_map *amp, caddr_t addr,
+ size_t len, uint_t flags)
{
- struct seg_phash *hp;
+ ASSERT(seg != NULL);
- if (seg_plazy == 0) {
+#ifdef DEBUG
+ if (p_insert_chk_mtbf && !(gethrtime() % p_insert_chk_mtbf)) {
return (SEGP_FAIL);
}
- if (seg_pdisable != 0) {
+#endif
+
+ if (seg_pdisabled) {
return (SEGP_FAIL);
}
- ASSERT((len & PAGEOFFSET) == 0);
- hp = &p_hashtab[p_hash(seg)];
- if (hp->p_qlen > seg_pmaxqlen && (flags & SEGP_FORCE_WIRED) == 0) {
+ ASSERT(seg_phashsize_win != 0);
+
+ if (IS_PFLAGS_WIRED(flags)) {
+ return (SEGP_SUCCESS);
+ }
+
+ if (seg_plocked_window + btop(len) > seg_pmaxwindow) {
return (SEGP_FAIL);
}
- /*
- * If the SEGP_FORCE_WIRED flag is set,
- * we skip the check for seg_pwindow.
- */
- if ((flags & SEGP_FORCE_WIRED) == 0) {
- pgcnt_t npages;
- npages = len >> PAGESHIFT;
- if ((seg_plocked_window + npages) > seg_pwindow) {
- return (SEGP_FAIL);
- }
+ if (freemem < desfree) {
+ return (SEGP_FAIL);
}
+
return (SEGP_SUCCESS);
}
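
A sketch of the admission test above with made-up tunables: wired requests always pass, non-wired ones fail if they would push the cached window past its cap or if free memory is already below desfree.

#include <stdio.h>

#define	PAGESIZE	4096UL
#define	btop(b)		((b) / PAGESIZE)

/* hypothetical stand-ins for the kernel globals, in pages */
static unsigned long seg_plocked_window = 250000;
static unsigned long seg_pmaxwindow = 262144;
static unsigned long freemem = 100000, desfree = 4096;

static int
insert_check(size_t len, int wired)	/* 1 = SEGP_SUCCESS, 0 = SEGP_FAIL */
{
	if (wired)
		return (1);		/* wired lists skip the window check */
	if (seg_plocked_window + btop(len) > seg_pmaxwindow)
		return (0);		/* would exceed the pcache window */
	if (freemem < desfree)
		return (0);		/* memory too tight to cache more */
	return (1);
}

int
main(void)
{
	printf("64MB non-wired: %d\n", insert_check(64UL << 20, 0)); /* 0 */
	printf("64MB wired:     %d\n", insert_check(64UL << 20, 1)); /* 1 */
	printf("4MB non-wired:  %d\n", insert_check(4UL << 20, 0));  /* 1 */
	return (0);
}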
+#ifdef DEBUG
+static uint32_t p_insert_mtbf = 0;
+#endif
/*
- * insert address range with shadow list into pagelock cache. If
- * the cache is off or caching is temporarily disabled or the allowed
- * 'window' is exceeded - return SEGP_FAIL. Otherwise return
- * SEGP_SUCCESS.
+ * Insert address range with shadow list into pagelock cache if there's no
+ * shadow list already cached for this address range. If the cache is off or
+ * caching is temporarily disabled or the allowed 'window' is exceeded return
+ * SEGP_FAIL. Otherwise return SEGP_SUCCESS.
+ *
+ * For non wired shadow lists (segvn case) include address in the hashing
+ * function to avoid linking all the entries from the same segment or amp on
+ * the same bucket. amp is used instead of seg if amp is not NULL. Non wired
+ * pcache entries are also linked on a per segment/amp list so that all
+ * entries can be found quickly during seg/amp purge without walking the
+ * entire pcache hash table. For wired shadow lists (segspt case) we
+ * don't use address hashing and per segment linking because the caller
+ * currently inserts only one entry per segment that covers the entire
+ * segment. If we used per segment linking even for segspt it would complicate
+ * seg_ppurge_wiredpp() locking.
+ *
+ * Both hash bucket and per seg/amp locks need to be held before adding a non
+ * wired entry to hash and per seg/amp lists. per seg/amp lock should be taken
+ * wired entry to the hash and per seg/amp lists. The per seg/amp lock should
+ * be taken first.
+ * This function will also remove from pcache old inactive shadow lists that
+ * overlap with this request but cover smaller range for the same start
+ * overlap with this request but cover a smaller range for the same start
*/
int
-seg_pinsert(struct seg *seg, caddr_t addr, size_t len, struct page **pp,
- enum seg_rw rw, uint_t flags, int (*callback)(struct seg *, caddr_t,
- size_t, struct page **, enum seg_rw))
+seg_pinsert(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
+ size_t wlen, struct page **pp, enum seg_rw rw, uint_t flags,
+ seg_preclaim_cbfunc_t callback)
{
struct seg_pcache *pcp;
struct seg_phash *hp;
pgcnt_t npages;
+ pcache_link_t *pheadp;
+ kmutex_t *pmtx;
+ struct seg_pcache *delcallb_list = NULL;
- if (seg_plazy == 0) {
+ ASSERT(seg != NULL);
+ ASSERT(rw == S_READ || rw == S_WRITE);
+ ASSERT(rw == S_READ || wlen == len);
+ ASSERT(rw == S_WRITE || wlen <= len);
+ ASSERT(amp == NULL || wlen == len);
+
+#ifdef DEBUG
+ if (p_insert_mtbf && !(gethrtime() % p_insert_mtbf)) {
return (SEGP_FAIL);
}
- if (seg_pdisable != 0) {
+#endif
+
+ if (seg_pdisabled) {
return (SEGP_FAIL);
}
+ ASSERT(seg_phashsize_win != 0);
+
ASSERT((len & PAGEOFFSET) == 0);
- hp = &p_hashtab[p_hash(seg)];
- if (hp->p_qlen > seg_pmaxqlen && (flags & SEGP_FORCE_WIRED) == 0) {
- return (SEGP_FAIL);
- }
- npages = len >> PAGESHIFT;
- mutex_enter(&seg_pmem);
- /*
- * If the SEGP_FORCE_WIRED flag is set,
- * we skip the check for seg_pwindow.
- */
- if ((flags & SEGP_FORCE_WIRED) == 0) {
- seg_plocked_window += npages;
- if (seg_plocked_window > seg_pwindow) {
- seg_plocked_window -= npages;
- mutex_exit(&seg_pmem);
+ npages = btop(len);
+ mutex_enter(&seg_pmem_mtx);
+ if (!IS_PFLAGS_WIRED(flags)) {
+ if (seg_plocked_window + npages > seg_pmaxwindow) {
+ mutex_exit(&seg_pmem_mtx);
return (SEGP_FAIL);
}
+ seg_plocked_window += npages;
}
seg_plocked += npages;
- mutex_exit(&seg_pmem);
+ mutex_exit(&seg_pmem_mtx);
- pcp = kmem_alloc(sizeof (struct seg_pcache), KM_SLEEP);
- pcp->p_seg = seg;
+ pcp = kmem_cache_alloc(seg_pkmcache, KM_SLEEP);
+ /*
+ * If amp is not NULL set htag0 to amp otherwise set it to seg.
+ */
+ if (amp == NULL) {
+ pcp->p_htag0 = (void *)seg;
+ pcp->p_flags = flags & 0xffff;
+ } else {
+ pcp->p_htag0 = (void *)amp;
+ pcp->p_flags = (flags & 0xffff) | SEGP_AMP;
+ }
pcp->p_addr = addr;
pcp->p_len = len;
+ pcp->p_wlen = wlen;
pcp->p_pp = pp;
- pcp->p_rw = rw;
+ pcp->p_write = (rw == S_WRITE);
pcp->p_callback = callback;
pcp->p_active = 1;
- pcp->p_flags = flags;
- PPRINT4("seg_pinsert: seg %p, addr %p, len %lx, pplist %p\n",
- (void *)seg, (void *)addr, len, (void *)pp);
-
- hp = &p_hashtab[p_hash(seg)];
- mutex_enter(&hp->p_hmutex);
- hp->p_qlen++;
+ hp = P_HASHBP(seg, pcp->p_htag0, addr, flags);
+ if (!IS_PFLAGS_WIRED(flags)) {
+ int found;
+ void *htag0;
+ if (amp == NULL) {
+ pheadp = &seg->s_phead;
+ pmtx = &seg->s_pmtx;
+ htag0 = (void *)seg;
+ } else {
+ pheadp = &amp->a_phead;
+ pmtx = &amp->a_pmtx;
+ htag0 = (void *)amp;
+ }
+ mutex_enter(pmtx);
+ mutex_enter(&hp->p_hmutex);
+ delcallb_list = seg_plookup_checkdup(hp, htag0, addr,
+ len, &found);
+ if (found) {
+ mutex_exit(&hp->p_hmutex);
+ mutex_exit(pmtx);
+ mutex_enter(&seg_pmem_mtx);
+ seg_plocked -= npages;
+ seg_plocked_window -= npages;
+ mutex_exit(&seg_pmem_mtx);
+ kmem_cache_free(seg_pkmcache, pcp);
+ goto out;
+ }
+ pcp->p_plink.p_lnext = pheadp->p_lnext;
+ pcp->p_plink.p_lprev = pheadp;
+ pheadp->p_lnext->p_lprev = &pcp->p_plink;
+ pheadp->p_lnext = &pcp->p_plink;
+ } else {
+ mutex_enter(&hp->p_hmutex);
+ }
+ pcp->p_hashp = hp;
pcp->p_hnext = hp->p_hnext;
pcp->p_hprev = (struct seg_pcache *)hp;
hp->p_hnext->p_hprev = pcp;
hp->p_hnext = pcp;
+ if (!IS_PFLAGS_WIRED(flags) &&
+ hp->p_hprev == pcp) {
+ seg_padd_abuck(hp);
+ }
mutex_exit(&hp->p_hmutex);
+ if (!IS_PFLAGS_WIRED(flags)) {
+ mutex_exit(pmtx);
+ }
+
+out:
+ npages = 0;
+ while (delcallb_list != NULL) {
+ pcp = delcallb_list;
+ delcallb_list = pcp->p_hprev;
+ ASSERT(!IS_PCP_WIRED(pcp) && !pcp->p_active);
+ (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
+ pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
+ npages += btop(pcp->p_len);
+ kmem_cache_free(seg_pkmcache, pcp);
+ }
+ if (npages) {
+ ASSERT(!IS_PFLAGS_WIRED(flags));
+ mutex_enter(&seg_pmem_mtx);
+ ASSERT(seg_plocked >= npages);
+ ASSERT(seg_plocked_window >= npages);
+ seg_plocked -= npages;
+ seg_plocked_window -= npages;
+ mutex_exit(&seg_pmem_mtx);
+ }
+
return (SEGP_SUCCESS);
}
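
A small pthreads sketch of the accounting pattern seg_pinsert() uses: reserve window pages up front under the memory mutex, then roll the reservation back if a duplicate shadow list is found and the new entry is dropped.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t mem_mtx = PTHREAD_MUTEX_INITIALIZER;
static unsigned long locked, locked_window;	/* like seg_plocked{,_window} */

static int
reserve(unsigned long npages, unsigned long maxwindow)
{
	pthread_mutex_lock(&mem_mtx);
	if (locked_window + npages > maxwindow) {
		pthread_mutex_unlock(&mem_mtx);
		return (-1);			/* over the window: fail */
	}
	locked_window += npages;
	locked += npages;
	pthread_mutex_unlock(&mem_mtx);
	return (0);
}

static void
unreserve(unsigned long npages)		/* duplicate found: roll back */
{
	pthread_mutex_lock(&mem_mtx);
	locked -= npages;
	locked_window -= npages;
	pthread_mutex_unlock(&mem_mtx);
}

int
main(void)
{
	if (reserve(256, 1024) == 0) {
		int duplicate = 1;	/* pretend checkdup reported a match */

		if (duplicate)
			unreserve(256);
	}
	printf("locked %lu, window %lu\n", locked, locked_window);
	return (0);
}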
/*
- * purge all entries from the pagelock cache if not active
- * and not recently used. Drop all locks and call through
- * the address space into the segment driver to reclaim
- * the pages. This makes sure we get the address space
- * and segment driver locking right.
+ * purge entries from the pagelock cache if not active
+ * and not recently used.
*/
static void
-seg_ppurge_all(int force)
+seg_ppurge_async(int force)
{
struct seg_pcache *delcallb_list = NULL;
struct seg_pcache *pcp;
struct seg_phash *hp;
- int purge_count = 0;
pgcnt_t npages = 0;
pgcnt_t npages_window = 0;
+ pgcnt_t npgs_to_purge;
+ pgcnt_t npgs_purged = 0;
+ int hlinks = 0;
+ int hlix;
+ pcache_link_t *hlinkp;
+ pcache_link_t *hlnextp = NULL;
+ int lowmem;
+ int trim;
+
+ ASSERT(seg_phashsize_win != 0);
/*
- * if the cache if off or empty, return
+ * if the cache is off or empty, return
*/
- if (seg_plazy == 0 || seg_plocked == 0) {
+ if (seg_plocked == 0 || (!force && seg_plocked_window == 0)) {
return;
}
- for (hp = p_hashtab; hp < &p_hashtab[p_hashsize]; hp++) {
- mutex_enter(&hp->p_hmutex);
- pcp = hp->p_hnext;
- /*
- * While 'force' is set, seg_pasync_thread is not
- * throttled. This is to speedup flushing of seg_pcache
- * in preparation for DR.
- *
- * In normal case, when 'force' is not set, we throttle
- * seg_pasync_thread so that we don't spend all the time
- * time in purging the cache.
- */
- while ((pcp != (struct seg_pcache *)hp) &&
- (force || (purge_count <= seg_ppcount))) {
+ if (!force) {
+ lowmem = 0;
+ trim = 0;
+ if (freemem < lotsfree + needfree) {
+ spgcnt_t fmem = MAX((spgcnt_t)(freemem - needfree), 0);
+ if (fmem <= 5 * (desfree >> 2)) {
+ lowmem = 1;
+ } else if (fmem <= 7 * (lotsfree >> 3)) {
+ if (seg_plocked_window >=
+ (availrmem_initial >> 1)) {
+ lowmem = 1;
+ }
+ } else if (fmem < lotsfree) {
+ if (seg_plocked_window >=
+ 3 * (availrmem_initial >> 2)) {
+ lowmem = 1;
+ }
+ }
+ }
+ if (seg_plocked_window >= 7 * (seg_pmaxwindow >> 3)) {
+ trim = 1;
+ }
+ if (!lowmem && !trim) {
+ return;
+ }
+ npgs_to_purge = seg_plocked_window >>
+ seg_pshrink_shift;
+ if (lowmem) {
+ npgs_to_purge = MIN(npgs_to_purge,
+ MAX(seg_pmaxapurge_npages, desfree));
+ } else {
+ npgs_to_purge = MIN(npgs_to_purge,
+ seg_pmaxapurge_npages);
+ }
+ if (npgs_to_purge == 0) {
+ return;
+ }
+ } else {
+ struct seg_phash_wired *hpw;
- /*
- * purge entries which are not active and
- * have not been used recently and
- * have the SEGP_ASYNC_FLUSH flag.
- *
- * In the 'force' case, we ignore the
- * SEGP_ASYNC_FLUSH flag.
- */
- if (!(pcp->p_flags & SEGP_ASYNC_FLUSH))
- pcp->p_ref = 1;
- if (force)
- pcp->p_ref = 0;
- if (!pcp->p_ref && !pcp->p_active) {
- struct as *as = pcp->p_seg->s_as;
+ ASSERT(seg_phashsize_wired != 0);
- /*
- * try to get the readers lock on the address
- * space before taking out the cache element.
- * This ensures as_pagereclaim() can actually
- * call through the address space and free
- * the pages. If we don't get the lock, just
- * skip this entry. The pages will be reclaimed
- * by the segment driver at unmap time.
- */
- if (AS_LOCK_TRYENTER(as, &as->a_lock,
- RW_READER)) {
- hp->p_qlen--;
- pcp->p_hprev->p_hnext = pcp->p_hnext;
- pcp->p_hnext->p_hprev = pcp->p_hprev;
- pcp->p_hprev = delcallb_list;
- delcallb_list = pcp;
- purge_count++;
+ for (hpw = seg_phashtab_wired;
+ hpw < &seg_phashtab_wired[seg_phashsize_wired]; hpw++) {
+
+ if (hpw->p_hnext == (struct seg_pcache *)hpw) {
+ continue;
+ }
+
+ mutex_enter(&hpw->p_hmutex);
+
+ for (pcp = hpw->p_hnext;
+ pcp != (struct seg_pcache *)hpw;
+ pcp = pcp->p_hnext) {
+
+ ASSERT(IS_PCP_WIRED(pcp));
+ ASSERT(pcp->p_hashp ==
+ (struct seg_phash *)hpw);
+
+ if (pcp->p_active) {
+ continue;
}
- } else {
+ pcp->p_hprev->p_hnext = pcp->p_hnext;
+ pcp->p_hnext->p_hprev = pcp->p_hprev;
+ pcp->p_hprev = delcallb_list;
+ delcallb_list = pcp;
+ }
+ mutex_exit(&hpw->p_hmutex);
+ }
+ }
+
+ mutex_enter(&seg_pmem_mtx);
+ if (seg_pathr_on) {
+ mutex_exit(&seg_pmem_mtx);
+ goto runcb;
+ }
+ seg_pathr_on = 1;
+ mutex_exit(&seg_pmem_mtx);
+ ASSERT(seg_pahcur <= 1);
+ hlix = !seg_pahcur;
+
+again:
+ for (hlinkp = seg_pahhead[hlix].p_lnext; hlinkp != &seg_pahhead[hlix];
+ hlinkp = hlnextp) {
+
+ hlnextp = hlinkp->p_lnext;
+ ASSERT(hlnextp != NULL);
+
+ hp = hlink2phash(hlinkp, hlix);
+ if (hp->p_hnext == (struct seg_pcache *)hp) {
+ seg_pathr_empty_ahb++;
+ continue;
+ }
+ seg_pathr_full_ahb++;
+ mutex_enter(&hp->p_hmutex);
+
+ for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
+ pcp = pcp->p_hnext) {
+ pcache_link_t *pheadp;
+ pcache_link_t *plinkp;
+ void *htag0;
+ kmutex_t *pmtx;
+
+ ASSERT(!IS_PCP_WIRED(pcp));
+ ASSERT(pcp->p_hashp == hp);
+
+ if (pcp->p_active) {
+ continue;
+ }
+ if (!force && pcp->p_ref &&
+ PCP_AGE(pcp) < seg_pmax_pcpage) {
pcp->p_ref = 0;
+ continue;
}
- pcp = pcp->p_hnext;
+ plinkp = &pcp->p_plink;
+ htag0 = pcp->p_htag0;
+ if (pcp->p_flags & SEGP_AMP) {
+ pheadp = &((amp_t *)htag0)->a_phead;
+ pmtx = &((amp_t *)htag0)->a_pmtx;
+ } else {
+ pheadp = &((seg_t *)htag0)->s_phead;
+ pmtx = &((seg_t *)htag0)->s_pmtx;
+ }
+ if (!mutex_tryenter(pmtx)) {
+ continue;
+ }
+ ASSERT(pheadp->p_lnext != pheadp);
+ ASSERT(pheadp->p_lprev != pheadp);
+ plinkp->p_lprev->p_lnext =
+ plinkp->p_lnext;
+ plinkp->p_lnext->p_lprev =
+ plinkp->p_lprev;
+ pcp->p_hprev->p_hnext = pcp->p_hnext;
+ pcp->p_hnext->p_hprev = pcp->p_hprev;
+ mutex_exit(pmtx);
+ pcp->p_hprev = delcallb_list;
+ delcallb_list = pcp;
+ npgs_purged += btop(pcp->p_len);
+ }
+ if (hp->p_hnext == (struct seg_pcache *)hp) {
+ seg_premove_abuck(hp, 1);
}
mutex_exit(&hp->p_hmutex);
- if (!force && purge_count > seg_ppcount)
+ if (npgs_purged >= seg_plocked_window) {
break;
+ }
+ if (!force) {
+ if (npgs_purged >= npgs_to_purge) {
+ break;
+ }
+ if (!trim && !(seg_pathr_full_ahb & 15)) {
+ ASSERT(lowmem);
+ if (freemem >= lotsfree + needfree) {
+ break;
+ }
+ }
+ }
}
+ if (hlinkp == &seg_pahhead[hlix]) {
+ /*
+ * We processed the entire hlix active bucket list
+ * but didn't find enough pages to reclaim.
+ * Switch the lists and walk the other list
+ * if we haven't done it yet.
+ */
+ mutex_enter(&seg_pmem_mtx);
+ ASSERT(seg_pathr_on);
+ ASSERT(seg_pahcur == !hlix);
+ seg_pahcur = hlix;
+ mutex_exit(&seg_pmem_mtx);
+ if (++hlinks < 2) {
+ hlix = !hlix;
+ goto again;
+ }
+ } else if ((hlinkp = hlnextp) != &seg_pahhead[hlix] &&
+ seg_pahhead[hlix].p_lnext != hlinkp) {
+ ASSERT(hlinkp != NULL);
+ ASSERT(hlinkp->p_lprev != &seg_pahhead[hlix]);
+ ASSERT(seg_pahhead[hlix].p_lnext != &seg_pahhead[hlix]);
+ ASSERT(seg_pahhead[hlix].p_lprev != &seg_pahhead[hlix]);
+
+ /*
+ * Reinsert the header to point to hlinkp
+	 * so that we start from the hlinkp bucket next time around.
+ */
+ seg_pahhead[hlix].p_lnext->p_lprev = seg_pahhead[hlix].p_lprev;
+ seg_pahhead[hlix].p_lprev->p_lnext = seg_pahhead[hlix].p_lnext;
+ seg_pahhead[hlix].p_lnext = hlinkp;
+ seg_pahhead[hlix].p_lprev = hlinkp->p_lprev;
+ hlinkp->p_lprev->p_lnext = &seg_pahhead[hlix];
+ hlinkp->p_lprev = &seg_pahhead[hlix];
+ }
+
+ mutex_enter(&seg_pmem_mtx);
+ ASSERT(seg_pathr_on);
+ seg_pathr_on = 0;
+ mutex_exit(&seg_pmem_mtx);
+
+runcb:
/*
- * run the delayed callback list. We don't want to hold the
- * cache lock during a call through the address space.
+	 * Run the delayed callback list. Segments/amps can't go away until the
+	 * callback is executed since they must have a non-zero softlockcnt.
+	 * That's why we don't need to hold as/seg/amp locks to execute the
+	 * callback.
*/
while (delcallb_list != NULL) {
- struct as *as;
-
pcp = delcallb_list;
delcallb_list = pcp->p_hprev;
- as = pcp->p_seg->s_as;
-
- PPRINT4("seg_ppurge_all: purge seg %p, addr %p, len %lx, "
- "pplist %p\n", (void *)pcp->p_seg, (void *)pcp->p_addr,
- pcp->p_len, (void *)pcp->p_pp);
-
- as_pagereclaim(as, pcp->p_pp, pcp->p_addr,
- pcp->p_len, pcp->p_rw);
- AS_LOCK_EXIT(as, &as->a_lock);
- npages += pcp->p_len >> PAGESHIFT;
- if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) {
- npages_window += pcp->p_len >> PAGESHIFT;
+ ASSERT(!pcp->p_active);
+ (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
+ pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 1);
+ npages += btop(pcp->p_len);
+ if (!IS_PCP_WIRED(pcp)) {
+ npages_window += btop(pcp->p_len);
}
- kmem_free(pcp, sizeof (struct seg_pcache));
+ kmem_cache_free(seg_pkmcache, pcp);
+ }
+ if (npages) {
+ mutex_enter(&seg_pmem_mtx);
+ ASSERT(seg_plocked >= npages);
+ ASSERT(seg_plocked_window >= npages_window);
+ seg_plocked -= npages;
+ seg_plocked_window -= npages_window;
+ mutex_exit(&seg_pmem_mtx);
}
- mutex_enter(&seg_pmem);
- seg_plocked -= npages;
- seg_plocked_window -= npages_window;
- mutex_exit(&seg_pmem);
}
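
A standalone sketch, with made-up page counts, of the throttling above: decide lowmem from how far freemem has fallen relative to desfree/lotsfree and how big the cached window is, decide trim from the window alone, and size the purge target from the shrink shift capped by the per-pass maximum.

#include <stdio.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))
#define	MAX(a, b)	((a) > (b) ? (a) : (b))

int
main(void)
{
	/* hypothetical values, all in pages */
	long freemem = 20000, needfree = 0, lotsfree = 32000, desfree = 8000;
	long availrmem_initial = 1000000, window = 600000, maxwindow = 700000;
	long shrink_shift = 5, maxapurge = 262144;
	long fmem, to_purge;
	int lowmem = 0, trim = 0;

	if (freemem < lotsfree + needfree) {
		fmem = MAX(freemem - needfree, 0);
		if (fmem <= 5 * (desfree >> 2))
			lowmem = 1;
		else if (fmem <= 7 * (lotsfree >> 3))
			lowmem = (window >= (availrmem_initial >> 1));
		else if (fmem < lotsfree)
			lowmem = (window >= 3 * (availrmem_initial >> 2));
	}
	if (window >= 7 * (maxwindow >> 3))
		trim = 1;
	if (!lowmem && !trim) {
		printf("nothing to purge this pass\n");
		return (0);
	}
	to_purge = window >> shrink_shift;
	to_purge = lowmem ? MIN(to_purge, MAX(maxapurge, desfree)) :
	    MIN(to_purge, maxapurge);
	printf("lowmem=%d trim=%d purge target=%ld pages\n",
	    lowmem, trim, to_purge);
	return (0);
}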
/*
- * Remove cached pages for segment(s) entries from hashtable.
- * The segments are identified by a given clients callback
- * function.
- * This is useful for multiple seg's cached on behalf of
- * dummy segment (ISM/DISM) with common callback function.
- * The clients callback function may return status indicating
- * that the last seg's entry has been purged. In such a case
- * the seg_ppurge_seg() stops searching hashtable and exits.
- * Otherwise all hashtable entries are scanned.
+ * Remove cached pages for segment(s) entries from the hashtable. The
+ * segments are identified by the pp array. This is useful for multiple segs
+ * cached on behalf of a dummy segment (ISM/DISM) with a common pp array.
*/
void
-seg_ppurge_seg(int (*callback)(struct seg *, caddr_t, size_t,
- struct page **, enum seg_rw))
+seg_ppurge_wiredpp(struct page **pp)
{
- struct seg_pcache *pcp, *npcp;
- struct seg_phash *hp;
+ struct seg_pcache *pcp;
+ struct seg_phash_wired *hp;
pgcnt_t npages = 0;
- pgcnt_t npages_window = 0;
- int done = 0;
+ struct seg_pcache *delcallb_list = NULL;
/*
- * if the cache if off or empty, return
+ * if the cache is empty, return
*/
- if (seg_plazy == 0 || seg_plocked == 0) {
+ if (seg_plocked == 0) {
return;
}
- mutex_enter(&seg_pcache);
- seg_pdisable++;
- mutex_exit(&seg_pcache);
-
- for (hp = p_hashtab; hp < &p_hashtab[p_hashsize]; hp++) {
+ ASSERT(seg_phashsize_wired != 0);
+ for (hp = seg_phashtab_wired;
+ hp < &seg_phashtab_wired[seg_phashsize_wired]; hp++) {
+ if (hp->p_hnext == (struct seg_pcache *)hp) {
+ continue;
+ }
mutex_enter(&hp->p_hmutex);
pcp = hp->p_hnext;
while (pcp != (struct seg_pcache *)hp) {
-
+ ASSERT(pcp->p_hashp == (struct seg_phash *)hp);
+ ASSERT(IS_PCP_WIRED(pcp));
/*
* purge entries which are not active
*/
- npcp = pcp->p_hnext;
- if (!pcp->p_active && pcp->p_callback == callback) {
- hp->p_qlen--;
+ if (!pcp->p_active && pcp->p_pp == pp) {
+ ASSERT(pcp->p_htag0 != NULL);
pcp->p_hprev->p_hnext = pcp->p_hnext;
pcp->p_hnext->p_hprev = pcp->p_hprev;
-
- if ((*pcp->p_callback)(pcp->p_seg, pcp->p_addr,
- pcp->p_len, pcp->p_pp, pcp->p_rw)) {
- done = 1;
- }
-
- npages += pcp->p_len >> PAGESHIFT;
- if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) {
- npages_window +=
- pcp->p_len >> PAGESHIFT;
- }
- kmem_free(pcp, sizeof (struct seg_pcache));
+ pcp->p_hprev = delcallb_list;
+ delcallb_list = pcp;
}
- pcp = npcp;
- if (done)
- break;
+ pcp = pcp->p_hnext;
}
mutex_exit(&hp->p_hmutex);
- if (done)
- break;
+ /*
+		 * segments can't go away until the callback is executed
+		 * since they must have a non-zero softlockcnt. That's why
+		 * we don't need to hold as/seg locks to execute the callback.
+ */
+ while (delcallb_list != NULL) {
+ int done;
+ pcp = delcallb_list;
+ delcallb_list = pcp->p_hprev;
+ ASSERT(!pcp->p_active);
+ done = (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
+ pcp->p_len, pcp->p_pp,
+ pcp->p_write ? S_WRITE : S_READ, 1);
+ npages += btop(pcp->p_len);
+ ASSERT(IS_PCP_WIRED(pcp));
+ kmem_cache_free(seg_pkmcache, pcp);
+ if (done) {
+ ASSERT(delcallb_list == NULL);
+ goto out;
+ }
+ }
}
- mutex_enter(&seg_pcache);
- seg_pdisable--;
- mutex_exit(&seg_pcache);
-
- mutex_enter(&seg_pmem);
+out:
+ mutex_enter(&seg_pmem_mtx);
+ ASSERT(seg_plocked >= npages);
seg_plocked -= npages;
- seg_plocked_window -= npages_window;
- mutex_exit(&seg_pmem);
+ mutex_exit(&seg_pmem_mtx);
}
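
[Editor's note] seg_ppurge_wiredpp() above unlinks inactive entries from a bucket while holding the bucket mutex, chains them onto a private delcallb_list through the now-unused p_hprev pointer, and only runs the client callbacks after the lock is dropped. A user-level sketch of that "collect under lock, call back without it" idea (hypothetical names, plain pthreads, not the kernel code) might look like:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* hypothetical stand-in for struct seg_pcache */
struct entry {
	struct entry *next;	/* doubly linked hash-bucket chain */
	struct entry *prev;
	int active;		/* still in use by a lookup */
	int tag;
	void (*callback)(int tag);
};

/* one hash bucket: a self-linked sentinel node plus its own lock */
static struct entry bucket = { &bucket, &bucket, 0, 0, NULL };
static pthread_mutex_t bucket_lock = PTHREAD_MUTEX_INITIALIZER;

static void
report(int tag)
{
	printf("callback for entry %d ran without the bucket lock\n", tag);
}

/*
 * Unlink every inactive entry while the bucket lock is held, chaining the
 * victims through their (now unused) prev pointers, then run the callbacks
 * and free the entries after the lock has been dropped.
 */
static void
purge_bucket(void)
{
	struct entry *ep, *next, *victims = NULL;

	pthread_mutex_lock(&bucket_lock);
	for (ep = bucket.next; ep != &bucket; ep = next) {
		next = ep->next;
		if (ep->active)
			continue;
		ep->prev->next = ep->next;
		ep->next->prev = ep->prev;
		ep->prev = victims;	/* private singly linked list */
		victims = ep;
	}
	pthread_mutex_unlock(&bucket_lock);

	while (victims != NULL) {
		ep = victims;
		victims = ep->prev;
		ep->callback(ep->tag);	/* may block; no locks held */
		free(ep);
	}
}

int
main(void)
{
	for (int i = 0; i < 3; i++) {
		struct entry *ep = malloc(sizeof (*ep));

		if (ep == NULL)
			return (1);
		ep->active = 0;
		ep->tag = i;
		ep->callback = report;
		ep->next = bucket.next;		/* insert at bucket head */
		ep->prev = &bucket;
		bucket.next->prev = ep;
		bucket.next = ep;
	}
	purge_bucket();
	return (0);
}

The point of the two-phase shape is the same as in the patch: callbacks can block, so they must not run under a hash-bucket mutex.
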
/*
@@ -546,55 +1267,99 @@ seg_ppurge_seg(int (*callback)(struct seg *, caddr_t, size_t,
* reclaim the caller needs to hold the right locks.
*/
void
-seg_ppurge(struct seg *seg)
+seg_ppurge(struct seg *seg, struct anon_map *amp, uint_t flags)
{
struct seg_pcache *delcallb_list = NULL;
struct seg_pcache *pcp;
struct seg_phash *hp;
pgcnt_t npages = 0;
- pgcnt_t npages_window = 0;
+ void *htag0;
- if (seg_plazy == 0) {
+ if (seg_plocked == 0) {
return;
}
- hp = &p_hashtab[p_hash(seg)];
- mutex_enter(&hp->p_hmutex);
- pcp = hp->p_hnext;
- while (pcp != (struct seg_pcache *)hp) {
- if (pcp->p_seg == seg) {
+ ASSERT(seg_phashsize_win != 0);
+
+ /*
+	 * If amp is not NULL, use amp as the lookup tag; otherwise use
+	 * seg as the lookup tag.
+ */
+ htag0 = (amp == NULL ? (void *)seg : (void *)amp);
+ ASSERT(htag0 != NULL);
+ if (IS_PFLAGS_WIRED(flags)) {
+ hp = P_HASHBP(seg, htag0, 0, flags);
+ mutex_enter(&hp->p_hmutex);
+ pcp = hp->p_hnext;
+ while (pcp != (struct seg_pcache *)hp) {
+ ASSERT(pcp->p_hashp == hp);
+ ASSERT(IS_PCP_WIRED(pcp));
+ if (pcp->p_htag0 == htag0) {
+ if (pcp->p_active) {
+ break;
+ }
+ pcp->p_hprev->p_hnext = pcp->p_hnext;
+ pcp->p_hnext->p_hprev = pcp->p_hprev;
+ pcp->p_hprev = delcallb_list;
+ delcallb_list = pcp;
+ }
+ pcp = pcp->p_hnext;
+ }
+ mutex_exit(&hp->p_hmutex);
+ } else {
+ pcache_link_t *plinkp;
+ pcache_link_t *pheadp;
+ kmutex_t *pmtx;
+
+ if (amp == NULL) {
+ ASSERT(seg != NULL);
+ pheadp = &seg->s_phead;
+ pmtx = &seg->s_pmtx;
+ } else {
+ pheadp = &amp->a_phead;
+ pmtx = &amp->a_pmtx;
+ }
+ mutex_enter(pmtx);
+ while ((plinkp = pheadp->p_lnext) != pheadp) {
+ pcp = plink2pcache(plinkp);
+ ASSERT(!IS_PCP_WIRED(pcp));
+ ASSERT(pcp->p_htag0 == htag0);
+ hp = pcp->p_hashp;
+ mutex_enter(&hp->p_hmutex);
if (pcp->p_active) {
+ mutex_exit(&hp->p_hmutex);
break;
}
- hp->p_qlen--;
+ ASSERT(plinkp->p_lprev == pheadp);
+ pheadp->p_lnext = plinkp->p_lnext;
+ plinkp->p_lnext->p_lprev = pheadp;
pcp->p_hprev->p_hnext = pcp->p_hnext;
pcp->p_hnext->p_hprev = pcp->p_hprev;
pcp->p_hprev = delcallb_list;
delcallb_list = pcp;
+ if (hp->p_hnext == (struct seg_pcache *)hp) {
+ seg_premove_abuck(hp, 0);
+ }
+ mutex_exit(&hp->p_hmutex);
}
- pcp = pcp->p_hnext;
+ mutex_exit(pmtx);
}
- mutex_exit(&hp->p_hmutex);
while (delcallb_list != NULL) {
pcp = delcallb_list;
delcallb_list = pcp->p_hprev;
-
- PPRINT4("seg_ppurge: purge seg %p, addr %p, len %lx, "
- "pplist %p\n", (void *)seg, (void *)pcp->p_addr,
- pcp->p_len, (void *)pcp->p_pp);
-
- ASSERT(seg == pcp->p_seg);
- (void) (*pcp->p_callback)(seg, pcp->p_addr,
- pcp->p_len, pcp->p_pp, pcp->p_rw);
- npages += pcp->p_len >> PAGESHIFT;
- if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) {
- npages_window += pcp->p_len >> PAGESHIFT;
- }
- kmem_free(pcp, sizeof (struct seg_pcache));
+ ASSERT(!pcp->p_active);
+ (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, pcp->p_len,
+ pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
+ npages += btop(pcp->p_len);
+ kmem_cache_free(seg_pkmcache, pcp);
}
- mutex_enter(&seg_pmem);
+ mutex_enter(&seg_pmem_mtx);
+ ASSERT(seg_plocked >= npages);
seg_plocked -= npages;
- seg_plocked_window -= npages_window;
- mutex_exit(&seg_pmem);
+ if (!IS_PFLAGS_WIRED(flags)) {
+ ASSERT(seg_plocked_window >= npages);
+ seg_plocked_window -= npages;
+ }
+ mutex_exit(&seg_pmem_mtx);
}
static void seg_pinit_mem_config(void);
@@ -606,58 +1371,125 @@ static void
seg_pinit(void)
{
struct seg_phash *hp;
- int i;
- uint_t physmegs;
+ ulong_t i;
+ pgcnt_t physmegs;
- sema_init(&seg_pasync_sem, 0, NULL, SEMA_DEFAULT, NULL);
+ seg_plocked = 0;
+ seg_plocked_window = 0;
- mutex_enter(&seg_pcache);
- if (p_hashtab == NULL) {
- physmegs = physmem >> (20 - PAGESHIFT);
+ if (segpcache_enabled == 0) {
+ seg_phashsize_win = 0;
+ seg_phashsize_wired = 0;
+ seg_pdisabled = 1;
+ return;
+ }
- /* If p_hashsize was not set in /etc/system ... */
- if (p_hashsize == 0) {
- /*
- * Choose p_hashsize based on physmem.
- */
- if (physmegs < 64) {
- p_hashsize = 64;
- } else if (physmegs < 1024) {
- p_hashsize = 1024;
- } else if (physmegs < 10 * 1024) {
- p_hashsize = 8192;
- } else if (physmegs < 20 * 1024) {
- p_hashsize = 2 * 8192;
- seg_pmaxqlen = 16;
- } else {
- p_hashsize = 128 * 1024;
- seg_pmaxqlen = 128;
- }
- }
+ seg_pdisabled = 0;
+ seg_pkmcache = kmem_cache_create("seg_pcache",
+ sizeof (struct seg_pcache), 0, NULL, NULL, NULL, NULL, NULL, 0);
+ if (segpcache_pcp_maxage_ticks <= 0) {
+ segpcache_pcp_maxage_ticks = segpcache_pcp_maxage_sec * hz;
+ }
+ seg_pmax_pcpage = segpcache_pcp_maxage_ticks;
+ seg_pathr_empty_ahb = 0;
+ seg_pathr_full_ahb = 0;
+ seg_pshrink_shift = segpcache_shrink_shift;
+ seg_pmaxapurge_npages = btop(segpcache_maxapurge_bytes);
- p_hashtab = kmem_zalloc(p_hashsize * sizeof (struct seg_phash),
- KM_SLEEP);
- for (i = 0; i < p_hashsize; i++) {
- hp = (struct seg_phash *)&p_hashtab[i];
- hp->p_hnext = (struct seg_pcache *)hp;
- hp->p_hprev = (struct seg_pcache *)hp;
- mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
- }
- if (seg_pwindow == 0) {
- if (physmegs < 24) {
- /* don't use cache */
- seg_plazy = 0;
- } else if (physmegs < 64) {
- seg_pwindow = physmem >> 5; /* 3% of memory */
- } else if (physmegs < 10 * 1024) {
- seg_pwindow = physmem >> 3; /* 12% of memory */
- } else {
- seg_pwindow = physmem >> 1;
- }
+ mutex_init(&seg_pcache_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&seg_pmem_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&seg_pasync_mtx, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&seg_pasync_cv, NULL, CV_DEFAULT, NULL);
+
+ physmegs = physmem >> (20 - PAGESHIFT);
+
+ /*
+	 * If segpcache_hashsize_win was not set in /etc/system, or it has
+	 * an absurd value, set it to a default.
+ */
+ if (segpcache_hashsize_win == 0 || segpcache_hashsize_win > physmem) {
+ /*
+ * Create one bucket per 32K (or at least per 8 pages) of
+ * available memory.
+ */
+ pgcnt_t pages_per_bucket = MAX(btop(32 * 1024), 8);
+ segpcache_hashsize_win = MAX(1024, physmem / pages_per_bucket);
+ }
+ if (!ISP2(segpcache_hashsize_win)) {
+ ulong_t rndfac = ~(1UL <<
+ (highbit(segpcache_hashsize_win) - 1));
+ rndfac &= segpcache_hashsize_win;
+ segpcache_hashsize_win += rndfac;
+ segpcache_hashsize_win = 1 <<
+ (highbit(segpcache_hashsize_win) - 1);
+ }
+ seg_phashsize_win = segpcache_hashsize_win;
+ seg_phashtab_win = kmem_zalloc(
+ seg_phashsize_win * sizeof (struct seg_phash),
+ KM_SLEEP);
+ for (i = 0; i < seg_phashsize_win; i++) {
+ hp = &seg_phashtab_win[i];
+ hp->p_hnext = (struct seg_pcache *)hp;
+ hp->p_hprev = (struct seg_pcache *)hp;
+ mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
+ }
+
+ seg_pahcur = 0;
+ seg_pathr_on = 0;
+ seg_pahhead[0].p_lnext = &seg_pahhead[0];
+ seg_pahhead[0].p_lprev = &seg_pahhead[0];
+ seg_pahhead[1].p_lnext = &seg_pahhead[1];
+ seg_pahhead[1].p_lprev = &seg_pahhead[1];
+
+ /*
+	 * If segpcache_hashsize_wired was not set in /etc/system, or it has
+	 * an absurd value, set it to a default.
+ */
+ if (segpcache_hashsize_wired == 0 ||
+ segpcache_hashsize_wired > physmem / 4) {
+ /*
+ * Choose segpcache_hashsize_wired based on physmem.
+		 * Create a bucket per 128K bytes, up to 256K buckets.
+ */
+ if (physmegs < 20 * 1024) {
+ segpcache_hashsize_wired = MAX(1024, physmegs << 3);
+ } else {
+ segpcache_hashsize_wired = 256 * 1024;
}
}
- mutex_exit(&seg_pcache);
+ if (!ISP2(segpcache_hashsize_wired)) {
+ segpcache_hashsize_wired = 1 <<
+ highbit(segpcache_hashsize_wired);
+ }
+ seg_phashsize_wired = segpcache_hashsize_wired;
+ seg_phashtab_wired = kmem_zalloc(
+ seg_phashsize_wired * sizeof (struct seg_phash_wired), KM_SLEEP);
+ for (i = 0; i < seg_phashsize_wired; i++) {
+ hp = (struct seg_phash *)&seg_phashtab_wired[i];
+ hp->p_hnext = (struct seg_pcache *)hp;
+ hp->p_hprev = (struct seg_pcache *)hp;
+ mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
+ }
+ if (segpcache_maxwindow == 0) {
+ if (physmegs < 64) {
+ /* 3% of memory */
+ segpcache_maxwindow = availrmem >> 5;
+ } else if (physmegs < 512) {
+ /* 12% of memory */
+ segpcache_maxwindow = availrmem >> 3;
+ } else if (physmegs < 1024) {
+ /* 25% of memory */
+ segpcache_maxwindow = availrmem >> 2;
+ } else if (physmegs < 2048) {
+ /* 50% of memory */
+ segpcache_maxwindow = availrmem >> 1;
+ } else {
+ /* no limit */
+ segpcache_maxwindow = (pgcnt_t)-1;
+ }
+ }
+ seg_pmaxwindow = segpcache_maxwindow;
seg_pinit_mem_config();
}
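
[Editor's note] seg_pinit() forces both hash sizes to powers of two so bucket selection can mask instead of divide: the window hash size is rounded to the nearest power of two, the wired hash size is rounded up. A stand-alone sketch of that rounding, using a portable loop in place of the kernel's highbit() (v > 0 assumed throughout), could be:

#include <stdio.h>

/*
 * Position of the highest set bit, counting from 1 (so highbit(1) == 1),
 * mirroring the semantics of the kernel's highbit().  Returns 0 for 0.
 */
static int
highbit(unsigned long v)
{
	int h = 0;

	while (v != 0) {
		v >>= 1;
		h++;
	}
	return (h);
}

/* round to the nearest power of two, as the window hash size is adjusted */
static unsigned long
round_nearest_p2(unsigned long v)
{
	unsigned long rem = v & ~(1UL << (highbit(v) - 1));

	v += rem;		/* pushes past the top bit iff rem >= half */
	return (1UL << (highbit(v) - 1));
}

/* round up to a power of two, as the wired hash size is adjusted */
static unsigned long
round_up_p2(unsigned long v)
{
	if ((v & (v - 1)) == 0)		/* already a power of two */
		return (v);
	return (1UL << highbit(v));
}

int
main(void)
{
	printf("nearest: 1500 -> %lu, 1100 -> %lu\n",
	    round_nearest_p2(1500), round_nearest_p2(1100));
	printf("round up: 1500 -> %lu\n", round_up_p2(1500));
	return (0);
}

For example 1500 rounds to 1024 (it is 476 away from 1024 but 548 from 2048), while the round-up variant yields 2048.
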
@@ -668,15 +1500,23 @@ void
seg_preap(void)
{
/*
- * if the cache if off or empty, return
+ * if the cache is off or empty, return
*/
- if (seg_plocked == 0 || seg_plazy == 0) {
+ if (seg_plocked_window == 0) {
return;
}
- sema_v(&seg_pasync_sem);
-}
+ ASSERT(seg_phashsize_win != 0);
-static void seg_pupdate(void *);
+ /*
+	 * If somebody is already purging the pcache,
+	 * just return.
+ */
+ if (seg_pdisabled) {
+ return;
+ }
+
+ cv_signal(&seg_pasync_cv);
+}
/*
 * run as a background thread and reclaim pagelock
@@ -686,42 +1526,30 @@ void
seg_pasync_thread(void)
{
callb_cpr_t cpr_info;
- kmutex_t pasync_lock; /* just for CPR stuff */
- mutex_init(&pasync_lock, NULL, MUTEX_DEFAULT, NULL);
+ if (seg_phashsize_win == 0) {
+ thread_exit();
+ /*NOTREACHED*/
+ }
+
+ seg_pasync_thr = curthread;
- CALLB_CPR_INIT(&cpr_info, &pasync_lock, callb_generic_cpr,
- "seg_pasync");
+ CALLB_CPR_INIT(&cpr_info, &seg_pasync_mtx,
+ callb_generic_cpr, "seg_pasync");
- if (seg_preap_interval == 0) {
- seg_preap_interval = seg_preap_time * hz;
- } else {
- seg_preap_interval *= hz;
- }
- if (seg_plazy && seg_pupdate_active) {
- (void) timeout(seg_pupdate, NULL, seg_preap_interval);
+ if (segpcache_reap_ticks <= 0) {
+ segpcache_reap_ticks = segpcache_reap_sec * hz;
}
+ mutex_enter(&seg_pasync_mtx);
for (;;) {
- mutex_enter(&pasync_lock);
CALLB_CPR_SAFE_BEGIN(&cpr_info);
- mutex_exit(&pasync_lock);
- sema_p(&seg_pasync_sem);
- mutex_enter(&pasync_lock);
- CALLB_CPR_SAFE_END(&cpr_info, &pasync_lock);
- mutex_exit(&pasync_lock);
-
- seg_ppurge_all(0);
- }
-}
-
-static void
-seg_pupdate(void *dummy)
-{
- sema_v(&seg_pasync_sem);
-
- if (seg_plazy && seg_pupdate_active) {
- (void) timeout(seg_pupdate, dummy, seg_preap_interval);
+ (void) cv_timedwait(&seg_pasync_cv, &seg_pasync_mtx,
+ lbolt + segpcache_reap_ticks);
+ CALLB_CPR_SAFE_END(&cpr_info, &seg_pasync_mtx);
+ if (seg_pdisabled == 0) {
+ seg_ppurge_async(0);
+ }
}
}
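
[Editor's note] The timeout()/semaphore arrangement is replaced by a single condition variable: seg_preap() nudges the async thread with cv_signal(), and the thread otherwise wakes itself every segpcache_reap_ticks via cv_timedwait(). A user-level analogue of that producer/reaper shape (hypothetical names, pthreads standing in for kernel CVs) is sketched below:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static pthread_mutex_t reap_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t reap_cv = PTHREAD_COND_INITIALIZER;
static int stop;

/* stand-in for seg_ppurge_async(): just report why we woke up */
static void
purge(const char *why)
{
	printf("reaper pass (%s)\n", why);
}

/*
 * Reaper loop: sleep until either someone signals the CV (the seg_preap()
 * analogue) or the periodic timeout expires, then do one purge pass.
 */
static void *
reaper(void *arg)
{
	(void) arg;
	pthread_mutex_lock(&reap_lock);
	while (!stop) {
		struct timespec deadline;

		clock_gettime(CLOCK_REALTIME, &deadline);
		deadline.tv_sec += 1;	/* segpcache_reap_ticks analogue */
		int rc = pthread_cond_timedwait(&reap_cv, &reap_lock,
		    &deadline);
		if (stop)
			break;
		purge(rc == ETIMEDOUT ? "timeout" : "signalled");
	}
	pthread_mutex_unlock(&reap_lock);
	return (NULL);
}

/* seg_preap() analogue: wake the reaper without waiting for it */
static void
poke_reaper(void)
{
	pthread_mutex_lock(&reap_lock);
	pthread_cond_signal(&reap_cv);
	pthread_mutex_unlock(&reap_lock);
}

int
main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, reaper, NULL);
	sleep(2);		/* let a timed wakeup happen */
	poke_reaper();		/* force an immediate pass */
	sleep(1);

	pthread_mutex_lock(&reap_lock);
	stop = 1;
	pthread_cond_signal(&reap_cv);
	pthread_mutex_unlock(&reap_lock);
	pthread_join(tid, NULL);
	return (0);
}
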
@@ -735,8 +1563,8 @@ seg_init(void)
{
kstat_t *ksp;
- seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg), 0,
- NULL, NULL, NULL, NULL, NULL, 0);
+ seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg),
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
ksp = kstat_create("unix", 0, "segadvstat", "vm", KSTAT_TYPE_NAMED,
segadvstat_ndata, KSTAT_FLAG_VIRTUAL);
@@ -776,6 +1604,9 @@ seg_alloc(struct as *as, caddr_t base, size_t size)
new->s_data = NULL;
new->s_szc = 0;
new->s_flags = 0;
+ mutex_init(&new->s_pmtx, NULL, MUTEX_DEFAULT, NULL);
+ new->s_phead.p_lnext = &new->s_phead;
+ new->s_phead.p_lprev = &new->s_phead;
if (seg_attach(as, segbase, segsize, new) < 0) {
kmem_cache_free(seg_cache, new);
return ((struct seg *)NULL);
@@ -857,6 +1688,9 @@ seg_free(struct seg *seg)
if (seg->s_data != NULL)
SEGOP_FREE(seg);
+ mutex_destroy(&seg->s_pmtx);
+ ASSERT(seg->s_phead.p_lnext == &seg->s_phead);
+ ASSERT(seg->s_phead.p_lprev == &seg->s_phead);
kmem_cache_free(seg_cache, seg);
}
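
[Editor's note] seg_alloc() now self-links the new s_phead sentinel and seg_free() asserts that it is empty again before the segment returns to the cache, i.e. every pcache entry hanging off the segment must already have been purged. The sentinel idiom itself, stripped to a user-level sketch with hypothetical names, is:

#include <assert.h>
#include <stdio.h>

/* pcache_link_t-style embedded list link */
struct plink {
	struct plink *next;
	struct plink *prev;
};

/* self-link the sentinel: an empty circular list points at itself */
static void
plist_init(struct plink *head)
{
	head->next = head;
	head->prev = head;
}

static int
plist_empty(const struct plink *head)
{
	return (head->next == head);
}

static void
plist_insert_head(struct plink *head, struct plink *lp)
{
	lp->next = head->next;
	lp->prev = head;
	head->next->prev = lp;
	head->next = lp;
}

static void
plist_remove(struct plink *lp)
{
	lp->prev->next = lp->next;
	lp->next->prev = lp->prev;
	lp->next = lp->prev = lp;
}

int
main(void)
{
	struct plink head, a;

	plist_init(&head);		/* seg_alloc() analogue */
	plist_insert_head(&head, &a);	/* entry cached against the segment */
	assert(!plist_empty(&head));
	plist_remove(&a);		/* entry purged */
	assert(plist_empty(&head));	/* seg_free() ASSERTs exactly this */
	printf("sentinel list empty again: ok\n");
	return (0);
}
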
@@ -872,10 +1706,10 @@ seg_p_mem_config_post_add(
void
seg_p_enable(void)
{
- mutex_enter(&seg_pcache);
- ASSERT(seg_pdisable != 0);
- seg_pdisable--;
- mutex_exit(&seg_pcache);
+ mutex_enter(&seg_pcache_mtx);
+ ASSERT(seg_pdisabled != 0);
+ seg_pdisabled--;
+ mutex_exit(&seg_pcache_mtx);
}
/*
@@ -890,18 +1724,19 @@ seg_p_disable(void)
pgcnt_t old_plocked;
int stall_count = 0;
- mutex_enter(&seg_pcache);
- seg_pdisable++;
- ASSERT(seg_pdisable != 0);
- mutex_exit(&seg_pcache);
+ mutex_enter(&seg_pcache_mtx);
+ seg_pdisabled++;
+ ASSERT(seg_pdisabled != 0);
+ mutex_exit(&seg_pcache_mtx);
/*
* Attempt to empty the cache. Terminate if seg_plocked does not
* diminish with SEGP_STALL_THRESHOLD consecutive attempts.
*/
while (seg_plocked != 0) {
+ ASSERT(seg_phashsize_win != 0);
old_plocked = seg_plocked;
- seg_ppurge_all(1);
+ seg_ppurge_async(1);
if (seg_plocked == old_plocked) {
if (stall_count++ > SEGP_STALL_THRESHOLD) {
return (SEGP_FAIL);
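
[Editor's note] seg_p_disable() keeps purging as long as seg_plocked keeps dropping and gives up only after SEGP_STALL_THRESHOLD consecutive passes make no progress. Reduced to a stand-alone sketch with made-up numbers (and without the kernel's delay() between passes), the control flow is roughly:

#include <stdio.h>

#define	STALL_THRESHOLD	4	/* SEGP_STALL_THRESHOLD analogue */

static long locked = 10;	/* stand-in for seg_plocked */

/* fake purge: frees a little for the first few passes, then stalls */
static void
purge_pass(int pass)
{
	if (pass < 3)
		locked -= 3;
	if (locked < 0)
		locked = 0;
}

int
main(void)
{
	int pass = 0, stall_count = 0;

	while (locked != 0) {
		long old_locked = locked;

		purge_pass(pass++);
		if (locked == old_locked) {
			/* no progress this pass */
			if (++stall_count > STALL_THRESHOLD) {
				printf("giving up with %ld still locked\n",
				    locked);
				return (1);
			}
		} else {
			stall_count = 0;	/* progress resets the count */
		}
	}
	printf("cache fully drained\n");
	return (0);
}
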
@@ -918,7 +1753,7 @@ seg_p_disable(void)
* Attempt to purge seg_pcache. May need to return before this has
* completed to allow other pre_del callbacks to unlock pages. This is
* ok because:
- * 1) The seg_pdisable flag has been set so at least we won't
+ * 1) The seg_pdisabled flag has been set so at least we won't
 * cache any more locks and the locks we couldn't purge
* will not be held if they do get released by a subsequent
* pre-delete callback.
@@ -934,6 +1769,9 @@ seg_p_mem_config_pre_del(
void *arg,
pgcnt_t delta_pages)
{
+ if (seg_phashsize_win == 0) {
+ return (0);
+ }
if (seg_p_disable() != SEGP_SUCCESS)
cmn_err(CE_NOTE,
"!Pre-delete couldn't purge"" pagelock cache - continuing");
@@ -947,6 +1785,9 @@ seg_p_mem_config_post_del(
pgcnt_t delta_pages,
int cancelled)
{
+ if (seg_phashsize_win == 0) {
+ return;
+ }
seg_p_enable();
}
@@ -971,9 +1812,6 @@ seg_pinit_mem_config(void)
ASSERT(ret == 0);
}
-extern struct seg_ops segvn_ops;
-extern struct seg_ops segspt_shmops;
-
/*
 * Verify that the segment is not a shared anonymous segment which reserves
 * swap. zone.max-swap accounting (zone->zone_max_swap) cannot be transferred
diff --git a/usr/src/uts/sfmmu/vm/hat_sfmmu.c b/usr/src/uts/sfmmu/vm/hat_sfmmu.c
index 9c7e706005..378839bf11 100644
--- a/usr/src/uts/sfmmu/vm/hat_sfmmu.c
+++ b/usr/src/uts/sfmmu/vm/hat_sfmmu.c
@@ -7379,28 +7379,23 @@ hat_pagesync(struct page *pp, uint_t clearflag)
return (PP_GENERIC_ATTR(pp));
}
- if ((clearflag == (HAT_SYNC_STOPON_REF | HAT_SYNC_DONTZERO)) &&
- PP_ISREF(pp)) {
- return (PP_GENERIC_ATTR(pp));
- }
-
- if ((clearflag == (HAT_SYNC_STOPON_MOD | HAT_SYNC_DONTZERO)) &&
- PP_ISMOD(pp)) {
- return (PP_GENERIC_ATTR(pp));
- }
-
- if ((clearflag & HAT_SYNC_STOPON_SHARED) != 0 &&
- (pp->p_share > po_share) &&
- !(clearflag & HAT_SYNC_ZERORM)) {
- hat_page_setattr(pp, P_REF);
- return (PP_GENERIC_ATTR(pp));
+ if ((clearflag & HAT_SYNC_ZERORM) == 0) {
+ if ((clearflag & HAT_SYNC_STOPON_REF) && PP_ISREF(pp)) {
+ return (PP_GENERIC_ATTR(pp));
+ }
+ if ((clearflag & HAT_SYNC_STOPON_MOD) && PP_ISMOD(pp)) {
+ return (PP_GENERIC_ATTR(pp));
+ }
+ if (clearflag & HAT_SYNC_STOPON_SHARED) {
+ if (pp->p_share > po_share) {
+ hat_page_setattr(pp, P_REF);
+ return (PP_GENERIC_ATTR(pp));
+ }
+ stop_on_sh = 1;
+ shcnt = 0;
+ }
}
- if ((clearflag & HAT_SYNC_STOPON_SHARED) &&
- !(clearflag & HAT_SYNC_ZERORM)) {
- stop_on_sh = 1;
- shcnt = 0;
- }
clearflag &= ~HAT_SYNC_STOPON_SHARED;
pml = sfmmu_mlist_enter(pp);
index = PP_MAPINDEX(pp);
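
[Editor's note] The reworked hat_pagesync() entry check honors the HAT_SYNC_STOPON_* short-circuits only when HAT_SYNC_ZERORM is not set, since a clear-ref/mod request must always walk the mappings. A condensed sketch of that gating, with toy flag values standing in for the HAT_SYNC_* bits (not the sfmmu code), is:

#include <stdio.h>

/* toy flag values, standing in for the HAT_SYNC_* bits */
#define	SYNC_ZERORM		0x01	/* must clear ref/mod, cannot stop early */
#define	SYNC_STOPON_REF		0x02
#define	SYNC_STOPON_MOD		0x04

struct toy_page {
	int referenced;
	int modified;
};

/*
 * Return 1 if the sync can stop before walking any mappings: only allowed
 * when the caller is not asking us to clear the bits (ZERORM unset).
 */
static int
can_stop_early(const struct toy_page *pp, unsigned flags)
{
	if (flags & SYNC_ZERORM)
		return (0);
	if ((flags & SYNC_STOPON_REF) && pp->referenced)
		return (1);
	if ((flags & SYNC_STOPON_MOD) && pp->modified)
		return (1);
	return (0);
}

int
main(void)
{
	struct toy_page pp = { 1, 0 };

	printf("STOPON_REF only:        %d\n",
	    can_stop_early(&pp, SYNC_STOPON_REF));
	printf("STOPON_REF with ZERORM: %d\n",
	    can_stop_early(&pp, SYNC_STOPON_REF | SYNC_ZERORM));
	return (0);
}
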