author     Jerry Jelinek <jerry.jelinek@joyent.com>  2012-01-24 21:57:33 +0000
committer  Jerry Jelinek <jerry.jelinek@joyent.com>  2012-01-24 23:59:11 +0000
commit     0db35aa3d41c1fbc653619952786a5be0fd6f19f (patch)
tree       6ffab188d0b1db66de22e0e6d052b25a1713adcf
parent     045fa6fc0836f3def18745b617bd1878838eca03 (diff)
download   illumos-joyent-0db35aa3d41c1fbc653619952786a5be0fd6f19f.tar.gz
OS-881 To work around OS-580, add support to only invalidate mappings from a single process
-rw-r--r--  usr/src/cmd/rcap/rcapd/rcapd_scanner.c |   3
-rw-r--r--  usr/src/cmd/truss/print.c              |   9
-rw-r--r--  usr/src/cmd/zoneadmd/mcap.c            | 183
-rw-r--r--  usr/src/uts/common/sys/buf.h           |   8
-rw-r--r--  usr/src/uts/common/sys/mman.h          |   2
-rw-r--r--  usr/src/uts/common/syscall/memcntl.c   |   9
-rw-r--r--  usr/src/uts/common/vm/hat.h            |   2
-rw-r--r--  usr/src/uts/common/vm/seg_vn.c         |  12
-rw-r--r--  usr/src/uts/common/vm/vm_pvn.c         |  28
-rw-r--r--  usr/src/uts/common/vm/vm_usage.c       |  16
-rw-r--r--  usr/src/uts/i86pc/vm/hat_i86.c         |  44
11 files changed, 190 insertions, 126 deletions
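
In userland terms, this commit extends memcntl(2)'s MC_SYNC command with a new MS_INVALCURPROC flag. A minimal sketch of a caller, assuming the definitions added below in sys/mman.h (the helper name pageout_self is illustrative; error handling elided):

#include <sys/types.h>
#include <sys/mman.h>

/*
 * Ask the kernel to asynchronously page out [addr, addr + len),
 * invalidating the mappings only for the calling process.  Unlike
 * MS_INVALIDATE, other processes mapping the same pages keep their
 * translations.
 */
static int
pageout_self(caddr_t addr, size_t len)
{
	return (memcntl(addr, len, MC_SYNC,
	    (caddr_t)(MS_ASYNC | MS_INVALCURPROC), 0, 0));
}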
diff --git a/usr/src/cmd/rcap/rcapd/rcapd_scanner.c b/usr/src/cmd/rcap/rcapd/rcapd_scanner.c
index b39811b552..254bb9e922 100644
--- a/usr/src/cmd/rcap/rcapd/rcapd_scanner.c
+++ b/usr/src/cmd/rcap/rcapd/rcapd_scanner.c
@@ -21,6 +21,7 @@
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2012 Joyent, Inc. All rights reserved.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
@@ -551,7 +552,7 @@ pageout(pid_t pid, struct ps_prochandle *Pr, caddr_t start, caddr_t end)
errno = 0;
res = pr_memcntl(Pr, start, (end - start), MC_SYNC,
- (caddr_t)(MS_ASYNC | MS_INVALIDATE), 0, 0);
+ (caddr_t)(MS_ASYNC | MS_INVALCURPROC), 0, 0);
debug_high("pr_memcntl [%p-%p): %d", (void *)start, (void *)end, res);
/*
diff --git a/usr/src/cmd/truss/print.c b/usr/src/cmd/truss/print.c
index 49d6da39f9..a8c923fae2 100644
--- a/usr/src/cmd/truss/print.c
+++ b/usr/src/cmd/truss/print.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2011, 2012, Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -825,7 +825,9 @@ prt_mc4(private_t *pri, int raw, long val) /* print memcntl() (4th) argument */
return;
case MC_SYNC:
- if ((val & ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE)) == 0) {
+ if ((val &
+ ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE|MS_INVALCURPROC))
+ == 0) {
*(s = pri->code_buf) = '\0';
if (val & MS_SYNC)
(void) strlcat(s, "|MS_SYNC", CBSIZE);
@@ -834,6 +836,9 @@ prt_mc4(private_t *pri, int raw, long val) /* print memcntl() (4th) argument */
if (val & MS_INVALIDATE)
(void) strlcat(s, "|MS_INVALIDATE",
CBSIZE);
+ if (val & MS_INVALCURPROC)
+ (void) strlcat(s, "|MS_INVALCURPROC",
+ CBSIZE);
}
break;
diff --git a/usr/src/cmd/zoneadmd/mcap.c b/usr/src/cmd/zoneadmd/mcap.c
index 9e0fba65e2..d52eec9c97 100644
--- a/usr/src/cmd/zoneadmd/mcap.c
+++ b/usr/src/cmd/zoneadmd/mcap.c
@@ -40,7 +40,7 @@
* checks that against the zone's zone.max-physical-memory rctl. Once the
* zone goes over its cap, then this thread will work through the zone's
* /proc process list, Pgrab-bing each process and stepping through the
- * address space segments attempting to use pr_memcntl(...MS_INVALIDATE...)
+ * address space segments attempting to use pr_memcntl(...MS_INVALCURPROC...)
* to pageout pages, until the zone is again under its cap.
*
* Although zone memory capping is implemented as a soft cap by this user-level
@@ -56,21 +56,14 @@
* the thread will work to pageout until the zone is under the cap, as shown
* by updated vm_usage data.
*
- * There are a couple of interfaces (xmap, pagedata) in proc(4) that can be
- * used to examine a processes mapped segments while we are trying to pageout.
- * The observed xmap segement size data is frequently smaller than the
- * pagedata segement size data, so it is less effective in practice. Thus we
- * use pagedata to determine the size of each segment.
- *
- * The pagedata page maps (at least on x86) are not useful. Those flags
+ * NOTE: The pagedata page maps (at least on x86) are not useful. Those flags
* are set by hrm_setbits() and on x86 that code path is only executed by
* segvn_pagelock -> hat_setstat -> hrm_setbits
* segvn_softunlock -^
* On SPARC there is an additional code path which may make this data
* useful (sfmmu_ttesync), but since it is not generic, we ignore the page
- * maps and only use the segement info from pagedata. If we ever fix this
- * issue, then we could generalize this mcap code to do more with the data on
- * active pages.
+ * maps. If we ever fix this issue, then we could generalize this mcap code to
+ * do more with the data on active pages.
*
* For debugging, touch the file {zonepath}/mcap_debug.log. This will
* cause the thread to start logging its actions into that file (it may take
@@ -124,7 +117,6 @@ static cond_t shutdown_cv;
static int shutting_down = 0;
static thread_t mcap_tid;
static FILE *debug_log_fp = NULL;
-static uint64_t sum_pageout = 0; /* total bytes paged out in a pass */
static uint64_t zone_rss_cap; /* RSS cap(KB) */
static char over_cmd[2 * BUFSIZ]; /* same size as zone_attr_value */
@@ -135,13 +127,7 @@ static char over_cmd[2 * BUFSIZ]; /* same size as zone_attr_value */
typedef struct {
int pr_curr; /* the # of the mapping we're working on */
int pr_nmap; /* number of mappings in address space */
- int pr_cnt; /* number of mappings processed */
-
- prpageheader_t *pr_pghp; /* process's complete pagedata */
- prasmap_t *pr_asp; /* current address space pointer */
-
- uintptr_t pr_addr; /* base of mapping */
- uint64_t pr_size; /* size of mapping */
+ prxmap_t *pr_xmapp; /* process's xmap array */
} proc_map_t;
typedef struct zsd_vmusage64 {
@@ -293,40 +279,21 @@ control_proc(pid_t pid)
}
/*
- * Get data from the current prasmap_t and advance pr_asp to the next
- * asmap in the pagedata.
+ * Get the next mapping.
*/
-static uintptr_t
+static prxmap_t *
nextmapping(proc_map_t *pmp)
{
- prasmap_t *pap;
- void *pdp; /* per-page data pointer */
-
- pmp->pr_curr++;
- if (pmp->pr_curr > pmp->pr_nmap)
+ if (pmp->pr_xmapp == NULL || pmp->pr_curr >= pmp->pr_nmap)
return (NULL);
- pap = pmp->pr_asp;
-
- pmp->pr_addr = pap->pr_vaddr;
- pmp->pr_size = pap->pr_npage * pap->pr_pagesize;
- pmp->pr_cnt++;
-
- /* Advance the pr_asp pointer to the next asmap */
- pdp = pap + 1;
- pdp = (caddr_t)(uintptr_t)((uintptr_t)pdp + pap->pr_npage);
-
- /* Skip to next 64-bit-aligned address to get the next prasmap_t. */
- pdp = (caddr_t)(((uintptr_t)pdp + 7) & ~7);
- pmp->pr_asp = (prasmap_t *)pdp;
-
- return (pmp->pr_addr);
+ return (&pmp->pr_xmapp[pmp->pr_curr++]);
}
/*
* Initialize the proc_map_t to access the first mapping of an address space.
*/
-static void *
+static prxmap_t *
init_map(proc_map_t *pmp, pid_t pid)
{
int fd;
@@ -337,39 +304,37 @@ init_map(proc_map_t *pmp, pid_t pid)
bzero(pmp, sizeof (proc_map_t));
pmp->pr_nmap = -1;
- (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/pagedata", zoneproc,
- pid);
+ (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/xmap", zoneproc, pid);
if ((fd = open(pathbuf, O_RDONLY, 0)) < 0)
return (NULL);
redo:
errno = 0;
if (fstat(fd, &st) != 0)
- return (NULL);
+ goto done;
- if ((pmp->pr_pghp = malloc(st.st_size)) == NULL) {
- debug("cannot malloc() %ld bytes for pagedata", st.st_size);
- return (NULL);
+ if ((pmp->pr_xmapp = malloc(st.st_size)) == NULL) {
+ debug("cannot malloc() %ld bytes for xmap", st.st_size);
+ goto done;
}
- (void) bzero(pmp->pr_pghp, st.st_size);
+ (void) bzero(pmp->pr_xmapp, st.st_size);
errno = 0;
- if ((res = read(fd, pmp->pr_pghp, st.st_size)) != st.st_size) {
- free(pmp->pr_pghp);
- pmp->pr_pghp = NULL;
+ if ((res = read(fd, pmp->pr_xmapp, st.st_size)) != st.st_size) {
+ free(pmp->pr_xmapp);
+ pmp->pr_xmapp = NULL;
if (res > 0 || errno == E2BIG) {
goto redo;
} else {
- debug("pid %ld cannot read pagedata\n", pid);
- return (NULL);
+ debug("pid %ld cannot read xmap\n", pid);
+ goto done;
}
}
- pmp->pr_nmap = pmp->pr_pghp->pr_nmap;
- pmp->pr_asp = (prasmap_t *)(pmp->pr_pghp + 1);
+ pmp->pr_nmap = st.st_size / sizeof (prxmap_t);
done:
(void) close(fd);
- return ((void *)nextmapping(pmp));
+ return (nextmapping(pmp));
}
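
The xmap file read above is a flat array of prxmap_t structures, so the mapping count falls directly out of the file size. A self-contained sketch of that pattern, under the same assumptions as init_map() (the helper name read_xmap is illustrative; pread() avoids the offset bookkeeping the retry would otherwise need):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <procfs.h>

static prxmap_t *
read_xmap(pid_t pid, int *nmapp)
{
	char path[64];
	struct stat st;
	prxmap_t *xmap;
	ssize_t res;
	int fd;

	(void) snprintf(path, sizeof (path), "/proc/%d/xmap", (int)pid);
	if ((fd = open(path, O_RDONLY)) < 0)
		return (NULL);
redo:
	if (fstat(fd, &st) != 0 || (xmap = malloc(st.st_size)) == NULL) {
		(void) close(fd);
		return (NULL);
	}
	if ((res = pread(fd, xmap, st.st_size, 0)) != st.st_size) {
		free(xmap);
		/* The address space grew between fstat() and pread(). */
		if (res > 0 || errno == E2BIG)
			goto redo;
		(void) close(fd);
		return (NULL);
	}
	*nmapp = (int)(st.st_size / sizeof (prxmap_t));
	(void) close(fd);
	return (xmap);
}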
/*
@@ -377,13 +342,24 @@ done:
* return nonzero if not all of the pages may be pageable, for any reason.
*/
static int
-pageout_mapping(struct ps_prochandle *Pr, proc_map_t *pmp)
+pageout_mapping(struct ps_prochandle *Pr, prxmap_t *pmp)
{
int res;
+ /*
+ * We particularly want to avoid the pr_memcntl on anonymous mappings
+ * which show 0 since that will pull them back off of the free list
+ * and increase the zone's RSS, even though the process itself has
+ * them freed up.
+ */
+ if (pmp->pr_mflags & MA_ANON && pmp->pr_anon == 0)
+ return (0);
+ else if (pmp->pr_mflags & MA_ISM || pmp->pr_mflags & MA_SHM)
+ return (0);
+
errno = 0;
- res = pr_memcntl(Pr, (caddr_t)pmp->pr_addr, pmp->pr_size, MC_SYNC,
- (caddr_t)(MS_ASYNC | MS_INVALIDATE), 0, 0);
+ res = pr_memcntl(Pr, (caddr_t)pmp->pr_vaddr, pmp->pr_size, MC_SYNC,
+ (caddr_t)(MS_ASYNC | MS_INVALCURPROC), 0, 0);
/*
* EBUSY indicates none of the pages have backing store allocated, or
@@ -423,7 +399,7 @@ static int64_t
pageout_process(pid_t pid, int64_t excess)
{
int psfd;
- void *praddr;
+ prxmap_t *pxmap;
proc_map_t cur;
struct ps_prochandle *ph = NULL;
int unpageable_mappings;
@@ -433,7 +409,6 @@ pageout_process(pid_t pid, int64_t excess)
int incr_rss_check = 0;
char pathbuf[MAXPATHLEN];
- cur.pr_pghp = NULL;
(void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo", zoneproc,
pid);
if ((psfd = open(pathbuf, O_RDONLY, 0000)) < 0)
@@ -459,11 +434,11 @@ pageout_process(pid_t pid, int64_t excess)
}
/* Get segment residency information. */
- praddr = init_map(&cur, pid);
+ pxmap = init_map(&cur, pid);
/* Skip process if it has no mappings. */
- if (cur.pr_pghp == NULL) {
- debug("%ld: pagedata unreadable; ignoring\n", pid);
+ if (pxmap == NULL) {
+ debug("%ld: xmap unreadable; ignoring\n", pid);
goto done;
}
@@ -489,15 +464,15 @@ pageout_process(pid_t pid, int64_t excess)
*/
sum_att = sum_d_rss = 0;
unpageable_mappings = 0;
- while (excess > 0 && praddr != NULL && !shutting_down) {
+ while (excess > 0 && pxmap != NULL && !shutting_down) {
/* Try to page out the mapping. */
- if (pageout_mapping(ph, &cur) < 0) {
+ if (pageout_mapping(ph, pxmap) < 0) {
debug("pid %ld: exited or unpageable\n", pid);
break;
}
/* attempted is the size of the mapping */
- sum_att += (cur.pr_size / 1024);
+ sum_att += pxmap->pr_size / 1024;
/*
* This process's RSS is potentially enough to clear the
@@ -519,11 +494,10 @@ pageout_process(pid_t pid, int64_t excess)
} else {
excess += d_rss;
sum_d_rss += d_rss;
- sum_pageout += (-d_rss * 1024);
}
}
- praddr = (void *)nextmapping(&cur);
+ pxmap = nextmapping(&cur);
}
if (!incr_rss_check) {
@@ -531,12 +505,11 @@ pageout_process(pid_t pid, int64_t excess)
if (d_rss < 0) {
excess += d_rss;
sum_d_rss += d_rss;
- sum_pageout += (-d_rss * 1024);
}
}
- debug("pid %ld: map %d unp %d att %lluKB drss %lldKB excess %lldKB\n",
- pid, cur.pr_cnt, unpageable_mappings, (unsigned long long)sum_att,
+ debug("pid %ld: unp %d att %lluKB drss %lldKB excess %lldKB\n",
+ pid, unpageable_mappings, (unsigned long long)sum_att,
(unsigned long long)sum_d_rss, (long long)excess);
done:
@@ -546,8 +519,8 @@ done:
(void) Prelease(ph, 0);
}
- if (cur.pr_pghp != NULL)
- free(cur.pr_pghp);
+ if (cur.pr_xmapp != NULL)
+ free(cur.pr_xmapp);
(void) close(psfd);
@@ -680,12 +653,13 @@ get_zone_cap()
* is important considering that each zone will be monitoring its rss.
*/
static int64_t
-check_suspend(int age)
+check_suspend(int age, boolean_t new_cycle)
{
static hrtime_t last_cap_read = 0;
static uint64_t addon;
static uint64_t lo_thresh; /* Thresholds for how long to sleep */
static uint64_t hi_thresh; /* when under the cap (80% & 90%). */
+ static uint64_t prev_zone_rss = 0;
/* Wait a second to give the async pageout a chance to catch up. */
(void) sleep_shutdown(1);
@@ -742,16 +716,6 @@ check_suspend(int age)
continue;
}
- /*
- * If we did some paging out since our last invocation then
- * update the kstat so we can track how much was paged out.
- */
- if (sum_pageout != 0) {
- (void) zone_setattr(zid, ZONE_ATTR_PMCAP_PAGEOUT,
- &sum_pageout, 0);
- sum_pageout = 0;
- }
-
zone_rss = get_mem_info(age);
/* calculate excess */
@@ -760,18 +724,41 @@ check_suspend(int age)
debug("rss %lluKB, cap %lluKB, excess %lldKB\n",
zone_rss, zone_rss_cap, new_excess);
+ /*
+ * If necessary, update stats.
+ */
+
+ /*
+ * If it looks like we did some paging out since last over the
+ * cap then update the kstat so we can approximate how much was
+ * paged out.
+ */
+ if (prev_zone_rss > zone_rss_cap && zone_rss < prev_zone_rss) {
+ uint64_t diff;
+
+ /* assume diff is num bytes we paged out */
+ diff = (prev_zone_rss - zone_rss) * 1024;
+
+ (void) zone_setattr(zid, ZONE_ATTR_PMCAP_PAGEOUT,
+ &diff, 0);
+ }
+ prev_zone_rss = zone_rss;
+
if (new_excess > 0) {
- uint64_t n = 1;
+ if (new_cycle) {
+ uint64_t n = 1;
- /* Increment "nover" kstat. */
- (void) zone_setattr(zid, ZONE_ATTR_PMCAP_NOVER, &n, 0);
+ /* Increment "nover" kstat. */
+ (void) zone_setattr(zid, ZONE_ATTR_PMCAP_NOVER,
+ &n, 0);
+ }
/*
- * Once we go over the cap, then we want to page out a
- * little extra instead of stopping right at the cap.
- * To do this we add 5% to the excess so that
- * pageout_proces will work a little longer before
- * stopping.
+ * Once we go over the cap, then we want to
+ * page out a little extra instead of stopping
+ * right at the cap. To do this we add 5% to
+ * the excess so that pageout_process will work
+ * a little longer before stopping.
*/
return ((int64_t)(new_excess + addon));
}
@@ -845,7 +832,7 @@ mcap_zone()
struct dirent *dirent;
/* Wait until we've gone over the cap. */
- excess = check_suspend(age);
+ excess = check_suspend(age, B_TRUE);
debug("starting to scan, excess %lldk\n", (long long)excess);
@@ -885,10 +872,10 @@ mcap_zone()
excess = pageout_process(pid, excess);
if (excess <= 0) {
- debug("done scanning; excess %lld\n",
+ debug("apparently under; excess %lld\n",
(long long)excess);
/* Double check the current excess */
- excess = check_suspend(1);
+ excess = check_suspend(1, B_FALSE);
}
}
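
Taken together, the mcap.c changes reduce the per-process scan to the following shape (a condensed sketch, not the literal code: error paths, the incremental RSS re-checks, and debug output are elided, and the skip tests live in pageout_mapping() in the real code):

prxmap_t *px;
proc_map_t cur;

for (px = init_map(&cur, pid);
    px != NULL && excess > 0 && !shutting_down;
    px = nextmapping(&cur)) {
	/* Paging out non-resident anon pages would pull them back in. */
	if ((px->pr_mflags & MA_ANON) && px->pr_anon == 0)
		continue;
	/* Shared memory segments are skipped outright. */
	if (px->pr_mflags & (MA_ISM | MA_SHM))
		continue;
	if (pr_memcntl(Pr, (caddr_t)px->pr_vaddr, px->pr_size, MC_SYNC,
	    (caddr_t)(MS_ASYNC | MS_INVALCURPROC), 0, 0) < 0)
		break;
}
free(cur.pr_xmapp);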
diff --git a/usr/src/uts/common/sys/buf.h b/usr/src/uts/common/sys/buf.h
index a9191aed7c..cb8a6012fc 100644
--- a/usr/src/uts/common/sys/buf.h
+++ b/usr/src/uts/common/sys/buf.h
@@ -21,6 +21,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2012 Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -186,6 +187,7 @@ struct biostats {
#define B_STARTED 0x2000000 /* io:::start probe called for buf */
#define B_ABRWRITE 0x4000000 /* Application based recovery active */
#define B_PAGE_NOWAIT 0x8000000 /* Skip the page if it is locked */
+#define B_INVALCURONLY 0x10000000 /* invalidate only for curproc */
/*
* There is some confusion over the meaning of B_FREE and B_INVAL and what
@@ -198,6 +200,12 @@ struct biostats {
* between the sole use of these two flags. In both cases, IO will be done
* if the page is not yet committed to storage.
*
+ * The B_INVALCURONLY flag modifies the behavior of the B_INVAL flag and is
+ * intended to be used in conjunction with B_INVAL. B_INVALCURONLY has no
+ * meaning on its own. When both B_INVALCURONLY and B_INVAL are set, then
+ * the mapping for the page is only invalidated for the current process.
+ * In this case, the page is not destroyed unless this was the final mapping.
+ *
* In order to discard pages without writing them back, (B_INVAL | B_TRUNC)
* should be used.
*
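
In practice the new flag acts purely as a modifier; the vm_pvn.c hunks later in this commit test the combinations roughly like this (sketch):

/* B_INVALCURONLY has no meaning on its own; it only narrows B_INVAL. */
if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) {
	/* Plain B_INVAL: unload every mapping and destroy the page. */
} else if (flags & B_INVALCURONLY) {
	/*
	 * B_INVAL | B_INVALCURONLY: unload only the current process's
	 * translation; the page survives unless that was its final
	 * mapping.
	 */
}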
diff --git a/usr/src/uts/common/sys/mman.h b/usr/src/uts/common/sys/mman.h
index 6c9119e56d..82344607b0 100644
--- a/usr/src/uts/common/sys/mman.h
+++ b/usr/src/uts/common/sys/mman.h
@@ -22,6 +22,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2012 Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
@@ -353,6 +354,7 @@ struct memcntl_mha32 {
#define MS_SYNC 0x4 /* wait for msync */
#define MS_ASYNC 0x1 /* return immediately */
#define MS_INVALIDATE 0x2 /* invalidate caches */
+#define MS_INVALCURPROC 0x8 /* invalidate cache for curproc only */
#if (_POSIX_C_SOURCE <= 2) && !defined(_XPG4_2) || defined(__EXTENSIONS__)
/* functions to mctl */
diff --git a/usr/src/uts/common/syscall/memcntl.c b/usr/src/uts/common/syscall/memcntl.c
index 1ab3a8b65e..63c8b64ad0 100644
--- a/usr/src/uts/common/syscall/memcntl.c
+++ b/usr/src/uts/common/syscall/memcntl.c
@@ -21,6 +21,7 @@
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2012 Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -116,13 +117,17 @@ memcntl(caddr_t addr, size_t len, int cmd, caddr_t arg, int attr, int mask)
* MS_SYNC used to be defined to be zero but is now non-zero.
* For binary compatibility we still accept zero
* (the absence of MS_ASYNC) to mean the same thing.
+ * Binary compatibility is not an issue for MS_INVALCURPROC.
*/
iarg = (uintptr_t)arg;
if ((iarg & ~MS_INVALIDATE) == 0)
iarg |= MS_SYNC;
- if (((iarg & ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE)) != 0) ||
- ((iarg & (MS_SYNC|MS_ASYNC)) == (MS_SYNC|MS_ASYNC))) {
+ if (((iarg &
+ ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE|MS_INVALCURPROC)) != 0) ||
+ ((iarg & (MS_SYNC|MS_ASYNC)) == (MS_SYNC|MS_ASYNC)) ||
+ ((iarg & (MS_INVALIDATE|MS_INVALCURPROC)) ==
+ (MS_INVALIDATE|MS_INVALCURPROC))) {
error = set_errno(EINVAL);
} else {
error = as_ctl(as, addr, len, cmd, attr, iarg, NULL, 0);
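
From a caller's perspective the added validation behaves as sketched below (an assert-style illustration, not code from the commit; check_flags is a hypothetical helper):

#include <assert.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/mman.h>

/* The two invalidation flavors are mutually exclusive. */
static void
check_flags(caddr_t addr, size_t len)
{
	int res = memcntl(addr, len, MC_SYNC,
	    (caddr_t)(MS_INVALIDATE | MS_INVALCURPROC), 0, 0);
	assert(res == -1 && errno == EINVAL);
}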
diff --git a/usr/src/uts/common/vm/hat.h b/usr/src/uts/common/vm/hat.h
index 1d91475e38..156b810046 100644
--- a/usr/src/uts/common/vm/hat.h
+++ b/usr/src/uts/common/vm/hat.h
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2012 Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -460,6 +461,7 @@ void hat_setstat(struct as *, caddr_t, size_t, uint_t);
*/
#define HAT_ADV_PGUNLOAD 0x00
#define HAT_FORCE_PGUNLOAD 0x01
+#define HAT_CURPROC_PGUNLOAD 0x02
/*
* Attributes for hat_page_*attr, hat_setstats and
diff --git a/usr/src/uts/common/vm/seg_vn.c b/usr/src/uts/common/vm/seg_vn.c
index 31c293d416..5f106f6c06 100644
--- a/usr/src/uts/common/vm/seg_vn.c
+++ b/usr/src/uts/common/vm/seg_vn.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2012, Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -7254,7 +7255,8 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
vpp = svd->vpage;
offset = svd->offset + (uintptr_t)(addr - seg->s_base);
bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) |
- ((flags & MS_INVALIDATE) ? B_INVAL : 0);
+ ((flags & MS_INVALIDATE) ? B_INVAL : 0) |
+ ((flags & MS_INVALCURPROC) ? (B_INVALCURONLY | B_INVAL) : 0);
if (attr) {
pageprot = attr & ~(SHARED|PRIVATE);
@@ -7279,11 +7281,11 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
vpp = &svd->vpage[seg_page(seg, addr)];
} else if (svd->vp && svd->amp == NULL &&
- (flags & MS_INVALIDATE) == 0) {
+ (flags & (MS_INVALIDATE | MS_INVALCURPROC)) == 0) {
/*
- * No attributes, no anonymous pages and MS_INVALIDATE flag
- * is not on, just use one big request.
+ * No attributes, no anonymous pages and MS_INVAL* flags
+ * are not on, just use one big request.
*/
err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len,
bflags, svd->cred, NULL);
@@ -7335,7 +7337,7 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
* might race in and lock the page after we unlock and before
* we do the PUTPAGE, then PUTPAGE simply does nothing.
*/
- if (flags & MS_INVALIDATE) {
+ if (flags & (MS_INVALIDATE | MS_INVALCURPROC)) {
if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) {
if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
page_unlock(pp);
diff --git a/usr/src/uts/common/vm/vm_pvn.c b/usr/src/uts/common/vm/vm_pvn.c
index 7233581227..39ace0b3c2 100644
--- a/usr/src/uts/common/vm/vm_pvn.c
+++ b/usr/src/uts/common/vm/vm_pvn.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -431,7 +432,14 @@ pvn_write_done(page_t *plist, int flags)
page_io_unlock(pp);
page_unlock(pp);
}
- } else if (flags & B_INVAL) {
+ } else if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) {
+ /*
+ * If B_INVALCURONLY is set, then we handle that case
+ * in the next conditional if hat_page_is_mapped()
+ * indicates that there are no additional mappings
+ * to the page.
+ */
+
/*
* XXX - Failed writes with B_INVAL set are
* not handled appropriately.
@@ -572,8 +580,9 @@ pvn_write_done(page_t *plist, int flags)
}
/*
- * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI,
- * B_TRUNC, B_FORCE}. B_DELWRI indicates that this page is part of a kluster
+ * Flags are composed of {B_ASYNC, B_INVAL, B_INVALCURONLY, B_FREE,
+ * B_DONTNEED, B_DELWRI, B_TRUNC, B_FORCE}.
+ * B_DELWRI indicates that this page is part of a kluster
* operation and is only to be considered if it doesn't involve any
* waiting here. B_TRUNC indicates that the file is being truncated
* and so no i/o needs to be done. B_FORCE indicates that the page
@@ -627,13 +636,17 @@ pvn_getdirty(page_t *pp, int flags)
* If we want to free or invalidate the page then
* we need to unload it so that anyone who wants
* it will have to take a minor fault to get it.
+ * If we are only invalidating the page for the
+ * current process, then pass in a different flag.
* Otherwise, we're just writing the page back so we
* need to sync up the hardware and software mod bit to
* detect any future modifications. We clear the
* software mod bit when we put the page on the dirty
* list.
*/
- if (flags & (B_INVAL | B_FREE)) {
+ if (flags & B_INVALCURONLY) {
+ (void) hat_pageunload(pp, HAT_CURPROC_PGUNLOAD);
+ } else if (flags & (B_INVAL | B_FREE)) {
(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
} else {
(void) hat_pagesync(pp, HAT_SYNC_ZERORM);
@@ -645,7 +658,7 @@ pvn_getdirty(page_t *pp, int flags)
* list after all.
*/
page_io_unlock(pp);
- if (flags & B_INVAL) {
+ if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) {
/*LINTED: constant in conditional context*/
VN_DISPOSE(pp, B_INVAL, 0, kcred);
} else if (flags & B_FREE) {
@@ -657,6 +670,9 @@ pvn_getdirty(page_t *pp, int flags)
* of VOP_PUTPAGE() who prefer freeing the
* page _only_ if no one else is accessing it.
* E.g. segmap_release()
+ * We also take this path for B_INVALCURONLY and
+ * let page_release call VN_DISPOSE if no one else is
+ * using the page.
*
* The above hat_ismod() check is useless because:
* (1) we may not be holding SE_EXCL lock;
@@ -681,7 +697,7 @@ pvn_getdirty(page_t *pp, int flags)
* We'll detect the fact that they used it when the
* i/o is done and avoid freeing the page.
*/
- if (flags & B_FREE)
+ if (flags & (B_FREE | B_INVALCURONLY))
page_downgrade(pp);
diff --git a/usr/src/uts/common/vm/vm_usage.c b/usr/src/uts/common/vm/vm_usage.c
index 18e3c4c806..bbfd6013cd 100644
--- a/usr/src/uts/common/vm/vm_usage.c
+++ b/usr/src/uts/common/vm/vm_usage.c
@@ -939,7 +939,10 @@ vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp,
if (ap != NULL && vn != NULL && vn->v_pages != NULL &&
(page = page_exists(vn, off)) != NULL) {
- page_type = VMUSAGE_BOUND_INCORE;
+ if (PP_ISFREE(page))
+ page_type = VMUSAGE_BOUND_NOT_INCORE;
+ else
+ page_type = VMUSAGE_BOUND_INCORE;
if (page->p_szc > 0) {
pgcnt = page_get_pagecnt(page->p_szc);
pgshft = page_get_shift(page->p_szc);
@@ -1026,7 +1029,10 @@ vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode,
if (vnode->v_pages != NULL &&
(page = page_exists(vnode, ptob(index))) != NULL) {
- page_type = VMUSAGE_BOUND_INCORE;
+ if (PP_ISFREE(page))
+ page_type = VMUSAGE_BOUND_NOT_INCORE;
+ else
+ page_type = VMUSAGE_BOUND_INCORE;
if (page->p_szc > 0) {
pgcnt = page_get_pagecnt(page->p_szc);
pgshft = page_get_shift(page->p_szc);
@@ -1306,6 +1312,12 @@ vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg)
}
/*
+ * Pages on the free list aren't counted for the rss.
+ */
+ if (PP_ISFREE(page))
+ continue;
+
+ /*
* Assume anon structs with a refcnt
* of 1 are not COW shared, so there
* is no reason to track them per entity.
diff --git a/usr/src/uts/i86pc/vm/hat_i86.c b/usr/src/uts/i86pc/vm/hat_i86.c
index 8da02a4c36..40b033d0e4 100644
--- a/usr/src/uts/i86pc/vm/hat_i86.c
+++ b/usr/src/uts/i86pc/vm/hat_i86.c
@@ -27,6 +27,7 @@
*/
/*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2012 Joyent, Inc. All rights reserved.
*/
/*
@@ -3350,15 +3351,13 @@ hati_page_unmap(page_t *pp, htable_t *ht, uint_t entry)
extern int vpm_enable;
/*
- * Unload all translations to a page. If the page is a subpage of a large
+ * Unload translations to a page. If the page is a subpage of a large
* page, the large page mappings are also removed.
- *
- * The forceflags are unused.
+ * If unloadflag is HAT_CURPROC_PGUNLOAD, then we only unload the translation
+ * for the current process, otherwise all translations are unloaded.
*/
-
-/*ARGSUSED*/
static int
-hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t forceflag)
+hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t unloadflag)
{
page_t *cur_pp = pp;
hment_t *hm;
@@ -3366,6 +3365,8 @@ hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t forceflag)
htable_t *ht;
uint_t entry;
level_t level;
+ struct hat *curhat;
+ ulong_t cnt;
XPV_DISALLOW_MIGRATE();
@@ -3375,6 +3376,9 @@ hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t forceflag)
++curthread->t_hatdepth;
ASSERT(curthread->t_hatdepth < 16);
+ if (unloadflag == HAT_CURPROC_PGUNLOAD)
+ curhat = curthread->t_procp->p_as->a_hat;
+
#if defined(__amd64)
/*
* clear the vpm ref.
@@ -3387,6 +3391,8 @@ hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t forceflag)
* The loop with next_size handles pages with multiple pagesize mappings
*/
next_size:
+ if (unloadflag == HAT_CURPROC_PGUNLOAD)
+ cnt = hat_page_getshare(cur_pp);
for (;;) {
/*
@@ -3398,6 +3404,7 @@ next_size:
if (hm == NULL) {
x86_hm_exit(cur_pp);
+curproc_done:
/*
* If not part of a larger page, we're done.
*/
@@ -3424,8 +3431,21 @@ next_size:
* If this mapping size matches, remove it.
*/
level = ht->ht_level;
- if (level == pg_szcd)
- break;
+ if (level == pg_szcd) {
+ if (unloadflag != HAT_CURPROC_PGUNLOAD ||
+ ht->ht_hat == curhat)
+ break;
+ /*
+ * unloadflag == HAT_CURPROC_PGUNLOAD but it's
+ * not the hat for the current process. Leave
+ * entry in place. Also do a safety check to
+ * ensure we don't get in an infinite loop
+ */
+ if (cnt-- == 0) {
+ x86_hm_exit(cur_pp);
+ goto curproc_done;
+ }
+ }
}
/*
@@ -3435,14 +3455,18 @@ next_size:
hm = hati_page_unmap(cur_pp, ht, entry);
if (hm != NULL)
hment_free(hm);
+
+ /* Perform check above for being part of a larger page. */
+ if (unloadflag == HAT_CURPROC_PGUNLOAD)
+ goto curproc_done;
}
}
int
-hat_pageunload(struct page *pp, uint_t forceflag)
+hat_pageunload(struct page *pp, uint_t unloadflag)
{
ASSERT(PAGE_EXCL(pp));
- return (hati_pageunload(pp, 0, forceflag));
+ return (hati_pageunload(pp, 0, unloadflag));
}
/*