Diffstat (limited to 'usr/src/uts/common/vm')
-rw-r--r--  usr/src/uts/common/vm/hat.h          10
-rw-r--r--  usr/src/uts/common/vm/page.h          7
-rw-r--r--  usr/src/uts/common/vm/page_lock.c    19
-rw-r--r--  usr/src/uts/common/vm/page_retire.c   7
-rw-r--r--  usr/src/uts/common/vm/seg_kmem.c     83
-rw-r--r--  usr/src/uts/common/vm/seg_kmem.h     18
-rw-r--r--  usr/src/uts/common/vm/seg_vn.c       11
-rw-r--r--  usr/src/uts/common/vm/vm_as.c        19
-rw-r--r--  usr/src/uts/common/vm/vm_page.c      29
-rw-r--r--  usr/src/uts/common/vm/vm_pvn.c       28
-rw-r--r--  usr/src/uts/common/vm/vm_usage.c    252
11 files changed, 347 insertions, 136 deletions
diff --git a/usr/src/uts/common/vm/hat.h b/usr/src/uts/common/vm/hat.h
index a2509e7bb6..3735139068 100644
--- a/usr/src/uts/common/vm/hat.h
+++ b/usr/src/uts/common/vm/hat.h
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2014 Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -269,7 +270,12 @@ void hat_kpm_walk(void (*)(void *, void *, size_t), void *);
* call.
*
* int hat_pageunload(pp, forceflag)
- * unload all translations attached to pp.
+ * Unload all translations attached to pp. On x86 the bulk of the work is
+ * done by hat_page_inval.
+ *
+ * void hat_page_inval(pp, pgsz, curhat)
+ * Unload translations attached to pp. If curhat is provided, only the
+ * translation for that process is unloaded, otherwise all are unloaded.
*
* uint_t hat_pagesync(pp, flags)
* get hw stats from hardware into page struct and reset hw stats
@@ -291,6 +297,7 @@ void hat_page_setattr(struct page *, uint_t);
void hat_page_clrattr(struct page *, uint_t);
uint_t hat_page_getattr(struct page *, uint_t);
int hat_pageunload(struct page *, uint_t);
+void hat_page_inval(struct page *, uint_t, struct hat *);
uint_t hat_pagesync(struct page *, uint_t);
ulong_t hat_page_getshare(struct page *);
int hat_page_checkshare(struct page *, ulong_t);
@@ -460,6 +467,7 @@ void hat_setstat(struct as *, caddr_t, size_t, uint_t);
*/
#define HAT_ADV_PGUNLOAD 0x00
#define HAT_FORCE_PGUNLOAD 0x01
+#define HAT_CURPROC_PGUNLOAD 0x02
/*
* Attributes for hat_page_*attr, hat_setstats and
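
A minimal sketch of how a pager-side caller might choose between the two unload modes documented above; the helper name is hypothetical and not part of this commit, but the flag usage mirrors the pvn_getdirty() change later in this diff.

#include <vm/hat.h>
#include <vm/page.h>

/*
 * Hypothetical helper: drop translations to pp, either for every process
 * that maps it (forced) or only for the process on whose behalf we run.
 */
static void
drop_mappings(page_t *pp, boolean_t current_only)
{
	if (current_only)
		(void) hat_pageunload(pp, HAT_CURPROC_PGUNLOAD);
	else
		(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
}
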
diff --git a/usr/src/uts/common/vm/page.h b/usr/src/uts/common/vm/page.h
index 8747b96acc..ae9b0be758 100644
--- a/usr/src/uts/common/vm/page.h
+++ b/usr/src/uts/common/vm/page.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2017, Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -229,6 +230,7 @@ struct as;
* p_nrm
* p_mapping
* p_share
+ * p_zoneid
*
* The following field is file system dependent. How it is used and
* the locking strategies applied are up to the individual file system
@@ -527,9 +529,8 @@ typedef struct page {
pfn_t p_pagenum; /* physical page number */
uint_t p_share; /* number of translations */
-#if defined(_LP64)
- uint_t p_sharepad; /* pad for growing p_share */
-#endif
+ short p_zoneid; /* zone page use tracking */
+ short p_pad1; /* TBD */
uint_t p_slckcnt; /* number of softlocks */
#if defined(__sparc)
uint_t p_kpmref; /* number of kpm mapping sharers */
diff --git a/usr/src/uts/common/vm/page_lock.c b/usr/src/uts/common/vm/page_lock.c
index 7e48602189..a5a39d04c1 100644
--- a/usr/src/uts/common/vm/page_lock.c
+++ b/usr/src/uts/common/vm/page_lock.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
@@ -140,9 +141,8 @@ static pad_mutex_t pszc_mutex[PSZC_MTX_TABLE_SIZE];
& (VPH_TABLE_SIZE - 1))
/*
- * Two slots after VPH_TABLE_SIZE are reserved in vph_mutex for kernel vnodes.
- * The lock for kvp is VPH_TABLE_SIZE + 0, and the lock for zvp is
- * VPH_TABLE_SIZE + 1.
+ * Two slots after VPH_TABLE_SIZE are reserved in vph_mutex for kernel vnodes,
+ * one for kvps[KV_ZVP], and one for other kvps[] users.
*/
kmutex_t vph_mutex[VPH_TABLE_SIZE + 2];
@@ -364,7 +364,6 @@ page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
retval = 0;
} else if ((pp->p_selock & ~SE_EWANTED) == 0) {
/* no reader/writer lock held */
- THREAD_KPRI_REQUEST();
/* this clears our setting of the SE_EWANTED bit */
pp->p_selock = SE_WRITER;
retval = 1;
@@ -551,7 +550,6 @@ page_try_reclaim_lock(page_t *pp, se_t se, int es)
if (!(old & SE_EWANTED) || (es & SE_EXCL_WANTED)) {
if ((old & ~SE_EWANTED) == 0) {
/* no reader/writer lock held */
- THREAD_KPRI_REQUEST();
/* this clears out our setting of the SE_EWANTED bit */
pp->p_selock = SE_WRITER;
mutex_exit(pse);
@@ -590,7 +588,6 @@ page_trylock(page_t *pp, se_t se)
if (se == SE_EXCL) {
if (pp->p_selock == 0) {
- THREAD_KPRI_REQUEST();
pp->p_selock = SE_WRITER;
mutex_exit(pse);
return (1);
@@ -628,7 +625,6 @@ page_unlock_nocapture(page_t *pp)
} else if ((old & ~SE_EWANTED) == SE_DELETED) {
panic("page_unlock_nocapture: page %p is deleted", (void *)pp);
} else if (old < 0) {
- THREAD_KPRI_RELEASE();
pp->p_selock &= SE_EWANTED;
if (CV_HAS_WAITERS(&pp->p_cv))
cv_broadcast(&pp->p_cv);
@@ -662,7 +658,6 @@ page_unlock(page_t *pp)
} else if ((old & ~SE_EWANTED) == SE_DELETED) {
panic("page_unlock: page %p is deleted", (void *)pp);
} else if (old < 0) {
- THREAD_KPRI_RELEASE();
pp->p_selock &= SE_EWANTED;
if (CV_HAS_WAITERS(&pp->p_cv))
cv_broadcast(&pp->p_cv);
@@ -682,7 +677,6 @@ page_unlock(page_t *pp)
if ((pp->p_toxic & PR_CAPTURE) &&
!(curthread->t_flag & T_CAPTURING) &&
!PP_RETIRED(pp)) {
- THREAD_KPRI_REQUEST();
pp->p_selock = SE_WRITER;
mutex_exit(pse);
page_unlock_capture(pp);
@@ -712,7 +706,6 @@ page_tryupgrade(page_t *pp)
if (!(pp->p_selock & SE_EWANTED)) {
/* no threads want exclusive access, try upgrade */
if (pp->p_selock == SE_READER) {
- THREAD_KPRI_REQUEST();
/* convert to exclusive lock */
pp->p_selock = SE_WRITER;
mutex_exit(pse);
@@ -738,7 +731,6 @@ page_downgrade(page_t *pp)
mutex_enter(pse);
excl_waiting = pp->p_selock & SE_EWANTED;
- THREAD_KPRI_RELEASE();
pp->p_selock = SE_READER | excl_waiting;
if (CV_HAS_WAITERS(&pp->p_cv))
cv_broadcast(&pp->p_cv);
@@ -756,7 +748,6 @@ page_lock_delete(page_t *pp)
ASSERT(!PP_ISFREE(pp));
mutex_enter(pse);
- THREAD_KPRI_RELEASE();
pp->p_selock = SE_DELETED;
if (CV_HAS_WAITERS(&pp->p_cv))
cv_broadcast(&pp->p_cv);
@@ -888,10 +879,10 @@ static int page_vnode_mutex_stress = 0;
kmutex_t *
page_vnode_mutex(vnode_t *vp)
{
- if (vp == &kvp)
+ if (vp == &kvp || vp == &kvps[KV_VVP])
return (&vph_mutex[VPH_TABLE_SIZE + 0]);
- if (vp == &zvp)
+ if (vp == &kvps[KV_ZVP])
return (&vph_mutex[VPH_TABLE_SIZE + 1]);
#ifdef DEBUG
if (page_vnode_mutex_stress != 0)
diff --git a/usr/src/uts/common/vm/page_retire.c b/usr/src/uts/common/vm/page_retire.c
index 76be970a45..f4e8d0737f 100644
--- a/usr/src/uts/common/vm/page_retire.c
+++ b/usr/src/uts/common/vm/page_retire.c
@@ -22,6 +22,7 @@
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* Copyright (c) 2016 by Delphix. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
*/
/*
@@ -851,9 +852,8 @@ page_retire_incr_pend_count(void *datap)
{
PR_INCR_KSTAT(pr_pending);
- if ((datap == &kvp) || (datap == &zvp)) {
+ if (datap == &kvp || datap == &kvps[KV_ZVP] || datap == &kvps[KV_VVP])
PR_INCR_KSTAT(pr_pending_kas);
- }
}
void
@@ -861,9 +861,8 @@ page_retire_decr_pend_count(void *datap)
{
PR_DECR_KSTAT(pr_pending);
- if ((datap == &kvp) || (datap == &zvp)) {
+ if (datap == &kvp || datap == &kvps[KV_ZVP] || datap == &kvps[KV_VVP])
PR_DECR_KSTAT(pr_pending_kas);
- }
}
/*
diff --git a/usr/src/uts/common/vm/seg_kmem.c b/usr/src/uts/common/vm/seg_kmem.c
index 439c859d96..0b116d6eba 100644
--- a/usr/src/uts/common/vm/seg_kmem.c
+++ b/usr/src/uts/common/vm/seg_kmem.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
*/
#include <sys/types.h>
@@ -122,6 +122,11 @@ vmem_t *static_alloc_arena; /* arena for allocating static memory */
vmem_t *zio_arena = NULL; /* arena for allocating zio memory */
vmem_t *zio_alloc_arena = NULL; /* arena for allocating zio memory */
+#if defined(__amd64)
+vmem_t *kvmm_arena; /* arena for vmm VA */
+struct seg kvmmseg; /* Segment for vmm memory */
+#endif
+
/*
* seg_kmem driver can map part of the kernel heap with large pages.
* Currently this functionality is implemented for sparc platforms only.
@@ -440,7 +445,7 @@ segkmem_badop()
/*ARGSUSED*/
static faultcode_t
segkmem_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t size,
- enum fault_type type, enum seg_rw rw)
+ enum fault_type type, enum seg_rw rw)
{
pgcnt_t npages;
spgcnt_t pg;
@@ -655,13 +660,19 @@ segkmem_dump(struct seg *seg)
segkmem_dump_range, seg->s_as);
vmem_walk(heaptext_arena, VMEM_ALLOC | VMEM_REENTRANT,
segkmem_dump_range, seg->s_as);
+ /*
+ * We don't want to dump pages attached to kzioseg since they
+ * contain file data from ZFS. If this page's segment is
+ * kzioseg return instead of writing it to the dump device.
+ *
+ * Same applies to VM memory allocations.
+ */
} else if (seg == &kzioseg) {
- /*
- * We don't want to dump pages attached to kzioseg since they
- * contain file data from ZFS. If this page's segment is
- * kzioseg return instead of writing it to the dump device.
- */
return;
+#if defined(__amd64)
+ } else if (seg == &kvmmseg) {
+ return;
+#endif
} else {
segkmem_dump_range(seg->s_as, seg->s_base, seg->s_size);
}
@@ -677,7 +688,7 @@ segkmem_dump(struct seg *seg)
/*ARGSUSED*/
static int
segkmem_pagelock(struct seg *seg, caddr_t addr, size_t len,
- page_t ***ppp, enum lock_type type, enum seg_rw rw)
+ page_t ***ppp, enum lock_type type, enum seg_rw rw)
{
page_t **pplist, *pp;
pgcnt_t npages;
@@ -802,21 +813,18 @@ struct seg_ops segkmem_ops = {
};
int
-segkmem_zio_create(struct seg *seg)
-{
- ASSERT(seg->s_as == &kas && RW_WRITE_HELD(&kas.a_lock));
- seg->s_ops = &segkmem_ops;
- seg->s_data = &zvp;
- kas.a_size += seg->s_size;
- return (0);
-}
-
-int
segkmem_create(struct seg *seg)
{
ASSERT(seg->s_as == &kas && RW_WRITE_HELD(&kas.a_lock));
seg->s_ops = &segkmem_ops;
- seg->s_data = &kvp;
+ if (seg == &kzioseg)
+ seg->s_data = &kvps[KV_ZVP];
+#if defined(__amd64)
+ else if (seg == &kvmmseg)
+ seg->s_data = &kvps[KV_VVP];
+#endif
+ else
+ seg->s_data = &kvps[KV_KVP];
kas.a_size += seg->s_size;
return (0);
}
@@ -858,7 +866,7 @@ segkmem_page_create(void *addr, size_t size, int vmflag, void *arg)
*/
void *
segkmem_xalloc(vmem_t *vmp, void *inaddr, size_t size, int vmflag, uint_t attr,
- page_t *(*page_create_func)(void *, size_t, int, void *), void *pcarg)
+ page_t *(*page_create_func)(void *, size_t, int, void *), void *pcarg)
{
page_t *ppl;
caddr_t addr = inaddr;
@@ -968,10 +976,10 @@ segkmem_alloc(vmem_t *vmp, size_t size, int vmflag)
return (segkmem_alloc_vn(vmp, size, vmflag, &kvp));
}
-void *
+static void *
segkmem_zio_alloc(vmem_t *vmp, size_t size, int vmflag)
{
- return (segkmem_alloc_vn(vmp, size, vmflag, &zvp));
+ return (segkmem_alloc_vn(vmp, size, vmflag, &kvps[KV_ZVP]));
}
/*
@@ -980,8 +988,8 @@ segkmem_zio_alloc(vmem_t *vmp, size_t size, int vmflag)
* we currently don't have a special kernel segment for non-paged
* kernel memory that is exported by drivers to user space.
*/
-static void
-segkmem_free_vn(vmem_t *vmp, void *inaddr, size_t size, struct vnode *vp,
+void
+segkmem_xfree(vmem_t *vmp, void *inaddr, size_t size, struct vnode *vp,
void (*func)(page_t *))
{
page_t *pp;
@@ -1038,21 +1046,15 @@ segkmem_free_vn(vmem_t *vmp, void *inaddr, size_t size, struct vnode *vp,
}
void
-segkmem_xfree(vmem_t *vmp, void *inaddr, size_t size, void (*func)(page_t *))
-{
- segkmem_free_vn(vmp, inaddr, size, &kvp, func);
-}
-
-void
segkmem_free(vmem_t *vmp, void *inaddr, size_t size)
{
- segkmem_free_vn(vmp, inaddr, size, &kvp, NULL);
+ segkmem_xfree(vmp, inaddr, size, &kvp, NULL);
}
-void
+static void
segkmem_zio_free(vmem_t *vmp, void *inaddr, size_t size)
{
- segkmem_free_vn(vmp, inaddr, size, &zvp, NULL);
+ segkmem_xfree(vmp, inaddr, size, &kvps[KV_ZVP], NULL);
}
void
@@ -1534,8 +1536,21 @@ segkmem_zio_init(void *zio_mem_base, size_t zio_mem_size)
ASSERT(zio_alloc_arena != NULL);
}
-#ifdef __sparc
+#if defined(__amd64)
+
+void
+segkmem_kvmm_init(void *base, size_t size)
+{
+ ASSERT(base != NULL);
+ ASSERT(size != 0);
+
+ kvmm_arena = vmem_create("kvmm_arena", base, size, 1024 * 1024,
+ NULL, NULL, NULL, 0, VM_SLEEP);
+
+ ASSERT(kvmm_arena != NULL);
+}
+#elif defined(__sparc)
static void *
segkmem_alloc_ppa(vmem_t *vmp, size_t size, int vmflag)
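
A hedged sketch of how a vmm consumer might draw kernel VA from the new kvmm_arena; the wrapper names are assumptions, and only vmem_alloc()/vmem_free() are the established arena interface. The arena is created with a 1 MiB quantum, so requests are rounded to 1 MiB multiples, and backing pages must still be supplied separately.

#include <sys/vmem.h>
#include <vm/seg_kmem.h>

/*
 * Hypothetical consumer: reserve and release a VA range for guest memory.
 * kvmm_arena only hands out virtual addresses; mapping physical pages into
 * that range is a separate step.
 */
static void *
vmm_va_reserve(size_t len)
{
	return (vmem_alloc(kvmm_arena, len, VM_SLEEP));
}

static void
vmm_va_release(void *va, size_t len)
{
	vmem_free(kvmm_arena, va, len);
}
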
diff --git a/usr/src/uts/common/vm/seg_kmem.h b/usr/src/uts/common/vm/seg_kmem.h
index 1db85826b1..9a20101670 100644
--- a/usr/src/uts/common/vm/seg_kmem.h
+++ b/usr/src/uts/common/vm/seg_kmem.h
@@ -21,7 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
* Copyright 2017 RackTop Systems.
*/
@@ -65,12 +65,18 @@ extern vmem_t *static_arena; /* arena for caches to import static memory */
extern vmem_t *static_alloc_arena; /* arena for allocating static memory */
extern vmem_t *zio_arena; /* arena for zio caches */
extern vmem_t *zio_alloc_arena; /* arena for zio caches */
+
+#if defined(__amd64)
+extern struct seg kvmmseg; /* Segment for vmm mappings */
+extern vmem_t *kvmm_arena; /* arena for vmm VA */
+extern void segkmem_kvmm_init(void *, size_t);
+#endif
+
extern struct vnode kvps[];
/*
- * segkmem page vnodes
+ * segkmem page vnodes (please don't add more defines here...)
*/
#define kvp (kvps[KV_KVP])
-#define zvp (kvps[KV_ZVP])
#if defined(__sparc)
#define mpvp (kvps[KV_MPVP])
#define promvp (kvps[KV_PROMVP])
@@ -83,16 +89,14 @@ extern void *segkmem_xalloc(vmem_t *, void *, size_t, int, uint_t,
extern void *segkmem_alloc(vmem_t *, size_t, int);
extern void *segkmem_alloc_permanent(vmem_t *, size_t, int);
extern void segkmem_free(vmem_t *, void *, size_t);
-extern void segkmem_xfree(vmem_t *, void *, size_t, void (*)(page_t *));
+extern void segkmem_xfree(vmem_t *, void *, size_t,
+ struct vnode *, void (*)(page_t *));
extern void *boot_alloc(void *, size_t, uint_t);
extern void boot_mapin(caddr_t addr, size_t size);
extern void kernelheap_init(void *, void *, char *, void *, void *);
extern void segkmem_gc(void);
-extern void *segkmem_zio_alloc(vmem_t *, size_t, int);
-extern int segkmem_zio_create(struct seg *);
-extern void segkmem_zio_free(vmem_t *, void *, size_t);
extern void segkmem_zio_init(void *, size_t);
/*
diff --git a/usr/src/uts/common/vm/seg_vn.c b/usr/src/uts/common/vm/seg_vn.c
index 8046d10212..da6393f792 100644
--- a/usr/src/uts/common/vm/seg_vn.c
+++ b/usr/src/uts/common/vm/seg_vn.c
@@ -7313,7 +7313,8 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
vpp = svd->vpage;
offset = svd->offset + (uintptr_t)(addr - seg->s_base);
bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) |
- ((flags & MS_INVALIDATE) ? B_INVAL : 0);
+ ((flags & MS_INVALIDATE) ? B_INVAL : 0) |
+ ((flags & MS_INVALCURPROC) ? (B_INVALCURONLY | B_INVAL) : 0);
if (attr) {
pageprot = attr & ~(SHARED|PRIVATE);
@@ -7338,11 +7339,11 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
vpp = &svd->vpage[seg_page(seg, addr)];
} else if (svd->vp && svd->amp == NULL &&
- (flags & MS_INVALIDATE) == 0) {
+ (flags & (MS_INVALIDATE | MS_INVALCURPROC)) == 0) {
/*
- * No attributes, no anonymous pages and MS_INVALIDATE flag
- * is not on, just use one big request.
+ * No attributes, no anonymous pages and MS_INVAL* flags
+ * are not on, just use one big request.
*/
err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len,
bflags, svd->cred, NULL);
@@ -7394,7 +7395,7 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
* might race in and lock the page after we unlock and before
* we do the PUTPAGE, then PUTPAGE simply does nothing.
*/
- if (flags & MS_INVALIDATE) {
+ if (flags & (MS_INVALIDATE | MS_INVALCURPROC)) {
if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) {
if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
page_unlock(pp);
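
The MS_INVALCURPROC flag referenced above is defined elsewhere in this changeset (in sys/mman.h) and maps to B_INVALCURONLY | B_INVAL. A hedged userland sketch of the intended call, assuming that definition is visible to the compilation environment:

#include <sys/mman.h>
#include <stdio.h>

/*
 * Hypothetical use: write back dirty pages of a mapping and invalidate
 * only the calling process's translations, leaving other processes'
 * mappings of the same object intact.
 */
static int
sync_and_invalidate_local(void *addr, size_t len)
{
	if (msync(addr, len, MS_SYNC | MS_INVALCURPROC) != 0) {
		perror("msync");
		return (-1);
	}
	return (0);
}
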
diff --git a/usr/src/uts/common/vm/vm_as.c b/usr/src/uts/common/vm/vm_as.c
index 853b092e6d..ec6d2b8920 100644
--- a/usr/src/uts/common/vm/vm_as.c
+++ b/usr/src/uts/common/vm/vm_as.c
@@ -58,6 +58,7 @@
#include <sys/debug.h>
#include <sys/tnf_probe.h>
#include <sys/vtrace.h>
+#include <sys/ddi.h>
#include <vm/hat.h>
#include <vm/as.h>
@@ -72,6 +73,8 @@
clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */
+ulong_t as_user_seg_limit = 0xffff; /* max segments in an (non-kas) AS */
+
static struct kmem_cache *as_cache;
static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
@@ -853,8 +856,6 @@ as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
int as_lock_held;
klwp_t *lwp = ttolwp(curthread);
-
-
retry:
/*
* Indicate that the lwp is not to be stopped while waiting for a
@@ -1724,6 +1725,20 @@ as_map_locked(struct as *as, caddr_t addr, size_t size, segcreate_func_t crfp,
p->p_rctls, p, RCA_UNSAFE_ALL);
return (ENOMEM);
}
+
+ /*
+ * Keep the number of segments in a userspace AS constrained to
+ * a reasonable limit. Linux enforces a value slightly less
+ * than 64k in order to avoid ELF limits if/when a process
+ * dumps core. While SunOS avoids that specific problem with
+ * other tricks, the limit is still valuable to keep kernel
+ * memory consumption in check.
+ */
+ if (avl_numnodes(&as->a_segtree) >= as_user_seg_limit) {
+ AS_LOCK_EXIT(as);
+ atomic_inc_32(&p->p_zone->zone_mfseglim);
+ return (ENOMEM);
+ }
}
if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
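
Since as_user_seg_limit is an ordinary kernel global (0xffff, i.e. 65535 segments, by default), an administrator could presumably raise it for a workload that legitimately needs more mappings, e.g. with 'set as_user_seg_limit=0x20000' in /etc/system; that knob relies on standard illumos tunable handling rather than anything added by this diff.
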
diff --git a/usr/src/uts/common/vm/vm_page.c b/usr/src/uts/common/vm/vm_page.c
index 78d1cb1a58..abccf82057 100644
--- a/usr/src/uts/common/vm/vm_page.c
+++ b/usr/src/uts/common/vm/vm_page.c
@@ -22,6 +22,7 @@
* Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2015, Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
* Copyright (c) 2015, 2016 by Delphix. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
@@ -440,10 +441,26 @@ init_pages_pp_maximum()
}
}
+/*
+ * In the past, we limited the maximum pages that could be gotten to essentially
+ * 1/2 of the total pages on the system. However, this is too conservative for
+ * some cases. For example, if we want to host a large virtual machine which
+ * needs to use a significant portion of the system's memory. In practice,
+ * allowing more than 1/2 of the total pages is fine, but becomes problematic
+ * as we approach or exceed 75% of the pages on the system. Thus, we limit the
+ * maximum to 23/32 of the total pages, which is ~72%.
+ */
void
set_max_page_get(pgcnt_t target_total_pages)
{
- max_page_get = target_total_pages / 2;
+ max_page_get = (target_total_pages >> 5) * 23;
+ ASSERT3U(max_page_get, >, 0);
+}
+
+pgcnt_t
+get_max_page_get()
+{
+ return (max_page_get);
}
static pgcnt_t pending_delete;
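
A worked example of the new ceiling, with assumed numbers: on a machine with 16 GiB of 4 KiB pages, target_total_pages is 4,194,304, so max_page_get becomes (4,194,304 >> 5) * 23 = 3,014,656 pages, about 71.9% of memory, where the old code would have allowed exactly half (2,097,152). The same arithmetic as a standalone check:

#include <stdio.h>

/* Illustration only: the 23/32 limit on a hypothetical 16 GiB, 4 KiB-page box. */
int
main(void)
{
	unsigned long total = (16UL << 30) / 4096;	/* 4,194,304 pages */
	unsigned long maxget = (total >> 5) * 23;	/* 3,014,656 pages */

	(void) printf("max_page_get = %lu pages (%.2f%% of %lu)\n",
	    maxget, 100.0 * maxget / total, total);
	return (0);
}
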
@@ -1460,6 +1477,8 @@ page_create_throttle(pgcnt_t npages, int flags)
uint_t i;
pgcnt_t tf; /* effective value of throttlefree */
+ atomic_inc_64(&n_throttle);
+
/*
* Normal priority allocations.
*/
@@ -1492,7 +1511,7 @@ page_create_throttle(pgcnt_t npages, int flags)
tf = throttlefree -
((flags & PG_PUSHPAGE) ? pageout_reserve : 0);
- cv_signal(&proc_pageout->p_cv);
+ WAKE_PAGEOUT_SCANNER();
for (;;) {
fm = 0;
@@ -1579,7 +1598,7 @@ checkagain:
}
ASSERT(proc_pageout != NULL);
- cv_signal(&proc_pageout->p_cv);
+ WAKE_PAGEOUT_SCANNER();
TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_START,
"page_create_sleep_start: freemem %ld needfree %ld",
@@ -2226,7 +2245,7 @@ page_create_va_large(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags,
if (nscan < desscan && freemem < minfree) {
TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
"pageout_cv_signal:freemem %ld", freemem);
- cv_signal(&proc_pageout->p_cv);
+ WAKE_PAGEOUT_SCANNER();
}
pp = rootpp;
@@ -2355,7 +2374,7 @@ page_create_va(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags,
if (nscan < desscan && freemem < minfree) {
TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
"pageout_cv_signal:freemem %ld", freemem);
- cv_signal(&proc_pageout->p_cv);
+ WAKE_PAGEOUT_SCANNER();
}
/*
diff --git a/usr/src/uts/common/vm/vm_pvn.c b/usr/src/uts/common/vm/vm_pvn.c
index 1b8d12eb8d..a206320a30 100644
--- a/usr/src/uts/common/vm/vm_pvn.c
+++ b/usr/src/uts/common/vm/vm_pvn.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
*/
@@ -432,7 +433,14 @@ pvn_write_done(page_t *plist, int flags)
page_io_unlock(pp);
page_unlock(pp);
}
- } else if (flags & B_INVAL) {
+ } else if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) {
+ /*
+ * If B_INVALCURONLY is set, then we handle that case
+ * in the next conditional if hat_page_is_mapped()
+ * indicates that there are no additional mappings
+ * to the page.
+ */
+
/*
* XXX - Failed writes with B_INVAL set are
* not handled appropriately.
@@ -573,8 +581,9 @@ pvn_write_done(page_t *plist, int flags)
}
/*
- * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI,
- * B_TRUNC, B_FORCE}. B_DELWRI indicates that this page is part of a kluster
+ * Flags are composed of {B_ASYNC, B_INVAL, B_INVALCURONLY, B_FREE,
+ * B_DONTNEED, B_DELWRI, B_TRUNC, B_FORCE}.
+ * B_DELWRI indicates that this page is part of a kluster
* operation and is only to be considered if it doesn't involve any
* waiting here. B_TRUNC indicates that the file is being truncated
* and so no i/o needs to be done. B_FORCE indicates that the page
@@ -628,13 +637,17 @@ pvn_getdirty(page_t *pp, int flags)
* If we want to free or invalidate the page then
* we need to unload it so that anyone who wants
* it will have to take a minor fault to get it.
+ * If we are only invalidating the page for the
+ * current process, then pass in a different flag.
* Otherwise, we're just writing the page back so we
* need to sync up the hardwre and software mod bit to
* detect any future modifications. We clear the
* software mod bit when we put the page on the dirty
* list.
*/
- if (flags & (B_INVAL | B_FREE)) {
+ if (flags & B_INVALCURONLY) {
+ (void) hat_pageunload(pp, HAT_CURPROC_PGUNLOAD);
+ } else if (flags & (B_INVAL | B_FREE)) {
(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
} else {
(void) hat_pagesync(pp, HAT_SYNC_ZERORM);
@@ -646,7 +659,7 @@ pvn_getdirty(page_t *pp, int flags)
* list after all.
*/
page_io_unlock(pp);
- if (flags & B_INVAL) {
+ if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) {
/*LINTED: constant in conditional context*/
VN_DISPOSE(pp, B_INVAL, 0, kcred);
} else if (flags & B_FREE) {
@@ -658,6 +671,9 @@ pvn_getdirty(page_t *pp, int flags)
* of VOP_PUTPAGE() who prefer freeing the
* page _only_ if no one else is accessing it.
* E.g. segmap_release()
+ * We also take this path for B_INVALCURONLY and
+ * let page_release call VN_DISPOSE if no one else is
+ * using the page.
*
* The above hat_ismod() check is useless because:
* (1) we may not be holding SE_EXCL lock;
@@ -682,7 +698,7 @@ pvn_getdirty(page_t *pp, int flags)
* We'll detect the fact that they used it when the
* i/o is done and avoid freeing the page.
*/
- if (flags & B_FREE)
+ if (flags & (B_FREE | B_INVALCURONLY))
page_downgrade(pp);
diff --git a/usr/src/uts/common/vm/vm_usage.c b/usr/src/uts/common/vm/vm_usage.c
index e542e8e479..01c2666e91 100644
--- a/usr/src/uts/common/vm/vm_usage.c
+++ b/usr/src/uts/common/vm/vm_usage.c
@@ -25,6 +25,10 @@
*/
/*
+ * Copyright 2018, Joyent, Inc.
+ */
+
+/*
* vm_usage
*
* This file implements the getvmusage() private system call.
@@ -114,7 +118,7 @@
* For accurate counting of map-shared and COW-shared pages.
*
* - visited private anons (refcnt > 1) for each collective.
- * (entity->vme_anon_hash)
+ * (entity->vme_anon)
* For accurate counting of COW-shared pages.
*
* The common accounting structure is the vmu_entity_t, which represents
@@ -152,6 +156,7 @@
#include <sys/vm_usage.h>
#include <sys/zone.h>
#include <sys/sunddi.h>
+#include <sys/sysmacros.h>
#include <sys/avl.h>
#include <vm/anon.h>
#include <vm/as.h>
@@ -199,6 +204,14 @@ typedef struct vmu_object {
} vmu_object_t;
/*
+ * Node for tree of visited COW anons.
+ */
+typedef struct vmu_anon {
+ avl_node_t vma_node;
+ uintptr_t vma_addr;
+} vmu_anon_t;
+
+/*
* Entity by which to count results.
*
* The entity structure keeps the current rss/swap counts for each entity
@@ -221,7 +234,7 @@ typedef struct vmu_entity {
struct vmu_entity *vme_next_calc;
mod_hash_t *vme_vnode_hash; /* vnodes visited for entity */
mod_hash_t *vme_amp_hash; /* shared amps visited for entity */
- mod_hash_t *vme_anon_hash; /* COW anons visited for entity */
+ avl_tree_t vme_anon; /* COW anons visited for entity */
vmusage_t vme_result; /* identifies entity and results */
} vmu_entity_t;
@@ -324,6 +337,23 @@ bounds_cmp(const void *bnd1, const void *bnd2)
}
/*
+ * Comparison routine for our AVL tree of anon structures.
+ */
+static int
+vmu_anon_cmp(const void *lhs, const void *rhs)
+{
+ const vmu_anon_t *l = lhs, *r = rhs;
+
+ if (l->vma_addr == r->vma_addr)
+ return (0);
+
+ if (l->vma_addr < r->vma_addr)
+ return (-1);
+
+ return (1);
+}
+
+/*
* Save a bound on the free list.
*/
static void
@@ -363,13 +393,18 @@ static void
vmu_free_entity(mod_hash_val_t val)
{
vmu_entity_t *entity = (vmu_entity_t *)val;
+ vmu_anon_t *anon;
+ void *cookie = NULL;
if (entity->vme_vnode_hash != NULL)
i_mod_hash_clear_nosync(entity->vme_vnode_hash);
if (entity->vme_amp_hash != NULL)
i_mod_hash_clear_nosync(entity->vme_amp_hash);
- if (entity->vme_anon_hash != NULL)
- i_mod_hash_clear_nosync(entity->vme_anon_hash);
+
+ while ((anon = avl_destroy_nodes(&entity->vme_anon, &cookie)) != NULL)
+ kmem_free(anon, sizeof (vmu_anon_t));
+
+ avl_destroy(&entity->vme_anon);
entity->vme_next = vmu_data.vmu_free_entities;
vmu_data.vmu_free_entities = entity;
@@ -485,10 +520,10 @@ vmu_alloc_entity(id_t id, int type, id_t zoneid)
"vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
sizeof (struct anon_map));
- if (entity->vme_anon_hash == NULL)
- entity->vme_anon_hash = mod_hash_create_ptrhash(
- "vmusage anon hash", VMUSAGE_HASH_SIZE,
- mod_hash_null_valdtor, sizeof (struct anon));
+ VERIFY(avl_first(&entity->vme_anon) == NULL);
+
+ avl_create(&entity->vme_anon, vmu_anon_cmp, sizeof (struct vmu_anon),
+ offsetof(struct vmu_anon, vma_node));
entity->vme_next = vmu_data.vmu_entities;
vmu_data.vmu_entities = entity;
@@ -518,7 +553,8 @@ vmu_alloc_zone(id_t id)
zone->vmz_id = id;
- if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0)
+ if ((vmu_data.vmu_calc_flags &
+ (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE)) != 0)
zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id);
if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS |
@@ -613,21 +649,19 @@ vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type)
}
static int
-vmu_find_insert_anon(mod_hash_t *hash, caddr_t key)
+vmu_find_insert_anon(vmu_entity_t *entity, void *key)
{
- int ret;
- caddr_t val;
+ vmu_anon_t anon, *ap;
- ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
- (mod_hash_val_t *)&val);
+ anon.vma_addr = (uintptr_t)key;
- if (ret == 0)
+ if (avl_find(&entity->vme_anon, &anon, NULL) != NULL)
return (0);
- ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
- (mod_hash_val_t)key, (mod_hash_hndl_t)0);
+ ap = kmem_alloc(sizeof (vmu_anon_t), KM_SLEEP);
+ ap->vma_addr = (uintptr_t)key;
- ASSERT(ret == 0);
+ avl_add(&entity->vme_anon, ap);
return (1);
}
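
The hunk above replaces the per-entity anon mod_hash with an AVL tree keyed on the anon pointer. For reference, a minimal sketch of the same insert-if-absent idiom against the generic illumos AVL interface, using hypothetical type and function names; it differs from the code above only in passing the avl_find() insertion hint to avl_insert() rather than calling avl_add(), which is a convenience wrapper that asserts the key is absent.

#include <sys/avl.h>
#include <sys/kmem.h>

typedef struct key_node {
	avl_node_t	kn_link;
	uintptr_t	kn_key;
} key_node_t;

static int
key_cmp(const void *l, const void *r)
{
	uintptr_t a = ((const key_node_t *)l)->kn_key;
	uintptr_t b = ((const key_node_t *)r)->kn_key;

	if (a == b)
		return (0);
	return (a < b ? -1 : 1);
}

/* Returns 1 if key was newly inserted, 0 if it was already present. */
static int
insert_if_absent(avl_tree_t *tree, uintptr_t key)
{
	key_node_t search, *kn;
	avl_index_t where;

	search.kn_key = key;
	if (avl_find(tree, &search, &where) != NULL)
		return (0);

	kn = kmem_alloc(sizeof (*kn), KM_SLEEP);
	kn->kn_key = key;
	avl_insert(tree, kn, where);
	return (1);
}
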
@@ -918,6 +952,8 @@ vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp,
next = AVL_NEXT(tree, next);
continue;
}
+
+ ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN);
bound_type = next->vmb_type;
index = next->vmb_start;
while (index <= next->vmb_end) {
@@ -937,7 +973,10 @@ vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp,
if (ap != NULL && vn != NULL && vn->v_pages != NULL &&
(page = page_exists(vn, off)) != NULL) {
- page_type = VMUSAGE_BOUND_INCORE;
+ if (PP_ISFREE(page))
+ page_type = VMUSAGE_BOUND_NOT_INCORE;
+ else
+ page_type = VMUSAGE_BOUND_INCORE;
if (page->p_szc > 0) {
pgcnt = page_get_pagecnt(page->p_szc);
pgshft = page_get_shift(page->p_szc);
@@ -947,8 +986,10 @@ vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp,
} else {
page_type = VMUSAGE_BOUND_NOT_INCORE;
}
+
if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
next->vmb_type = page_type;
+ bound_type = page_type;
} else if (next->vmb_type != page_type) {
/*
* If current bound type does not match page
@@ -1009,6 +1050,7 @@ vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode,
continue;
}
+ ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN);
bound_type = next->vmb_type;
index = next->vmb_start;
while (index <= next->vmb_end) {
@@ -1024,7 +1066,10 @@ vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode,
if (vnode->v_pages != NULL &&
(page = page_exists(vnode, ptob(index))) != NULL) {
- page_type = VMUSAGE_BOUND_INCORE;
+ if (PP_ISFREE(page))
+ page_type = VMUSAGE_BOUND_NOT_INCORE;
+ else
+ page_type = VMUSAGE_BOUND_INCORE;
if (page->p_szc > 0) {
pgcnt = page_get_pagecnt(page->p_szc);
pgshft = page_get_shift(page->p_szc);
@@ -1034,8 +1079,10 @@ vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode,
} else {
page_type = VMUSAGE_BOUND_NOT_INCORE;
}
+
if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
next->vmb_type = page_type;
+ bound_type = page_type;
} else if (next->vmb_type != page_type) {
/*
* If current bound type does not match page
@@ -1304,6 +1351,12 @@ vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg)
}
/*
+ * Pages on the free list aren't counted for the rss.
+ */
+ if (PP_ISFREE(page))
+ continue;
+
+ /*
* Assume anon structs with a refcnt
* of 1 are not COW shared, so there
* is no reason to track them per entity.
@@ -1320,8 +1373,7 @@ vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg)
* Track COW anons per entity so
* they are not double counted.
*/
- if (vmu_find_insert_anon(entity->vme_anon_hash,
- (caddr_t)ap) == 0)
+ if (vmu_find_insert_anon(entity, ap) == 0)
continue;
result->vmu_rss_all += (pgcnt << PAGESHIFT);
@@ -1461,8 +1513,9 @@ vmu_calculate_proc(proc_t *p)
entities = tmp;
}
if (vmu_data.vmu_calc_flags &
- (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS |
- VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
+ (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE |
+ VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
+ VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS |
VMUSAGE_ALL_EUSERS)) {
ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
@@ -1595,8 +1648,7 @@ vmu_free_extra()
mod_hash_destroy_hash(te->vme_vnode_hash);
if (te->vme_amp_hash != NULL)
mod_hash_destroy_hash(te->vme_amp_hash);
- if (te->vme_anon_hash != NULL)
- mod_hash_destroy_hash(te->vme_anon_hash);
+ VERIFY(avl_first(&te->vme_anon) == NULL);
kmem_free(te, sizeof (vmu_entity_t));
}
while (vmu_data.vmu_free_zones != NULL) {
@@ -1617,13 +1669,42 @@ vmu_free_extra()
extern kcondvar_t *pr_pid_cv;
+static void
+vmu_get_zone_rss(zoneid_t zid)
+{
+ vmu_zone_t *zone;
+ zone_t *zp;
+ int ret;
+ uint_t pgcnt;
+
+ if ((zp = zone_find_by_id(zid)) == NULL)
+ return;
+
+ ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
+ (mod_hash_key_t)(uintptr_t)zid, (mod_hash_val_t *)&zone);
+ if (ret != 0) {
+ zone = vmu_alloc_zone(zid);
+ ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash,
+ (mod_hash_key_t)(uintptr_t)zid,
+ (mod_hash_val_t)zone, (mod_hash_hndl_t)0);
+ ASSERT(ret == 0);
+ }
+
+ ASSERT(zid >= 0 && zid <= MAX_ZONEID);
+ pgcnt = zone_pdata[zid].zpers_pg_cnt;
+ zone->vmz_zone->vme_result.vmu_rss_all = (size_t)ptob(pgcnt);
+ zone->vmz_zone->vme_result.vmu_swap_all = zp->zone_max_swap;
+
+ zone_rele(zp);
+}
+
/*
* Determine which entity types are relevant and allocate the hashes to
- * track them. Then walk the process table and count rss and swap
- * for each process'es address space. Address space object such as
- * vnodes, amps and anons are tracked per entity, so that they are
- * not double counted in the results.
- *
+ * track them. First get the zone rss using the data we already have. Then,
+ * if necessary, walk the process table and count rss and swap for each
+ * process'es address space. Address space object such as vnodes, amps and
+ * anons are tracked per entity, so that they are not double counted in the
+ * results.
*/
static void
vmu_calculate()
@@ -1631,6 +1712,7 @@ vmu_calculate()
int i = 0;
int ret;
proc_t *p;
+ uint_t zone_flags = 0;
vmu_clear_calc();
@@ -1638,9 +1720,34 @@ vmu_calculate()
vmu_data.vmu_system = vmu_alloc_entity(0, VMUSAGE_SYSTEM,
ALL_ZONES);
+ zone_flags = vmu_data.vmu_calc_flags & VMUSAGE_ZONE_FLAGS;
+ if (zone_flags != 0) {
+ /*
+ * Use the accurate zone RSS data we already keep track of.
+ */
+ int i;
+
+ for (i = 0; i <= MAX_ZONEID; i++) {
+ if (zone_pdata[i].zpers_pg_cnt > 0) {
+ vmu_get_zone_rss(i);
+ }
+ }
+ }
+
+ /* If we only needed zone data, we're done. */
+ if ((vmu_data.vmu_calc_flags & ~VMUSAGE_ZONE_FLAGS) == 0) {
+ return;
+ }
+
+ DTRACE_PROBE(vmu__calculate__all);
+ vmu_data.vmu_calc_flags &= ~VMUSAGE_ZONE_FLAGS;
+
/*
* Walk process table and calculate rss of each proc.
*
+ * Since we already obtained all zone rss above, the following loop
+ * executes with the VMUSAGE_ZONE_FLAGS cleared.
+ *
* Pidlock and p_lock cannot be held while doing the rss calculation.
* This is because:
* 1. The calculation allocates using KM_SLEEP.
@@ -1695,6 +1802,12 @@ again:
mutex_exit(&pidlock);
vmu_free_extra();
+
+ /*
+ * Restore any caller-supplied zone flags we blocked during
+ * the process-table walk.
+ */
+ vmu_data.vmu_calc_flags |= zone_flags;
}
/*
@@ -1745,7 +1858,7 @@ vmu_cache_rele(vmu_cache_t *cache)
*/
static int
vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
- uint_t flags, int cpflg)
+ uint_t flags, id_t req_zone_id, int cpflg)
{
vmusage_t *result, *out_result;
vmusage_t dummy;
@@ -1764,7 +1877,7 @@ vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
/* figure out what results the caller is interested in. */
if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone)
types |= VMUSAGE_SYSTEM;
- if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES))
+ if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE))
types |= VMUSAGE_ZONE;
if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
VMUSAGE_COL_PROJECTS))
@@ -1827,26 +1940,33 @@ vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
continue;
}
- /* Skip "other zone" results if not requested */
- if (result->vmu_zoneid != curproc->p_zone->zone_id) {
- if (result->vmu_type == VMUSAGE_ZONE &&
- (flags & VMUSAGE_ALL_ZONES) == 0)
- continue;
- if (result->vmu_type == VMUSAGE_PROJECTS &&
- (flags & (VMUSAGE_ALL_PROJECTS |
- VMUSAGE_COL_PROJECTS)) == 0)
- continue;
- if (result->vmu_type == VMUSAGE_TASKS &&
- (flags & VMUSAGE_ALL_TASKS) == 0)
- continue;
- if (result->vmu_type == VMUSAGE_RUSERS &&
- (flags & (VMUSAGE_ALL_RUSERS |
- VMUSAGE_COL_RUSERS)) == 0)
- continue;
- if (result->vmu_type == VMUSAGE_EUSERS &&
- (flags & (VMUSAGE_ALL_EUSERS |
- VMUSAGE_COL_EUSERS)) == 0)
+ if (result->vmu_type == VMUSAGE_ZONE &&
+ flags & VMUSAGE_A_ZONE) {
+ /* Skip non-requested zone results */
+ if (result->vmu_zoneid != req_zone_id)
continue;
+ } else {
+ /* Skip "other zone" results if not requested */
+ if (result->vmu_zoneid != curproc->p_zone->zone_id) {
+ if (result->vmu_type == VMUSAGE_ZONE &&
+ (flags & VMUSAGE_ALL_ZONES) == 0)
+ continue;
+ if (result->vmu_type == VMUSAGE_PROJECTS &&
+ (flags & (VMUSAGE_ALL_PROJECTS |
+ VMUSAGE_COL_PROJECTS)) == 0)
+ continue;
+ if (result->vmu_type == VMUSAGE_TASKS &&
+ (flags & VMUSAGE_ALL_TASKS) == 0)
+ continue;
+ if (result->vmu_type == VMUSAGE_RUSERS &&
+ (flags & (VMUSAGE_ALL_RUSERS |
+ VMUSAGE_COL_RUSERS)) == 0)
+ continue;
+ if (result->vmu_type == VMUSAGE_EUSERS &&
+ (flags & (VMUSAGE_ALL_EUSERS |
+ VMUSAGE_COL_EUSERS)) == 0)
+ continue;
+ }
}
count++;
if (out_result != NULL) {
@@ -1902,10 +2022,12 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
int cacherecent = 0;
hrtime_t now;
uint_t flags_orig;
+ id_t req_zone_id;
/*
* Non-global zones cannot request system wide and/or collated
- * results, or the system result, so munge the flags accordingly.
+ * results, or the system result, or usage of another zone, so munge
+ * the flags accordingly.
*/
flags_orig = flags;
if (curproc->p_zone != global_zone) {
@@ -1925,6 +2047,10 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
flags &= ~VMUSAGE_SYSTEM;
flags |= VMUSAGE_ZONE;
}
+ if (flags & VMUSAGE_A_ZONE) {
+ flags &= ~VMUSAGE_A_ZONE;
+ flags |= VMUSAGE_ZONE;
+ }
}
/* Check for unknown flags */
@@ -1935,6 +2061,21 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
if ((flags & VMUSAGE_MASK) == 0)
return (set_errno(EINVAL));
+ /* If requesting results for a specific zone, get the zone ID */
+ if (flags & VMUSAGE_A_ZONE) {
+ size_t bufsize;
+ vmusage_t zreq;
+
+ if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
+ return (set_errno(EFAULT));
+ /* Requested zone ID is passed in buf, so 0 len not allowed */
+ if (bufsize == 0)
+ return (set_errno(EINVAL));
+ if (ddi_copyin((caddr_t)buf, &zreq, sizeof (vmusage_t), cpflg))
+ return (set_errno(EFAULT));
+ req_zone_id = zreq.vmu_id;
+ }
+
mutex_enter(&vmu_data.vmu_lock);
now = gethrtime();
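
A hedged userland sketch of the new interface: a global-zone caller asking for a single zone's usage with VMUSAGE_A_ZONE (the flag is defined elsewhere in this changeset). Per the copyin logic above, the requested zone ID is passed in buf[0].vmu_id and *nres must be non-zero on entry; everything else follows the existing getvmusage(2) contract.

#include <sys/types.h>
#include <sys/vm_usage.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical caller: print RSS and swap for one zone, results <= 10s old. */
static int
print_zone_usage(id_t zoneid)
{
	vmusage_t buf[4];
	size_t nres = sizeof (buf) / sizeof (buf[0]);

	(void) memset(buf, 0, sizeof (buf));
	buf[0].vmu_id = zoneid;		/* requested zone rides in buf[0] */

	if (getvmusage(VMUSAGE_A_ZONE, 10, buf, &nres) != 0) {
		perror("getvmusage");
		return (-1);
	}

	if (nres >= 1) {
		(void) printf("zone %d: rss=%llu swap=%llu\n", (int)zoneid,
		    (u_longlong_t)buf[0].vmu_rss_all,
		    (u_longlong_t)buf[0].vmu_swap_all);
	}
	return (0);
}
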
@@ -1954,7 +2095,7 @@ start:
mutex_exit(&vmu_data.vmu_lock);
ret = vmu_copyout_results(cache, buf, nres, flags_orig,
- cpflg);
+ req_zone_id, cpflg);
mutex_enter(&vmu_data.vmu_lock);
vmu_cache_rele(cache);
if (vmu_data.vmu_pending_waiters > 0)
@@ -2011,7 +2152,8 @@ start:
mutex_exit(&vmu_data.vmu_lock);
/* copy cache */
- ret = vmu_copyout_results(cache, buf, nres, flags_orig, cpflg);
+ ret = vmu_copyout_results(cache, buf, nres, flags_orig,
+ req_zone_id, cpflg);
mutex_enter(&vmu_data.vmu_lock);
vmu_cache_rele(cache);
mutex_exit(&vmu_data.vmu_lock);