Diffstat (limited to 'usr/src/uts/common/vm')
-rw-r--r--  usr/src/uts/common/vm/hat.h          10
-rw-r--r--  usr/src/uts/common/vm/page.h          7
-rw-r--r--  usr/src/uts/common/vm/page_lock.c    19
-rw-r--r--  usr/src/uts/common/vm/page_retire.c   7
-rw-r--r--  usr/src/uts/common/vm/seg_kmem.c     83
-rw-r--r--  usr/src/uts/common/vm/seg_kmem.h     18
-rw-r--r--  usr/src/uts/common/vm/seg_vn.c       11
-rw-r--r--  usr/src/uts/common/vm/vm_as.c        19
-rw-r--r--  usr/src/uts/common/vm/vm_page.c      29
-rw-r--r--  usr/src/uts/common/vm/vm_pvn.c       28
-rw-r--r--  usr/src/uts/common/vm/vm_usage.c    252
11 files changed, 347 insertions, 136 deletions
diff --git a/usr/src/uts/common/vm/hat.h b/usr/src/uts/common/vm/hat.h
index a2509e7bb6..3735139068 100644
--- a/usr/src/uts/common/vm/hat.h
+++ b/usr/src/uts/common/vm/hat.h
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2014 Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -269,7 +270,12 @@ void hat_kpm_walk(void (*)(void *, void *, size_t), void *);
* call.
*
* int hat_pageunload(pp, forceflag)
- * unload all translations attached to pp.
+ * Unload all translations attached to pp. On x86 the bulk of the work is
+ * done by hat_page_inval.
+ *
+ * void hat_page_inval(pp, pgsz, curhat)
+ * Unload translations attached to pp. If curhat is provided, only the
+ * translation for that process is unloaded, otherwise all are unloaded.
*
* uint_t hat_pagesync(pp, flags)
* get hw stats from hardware into page struct and reset hw stats
@@ -291,6 +297,7 @@ void hat_page_setattr(struct page *, uint_t);
void hat_page_clrattr(struct page *, uint_t);
uint_t hat_page_getattr(struct page *, uint_t);
int hat_pageunload(struct page *, uint_t);
+void hat_page_inval(struct page *, uint_t, struct hat *);
uint_t hat_pagesync(struct page *, uint_t);
ulong_t hat_page_getshare(struct page *);
int hat_page_checkshare(struct page *, ulong_t);
@@ -460,6 +467,7 @@ void hat_setstat(struct as *, caddr_t, size_t, uint_t);
*/
#define HAT_ADV_PGUNLOAD 0x00
#define HAT_FORCE_PGUNLOAD 0x01
+#define HAT_CURPROC_PGUNLOAD 0x02
/*
* Attributes for hat_page_*attr, hat_setstats and
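
A minimal sketch of how a pager-side caller might choose between the two unload modes documented above; the helper name is hypothetical and not part of this commit, but the flag usage mirrors the pvn_getdirty() change later in this diff.

#include <vm/hat.h>
#include <vm/page.h>

/*
 * Hypothetical helper: drop translations to pp, either for every process
 * that maps it (forced) or only for the process on whose behalf we run.
 */
static void
drop_mappings(page_t *pp, boolean_t current_only)
{
	if (current_only)
		(void) hat_pageunload(pp, HAT_CURPROC_PGUNLOAD);
	else
		(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
}
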
diff --git a/usr/src/uts/common/vm/page.h b/usr/src/uts/common/vm/page.h
index 8747b96acc..ae9b0be758 100644
--- a/usr/src/uts/common/vm/page.h
+++ b/usr/src/uts/common/vm/page.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2017, Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -229,6 +230,7 @@ struct as;
* p_nrm
* p_mapping
* p_share
+ * p_zoneid
*
* The following field is file system dependent. How it is used and
* the locking strategies applied are up to the individual file system
@@ -527,9 +529,8 @@ typedef struct page {
pfn_t p_pagenum; /* physical page number */
uint_t p_share; /* number of translations */
-#if defined(_LP64)
- uint_t p_sharepad; /* pad for growing p_share */
-#endif
+ short p_zoneid; /* zone page use tracking */
+ short p_pad1; /* TBD */
uint_t p_slckcnt; /* number of softlocks */
#if defined(__sparc)
uint_t p_kpmref; /* number of kpm mapping sharers */
diff --git a/usr/src/uts/common/vm/page_lock.c b/usr/src/uts/common/vm/page_lock.c
index 7e48602189..a5a39d04c1 100644
--- a/usr/src/uts/common/vm/page_lock.c
+++ b/usr/src/uts/common/vm/page_lock.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
@@ -140,9 +141,8 @@ static pad_mutex_t pszc_mutex[PSZC_MTX_TABLE_SIZE];
& (VPH_TABLE_SIZE - 1))
/*
- * Two slots after VPH_TABLE_SIZE are reserved in vph_mutex for kernel vnodes.
- * The lock for kvp is VPH_TABLE_SIZE + 0, and the lock for zvp is
- * VPH_TABLE_SIZE + 1.
+ * Two slots after VPH_TABLE_SIZE are reserved in vph_mutex for kernel vnodes,
+ * one for kvps[KV_ZVP], and one for other kvps[] users.
*/
kmutex_t vph_mutex[VPH_TABLE_SIZE + 2];
@@ -364,7 +364,6 @@ page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
retval = 0;
} else if ((pp->p_selock & ~SE_EWANTED) == 0) {
/* no reader/writer lock held */
- THREAD_KPRI_REQUEST();
/* this clears our setting of the SE_EWANTED bit */
pp->p_selock = SE_WRITER;
retval = 1;
@@ -551,7 +550,6 @@ page_try_reclaim_lock(page_t *pp, se_t se, int es)
if (!(old & SE_EWANTED) || (es & SE_EXCL_WANTED)) {
if ((old & ~SE_EWANTED) == 0) {
/* no reader/writer lock held */
- THREAD_KPRI_REQUEST();
/* this clears out our setting of the SE_EWANTED bit */
pp->p_selock = SE_WRITER;
mutex_exit(pse);
@@ -590,7 +588,6 @@ page_trylock(page_t *pp, se_t se)
if (se == SE_EXCL) {
if (pp->p_selock == 0) {
- THREAD_KPRI_REQUEST();
pp->p_selock = SE_WRITER;
mutex_exit(pse);
return (1);
@@ -628,7 +625,6 @@ page_unlock_nocapture(page_t *pp)
} else if ((old & ~SE_EWANTED) == SE_DELETED) {
panic("page_unlock_nocapture: page %p is deleted", (void *)pp);
} else if (old < 0) {
- THREAD_KPRI_RELEASE();
pp->p_selock &= SE_EWANTED;
if (CV_HAS_WAITERS(&pp->p_cv))
cv_broadcast(&pp->p_cv);
@@ -662,7 +658,6 @@ page_unlock(page_t *pp)
} else if ((old & ~SE_EWANTED) == SE_DELETED) {
panic("page_unlock: page %p is deleted", (void *)pp);
} else if (old < 0) {
- THREAD_KPRI_RELEASE();
pp->p_selock &= SE_EWANTED;
if (CV_HAS_WAITERS(&pp->p_cv))
cv_broadcast(&pp->p_cv);
@@ -682,7 +677,6 @@ page_unlock(page_t *pp)
if ((pp->p_toxic & PR_CAPTURE) &&
!(curthread->t_flag & T_CAPTURING) &&
!PP_RETIRED(pp)) {
- THREAD_KPRI_REQUEST();
pp->p_selock = SE_WRITER;
mutex_exit(pse);
page_unlock_capture(pp);
@@ -712,7 +706,6 @@ page_tryupgrade(page_t *pp)
if (!(pp->p_selock & SE_EWANTED)) {
/* no threads want exclusive access, try upgrade */
if (pp->p_selock == SE_READER) {
- THREAD_KPRI_REQUEST();
/* convert to exclusive lock */
pp->p_selock = SE_WRITER;
mutex_exit(pse);
@@ -738,7 +731,6 @@ page_downgrade(page_t *pp)
mutex_enter(pse);
excl_waiting = pp->p_selock & SE_EWANTED;
- THREAD_KPRI_RELEASE();
pp->p_selock = SE_READER | excl_waiting;
if (CV_HAS_WAITERS(&pp->p_cv))
cv_broadcast(&pp->p_cv);
@@ -756,7 +748,6 @@ page_lock_delete(page_t *pp)
ASSERT(!PP_ISFREE(pp));
mutex_enter(pse);
- THREAD_KPRI_RELEASE();
pp->p_selock = SE_DELETED;
if (CV_HAS_WAITERS(&pp->p_cv))
cv_broadcast(&pp->p_cv);
@@ -888,10 +879,10 @@ static int page_vnode_mutex_stress = 0;
kmutex_t *
page_vnode_mutex(vnode_t *vp)
{
- if (vp == &kvp)
+ if (vp == &kvp || vp == &kvps[KV_VVP])
return (&vph_mutex[VPH_TABLE_SIZE + 0]);
- if (vp == &zvp)
+ if (vp == &kvps[KV_ZVP])
return (&vph_mutex[VPH_TABLE_SIZE + 1]);
#ifdef DEBUG
if (page_vnode_mutex_stress != 0)
diff --git a/usr/src/uts/common/vm/page_retire.c b/usr/src/uts/common/vm/page_retire.c
index 76be970a45..f4e8d0737f 100644
--- a/usr/src/uts/common/vm/page_retire.c
+++ b/usr/src/uts/common/vm/page_retire.c
@@ -22,6 +22,7 @@
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* Copyright (c) 2016 by Delphix. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
*/
/*
@@ -851,9 +852,8 @@ page_retire_incr_pend_count(void *datap)
{
PR_INCR_KSTAT(pr_pending);
- if ((datap == &kvp) || (datap == &zvp)) {
+ if (datap == &kvp || datap == &kvps[KV_ZVP] || datap == &kvps[KV_VVP])
PR_INCR_KSTAT(pr_pending_kas);
- }
}
void
@@ -861,9 +861,8 @@ page_retire_decr_pend_count(void *datap)
{
PR_DECR_KSTAT(pr_pending);
- if ((datap == &kvp) || (datap == &zvp)) {
+ if (datap == &kvp || datap == &kvps[KV_ZVP] || datap == &kvps[KV_VVP])
PR_DECR_KSTAT(pr_pending_kas);
- }
}
/*
diff --git a/usr/src/uts/common/vm/seg_kmem.c b/usr/src/uts/common/vm/seg_kmem.c
index 439c859d96..0b116d6eba 100644
--- a/usr/src/uts/common/vm/seg_kmem.c
+++ b/usr/src/uts/common/vm/seg_kmem.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
*/
#include <sys/types.h>
@@ -122,6 +122,11 @@ vmem_t *static_alloc_arena; /* arena for allocating static memory */
vmem_t *zio_arena = NULL; /* arena for allocating zio memory */
vmem_t *zio_alloc_arena = NULL; /* arena for allocating zio memory */
+#if defined(__amd64)
+vmem_t *kvmm_arena; /* arena for vmm VA */
+struct seg kvmmseg; /* Segment for vmm memory */
+#endif
+
/*
* seg_kmem driver can map part of the kernel heap with large pages.
* Currently this functionality is implemented for sparc platforms only.
@@ -440,7 +445,7 @@ segkmem_badop()
/*ARGSUSED*/
static faultcode_t
segkmem_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t size,
- enum fault_type type, enum seg_rw rw)
+ enum fault_type type, enum seg_rw rw)
{
pgcnt_t npages;
spgcnt_t pg;
@@ -655,13 +660,19 @@ segkmem_dump(struct seg *seg)
segkmem_dump_range, seg->s_as);
vmem_walk(heaptext_arena, VMEM_ALLOC | VMEM_REENTRANT,
segkmem_dump_range, seg->s_as);
+ /*
+ * We don't want to dump pages attached to kzioseg since they
+ * contain file data from ZFS. If this page's segment is
+ * kzioseg return instead of writing it to the dump device.
+ *
+ * Same applies to VM memory allocations.
+ */
} else if (seg == &kzioseg) {
- /*
- * We don't want to dump pages attached to kzioseg since they
- * contain file data from ZFS. If this page's segment is
- * kzioseg return instead of writing it to the dump device.
- */
return;
+#if defined(__amd64)
+ } else if (seg == &kvmmseg) {
+ return;
+#endif
} else {
segkmem_dump_range(seg->s_as, seg->s_base, seg->s_size);
}
@@ -677,7 +688,7 @@ segkmem_dump(struct seg *seg)
/*ARGSUSED*/
static int
segkmem_pagelock(struct seg *seg, caddr_t addr, size_t len,
- page_t ***ppp, enum lock_type type, enum seg_rw rw)
+ page_t ***ppp, enum lock_type type, enum seg_rw rw)
{
page_t **pplist, *pp;
pgcnt_t npages;
@@ -802,21 +813,18 @@ struct seg_ops segkmem_ops = {
};
int
-segkmem_zio_create(struct seg *seg)
-{
- ASSERT(seg->s_as == &kas && RW_WRITE_HELD(&kas.a_lock));
- seg->s_ops = &segkmem_ops;
- seg->s_data = &zvp;
- kas.a_size += seg->s_size;
- return (0);
-}
-
-int
segkmem_create(struct seg *seg)
{
ASSERT(seg->s_as == &kas && RW_WRITE_HELD(&kas.a_lock));
seg->s_ops = &segkmem_ops;
- seg->s_data = &kvp;
+ if (seg == &kzioseg)
+ seg->s_data = &kvps[KV_ZVP];
+#if defined(__amd64)
+ else if (seg == &kvmmseg)
+ seg->s_data = &kvps[KV_VVP];
+#endif
+ else
+ seg->s_data = &kvps[KV_KVP];
kas.a_size += seg->s_size;
return (0);
}
@@ -858,7 +866,7 @@ segkmem_page_create(void *addr, size_t size, int vmflag, void *arg)
*/
void *
segkmem_xalloc(vmem_t *vmp, void *inaddr, size_t size, int vmflag, uint_t attr,
- page_t *(*page_create_func)(void *, size_t, int, void *), void *pcarg)
+ page_t *(*page_create_func)(void *, size_t, int, void *), void *pcarg)
{
page_t *ppl;
caddr_t addr = inaddr;
@@ -968,10 +976,10 @@ segkmem_alloc(vmem_t *vmp, size_t size, int vmflag)
return (segkmem_alloc_vn(vmp, size, vmflag, &kvp));
}
-void *
+static void *
segkmem_zio_alloc(vmem_t *vmp, size_t size, int vmflag)
{
- return (segkmem_alloc_vn(vmp, size, vmflag, &zvp));
+ return (segkmem_alloc_vn(vmp, size, vmflag, &kvps[KV_ZVP]));
}
/*
@@ -980,8 +988,8 @@ segkmem_zio_alloc(vmem_t *vmp, size_t size, int vmflag)
* we currently don't have a special kernel segment for non-paged
* kernel memory that is exported by drivers to user space.
*/
-static void
-segkmem_free_vn(vmem_t *vmp, void *inaddr, size_t size, struct vnode *vp,
+void
+segkmem_xfree(vmem_t *vmp, void *inaddr, size_t size, struct vnode *vp,
void (*func)(page_t *))
{
page_t *pp;
@@ -1038,21 +1046,15 @@ segkmem_free_vn(vmem_t *vmp, void *inaddr, size_t size, struct vnode *vp,
}
void
-segkmem_xfree(vmem_t *vmp, void *inaddr, size_t size, void (*func)(page_t *))
-{
- segkmem_free_vn(vmp, inaddr, size, &kvp, func);
-}
-
-void
segkmem_free(vmem_t *vmp, void *inaddr, size_t size)
{
- segkmem_free_vn(vmp, inaddr, size, &kvp, NULL);
+ segkmem_xfree(vmp, inaddr, size, &kvp, NULL);
}
-void
+static void
segkmem_zio_free(vmem_t *vmp, void *inaddr, size_t size)
{
- segkmem_free_vn(vmp, inaddr, size, &zvp, NULL);
+ segkmem_xfree(vmp, inaddr, size, &kvps[KV_ZVP], NULL);
}
void
@@ -1534,8 +1536,21 @@ segkmem_zio_init(void *zio_mem_base, size_t zio_mem_size)
ASSERT(zio_alloc_arena != NULL);
}
-#ifdef __sparc
+#if defined(__amd64)
+
+void
+segkmem_kvmm_init(void *base, size_t size)
+{
+ ASSERT(base != NULL);
+ ASSERT(size != 0);
+
+ kvmm_arena = vmem_create("kvmm_arena", base, size, 1024 * 1024,
+ NULL, NULL, NULL, 0, VM_SLEEP);
+
+ ASSERT(kvmm_arena != NULL);
+}
+#elif defined(__sparc)
static void *
segkmem_alloc_ppa(vmem_t *vmp, size_t size, int vmflag)
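
A hedged sketch of how a vmm consumer might draw kernel VA from the new kvmm_arena; the wrapper names are assumptions, and only vmem_alloc()/vmem_free() are the established arena interface. The arena is created with a 1 MiB quantum, so requests are rounded to 1 MiB multiples, and backing pages must still be supplied separately.

#include <sys/vmem.h>
#include <vm/seg_kmem.h>

/*
 * Hypothetical consumer: reserve and release a VA range for guest memory.
 * kvmm_arena only hands out virtual addresses; mapping physical pages into
 * that range is a separate step.
 */
static void *
vmm_va_reserve(size_t len)
{
	return (vmem_alloc(kvmm_arena, len, VM_SLEEP));
}

static void
vmm_va_release(void *va, size_t len)
{
	vmem_free(kvmm_arena, va, len);
}
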
diff --git a/usr/src/uts/common/vm/seg_kmem.h b/usr/src/uts/common/vm/seg_kmem.h
index 1db85826b1..9a20101670 100644
--- a/usr/src/uts/common/vm/seg_kmem.h
+++ b/usr/src/uts/common/vm/seg_kmem.h
@@ -21,7 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
* Copyright 2017 RackTop Systems.
*/
@@ -65,12 +65,18 @@ extern vmem_t *static_arena; /* arena for caches to import static memory */
extern vmem_t *static_alloc_arena; /* arena for allocating static memory */
extern vmem_t *zio_arena; /* arena for zio caches */
extern vmem_t *zio_alloc_arena; /* arena for zio caches */
+
+#if defined(__amd64)
+extern struct seg kvmmseg; /* Segment for vmm mappings */
+extern vmem_t *kvmm_arena; /* arena for vmm VA */
+extern void segkmem_kvmm_init(void *, size_t);
+#endif
+
extern struct vnode kvps[];
/*
- * segkmem page vnodes
+ * segkmem page vnodes (please don't add more defines here...)
*/
#define kvp (kvps[KV_KVP])
-#define zvp (kvps[KV_ZVP])
#if defined(__sparc)
#define mpvp (kvps[KV_MPVP])
#define promvp (kvps[KV_PROMVP])
@@ -83,16 +89,14 @@ extern void *segkmem_xalloc(vmem_t *, void *, size_t, int, uint_t,
extern void *segkmem_alloc(vmem_t *, size_t, int);
extern void *segkmem_alloc_permanent(vmem_t *, size_t, int);
extern void segkmem_free(vmem_t *, void *, size_t);
-extern void segkmem_xfree(vmem_t *, void *, size_t, void (*)(page_t *));
+extern void segkmem_xfree(vmem_t *, void *, size_t,
+ struct vnode *, void (*)(page_t *));
extern void *boot_alloc(void *, size_t, uint_t);
extern void boot_mapin(caddr_t addr, size_t size);
extern void kernelheap_init(void *, void *, char *, void *, void *);
extern void segkmem_gc(void);
-extern void *segkmem_zio_alloc(vmem_t *, size_t, int);
-extern int segkmem_zio_create(struct seg *);
-extern void segkmem_zio_free(vmem_t *, void *, size_t);
extern void segkmem_zio_init(void *, size_t);
/*
diff --git a/usr/src/uts/common/vm/seg_vn.c b/usr/src/uts/common/vm/seg_vn.c
index 8046d10212..da6393f792 100644
--- a/usr/src/uts/common/vm/seg_vn.c
+++ b/usr/src/uts/common/vm/seg_vn.c
@@ -7313,7 +7313,8 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
vpp = svd->vpage;
offset = svd->offset + (uintptr_t)(addr - seg->s_base);
bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) |
- ((flags & MS_INVALIDATE) ? B_INVAL : 0);
+ ((flags & MS_INVALIDATE) ? B_INVAL : 0) |
+ ((flags & MS_INVALCURPROC) ? (B_INVALCURONLY | B_INVAL) : 0);
if (attr) {
pageprot = attr & ~(SHARED|PRIVATE);
@@ -7338,11 +7339,11 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
vpp = &svd->vpage[seg_page(seg, addr)];
} else if (svd->vp && svd->amp == NULL &&
- (flags & MS_INVALIDATE) == 0) {
+ (flags & (MS_INVALIDATE | MS_INVALCURPROC)) == 0) {
/*
- * No attributes, no anonymous pages and MS_INVALIDATE flag
- * is not on, just use one big request.
+ * No attributes, no anonymous pages and MS_INVAL* flags
+ * are not on, just use one big request.
*/
err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len,
bflags, svd->cred, NULL);
@@ -7394,7 +7395,7 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
* might race in and lock the page after we unlock and before
* we do the PUTPAGE, then PUTPAGE simply does nothing.
*/
- if (flags & MS_INVALIDATE) {
+ if (flags & (MS_INVALIDATE | MS_INVALCURPROC)) {
if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) {
if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
page_unlock(pp);
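
The MS_INVALCURPROC flag referenced above is defined elsewhere in this changeset (in sys/mman.h) and maps to B_INVALCURONLY | B_INVAL. A hedged userland sketch of the intended call, assuming that definition is visible to the compilation environment:

#include <sys/mman.h>
#include <stdio.h>

/*
 * Hypothetical use: write back dirty pages of a mapping and invalidate
 * only the calling process's translations, leaving other processes'
 * mappings of the same object intact.
 */
static int
sync_and_invalidate_local(void *addr, size_t len)
{
	if (msync(addr, len, MS_SYNC | MS_INVALCURPROC) != 0) {
		perror("msync");
		return (-1);
	}
	return (0);
}
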
diff --git a/usr/src/uts/common/vm/vm_as.c b/usr/src/uts/common/vm/vm_as.c
index 853b092e6d..ec6d2b8920 100644
--- a/usr/src/uts/common/vm/vm_as.c
+++ b/usr/src/uts/common/vm/vm_as.c
@@ -58,6 +58,7 @@
#include <sys/debug.h>
#include <sys/tnf_probe.h>
#include <sys/vtrace.h>
+#include <sys/ddi.h>
#include <vm/hat.h>
#include <vm/as.h>
@@ -72,6 +73,8 @@
clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */
+ulong_t as_user_seg_limit = 0xffff; /* max segments in an (non-kas) AS */
+
static struct kmem_cache *as_cache;
static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
@@ -853,8 +856,6 @@ as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
int as_lock_held;
klwp_t *lwp = ttolwp(curthread);
-
-
retry:
/*
* Indicate that the lwp is not to be stopped while waiting for a
@@ -1724,6 +1725,20 @@ as_map_locked(struct as *as, caddr_t addr, size_t size, segcreate_func_t crfp,
p->p_rctls, p, RCA_UNSAFE_ALL);
return (ENOMEM);
}
+
+ /*
+ * Keep the number of segments in a userspace AS constrained to
+ * a reasonable limit. Linux enforces a value slightly less
+ * than 64k in order to avoid ELF limits if/when a process
+ * dumps core. While SunOS avoids that specific problem with
+ * other tricks, the limit is still valuable to keep kernel
+ * memory consumption in check.
+ */
+ if (avl_numnodes(&as->a_segtree) >= as_user_seg_limit) {
+ AS_LOCK_EXIT(as);
+ atomic_inc_32(&p->p_zone->zone_mfseglim);
+ return (ENOMEM);
+ }
}
if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
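
Since as_user_seg_limit is an ordinary kernel global (0xffff, i.e. 65535 segments, by default), an administrator could presumably raise it for a workload that legitimately needs more mappings, e.g. with 'set as_user_seg_limit=0x20000' in /etc/system; that knob relies on standard illumos tunable handling rather than anything added by this diff.
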
diff --git a/usr/src/uts/common/vm/vm_page.c b/usr/src/uts/common/vm/vm_page.c
index 78d1cb1a58..abccf82057 100644
--- a/usr/src/uts/common/vm/vm_page.c
+++ b/usr/src/uts/common/vm/vm_page.c
@@ -22,6 +22,7 @@
* Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2015, Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
* Copyright (c) 2015, 2016 by Delphix. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
@@ -440,10 +441,26 @@ init_pages_pp_maximum()
}
}
+/*
+ * In the past, we limited the maximum pages that could be gotten to essentially
+ * 1/2 of the total pages on the system. However, this is too conservative for
+ * some cases. For example, if we want to host a large virtual machine which
+ * needs to use a significant portion of the system's memory. In practice,
+ * allowing more than 1/2 of the total pages is fine, but becomes problematic
+ * as we approach or exceed 75% of the pages on the system. Thus, we limit the
+ * maximum to 23/32 of the total pages, which is ~72%.
+ */
void
set_max_page_get(pgcnt_t target_total_pages)
{
- max_page_get = target_total_pages / 2;
+ max_page_get = (target_total_pages >> 5) * 23;
+ ASSERT3U(max_page_get, >, 0);
+}
+
+pgcnt_t
+get_max_page_get()
+{
+ return (max_page_get);
}
static pgcnt_t pending_delete;
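
A worked example of the new ceiling, with assumed numbers: on a machine with 16 GiB of 4 KiB pages, target_total_pages is 4,194,304, so max_page_get becomes (4,194,304 >> 5) * 23 = 3,014,656 pages, about 71.9% of memory, where the old code would have allowed exactly half (2,097,152). The same arithmetic as a standalone check:

#include <stdio.h>

/* Illustration only: the 23/32 limit on a hypothetical 16 GiB, 4 KiB-page box. */
int
main(void)
{
	unsigned long total = (16UL << 30) / 4096;	/* 4,194,304 pages */
	unsigned long maxget = (total >> 5) * 23;	/* 3,014,656 pages */

	(void) printf("max_page_get = %lu pages (%.2f%% of %lu)\n",
	    maxget, 100.0 * maxget / total, total);
	return (0);
}
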
@@ -1460,6 +1477,8 @@ page_create_throttle(pgcnt_t npages, int flags)
uint_t i;
pgcnt_t tf; /* effective value of throttlefree */
+ atomic_inc_64(&n_throttle);
+
/*
* Normal priority allocations.
*/
@@ -1492,7 +1511,7 @@ page_create_throttle(pgcnt_t npages, int flags)
tf = throttlefree -
((flags & PG_PUSHPAGE) ? pageout_reserve : 0);
- cv_signal(&proc_pageout->p_cv);
+ WAKE_PAGEOUT_SCANNER();
for (;;) {
fm = 0;
@@ -1579,7 +1598,7 @@ checkagain:
}
ASSERT(proc_pageout != NULL);
- cv_signal(&proc_pageout->p_cv);
+ WAKE_PAGEOUT_SCANNER();
TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_START,
"page_create_sleep_start: freemem %ld needfree %ld",
@@ -2226,7 +2245,7 @@ page_create_va_large(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags,
if (nscan < desscan && freemem < minfree) {
TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
"pageout_cv_signal:freemem %ld", freemem);
- cv_signal(&proc_pageout->p_cv);
+ WAKE_PAGEOUT_SCANNER();
}
pp = rootpp;
@@ -2355,7 +2374,7 @@ page_create_va(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags,
if (nscan < desscan && freemem < minfree) {
TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
"pageout_cv_signal:freemem %ld", freemem);
- cv_signal(&proc_pageout->p_cv);
+ WAKE_PAGEOUT_SCANNER();
}
/*
diff --git a/usr/src/uts/common/vm/vm_pvn.c b/usr/src/uts/common/vm/vm_pvn.c
index 1b8d12eb8d..a206320a30 100644
--- a/usr/src/uts/common/vm/vm_pvn.c
+++ b/usr/src/uts/common/vm/vm_pvn.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
*/
@@ -432,7 +433,14 @@ pvn_write_done(page_t *plist, int flags)
page_io_unlock(pp);
page_unlock(pp);
}
- } else if (flags & B_INVAL) {
+ } else if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) {
+ /*
+ * If B_INVALCURONLY is set, then we handle that case
+ * in the next conditional if hat_page_is_mapped()
+ * indicates that there are no additional mappings
+ * to the page.
+ */
+
/*
* XXX - Failed writes with B_INVAL set are
* not handled appropriately.
@@ -573,8 +581,9 @@ pvn_write_done(page_t *plist, int flags)
}
/*
- * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI,
- * B_TRUNC, B_FORCE}. B_DELWRI indicates that this page is part of a kluster
+ * Flags are composed of {B_ASYNC, B_INVAL, B_INVALCURONLY, B_FREE,
+ * B_DONTNEED, B_DELWRI, B_TRUNC, B_FORCE}.
+ * B_DELWRI indicates that this page is part of a kluster
* operation and is only to be considered if it doesn't involve any
* waiting here. B_TRUNC indicates that the file is being truncated
* and so no i/o needs to be done. B_FORCE indicates that the page
@@ -628,13 +637,17 @@ pvn_getdirty(page_t *pp, int flags)
* If we want to free or invalidate the page then
* we need to unload it so that anyone who wants
* it will have to take a minor fault to get it.
+ * If we are only invalidating the page for the
+ * current process, then pass in a different flag.
* Otherwise, we're just writing the page back so we
* need to sync up the hardwre and software mod bit to
* detect any future modifications. We clear the
* software mod bit when we put the page on the dirty
* list.
*/
- if (flags & (B_INVAL | B_FREE)) {
+ if (flags & B_INVALCURONLY) {
+ (void) hat_pageunload(pp, HAT_CURPROC_PGUNLOAD);
+ } else if (flags & (B_INVAL | B_FREE)) {
(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
} else {
(void) hat_pagesync(pp, HAT_SYNC_ZERORM);
@@ -646,7 +659,7 @@ pvn_getdirty(page_t *pp, int flags)
* list after all.
*/
page_io_unlock(pp);
- if (flags & B_INVAL) {
+ if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) {
/*LINTED: constant in conditional context*/
VN_DISPOSE(pp, B_INVAL, 0, kcred);
} else if (flags & B_FREE) {
@@ -658,6 +671,9 @@ pvn_getdirty(page_t *pp, int flags)
* of VOP_PUTPAGE() who prefer freeing the
* page _only_ if no one else is accessing it.
* E.g. segmap_release()
+ * We also take this path for B_INVALCURONLY and
+ * let page_release call VN_DISPOSE if no one else is
+ * using the page.
*
* The above hat_ismod() check is useless because:
* (1) we may not be holding SE_EXCL lock;
@@ -682,7 +698,7 @@ pvn_getdirty(page_t *pp, int flags)
* We'll detect the fact that they used it when the
* i/o is done and avoid freeing the page.
*/
- if (flags & B_FREE)
+ if (flags & (B_FREE | B_INVALCURONLY))
page_downgrade(pp);
diff --git a/usr/src/uts/common/vm/vm_usage.c b/usr/src/uts/common/vm/vm_usage.c
index e542e8e479..01c2666e91 100644
--- a/usr/src/uts/common/vm/vm_usage.c
+++ b/usr/src/uts/common/vm/vm_usage.c
@@ -25,6 +25,10 @@
*/
/*
+ * Copyright 2018, Joyent, Inc.
+ */
+
+/*
* vm_usage
*
* This file implements the getvmusage() private system call.
@@ -114,7 +118,7 @@
* For accurate counting of map-shared and COW-shared pages.
*
* - visited private anons (refcnt > 1) for each collective.
- * (entity->vme_anon_hash)
+ * (entity->vme_anon)
* For accurate counting of COW-shared pages.
*
* The common accounting structure is the vmu_entity_t, which represents
@@ -152,6 +156,7 @@
#include <sys/vm_usage.h>
#include <sys/zone.h>
#include <sys/sunddi.h>
+#include <sys/sysmacros.h>
#include <sys/avl.h>
#include <vm/anon.h>
#include <vm/as.h>
@@ -199,6 +204,14 @@ typedef struct vmu_object {
} vmu_object_t;
/*
+ * Node for tree of visited COW anons.
+ */
+typedef struct vmu_anon {
+ avl_node_t vma_node;
+ uintptr_t vma_addr;
+} vmu_anon_t;
+
+/*
* Entity by which to count results.
*
* The entity structure keeps the current rss/swap counts for each entity
@@ -221,7 +234,7 @@ typedef struct vmu_entity {
struct vmu_entity *vme_next_calc;
mod_hash_t *vme_vnode_hash; /* vnodes visited for entity */
mod_hash_t *vme_amp_hash; /* shared amps visited for entity */
- mod_hash_t *vme_anon_hash; /* COW anons visited for entity */
+ avl_tree_t vme_anon; /* COW anons visited for entity */
vmusage_t vme_result; /* identifies entity and results */
} vmu_entity_t;
@@ -324,6 +337,23 @@ bounds_cmp(const void *bnd1, const void *bnd2)
}
/*
+ * Comparison routine for our AVL tree of anon structures.
+ */
+static int
+vmu_anon_cmp(const void *lhs, const void *rhs)
+{
+ const vmu_anon_t *l = lhs, *r = rhs;
+
+ if (l->vma_addr == r->vma_addr)
+ return (0);
+
+ if (l->vma_addr < r->vma_addr)
+ return (-1);
+
+ return (1);
+}
+
+/*
* Save a bound on the free list.
*/
static void
@@ -363,13 +393,18 @@ static void
vmu_free_entity(mod_hash_val_t val)
{
vmu_entity_t *entity = (vmu_entity_t *)val;
+ vmu_anon_t *anon;
+ void *cookie = NULL;
if (entity->vme_vnode_hash != NULL)
i_mod_hash_clear_nosync(entity->vme_vnode_hash);
if (entity->vme_amp_hash != NULL)
i_mod_hash_clear_nosync(entity->vme_amp_hash);
- if (entity->vme_anon_hash != NULL)
- i_mod_hash_clear_nosync(entity->vme_anon_hash);
+
+ while ((anon = avl_destroy_nodes(&entity->vme_anon, &cookie)) != NULL)
+ kmem_free(anon, sizeof (vmu_anon_t));
+
+ avl_destroy(&entity->vme_anon);
entity->vme_next = vmu_data.vmu_free_entities;
vmu_data.vmu_free_entities = entity;
@@ -485,10 +520,10 @@ vmu_alloc_entity(id_t id, int type, id_t zoneid)
"vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
sizeof (struct anon_map));
- if (entity->vme_anon_hash == NULL)
- entity->vme_anon_hash = mod_hash_create_ptrhash(
- "vmusage anon hash", VMUSAGE_HASH_SIZE,
- mod_hash_null_valdtor, sizeof (struct anon));
+ VERIFY(avl_first(&entity->vme_anon) == NULL);
+
+ avl_create(&entity->vme_anon, vmu_anon_cmp, sizeof (struct vmu_anon),
+ offsetof(struct vmu_anon, vma_node));
entity->vme_next = vmu_data.vmu_entities;
vmu_data.vmu_entities = entity;
@@ -518,7 +553,8 @@ vmu_alloc_zone(id_t id)
zone->vmz_id = id;
- if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0)
+ if ((vmu_data.vmu_calc_flags &
+ (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE)) != 0)
zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id);
if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS |
@@ -613,21 +649,19 @@ vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type)
}
static int
-vmu_find_insert_anon(mod_hash_t *hash, caddr_t key)
+vmu_find_insert_anon(vmu_entity_t *entity, void *key)
{
- int ret;
- caddr_t val;
+ vmu_anon_t anon, *ap;
- ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
- (mod_hash_val_t *)&val);
+ anon.vma_addr = (uintptr_t)key;
- if (ret == 0)
+ if (avl_find(&entity->vme_anon, &anon, NULL) != NULL)
return (0);
- ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
- (mod_hash_val_t)key, (mod_hash_hndl_t)0);
+ ap = kmem_alloc(sizeof (vmu_anon_t), KM_SLEEP);
+ ap->vma_addr = (uintptr_t)key;
- ASSERT(ret == 0);
+ avl_add(&entity->vme_anon, ap);
return (1);
}
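
The hunk above replaces the per-entity anon mod_hash with an AVL tree keyed on the anon pointer. For reference, a minimal sketch of the same insert-if-absent idiom against the generic illumos AVL interface, using hypothetical type and function names; it differs from the code above only in passing the avl_find() insertion hint to avl_insert() rather than calling avl_add(), which is a convenience wrapper that asserts the key is absent.

#include <sys/avl.h>
#include <sys/kmem.h>

typedef struct key_node {
	avl_node_t	kn_link;
	uintptr_t	kn_key;
} key_node_t;

static int
key_cmp(const void *l, const void *r)
{
	uintptr_t a = ((const key_node_t *)l)->kn_key;
	uintptr_t b = ((const key_node_t *)r)->kn_key;

	if (a == b)
		return (0);
	return (a < b ? -1 : 1);
}

/* Returns 1 if key was newly inserted, 0 if it was already present. */
static int
insert_if_absent(avl_tree_t *tree, uintptr_t key)
{
	key_node_t search, *kn;
	avl_index_t where;

	search.kn_key = key;
	if (avl_find(tree, &search, &where) != NULL)
		return (0);

	kn = kmem_alloc(sizeof (*kn), KM_SLEEP);
	kn->kn_key = key;
	avl_insert(tree, kn, where);
	return (1);
}
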
@@ -918,6 +952,8 @@ vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp,
next = AVL_NEXT(tree, next);
continue;
}
+
+ ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN);
bound_type = next->vmb_type;
index = next->vmb_start;
while (index <= next->vmb_end) {
@@ -937,7 +973,10 @@ vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp,
if (ap != NULL && vn != NULL && vn->v_pages != NULL &&
(page = page_exists(vn, off)) != NULL) {
- page_type = VMUSAGE_BOUND_INCORE;
+ if (PP_ISFREE(page))
+ page_type = VMUSAGE_BOUND_NOT_INCORE;
+ else
+ page_type = VMUSAGE_BOUND_INCORE;
if (page->p_szc > 0) {
pgcnt = page_get_pagecnt(page->p_szc);
pgshft = page_get_shift(page->p_szc);
@@ -947,8 +986,10 @@ vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp,
} else {
page_type = VMUSAGE_BOUND_NOT_INCORE;
}
+
if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
next->vmb_type = page_type;
+ bound_type = page_type;
} else if (next->vmb_type != page_type) {
/*
* If current bound type does not match page
@@ -1009,6 +1050,7 @@ vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode,
continue;
}
+ ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN);
bound_type = next->vmb_type;
index = next->vmb_start;
while (index <= next->vmb_end) {
@@ -1024,7 +1066,10 @@ vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode,
if (vnode->v_pages != NULL &&
(page = page_exists(vnode, ptob(index))) != NULL) {
- page_type = VMUSAGE_BOUND_INCORE;
+ if (PP_ISFREE(page))
+ page_type = VMUSAGE_BOUND_NOT_INCORE;
+ else
+ page_type = VMUSAGE_BOUND_INCORE;
if (page->p_szc > 0) {
pgcnt = page_get_pagecnt(page->p_szc);
pgshft = page_get_shift(page->p_szc);
@@ -1034,8 +1079,10 @@ vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode,
} else {
page_type = VMUSAGE_BOUND_NOT_INCORE;
}
+
if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
next->vmb_type = page_type;
+ bound_type = page_type;
} else if (next->vmb_type != page_type) {
/*
* If current bound type does not match page
@@ -1304,6 +1351,12 @@ vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg)
}
/*
+ * Pages on the free list aren't counted for the rss.
+ */
+ if (PP_ISFREE(page))
+ continue;
+
+ /*
* Assume anon structs with a refcnt
* of 1 are not COW shared, so there
* is no reason to track them per entity.
@@ -1320,8 +1373,7 @@ vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg)
* Track COW anons per entity so
* they are not double counted.
*/
- if (vmu_find_insert_anon(entity->vme_anon_hash,
- (caddr_t)ap) == 0)
+ if (vmu_find_insert_anon(entity, ap) == 0)
continue;
result->vmu_rss_all += (pgcnt << PAGESHIFT);
@@ -1461,8 +1513,9 @@ vmu_calculate_proc(proc_t *p)
entities = tmp;
}
if (vmu_data.vmu_calc_flags &
- (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS |
- VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
+ (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE |
+ VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
+ VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS |
VMUSAGE_ALL_EUSERS)) {
ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
@@ -1595,8 +1648,7 @@ vmu_free_extra()
mod_hash_destroy_hash(te->vme_vnode_hash);
if (te->vme_amp_hash != NULL)
mod_hash_destroy_hash(te->vme_amp_hash);
- if (te->vme_anon_hash != NULL)
- mod_hash_destroy_hash(te->vme_anon_hash);
+ VERIFY(avl_first(&te->vme_anon) == NULL);
kmem_free(te, sizeof (vmu_entity_t));
}
while (vmu_data.vmu_free_zones != NULL) {
@@ -1617,13 +1669,42 @@ vmu_free_extra()
extern kcondvar_t *pr_pid_cv;
+static void
+vmu_get_zone_rss(zoneid_t zid)
+{
+ vmu_zone_t *zone;
+ zone_t *zp;
+ int ret;
+ uint_t pgcnt;
+
+ if ((zp = zone_find_by_id(zid)) == NULL)
+ return;
+
+ ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
+ (mod_hash_key_t)(uintptr_t)zid, (mod_hash_val_t *)&zone);
+ if (ret != 0) {
+ zone = vmu_alloc_zone(zid);
+ ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash,
+ (mod_hash_key_t)(uintptr_t)zid,
+ (mod_hash_val_t)zone, (mod_hash_hndl_t)0);
+ ASSERT(ret == 0);
+ }
+
+ ASSERT(zid >= 0 && zid <= MAX_ZONEID);
+ pgcnt = zone_pdata[zid].zpers_pg_cnt;
+ zone->vmz_zone->vme_result.vmu_rss_all = (size_t)ptob(pgcnt);
+ zone->vmz_zone->vme_result.vmu_swap_all = zp->zone_max_swap;
+
+ zone_rele(zp);
+}
+
/*
* Determine which entity types are relevant and allocate the hashes to
- * track them. Then walk the process table and count rss and swap
- * for each process'es address space. Address space object such as
- * vnodes, amps and anons are tracked per entity, so that they are
- * not double counted in the results.
- *
+ * track them. First get the zone rss using the data we already have. Then,
+ * if necessary, walk the process table and count rss and swap for each
+ * process'es address space. Address space object such as vnodes, amps and
+ * anons are tracked per entity, so that they are not double counted in the
+ * results.
*/
static void
vmu_calculate()
@@ -1631,6 +1712,7 @@ vmu_calculate()
int i = 0;
int ret;
proc_t *p;
+ uint_t zone_flags = 0;
vmu_clear_calc();
@@ -1638,9 +1720,34 @@ vmu_calculate()
vmu_data.vmu_system = vmu_alloc_entity(0, VMUSAGE_SYSTEM,
ALL_ZONES);
+ zone_flags = vmu_data.vmu_calc_flags & VMUSAGE_ZONE_FLAGS;
+ if (zone_flags != 0) {
+ /*
+ * Use the accurate zone RSS data we already keep track of.
+ */
+ int i;
+
+ for (i = 0; i <= MAX_ZONEID; i++) {
+ if (zone_pdata[i].zpers_pg_cnt > 0) {
+ vmu_get_zone_rss(i);
+ }
+ }
+ }
+
+ /* If we only needed zone data, we're done. */
+ if ((vmu_data.vmu_calc_flags & ~VMUSAGE_ZONE_FLAGS) == 0) {
+ return;
+ }
+
+ DTRACE_PROBE(vmu__calculate__all);
+ vmu_data.vmu_calc_flags &= ~VMUSAGE_ZONE_FLAGS;
+
/*
* Walk process table and calculate rss of each proc.
*
+ * Since we already obtained all zone rss above, the following loop
+ * executes with the VMUSAGE_ZONE_FLAGS cleared.
+ *
* Pidlock and p_lock cannot be held while doing the rss calculation.
* This is because:
* 1. The calculation allocates using KM_SLEEP.
@@ -1695,6 +1802,12 @@ again:
mutex_exit(&pidlock);
vmu_free_extra();
+
+ /*
+ * Restore any caller-supplied zone flags we blocked during
+ * the process-table walk.
+ */
+ vmu_data.vmu_calc_flags |= zone_flags;
}
/*
@@ -1745,7 +1858,7 @@ vmu_cache_rele(vmu_cache_t *cache)
*/
static int
vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
- uint_t flags, int cpflg)
+ uint_t flags, id_t req_zone_id, int cpflg)
{
vmusage_t *result, *out_result;
vmusage_t dummy;
@@ -1764,7 +1877,7 @@ vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
/* figure out what results the caller is interested in. */
if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone)
types |= VMUSAGE_SYSTEM;
- if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES))
+ if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE))
types |= VMUSAGE_ZONE;
if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
VMUSAGE_COL_PROJECTS))
@@ -1827,26 +1940,33 @@ vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
continue;
}
- /* Skip "other zone" results if not requested */
- if (result->vmu_zoneid != curproc->p_zone->zone_id) {
- if (result->vmu_type == VMUSAGE_ZONE &&
- (flags & VMUSAGE_ALL_ZONES) == 0)
- continue;
- if (result->vmu_type == VMUSAGE_PROJECTS &&
- (flags & (VMUSAGE_ALL_PROJECTS |
- VMUSAGE_COL_PROJECTS)) == 0)
- continue;
- if (result->vmu_type == VMUSAGE_TASKS &&
- (flags & VMUSAGE_ALL_TASKS) == 0)
- continue;
- if (result->vmu_type == VMUSAGE_RUSERS &&
- (flags & (VMUSAGE_ALL_RUSERS |
- VMUSAGE_COL_RUSERS)) == 0)
- continue;
- if (result->vmu_type == VMUSAGE_EUSERS &&
- (flags & (VMUSAGE_ALL_EUSERS |
- VMUSAGE_COL_EUSERS)) == 0)
+ if (result->vmu_type == VMUSAGE_ZONE &&
+ flags & VMUSAGE_A_ZONE) {
+ /* Skip non-requested zone results */
+ if (result->vmu_zoneid != req_zone_id)
continue;
+ } else {
+ /* Skip "other zone" results if not requested */
+ if (result->vmu_zoneid != curproc->p_zone->zone_id) {
+ if (result->vmu_type == VMUSAGE_ZONE &&
+ (flags & VMUSAGE_ALL_ZONES) == 0)
+ continue;
+ if (result->vmu_type == VMUSAGE_PROJECTS &&
+ (flags & (VMUSAGE_ALL_PROJECTS |
+ VMUSAGE_COL_PROJECTS)) == 0)
+ continue;
+ if (result->vmu_type == VMUSAGE_TASKS &&
+ (flags & VMUSAGE_ALL_TASKS) == 0)
+ continue;
+ if (result->vmu_type == VMUSAGE_RUSERS &&
+ (flags & (VMUSAGE_ALL_RUSERS |
+ VMUSAGE_COL_RUSERS)) == 0)
+ continue;
+ if (result->vmu_type == VMUSAGE_EUSERS &&
+ (flags & (VMUSAGE_ALL_EUSERS |
+ VMUSAGE_COL_EUSERS)) == 0)
+ continue;
+ }
}
count++;
if (out_result != NULL) {
@@ -1902,10 +2022,12 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
int cacherecent = 0;
hrtime_t now;
uint_t flags_orig;
+ id_t req_zone_id;
/*
* Non-global zones cannot request system wide and/or collated
- * results, or the system result, so munge the flags accordingly.
+ * results, or the system result, or usage of another zone, so munge
+ * the flags accordingly.
*/
flags_orig = flags;
if (curproc->p_zone != global_zone) {
@@ -1925,6 +2047,10 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
flags &= ~VMUSAGE_SYSTEM;
flags |= VMUSAGE_ZONE;
}
+ if (flags & VMUSAGE_A_ZONE) {
+ flags &= ~VMUSAGE_A_ZONE;
+ flags |= VMUSAGE_ZONE;
+ }
}
/* Check for unknown flags */
@@ -1935,6 +2061,21 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
if ((flags & VMUSAGE_MASK) == 0)
return (set_errno(EINVAL));
+ /* If requesting results for a specific zone, get the zone ID */
+ if (flags & VMUSAGE_A_ZONE) {
+ size_t bufsize;
+ vmusage_t zreq;
+
+ if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
+ return (set_errno(EFAULT));
+ /* Requested zone ID is passed in buf, so 0 len not allowed */
+ if (bufsize == 0)
+ return (set_errno(EINVAL));
+ if (ddi_copyin((caddr_t)buf, &zreq, sizeof (vmusage_t), cpflg))
+ return (set_errno(EFAULT));
+ req_zone_id = zreq.vmu_id;
+ }
+
mutex_enter(&vmu_data.vmu_lock);
now = gethrtime();
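
A hedged userland sketch of the new interface: a global-zone caller asking for a single zone's usage with VMUSAGE_A_ZONE (the flag is defined elsewhere in this changeset). Per the copyin logic above, the requested zone ID is passed in buf[0].vmu_id and *nres must be non-zero on entry; everything else follows the existing getvmusage(2) contract.

#include <sys/types.h>
#include <sys/vm_usage.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical caller: print RSS and swap for one zone, results <= 10s old. */
static int
print_zone_usage(id_t zoneid)
{
	vmusage_t buf[4];
	size_t nres = sizeof (buf) / sizeof (buf[0]);

	(void) memset(buf, 0, sizeof (buf));
	buf[0].vmu_id = zoneid;		/* requested zone rides in buf[0] */

	if (getvmusage(VMUSAGE_A_ZONE, 10, buf, &nres) != 0) {
		perror("getvmusage");
		return (-1);
	}

	if (nres >= 1) {
		(void) printf("zone %d: rss=%llu swap=%llu\n", (int)zoneid,
		    (u_longlong_t)buf[0].vmu_rss_all,
		    (u_longlong_t)buf[0].vmu_swap_all);
	}
	return (0);
}
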
@@ -1954,7 +2095,7 @@ start:
mutex_exit(&vmu_data.vmu_lock);
ret = vmu_copyout_results(cache, buf, nres, flags_orig,
- cpflg);
+ req_zone_id, cpflg);
mutex_enter(&vmu_data.vmu_lock);
vmu_cache_rele(cache);
if (vmu_data.vmu_pending_waiters > 0)
@@ -2011,7 +2152,8 @@ start:
mutex_exit(&vmu_data.vmu_lock);
/* copy cache */
- ret = vmu_copyout_results(cache, buf, nres, flags_orig, cpflg);
+ ret = vmu_copyout_results(cache, buf, nres, flags_orig,
+ req_zone_id, cpflg);
mutex_enter(&vmu_data.vmu_lock);
vmu_cache_rele(cache);
mutex_exit(&vmu_data.vmu_lock);