path: root/usr/src/uts/common/vm
Diffstat (limited to 'usr/src/uts/common/vm')
-rw-r--r--  usr/src/uts/common/vm/hat.h        10
-rw-r--r--  usr/src/uts/common/vm/seg_kmem.c    3
-rw-r--r--  usr/src/uts/common/vm/seg_kmem.h    3
-rw-r--r--  usr/src/uts/common/vm/seg_umap.c  466
-rw-r--r--  usr/src/uts/common/vm/seg_umap.h   43
-rw-r--r--  usr/src/uts/common/vm/seg_vn.c     11
-rw-r--r--  usr/src/uts/common/vm/vm_anon.c    11
-rw-r--r--  usr/src/uts/common/vm/vm_as.c      20
-rw-r--r--  usr/src/uts/common/vm/vm_pvn.c     28
-rw-r--r--  usr/src/uts/common/vm/vm_swap.c    27
-rw-r--r--  usr/src/uts/common/vm/vm_usage.c  387
11 files changed, 942 insertions, 67 deletions
diff --git a/usr/src/uts/common/vm/hat.h b/usr/src/uts/common/vm/hat.h
index 1d91475e38..c908a9e16c 100644
--- a/usr/src/uts/common/vm/hat.h
+++ b/usr/src/uts/common/vm/hat.h
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2014 Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -269,7 +270,12 @@ void hat_kpm_walk(void (*)(void *, void *, size_t), void *);
* call.
*
* int hat_pageunload(pp, forceflag)
- * unload all translations attached to pp.
+ * Unload all translations attached to pp. On x86 the bulk of the work is
+ * done by hat_page_inval.
+ *
+ * void hat_page_inval(pp, pgsz, curhat)
+ * Unload translations attached to pp. If curhat is provided, only the
+ * translation for that process is unloaded, otherwise all are unloaded.
*
* uint_t hat_pagesync(pp, flags)
* get hw stats from hardware into page struct and reset hw stats
@@ -291,6 +297,7 @@ void hat_page_setattr(struct page *, uint_t);
void hat_page_clrattr(struct page *, uint_t);
uint_t hat_page_getattr(struct page *, uint_t);
int hat_pageunload(struct page *, uint_t);
+void hat_page_inval(struct page *, uint_t, struct hat *);
uint_t hat_pagesync(struct page *, uint_t);
ulong_t hat_page_getshare(struct page *);
int hat_page_checkshare(struct page *, ulong_t);
@@ -460,6 +467,7 @@ void hat_setstat(struct as *, caddr_t, size_t, uint_t);
*/
#define HAT_ADV_PGUNLOAD 0x00
#define HAT_FORCE_PGUNLOAD 0x01
+#define HAT_CURPROC_PGUNLOAD 0x02
/*
* Attributes for hat_page_*attr, hat_setstats and
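
The header above now documents hat_page_inval() and the HAT_CURPROC_PGUNLOAD flag, which restrict unloading to the current process's translation. Below is a minimal sketch of how a caller might choose between a per-process and a full unload; the helper function is hypothetical, and only hat_pageunload() and the flag names come from hat.h:

/*
 * Hypothetical helper illustrating the two unload modes declared above.
 * The caller is assumed to hold at least a shared lock on pp.
 */
static void
unload_page_translations(page_t *pp, boolean_t current_proc_only)
{
	if (current_proc_only) {
		/* Drop only the calling process's mapping of pp. */
		(void) hat_pageunload(pp, HAT_CURPROC_PGUNLOAD);
	} else {
		/* Drop every mapping of pp, regardless of owner. */
		(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
	}
}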
diff --git a/usr/src/uts/common/vm/seg_kmem.c b/usr/src/uts/common/vm/seg_kmem.c
index 90e1b73b70..439c859d96 100644
--- a/usr/src/uts/common/vm/seg_kmem.c
+++ b/usr/src/uts/common/vm/seg_kmem.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
*/
#include <sys/types.h>
@@ -773,7 +774,7 @@ segkmem_capable(struct seg *seg, segcapability_t capability)
return (0);
}
-static struct seg_ops segkmem_ops = {
+struct seg_ops segkmem_ops = {
SEGKMEM_BADOP(int), /* dup */
SEGKMEM_BADOP(int), /* unmap */
SEGKMEM_BADOP(void), /* free */
diff --git a/usr/src/uts/common/vm/seg_kmem.h b/usr/src/uts/common/vm/seg_kmem.h
index 2a4ed3b2aa..3ad4202e91 100644
--- a/usr/src/uts/common/vm/seg_kmem.h
+++ b/usr/src/uts/common/vm/seg_kmem.h
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
*/
#ifndef _VM_SEG_KMEM_H
@@ -136,6 +137,8 @@ extern size_t segkmem_kmemlp_max;
#define IS_KMEM_VA_LARGEPAGE(vaddr) \
(((vaddr) >= heap_lp_base) && ((vaddr) < heap_lp_end))
+extern struct seg_ops segkmem_ops;
+
#endif /* _KERNEL */
#ifdef __cplusplus
diff --git a/usr/src/uts/common/vm/seg_umap.c b/usr/src/uts/common/vm/seg_umap.c
new file mode 100644
index 0000000000..ccad71c5d6
--- /dev/null
+++ b/usr/src/uts/common/vm/seg_umap.c
@@ -0,0 +1,466 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+/*
+ * VM - Kernel-to-user mapping segment
+ *
+ * The umap segment driver was primarily designed to facilitate the comm page:
+ * a portion of kernel memory shared with userspace so that certain (namely
+ * clock-related) actions could operate without making an expensive trip into
+ * the kernel.
+ *
+ * Since the initial requirements for the comm page are slim, advanced features
+ * of the segment driver such as per-page protection have been left
+ * unimplemented at this time.
+ */
+
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/cred.h>
+#include <sys/kmem.h>
+#include <sys/lgrp.h>
+#include <sys/mman.h>
+
+#include <vm/hat.h>
+#include <vm/as.h>
+#include <vm/seg.h>
+#include <vm/seg_kmem.h>
+#include <vm/seg_umap.h>
+
+
+static boolean_t segumap_verify_safe(caddr_t, size_t);
+static int segumap_dup(struct seg *, struct seg *);
+static int segumap_unmap(struct seg *, caddr_t, size_t);
+static void segumap_free(struct seg *);
+static faultcode_t segumap_fault(struct hat *, struct seg *, caddr_t, size_t,
+ enum fault_type, enum seg_rw);
+static faultcode_t segumap_faulta(struct seg *, caddr_t);
+static int segumap_setprot(struct seg *, caddr_t, size_t, uint_t);
+static int segumap_checkprot(struct seg *, caddr_t, size_t, uint_t);
+static int segumap_sync(struct seg *, caddr_t, size_t, int, uint_t);
+static size_t segumap_incore(struct seg *, caddr_t, size_t, char *);
+static int segumap_lockop(struct seg *, caddr_t, size_t, int, int, ulong_t *,
+ size_t);
+static int segumap_getprot(struct seg *, caddr_t, size_t, uint_t *);
+static u_offset_t segumap_getoffset(struct seg *, caddr_t);
+static int segumap_gettype(struct seg *, caddr_t);
+static int segumap_getvp(struct seg *, caddr_t, struct vnode **);
+static int segumap_advise(struct seg *, caddr_t, size_t, uint_t);
+static void segumap_dump(struct seg *);
+static int segumap_pagelock(struct seg *, caddr_t, size_t, struct page ***,
+ enum lock_type, enum seg_rw);
+static int segumap_setpagesize(struct seg *, caddr_t, size_t, uint_t);
+static int segumap_getmemid(struct seg *, caddr_t, memid_t *);
+static int segumap_capable(struct seg *, segcapability_t);
+
+static struct seg_ops segumap_ops = {
+ segumap_dup,
+ segumap_unmap,
+ segumap_free,
+ segumap_fault,
+ segumap_faulta,
+ segumap_setprot,
+ segumap_checkprot,
+ NULL, /* kluster: disabled */
+ NULL, /* swapout: disabled */
+ segumap_sync,
+ segumap_incore,
+ segumap_lockop,
+ segumap_getprot,
+ segumap_getoffset,
+ segumap_gettype,
+ segumap_getvp,
+ segumap_advise,
+ segumap_dump,
+ segumap_pagelock,
+ segumap_setpagesize,
+ segumap_getmemid,
+ NULL, /* getpolicy: disabled */
+ segumap_capable,
+ seg_inherit_notsup
+};
+
+
+/*
+ * Create a kernel/user-mapped segment.
+ */
+int
+segumap_create(struct seg *seg, void *argsp)
+{
+ segumap_crargs_t *a = (struct segumap_crargs *)argsp;
+ segumap_data_t *data;
+
+ ASSERT((uintptr_t)a->kaddr > _userlimit);
+
+ /*
+ * Check several aspects of the mapping request to ensure validity:
+ * - kernel pages must reside entirely in kernel space
+ * - target protection must be user-accessible
+ * - kernel address must be page-aligned
+ * - kernel address must reside inside a "safe" segment
+ */
+ if ((uintptr_t)a->kaddr <= _userlimit ||
+ ((uintptr_t)a->kaddr + seg->s_size) < (uintptr_t)a->kaddr ||
+ (a->prot & PROT_USER) == 0 ||
+ ((uintptr_t)a->kaddr & PAGEOFFSET) != 0 ||
+ !segumap_verify_safe(a->kaddr, seg->s_size)) {
+ return (EINVAL);
+ }
+
+ data = kmem_zalloc(sizeof (*data), KM_SLEEP);
+ rw_init(&data->sud_lock, NULL, RW_DEFAULT, NULL);
+ data->sud_kaddr = a->kaddr;
+ data->sud_prot = a->prot;
+ data->sud_loaded = B_FALSE;
+
+ seg->s_ops = &segumap_ops;
+ seg->s_data = data;
+ return (0);
+}
+
+static boolean_t
+segumap_verify_safe(caddr_t kaddr, size_t len)
+{
+ struct seg *seg;
+
+ /*
+ * Presently, only pages which are backed by segkmem are allowed to be
+ * shared with userspace. This prevents nasty paging behavior with
+ * other drivers such as seg_kp. Furthermore, the backing kernel
+ * segment must completely contain the region to be mapped.
+ *
+ * Failing these checks is fatal for now since such mappings are done
+ * in a very limited context from the kernel.
+ */
+ AS_LOCK_ENTER(&kas, RW_READER);
+ seg = as_segat(&kas, kaddr);
+ VERIFY(seg != NULL);
+ VERIFY(seg->s_base + seg->s_size >= kaddr + len);
+ VERIFY(seg->s_ops == &segkmem_ops);
+ AS_LOCK_EXIT(&kas);
+
+ return (B_TRUE);
+}
+
+static int
+segumap_dup(struct seg *seg, struct seg *newseg)
+{
+ segumap_data_t *sud = (segumap_data_t *)seg->s_data;
+ segumap_data_t *newsud;
+
+ ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
+
+ newsud = kmem_zalloc(sizeof (segumap_data_t), KM_SLEEP);
+ rw_init(&newsud->sud_lock, NULL, RW_DEFAULT, NULL);
+ newsud->sud_kaddr = sud->sud_kaddr;
+ newsud->sud_prot = sud->sud_prot;
+ newsud->sud_loaded = B_FALSE;
+
+ newseg->s_ops = seg->s_ops;
+ newseg->s_data = newsud;
+ return (0);
+}
+
+static int
+segumap_unmap(struct seg *seg, caddr_t addr, size_t len)
+{
+ segumap_data_t *sud = (segumap_data_t *)seg->s_data;
+
+ ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
+
+ /* Only allow unmap of entire segment */
+ if (addr != seg->s_base || len != seg->s_size) {
+ return (EINVAL);
+ }
+ if (sud->sud_softlockcnt != 0) {
+ return (EAGAIN);
+ }
+
+ hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD_UNMAP);
+ /*
+ * While setting this field before immediately freeing the segment is
+ * not necessary, it is done for the sake of completeness. Doing so
+ * outside sud_lock is safe with the AS write-locked.
+ */
+ sud->sud_loaded = B_FALSE;
+
+ seg_free(seg);
+ return (0);
+}
+
+static void
+segumap_free(struct seg *seg)
+{
+ segumap_data_t *data = (segumap_data_t *)seg->s_data;
+
+ ASSERT(data != NULL);
+
+ rw_destroy(&data->sud_lock);
+ VERIFY(data->sud_loaded == B_FALSE);
+ VERIFY(data->sud_softlockcnt == 0);
+ kmem_free(data, sizeof (*data));
+ seg->s_data = NULL;
+}
+
+/* ARGSUSED */
+static faultcode_t
+segumap_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len,
+ enum fault_type type, enum seg_rw rw)
+{
+ segumap_data_t *sud = (segumap_data_t *)seg->s_data;
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
+
+ if (type == F_PROT) {
+ /*
+ * Since protection on the segment is fixed, there is nothing
+ * to do but report an error for protection faults.
+ */
+ return (FC_PROT);
+ } else if (type == F_SOFTUNLOCK) {
+ size_t plen = btop(len);
+
+ rw_enter(&sud->sud_lock, RW_WRITER);
+ VERIFY(sud->sud_softlockcnt >= plen);
+ sud->sud_softlockcnt -= plen;
+ rw_exit(&sud->sud_lock);
+ return (0);
+ }
+
+ ASSERT(type == F_INVAL || type == F_SOFTLOCK);
+ rw_enter(&sud->sud_lock, RW_WRITER);
+
+ if (type == F_INVAL && sud->sud_loaded) {
+ rw_exit(&sud->sud_lock);
+ return (FC_NOMAP);
+ }
+
+ /*
+ * Load the (entire) segment into the HAT if that has not already been done.
+ */
+ if (!sud->sud_loaded) {
+ for (uintptr_t i = 0; i < seg->s_size; i += PAGESIZE) {
+ pfn_t pfn;
+
+ pfn = hat_getpfnum(kas.a_hat, sud->sud_kaddr + i);
+ VERIFY(pfn != PFN_INVALID);
+ hat_devload(seg->s_as->a_hat, seg->s_base + i,
+ PAGESIZE, pfn, sud->sud_prot, HAT_LOAD);
+ }
+ sud->sud_loaded = B_TRUE;
+ } else {
+ /*
+ * If the segment has already been loaded, there is no
+ * reason to take an F_INVAL fault.
+ */
+ VERIFY(type != F_INVAL);
+ }
+
+ if (type == F_SOFTLOCK) {
+ size_t nval = sud->sud_softlockcnt + btop(len);
+
+ if (sud->sud_softlockcnt >= nval) {
+ rw_exit(&sud->sud_lock);
+ return (FC_MAKE_ERR(EOVERFLOW));
+ }
+ sud->sud_softlockcnt = nval;
+ }
+ rw_exit(&sud->sud_lock);
+ return (0);
+}
+
+/* ARGSUSED */
+static faultcode_t
+segumap_faulta(struct seg *seg, caddr_t addr)
+{
+ /* Do nothing since an async pagefault should not load translations. */
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+segumap_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
+{
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
+
+ /*
+ * The seg_umap driver does not yet allow protection to be changed.
+ */
+ return (EACCES);
+}
+
+/* ARGSUSED */
+static int
+segumap_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
+{
+ segumap_data_t *sud = (segumap_data_t *)seg->s_data;
+ int error = 0;
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
+
+ rw_enter(&sud->sud_lock, RW_READER);
+ if ((sud->sud_prot & prot) != prot) {
+ error = EACCES;
+ }
+ rw_exit(&sud->sud_lock);
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+segumap_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
+{
+ /* Always succeed since there is no backing store to sync. */
+ return (0);
+}
+
+/* ARGSUSED */
+static size_t
+segumap_incore(struct seg *seg, caddr_t addr, size_t len, char *vec)
+{
+ size_t sz = 0;
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
+
+ len = (len + PAGEOFFSET) & PAGEMASK;
+ while (len > 0) {
+ *vec = 1;
+ sz += PAGESIZE;
+ vec++;
+ len -= PAGESIZE;
+ }
+ return (sz);
+}
+
+/* ARGSUSED */
+static int
+segumap_lockop(struct seg *seg, caddr_t addr, size_t len, int attr, int op,
+ ulong_t *lockmap, size_t pos)
+{
+ /* Report success since kernel pages are always in memory. */
+ return (0);
+}
+
+static int
+segumap_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
+{
+ segumap_data_t *sud = (segumap_data_t *)seg->s_data;
+ size_t pgno;
+ uint_t prot;
+
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
+
+ rw_enter(&sud->sud_lock, RW_READER);
+ prot = sud->sud_prot;
+ rw_exit(&sud->sud_lock);
+
+ /*
+ * Reporting protection is simple since it is not tracked per-page.
+ */
+ pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1;
+ while (pgno > 0) {
+ protv[--pgno] = prot;
+ }
+ return (0);
+}
+
+/* ARGSUSED */
+static u_offset_t
+segumap_getoffset(struct seg *seg, caddr_t addr)
+{
+ /*
+ * To avoid leaking information about the layout of the kernel address
+ * space, always report '0' as the offset.
+ */
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+segumap_gettype(struct seg *seg, caddr_t addr)
+{
+ /*
+ * Since already-existing kernel pages are being mapped into userspace,
+ * always report the segment type as shared.
+ */
+ return (MAP_SHARED);
+}
+
+/* ARGSUSED */
+static int
+segumap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
+{
+ ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
+
+ *vpp = NULL;
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+segumap_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
+{
+ if (behav == MADV_PURGE) {
+ /* Purge does not make sense for this mapping */
+ return (EINVAL);
+ }
+ /* Indicate success for everything else. */
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+segumap_dump(struct seg *seg)
+{
+ /*
+ * Since this is a mapping to share kernel data with userspace, nothing
+ * additional should be dumped.
+ */
+}
+
+/* ARGSUSED */
+static int
+segumap_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp,
+ enum lock_type type, enum seg_rw rw)
+{
+ return (ENOTSUP);
+}
+
+/* ARGSUSED */
+static int
+segumap_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
+{
+ return (ENOTSUP);
+}
+
+static int
+segumap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
+{
+ segumap_data_t *sud = (segumap_data_t *)seg->s_data;
+
+ memidp->val[0] = (uintptr_t)sud->sud_kaddr;
+ memidp->val[1] = (uintptr_t)(addr - seg->s_base);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+segumap_capable(struct seg *seg, segcapability_t capability)
+{
+ /* no special capabilities */
+ return (0);
+}
diff --git a/usr/src/uts/common/vm/seg_umap.h b/usr/src/uts/common/vm/seg_umap.h
new file mode 100644
index 0000000000..bcf7447509
--- /dev/null
+++ b/usr/src/uts/common/vm/seg_umap.h
@@ -0,0 +1,43 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#ifndef _VM_SEG_UMAP_H
+#define _VM_SEG_UMAP_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct segumap_crargs {
+ caddr_t kaddr;
+ uchar_t prot; /* protection */
+ uchar_t maxprot; /* maximum protection */
+} segumap_crargs_t;
+
+typedef struct segumap_data {
+ krwlock_t sud_lock;
+ caddr_t sud_kaddr;
+ uchar_t sud_prot;
+ size_t sud_softlockcnt;
+ boolean_t sud_loaded;
+} segumap_data_t;
+
+extern int segumap_create(struct seg *, void *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _VM_SEG_UMAP_H */
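
A kernel consumer (such as the comm page mapping code this driver was written for) creates one of these segments by filling in a segumap_crargs_t and handing segumap_create() to as_map(). The sketch below is illustrative only; it assumes kaddr is page-aligned, segkmem-backed kernel memory, and the helper name is hypothetical:

/*
 * Illustrative sketch: expose 'len' bytes of kernel memory at 'kaddr'
 * read-only to userspace at address 'uaddr' in address space 'as'.
 */
static int
map_kernel_pages(struct as *as, caddr_t uaddr, caddr_t kaddr, size_t len)
{
	segumap_crargs_t args;

	args.kaddr = kaddr;
	args.prot = PROT_READ | PROT_USER;
	args.maxprot = PROT_READ | PROT_USER;

	/* as_map() allocates the segment and invokes segumap_create(). */
	return (as_map(as, uaddr, len, segumap_create, &args));
}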
diff --git a/usr/src/uts/common/vm/seg_vn.c b/usr/src/uts/common/vm/seg_vn.c
index 875dec7fe9..f143c1e464 100644
--- a/usr/src/uts/common/vm/seg_vn.c
+++ b/usr/src/uts/common/vm/seg_vn.c
@@ -7308,7 +7308,8 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
vpp = svd->vpage;
offset = svd->offset + (uintptr_t)(addr - seg->s_base);
bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) |
- ((flags & MS_INVALIDATE) ? B_INVAL : 0);
+ ((flags & MS_INVALIDATE) ? B_INVAL : 0) |
+ ((flags & MS_INVALCURPROC) ? (B_INVALCURONLY | B_INVAL) : 0);
if (attr) {
pageprot = attr & ~(SHARED|PRIVATE);
@@ -7333,11 +7334,11 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
vpp = &svd->vpage[seg_page(seg, addr)];
} else if (svd->vp && svd->amp == NULL &&
- (flags & MS_INVALIDATE) == 0) {
+ (flags & (MS_INVALIDATE | MS_INVALCURPROC)) == 0) {
/*
- * No attributes, no anonymous pages and MS_INVALIDATE flag
- * is not on, just use one big request.
+ * No attributes, no anonymous pages and MS_INVAL* flags
+ * are not on, just use one big request.
*/
err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len,
bflags, svd->cred, NULL);
@@ -7389,7 +7390,7 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
* might race in and lock the page after we unlock and before
* we do the PUTPAGE, then PUTPAGE simply does nothing.
*/
- if (flags & MS_INVALIDATE) {
+ if (flags & (MS_INVALIDATE | MS_INVALCURPROC)) {
if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) {
if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
page_unlock(pp);
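
segvn_sync() now translates the new MS_INVALCURPROC flag into (B_INVALCURONLY | B_INVAL), so only the calling process's translations are torn down while other processes keep theirs. A hedged sketch of a kernel caller invalidating its own mappings over a segment range follows; the wrapper is hypothetical and assumes MS_INVALCURPROC may be combined with MS_ASYNC just as MS_INVALIDATE can:

/*
 * Illustrative sketch: write back and invalidate only the calling
 * process's mappings over [addr, addr + len) of segment 'seg'.
 */
static int
sync_inval_curproc(struct seg *seg, caddr_t addr, size_t len)
{
	/* attr == 0: apply to all pages regardless of protection. */
	return (SEGOP_SYNC(seg, addr, len, 0,
	    (uint_t)(MS_ASYNC | MS_INVALCURPROC)));
}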
diff --git a/usr/src/uts/common/vm/vm_anon.c b/usr/src/uts/common/vm/vm_anon.c
index 4fd32a3f4a..01db9b23d7 100644
--- a/usr/src/uts/common/vm/vm_anon.c
+++ b/usr/src/uts/common/vm/vm_anon.c
@@ -788,14 +788,21 @@ anon_resvmem(size_t size, boolean_t takemem, zone_t *zone, int tryhard)
pgcnt_t pswap_pages = 0;
proc_t *p = curproc;
- if (zone != NULL && takemem) {
+ if (zone != NULL) {
/* test zone.max-swap resource control */
mutex_enter(&p->p_lock);
if (rctl_incr_swap(p, zone, ptob(npages)) != 0) {
mutex_exit(&p->p_lock);
- atomic_add_64(&zone->zone_anon_alloc_fail, 1);
+
+ if (takemem)
+ atomic_add_64(&zone->zone_anon_alloc_fail, 1);
+
return (0);
}
+
+ if (!takemem)
+ rctl_decr_swap(zone, ptob(npages));
+
mutex_exit(&p->p_lock);
}
mutex_enter(&anoninfo_lock);
diff --git a/usr/src/uts/common/vm/vm_as.c b/usr/src/uts/common/vm/vm_as.c
index bb5a96eb0f..b0a5e7fb33 100644
--- a/usr/src/uts/common/vm/vm_as.c
+++ b/usr/src/uts/common/vm/vm_as.c
@@ -57,6 +57,7 @@
#include <sys/debug.h>
#include <sys/tnf_probe.h>
#include <sys/vtrace.h>
+#include <sys/ddi.h>
#include <vm/hat.h>
#include <vm/as.h>
@@ -848,8 +849,7 @@ as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
struct seg *segsav;
int as_lock_held;
klwp_t *lwp = ttolwp(curthread);
-
-
+ zone_t *zonep = curzone;
retry:
/*
@@ -885,6 +885,22 @@ retry:
if (as == &kas)
CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
CPU_STATS_EXIT_K();
+ if (zonep->zone_pg_flt_delay != 0) {
+ /*
+ * The zone in which this process is running is
+ * currently over its physical memory cap. Throttle
+ * page faults to help the user-land memory capper
+ * catch up. Note that drv_usectohz() rounds up.
+ */
+ atomic_add_64(&zonep->zone_pf_throttle, 1);
+ atomic_add_64(&zonep->zone_pf_throttle_usec,
+ zonep->zone_pg_flt_delay);
+ if (zonep->zone_pg_flt_delay < TICK_TO_USEC(1)) {
+ drv_usecwait(zonep->zone_pg_flt_delay);
+ } else {
+ delay(drv_usectohz(zonep->zone_pg_flt_delay));
+ }
+ }
break;
}
diff --git a/usr/src/uts/common/vm/vm_pvn.c b/usr/src/uts/common/vm/vm_pvn.c
index 1b8d12eb8d..a206320a30 100644
--- a/usr/src/uts/common/vm/vm_pvn.c
+++ b/usr/src/uts/common/vm/vm_pvn.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
*/
@@ -432,7 +433,14 @@ pvn_write_done(page_t *plist, int flags)
page_io_unlock(pp);
page_unlock(pp);
}
- } else if (flags & B_INVAL) {
+ } else if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) {
+ /*
+ * If B_INVALCURONLY is set, then we handle that case
+ * in the next conditional if hat_page_is_mapped()
+ * indicates that there are no additional mappings
+ * to the page.
+ */
+
/*
* XXX - Failed writes with B_INVAL set are
* not handled appropriately.
@@ -573,8 +581,9 @@ pvn_write_done(page_t *plist, int flags)
}
/*
- * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI,
- * B_TRUNC, B_FORCE}. B_DELWRI indicates that this page is part of a kluster
+ * Flags are composed of {B_ASYNC, B_INVAL, B_INVALCURONLY, B_FREE,
+ * B_DONTNEED, B_DELWRI, B_TRUNC, B_FORCE}.
+ * B_DELWRI indicates that this page is part of a kluster
* operation and is only to be considered if it doesn't involve any
* waiting here. B_TRUNC indicates that the file is being truncated
* and so no i/o needs to be done. B_FORCE indicates that the page
@@ -628,13 +637,17 @@ pvn_getdirty(page_t *pp, int flags)
* If we want to free or invalidate the page then
* we need to unload it so that anyone who wants
* it will have to take a minor fault to get it.
+ * If we are only invalidating the page for the
+ * current process, then pass in a different flag.
* Otherwise, we're just writing the page back so we
 * need to sync up the hardware and software mod bit to
* detect any future modifications. We clear the
* software mod bit when we put the page on the dirty
* list.
*/
- if (flags & (B_INVAL | B_FREE)) {
+ if (flags & B_INVALCURONLY) {
+ (void) hat_pageunload(pp, HAT_CURPROC_PGUNLOAD);
+ } else if (flags & (B_INVAL | B_FREE)) {
(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
} else {
(void) hat_pagesync(pp, HAT_SYNC_ZERORM);
@@ -646,7 +659,7 @@ pvn_getdirty(page_t *pp, int flags)
* list after all.
*/
page_io_unlock(pp);
- if (flags & B_INVAL) {
+ if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) {
/*LINTED: constant in conditional context*/
VN_DISPOSE(pp, B_INVAL, 0, kcred);
} else if (flags & B_FREE) {
@@ -658,6 +671,9 @@ pvn_getdirty(page_t *pp, int flags)
* of VOP_PUTPAGE() who prefer freeing the
* page _only_ if no one else is accessing it.
* E.g. segmap_release()
+ * We also take this path for B_INVALCURONLY and
+ * let page_release call VN_DISPOSE if no one else is
+ * using the page.
*
* The above hat_ismod() check is useless because:
* (1) we may not be holding SE_EXCL lock;
@@ -682,7 +698,7 @@ pvn_getdirty(page_t *pp, int flags)
* We'll detect the fact that they used it when the
* i/o is done and avoid freeing the page.
*/
- if (flags & B_FREE)
+ if (flags & (B_FREE | B_INVALCURONLY))
page_downgrade(pp);
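
Because MS_INVALCURPROC arrives at pvn_write_done() and pvn_getdirty() as (B_INVALCURONLY | B_INVAL), the code distinguishes a full invalidation from a current-process-only one by checking for B_INVAL alone. A small illustrative predicate (hypothetical helper) restating that check:

/*
 * Illustrative restatement of the flag test used above: a "full"
 * invalidation is requested only when B_INVAL is set without
 * B_INVALCURONLY, since the per-process case sets both bits.
 */
static boolean_t
is_full_invalidation(int flags)
{
	return ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL);
}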
diff --git a/usr/src/uts/common/vm/vm_swap.c b/usr/src/uts/common/vm/vm_swap.c
index 1a28c04357..2a008e114b 100644
--- a/usr/src/uts/common/vm/vm_swap.c
+++ b/usr/src/uts/common/vm/vm_swap.c
@@ -18,6 +18,11 @@
*
* CDDL HEADER END
*/
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
/*
* Copyright (c) 1987, 2010, Oracle and/or its affiliates. All rights reserved.
*/
@@ -625,7 +630,18 @@ swapctl(int sc_cmd, void *sc_arg, int *rv)
return (0);
}
beginning:
+ mutex_enter(&swapinfo_lock);
tmp_nswapfiles = nswapfiles;
+ mutex_exit(&swapinfo_lock);
+
+ /*
+ * Return early if there are no swap entries to report:
+ */
+ if (tmp_nswapfiles < 1) {
+ *rv = 0;
+ return (0);
+ }
+
/* Return an error if not enough space for the whole table. */
if (length < tmp_nswapfiles)
return (ENOMEM);
@@ -920,7 +936,18 @@ swapctl32(int sc_cmd, void *sc_arg, int *rv)
return (0);
}
beginning:
+ mutex_enter(&swapinfo_lock);
tmp_nswapfiles = nswapfiles;
+ mutex_exit(&swapinfo_lock);
+
+ /*
+ * Return early if there are no swap entries to report:
+ */
+ if (tmp_nswapfiles < 1) {
+ *rv = 0;
+ return (0);
+ }
+
/* Return an error if not enough space for the whole table. */
if (length < tmp_nswapfiles)
return (ENOMEM);
diff --git a/usr/src/uts/common/vm/vm_usage.c b/usr/src/uts/common/vm/vm_usage.c
index 57166b4e63..8b9fd0d7a3 100644
--- a/usr/src/uts/common/vm/vm_usage.c
+++ b/usr/src/uts/common/vm/vm_usage.c
@@ -25,6 +25,10 @@
*/
/*
+ * Copyright 2016, Joyent, Inc.
+ */
+
+/*
* vm_usage
*
* This file implements the getvmusage() private system call.
@@ -114,7 +118,7 @@
* For accurate counting of map-shared and COW-shared pages.
*
* - visited private anons (refcnt > 1) for each collective.
- * (entity->vme_anon_hash)
+ * (entity->vme_anon)
* For accurate counting of COW-shared pages.
*
* The common accounting structure is the vmu_entity_t, which represents
@@ -152,6 +156,7 @@
#include <sys/vm_usage.h>
#include <sys/zone.h>
#include <sys/sunddi.h>
+#include <sys/sysmacros.h>
#include <sys/avl.h>
#include <vm/anon.h>
#include <vm/as.h>
@@ -199,6 +204,14 @@ typedef struct vmu_object {
} vmu_object_t;
/*
+ * Node for tree of visited COW anons.
+ */
+typedef struct vmu_anon {
+ avl_node_t vma_node;
+ uintptr_t vma_addr;
+} vmu_anon_t;
+
+/*
* Entity by which to count results.
*
* The entity structure keeps the current rss/swap counts for each entity
@@ -221,7 +234,7 @@ typedef struct vmu_entity {
struct vmu_entity *vme_next_calc;
mod_hash_t *vme_vnode_hash; /* vnodes visited for entity */
mod_hash_t *vme_amp_hash; /* shared amps visited for entity */
- mod_hash_t *vme_anon_hash; /* COW anons visited for entity */
+ avl_tree_t vme_anon; /* COW anons visited for entity */
vmusage_t vme_result; /* identifies entity and results */
} vmu_entity_t;
@@ -324,6 +337,23 @@ bounds_cmp(const void *bnd1, const void *bnd2)
}
/*
+ * Comparison routine for our AVL tree of anon structures.
+ */
+static int
+vmu_anon_cmp(const void *lhs, const void *rhs)
+{
+ const vmu_anon_t *l = lhs, *r = rhs;
+
+ if (l->vma_addr == r->vma_addr)
+ return (0);
+
+ if (l->vma_addr < r->vma_addr)
+ return (-1);
+
+ return (1);
+}
+
+/*
* Save a bound on the free list.
*/
static void
@@ -363,13 +393,18 @@ static void
vmu_free_entity(mod_hash_val_t val)
{
vmu_entity_t *entity = (vmu_entity_t *)val;
+ vmu_anon_t *anon;
+ void *cookie = NULL;
if (entity->vme_vnode_hash != NULL)
i_mod_hash_clear_nosync(entity->vme_vnode_hash);
if (entity->vme_amp_hash != NULL)
i_mod_hash_clear_nosync(entity->vme_amp_hash);
- if (entity->vme_anon_hash != NULL)
- i_mod_hash_clear_nosync(entity->vme_anon_hash);
+
+ while ((anon = avl_destroy_nodes(&entity->vme_anon, &cookie)) != NULL)
+ kmem_free(anon, sizeof (vmu_anon_t));
+
+ avl_destroy(&entity->vme_anon);
entity->vme_next = vmu_data.vmu_free_entities;
vmu_data.vmu_free_entities = entity;
@@ -485,10 +520,10 @@ vmu_alloc_entity(id_t id, int type, id_t zoneid)
"vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
sizeof (struct anon_map));
- if (entity->vme_anon_hash == NULL)
- entity->vme_anon_hash = mod_hash_create_ptrhash(
- "vmusage anon hash", VMUSAGE_HASH_SIZE,
- mod_hash_null_valdtor, sizeof (struct anon));
+ VERIFY(avl_first(&entity->vme_anon) == NULL);
+
+ avl_create(&entity->vme_anon, vmu_anon_cmp, sizeof (struct vmu_anon),
+ offsetof(struct vmu_anon, vma_node));
entity->vme_next = vmu_data.vmu_entities;
vmu_data.vmu_entities = entity;
@@ -518,7 +553,8 @@ vmu_alloc_zone(id_t id)
zone->vmz_id = id;
- if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0)
+ if ((vmu_data.vmu_calc_flags &
+ (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE)) != 0)
zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id);
if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS |
@@ -613,21 +649,19 @@ vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type)
}
static int
-vmu_find_insert_anon(mod_hash_t *hash, caddr_t key)
+vmu_find_insert_anon(vmu_entity_t *entity, void *key)
{
- int ret;
- caddr_t val;
+ vmu_anon_t anon, *ap;
- ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
- (mod_hash_val_t *)&val);
+ anon.vma_addr = (uintptr_t)key;
- if (ret == 0)
+ if (avl_find(&entity->vme_anon, &anon, NULL) != NULL)
return (0);
- ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
- (mod_hash_val_t)key, (mod_hash_hndl_t)0);
+ ap = kmem_alloc(sizeof (vmu_anon_t), KM_SLEEP);
+ ap->vma_addr = (uintptr_t)key;
- ASSERT(ret == 0);
+ avl_add(&entity->vme_anon, ap);
return (1);
}
@@ -918,6 +952,8 @@ vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp,
next = AVL_NEXT(tree, next);
continue;
}
+
+ ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN);
bound_type = next->vmb_type;
index = next->vmb_start;
while (index <= next->vmb_end) {
@@ -937,7 +973,10 @@ vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp,
if (ap != NULL && vn != NULL && vn->v_pages != NULL &&
(page = page_exists(vn, off)) != NULL) {
- page_type = VMUSAGE_BOUND_INCORE;
+ if (PP_ISFREE(page))
+ page_type = VMUSAGE_BOUND_NOT_INCORE;
+ else
+ page_type = VMUSAGE_BOUND_INCORE;
if (page->p_szc > 0) {
pgcnt = page_get_pagecnt(page->p_szc);
pgshft = page_get_shift(page->p_szc);
@@ -947,8 +986,10 @@ vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp,
} else {
page_type = VMUSAGE_BOUND_NOT_INCORE;
}
+
if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
next->vmb_type = page_type;
+ bound_type = page_type;
} else if (next->vmb_type != page_type) {
/*
* If current bound type does not match page
@@ -1009,6 +1050,7 @@ vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode,
continue;
}
+ ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN);
bound_type = next->vmb_type;
index = next->vmb_start;
while (index <= next->vmb_end) {
@@ -1024,7 +1066,10 @@ vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode,
if (vnode->v_pages != NULL &&
(page = page_exists(vnode, ptob(index))) != NULL) {
- page_type = VMUSAGE_BOUND_INCORE;
+ if (PP_ISFREE(page))
+ page_type = VMUSAGE_BOUND_NOT_INCORE;
+ else
+ page_type = VMUSAGE_BOUND_INCORE;
if (page->p_szc > 0) {
pgcnt = page_get_pagecnt(page->p_szc);
pgshft = page_get_shift(page->p_szc);
@@ -1034,8 +1079,10 @@ vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode,
} else {
page_type = VMUSAGE_BOUND_NOT_INCORE;
}
+
if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
next->vmb_type = page_type;
+ bound_type = page_type;
} else if (next->vmb_type != page_type) {
/*
* If current bound type does not match page
@@ -1304,6 +1351,12 @@ vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg)
}
/*
+ * Pages on the free list aren't counted for the rss.
+ */
+ if (PP_ISFREE(page))
+ continue;
+
+ /*
* Assume anon structs with a refcnt
* of 1 are not COW shared, so there
* is no reason to track them per entity.
@@ -1320,8 +1373,7 @@ vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg)
* Track COW anons per entity so
* they are not double counted.
*/
- if (vmu_find_insert_anon(entity->vme_anon_hash,
- (caddr_t)ap) == 0)
+ if (vmu_find_insert_anon(entity, ap) == 0)
continue;
result->vmu_rss_all += (pgcnt << PAGESHIFT);
@@ -1461,8 +1513,9 @@ vmu_calculate_proc(proc_t *p)
entities = tmp;
}
if (vmu_data.vmu_calc_flags &
- (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS |
- VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
+ (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE |
+ VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
+ VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS |
VMUSAGE_ALL_EUSERS)) {
ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
@@ -1594,8 +1647,7 @@ vmu_free_extra()
mod_hash_destroy_hash(te->vme_vnode_hash);
if (te->vme_amp_hash != NULL)
mod_hash_destroy_hash(te->vme_amp_hash);
- if (te->vme_anon_hash != NULL)
- mod_hash_destroy_hash(te->vme_anon_hash);
+ VERIFY(avl_first(&te->vme_anon) == NULL);
kmem_free(te, sizeof (vmu_entity_t));
}
while (vmu_data.vmu_free_zones != NULL) {
@@ -1739,12 +1791,34 @@ vmu_cache_rele(vmu_cache_t *cache)
}
/*
+ * When new data is calculated, update the phys_mem rctl usage value in the
+ * zones.
+ */
+static void
+vmu_update_zone_rctls(vmu_cache_t *cache)
+{
+ vmusage_t *rp;
+ size_t i = 0;
+ zone_t *zp;
+
+ for (rp = cache->vmc_results; i < cache->vmc_nresults; rp++, i++) {
+ if (rp->vmu_type == VMUSAGE_ZONE &&
+ rp->vmu_zoneid != ALL_ZONES) {
+ if ((zp = zone_find_by_id(rp->vmu_zoneid)) != NULL) {
+ zp->zone_phys_mem = rp->vmu_rss_all;
+ zone_rele(zp);
+ }
+ }
+ }
+}
+
+/*
 * Copy out the cached results to a caller. Inspect the caller's flags
* and zone to determine which cached results should be copied.
*/
static int
vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
- uint_t flags, int cpflg)
+ uint_t flags, id_t req_zone_id, int cpflg)
{
vmusage_t *result, *out_result;
vmusage_t dummy;
@@ -1763,7 +1837,7 @@ vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
/* figure out what results the caller is interested in. */
if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone)
types |= VMUSAGE_SYSTEM;
- if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES))
+ if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE))
types |= VMUSAGE_ZONE;
if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
VMUSAGE_COL_PROJECTS))
@@ -1826,26 +1900,33 @@ vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
continue;
}
- /* Skip "other zone" results if not requested */
- if (result->vmu_zoneid != curproc->p_zone->zone_id) {
- if (result->vmu_type == VMUSAGE_ZONE &&
- (flags & VMUSAGE_ALL_ZONES) == 0)
- continue;
- if (result->vmu_type == VMUSAGE_PROJECTS &&
- (flags & (VMUSAGE_ALL_PROJECTS |
- VMUSAGE_COL_PROJECTS)) == 0)
- continue;
- if (result->vmu_type == VMUSAGE_TASKS &&
- (flags & VMUSAGE_ALL_TASKS) == 0)
- continue;
- if (result->vmu_type == VMUSAGE_RUSERS &&
- (flags & (VMUSAGE_ALL_RUSERS |
- VMUSAGE_COL_RUSERS)) == 0)
- continue;
- if (result->vmu_type == VMUSAGE_EUSERS &&
- (flags & (VMUSAGE_ALL_EUSERS |
- VMUSAGE_COL_EUSERS)) == 0)
+ if (result->vmu_type == VMUSAGE_ZONE &&
+ flags & VMUSAGE_A_ZONE) {
+ /* Skip non-requested zone results */
+ if (result->vmu_zoneid != req_zone_id)
continue;
+ } else {
+ /* Skip "other zone" results if not requested */
+ if (result->vmu_zoneid != curproc->p_zone->zone_id) {
+ if (result->vmu_type == VMUSAGE_ZONE &&
+ (flags & VMUSAGE_ALL_ZONES) == 0)
+ continue;
+ if (result->vmu_type == VMUSAGE_PROJECTS &&
+ (flags & (VMUSAGE_ALL_PROJECTS |
+ VMUSAGE_COL_PROJECTS)) == 0)
+ continue;
+ if (result->vmu_type == VMUSAGE_TASKS &&
+ (flags & VMUSAGE_ALL_TASKS) == 0)
+ continue;
+ if (result->vmu_type == VMUSAGE_RUSERS &&
+ (flags & (VMUSAGE_ALL_RUSERS |
+ VMUSAGE_COL_RUSERS)) == 0)
+ continue;
+ if (result->vmu_type == VMUSAGE_EUSERS &&
+ (flags & (VMUSAGE_ALL_EUSERS |
+ VMUSAGE_COL_EUSERS)) == 0)
+ continue;
+ }
}
count++;
if (out_result != NULL) {
@@ -1901,10 +1982,12 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
int cacherecent = 0;
hrtime_t now;
uint_t flags_orig;
+ id_t req_zone_id;
/*
* Non-global zones cannot request system wide and/or collated
- * results, or the system result, so munge the flags accordingly.
+ * results, or the system result, or usage of another zone, so munge
+ * the flags accordingly.
*/
flags_orig = flags;
if (curproc->p_zone != global_zone) {
@@ -1924,6 +2007,10 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
flags &= ~VMUSAGE_SYSTEM;
flags |= VMUSAGE_ZONE;
}
+ if (flags & VMUSAGE_A_ZONE) {
+ flags &= ~VMUSAGE_A_ZONE;
+ flags |= VMUSAGE_ZONE;
+ }
}
/* Check for unknown flags */
@@ -1934,6 +2021,21 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
if ((flags & VMUSAGE_MASK) == 0)
return (set_errno(EINVAL));
+ /* If requesting results for a specific zone, get the zone ID */
+ if (flags & VMUSAGE_A_ZONE) {
+ size_t bufsize;
+ vmusage_t zreq;
+
+ if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
+ return (set_errno(EFAULT));
+ /* Requested zone ID is passed in buf, so 0 len not allowed */
+ if (bufsize == 0)
+ return (set_errno(EINVAL));
+ if (ddi_copyin((caddr_t)buf, &zreq, sizeof (vmusage_t), cpflg))
+ return (set_errno(EFAULT));
+ req_zone_id = zreq.vmu_id;
+ }
+
mutex_enter(&vmu_data.vmu_lock);
now = gethrtime();
@@ -1953,7 +2055,7 @@ start:
mutex_exit(&vmu_data.vmu_lock);
ret = vmu_copyout_results(cache, buf, nres, flags_orig,
- cpflg);
+ req_zone_id, cpflg);
mutex_enter(&vmu_data.vmu_lock);
vmu_cache_rele(cache);
if (vmu_data.vmu_pending_waiters > 0)
@@ -2009,8 +2111,11 @@ start:
mutex_exit(&vmu_data.vmu_lock);
+ /* update zone's phys. mem. rctl usage */
+ vmu_update_zone_rctls(cache);
/* copy cache */
- ret = vmu_copyout_results(cache, buf, nres, flags_orig, cpflg);
+ ret = vmu_copyout_results(cache, buf, nres, flags_orig,
+ req_zone_id, cpflg);
mutex_enter(&vmu_data.vmu_lock);
vmu_cache_rele(cache);
mutex_exit(&vmu_data.vmu_lock);
@@ -2030,3 +2135,185 @@ start:
vmu_data.vmu_pending_waiters--;
goto start;
}
+
+#if defined(__x86)
+/*
+ * Attempt to invalidate all of the pages in the mapping for the given process.
+ */
+static void
+map_inval(proc_t *p, struct seg *seg, caddr_t addr, size_t size)
+{
+ page_t *pp;
+ size_t psize;
+ u_offset_t off;
+ caddr_t eaddr;
+ struct vnode *vp;
+ struct segvn_data *svd;
+ struct hat *victim_hat;
+
+ ASSERT((addr + size) <= (seg->s_base + seg->s_size));
+
+ victim_hat = p->p_as->a_hat;
+ svd = (struct segvn_data *)seg->s_data;
+ vp = svd->vp;
+ psize = page_get_pagesize(seg->s_szc);
+
+ off = svd->offset + (uintptr_t)(addr - seg->s_base);
+
+ for (eaddr = addr + size; addr < eaddr; addr += psize, off += psize) {
+ pp = page_lookup_nowait(vp, off, SE_SHARED);
+
+ if (pp != NULL) {
+ /* following logic based on pvn_getdirty() */
+
+ if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
+ page_unlock(pp);
+ continue;
+ }
+
+ page_io_lock(pp);
+ hat_page_inval(pp, 0, victim_hat);
+ page_io_unlock(pp);
+
+ /*
+ * For B_INVALCURONLY-style handling we let
+ * page_release call VN_DISPOSE if no one else is using
+ * the page.
+ *
+ * A hat_ismod() check would be useless because:
+ * (1) we are not holding the SE_EXCL lock
+ * (2) we've not unloaded _all_ translations
+ *
+ * Let page_release() do the heavy-lifting.
+ */
+ (void) page_release(pp, 1);
+ }
+ }
+}
+
+/*
+ * vm_map_inval()
+ *
+ * Invalidate as many pages as possible within the given mapping for the given
+ * process. addr is expected to be the base address of the mapping and size is
+ * the length of the mapping. In some cases a mapping will encompass an
+ * entire segment, but at least for anon or stack mappings, these will be
+ * regions within a single large segment. Thus, the invalidation is oriented
+ * around a single mapping and not an entire segment.
+ *
+ * SPARC sfmmu hat does not support HAT_CURPROC_PGUNLOAD-style handling so
+ * this code is only applicable to x86.
+ */
+int
+vm_map_inval(pid_t pid, caddr_t addr, size_t size)
+{
+ int ret;
+ int error = 0;
+ proc_t *p; /* target proc */
+ struct as *as; /* target proc's address space */
+ struct seg *seg; /* working segment */
+
+ if (curproc->p_zone != global_zone || crgetruid(curproc->p_cred) != 0)
+ return (set_errno(EPERM));
+
+ /* If not a valid mapping address, return an error */
+ if ((caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK) != addr)
+ return (set_errno(EINVAL));
+
+again:
+ mutex_enter(&pidlock);
+ p = prfind(pid);
+ if (p == NULL) {
+ mutex_exit(&pidlock);
+ return (set_errno(ESRCH));
+ }
+
+ mutex_enter(&p->p_lock);
+ mutex_exit(&pidlock);
+
+ if (panicstr != NULL) {
+ mutex_exit(&p->p_lock);
+ return (0);
+ }
+
+ as = p->p_as;
+
+ /*
+ * Try to set P_PR_LOCK - prevents process "changing shape"
+ * - blocks fork
+ * - blocks sigkill
+ * - cannot be a system proc
+ * - must be fully created proc
+ */
+ ret = sprtrylock_proc(p);
+ if (ret == -1) {
+ /* Process in invalid state */
+ mutex_exit(&p->p_lock);
+ return (set_errno(ESRCH));
+ }
+
+ if (ret == 1) {
+ /*
+ * P_PR_LOCK is already set. Wait and try again. This also
+ * drops p_lock so p may no longer be valid since the proc may
+ * have exited.
+ */
+ sprwaitlock_proc(p);
+ goto again;
+ }
+
+ /* P_PR_LOCK is now set */
+ mutex_exit(&p->p_lock);
+
+ AS_LOCK_ENTER(as, RW_READER);
+ if ((seg = as_segat(as, addr)) == NULL) {
+ AS_LOCK_EXIT(as);
+ mutex_enter(&p->p_lock);
+ sprunlock(p);
+ return (set_errno(ENOMEM));
+ }
+
+ /*
+ * The invalidation behavior only makes sense for vnode-backed segments.
+ */
+ if (seg->s_ops != &segvn_ops) {
+ AS_LOCK_EXIT(as);
+ mutex_enter(&p->p_lock);
+ sprunlock(p);
+ return (0);
+ }
+
+ /*
+ * If the mapping is out of bounds of the segment, return an error.
+ */
+ if ((addr + size) > (seg->s_base + seg->s_size)) {
+ AS_LOCK_EXIT(as);
+ mutex_enter(&p->p_lock);
+ sprunlock(p);
+ return (set_errno(EINVAL));
+ }
+
+ /*
+ * Don't use MS_INVALCURPROC flag here since that would eventually
+ * initiate hat invalidation based on curthread. Since we're doing this
+ * on behalf of a different process, that would erroneously invalidate
+ * our own process mappings.
+ */
+ error = SEGOP_SYNC(seg, addr, size, 0, (uint_t)MS_ASYNC);
+ if (error == 0) {
+ /*
+ * Since we didn't invalidate during the sync above, we now
+ * try to invalidate all of the pages in the mapping.
+ */
+ map_inval(p, seg, addr, size);
+ }
+ AS_LOCK_EXIT(as);
+
+ mutex_enter(&p->p_lock);
+ sprunlock(p);
+
+ if (error)
+ (void) set_errno(error);
+ return (error);
+}
+#endif
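
From userland, the new VMUSAGE_A_ZONE flag lets a global-zone caller ask getvmusage(2) for a single zone's usage; as the kernel code above shows, the requested zone ID is read from the first vmusage_t in the caller's buffer. A hedged sketch of such a caller (the helper is illustrative; VMUSAGE_A_ZONE is the extension added in this diff, not a stock illumos flag):

#include <sys/vm_usage.h>

/*
 * Illustrative sketch: fetch usage for one zone, accepting cached
 * results up to 30 seconds old. Returns the result count, or -1.
 */
static int
usage_for_zone(zoneid_t zid, vmusage_t *buf, size_t nent)
{
	size_t nres = nent;

	buf[0].vmu_id = zid;	/* requested zone ID is passed in buf */
	if (getvmusage(VMUSAGE_A_ZONE, 30, buf, &nres) != 0)
		return (-1);
	return ((int)nres);
}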