Diffstat (limited to 'usr/src/uts')
35 files changed, 2984 insertions, 159 deletions
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index 32a63d6c22..b2bbcbc8c3 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -334,6 +334,7 @@ GENUNIX_OBJS += \ vm_seg.o \ vm_subr.o \ vm_swap.o \ + vm_usage.o \ vnode.o \ vuid_queue.o \ vuid_store.o \ diff --git a/usr/src/uts/common/disp/priocntl.c b/usr/src/uts/common/disp/priocntl.c index 3bb90cf1fa..9197dc815b 100644 --- a/usr/src/uts/common/disp/priocntl.c +++ b/usr/src/uts/common/disp/priocntl.c @@ -136,6 +136,7 @@ priocntl_common(int pc_version, procset_t *psp, int cmd, caddr_t arg, struct pcmpargs pcmpargs; pc_vaparms_t vaparms; char clname[PC_CLNMSZ]; + char *outstr; int count; kthread_id_t retthreadp; proc_t *initpp; @@ -145,6 +146,7 @@ priocntl_common(int pc_version, procset_t *psp, int cmd, caddr_t arg, int rv = 0; pid_t saved_pid; id_t classid; + int size; int (*copyinfn)(const void *, void *, size_t); int (*copyoutfn)(const void *, void *, size_t); @@ -692,6 +694,21 @@ priocntl_common(int pc_version, procset_t *psp, int cmd, caddr_t arg, ASSERT(defaultcid > 0 && defaultcid < loaded_classes); break; + case PC_GETDFLCL: + mutex_enter(&class_lock); + + if (defaultcid >= loaded_classes) + outstr = ""; + else + outstr = sclass[defaultcid].cl_name; + size = strlen(outstr) + 1; + if (arg != NULL) + if ((*copyoutfn)(outstr, arg, size) != 0) + error = EFAULT; + + mutex_exit(&class_lock); + break; + default: error = EINVAL; break; diff --git a/usr/src/uts/common/fs/tmpfs/tmp_tnode.c b/usr/src/uts/common/fs/tmpfs/tmp_tnode.c index 5a7000c242..c5145cccf0 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_tnode.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_tnode.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -67,6 +66,7 @@ tmp_resv( int pagecreate) /* call anon_resv if set */ { pgcnt_t pages = btopr(delta); + zone_t *zone; ASSERT(RW_WRITE_HELD(&tp->tn_rwlock)); ASSERT(tp->tn_type == VREG); @@ -79,9 +79,10 @@ tmp_resv( * * Deny if trying to reserve more than tmpfs can allocate */ + zone = tm->tm_vfsp->vfs_zone; if (pagecreate && ((tm->tm_anonmem + pages > tm->tm_anonmax) || - (!anon_checkspace(ptob(pages + tmpfs_minfree))) || - (anon_resv(delta) == 0))) { + (!anon_checkspace(ptob(pages + tmpfs_minfree), zone)) || + (anon_resv_zone(delta, zone) == 0))) { return (1); } @@ -114,7 +115,7 @@ tmp_unresv( ASSERT(RW_WRITE_HELD(&tp->tn_rwlock)); ASSERT(tp->tn_type == VREG); - anon_unresv(delta); + anon_unresv_zone(delta, tm->tm_vfsp->vfs_zone); mutex_enter(&tm->tm_contents); tm->tm_anonmem -= btopr(delta); diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c index d623dce3f7..aa870b124a 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c @@ -215,9 +215,26 @@ wrtmp( if (delta > 0) { pagecreate = 1; if (tmp_resv(tm, tp, delta, pagecreate)) { - cmn_err(CE_WARN, - "%s: File system full, swap space limit exceeded", + /* + * Log file system full in the zone that owns + * the tmpfs mount, as well as in the global + * zone if necessary. + */ + zcmn_err(tm->tm_vfsp->vfs_zone->zone_id, + CE_WARN, "%s: File system full, " + "swap space limit exceeded", tm->tm_mntpath); + + if (tm->tm_vfsp->vfs_zone->zone_id != + GLOBAL_ZONEID) { + + vfs_t *vfs = tm->tm_vfsp; + + zcmn_err(GLOBAL_ZONEID, + CE_WARN, "%s: File system full, " + "swap space limit exceeded", + vfs->vfs_vnodecovered->v_path); + } error = ENOSPC; break; } diff --git a/usr/src/uts/common/os/modhash.c b/usr/src/uts/common/os/modhash.c index 19700ce685..3c63231253 100644 --- a/usr/src/uts/common/os/modhash.c +++ b/usr/src/uts/common/os/modhash.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -165,15 +164,6 @@ */ #define MH_KEYCMP(hash, key1, key2) ((hash->mh_keycmp)(key1, key2)) -static void i_mod_hash_clear_nosync(mod_hash_t *); -static int i_mod_hash_find_nosync(mod_hash_t *, mod_hash_key_t, - mod_hash_val_t *); -static int i_mod_hash_insert_nosync(mod_hash_t *, mod_hash_key_t, - mod_hash_val_t, mod_hash_hndl_t); -static int i_mod_hash_remove_nosync(mod_hash_t *, mod_hash_key_t, - mod_hash_val_t *); -static uint_t i_mod_hash(mod_hash_t *, mod_hash_key_t); - /* * Cache for struct mod_hash_entry */ @@ -522,7 +512,7 @@ mod_hash_destroy_hash(mod_hash_t *hash) * i_mod_hash() * Call the hashing algorithm for this hash table, with the given key. */ -static uint_t +uint_t i_mod_hash(mod_hash_t *hash, mod_hash_key_t key) { uint_t h; @@ -778,7 +768,7 @@ mod_hash_destroy(mod_hash_t *hash, mod_hash_key_t key) * mod_hash_find() * Find a value in the hash table corresponding to the given key. 
*/ -static int +int i_mod_hash_find_nosync(mod_hash_t *hash, mod_hash_key_t key, mod_hash_val_t *val) { @@ -826,7 +816,7 @@ mod_hash_find_cb(mod_hash_t *hash, mod_hash_key_t key, mod_hash_val_t *val, return (res); } -static void +void i_mod_hash_walk_nosync(mod_hash_t *hash, uint_t (*callback)(mod_hash_key_t, mod_hash_val_t *, void *), void *arg) { @@ -870,7 +860,7 @@ mod_hash_walk(mod_hash_t *hash, * Clears the given hash table by calling the destructor of every hash * element and freeing up all mod_hash_entry's. */ -static void +void i_mod_hash_clear_nosync(mod_hash_t *hash) { int i; diff --git a/usr/src/uts/common/os/pid.c b/usr/src/uts/common/os/pid.c index 88b0258afe..fecc4a6c45 100644 --- a/usr/src/uts/common/os/pid.c +++ b/usr/src/uts/common/os/pid.c @@ -385,6 +385,56 @@ pgfind(pid_t pgid) } /* + * Sets P_PR_LOCK on a non-system process. Process must be fully created + * and not exiting to succeed. + * + * Returns 0 on success. + * Returns 1 if P_PR_LOCK is set. + * Returns -1 if proc is in invalid state. + */ +int +sprtrylock_proc(proc_t *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + + /* skip system and incomplete processes */ + if (p->p_stat == SIDL || p->p_stat == SZOMB || + (p->p_flag & (SSYS | SEXITING | SEXITLWPS))) { + return (-1); + } + + if (p->p_proc_flag & P_PR_LOCK) + return (1); + + p->p_proc_flag |= P_PR_LOCK; + THREAD_KPRI_REQUEST(); + + return (0); +} + +/* + * Wait for P_PR_LOCK to become clear. Returns with p_lock dropped, + * and the proc pointer no longer valid, as the proc may have exited. + */ +void +sprwaitlock_proc(proc_t *p) +{ + kmutex_t *mp; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(p->p_proc_flag & P_PR_LOCK); + + /* + * p_lock is persistent, but p itself is not -- it could + * vanish during cv_wait(). Load p->p_lock now so we can + * drop it after cv_wait() without referencing p. + */ + mp = &p->p_lock; + cv_wait(&pr_pid_cv[p->p_slot], mp); + mutex_exit(mp); +} + +/* * If pid exists, find its proc, acquire its p_lock and mark it P_PR_LOCK. * Returns the proc pointer on success, NULL on failure. sprlock() is * really just a stripped-down version of pr_p_lock() to allow practive @@ -394,7 +444,7 @@ proc_t * sprlock_zone(pid_t pid, zoneid_t zoneid) { proc_t *p; - kmutex_t *mp; + int ret; for (;;) { mutex_enter(&pidlock); @@ -402,31 +452,21 @@ sprlock_zone(pid_t pid, zoneid_t zoneid) mutex_exit(&pidlock); return (NULL); } - /* - * p_lock is persistent, but p itself is not -- it could - * vanish during cv_wait(). Load p->p_lock now so we can - * drop it after cv_wait() without referencing p. - */ - mp = &p->p_lock; - mutex_enter(mp); + mutex_enter(&p->p_lock); mutex_exit(&pidlock); - /* - * If the process is in some half-baked state, fail. 
- */ - if (p->p_stat == SZOMB || p->p_stat == SIDL || - (p->p_flag & (SEXITING | SEXITLWPS))) { - mutex_exit(mp); - return (NULL); - } + if (panicstr) return (p); - if (!(p->p_proc_flag & P_PR_LOCK)) + + ret = sprtrylock_proc(p); + if (ret == -1) { + mutex_exit(&p->p_lock); + return (NULL); + } else if (ret == 0) { break; - cv_wait(&pr_pid_cv[p->p_slot], mp); - mutex_exit(mp); + } + sprwaitlock_proc(p); } - p->p_proc_flag |= P_PR_LOCK; - THREAD_KPRI_REQUEST(); return (p); } diff --git a/usr/src/uts/common/os/pool.c b/usr/src/uts/common/os/pool.c index ceb90850fa..818bb54701 100644 --- a/usr/src/uts/common/os/pool.c +++ b/usr/src/uts/common/os/pool.c @@ -293,6 +293,8 @@ pool_enable(void) (void) nvlist_add_string(pool_sys_prop, "system.comment", ""); (void) nvlist_add_int64(pool_sys_prop, "system.version", 1); (void) nvlist_add_byte(pool_sys_prop, "system.bind-default", 1); + (void) nvlist_add_string(pool_sys_prop, "system.poold.objectives", + "wt-load"); (void) nvlist_alloc(&pool_default->pool_props, NV_UNIQUE_NAME, KM_SLEEP); @@ -1309,7 +1311,7 @@ pool_do_bind(pool_t *pool, idtype_t idtype, id_t id, int flags) } if (idtype == P_PROJID) { - kpj = project_hold_by_id(id, GLOBAL_ZONEID, PROJECT_HOLD_FIND); + kpj = project_hold_by_id(id, global_zone, PROJECT_HOLD_FIND); if (kpj == NULL) return (ESRCH); mutex_enter(&kpj->kpj_poolbind); diff --git a/usr/src/uts/common/os/project.c b/usr/src/uts/common/os/project.c index 6c266c0ca3..d75b60f6e9 100644 --- a/usr/src/uts/common/os/project.c +++ b/usr/src/uts/common/os/project.c @@ -29,6 +29,7 @@ #include <sys/modhash.h> #include <sys/modctl.h> #include <sys/kmem.h> +#include <sys/kstat.h> #include <sys/atomic.h> #include <sys/cmn_err.h> #include <sys/proc.h> @@ -103,6 +104,8 @@ struct project_zone { * acquired, the hash lock is to be acquired first. */ +static kstat_t *project_kstat_create(kproject_t *pj, zone_t *zone); +static void project_kstat_delete(kproject_t *pj); static void project_data_init(kproject_data_t *data) @@ -118,6 +121,7 @@ project_data_init(kproject_data_t *data) data->kpd_locked_mem_ctl = UINT64_MAX; data->kpd_contract = 0; data->kpd_crypto_mem = 0; + data->kpd_lockedmem_kstat = NULL; } /*ARGSUSED*/ @@ -179,11 +183,11 @@ project_hold(kproject_t *p) } /* - * kproject_t *project_hold_by_id(projid_t, zoneid_t, int) + * kproject_t *project_hold_by_id(projid_t, zone_t *, int) * * Overview * project_hold_by_id() performs a look-up in the dictionary of projects - * active on the system by specified project ID + zone ID and puts a hold on + * active on the system by specified project ID + zone and puts a hold on * it. The third argument defines the desired behavior in the case when * project with given project ID cannot be found: * @@ -202,7 +206,7 @@ project_hold(kproject_t *p) * Caller must be in a context suitable for KM_SLEEP allocations. 
*/ kproject_t * -project_hold_by_id(projid_t id, zoneid_t zoneid, int flag) +project_hold_by_id(projid_t id, zone_t *zone, int flag) { kproject_t *spare_p; kproject_t *p; @@ -211,9 +215,11 @@ project_hold_by_id(projid_t id, zoneid_t zoneid, int flag) rctl_alloc_gp_t *gp; rctl_entity_p_t e; struct project_zone pz; + boolean_t create = B_FALSE; + kstat_t *ksp; pz.kpj_id = id; - pz.kpj_zoneid = zoneid; + pz.kpj_zoneid = zone->zone_id; if (flag == PROJECT_HOLD_FIND) { mutex_enter(&project_hash_lock); @@ -241,9 +247,10 @@ project_hold_by_id(projid_t id, zoneid_t zoneid, int flag) mutex_enter(&project_hash_lock); if (mod_hash_find(projects_hash, (mod_hash_key_t)&pz, (mod_hash_val_t *)&p) == MH_ERR_NOTFOUND) { + p = spare_p; p->kpj_id = id; - p->kpj_zoneid = zoneid; + p->kpj_zoneid = zone->zone_id; p->kpj_count = 0; p->kpj_shares = 1; p->kpj_nlwps = 0; @@ -265,7 +272,7 @@ project_hold_by_id(projid_t id, zoneid_t zoneid, int flag) * Insert project into global project list. */ mutex_enter(&projects_list_lock); - if (id != 0 || zoneid != GLOBAL_ZONEID) { + if (id != 0 || zone != &zone0) { p->kpj_next = projects_list; p->kpj_prev = projects_list->kpj_prev; p->kpj_prev->kpj_next = p; @@ -279,6 +286,7 @@ project_hold_by_id(projid_t id, zoneid_t zoneid, int flag) projects_list = p; } mutex_exit(&projects_list_lock); + create = B_TRUE; } else { mutex_exit(&curproc->p_lock); mod_hash_cancel(projects_hash, &hndl); @@ -290,10 +298,20 @@ project_hold_by_id(projid_t id, zoneid_t zoneid, int flag) p->kpj_count++; mutex_exit(&project_hash_lock); + /* + * The kstat stores the project's zone name, as zoneid's may change + * across reboots. + */ + if (create == B_TRUE) { + ksp = project_kstat_create(p, zone); + mutex_enter(&project_hash_lock); + ASSERT(p->kpj_data.kpd_lockedmem_kstat == NULL); + p->kpj_data.kpd_lockedmem_kstat = ksp; + mutex_exit(&project_hash_lock); + } return (p); } - /* * void project_rele(kproject_t *) * @@ -325,6 +343,7 @@ project_rele(kproject_t *p) mutex_exit(&projects_list_lock); rctl_set_free(p->kpj_rctls); + project_kstat_delete(p); if (mod_hash_destroy(projects_hash, (mod_hash_key_t)p)) panic("unable to delete project %d zone %d", p->kpj_id, @@ -636,9 +655,9 @@ project_locked_mem_usage(rctl_t *rctl, struct proc *p) { rctl_qty_t q; ASSERT(MUTEX_HELD(&p->p_lock)); - mutex_enter(&p->p_zone->zone_rctl_lock); + mutex_enter(&p->p_zone->zone_mem_lock); q = p->p_task->tk_proj->kpj_data.kpd_locked_mem; - mutex_exit(&p->p_zone->zone_rctl_lock); + mutex_exit(&p->p_zone->zone_mem_lock); return (q); } @@ -649,7 +668,7 @@ project_locked_mem_test(struct rctl *rctl, struct proc *p, rctl_entity_p_t *e, { rctl_qty_t q; ASSERT(MUTEX_HELD(&p->p_lock)); - ASSERT(MUTEX_HELD(&p->p_zone->zone_rctl_lock)); + ASSERT(MUTEX_HELD(&p->p_zone->zone_mem_lock)); q = p->p_task->tk_proj->kpj_data.kpd_locked_mem; if (q + inc > rval->rcv_value) return (1); @@ -868,7 +887,7 @@ project_init(void) rctl_add_default_limit("project.max-contracts", 10000, RCPRIV_PRIVILEGED, RCTL_LOCAL_DENY); - t0.t_proj = proj0p = project_hold_by_id(0, GLOBAL_ZONEID, + t0.t_proj = proj0p = project_hold_by_id(0, &zone0, PROJECT_HOLD_INSERT); mutex_enter(&p0.p_lock); @@ -876,3 +895,57 @@ project_init(void) mutex_exit(&p0.p_lock); proj0p->kpj_ntasks = 1; } + +static int +project_lockedmem_kstat_update(kstat_t *ksp, int rw) +{ + kproject_t *pj = ksp->ks_private; + kproject_kstat_t *kpk = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + + kpk->kpk_usage.value.ui64 = pj->kpj_data.kpd_locked_mem; + kpk->kpk_value.value.ui64 = 
pj->kpj_data.kpd_locked_mem_ctl; + return (0); +} + +static kstat_t * +project_kstat_create(kproject_t *pj, zone_t *zone) +{ + kstat_t *ksp; + kproject_kstat_t *kpk; + char *zonename = zone->zone_name; + + ksp = rctl_kstat_create_project(pj, "lockedmem", KSTAT_TYPE_NAMED, + sizeof (kproject_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (ksp == NULL) + return (NULL); + + kpk = ksp->ks_data = kmem_alloc(sizeof (kproject_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zonename) + 1; + kstat_named_init(&kpk->kpk_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&kpk->kpk_zonename, zonename); + kstat_named_init(&kpk->kpk_usage, "usage", KSTAT_DATA_UINT64); + kstat_named_init(&kpk->kpk_value, "value", KSTAT_DATA_UINT64); + ksp->ks_update = project_lockedmem_kstat_update; + ksp->ks_private = pj; + kstat_install(ksp); + + return (ksp); +} + +static void +project_kstat_delete(kproject_t *pj) +{ + void *data; + + if (pj->kpj_data.kpd_lockedmem_kstat != NULL) { + data = pj->kpj_data.kpd_lockedmem_kstat->ks_data; + kstat_delete(pj->kpj_data.kpd_lockedmem_kstat); + kmem_free(data, sizeof (zone_kstat_t)); + } + pj->kpj_data.kpd_lockedmem_kstat = NULL; +} diff --git a/usr/src/uts/common/os/rctl.c b/usr/src/uts/common/os/rctl.c index 4de4c74fe8..c0479005ea 100644 --- a/usr/src/uts/common/os/rctl.c +++ b/usr/src/uts/common/os/rctl.c @@ -29,6 +29,7 @@ #include <sys/cmn_err.h> #include <sys/id_space.h> #include <sys/kmem.h> +#include <sys/kstat.h> #include <sys/log.h> #include <sys/modctl.h> #include <sys/modhash.h> @@ -2599,7 +2600,7 @@ rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc, zonep = p->p_zone; } - mutex_enter(&zonep->zone_rctl_lock); + mutex_enter(&zonep->zone_mem_lock); e.rcep_p.proj = projp; e.rcep_t = RCENTITY_PROJECT; @@ -2627,7 +2628,7 @@ rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc, p->p_locked_mem += inc; } out: - mutex_exit(&zonep->zone_rctl_lock); + mutex_exit(&zonep->zone_mem_lock); if (proj != NULL) zone_rele(zonep); return (ret); @@ -2661,7 +2662,7 @@ rctl_decr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc, zonep = p->p_zone; } - mutex_enter(&zonep->zone_rctl_lock); + mutex_enter(&zonep->zone_mem_lock); zonep->zone_locked_mem -= inc; projp->kpj_data.kpd_locked_mem -= inc; if (creditproc != 0) { @@ -2669,7 +2670,120 @@ rctl_decr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc, ASSERT(MUTEX_HELD(&p->p_lock)); p->p_locked_mem -= inc; } - mutex_exit(&zonep->zone_rctl_lock); + mutex_exit(&zonep->zone_mem_lock); if (proj != NULL) zone_rele(zonep); } + +/* + * rctl_incr_swap(proc_t *, zone_t *, size_t) + * + * Overview + * Increments the swap charge on the specified zone. + * + * Return values + * 0 on success. EAGAIN if swap increment fails due an rctl value + * on the zone. + * + * Callers context + * p_lock held on specified proc. 
+ * swap must be even multiple of PAGESIZE + */ +int +rctl_incr_swap(proc_t *proc, zone_t *zone, size_t swap) +{ + rctl_entity_p_t e; + + ASSERT(MUTEX_HELD(&proc->p_lock)); + ASSERT((swap & PAGEOFFSET) == 0); + e.rcep_p.zone = zone; + e.rcep_t = RCENTITY_ZONE; + + mutex_enter(&zone->zone_mem_lock); + + if ((zone->zone_max_swap + swap) > + zone->zone_max_swap_ctl) { + + if (rctl_test_entity(rc_zone_max_swap, zone->zone_rctls, + proc, &e, swap, 0) & RCT_DENY) { + mutex_exit(&zone->zone_mem_lock); + return (EAGAIN); + } + } + zone->zone_max_swap += swap; + mutex_exit(&zone->zone_mem_lock); + return (0); +} + +/* + * rctl_decr_swap(zone_t *, size_t) + * + * Overview + * Decrements the swap charge on the specified zone. + * + * Return values + * None + * + * Callers context + * swap must be even multiple of PAGESIZE + */ +void +rctl_decr_swap(zone_t *zone, size_t swap) +{ + ASSERT((swap & PAGEOFFSET) == 0); + mutex_enter(&zone->zone_mem_lock); + ASSERT(zone->zone_max_swap >= swap); + zone->zone_max_swap -= swap; + mutex_exit(&zone->zone_mem_lock); +} + +/* + * Create resource kstat + */ +static kstat_t * +rctl_kstat_create_common(char *ks_name, int ks_instance, char *ks_class, + uchar_t ks_type, uint_t ks_ndata, uchar_t ks_flags, int ks_zoneid) +{ + kstat_t *ksp = NULL; + char name[KSTAT_STRLEN]; + + (void) snprintf(name, KSTAT_STRLEN, "%s_%d", ks_name, ks_instance); + + if ((ksp = kstat_create_zone("caps", ks_zoneid, + name, ks_class, ks_type, + ks_ndata, ks_flags, ks_zoneid)) != NULL) { + if (ks_zoneid != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + } + return (ksp); +} + +/* + * Create zone-specific resource kstat + */ +kstat_t * +rctl_kstat_create_zone(zone_t *zone, char *ks_name, uchar_t ks_type, + uint_t ks_ndata, uchar_t ks_flags) +{ + char name[KSTAT_STRLEN]; + + (void) snprintf(name, KSTAT_STRLEN, "%s_zone", ks_name); + + return (rctl_kstat_create_common(name, zone->zone_id, "zone_caps", + ks_type, ks_ndata, ks_flags, zone->zone_id)); +} + +/* + * Create project-specific resource kstat + */ +kstat_t * +rctl_kstat_create_project(kproject_t *kpj, char *ks_name, uchar_t ks_type, + uint_t ks_ndata, uchar_t ks_flags) +{ + char name[KSTAT_STRLEN]; + + (void) snprintf(name, KSTAT_STRLEN, "%s_project", ks_name); + + return (rctl_kstat_create_common(name, kpj->kpj_id, "project_caps", + ks_type, ks_ndata, ks_flags, kpj->kpj_zoneid)); +} diff --git a/usr/src/uts/common/os/schedctl.c b/usr/src/uts/common/os/schedctl.c index 66aae7d2bc..62279e0777 100644 --- a/usr/src/uts/common/os/schedctl.c +++ b/usr/src/uts/common/os/schedctl.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -542,13 +541,13 @@ schedctl_getpage(struct anon_map **newamp, caddr_t *newaddr) * Set up anonymous memory struct. No swap reservation is * needed since the page will be locked into memory. 
*/ - amp = anonmap_alloc(PAGESIZE, PAGESIZE); + amp = anonmap_alloc(PAGESIZE, 0); /* * Allocate the page. */ - kaddr = segkp_get_withanonmap(segkp, PAGESIZE, KPD_LOCKED | KPD_ZERO, - amp); + kaddr = segkp_get_withanonmap(segkp, PAGESIZE, + KPD_NO_ANON | KPD_LOCKED | KPD_ZERO, amp); if (kaddr == NULL) { amp->refcnt--; anonmap_free(amp); diff --git a/usr/src/uts/common/os/sysent.c b/usr/src/uts/common/os/sysent.c index 9ada0aac18..a7ef99fddb 100644 --- a/usr/src/uts/common/os/sysent.c +++ b/usr/src/uts/common/os/sysent.c @@ -666,7 +666,7 @@ struct sysent sysent[NSYSCALL] = /* 178 */ SYSENT_LOADABLE(), /* kaio */ /* 179 */ SYSENT_LOADABLE(), /* cpc */ /* 180 */ SYSENT_CI("lgrpsys", lgrpsys, 3), - /* 181 */ SYSENT_CI("rusagesys", rusagesys, 2), + /* 181 */ SYSENT_CI("rusagesys", rusagesys, 5), /* 182 */ SYSENT_LOADABLE(), /* portfs */ /* 183 */ SYSENT_CI("pollsys", pollsys, 4), /* 184 */ SYSENT_CI("labelsys", labelsys, 5), @@ -1044,7 +1044,7 @@ struct sysent sysent32[NSYSCALL] = /* 178 */ SYSENT_LOADABLE32(), /* kaio */ /* 179 */ SYSENT_LOADABLE32(), /* cpc */ /* 180 */ SYSENT_CI("lgrpsys", lgrpsys, 3), - /* 181 */ SYSENT_CI("rusagesys", rusagesys, 2), + /* 181 */ SYSENT_CI("rusagesys", rusagesys, 5), /* 182 */ SYSENT_LOADABLE32(), /* portfs */ /* 183 */ SYSENT_CI("pollsys", pollsys, 4), /* 184 */ SYSENT_CI("labelsys", labelsys, 5), diff --git a/usr/src/uts/common/os/task.c b/usr/src/uts/common/os/task.c index 562e3596b5..785f74c145 100644 --- a/usr/src/uts/common/os/task.c +++ b/usr/src/uts/common/os/task.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -389,7 +388,7 @@ task_create(projid_t projid, zone_t *zone) tk->tk_nlwps = 0; tk->tk_nlwps_ctl = INT_MAX; tk->tk_usage = tu; - tk->tk_proj = project_hold_by_id(projid, zone->zone_id, + tk->tk_proj = project_hold_by_id(projid, zone, PROJECT_HOLD_INSERT); tk->tk_flags = TASK_NORMAL; @@ -848,7 +847,7 @@ task_init(void) task0p->tk_tkid = id_alloc(taskid_space); task0p->tk_usage = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP); - task0p->tk_proj = project_hold_by_id(0, GLOBAL_ZONEID, + task0p->tk_proj = project_hold_by_id(0, &zone0, PROJECT_HOLD_INSERT); task0p->tk_flags = TASK_NORMAL; task0p->tk_nlwps = p->p_lwpcnt; diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c index 0fb2c2be55..19ea8b31f1 100644 --- a/usr/src/uts/common/os/zone.c +++ b/usr/src/uts/common/os/zone.c @@ -154,6 +154,10 @@ * zone_lock: This is a per-zone lock used to protect several fields of * the zone_t (see <sys/zone.h> for details). In addition, holding * this lock means that the zone cannot go away. + * zone_nlwps_lock: This is a per-zone lock used to protect the fields + * related to the zone.max-lwps rctl. + * zone_mem_lock: This is a per-zone lock used to protect the fields + * related to the zone.max-locked-memory and zone.max-swap rctls. 
* zsd_key_lock: This is a global lock protecting the key state for ZSD. * zone_deathrow_lock: This is a global lock protecting the "deathrow" * list (a list of zones in the ZONE_IS_DEAD state). @@ -162,6 +166,10 @@ * pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock --> * zone_lock --> zsd_key_lock --> pidlock --> p_lock * + * When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is: + * zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock + * zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock + * * Blocking memory allocations are permitted while holding any of the * zone locks. * @@ -190,6 +198,7 @@ #include <sys/debug.h> #include <sys/file.h> #include <sys/kmem.h> +#include <sys/kstat.h> #include <sys/mutex.h> #include <sys/note.h> #include <sys/pathname.h> @@ -232,6 +241,8 @@ #include <sys/zone.h> #include <sys/tsol/label.h> +#include <vm/seg.h> + /* * cv used to signal that all references to the zone have been released. This * needs to be global since there may be multiple waiters, and the first to @@ -317,6 +328,7 @@ const char *zone_status_table[] = { */ rctl_hndl_t rc_zone_cpu_shares; rctl_hndl_t rc_zone_locked_mem; +rctl_hndl_t rc_zone_max_swap; rctl_hndl_t rc_zone_nlwps; rctl_hndl_t rc_zone_shmmax; rctl_hndl_t rc_zone_shmmni; @@ -1011,9 +1023,9 @@ zone_locked_mem_usage(rctl_t *rctl, struct proc *p) { rctl_qty_t q; ASSERT(MUTEX_HELD(&p->p_lock)); - mutex_enter(&p->p_zone->zone_rctl_lock); + mutex_enter(&p->p_zone->zone_mem_lock); q = p->p_zone->zone_locked_mem; - mutex_exit(&p->p_zone->zone_rctl_lock); + mutex_exit(&p->p_zone->zone_mem_lock); return (q); } @@ -1023,9 +1035,12 @@ zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags) { rctl_qty_t q; + zone_t *z; + + z = e->rcep_p.zone; ASSERT(MUTEX_HELD(&p->p_lock)); - ASSERT(MUTEX_HELD(&p->p_zone->zone_rctl_lock)); - q = p->p_zone->zone_locked_mem; + ASSERT(MUTEX_HELD(&z->zone_mem_lock)); + q = z->zone_locked_mem; if (q + incr > rcntl->rcv_value) return (1); return (0); @@ -1051,6 +1066,57 @@ static rctl_ops_t zone_locked_mem_ops = { zone_locked_mem_test }; +/*ARGSUSED*/ +static rctl_qty_t +zone_max_swap_usage(rctl_t *rctl, struct proc *p) +{ + rctl_qty_t q; + zone_t *z = p->p_zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + mutex_enter(&z->zone_mem_lock); + q = z->zone_max_swap; + mutex_exit(&z->zone_mem_lock); + return (q); +} + +/*ARGSUSED*/ +static int +zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, + rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags) +{ + rctl_qty_t q; + zone_t *z; + + z = e->rcep_p.zone; + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(MUTEX_HELD(&z->zone_mem_lock)); + q = z->zone_max_swap; + if (q + incr > rcntl->rcv_value) + return (1); + return (0); +} + +/*ARGSUSED*/ +static int +zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + if (e->rcep_p.zone == NULL) + return (0); + e->rcep_p.zone->zone_max_swap_ctl = nv; + return (0); +} + +static rctl_ops_t zone_max_swap_ops = { + rcop_no_action, + zone_max_swap_usage, + zone_max_swap_set, + zone_max_swap_test +}; + /* * Helper function to brand the zone with a unique ID. 
*/ @@ -1080,6 +1146,96 @@ zone_get_kcred(zoneid_t zoneid) return (cr); } +static int +zone_lockedmem_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_kstat_t *zk = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + + zk->zk_usage.value.ui64 = zone->zone_locked_mem; + zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl; + return (0); +} + +static int +zone_swapresv_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_kstat_t *zk = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + + zk->zk_usage.value.ui64 = zone->zone_max_swap; + zk->zk_value.value.ui64 = zone->zone_max_swap_ctl; + return (0); +} + +static void +zone_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_kstat_t *zk; + + ksp = rctl_kstat_create_zone(zone, "lockedmem", KSTAT_TYPE_NAMED, + sizeof (zone_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (ksp == NULL) + return; + + zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zk->zk_zonename, zone->zone_name); + kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64); + kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64); + ksp->ks_update = zone_lockedmem_kstat_update; + ksp->ks_private = zone; + kstat_install(ksp); + + zone->zone_lockedmem_kstat = ksp; + + ksp = rctl_kstat_create_zone(zone, "swapresv", KSTAT_TYPE_NAMED, + sizeof (zone_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (ksp == NULL) + return; + + zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zk->zk_zonename, zone->zone_name); + kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64); + kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64); + ksp->ks_update = zone_swapresv_kstat_update; + ksp->ks_private = zone; + kstat_install(ksp); + + zone->zone_swapresv_kstat = ksp; +} + +static void +zone_kstat_delete(zone_t *zone) +{ + void *data; + + if (zone->zone_lockedmem_kstat != NULL) { + data = zone->zone_lockedmem_kstat->ks_data; + kstat_delete(zone->zone_lockedmem_kstat); + kmem_free(data, sizeof (zone_kstat_t)); + } + if (zone->zone_swapresv_kstat != NULL) { + data = zone->zone_swapresv_kstat->ks_data; + kstat_delete(zone->zone_swapresv_kstat); + kmem_free(data, sizeof (zone_kstat_t)); + } +} + /* * Called very early on in boot to initialize the ZSD list so that * zone_key_create() can be called before zone_init(). 
It also initializes @@ -1101,8 +1257,14 @@ zone_zsd_init(void) mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL); zone0.zone_shares = 1; + zone0.zone_nlwps = 0; zone0.zone_nlwps_ctl = INT_MAX; + zone0.zone_locked_mem = 0; + zone0.zone_locked_mem_ctl = UINT64_MAX; + ASSERT(zone0.zone_max_swap == 0); + zone0.zone_max_swap_ctl = UINT64_MAX; zone0.zone_shmmax = 0; zone0.zone_ipc.ipcq_shmmni = 0; zone0.zone_ipc.ipcq_semmni = 0; @@ -1120,6 +1282,8 @@ zone_zsd_init(void) zone0.zone_ncpus_online = 0; zone0.zone_proc_initpid = 1; zone0.zone_initname = initname; + zone0.zone_lockedmem_kstat = NULL; + zone0.zone_swapresv_kstat = NULL; list_create(&zone0.zone_zsd, sizeof (struct zsd_entry), offsetof(struct zsd_entry, zsd_linkage)); list_insert_head(&zone_active, &zone0); @@ -1259,6 +1423,12 @@ zone_init(void) RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, &zone_locked_mem_ops); + + rc_zone_max_swap = rctl_register("zone.max-swap", + RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | + RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, + &zone_max_swap_ops); + /* * Initialize the ``global zone''. */ @@ -1277,9 +1447,14 @@ zone_init(void) zone0.zone_brand = &native_brand; rctl_prealloc_destroy(gp); /* - * pool_default hasn't been initialized yet, so we let pool_init() take - * care of making the global zone is in the default pool. + * pool_default hasn't been initialized yet, so we let pool_init() + * take care of making sure the global zone is in the default pool. + */ + + /* + * Initialize global zone kstats */ + zone_kstat_create(&zone0); /* * Initialize zone label. @@ -1337,6 +1512,7 @@ zone_init(void) if (res) panic("Sysevent_evc_bind failed during zone setup.\n"); + } static void @@ -1476,6 +1652,38 @@ zone_set_initname(zone_t *zone, const char *zone_initname) return (0); } +static int +zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap) +{ + uint64_t mcap; + int err = 0; + + if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0) + zone->zone_phys_mcap = mcap; + + return (err); +} + +static int +zone_set_sched_class(zone_t *zone, const char *new_class) +{ + char sched_class[PC_CLNMSZ]; + id_t classid; + int err; + + ASSERT(zone != global_zone); + if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0) + return (err); /* EFAULT or ENAMETOOLONG */ + + if (getcid(sched_class, &classid) != 0 || classid == syscid) + return (set_errno(EINVAL)); + zone->zone_defaultcid = classid; + ASSERT(zone->zone_defaultcid > 0 && + zone->zone_defaultcid < loaded_classes); + + return (0); +} + /* * Block indefinitely waiting for (zone_status >= status) */ @@ -2510,10 +2718,10 @@ zsched(void *arg) /* * Decrement locked memory counts on old zone and project. */ - mutex_enter(&global_zone->zone_rctl_lock); + mutex_enter(&global_zone->zone_mem_lock); global_zone->zone_locked_mem -= pp->p_locked_mem; pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem; - mutex_exit(&global_zone->zone_rctl_lock); + mutex_exit(&global_zone->zone_mem_lock); /* * Create and join a new task in project '0' of this zone. 
@@ -2529,10 +2737,10 @@ zsched(void *arg) pj = pp->p_task->tk_proj; - mutex_enter(&zone->zone_rctl_lock); + mutex_enter(&zone->zone_mem_lock); zone->zone_locked_mem += pp->p_locked_mem; pj->kpj_data.kpd_locked_mem += pp->p_locked_mem; - mutex_exit(&zone->zone_rctl_lock); + mutex_exit(&zone->zone_mem_lock); /* * add lwp counts to zsched's zone, and increment project's task count @@ -2689,7 +2897,10 @@ zsched(void *arg) * classid 'cid'. */ pool_lock(); - cid = pool_get_class(zone->zone_pool); + if (zone->zone_defaultcid > 0) + cid = zone->zone_defaultcid; + else + cid = pool_get_class(zone->zone_pool); if (cid == -1) cid = defaultcid; @@ -3019,7 +3230,7 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_initname = NULL; mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&zone->zone_rctl_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL); list_create(&zone->zone_zsd, sizeof (struct zsd_entry), offsetof(struct zsd_entry, zsd_linkage)); @@ -3057,8 +3268,14 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_initname = kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP); (void) strcpy(zone->zone_initname, zone_default_initname); + zone->zone_nlwps = 0; + zone->zone_nlwps_ctl = INT_MAX; zone->zone_locked_mem = 0; zone->zone_locked_mem_ctl = UINT64_MAX; + zone->zone_max_swap = 0; + zone->zone_max_swap_ctl = UINT64_MAX; + zone0.zone_lockedmem_kstat = NULL; + zone0.zone_swapresv_kstat = NULL; /* * Zsched initializes the rctls. @@ -3233,6 +3450,11 @@ zone_create(const char *zone_name, const char *zone_root, */ /* + * Create zone kstats + */ + zone_kstat_create(zone); + + /* * Let the other lwps continue. */ mutex_enter(&pp->p_lock); @@ -3643,6 +3865,9 @@ zone_destroy(zoneid_t zoneid) } + /* Get rid of the zone's kstats */ + zone_kstat_delete(zone); + /* * It is now safe to let the zone be recreated; remove it from the * lists. The memory will not be freed until the last cred @@ -3892,6 +4117,32 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) error = EFAULT; } break; + case ZONE_ATTR_PHYS_MCAP: + size = sizeof (zone->zone_phys_mcap); + if (bufsize > size) + bufsize = size; + if (buf != NULL && + copyout(&zone->zone_phys_mcap, buf, bufsize) != 0) + error = EFAULT; + break; + case ZONE_ATTR_SCHED_CLASS: + mutex_enter(&class_lock); + + if (zone->zone_defaultcid >= loaded_classes) + outstr = ""; + else + outstr = sclass[zone->zone_defaultcid].cl_name; + size = strlen(outstr) + 1; + if (bufsize > size) + bufsize = size; + if (buf != NULL) { + err = copyoutstr(outstr, buf, bufsize, NULL); + if (err != 0 && err != ENAMETOOLONG) + error = EFAULT; + } + + mutex_exit(&class_lock); + break; default: if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) { size = bufsize; @@ -3923,10 +4174,10 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) return (set_errno(EPERM)); /* - * At present, attributes can only be set on non-running, - * non-global zones. + * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the + * global zone. 
*/ - if (zoneid == GLOBAL_ZONEID) { + if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) { return (set_errno(EINVAL)); } @@ -3938,8 +4189,12 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) zone_hold(zone); mutex_exit(&zonehash_lock); + /* + * At present most attributes can only be set on non-running, + * non-global zones. + */ zone_status = zone_status_get(zone); - if (zone_status > ZONE_IS_READY) + if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) goto done; switch (attr) { @@ -3971,6 +4226,12 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) if (zone->zone_brand == NULL) err = EINVAL; break; + case ZONE_ATTR_PHYS_MCAP: + err = zone_set_phys_mcap(zone, (const uint64_t *)buf); + break; + case ZONE_ATTR_SCHED_CLASS: + err = zone_set_sched_class(zone, (const char *)buf); + break; default: if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize); @@ -3986,6 +4247,11 @@ done: /* * Return zero if the process has at least one vnode mapped in to its * address space which shouldn't be allowed to change zones. + * + * Also return zero if the process has any shared mappings which reserve + * swap. This is because the counting for zone.max-swap does not allow swap + * revervation to be shared between zones. zone swap reservation is counted + * on zone->zone_max_swap. */ static int as_can_change_zones(void) @@ -3997,8 +4263,17 @@ as_can_change_zones(void) int allow = 1; ASSERT(pp->p_as != &kas); - AS_LOCK_ENTER(&as, &as->a_lock, RW_READER); + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { + + /* + * Cannot enter zone with shared anon memory which + * reserves swap. See comment above. + */ + if (seg_can_change_zones(seg) == B_FALSE) { + allow = 0; + break; + } /* * if we can't get a backing vnode for this segment then skip * it. @@ -4011,11 +4286,30 @@ as_can_change_zones(void) break; } } - AS_LOCK_EXIT(&as, &as->a_lock); + AS_LOCK_EXIT(as, &as->a_lock); return (allow); } /* + * Count swap reserved by curproc's address space + */ +static size_t +as_swresv(void) +{ + proc_t *pp = curproc; + struct seg *seg; + struct as *as = pp->p_as; + size_t swap = 0; + + ASSERT(pp->p_as != &kas); + ASSERT(AS_WRITE_HELD(as, &as->a_lock)); + for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) + swap += seg_swresv(seg); + + return (swap); +} + +/* * Systemcall entry point for zone_enter(). * * The current process is injected into said zone. In the process @@ -4043,6 +4337,7 @@ zone_enter(zoneid_t zoneid) zone_status_t status; int err = 0; rctl_entity_p_t e; + size_t swap; if (secpolicy_zone_config(CRED()) != 0) return (set_errno(EPERM)); @@ -4205,6 +4500,15 @@ zone_enter(zoneid_t zoneid) goto out; } + /* + * a_lock must be held while transfering locked memory and swap + * reservation from the global zone to the non global zone because + * asynchronous faults on the processes' address space can lock + * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE + * segments respectively. 
+ */ + AS_LOCK_ENTER(pp->as, &pp->p_as->a_lock, RW_WRITER); + swap = as_swresv(); mutex_enter(&pp->p_lock); zone_proj0 = zone->zone_zsched->p_task->tk_proj; /* verify that we do not exceed and task or lwp limits */ @@ -4216,10 +4520,11 @@ zone_enter(zoneid_t zoneid) zone_proj0->kpj_ntasks += 1; mutex_exit(&zone->zone_nlwps_lock); - mutex_enter(&zone->zone_rctl_lock); + mutex_enter(&zone->zone_mem_lock); zone->zone_locked_mem += pp->p_locked_mem; zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem; - mutex_exit(&zone->zone_rctl_lock); + zone->zone_max_swap += swap; + mutex_exit(&zone->zone_mem_lock); /* remove lwps from proc's old zone and old project */ mutex_enter(&pp->p_zone->zone_nlwps_lock); @@ -4227,12 +4532,14 @@ zone_enter(zoneid_t zoneid) pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt; mutex_exit(&pp->p_zone->zone_nlwps_lock); - mutex_enter(&pp->p_zone->zone_rctl_lock); + mutex_enter(&pp->p_zone->zone_mem_lock); pp->p_zone->zone_locked_mem -= pp->p_locked_mem; pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem; - mutex_exit(&pp->p_zone->zone_rctl_lock); + pp->p_zone->zone_max_swap -= swap; + mutex_exit(&pp->p_zone->zone_mem_lock); mutex_exit(&pp->p_lock); + AS_LOCK_EXIT(pp->p_as, &pp->p_as->a_lock); /* * Joining the zone cannot fail from now on. @@ -4289,6 +4596,31 @@ zone_enter(zoneid_t zoneid) sess_rele(pp->p_sessp, B_TRUE); pp->p_sessp = sp; pgjoin(pp, zone->zone_zsched->p_pidp); + + /* + * If there is a default scheduling class for the zone and it is not + * the class we are currently in, change all of the threads in the + * process to the new class. We need to be holding pidlock & p_lock + * when we call parmsset so this is a good place to do it. + */ + if (zone->zone_defaultcid > 0 && + zone->zone_defaultcid != curthread->t_cid) { + pcparms_t pcparms; + kthread_id_t t; + + pcparms.pc_cid = zone->zone_defaultcid; + pcparms.pc_clparms[0] = 0; + + /* + * If setting the class fails, we still want to enter the zone. + */ + if ((t = pp->p_tlist) != NULL) { + do { + (void) parmsset(&pcparms, t); + } while ((t = t->t_forw) != pp->p_tlist); + } + } + mutex_exit(&pp->p_lock); mutex_exit(&pidlock); diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile index ab103ef4c7..4493f99454 100644 --- a/usr/src/uts/common/sys/Makefile +++ b/usr/src/uts/common/sys/Makefile @@ -544,6 +544,7 @@ CHKHDRS= \ visual_io.h \ vlan.h \ vm.h \ + vm_usage.h \ vmem.h \ vmem_impl.h \ vmmeter.h \ diff --git a/usr/src/uts/common/sys/modhash_impl.h b/usr/src/uts/common/sys/modhash_impl.h index 25e45cec23..a187eb68ee 100644 --- a/usr/src/uts/common/sys/modhash_impl.h +++ b/usr/src/uts/common/sys/modhash_impl.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -93,6 +92,18 @@ struct mod_hash { */ void mod_hash_init(void); +/* + * Internal routines. Use directly with care. 
+ */ +uint_t i_mod_hash(mod_hash_t *, mod_hash_key_t); +int i_mod_hash_insert_nosync(mod_hash_t *, mod_hash_key_t, mod_hash_val_t, + mod_hash_hndl_t); +int i_mod_hash_remove_nosync(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *); +int i_mod_hash_find_nosync(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *); +void i_mod_hash_walk_nosync(mod_hash_t *, uint_t (*)(mod_hash_key_t, + mod_hash_val_t *, void *), void *); +void i_mod_hash_clear_nosync(mod_hash_t *hash); + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/sys/priocntl.h b/usr/src/uts/common/sys/priocntl.h index ca1a92400a..6475ed0a4c 100644 --- a/usr/src/uts/common/sys/priocntl.h +++ b/usr/src/uts/common/sys/priocntl.h @@ -65,6 +65,7 @@ extern long priocntl(), priocntlset(); #define PC_SETXPARMS 7 /* Set extended scheduling parameters */ #define PC_GETXPARMS 8 /* Get extended scheduling parameters */ #define PC_SETDFLCL 9 /* Set default class, not for general use */ +#define PC_GETDFLCL 10 /* Get default class, not for general use */ #define PC_CLNULL -1 diff --git a/usr/src/uts/common/sys/proc.h b/usr/src/uts/common/sys/proc.h index fcf953262c..9a0ba2cc37 100644 --- a/usr/src/uts/common/sys/proc.h +++ b/usr/src/uts/common/sys/proc.h @@ -613,6 +613,8 @@ extern proc_t *pgfind(pid_t); extern proc_t *pgfind_zone(pid_t, zoneid_t); extern proc_t *sprlock(pid_t); extern proc_t *sprlock_zone(pid_t, zoneid_t); +extern int sprtrylock_proc(proc_t *); +extern void sprwaitlock_proc(proc_t *); extern void sprlock_proc(proc_t *); extern void sprunlock(proc_t *); extern void pid_init(void); diff --git a/usr/src/uts/common/sys/project.h b/usr/src/uts/common/sys/project.h index 679c1eddc2..5018df8499 100644 --- a/usr/src/uts/common/sys/project.h +++ b/usr/src/uts/common/sys/project.h @@ -28,15 +28,24 @@ #pragma ident "%Z%%M% %I% %E% SMI" + #ifdef __cplusplus extern "C" { #endif + +#include <sys/kstat.h> #include <sys/types.h> #include <sys/mutex.h> #include <sys/rctl.h> #include <sys/ipc_rctl.h> +typedef struct kproject_kstat { + kstat_named_t kpk_zonename; + kstat_named_t kpk_usage; + kstat_named_t kpk_value; +} kproject_kstat_t; + typedef struct kproject_data { /* Datum protected by: */ rctl_qty_t kpd_shmmax; /* shm's ipcs_lock */ ipc_rqty_t kpd_ipc; /* shm|sem|msg's ipcs lock */ @@ -44,6 +53,7 @@ typedef struct kproject_data { /* Datum protected by: */ rctl_qty_t kpd_locked_mem_ctl; /* kpj_rctls->rcs_lock */ rctl_qty_t kpd_contract; /* contract_lock */ rctl_qty_t kpd_crypto_mem; /* crypto_rctl_lock */ + kstat_t *kpd_lockedmem_kstat; /* locked memory kstat */ } kproject_data_t; @@ -76,9 +86,11 @@ typedef struct kproject { #define PROJECT_HOLD_FIND 1 #define PROJECT_HOLD_INSERT 2 +struct zone; + void project_init(void); kproject_t *project_hold(kproject_t *); -kproject_t *project_hold_by_id(projid_t, zoneid_t, int); +kproject_t *project_hold_by_id(projid_t, struct zone *, int); void project_rele(kproject_t *); int project_walk_all(zoneid_t, int (*)(kproject_t *, void *), void *); projid_t curprojid(void); diff --git a/usr/src/uts/common/sys/rctl.h b/usr/src/uts/common/sys/rctl.h index eb56fff9e5..a8480c2768 100644 --- a/usr/src/uts/common/sys/rctl.h +++ b/usr/src/uts/common/sys/rctl.h @@ -168,6 +168,7 @@ struct proc; struct task; struct kproject; struct zone; +struct kstat; typedef struct rctl_entity_p_struct { rctl_entity_t rcep_t; @@ -324,6 +325,14 @@ int rctl_incr_locked_mem(struct proc *, struct kproject *, rctl_qty_t, int); void rctl_decr_locked_mem(struct proc *, struct kproject *, rctl_qty_t, int); +int 
rctl_incr_swap(struct proc *, struct zone *, size_t); +void rctl_decr_swap(struct zone *, size_t); + +struct kstat *rctl_kstat_create_zone(struct zone *, char *, uchar_t, uint_t, + uchar_t); + +struct kstat *rctl_kstat_create_project(struct kproject *, char *, uchar_t, + uint_t, uchar_t); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/sys/resource.h b/usr/src/uts/common/sys/resource.h index 86cc716d56..bf02808d4b 100644 --- a/usr/src/uts/common/sys/resource.h +++ b/usr/src/uts/common/sys/resource.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -191,6 +190,7 @@ struct rusage { #define _RUSAGESYS_GETRUSAGE 0 /* rusage process */ #define _RUSAGESYS_GETRUSAGE_CHLD 1 /* rusage child process */ #define _RUSAGESYS_GETRUSAGE_LWP 2 /* rusage lwp */ +#define _RUSAGESYS_GETVMUSAGE 3 /* getvmusage */ #if defined(_SYSCALL32) diff --git a/usr/src/uts/common/sys/syscall.h b/usr/src/uts/common/sys/syscall.h index 96cb967023..eedadfa0c0 100644 --- a/usr/src/uts/common/sys/syscall.h +++ b/usr/src/uts/common/sys/syscall.h @@ -384,7 +384,8 @@ extern "C" { #define SYS_rusagesys 181 /* * subcodes: - * getrusage(...) :: rusagesys(RUSAGESYS_GETRUSAGE,...) + * getrusage(...) :: rusagesys(RUSAGESYS_GETRUSAGE, ...) + * getvmusage(...) :: rusagesys(RUSAGESYS_GETVMUSAGE, ...) */ #define SYS_port 182 /* diff --git a/usr/src/uts/common/sys/vm_usage.h b/usr/src/uts/common/sys/vm_usage.h new file mode 100644 index 0000000000..5f8c8b8fe5 --- /dev/null +++ b/usr/src/uts/common/sys/vm_usage.h @@ -0,0 +1,120 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_VM_USAGE_H +#define _SYS_VM_USAGE_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The flags passed to getvmusage() request how to aggregate rss/swap results. + * Results can be aggregated by zone, project, task, ruser, and/or euser. 
+ * + * If VMUSAGE_ALL_* or VMUSAGE_COL_* are passed from a non-global-zone, the + * flag is treated as VMUSAGE_*. For example, VMUSAGE_ALL_ZONES would be + * treated as VMUSAGE_ZONE. + * + * If VMUSAGE_SYSTEM is passed from a non-global zone, a result of type + * VMUSAGE_SYSTEM will be returned, but it will only reflect the usage + * of the calling zone. + * + * VMUSAGE_* requests results for the calling zone. + * VMUSAGE_ALL_* requests results for all zones. + * VMUSAGE_COL_* requests results for all zones, but collapses out the zoneid. + * For example, VMUSAGE_COL_PROJECTS requests results for all + * projects in all zones, and project N in ANY zone is treated + * as the same project. + */ +#define VMUSAGE_SYSTEM 0x1 /* rss/swap for ALL processes */ +#define VMUSAGE_ZONE 0x2 /* rss/swap for caller's zone */ +#define VMUSAGE_PROJECTS 0x4 /* rss/swap for all projects in */ + /* caller's zone */ +#define VMUSAGE_TASKS 0x8 /* rss/swap for all tasks in */ + /* caller's zones */ +#define VMUSAGE_RUSERS 0x10 /* rss/swap for all users (by process */ + /* ruser) in the caller's zone */ +#define VMUSAGE_EUSERS 0x20 /* same as VMUSAGE_RUSERS, but by */ + /* euser */ + +#define VMUSAGE_ALL_ZONES 0x40 /* rss/swap for all zones */ +#define VMUSAGE_ALL_PROJECTS 0x80 /* rss/swap for all projects in */ + /* all zones */ +#define VMUSAGE_ALL_TASKS 0x100 /* rss/swap for all tasks in all */ + /* zones */ +#define VMUSAGE_ALL_RUSERS 0x200 /* rss/swap for all users (by process */ + /* ruser) in all zones */ +#define VMUSAGE_ALL_EUSERS 0x400 /* same as VMUSAGE_ALL_RUSERS, but by */ + /* euser */ + +#define VMUSAGE_COL_PROJECTS 0x800 /* rss/swap for all projects in */ + /* all zones. Collapse zoneid. */ +#define VMUSAGE_COL_RUSERS 0x1000 /* rss/swap for all users (by process */ + /* ruser), in all zones. Collapse */ + /* zoneid */ +#define VMUSAGE_COL_EUSERS 0x2000 /* same as VMUSAGE_COL_RUSERS, but by */ + /* euser */ + +#define VMUSAGE_MASK 0x3fff /* all valid flags for getvmusage() */ + +typedef struct vmusage { + id_t vmu_zoneid; /* zoneid, or ALL_ZONES for */ + /* VMUSAGE_COL_* results */ + /* ALL_ZONES means that the result */ + /* reflects swap and rss usage for */ + /* a projid/uid across all zones */ + uint_t vmu_type; /* Entity type of result. One of: */ + /* VMUSAGE_(SYSTEM|ZONE|PROJECTS| */ + /* TASKS|RUSERS|EUSERS) */ + id_t vmu_id; /* zoneid, projid, taskid, ... 
*/ + size_t vmu_rss_all; /* total resident memory of entity */ + /* in bytes */ + size_t vmu_rss_private; /* total resident private memory */ + size_t vmu_rss_shared; /* total resident shared memory */ + size_t vmu_swap_all; /* total swap reserved, in bytes */ + size_t vmu_swap_private; /* swap reserved for private mappings */ + size_t vmu_swap_shared; /* swap reserved for shared mappings */ + +} vmusage_t; + +extern int getvmusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres); + +#ifdef _KERNEL + +int vm_getusage(uint_t, time_t, vmusage_t *, size_t *); +void vm_usage_init(); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VM_USAGE_H */ diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h index daccd16bdf..94646bc976 100644 --- a/usr/src/uts/common/sys/zone.h +++ b/usr/src/uts/common/sys/zone.h @@ -88,6 +88,8 @@ extern "C" { #define ZONE_ATTR_INITNAME 9 #define ZONE_ATTR_BOOTARGS 10 #define ZONE_ATTR_BRAND 11 +#define ZONE_ATTR_PHYS_MCAP 12 +#define ZONE_ATTR_SCHED_CLASS 13 /* Start of the brand-specific attribute namespace */ #define ZONE_ATTR_BRAND_ATTRS 32768 @@ -280,6 +282,15 @@ typedef struct zone_dataset { list_node_t zd_linkage; } zone_dataset_t; +/* + * structure for zone kstats + */ +typedef struct zone_kstat { + kstat_named_t zk_zonename; + kstat_named_t zk_usage; + kstat_named_t zk_value; +} zone_kstat_t; + typedef struct zone { /* * zone_name is never modified once set. @@ -326,14 +337,20 @@ typedef struct zone { uint_t zone_rootpathlen; /* strlen(zone_rootpath) + 1 */ uint32_t zone_shares; /* FSS shares allocated to zone */ rctl_set_t *zone_rctls; /* zone-wide (zone.*) rctls */ - kmutex_t zone_rctl_lock; /* protects zone_locked_mem and */ + kmutex_t zone_mem_lock; /* protects zone_locked_mem and */ /* kpd_locked_mem for all */ - /* projects in zone */ + /* projects in zone. */ + /* Also protects zone_max_swap */ /* grab after p_lock, before rcs_lock */ - rctl_qty_t zone_locked_mem; /* bytes of locked memory in zone */ - rctl_qty_t zone_locked_mem_ctl; /* current locked memory */ + rctl_qty_t zone_locked_mem; /* bytes of locked memory in */ + /* zone */ + rctl_qty_t zone_locked_mem_ctl; /* Current locked memory */ /* limit. Protected by */ /* zone_rctls->rcs_lock */ + rctl_qty_t zone_max_swap; /* bytes of swap reserved by zone */ + rctl_qty_t zone_max_swap_ctl; /* current swap limit. */ + /* Protected by */ + /* zone_rctls->rcs_lock */ list_t zone_zsd; /* list of Zone-Specific Data values */ kcondvar_t zone_cv; /* used to signal state changes */ struct proc *zone_zsched; /* Dummy kernel "zsched" process */ @@ -341,6 +358,7 @@ typedef struct zone { char *zone_initname; /* fs path to 'init' */ int zone_boot_err; /* for zone_boot() if boot fails */ char *zone_bootargs; /* arguments passed via zone_boot() */ + uint64_t zone_phys_mcap; /* physical memory cap */ /* * zone_kthreads is protected by zone_status_lock. */ @@ -376,6 +394,9 @@ typedef struct zone { boolean_t zone_restart_init; /* Restart init if it dies? 
*/ struct brand *zone_brand; /* zone's brand */ + id_t zone_defaultcid; /* dflt scheduling class id */ + kstat_t *zone_swapresv_kstat; + kstat_t *zone_lockedmem_kstat; } zone_t; /* @@ -553,6 +574,7 @@ extern void mount_completed(void); extern int zone_walk(int (*)(zone_t *, void *), void *); extern rctl_hndl_t rc_zone_locked_mem; +extern rctl_hndl_t rc_zone_max_swap; #endif /* _KERNEL */ diff --git a/usr/src/uts/common/syscall/processor_bind.c b/usr/src/uts/common/syscall/processor_bind.c index 10ca1178d5..bd416e43e6 100644 --- a/usr/src/uts/common/syscall/processor_bind.c +++ b/usr/src/uts/common/syscall/processor_bind.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -285,9 +284,10 @@ processor_bind(idtype_t idtype, id_t id, processorid_t bind, break; case P_PROJID: + pp = curproc; if (id == P_MYID) id = curprojid(); - if ((kpj = project_hold_by_id(id, getzoneid(), + if ((kpj = project_hold_by_id(id, pp->p_zone, PROJECT_HOLD_FIND)) == NULL) { ret = ESRCH; } else { diff --git a/usr/src/uts/common/syscall/pset.c b/usr/src/uts/common/syscall/pset.c index 5d3b7e6233..767529fc5d 100644 --- a/usr/src/uts/common/syscall/pset.c +++ b/usr/src/uts/common/syscall/pset.c @@ -542,9 +542,10 @@ pset_bind(psetid_t pset, idtype_t idtype, id_t id, psetid_t *opset) break; case P_PROJID: + pp = curproc; if (id == P_MYID) id = curprojid(); - if ((kpj = project_hold_by_id(id, getzoneid(), + if ((kpj = project_hold_by_id(id, pp->p_zone, PROJECT_HOLD_FIND)) == NULL) { error = ESRCH; break; diff --git a/usr/src/uts/common/syscall/rusagesys.c b/usr/src/uts/common/syscall/rusagesys.c index 3e09643981..036500932f 100644 --- a/usr/src/uts/common/syscall/rusagesys.c +++ b/usr/src/uts/common/syscall/rusagesys.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -35,6 +34,7 @@ #include <sys/time.h> #include <sys/errno.h> #include <sys/resource.h> +#include <sys/vm_usage.h> static int getrusage(void *user_rusage) @@ -246,16 +246,19 @@ getrusage_lwp(void *user_rusage) } int -rusagesys(int code, void * arg) +rusagesys(int code, void *arg1, void *arg2, void *arg3, void *arg4) { switch (code) { case _RUSAGESYS_GETRUSAGE: - return (getrusage(arg)); + return (getrusage(arg1)); case _RUSAGESYS_GETRUSAGE_CHLD: - return (getrusage_chld(arg)); + return (getrusage_chld(arg1)); case _RUSAGESYS_GETRUSAGE_LWP: - return (getrusage_lwp(arg)); + return (getrusage_lwp(arg1)); + case _RUSAGESYS_GETVMUSAGE: + return (vm_getusage((uint_t)(uintptr_t)arg1, (time_t)arg2, + (vmusage_t *)arg3, (size_t *)arg4)); default: return (set_errno(EINVAL)); } diff --git a/usr/src/uts/common/syscall/tasksys.c b/usr/src/uts/common/syscall/tasksys.c index 705b543a37..bec091e61c 100644 --- a/usr/src/uts/common/syscall/tasksys.c +++ b/usr/src/uts/common/syscall/tasksys.c @@ -25,6 +25,7 @@ #pragma ident "%Z%%M% %I% %E% SMI" + /* * System calls for creating and inquiring about tasks and projects */ @@ -102,7 +103,7 @@ tasksys_settaskid(projid_t projid, uint_t flags) * Put a hold on our new project and make sure that nobody is * trying to bind it to a pool while we're joining. */ - kpj = project_hold_by_id(projid, getzoneid(), PROJECT_HOLD_INSERT); + kpj = project_hold_by_id(projid, p->p_zone, PROJECT_HOLD_INSERT); e.rcep_p.proj = kpj; e.rcep_t = RCENTITY_PROJECT; @@ -111,7 +112,7 @@ tasksys_settaskid(projid_t projid, uint_t flags) zone = p->p_zone; mutex_enter(&zone->zone_nlwps_lock); - mutex_enter(&zone->zone_rctl_lock); + mutex_enter(&zone->zone_mem_lock); if (kpj->kpj_nlwps + p->p_lwpcnt > kpj->kpj_nlwps_ctl) if (rctl_test_entity(rc_project_nlwps, kpj->kpj_rctls, p, &e, @@ -130,7 +131,7 @@ tasksys_settaskid(projid_t projid, uint_t flags) rctlfail = 1; if (rctlfail) { - mutex_exit(&zone->zone_rctl_lock); + mutex_exit(&zone->zone_mem_lock); mutex_exit(&zone->zone_nlwps_lock); if (curthread != p->p_agenttp) continuelwps(p); @@ -144,7 +145,7 @@ tasksys_settaskid(projid_t projid, uint_t flags) oldpj->kpj_data.kpd_locked_mem -= p->p_locked_mem; oldpj->kpj_nlwps -= p->p_lwpcnt; - mutex_exit(&zone->zone_rctl_lock); + mutex_exit(&zone->zone_mem_lock); mutex_exit(&zone->zone_nlwps_lock); mutex_exit(&p->p_lock); diff --git a/usr/src/uts/common/vm/anon.h b/usr/src/uts/common/vm/anon.h index 90f6e1e661..ed59ec590b 100644 --- a/usr/src/uts/common/vm/anon.h +++ b/usr/src/uts/common/vm/anon.h @@ -42,6 +42,7 @@ #pragma ident "%Z%%M% %I% %E% SMI" #include <sys/cred.h> +#include <sys/zone.h> #include <vm/seg.h> #include <vm/vpage.h> @@ -387,8 +388,8 @@ extern int anon_map_demotepages(struct anon_map *, ulong_t, struct seg *, caddr_t, uint_t, struct vpage [], struct cred *); extern void anon_shmap_free_pages(struct anon_map *, ulong_t, size_t); -extern int anon_resvmem(size_t, uint_t); -extern void anon_unresv(size_t); +extern int anon_resvmem(size_t, boolean_t, zone_t *); +extern void anon_unresvmem(size_t, zone_t *); extern struct anon_map *anonmap_alloc(size_t, size_t); extern void anonmap_free(struct anon_map *); extern void anon_decref(struct anon *); @@ -416,9 +417,16 @@ extern void anon_array_exit(anon_sync_obj_t *); * request and if so, reserves the appropriate anonymous memory resources. * anon_checkspace just checks to see if there is space to fulfill the request, * without taking any resources. Both return 1 if successful and 0 if not. 
+ * + * Macros are provided as anon reservation is usually charged to the zone of + * the current process. In some cases (such as anon reserved by tmpfs), a + * zone pointer is needed to charge the appropriate zone. */ -#define anon_resv(size) anon_resvmem((size), 1) -#define anon_checkspace(size) anon_resvmem((size), 0) +#define anon_unresv(size) anon_unresvmem(size, curproc->p_zone) +#define anon_unresv_zone(size, zone) anon_unresvmem(size, zone) +#define anon_resv(size) anon_resvmem((size), 1, curproc->p_zone) +#define anon_resv_zone(size, zone) anon_resvmem((size), 1, zone) +#define anon_checkspace(size, zone) anon_resvmem((size), 0, zone) /* * Flags to anon_private diff --git a/usr/src/uts/common/vm/seg.h b/usr/src/uts/common/vm/seg.h index 0ee7d62ce1..a9683c0e54 100644 --- a/usr/src/uts/common/vm/seg.h +++ b/usr/src/uts/common/vm/seg.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -245,6 +244,9 @@ uint_t seg_pages(struct seg *); #endif /* VMDEBUG */ +boolean_t seg_can_change_zones(struct seg *); +size_t seg_swresv(struct seg *); + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/vm/seg_kp.c b/usr/src/uts/common/vm/seg_kp.c index ff9c47e0ff..d58e873a19 100644 --- a/usr/src/uts/common/vm/seg_kp.c +++ b/usr/src/uts/common/vm/seg_kp.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -147,6 +146,7 @@ uint32_t red_closest = UINT_MAX; uint32_t red_ndoubles; pgcnt_t anon_segkp_pages_locked; /* See vm/anon.h */ +pgcnt_t anon_segkp_pages_resv; /* anon reserved by seg_kp */ static struct seg_ops segkp_ops = { SEGKP_BADOP(int), /* dup */ @@ -448,8 +448,10 @@ segkp_get_internal( * Note that we don't need swap space for the red zone page. */ if (amp != NULL) { - ASSERT((flags & KPD_NO_ANON) == 0); - /* The reserve has been done and the anon_hdr is separate. */ + /* + * The swap reservation has been done, if required, and the + * anon_hdr is separate. 
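As an aside, a minimal sketch (not part of this patch) of how an in-kernel consumer other than tmpfs might use the zone-aware reservation macros added to vm/anon.h above, charging swap to an explicit zone and releasing exactly what it reserved; the xx_ names are hypothetical:

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/zone.h>
#include <vm/anon.h>

/* Reserve 'bytes' of anonymous backing, charged to 'zone'. */
static int
xx_reserve_backing(size_t bytes, zone_t *zone)
{
        /* Non-charging probe: is there any space at all? */
        if (!anon_checkspace(bytes, zone))
                return (ENOMEM);

        /* Reserve for real; this also charges zone.max-swap for 'zone'. */
        if (anon_resv_zone(bytes, zone) == 0)
                return (ENOMEM);

        return (0);
}

/* Undo a successful xx_reserve_backing() with the same byte count. */
static void
xx_release_backing(size_t bytes, zone_t *zone)
{
        anon_unresv_zone(bytes, zone);
}

The plain anon_resv()/anon_unresv() forms keep their old call sites and simply charge the reservation to curproc's zone.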
+ */ anon_idx = 0; kpd->kp_anon_idx = anon_idx; kpd->kp_anon = amp->ahp; @@ -458,7 +460,7 @@ segkp_get_internal( kpd, vbase, len, flags, 1); } else if ((flags & KPD_NO_ANON) == 0) { - if (anon_resv(SEGKP_MAPLEN(len, flags)) == 0) { + if (anon_resv_zone(SEGKP_MAPLEN(len, flags), NULL) == 0) { if (flags & KPD_LOCKED) { atomic_add_long(&anon_segkp_pages_locked, -pages); @@ -468,6 +470,8 @@ segkp_get_internal( kmem_free(kpd, sizeof (struct segkp_data)); return (NULL); } + atomic_add_long(&anon_segkp_pages_resv, + btop(SEGKP_MAPLEN(len, flags))); anon_idx = ((uintptr_t)(vbase - s_base)) >> PAGESHIFT; kpd->kp_anon_idx = anon_idx; kpd->kp_anon = kpsd->kpsd_anon; @@ -704,7 +708,9 @@ segkp_release_internal(struct seg *seg, struct segkp_data *kpd, size_t len) if ((kpd->kp_flags & KPD_HASAMP) == 0) { anon_free(kpd->kp_anon, kpd->kp_anon_idx + i, PAGESIZE); - anon_unresv(PAGESIZE); + anon_unresv_zone(PAGESIZE, NULL); + atomic_add_long(&anon_segkp_pages_resv, + -1); } TRACE_5(TR_FAC_VM, TR_ANON_SEGKP, "anon segkp:%p %p %lu %u %u", diff --git a/usr/src/uts/common/vm/seg_vn.c b/usr/src/uts/common/vm/seg_vn.c index f48db44acc..e2069b27c6 100644 --- a/usr/src/uts/common/vm/seg_vn.c +++ b/usr/src/uts/common/vm/seg_vn.c @@ -2323,8 +2323,9 @@ segvn_faultpage( * zeroes. If no advance reservations, reserve now. */ if (svd->flags & MAP_NORESERVE) { - if (anon_resv(ptob(1))) { - svd->swresv += ptob(1); + if (anon_resv_zone(ptob(1), + seg->s_as->a_proc->p_zone)) { + atomic_add_long(&svd->swresv, ptob(1)); } else { err = ENOMEM; goto out; diff --git a/usr/src/uts/common/vm/vm_anon.c b/usr/src/uts/common/vm/vm_anon.c index 0cad34257c..3f225a345a 100644 --- a/usr/src/uts/common/vm/vm_anon.c +++ b/usr/src/uts/common/vm/vm_anon.c @@ -113,6 +113,7 @@ #include <sys/policy.h> #include <sys/condvar_impl.h> #include <sys/mutex_impl.h> +#include <sys/rctl.h> #include <vm/as.h> #include <vm/hat.h> @@ -729,12 +730,22 @@ set_anoninfo(void) * Return non-zero on success. */ int -anon_resvmem(size_t size, uint_t takemem) +anon_resvmem(size_t size, boolean_t takemem, zone_t *zone) { pgcnt_t npages = btopr(size); pgcnt_t mswap_pages = 0; pgcnt_t pswap_pages = 0; + proc_t *p = curproc; + if (zone != NULL && takemem) { + /* test zone.max-swap resource control */ + mutex_enter(&p->p_lock); + if (rctl_incr_swap(p, zone, ptob(npages)) != 0) { + mutex_exit(&p->p_lock); + return (0); + } + mutex_exit(&p->p_lock); + } mutex_enter(&anoninfo_lock); /* @@ -834,16 +845,17 @@ anon_resvmem(size_t size, uint_t takemem) mutex_exit(&anoninfo_lock); ANON_PRINT(A_RESV, ("anon_resvmem: not enough space from swapfs\n")); + if (zone != NULL && takemem) + rctl_decr_swap(zone, ptob(npages)); return (0); } } - /* * Give back an anon reservation. 
*/ void -anon_unresv(size_t size) +anon_unresvmem(size_t size, zone_t *zone) { pgcnt_t npages = btopr(size); spgcnt_t mem_free_pages = 0; @@ -851,6 +863,8 @@ anon_unresv(size_t size) #ifdef ANON_DEBUG pgcnt_t mem_resv; #endif + if (zone != NULL) + rctl_decr_swap(zone, size); mutex_enter(&anoninfo_lock); diff --git a/usr/src/uts/common/vm/vm_page.c b/usr/src/uts/common/vm/vm_page.c index 05bfe662be..adac07b766 100644 --- a/usr/src/uts/common/vm/vm_page.c +++ b/usr/src/uts/common/vm/vm_page.c @@ -77,7 +77,7 @@ #include <vm/pvn.h> #include <vm/seg_kmem.h> #include <vm/vm_dep.h> - +#include <sys/vm_usage.h> #include <fs/fs_subr.h> static int nopageage = 0; @@ -343,6 +343,7 @@ vm_init(void) (void) callb_add(callb_vm_cpr, 0, CB_CL_CPR_VM, "vm"); page_init_mem_config(); page_retire_init(); + vm_usage_init(); } /* diff --git a/usr/src/uts/common/vm/vm_seg.c b/usr/src/uts/common/vm/vm_seg.c index 50cc21cdf7..aed892969d 100644 --- a/usr/src/uts/common/vm/vm_seg.c +++ b/usr/src/uts/common/vm/vm_seg.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -54,12 +53,14 @@ #include <sys/cmn_err.h> #include <sys/callb.h> #include <sys/mem_config.h> +#include <sys/mman.h> #include <vm/hat.h> #include <vm/as.h> #include <vm/seg.h> #include <vm/seg_kmem.h> - +#include <vm/seg_spt.h> +#include <vm/seg_vn.h> /* * kstats for segment advise */ @@ -950,3 +951,48 @@ seg_pinit_mem_config(void) */ ASSERT(ret == 0); } + +extern struct seg_ops segvn_ops; +extern struct seg_ops segspt_shmops; + +/* + * Verify that segment is not a shared anonymous segment which reserves + * swap. zone.max-swap accounting (zone->zone_max_swap) cannot be transfered + * from one zone to another if any segments are shared. This is because the + * last process to exit will credit the swap reservation. This could lead + * to the swap being reserved by one zone, and credited to another. + */ +boolean_t +seg_can_change_zones(struct seg *seg) +{ + struct segvn_data *svd; + + if (seg->s_ops == &segspt_shmops) + return (B_FALSE); + + if (seg->s_ops == &segvn_ops) { + svd = (struct segvn_data *)seg->s_data; + if (svd->type == MAP_SHARED && + svd->amp != NULL && + svd->amp->swresv > 0) + return (B_FALSE); + } + return (B_TRUE); +} + +/* + * Return swap reserved by a segment backing a private mapping. 
+ */ +size_t +seg_swresv(struct seg *seg) +{ + struct segvn_data *svd; + size_t swap = 0; + + if (seg->s_ops == &segvn_ops) { + svd = (struct segvn_data *)seg->s_data; + if (svd->type == MAP_PRIVATE && svd->swresv > 0) + swap = svd->swresv; + } + return (swap); +} diff --git a/usr/src/uts/common/vm/vm_usage.c b/usr/src/uts/common/vm/vm_usage.c new file mode 100644 index 0000000000..32a8811e10 --- /dev/null +++ b/usr/src/uts/common/vm/vm_usage.c @@ -0,0 +1,1978 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * vm_usage + * + * This file implements the getvmusage() private system call. + * getvmusage() counts the amount of resident memory pages and swap + * reserved by the specified process collective. A "process collective" is + * the set of processes owned by a particular, zone, project, task, or user. + * + * rss and swap are counted so that for a given process collective, a page is + * only counted once. For example, this means that if multiple processes in + * the same project map the same page, then the project will only be charged + * once for that page. On the other hand, if two processes in different + * projects map the same page, then both projects will be charged + * for the page. + * + * The vm_getusage() calculation is implemented so that the first thread + * performs the rss/swap counting. Other callers will wait for that thread to + * finish, copying the results. This enables multiple rcapds and prstats to + * consume data from the same calculation. The results are also cached so that + * a caller interested in recent results can just copy them instead of starting + * a new calculation. The caller passes the maximium age (in seconds) of the + * data. If the cached data is young enough, the cache is copied, otherwise, + * a new calculation is executed and the cache is replaced with the new + * data. + * + * The rss calculation for each process collective is as follows: + * + * - Inspect flags, determine if counting rss for zones, projects, tasks, + * and/or users. + * - For each proc: + * - Figure out proc's collectives (zone, project, task, and/or user). + * - For each seg in proc's address space: + * - If seg is private: + * - Lookup anons in the amp. + * - For incore pages not previously visited each of the + * proc's collectives, add incore pagesize to each. + * collective. + * Anon's with a refcnt of 1 can be assummed to be not + * previously visited. + * - For address ranges without anons in the amp: + * - Lookup pages in underlying vnode. 
+ * - For incore pages not previously visiting for + * each of the proc's collectives, add incore + * pagesize to each collective. + * - If seg is shared: + * - Lookup pages in the shared amp or vnode. + * - For incore pages not previously visited for each of + * the proc's collectives, add incore pagesize to each + * collective. + * + * Swap is reserved by private segments, and shared anonymous segments. + * The only shared anon segments which do not reserve swap are ISM segments + * and schedctl segments, both of which can be identified by having + * amp->swresv == 0. + * + * The swap calculation for each collective is as follows: + * + * - Inspect flags, determine if counting rss for zones, projects, tasks, + * and/or users. + * - For each proc: + * - Figure out proc's collectives (zone, project, task, and/or user). + * - For each seg in proc's address space: + * - If seg is private: + * - Add svd->swresv pages to swap count for each of the + * proc's collectives. + * - If seg is anon, shared, and amp->swresv != 0 + * - For address ranges in amp not previously visited for + * each of the proc's collectives, add size of address + * range to the swap count for each collective. + * + * These two calculations are done simultaneously, with most of the work + * being done in vmu_calculate_seg(). The results of the calculation are + * copied into "vmu_data.vmu_cache_results". + * + * To perform the calculation, various things are tracked and cached: + * + * - incore/not-incore page ranges for all vnodes. + * (vmu_data.vmu_all_vnodes_hash) + * This eliminates looking up the same page more than once. + * + * - incore/not-incore page ranges for all shared amps. + * (vmu_data.vmu_all_amps_hash) + * This eliminates looking up the same page more than once. + * + * - visited page ranges for each collective. + * - per vnode (entity->vme_vnode_hash) + * - per shared amp (entity->vme_amp_hash) + * For accurate counting of map-shared and cow-shared pages. + * + * - visited private anons (refcnt > 1) for each collective. + * (entity->vme_anon_hash) + * For accurate counting of cow-shared pages. + * + * The common accounting structure is the vmu_entity_t, which represents + * collectives: + * + * - A zone. + * - A project, task, or user within a zone. + * - The entire system (vmu_data.vmu_system). + * - Each collapsed (col) project and user. This means a given projid or + * uid, regardless of which zone the process is in. For instance, + * project 0 in the global zone and project 0 in a non global zone are + * the same collapsed project. + * + * Each entity structure tracks which pages have been already visited for + * that entity (via previously inspected processes) so that these pages are + * not double counted. 
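A toy model (not from the source) of the charge-once-per-collective rule described above: each collective keeps its own visited set, so a page shared by two processes in the same project is charged to that project once, while the same page mapped by processes in two different projects is charged to both.

#include <stddef.h>

#define XX_NPAGES       64              /* toy page-index space */

typedef struct xx_collective {
        size_t          xc_rss_bytes;           /* accumulated rss */
        unsigned char   xc_seen[XX_NPAGES];     /* visited-page set */
} xx_collective_t;

/*
 * Charge one resident page (by index) to a collective; repeat visits
 * from other processes in the same collective are ignored.
 */
static void
xx_charge_page(xx_collective_t *c, unsigned int page, size_t pagesize)
{
        if (page >= XX_NPAGES || c->xc_seen[page])
                return;
        c->xc_seen[page] = 1;
        c->xc_rss_bytes += pagesize;
}

The real code keys its visited sets by anon pointer or by (object, page-range) bound rather than by a flat page index; that is what the vme_anon_hash, vme_vnode_hash, and vme_amp_hash fields defined below provide.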
+ */ + +#include <sys/errno.h> +#include <sys/types.h> +#include <sys/zone.h> +#include <sys/proc.h> +#include <sys/project.h> +#include <sys/task.h> +#include <sys/thread.h> +#include <sys/time.h> +#include <sys/mman.h> +#include <sys/modhash.h> +#include <sys/modhash_impl.h> +#include <sys/shm.h> +#include <sys/swap.h> +#include <sys/synch.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/vm_usage.h> +#include <sys/zone.h> +#include <vm/anon.h> +#include <vm/as.h> +#include <vm/seg_vn.h> +#include <vm/seg_spt.h> + +#define VMUSAGE_HASH_SIZE 512 + +#define VMUSAGE_TYPE_VNODE 1 +#define VMUSAGE_TYPE_AMP 2 +#define VMUSAGE_TYPE_ANON 3 + +#define VMUSAGE_BOUND_UNKNOWN 0 +#define VMUSAGE_BOUND_INCORE 1 +#define VMUSAGE_BOUND_NOT_INCORE 2 + +/* + * bounds for vnodes and shared amps + * Each bound is either entirely incore, entirely not in core, or + * entirely unknown. bounds are stored in order by offset. + */ +typedef struct vmu_bound { + struct vmu_bound *vmb_next; + pgcnt_t vmb_start; /* page offset in vnode/amp on which bound starts */ + pgcnt_t vmb_end; /* page offset in vnode/amp on which bound ends */ + char vmb_type; /* One of VMUSAGE_BOUND_* */ +} vmu_bound_t; + +/* + * hash of visited objects (vnodes or shared amps) + * key is address of vnode or amp. Bounds lists known incore/non-incore + * bounds for vnode/amp. + */ +typedef struct vmu_object { + struct vmu_object *vmo_next; /* free list */ + caddr_t vmo_key; + short vmo_type; + vmu_bound_t *vmo_bounds; +} vmu_object_t; + +/* + * Entity by which to count results. + * + * The entity structure keeps the current rss/swap counts for each entity + * (zone, project, etc), and hashes of vm structures that have already + * been visited for the entity. + * + * vme_next: links the list of all entities currently being counted by + * vmu_calculate(). + * + * vme_next_calc: links the list of entities related to the current process + * being counted by vmu_calculate_proc(). + * + * vmu_calculate_proc() walks all processes. For each process, it makes a + * list of the entities related to that process using vme_next_calc. This + * list changes each time vmu_calculate_proc() is called. + * + */ +typedef struct vmu_entity { + struct vmu_entity *vme_next; + struct vmu_entity *vme_next_calc; + mod_hash_t *vme_vnode_hash; /* vnodes visited for entity */ + mod_hash_t *vme_amp_hash; /* shared amps visited for entity */ + mod_hash_t *vme_anon_hash; /* cow anons visited for entity */ + vmusage_t vme_result; /* identifies entity and results */ +} vmu_entity_t; + +/* + * Hash of entities visited within a zone, and an entity for the zone + * itself. + */ +typedef struct vmu_zone { + struct vmu_zone *vmz_next; /* free list */ + id_t vmz_id; + vmu_entity_t *vmz_zone; + mod_hash_t *vmz_projects_hash; + mod_hash_t *vmz_tasks_hash; + mod_hash_t *vmz_rusers_hash; + mod_hash_t *vmz_eusers_hash; +} vmu_zone_t; + +/* + * Cache of results from last calculation + */ +typedef struct vmu_cache { + vmusage_t *vmc_results; /* Results from last call to */ + /* vm_getusage(). 
*/ + uint64_t vmc_nresults; /* Count of cached results */ + uint64_t vmc_refcnt; /* refcnt for free */ + uint_t vmc_flags; /* Flags for vm_getusage() */ + hrtime_t vmc_timestamp; /* when cache was created */ +} vmu_cache_t; + +/* + * top level rss info for the system + */ +typedef struct vmu_data { + kmutex_t vmu_lock; /* Protects vmu_data */ + kcondvar_t vmu_cv; /* Used to signal threads */ + /* Waiting for */ + /* Rss_calc_thread to finish */ + vmu_entity_t *vmu_system; /* Entity for tracking */ + /* rss/swap for all processes */ + /* in all zones */ + mod_hash_t *vmu_zones_hash; /* Zones visited */ + mod_hash_t *vmu_projects_col_hash; /* These *_col_hash hashes */ + mod_hash_t *vmu_rusers_col_hash; /* keep track of entities, */ + mod_hash_t *vmu_eusers_col_hash; /* ignoring zoneid, in order */ + /* to implement VMUSAGE_COL_* */ + /* flags, which aggregate by */ + /* project or user regardless */ + /* of zoneid. */ + mod_hash_t *vmu_all_vnodes_hash; /* System wide visited vnodes */ + /* to track incore/not-incore */ + mod_hash_t *vmu_all_amps_hash; /* System wide visited shared */ + /* amps to track incore/not- */ + /* incore */ + vmu_entity_t *vmu_entities; /* Linked list of entities */ + size_t vmu_nentities; /* Count of entities in list */ + vmu_cache_t *vmu_cache; /* Cached results */ + kthread_t *vmu_calc_thread; /* NULL, or thread running */ + /* vmu_calculate() */ + uint_t vmu_calc_flags; /* Flags being using by */ + /* currently running calc */ + /* thread */ + uint_t vmu_pending_flags; /* Flags of vm_getusage() */ + /* threads waiting for */ + /* calc thread to finish */ + uint_t vmu_pending_waiters; /* Number of threads waiting */ + /* for calc thread */ + vmu_bound_t *vmu_free_bounds; + vmu_object_t *vmu_free_objects; + vmu_entity_t *vmu_free_entities; + vmu_zone_t *vmu_free_zones; +} vmu_data_t; + +extern struct as kas; +extern proc_t *practive; +extern zone_t *global_zone; +extern struct seg_ops segvn_ops; +extern struct seg_ops segspt_shmops; + +static vmu_data_t vmu_data; +static kmem_cache_t *vmu_bound_cache; +static kmem_cache_t *vmu_object_cache; + +/* + * Save a bound on the free list + */ +static void +vmu_free_bound(vmu_bound_t *bound) +{ + bound->vmb_next = vmu_data.vmu_free_bounds; + vmu_data.vmu_free_bounds = bound; +} + +/* + * Free an object, and all visited bound info. + */ +static void +vmu_free_object(mod_hash_val_t val) +{ + vmu_object_t *obj = (vmu_object_t *)val; + vmu_bound_t *bound = obj->vmo_bounds; + vmu_bound_t *tmp; + + while (bound != NULL) { + tmp = bound; + bound = bound->vmb_next; + vmu_free_bound(tmp); + } + obj->vmo_next = vmu_data.vmu_free_objects; + vmu_data.vmu_free_objects = obj; +} + +/* + * Free an entity, and hashes of visited objects for that entity. + */ +static void +vmu_free_entity(mod_hash_val_t val) +{ + vmu_entity_t *entity = (vmu_entity_t *)val; + + if (entity->vme_vnode_hash != NULL) + i_mod_hash_clear_nosync(entity->vme_vnode_hash); + if (entity->vme_amp_hash != NULL) + i_mod_hash_clear_nosync(entity->vme_amp_hash); + if (entity->vme_anon_hash != NULL) + i_mod_hash_clear_nosync(entity->vme_anon_hash); + + entity->vme_next = vmu_data.vmu_free_entities; + vmu_data.vmu_free_entities = entity; +} + +/* + * Free zone entity, and all hashes of entities inside that zone, + * which are projects, tasks, and users. 
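The vmu_data_t fields above imply a single-calculator protocol: one caller runs vmu_calculate() while later callers wait on vmu_cv and then copy the refreshed cache. A minimal sketch of that protocol follows; it is not the actual vm_getusage() logic, which additionally honors the cache age and checks that the in-flight calculation covers the requested flags, and it assumes vmu_lock is dropped for the long-running calculation.

static void
xx_get_results(uint_t flags)
{
        mutex_enter(&vmu_data.vmu_lock);
        if (vmu_data.vmu_calc_thread == NULL) {
                /* No calculation in flight; this caller does the work. */
                vmu_data.vmu_calc_thread = curthread;
                vmu_data.vmu_calc_flags = flags;
                mutex_exit(&vmu_data.vmu_lock);

                vmu_calculate();        /* long-running; vmu_lock dropped */

                mutex_enter(&vmu_data.vmu_lock);
                /* ...refresh vmu_data.vmu_cache from the entity list... */
                vmu_data.vmu_calc_thread = NULL;
                cv_broadcast(&vmu_data.vmu_cv);
        } else {
                /* Piggy-back on the calculation already in flight. */
                vmu_data.vmu_pending_waiters++;
                while (vmu_data.vmu_calc_thread != NULL)
                        cv_wait(&vmu_data.vmu_cv, &vmu_data.vmu_lock);
                vmu_data.vmu_pending_waiters--;
        }
        /* Either way, results are now in vmu_data.vmu_cache. */
        mutex_exit(&vmu_data.vmu_lock);
}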
+ */ +static void +vmu_free_zone(mod_hash_val_t val) +{ + vmu_zone_t *zone = (vmu_zone_t *)val; + + if (zone->vmz_zone != NULL) { + vmu_free_entity((mod_hash_val_t)zone->vmz_zone); + zone->vmz_zone = NULL; + } + if (zone->vmz_projects_hash != NULL) + i_mod_hash_clear_nosync(zone->vmz_projects_hash); + if (zone->vmz_tasks_hash != NULL) + i_mod_hash_clear_nosync(zone->vmz_tasks_hash); + if (zone->vmz_rusers_hash != NULL) + i_mod_hash_clear_nosync(zone->vmz_rusers_hash); + if (zone->vmz_eusers_hash != NULL) + i_mod_hash_clear_nosync(zone->vmz_eusers_hash); + zone->vmz_next = vmu_data.vmu_free_zones; + vmu_data.vmu_free_zones = zone; +} + +/* + * Initialize synchronization primitives and hashes for system-wide tracking + * of visited vnodes and shared amps. Initialize results cache. + */ +void +vm_usage_init() +{ + mutex_init(&vmu_data.vmu_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&vmu_data.vmu_cv, NULL, CV_DEFAULT, NULL); + + vmu_data.vmu_system = NULL; + vmu_data.vmu_zones_hash = NULL; + vmu_data.vmu_projects_col_hash = NULL; + vmu_data.vmu_rusers_col_hash = NULL; + vmu_data.vmu_eusers_col_hash = NULL; + + vmu_data.vmu_free_bounds = NULL; + vmu_data.vmu_free_objects = NULL; + vmu_data.vmu_free_entities = NULL; + vmu_data.vmu_free_zones = NULL; + + vmu_data.vmu_all_vnodes_hash = mod_hash_create_ptrhash( + "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object, + sizeof (vnode_t)); + vmu_data.vmu_all_amps_hash = mod_hash_create_ptrhash( + "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object, + sizeof (struct anon_map)); + vmu_data.vmu_projects_col_hash = mod_hash_create_idhash( + "vmusage collapsed project hash", VMUSAGE_HASH_SIZE, + vmu_free_entity); + vmu_data.vmu_rusers_col_hash = mod_hash_create_idhash( + "vmusage collapsed ruser hash", VMUSAGE_HASH_SIZE, + vmu_free_entity); + vmu_data.vmu_eusers_col_hash = mod_hash_create_idhash( + "vmusage collpased euser hash", VMUSAGE_HASH_SIZE, + vmu_free_entity); + vmu_data.vmu_zones_hash = mod_hash_create_idhash( + "vmusage zone hash", VMUSAGE_HASH_SIZE, vmu_free_zone); + + vmu_bound_cache = kmem_cache_create("vmu_bound_cache", + sizeof (vmu_bound_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + vmu_object_cache = kmem_cache_create("vmu_object_cache", + sizeof (vmu_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + + vmu_data.vmu_entities = NULL; + vmu_data.vmu_nentities = 0; + + vmu_data.vmu_cache = NULL; + vmu_data.vmu_calc_thread = NULL; + vmu_data.vmu_calc_flags = 0; + vmu_data.vmu_pending_flags = 0; + vmu_data.vmu_pending_waiters = 0; +} + +/* + * Allocate hashes for tracking vm objects visited for an entity. + * Update list of entities. 
+ */ +static vmu_entity_t * +vmu_alloc_entity(id_t id, int type, id_t zoneid) +{ + vmu_entity_t *entity; + + if (vmu_data.vmu_free_entities != NULL) { + entity = vmu_data.vmu_free_entities; + vmu_data.vmu_free_entities = + vmu_data.vmu_free_entities->vme_next; + bzero(&entity->vme_result, sizeof (vmusage_t)); + } else { + entity = kmem_zalloc(sizeof (vmu_entity_t), KM_SLEEP); + } + entity->vme_result.vmu_id = id; + entity->vme_result.vmu_zoneid = zoneid; + entity->vme_result.vmu_type = type; + + if (entity->vme_vnode_hash == NULL) + entity->vme_vnode_hash = mod_hash_create_ptrhash( + "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object, + sizeof (vnode_t)); + + if (entity->vme_amp_hash == NULL) + entity->vme_amp_hash = mod_hash_create_ptrhash( + "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object, + sizeof (struct anon_map)); + + if (entity->vme_anon_hash == NULL) + entity->vme_anon_hash = mod_hash_create_ptrhash( + "vmusage anon hash", VMUSAGE_HASH_SIZE, + mod_hash_null_valdtor, sizeof (struct anon)); + + entity->vme_next = vmu_data.vmu_entities; + vmu_data.vmu_entities = entity; + vmu_data.vmu_nentities++; + + return (entity); +} + +/* + * Allocate a zone entity, and hashes for tracking visited vm objects + * for projects, tasks, and users within that zone. + */ +static vmu_zone_t * +vmu_alloc_zone(id_t id) +{ + vmu_zone_t *zone; + + if (vmu_data.vmu_free_zones != NULL) { + zone = vmu_data.vmu_free_zones; + vmu_data.vmu_free_zones = + vmu_data.vmu_free_zones->vmz_next; + zone->vmz_next = NULL; + zone->vmz_zone = NULL; + } else { + zone = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP); + } + + zone->vmz_id = id; + + if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0) + zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id); + + if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS | + VMUSAGE_ALL_PROJECTS)) != 0 && zone->vmz_projects_hash == NULL) + zone->vmz_projects_hash = mod_hash_create_idhash( + "vmusage project hash", VMUSAGE_HASH_SIZE, vmu_free_entity); + + if ((vmu_data.vmu_calc_flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) + != 0 && zone->vmz_tasks_hash == NULL) + zone->vmz_tasks_hash = mod_hash_create_idhash( + "vmusage task hash", VMUSAGE_HASH_SIZE, vmu_free_entity); + + if ((vmu_data.vmu_calc_flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) + != 0 && zone->vmz_rusers_hash == NULL) + zone->vmz_rusers_hash = mod_hash_create_idhash( + "vmusage ruser hash", VMUSAGE_HASH_SIZE, vmu_free_entity); + + if ((vmu_data.vmu_calc_flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) + != 0 && zone->vmz_eusers_hash == NULL) + zone->vmz_eusers_hash = mod_hash_create_idhash( + "vmusage euser hash", VMUSAGE_HASH_SIZE, vmu_free_entity); + + return (zone); +} + +/* + * Allocate a structure for tracking visited bounds for a vm object. + */ +static vmu_object_t * +vmu_alloc_object(caddr_t key, int type) +{ + vmu_object_t *object; + + if (vmu_data.vmu_free_objects != NULL) { + object = vmu_data.vmu_free_objects; + vmu_data.vmu_free_objects = + vmu_data.vmu_free_objects->vmo_next; + } else { + object = kmem_cache_alloc(vmu_object_cache, KM_SLEEP); + } + + object->vmo_key = key; + object->vmo_type = type; + object->vmo_bounds = NULL; + + return (object); +} + +/* + * Allocate and return a bound structure. 
+ */ +static vmu_bound_t * +vmu_alloc_bound() +{ + vmu_bound_t *bound; + + if (vmu_data.vmu_free_bounds != NULL) { + bound = vmu_data.vmu_free_bounds; + vmu_data.vmu_free_bounds = + vmu_data.vmu_free_bounds->vmb_next; + bzero(bound, sizeof (vmu_bound_t)); + } else { + bound = kmem_cache_alloc(vmu_bound_cache, KM_SLEEP); + bzero(bound, sizeof (vmu_bound_t)); + } + return (bound); +} + +/* + * vmu_find_insert_* functions implement hash lookup or allocate and + * insert operations. + */ +static vmu_object_t * +vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type) +{ + int ret; + vmu_object_t *object; + + ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key, + (mod_hash_val_t *)&object); + if (ret != 0) { + object = vmu_alloc_object(key, type); + ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key, + (mod_hash_val_t)object, (mod_hash_hndl_t)0); + ASSERT(ret == 0); + } + return (object); +} + +static int +vmu_find_insert_anon(mod_hash_t *hash, caddr_t key) +{ + int ret; + caddr_t val; + + ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key, + (mod_hash_val_t *)&val); + + if (ret == 0) + return (0); + + ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key, + (mod_hash_val_t)key, (mod_hash_hndl_t)0); + + ASSERT(ret == 0); + + return (1); +} + +static vmu_entity_t * +vmu_find_insert_entity(mod_hash_t *hash, id_t id, uint_t type, id_t zoneid) +{ + int ret; + vmu_entity_t *entity; + + ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)(uintptr_t)id, + (mod_hash_val_t *)&entity); + if (ret != 0) { + entity = vmu_alloc_entity(id, type, zoneid); + ret = i_mod_hash_insert_nosync(hash, + (mod_hash_key_t)(uintptr_t)id, (mod_hash_val_t)entity, + (mod_hash_hndl_t)0); + ASSERT(ret == 0); + } + return (entity); +} + + + + +/* + * Returns list of object bounds between start and end. New bounds inserted + * by this call are given type. + * + * Returns the number of pages covered if new bounds are created. Returns 0 + * if region between start/end consists of all existing bounds. + */ +static pgcnt_t +vmu_insert_lookup_object_bounds(vmu_object_t *ro, pgcnt_t start, pgcnt_t + end, char type, vmu_bound_t **first, vmu_bound_t **last) +{ + vmu_bound_t *next; + vmu_bound_t *prev = NULL; + vmu_bound_t *tmp = NULL; + pgcnt_t ret = 0; + + *first = *last = NULL; + + for (next = ro->vmo_bounds; next != NULL; next = next->vmb_next) { + /* + * Find bounds overlapping or overlapped by range [start,end]. 
+ */ + if (start > next->vmb_end) { + /* bound is before new bound */ + prev = next; + continue; + } + if (next->vmb_start > end) { + /* bound is after new bound */ + break; + } + if (*first == NULL) + *first = next; + *last = next; + } + + if (*first == NULL) { + ASSERT(*last == NULL); + /* + * No bounds overlapping range [start,end], so create new + * bound + */ + tmp = vmu_alloc_bound(); + tmp->vmb_start = start; + tmp->vmb_end = end; + tmp->vmb_type = type; + if (prev == NULL) { + tmp->vmb_next = ro->vmo_bounds; + ro->vmo_bounds = tmp; + } else { + tmp->vmb_next = prev->vmb_next; + prev->vmb_next = tmp; + } + *first = tmp; + *last = tmp; + ASSERT(tmp->vmb_end >= tmp->vmb_start); + ret = tmp->vmb_end - tmp->vmb_start + 1; + return (ret); + } + + /* Check to see if start is before first known bound */ + ASSERT(first != NULL && last != NULL); + next = (*first); + if (start < (*first)->vmb_start) { + /* Create new bound before first bound */ + tmp = vmu_alloc_bound(); + tmp->vmb_start = start; + tmp->vmb_end = (*first)->vmb_start - 1; + tmp->vmb_type = type; + tmp->vmb_next = *first; + if (*first == ro->vmo_bounds) + ro->vmo_bounds = tmp; + if (prev != NULL) + prev->vmb_next = tmp; + ASSERT(tmp->vmb_end >= tmp->vmb_start); + ret += tmp->vmb_end - tmp->vmb_start + 1; + *first = tmp; + } + /* + * Between start and end, search for gaps between and after existing + * bounds. Create new bounds to fill gaps if they exist. + */ + while (end > next->vmb_end) { + /* + * Check for gap between bound and next bound. if no gap, + * continue. + */ + if ((next != *last) && + ((next->vmb_end + 1) == next->vmb_next->vmb_start)) { + next = next->vmb_next; + continue; + } + /* + * Insert new bound in gap after bound, and before next + * bound if next bound exists. + */ + tmp = vmu_alloc_bound(); + tmp->vmb_type = type; + tmp->vmb_next = next->vmb_next; + tmp->vmb_start = next->vmb_end + 1; + + if (next != *last) { + tmp->vmb_end = next->vmb_next->vmb_start - 1; + ASSERT(tmp->vmb_end >= tmp->vmb_start); + ret += tmp->vmb_end - tmp->vmb_start + 1; + next->vmb_next = tmp; + next = tmp->vmb_next; + } else { + tmp->vmb_end = end; + ASSERT(tmp->vmb_end >= tmp->vmb_start); + ret += tmp->vmb_end - tmp->vmb_start + 1; + next->vmb_next = tmp; + *last = tmp; + break; + } + } + return (ret); +} + +/* + * vmu_update_bounds() + * + * first, last: list of continuous bounds, of which zero or more are of + * type VMUSAGE_BOUND_UNKNOWN. + * + * new_first, new_last: list of continuous bounds, of which none are of + * type VMUSAGE_BOUND_UNKNOWN. These bounds are used to + * update the types of bounds in (first,last) with + * type VMUSAGE_BOUND_UNKNOWN. + * + * For the list of bounds (first,last), this function updates any bounds + * with type VMUSAGE_BOUND_UNKNOWN using the type of the corresponding bound in + * the list (new_first, new_last). + * + * If a bound of type VMUSAGE_BOUND_UNKNOWN spans multiple bounds in the list + * (new_first, new_last), it will be split into multiple bounds. + * + * Return value: + * The number of pages in the list of bounds (first,last) that were of + * type VMUSAGE_BOUND_UNKNOWN, which have been updated to be of type + * VMUSAGE_BOUND_INCORE. 
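A concrete illustration of this contract (numbers invented for the example): if (first,last) is a single UNKNOWN bound spanning pages 5-20, and (new_first,new_last) is an INCORE bound [0,10] followed by a NOT_INCORE bound [11,30], then the unknown bound is split into [5,10] INCORE and [11,20] NOT_INCORE, and the function returns 6, the number of pages (5 through 10) that became INCORE.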
+ * + */ +static pgcnt_t +vmu_update_bounds(vmu_bound_t **first, vmu_bound_t **last, + vmu_bound_t *new_first, vmu_bound_t *new_last) +{ + vmu_bound_t *next, *new_next, *tmp; + pgcnt_t rss = 0; + + next = *first; + new_next = new_first; + + /* verify bounds span same pages */ + ASSERT((*first)->vmb_start >= new_next->vmb_start); + ASSERT((*last)->vmb_end <= new_last->vmb_end); + for (;;) { + /* If bound already has type, proceed to next bound */ + if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) { + if (next == *last) + break; + next = next->vmb_next; + continue; + } + while (new_next->vmb_end < next->vmb_start) + new_next = new_next->vmb_next; + ASSERT(new_next->vmb_type != VMUSAGE_BOUND_UNKNOWN); + next->vmb_type = new_next->vmb_type; + if (new_next->vmb_end < next->vmb_end) { + /* need to split bound */ + tmp = vmu_alloc_bound(); + tmp->vmb_type = VMUSAGE_BOUND_UNKNOWN; + tmp->vmb_start = new_next->vmb_end + 1; + tmp->vmb_end = next->vmb_end; + tmp->vmb_next = next->vmb_next; + next->vmb_end = new_next->vmb_end; + next->vmb_next = tmp; + if (*last == next) + *last = tmp; + if (next->vmb_type == VMUSAGE_BOUND_INCORE) + rss += next->vmb_end - next->vmb_start + 1; + next = tmp; + } else { + if (next->vmb_type == VMUSAGE_BOUND_INCORE) + rss += next->vmb_end - next->vmb_start + 1; + if (next == *last) + break; + next = next->vmb_next; + } + } + return (rss); +} + +/* + * merges adjacent bounds with same type between first and last bound. + * After merge, last pointer is no longer valid, as last bound may be + * merged away. + */ +static void +vmu_merge_bounds(vmu_bound_t **first, vmu_bound_t **last) +{ + vmu_bound_t *next; + vmu_bound_t *tmp; + + ASSERT(*first != NULL); + ASSERT(*last != NULL); + + next = *first; + while (next != *last) { + + /* If bounds are adjacent and have same type, merge them */ + if (((next->vmb_end + 1) == next->vmb_next->vmb_start) && + (next->vmb_type == next->vmb_next->vmb_type)) { + tmp = next->vmb_next; + next->vmb_end = tmp->vmb_end; + next->vmb_next = tmp->vmb_next; + vmu_free_bound(tmp); + if (tmp == *last) + *last = next; + } else { + next = next->vmb_next; + } + } +} + +/* + * Given an amp and a list of bounds, updates each bound's type with + * VMUSAGE_BOUND_INCORE or VMUSAGE_BOUND_NOT_INCORE. + * + * If a bound is partially incore, it will be split into two bounds. + * first and last may be modified, as bounds may be split into multiple + * bounds if the are partially incore/not-incore. + * + * Set incore to non-zero if bounds are already known to be incore + * + */ +static void +vmu_amp_update_incore_bounds(struct anon_map *amp, vmu_bound_t **first, + vmu_bound_t **last, boolean_t incore) +{ + vmu_bound_t *next; + vmu_bound_t *tmp; + pgcnt_t index; + short bound_type; + short page_type; + vnode_t *vn; + anoff_t off; + struct anon *ap; + + next = *first; + /* Shared anon slots don't change once set */ + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + for (;;) { + if (incore == B_TRUE) + next->vmb_type = VMUSAGE_BOUND_INCORE; + + if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) { + if (next == *last) + break; + next = next->vmb_next; + continue; + } + bound_type = next->vmb_type; + index = next->vmb_start; + while (index <= next->vmb_end) { + + /* + * These are used to determine how much to increment + * index when a large page is found. 
+ */ + page_t *page; + pgcnt_t pgcnt = 1; + uint_t pgshft; + pgcnt_t pgmsk; + + ap = anon_get_ptr(amp->ahp, index); + if (ap != NULL) + swap_xlate(ap, &vn, &off); + + if (ap != NULL && vn != NULL && vn->v_pages != NULL && + (page = page_exists(vn, off)) != NULL) { + page_type = VMUSAGE_BOUND_INCORE; + if (page->p_szc > 0) { + pgcnt = page_get_pagecnt(page->p_szc); + pgshft = page_get_shift(page->p_szc); + pgmsk = (0x1 << (pgshft - PAGESHIFT)) + - 1; + } + } else { + page_type = VMUSAGE_BOUND_NOT_INCORE; + } + if (bound_type == VMUSAGE_BOUND_UNKNOWN) { + next->vmb_type = page_type; + } else if (next->vmb_type != page_type) { + /* + * if current bound type does not match page + * type, need to split off new bound. + */ + tmp = vmu_alloc_bound(); + tmp->vmb_type = page_type; + tmp->vmb_start = index; + tmp->vmb_end = next->vmb_end; + tmp->vmb_next = next->vmb_next; + next->vmb_end = index - 1; + next->vmb_next = tmp; + if (*last == next) + *last = tmp; + next = tmp; + } + if (pgcnt > 1) { + /* + * If inside large page, jump to next large + * page + */ + index = (index & ~pgmsk) + pgcnt; + } else { + index++; + } + } + if (next == *last) { + ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN); + break; + } else + next = next->vmb_next; + } + ANON_LOCK_EXIT(&->a_rwlock); +} + +/* + * Same as vmu_amp_update_incore_bounds(), except for tracking + * incore-/not-incore for vnodes. + */ +static void +vmu_vnode_update_incore_bounds(vnode_t *vnode, vmu_bound_t **first, + vmu_bound_t **last) +{ + vmu_bound_t *next; + vmu_bound_t *tmp; + pgcnt_t index; + short bound_type; + short page_type; + + next = *first; + for (;;) { + if (vnode->v_pages == NULL) + next->vmb_type = VMUSAGE_BOUND_NOT_INCORE; + + if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) { + if (next == *last) + break; + next = next->vmb_next; + continue; + } + + bound_type = next->vmb_type; + index = next->vmb_start; + while (index <= next->vmb_end) { + + /* + * These are used to determine how much to increment + * index when a large page is found. + */ + page_t *page; + pgcnt_t pgcnt = 1; + uint_t pgshft; + pgcnt_t pgmsk; + + if (vnode->v_pages != NULL && + (page = page_exists(vnode, ptob(index))) != NULL) { + page_type = VMUSAGE_BOUND_INCORE; + if (page->p_szc > 0) { + pgcnt = page_get_pagecnt(page->p_szc); + pgshft = page_get_shift(page->p_szc); + pgmsk = (0x1 << (pgshft - PAGESHIFT)) + - 1; + } + } else { + page_type = VMUSAGE_BOUND_NOT_INCORE; + } + if (bound_type == VMUSAGE_BOUND_UNKNOWN) { + next->vmb_type = page_type; + } else if (next->vmb_type != page_type) { + /* + * if current bound type does not match page + * type, need to split off new bound. + */ + tmp = vmu_alloc_bound(); + tmp->vmb_type = page_type; + tmp->vmb_start = index; + tmp->vmb_end = next->vmb_end; + tmp->vmb_next = next->vmb_next; + next->vmb_end = index - 1; + next->vmb_next = tmp; + if (*last == next) + *last = tmp; + next = tmp; + } + if (pgcnt > 1) { + /* + * If inside large page, jump to next large + * page + */ + index = (index & ~pgmsk) + pgcnt; + } else { + index++; + } + } + if (next == *last) { + ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN); + break; + } else + next = next->vmb_next; + } +} + +/* + * Calculate the rss and swap consumed by a segment. vmu_entities is the + * list of entities to visit. For shared segments, the vnode or amp + * is looked up in each entity to see if has been already counted. Private + * anon pages are checked per entity to ensure that cow pages are not + * double counted. 
+ * + * For private mapped files, first the amp is checked for private pages. + * Bounds not backed by the amp are looked up in the vnode for each entity + * to avoid double counting of private COW vnode pages. + */ +static void +vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg) +{ + struct segvn_data *svd; + struct shm_data *shmd; + struct spt_data *sptd; + vmu_object_t *shared_object = NULL; + vmu_object_t *entity_object = NULL; + vmu_entity_t *entity; + vmusage_t *result; + vmu_bound_t *first = NULL; + vmu_bound_t *last = NULL; + vmu_bound_t *cur = NULL; + vmu_bound_t *e_first = NULL; + vmu_bound_t *e_last = NULL; + vmu_bound_t *tmp; + pgcnt_t p_index, s_index, p_start, p_end, s_start, s_end, rss, virt; + struct anon_map *private_amp = NULL; + boolean_t incore = B_FALSE; + boolean_t shared = B_FALSE; + int file = 0; + pgcnt_t swresv = 0; + pgcnt_t panon = 0; + + /* Can zero-length segments exist? Not sure, so parenoia */ + if (seg->s_size <= 0) + return; + + /* + * Figure out if there is a shared object (such as a named vnode or + * a shared amp, then figure out if there is a private amp, which + * identifies private pages. + */ + if (seg->s_ops == &segvn_ops) { + svd = (struct segvn_data *)seg->s_data; + if (svd->type == MAP_SHARED) + shared = B_TRUE; + else + swresv = svd->swresv; + + if (svd->vp != NULL) { + file = 1; + shared_object = vmu_find_insert_object( + vmu_data.vmu_all_vnodes_hash, (caddr_t)svd->vp, + VMUSAGE_TYPE_VNODE); + s_start = btop(svd->offset); + s_end = btop(svd->offset + seg->s_size) - 1; + } + if (svd->amp != NULL && svd->type == MAP_SHARED) { + ASSERT(shared_object == NULL); + shared_object = vmu_find_insert_object( + vmu_data.vmu_all_amps_hash, (caddr_t)svd->amp, + VMUSAGE_TYPE_AMP); + s_start = svd->anon_index; + s_end = svd->anon_index + btop(seg->s_size) - 1; + /* schedctl mappings are always in core */ + if (svd->amp->swresv == 0) + incore = B_TRUE; + } + if (svd->amp != NULL && svd->type == MAP_PRIVATE) { + private_amp = svd->amp; + p_start = svd->anon_index; + p_end = svd->anon_index + btop(seg->s_size) - 1; + } + } else if (seg->s_ops == &segspt_shmops) { + shared = B_TRUE; + shmd = (struct shm_data *)seg->s_data; + shared_object = vmu_find_insert_object( + vmu_data.vmu_all_amps_hash, (caddr_t)shmd->shm_amp, + VMUSAGE_TYPE_AMP); + s_start = 0; + s_end = btop(seg->s_size) - 1; + sptd = shmd->shm_sptseg->s_data; + + /* ism segments are always incore and do not reserve swap */ + if (sptd->spt_flags & SHM_SHARE_MMU) + incore = B_TRUE; + + } else { + return; + } + + /* + * If there is a private amp, count anon pages that exist. If an + * anon has a refcnt > 1 (cow sharing), then save the anon in a + * hash so that it is not double counted. + * + * If there is also a shared object, they figure out the bounds + * which are not mapped by the private amp. + */ + if (private_amp != NULL) { + + /* Enter as writer to prevent cow anons from being freed */ + ANON_LOCK_ENTER(&private_amp->a_rwlock, RW_WRITER); + + p_index = p_start; + s_index = s_start; + + while (p_index <= p_end) { + + pgcnt_t p_index_next; + pgcnt_t p_bound_size; + int cnt; + anoff_t off; + struct vnode *vn; + struct anon *ap; + page_t *page; /* For handling of large */ + pgcnt_t pgcnt = 1; /* pages */ + pgcnt_t pgstart; + pgcnt_t pgend; + uint_t pgshft; + pgcnt_t pgmsk; + + p_index_next = p_index; + ap = anon_get_next_ptr(private_amp->ahp, + &p_index_next); + + /* + * If next anon is past end of mapping, simulate + * end of anon so loop terminates. 
+ */ + if (p_index_next > p_end) { + p_index_next = p_end + 1; + ap = NULL; + } + /* + * For cow segments, keep track of bounds not + * backed by private amp so they can be looked + * up in the backing vnode + */ + if (p_index_next != p_index) { + + /* + * Compute index difference between anon and + * previous anon. + */ + p_bound_size = p_index_next - p_index - 1; + + if (shared_object != NULL) { + cur = vmu_alloc_bound(); + cur->vmb_next = NULL; + cur->vmb_start = s_index; + cur->vmb_end = s_index + p_bound_size; + cur->vmb_type = VMUSAGE_BOUND_UNKNOWN; + if (first == NULL) { + first = cur; + last = cur; + } else { + last->vmb_next = cur; + last = cur; + } + } + p_index = p_index + p_bound_size + 1; + s_index = s_index + p_bound_size + 1; + } + + /* Detect end of anons in amp */ + if (ap == NULL) + break; + + cnt = ap->an_refcnt; + swap_xlate(ap, &vn, &off); + + if (vn == NULL || vn->v_pages == NULL || + (page = page_exists(vn, off)) == NULL) { + p_index++; + s_index++; + continue; + } + + /* + * If large page is found, compute portion of large + * page in mapping, and increment indicies to the next + * large page. + */ + if (page->p_szc > 0) { + + pgcnt = page_get_pagecnt(page->p_szc); + pgshft = page_get_shift(page->p_szc); + pgmsk = (0x1 << (pgshft - PAGESHIFT)) - 1; + + /* First page in large page */ + pgstart = p_index & ~pgmsk; + /* Last page in large page */ + pgend = pgstart + pgcnt - 1; + /* + * Artifically end page if page extends past + * end of mapping. + */ + if (pgend > p_end) + pgend = p_end; + + /* + * Compute number of pages from large page + * which are mapped. + */ + pgcnt = pgend - p_index + 1; + + /* + * Point indicies at page after large page, + * or at page after end of mapping. + */ + p_index += pgcnt; + s_index += pgcnt; + } else { + p_index++; + s_index++; + } + + /* + * Assume anon structs with a refcnt + * of 1 are not cow shared, so there + * is no reason to track them per entity. + */ + if (cnt == 1) { + panon += pgcnt; + continue; + } + for (entity = vmu_entities; entity != NULL; + entity = entity->vme_next_calc) { + + result = &entity->vme_result; + /* + * Track cow anons per entity so + * they are not double counted. + */ + if (vmu_find_insert_anon(entity->vme_anon_hash, + (caddr_t)ap) == 0) + continue; + + result->vmu_rss_all += (pgcnt << PAGESHIFT); + result->vmu_rss_private += + (pgcnt << PAGESHIFT); + } + } + ANON_LOCK_EXIT(&private_amp->a_rwlock); + } + + /* Add up resident anon and swap reserved for private mappings */ + if (swresv > 0 || panon > 0) { + for (entity = vmu_entities; entity != NULL; + entity = entity->vme_next_calc) { + result = &entity->vme_result; + result->vmu_swap_all += swresv; + result->vmu_swap_private += swresv; + result->vmu_rss_all += (panon << PAGESHIFT); + result->vmu_rss_private += (panon << PAGESHIFT); + } + } + + /* Compute resident pages backing shared amp or named vnode */ + if (shared_object != NULL) { + if (first == NULL) { + /* + * No private amp, or private amp has no anon + * structs. This means entire segment is backed by + * the shared object. + */ + first = vmu_alloc_bound(); + first->vmb_next = NULL; + first->vmb_start = s_start; + first->vmb_end = s_end; + first->vmb_type = VMUSAGE_BOUND_UNKNOWN; + } + /* + * Iterate bounds not backed by private amp, and compute + * resident pages. 
+ */ + cur = first; + while (cur != NULL) { + + if (vmu_insert_lookup_object_bounds(shared_object, + cur->vmb_start, cur->vmb_end, VMUSAGE_BOUND_UNKNOWN, + &first, &last) > 0) { + /* new bounds, find incore/not-incore */ + if (shared_object->vmo_type == + VMUSAGE_TYPE_VNODE) + vmu_vnode_update_incore_bounds( + (vnode_t *) + shared_object->vmo_key, &first, + &last); + else + vmu_amp_update_incore_bounds( + (struct anon_map *) + shared_object->vmo_key, &first, + &last, incore); + vmu_merge_bounds(&first, &last); + } + for (entity = vmu_entities; entity != NULL; + entity = entity->vme_next_calc) { + + result = &entity->vme_result; + + entity_object = vmu_find_insert_object( + shared_object->vmo_type == + VMUSAGE_TYPE_VNODE ? entity->vme_vnode_hash: + entity->vme_amp_hash, + shared_object->vmo_key, + shared_object->vmo_type); + + virt = vmu_insert_lookup_object_bounds( + entity_object, cur->vmb_start, cur->vmb_end, + VMUSAGE_BOUND_UNKNOWN, &e_first, &e_last); + + if (virt == 0) + continue; + /* + * Range visited for this entity + */ + rss = vmu_update_bounds(&e_first, + &e_last, first, last); + result->vmu_rss_all += (rss << PAGESHIFT); + if (shared == B_TRUE && file == B_FALSE) { + /* shared anon mapping */ + result->vmu_swap_all += + (virt << PAGESHIFT); + result->vmu_swap_shared += + (virt << PAGESHIFT); + result->vmu_rss_shared += + (rss << PAGESHIFT); + } else if (shared == B_TRUE && file == B_TRUE) { + /* shared file mapping */ + result->vmu_rss_shared += + (rss << PAGESHIFT); + } else if (shared == B_FALSE && + file == B_TRUE) { + /* private file mapping */ + result->vmu_rss_private += + (rss << PAGESHIFT); + } + vmu_merge_bounds(&e_first, &e_last); + } + tmp = cur; + cur = cur->vmb_next; + vmu_free_bound(tmp); + } + } +} + +/* + * Based on the current calculation flags, find the relevant entities + * which are relative to the process. Then calculate each segment + * in the process'es address space for each relevant entity. 
+ */ +static void +vmu_calculate_proc(proc_t *p) +{ + vmu_entity_t *entities = NULL; + vmu_zone_t *zone; + vmu_entity_t *tmp; + struct as *as; + struct seg *seg; + int ret; + + /* Figure out which entities are being computed */ + if ((vmu_data.vmu_system) != NULL) { + tmp = vmu_data.vmu_system; + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & + (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS | + VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS | + VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS | + VMUSAGE_ALL_EUSERS)) { + ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash, + (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id, + (mod_hash_val_t *)&zone); + if (ret != 0) { + zone = vmu_alloc_zone(p->p_zone->zone_id); + ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash, + (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id, + (mod_hash_val_t)zone, (mod_hash_hndl_t)0); + ASSERT(ret == 0); + } + if (zone->vmz_zone != NULL) { + tmp = zone->vmz_zone; + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & + (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS)) { + tmp = vmu_find_insert_entity(zone->vmz_projects_hash, + p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS, + zone->vmz_id); + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & + (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) { + tmp = vmu_find_insert_entity(zone->vmz_tasks_hash, + p->p_task->tk_tkid, VMUSAGE_TASKS, zone->vmz_id); + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & + (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) { + tmp = vmu_find_insert_entity(zone->vmz_rusers_hash, + crgetruid(p->p_cred), VMUSAGE_RUSERS, zone->vmz_id); + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & + (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) { + tmp = vmu_find_insert_entity(zone->vmz_eusers_hash, + crgetuid(p->p_cred), VMUSAGE_EUSERS, zone->vmz_id); + tmp->vme_next_calc = entities; + entities = tmp; + } + } + /* Entities which collapse projects and users for all zones */ + if (vmu_data.vmu_calc_flags & VMUSAGE_COL_PROJECTS) { + tmp = vmu_find_insert_entity(vmu_data.vmu_projects_col_hash, + p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS, ALL_ZONES); + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & VMUSAGE_COL_RUSERS) { + tmp = vmu_find_insert_entity(vmu_data.vmu_rusers_col_hash, + crgetruid(p->p_cred), VMUSAGE_RUSERS, ALL_ZONES); + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & VMUSAGE_COL_EUSERS) { + tmp = vmu_find_insert_entity(vmu_data.vmu_eusers_col_hash, + crgetuid(p->p_cred), VMUSAGE_EUSERS, ALL_ZONES); + tmp->vme_next_calc = entities; + entities = tmp; + } + + ASSERT(entities != NULL); + /* process all segs in process's address space */ + as = p->p_as; + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + for (seg = AS_SEGFIRST(as); seg != NULL; + seg = AS_SEGNEXT(as, seg)) { + vmu_calculate_seg(entities, seg); + } + AS_LOCK_EXIT(as, &as->a_lock); +} + +/* + * Free data created by previous call to vmu_calculate(). 
+ */ +static void +vmu_clear_calc() +{ + if (vmu_data.vmu_system != NULL) + vmu_free_entity(vmu_data.vmu_system); + vmu_data.vmu_system = NULL; + if (vmu_data.vmu_zones_hash != NULL) + i_mod_hash_clear_nosync(vmu_data.vmu_zones_hash); + if (vmu_data.vmu_projects_col_hash != NULL) + i_mod_hash_clear_nosync(vmu_data.vmu_projects_col_hash); + if (vmu_data.vmu_rusers_col_hash != NULL) + i_mod_hash_clear_nosync(vmu_data.vmu_rusers_col_hash); + if (vmu_data.vmu_eusers_col_hash != NULL) + i_mod_hash_clear_nosync(vmu_data.vmu_eusers_col_hash); + + i_mod_hash_clear_nosync(vmu_data.vmu_all_vnodes_hash); + i_mod_hash_clear_nosync(vmu_data.vmu_all_amps_hash); +} + +/* + * Free unused data structures. These can result if the system workload + * decreases between calculations. + */ +static void +vmu_free_extra() +{ + vmu_bound_t *tb; + vmu_object_t *to; + vmu_entity_t *te; + vmu_zone_t *tz; + + while (vmu_data.vmu_free_bounds != NULL) { + tb = vmu_data.vmu_free_bounds; + vmu_data.vmu_free_bounds = vmu_data.vmu_free_bounds->vmb_next; + kmem_cache_free(vmu_bound_cache, tb); + } + while (vmu_data.vmu_free_objects != NULL) { + to = vmu_data.vmu_free_objects; + vmu_data.vmu_free_objects = + vmu_data.vmu_free_objects->vmo_next; + kmem_cache_free(vmu_object_cache, to); + } + while (vmu_data.vmu_free_entities != NULL) { + te = vmu_data.vmu_free_entities; + vmu_data.vmu_free_entities = + vmu_data.vmu_free_entities->vme_next; + if (te->vme_vnode_hash != NULL) + mod_hash_destroy_hash(te->vme_vnode_hash); + if (te->vme_amp_hash != NULL) + mod_hash_destroy_hash(te->vme_amp_hash); + if (te->vme_anon_hash != NULL) + mod_hash_destroy_hash(te->vme_anon_hash); + kmem_free(te, sizeof (vmu_entity_t)); + } + while (vmu_data.vmu_free_zones != NULL) { + tz = vmu_data.vmu_free_zones; + vmu_data.vmu_free_zones = + vmu_data.vmu_free_zones->vmz_next; + if (tz->vmz_projects_hash != NULL) + mod_hash_destroy_hash(tz->vmz_projects_hash); + if (tz->vmz_tasks_hash != NULL) + mod_hash_destroy_hash(tz->vmz_tasks_hash); + if (tz->vmz_rusers_hash != NULL) + mod_hash_destroy_hash(tz->vmz_rusers_hash); + if (tz->vmz_eusers_hash != NULL) + mod_hash_destroy_hash(tz->vmz_eusers_hash); + kmem_free(tz, sizeof (vmu_zone_t)); + } +} + +extern kcondvar_t *pr_pid_cv; + +/* + * Determine which entity types are relevant and allocate the hashes to + * track them. Then walk the process table and count rss and swap + * for each process'es address space. Address space object such as + * vnodes, amps and anons are tracked per entity, so that they are + * not double counted in the results. + * + */ +static void +vmu_calculate() +{ + int i = 0; + int ret; + proc_t *p; + + vmu_clear_calc(); + + if (vmu_data.vmu_calc_flags & VMUSAGE_SYSTEM) + vmu_data.vmu_system = vmu_alloc_entity(0, VMUSAGE_SYSTEM, + ALL_ZONES); + + /* + * Walk process table and calculate rss of each proc. + * + * Pidlock and p_lock cannot be held while doing the rss calculation. + * This is because: + * 1. The calculation allocates using KM_SLEEP. + * 2. The calculation grabs a_lock, which cannot be grabbed + * after p_lock. + * + * Since pidlock must be dropped, we cannot simply just walk the + * practive list. Instead, we walk the process table, and sprlock + * each process to ensure that it does not exit during the + * calculation. 
+ */ + + mutex_enter(&pidlock); + for (i = 0; i < v.v_proc; i++) { +again: + p = pid_entry(i); + if (p == NULL) + continue; + + mutex_enter(&p->p_lock); + mutex_exit(&pidlock); + + if (panicstr) { + mutex_exit(&p->p_lock); + return; + } + + /* Try to set P_PR_LOCK */ + ret = sprtrylock_proc(p); + if (ret == -1) { + /* Process in invalid state */ + mutex_exit(&p->p_lock); + mutex_enter(&pidlock); + continue; + } else if (ret == 1) { + /* + * P_PR_LOCK is already set. Wait and try again. + * This also drops p_lock. + */ + sprwaitlock_proc(p); + mutex_enter(&pidlock); + goto again; + } + mutex_exit(&p->p_lock); + + vmu_calculate_proc(p); + + mutex_enter(&p->p_lock); + sprunlock(p); + mutex_enter(&pidlock); + } + mutex_exit(&pidlock); + + vmu_free_extra(); +} + +/* + * allocate a new cache for N results satisfying flags + */ +vmu_cache_t * +vmu_cache_alloc(size_t nres, uint_t flags) +{ + vmu_cache_t *cache; + + cache = kmem_zalloc(sizeof (vmu_cache_t), KM_SLEEP); + cache->vmc_results = kmem_zalloc(sizeof (vmusage_t) * nres, KM_SLEEP); + cache->vmc_nresults = nres; + cache->vmc_flags = flags; + cache->vmc_refcnt = 1; + return (cache); +} + +/* + * Make sure cached results are not freed + */ +static void +vmu_cache_hold(vmu_cache_t *cache) +{ + ASSERT(MUTEX_HELD(&vmu_data.vmu_lock)); + cache->vmc_refcnt++; +} + +/* + * free cache data + */ +static void +vmu_cache_rele(vmu_cache_t *cache) +{ + ASSERT(MUTEX_HELD(&vmu_data.vmu_lock)); + ASSERT(cache->vmc_refcnt > 0); + cache->vmc_refcnt--; + if (cache->vmc_refcnt == 0) { + kmem_free(cache->vmc_results, sizeof (vmusage_t) * + cache->vmc_nresults); + kmem_free(cache, sizeof (vmu_cache_t)); + } +} + +/* + * Copy out the cached results to a caller. Inspect the callers flags + * and zone to determine which cached results should be copied. + */ +static int +vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres, + uint_t flags) +{ + vmusage_t *result, *out_result; + vmusage_t dummy; + size_t i, count = 0; + size_t bufsize; + int ret = 0; + uint_t types = 0; + + if (nres != NULL) { + if (copyin((caddr_t)nres, &bufsize, sizeof (size_t))) + return (set_errno(EFAULT)); + } else { + bufsize = 0; + } + + /* figure out what results the caller is interested in. */ + if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone) + types |= VMUSAGE_SYSTEM; + if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) + types |= VMUSAGE_ZONE; + if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS | + VMUSAGE_COL_PROJECTS)) + types |= VMUSAGE_PROJECTS; + if (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) + types |= VMUSAGE_TASKS; + if (flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) + types |= VMUSAGE_RUSERS; + if (flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) + types |= VMUSAGE_EUSERS; + + /* count results for current zone */ + out_result = buf; + for (result = cache->vmc_results, i = 0; + i < cache->vmc_nresults; result++, i++) { + + /* Do not return "other-zone" results to non-global zones */ + if (curproc->p_zone != global_zone && + curproc->p_zone->zone_id != result->vmu_zoneid) + continue; + + /* + * If non-global zone requests VMUSAGE_SYSTEM, fake + * up VMUSAGE_ZONE result as VMUSAGE_SYSTEM result. 
+ */
+		if (curproc->p_zone != global_zone &&
+		    (flags & VMUSAGE_SYSTEM) != 0 &&
+		    result->vmu_type == VMUSAGE_ZONE) {
+			count++;
+			if (out_result != NULL) {
+				if (bufsize < count) {
+					ret = set_errno(EOVERFLOW);
+				} else {
+					dummy = *result;
+					dummy.vmu_zoneid = ALL_ZONES;
+					dummy.vmu_id = 0;
+					dummy.vmu_type = VMUSAGE_SYSTEM;
+					if (copyout(&dummy, out_result,
+					    sizeof (vmusage_t)))
+						return (set_errno(
+						    EFAULT));
+					out_result++;
+				}
+			}
+		}
+
+		/* Skip results that do not match requested type */
+		if ((result->vmu_type & types) == 0)
+			continue;
+
+		/* Skip collated results if not requested */
+		if (result->vmu_zoneid == ALL_ZONES) {
+			if (result->vmu_type == VMUSAGE_PROJECTS &&
+			    (flags & VMUSAGE_COL_PROJECTS) == 0)
+				continue;
+			if (result->vmu_type == VMUSAGE_EUSERS &&
+			    (flags & VMUSAGE_COL_EUSERS) == 0)
+				continue;
+			if (result->vmu_type == VMUSAGE_RUSERS &&
+			    (flags & VMUSAGE_COL_RUSERS) == 0)
+				continue;
+		}
+
+		/* Skip "other zone" results if not requested */
+		if (result->vmu_zoneid != curproc->p_zone->zone_id) {
+			if (result->vmu_type == VMUSAGE_ZONE &&
+			    (flags & VMUSAGE_ALL_ZONES) == 0)
+				continue;
+			if (result->vmu_type == VMUSAGE_PROJECTS &&
+			    (flags & (VMUSAGE_ALL_PROJECTS |
+			    VMUSAGE_COL_PROJECTS)) == 0)
+				continue;
+			if (result->vmu_type == VMUSAGE_TASKS &&
+			    (flags & VMUSAGE_ALL_TASKS) == 0)
+				continue;
+			if (result->vmu_type == VMUSAGE_RUSERS &&
+			    (flags & (VMUSAGE_ALL_RUSERS |
+			    VMUSAGE_COL_RUSERS)) == 0)
+				continue;
+			if (result->vmu_type == VMUSAGE_EUSERS &&
+			    (flags & (VMUSAGE_ALL_EUSERS |
+			    VMUSAGE_COL_EUSERS)) == 0)
+				continue;
+		}
+		count++;
+		if (out_result != NULL) {
+			if (bufsize < count) {
+				ret = set_errno(EOVERFLOW);
+			} else {
+				if (copyout(result, out_result,
+				    sizeof (vmusage_t)))
+					return (set_errno(EFAULT));
+				out_result++;
+			}
+		}
+	}
+	if (nres != NULL)
+		if (copyout(&count, (void *)nres, sizeof (size_t)))
+			return (set_errno(EFAULT));
+
+	return (ret);
+}
+
+/*
+ * vm_getusage()
+ *
+ * Counts rss and swap by zone, project, task, and/or user. The flags argument
+ * determines the type of results structures returned. Flags requesting
+ * results from more than one zone are "flattened" to the local zone if the
+ * caller is not the global zone.
+ *
+ * args:
+ *	flags:	bitmap consisting of one or more of VMUSAGE_*.
+ *	age:	maximum allowable age (time since counting was done) in
+ *		seconds of the results. Results from previous callers are
+ *		cached in the kernel.
+ *	buf:	pointer to buffer array of vmusage_t. If NULL, then only nres
+ *		is set on success.
+ *	nres:	Set to the number of vmusage_t structures pointed to by buf
+ *		before calling vm_getusage().
+ *		On return of 0 (success) or EOVERFLOW, it is set to the number
+ *		of result structures returned or attempted to be returned.
+ *
+ * returns 0 on success, -1 on failure:
+ *	EINTR (interrupted)
+ *	EOVERFLOW (nres too small for the results; nres set to the value
+ *	    needed for success)
+ *	EINVAL (flags invalid)
+ *	EFAULT (bad address for buf or nres)
+ */
+int
+vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres)
+{
+	vmu_entity_t *entity;
+	vmusage_t *result;
+	int ret = 0;
+	int cacherecent = 0;
+	hrtime_t now;
+	uint_t flags_orig;
+
+	/*
+	 * Non-global zones cannot request system-wide and/or collated
+	 * results, or the system result, so munge the flags accordingly.
+ */ + flags_orig = flags; + if (curproc->p_zone != global_zone) { + if (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) { + flags &= ~(VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS); + flags |= VMUSAGE_PROJECTS; + } + if (flags & (VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) { + flags &= ~(VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS); + flags |= VMUSAGE_RUSERS; + } + if (flags & (VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) { + flags &= ~(VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS); + flags |= VMUSAGE_EUSERS; + } + if (flags & VMUSAGE_SYSTEM) { + flags &= ~VMUSAGE_SYSTEM; + flags |= VMUSAGE_ZONE; + } + } + + /* Check for unknown flags */ + if ((flags & (~VMUSAGE_MASK)) != 0) + return (set_errno(EINVAL)); + + /* Check for no flags */ + if ((flags & VMUSAGE_MASK) == 0) + return (set_errno(EINVAL)); + + mutex_enter(&vmu_data.vmu_lock); + now = gethrtime(); + +start: + if (vmu_data.vmu_cache != NULL) { + + vmu_cache_t *cache; + + if ((vmu_data.vmu_cache->vmc_timestamp + + ((hrtime_t)age * NANOSEC)) > now) + cacherecent = 1; + + if ((vmu_data.vmu_cache->vmc_flags & flags) == flags && + cacherecent == 1) { + cache = vmu_data.vmu_cache; + vmu_cache_hold(cache); + mutex_exit(&vmu_data.vmu_lock); + + ret = vmu_copyout_results(cache, buf, nres, flags_orig); + mutex_enter(&vmu_data.vmu_lock); + vmu_cache_rele(cache); + if (vmu_data.vmu_pending_waiters > 0) + cv_broadcast(&vmu_data.vmu_cv); + mutex_exit(&vmu_data.vmu_lock); + return (ret); + } + /* + * If the cache is recent, it is likely that there are other + * consumers of vm_getusage running, so add their flags to the + * desired flags for the calculation. + */ + if (cacherecent == 1) + flags = vmu_data.vmu_cache->vmc_flags | flags; + } + if (vmu_data.vmu_calc_thread == NULL) { + + vmu_cache_t *cache; + + vmu_data.vmu_calc_thread = curthread; + vmu_data.vmu_calc_flags = flags; + vmu_data.vmu_entities = NULL; + vmu_data.vmu_nentities = 0; + if (vmu_data.vmu_pending_waiters > 0) + vmu_data.vmu_calc_flags |= + vmu_data.vmu_pending_flags; + + vmu_data.vmu_pending_flags = 0; + mutex_exit(&vmu_data.vmu_lock); + vmu_calculate(); + mutex_enter(&vmu_data.vmu_lock); + /* copy results to cache */ + if (vmu_data.vmu_cache != NULL) + vmu_cache_rele(vmu_data.vmu_cache); + cache = vmu_data.vmu_cache = + vmu_cache_alloc(vmu_data.vmu_nentities, + vmu_data.vmu_calc_flags); + + result = cache->vmc_results; + for (entity = vmu_data.vmu_entities; entity != NULL; + entity = entity->vme_next) { + *result = entity->vme_result; + result++; + } + cache->vmc_timestamp = gethrtime(); + vmu_cache_hold(cache); + + vmu_data.vmu_calc_flags = 0; + vmu_data.vmu_calc_thread = NULL; + + if (vmu_data.vmu_pending_waiters > 0) + cv_broadcast(&vmu_data.vmu_cv); + + mutex_exit(&vmu_data.vmu_lock); + + /* copy cache */ + ret = vmu_copyout_results(cache, buf, nres, flags_orig); + mutex_enter(&vmu_data.vmu_lock); + vmu_cache_rele(cache); + mutex_exit(&vmu_data.vmu_lock); + + return (ret); + } + vmu_data.vmu_pending_flags |= flags; + vmu_data.vmu_pending_waiters++; + while (vmu_data.vmu_calc_thread != NULL) { + if (cv_wait_sig(&vmu_data.vmu_cv, + &vmu_data.vmu_lock) == 0) { + vmu_data.vmu_pending_waiters--; + mutex_exit(&vmu_data.vmu_lock); + return (set_errno(EINTR)); + } + } + vmu_data.vmu_pending_waiters--; + goto start; +} |
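
To make the contract in the vm_getusage() block comment concrete, here is a minimal consumer-side sketch of the intended two-pass pattern: query the result count with buf == NULL, size a buffer, then fetch. It is illustrative only; it assumes the vmusage_t and VMUSAGE_* definitions from the new <sys/vm_usage.h>, and a userland entry point with the same signature as vm_getusage(), called getvmusage() here, which is an assumption rather than something this change introduces. Only the vmu_type, vmu_id and vmu_zoneid fields referenced in the kernel code above are used.

/*
 * Hypothetical userland consumer of the interface documented above.
 * getvmusage() stands in for whatever wrapper exposes vm_getusage();
 * its name and availability are assumptions, and its signature simply
 * mirrors the kernel entry point. Results up to 5 seconds old are
 * acceptable to this caller.
 */
#include <sys/types.h>
#include <sys/vm_usage.h>
#include <stdio.h>
#include <stdlib.h>

extern int getvmusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres);

int
main(void)
{
	uint_t flags = VMUSAGE_ZONE | VMUSAGE_RUSERS;
	size_t nres = 0;
	vmusage_t *buf;
	size_t i;

	/* First pass: buf == NULL, so only the required count is returned. */
	if (getvmusage(flags, 5, NULL, &nres) != 0)
		return (1);

	buf = calloc(nres, sizeof (vmusage_t));
	if (buf == NULL)
		return (1);

	/*
	 * Second pass: fetch the results. If usage grew in between, the
	 * call fails with EOVERFLOW and nres is updated to the new count,
	 * so a robust consumer would resize and retry; that loop is
	 * omitted here for brevity.
	 */
	if (getvmusage(flags, 5, buf, &nres) != 0) {
		perror("getvmusage");
		free(buf);
		return (1);
	}

	for (i = 0; i < nres; i++)
		(void) printf("type=0x%x id=%d zone=%d\n",
		    buf[i].vmu_type, (int)buf[i].vmu_id,
		    (int)buf[i].vmu_zoneid);

	free(buf);
	return (0);
}

Because results are cached in the kernel, concurrent callers whose flags fall within another caller's recent calculation (inside the age window) are satisfied from the same scan rather than triggering a new walk of the process table.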

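A second hedged sketch, complementing the program above: a global-zone caller can request every zone's total plus real-user usage collated across zones in one call, and then distinguish the rows by vmu_type and vmu_zoneid. The constant below mirrors the kernel's ALL_ZONES sentinel that the code above assigns to collated rows; whether and how that value is exposed to userland is an assumption, as is the getvmusage() wrapper.

/*
 * Hypothetical companion to the program above: classify the rows a
 * global-zone caller gets back from a request for per-zone totals
 * (VMUSAGE_ALL_ZONES) plus real-user usage collated across zones
 * (VMUSAGE_COL_RUSERS). VMU_COLLATED mirrors the kernel's ALL_ZONES
 * sentinel for collated rows; its value here is an assumption.
 */
#include <sys/types.h>
#include <sys/vm_usage.h>
#include <stdio.h>

#define	VMU_COLLATED	((zoneid_t)-1)

static void
print_rows(const vmusage_t *res, size_t nres)
{
	size_t i;

	for (i = 0; i < nres; i++) {
		if (res[i].vmu_type == VMUSAGE_ZONE) {
			/* One row per zone, keyed by zone id. */
			(void) printf("zone %d\n", (int)res[i].vmu_zoneid);
		} else if (res[i].vmu_type == VMUSAGE_RUSERS &&
		    res[i].vmu_zoneid == VMU_COLLATED) {
			/* One row per real uid, summed over all zones. */
			(void) printf("ruser %d (all zones)\n",
			    (int)res[i].vmu_id);
		}
	}
}

The request itself would pass VMUSAGE_ALL_ZONES | VMUSAGE_COL_RUSERS as the flags in the two-pass pattern shown earlier; issued from a non-global zone, the same request is flattened to the caller's own zone, as the vm_getusage() block comment and the flag munging at the top of the function describe.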