Diffstat (limited to 'usr/src/uts')
 usr/src/uts/common/Makefile.files           |    1
 usr/src/uts/common/disp/priocntl.c          |   17
 usr/src/uts/common/fs/tmpfs/tmp_tnode.c     |   15
 usr/src/uts/common/fs/tmpfs/tmp_vnops.c     |   21
 usr/src/uts/common/os/modhash.c             |   24
 usr/src/uts/common/os/pid.c                 |   82
 usr/src/uts/common/os/pool.c                |    4
 usr/src/uts/common/os/project.c             |   95
 usr/src/uts/common/os/rctl.c                |  122
 usr/src/uts/common/os/schedctl.c            |   13
 usr/src/uts/common/os/sysent.c              |    4
 usr/src/uts/common/os/task.c                |   11
 usr/src/uts/common/os/zone.c                |  376
 usr/src/uts/common/sys/Makefile             |    1
 usr/src/uts/common/sys/modhash_impl.h       |   19
 usr/src/uts/common/sys/priocntl.h           |    1
 usr/src/uts/common/sys/proc.h               |    2
 usr/src/uts/common/sys/project.h            |   14
 usr/src/uts/common/sys/rctl.h               |    9
 usr/src/uts/common/sys/resource.h           |    8
 usr/src/uts/common/sys/syscall.h            |    3
 usr/src/uts/common/sys/vm_usage.h           |  120
 usr/src/uts/common/sys/zone.h               |   30
 usr/src/uts/common/syscall/processor_bind.c |   10
 usr/src/uts/common/syscall/pset.c           |    3
 usr/src/uts/common/syscall/rusagesys.c      |   19
 usr/src/uts/common/syscall/tasksys.c        |    9
 usr/src/uts/common/vm/anon.h                |   16
 usr/src/uts/common/vm/seg.h                 |   10
 usr/src/uts/common/vm/seg_kp.c              |   22
 usr/src/uts/common/vm/seg_vn.c              |    5
 usr/src/uts/common/vm/vm_anon.c             |   20
 usr/src/uts/common/vm/vm_page.c             |    3
 usr/src/uts/common/vm/vm_seg.c              |   56
 usr/src/uts/common/vm/vm_usage.c            | 1978
 35 files changed, 2984 insertions(+), 159 deletions(-)
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index 32a63d6c22..b2bbcbc8c3 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -334,6 +334,7 @@ GENUNIX_OBJS += \
vm_seg.o \
vm_subr.o \
vm_swap.o \
+ vm_usage.o \
vnode.o \
vuid_queue.o \
vuid_store.o \
diff --git a/usr/src/uts/common/disp/priocntl.c b/usr/src/uts/common/disp/priocntl.c
index 3bb90cf1fa..9197dc815b 100644
--- a/usr/src/uts/common/disp/priocntl.c
+++ b/usr/src/uts/common/disp/priocntl.c
@@ -136,6 +136,7 @@ priocntl_common(int pc_version, procset_t *psp, int cmd, caddr_t arg,
struct pcmpargs pcmpargs;
pc_vaparms_t vaparms;
char clname[PC_CLNMSZ];
+ char *outstr;
int count;
kthread_id_t retthreadp;
proc_t *initpp;
@@ -145,6 +146,7 @@ priocntl_common(int pc_version, procset_t *psp, int cmd, caddr_t arg,
int rv = 0;
pid_t saved_pid;
id_t classid;
+ int size;
int (*copyinfn)(const void *, void *, size_t);
int (*copyoutfn)(const void *, void *, size_t);
@@ -692,6 +694,21 @@ priocntl_common(int pc_version, procset_t *psp, int cmd, caddr_t arg,
ASSERT(defaultcid > 0 && defaultcid < loaded_classes);
break;
+ case PC_GETDFLCL:
+ mutex_enter(&class_lock);
+
+ if (defaultcid >= loaded_classes)
+ outstr = "";
+ else
+ outstr = sclass[defaultcid].cl_name;
+ size = strlen(outstr) + 1;
+ if (arg != NULL)
+ if ((*copyoutfn)(outstr, arg, size) != 0)
+ error = EFAULT;
+
+ mutex_exit(&class_lock);
+ break;
+
default:
error = EINVAL;
break;
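
The new PC_GETDFLCL command is the read-side counterpart of PC_SETDFLCL: it copies the name of the current default scheduling class (or an empty string if none is set) out to the caller-supplied buffer. A minimal userland sketch follows; the calling convention is an assumption here, based on the kernel hunk above consuming only the single arg pointer and ignoring the process set:

/* Hypothetical sketch: query the system default scheduling class. */
#include <sys/types.h>
#include <sys/procset.h>
#include <sys/priocntl.h>
#include <stdio.h>

int
main(void)
{
	char clname[PC_CLNMSZ];

	/* idtype/id presumed ignored for this private command */
	if (priocntl(P_ALL, 0, PC_GETDFLCL, clname) == -1) {
		perror("priocntl(PC_GETDFLCL)");
		return (1);
	}
	(void) printf("default scheduling class: %s\n", clname);
	return (0);
}
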
diff --git a/usr/src/uts/common/fs/tmpfs/tmp_tnode.c b/usr/src/uts/common/fs/tmpfs/tmp_tnode.c
index 5a7000c242..c5145cccf0 100644
--- a/usr/src/uts/common/fs/tmpfs/tmp_tnode.c
+++ b/usr/src/uts/common/fs/tmpfs/tmp_tnode.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -67,6 +66,7 @@ tmp_resv(
int pagecreate) /* call anon_resv if set */
{
pgcnt_t pages = btopr(delta);
+ zone_t *zone;
ASSERT(RW_WRITE_HELD(&tp->tn_rwlock));
ASSERT(tp->tn_type == VREG);
@@ -79,9 +79,10 @@ tmp_resv(
*
* Deny if trying to reserve more than tmpfs can allocate
*/
+ zone = tm->tm_vfsp->vfs_zone;
if (pagecreate && ((tm->tm_anonmem + pages > tm->tm_anonmax) ||
- (!anon_checkspace(ptob(pages + tmpfs_minfree))) ||
- (anon_resv(delta) == 0))) {
+ (!anon_checkspace(ptob(pages + tmpfs_minfree), zone)) ||
+ (anon_resv_zone(delta, zone) == 0))) {
return (1);
}
@@ -114,7 +115,7 @@ tmp_unresv(
ASSERT(RW_WRITE_HELD(&tp->tn_rwlock));
ASSERT(tp->tn_type == VREG);
- anon_unresv(delta);
+ anon_unresv_zone(delta, tm->tm_vfsp->vfs_zone);
mutex_enter(&tm->tm_contents);
tm->tm_anonmem -= btopr(delta);
diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c
index d623dce3f7..aa870b124a 100644
--- a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c
+++ b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c
@@ -215,9 +215,26 @@ wrtmp(
if (delta > 0) {
pagecreate = 1;
if (tmp_resv(tm, tp, delta, pagecreate)) {
- cmn_err(CE_WARN,
- "%s: File system full, swap space limit exceeded",
+ /*
+ * Log file system full in the zone that owns
+ * the tmpfs mount, as well as in the global
+ * zone if necessary.
+ */
+ zcmn_err(tm->tm_vfsp->vfs_zone->zone_id,
+ CE_WARN, "%s: File system full, "
+ "swap space limit exceeded",
tm->tm_mntpath);
+
+ if (tm->tm_vfsp->vfs_zone->zone_id !=
+ GLOBAL_ZONEID) {
+
+ vfs_t *vfs = tm->tm_vfsp;
+
+ zcmn_err(GLOBAL_ZONEID,
+ CE_WARN, "%s: File system full, "
+ "swap space limit exceeded",
+ vfs->vfs_vnodecovered->v_path);
+ }
error = ENOSPC;
break;
}
diff --git a/usr/src/uts/common/os/modhash.c b/usr/src/uts/common/os/modhash.c
index 19700ce685..3c63231253 100644
--- a/usr/src/uts/common/os/modhash.c
+++ b/usr/src/uts/common/os/modhash.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -165,15 +164,6 @@
*/
#define MH_KEYCMP(hash, key1, key2) ((hash->mh_keycmp)(key1, key2))
-static void i_mod_hash_clear_nosync(mod_hash_t *);
-static int i_mod_hash_find_nosync(mod_hash_t *, mod_hash_key_t,
- mod_hash_val_t *);
-static int i_mod_hash_insert_nosync(mod_hash_t *, mod_hash_key_t,
- mod_hash_val_t, mod_hash_hndl_t);
-static int i_mod_hash_remove_nosync(mod_hash_t *, mod_hash_key_t,
- mod_hash_val_t *);
-static uint_t i_mod_hash(mod_hash_t *, mod_hash_key_t);
-
/*
* Cache for struct mod_hash_entry
*/
@@ -522,7 +512,7 @@ mod_hash_destroy_hash(mod_hash_t *hash)
* i_mod_hash()
* Call the hashing algorithm for this hash table, with the given key.
*/
-static uint_t
+uint_t
i_mod_hash(mod_hash_t *hash, mod_hash_key_t key)
{
uint_t h;
@@ -778,7 +768,7 @@ mod_hash_destroy(mod_hash_t *hash, mod_hash_key_t key)
* mod_hash_find()
* Find a value in the hash table corresponding to the given key.
*/
-static int
+int
i_mod_hash_find_nosync(mod_hash_t *hash, mod_hash_key_t key,
mod_hash_val_t *val)
{
@@ -826,7 +816,7 @@ mod_hash_find_cb(mod_hash_t *hash, mod_hash_key_t key, mod_hash_val_t *val,
return (res);
}
-static void
+void
i_mod_hash_walk_nosync(mod_hash_t *hash,
uint_t (*callback)(mod_hash_key_t, mod_hash_val_t *, void *), void *arg)
{
@@ -870,7 +860,7 @@ mod_hash_walk(mod_hash_t *hash,
* Clears the given hash table by calling the destructor of every hash
* element and freeing up all mod_hash_entry's.
*/
-static void
+void
i_mod_hash_clear_nosync(mod_hash_t *hash)
{
int i;
diff --git a/usr/src/uts/common/os/pid.c b/usr/src/uts/common/os/pid.c
index 88b0258afe..fecc4a6c45 100644
--- a/usr/src/uts/common/os/pid.c
+++ b/usr/src/uts/common/os/pid.c
@@ -385,6 +385,56 @@ pgfind(pid_t pgid)
}
/*
+ * Sets P_PR_LOCK on a non-system process. Process must be fully created
+ * and not exiting to succeed.
+ *
+ * Returns 0 on success.
+ * Returns 1 if P_PR_LOCK is set.
+ * Returns -1 if proc is in invalid state.
+ */
+int
+sprtrylock_proc(proc_t *p)
+{
+ ASSERT(MUTEX_HELD(&p->p_lock));
+
+ /* skip system and incomplete processes */
+ if (p->p_stat == SIDL || p->p_stat == SZOMB ||
+ (p->p_flag & (SSYS | SEXITING | SEXITLWPS))) {
+ return (-1);
+ }
+
+ if (p->p_proc_flag & P_PR_LOCK)
+ return (1);
+
+ p->p_proc_flag |= P_PR_LOCK;
+ THREAD_KPRI_REQUEST();
+
+ return (0);
+}
+
+/*
+ * Wait for P_PR_LOCK to become clear. Returns with p_lock dropped,
+ * and the proc pointer no longer valid, as the proc may have exited.
+ */
+void
+sprwaitlock_proc(proc_t *p)
+{
+ kmutex_t *mp;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(p->p_proc_flag & P_PR_LOCK);
+
+ /*
+ * p_lock is persistent, but p itself is not -- it could
+ * vanish during cv_wait(). Load p->p_lock now so we can
+ * drop it after cv_wait() without referencing p.
+ */
+ mp = &p->p_lock;
+ cv_wait(&pr_pid_cv[p->p_slot], mp);
+ mutex_exit(mp);
+}
+
+/*
* If pid exists, find its proc, acquire its p_lock and mark it P_PR_LOCK.
* Returns the proc pointer on success, NULL on failure. sprlock() is
* really just a stripped-down version of pr_p_lock() to allow practive
@@ -394,7 +444,7 @@ proc_t *
sprlock_zone(pid_t pid, zoneid_t zoneid)
{
proc_t *p;
- kmutex_t *mp;
+ int ret;
for (;;) {
mutex_enter(&pidlock);
@@ -402,31 +452,21 @@ sprlock_zone(pid_t pid, zoneid_t zoneid)
mutex_exit(&pidlock);
return (NULL);
}
- /*
- * p_lock is persistent, but p itself is not -- it could
- * vanish during cv_wait(). Load p->p_lock now so we can
- * drop it after cv_wait() without referencing p.
- */
- mp = &p->p_lock;
- mutex_enter(mp);
+ mutex_enter(&p->p_lock);
mutex_exit(&pidlock);
- /*
- * If the process is in some half-baked state, fail.
- */
- if (p->p_stat == SZOMB || p->p_stat == SIDL ||
- (p->p_flag & (SEXITING | SEXITLWPS))) {
- mutex_exit(mp);
- return (NULL);
- }
+
if (panicstr)
return (p);
- if (!(p->p_proc_flag & P_PR_LOCK))
+
+ ret = sprtrylock_proc(p);
+ if (ret == -1) {
+ mutex_exit(&p->p_lock);
+ return (NULL);
+ } else if (ret == 0) {
break;
- cv_wait(&pr_pid_cv[p->p_slot], mp);
- mutex_exit(mp);
+ }
+ sprwaitlock_proc(p);
}
- p->p_proc_flag |= P_PR_LOCK;
- THREAD_KPRI_REQUEST();
return (p);
}
diff --git a/usr/src/uts/common/os/pool.c b/usr/src/uts/common/os/pool.c
index ceb90850fa..818bb54701 100644
--- a/usr/src/uts/common/os/pool.c
+++ b/usr/src/uts/common/os/pool.c
@@ -293,6 +293,8 @@ pool_enable(void)
(void) nvlist_add_string(pool_sys_prop, "system.comment", "");
(void) nvlist_add_int64(pool_sys_prop, "system.version", 1);
(void) nvlist_add_byte(pool_sys_prop, "system.bind-default", 1);
+ (void) nvlist_add_string(pool_sys_prop, "system.poold.objectives",
+ "wt-load");
(void) nvlist_alloc(&pool_default->pool_props,
NV_UNIQUE_NAME, KM_SLEEP);
@@ -1309,7 +1311,7 @@ pool_do_bind(pool_t *pool, idtype_t idtype, id_t id, int flags)
}
if (idtype == P_PROJID) {
- kpj = project_hold_by_id(id, GLOBAL_ZONEID, PROJECT_HOLD_FIND);
+ kpj = project_hold_by_id(id, global_zone, PROJECT_HOLD_FIND);
if (kpj == NULL)
return (ESRCH);
mutex_enter(&kpj->kpj_poolbind);
diff --git a/usr/src/uts/common/os/project.c b/usr/src/uts/common/os/project.c
index 6c266c0ca3..d75b60f6e9 100644
--- a/usr/src/uts/common/os/project.c
+++ b/usr/src/uts/common/os/project.c
@@ -29,6 +29,7 @@
#include <sys/modhash.h>
#include <sys/modctl.h>
#include <sys/kmem.h>
+#include <sys/kstat.h>
#include <sys/atomic.h>
#include <sys/cmn_err.h>
#include <sys/proc.h>
@@ -103,6 +104,8 @@ struct project_zone {
* acquired, the hash lock is to be acquired first.
*/
+static kstat_t *project_kstat_create(kproject_t *pj, zone_t *zone);
+static void project_kstat_delete(kproject_t *pj);
static void
project_data_init(kproject_data_t *data)
@@ -118,6 +121,7 @@ project_data_init(kproject_data_t *data)
data->kpd_locked_mem_ctl = UINT64_MAX;
data->kpd_contract = 0;
data->kpd_crypto_mem = 0;
+ data->kpd_lockedmem_kstat = NULL;
}
/*ARGSUSED*/
@@ -179,11 +183,11 @@ project_hold(kproject_t *p)
}
/*
- * kproject_t *project_hold_by_id(projid_t, zoneid_t, int)
+ * kproject_t *project_hold_by_id(projid_t, zone_t *, int)
*
* Overview
* project_hold_by_id() performs a look-up in the dictionary of projects
- * active on the system by specified project ID + zone ID and puts a hold on
+ * active on the system by specified project ID + zone and puts a hold on
* it. The third argument defines the desired behavior in the case when
* project with given project ID cannot be found:
*
@@ -202,7 +206,7 @@ project_hold(kproject_t *p)
* Caller must be in a context suitable for KM_SLEEP allocations.
*/
kproject_t *
-project_hold_by_id(projid_t id, zoneid_t zoneid, int flag)
+project_hold_by_id(projid_t id, zone_t *zone, int flag)
{
kproject_t *spare_p;
kproject_t *p;
@@ -211,9 +215,11 @@ project_hold_by_id(projid_t id, zoneid_t zoneid, int flag)
rctl_alloc_gp_t *gp;
rctl_entity_p_t e;
struct project_zone pz;
+ boolean_t create = B_FALSE;
+ kstat_t *ksp;
pz.kpj_id = id;
- pz.kpj_zoneid = zoneid;
+ pz.kpj_zoneid = zone->zone_id;
if (flag == PROJECT_HOLD_FIND) {
mutex_enter(&project_hash_lock);
@@ -241,9 +247,10 @@ project_hold_by_id(projid_t id, zoneid_t zoneid, int flag)
mutex_enter(&project_hash_lock);
if (mod_hash_find(projects_hash, (mod_hash_key_t)&pz,
(mod_hash_val_t *)&p) == MH_ERR_NOTFOUND) {
+
p = spare_p;
p->kpj_id = id;
- p->kpj_zoneid = zoneid;
+ p->kpj_zoneid = zone->zone_id;
p->kpj_count = 0;
p->kpj_shares = 1;
p->kpj_nlwps = 0;
@@ -265,7 +272,7 @@ project_hold_by_id(projid_t id, zoneid_t zoneid, int flag)
* Insert project into global project list.
*/
mutex_enter(&projects_list_lock);
- if (id != 0 || zoneid != GLOBAL_ZONEID) {
+ if (id != 0 || zone != &zone0) {
p->kpj_next = projects_list;
p->kpj_prev = projects_list->kpj_prev;
p->kpj_prev->kpj_next = p;
@@ -279,6 +286,7 @@ project_hold_by_id(projid_t id, zoneid_t zoneid, int flag)
projects_list = p;
}
mutex_exit(&projects_list_lock);
+ create = B_TRUE;
} else {
mutex_exit(&curproc->p_lock);
mod_hash_cancel(projects_hash, &hndl);
@@ -290,10 +298,20 @@ project_hold_by_id(projid_t id, zoneid_t zoneid, int flag)
p->kpj_count++;
mutex_exit(&project_hash_lock);
+ /*
+ * The kstat stores the project's zone name, as zoneids may change
+ * across reboots.
+ */
+ if (create == B_TRUE) {
+ ksp = project_kstat_create(p, zone);
+ mutex_enter(&project_hash_lock);
+ ASSERT(p->kpj_data.kpd_lockedmem_kstat == NULL);
+ p->kpj_data.kpd_lockedmem_kstat = ksp;
+ mutex_exit(&project_hash_lock);
+ }
return (p);
}
-
/*
* void project_rele(kproject_t *)
*
@@ -325,6 +343,7 @@ project_rele(kproject_t *p)
mutex_exit(&projects_list_lock);
rctl_set_free(p->kpj_rctls);
+ project_kstat_delete(p);
if (mod_hash_destroy(projects_hash, (mod_hash_key_t)p))
panic("unable to delete project %d zone %d", p->kpj_id,
@@ -636,9 +655,9 @@ project_locked_mem_usage(rctl_t *rctl, struct proc *p)
{
rctl_qty_t q;
ASSERT(MUTEX_HELD(&p->p_lock));
- mutex_enter(&p->p_zone->zone_rctl_lock);
+ mutex_enter(&p->p_zone->zone_mem_lock);
q = p->p_task->tk_proj->kpj_data.kpd_locked_mem;
- mutex_exit(&p->p_zone->zone_rctl_lock);
+ mutex_exit(&p->p_zone->zone_mem_lock);
return (q);
}
@@ -649,7 +668,7 @@ project_locked_mem_test(struct rctl *rctl, struct proc *p, rctl_entity_p_t *e,
{
rctl_qty_t q;
ASSERT(MUTEX_HELD(&p->p_lock));
- ASSERT(MUTEX_HELD(&p->p_zone->zone_rctl_lock));
+ ASSERT(MUTEX_HELD(&p->p_zone->zone_mem_lock));
q = p->p_task->tk_proj->kpj_data.kpd_locked_mem;
if (q + inc > rval->rcv_value)
return (1);
@@ -868,7 +887,7 @@ project_init(void)
rctl_add_default_limit("project.max-contracts", 10000,
RCPRIV_PRIVILEGED, RCTL_LOCAL_DENY);
- t0.t_proj = proj0p = project_hold_by_id(0, GLOBAL_ZONEID,
+ t0.t_proj = proj0p = project_hold_by_id(0, &zone0,
PROJECT_HOLD_INSERT);
mutex_enter(&p0.p_lock);
@@ -876,3 +895,57 @@ project_init(void)
mutex_exit(&p0.p_lock);
proj0p->kpj_ntasks = 1;
}
+
+static int
+project_lockedmem_kstat_update(kstat_t *ksp, int rw)
+{
+ kproject_t *pj = ksp->ks_private;
+ kproject_kstat_t *kpk = ksp->ks_data;
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ kpk->kpk_usage.value.ui64 = pj->kpj_data.kpd_locked_mem;
+ kpk->kpk_value.value.ui64 = pj->kpj_data.kpd_locked_mem_ctl;
+ return (0);
+}
+
+static kstat_t *
+project_kstat_create(kproject_t *pj, zone_t *zone)
+{
+ kstat_t *ksp;
+ kproject_kstat_t *kpk;
+ char *zonename = zone->zone_name;
+
+ ksp = rctl_kstat_create_project(pj, "lockedmem", KSTAT_TYPE_NAMED,
+ sizeof (kproject_kstat_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+
+ if (ksp == NULL)
+ return (NULL);
+
+ kpk = ksp->ks_data = kmem_alloc(sizeof (kproject_kstat_t), KM_SLEEP);
+ ksp->ks_data_size += strlen(zonename) + 1;
+ kstat_named_init(&kpk->kpk_zonename, "zonename", KSTAT_DATA_STRING);
+ kstat_named_setstr(&kpk->kpk_zonename, zonename);
+ kstat_named_init(&kpk->kpk_usage, "usage", KSTAT_DATA_UINT64);
+ kstat_named_init(&kpk->kpk_value, "value", KSTAT_DATA_UINT64);
+ ksp->ks_update = project_lockedmem_kstat_update;
+ ksp->ks_private = pj;
+ kstat_install(ksp);
+
+ return (ksp);
+}
+
+static void
+project_kstat_delete(kproject_t *pj)
+{
+ void *data;
+
+ if (pj->kpj_data.kpd_lockedmem_kstat != NULL) {
+ data = pj->kpj_data.kpd_lockedmem_kstat->ks_data;
+ kstat_delete(pj->kpj_data.kpd_lockedmem_kstat);
+ kmem_free(data, sizeof (kproject_kstat_t));
+ }
+ pj->kpj_data.kpd_lockedmem_kstat = NULL;
+}
diff --git a/usr/src/uts/common/os/rctl.c b/usr/src/uts/common/os/rctl.c
index 4de4c74fe8..c0479005ea 100644
--- a/usr/src/uts/common/os/rctl.c
+++ b/usr/src/uts/common/os/rctl.c
@@ -29,6 +29,7 @@
#include <sys/cmn_err.h>
#include <sys/id_space.h>
#include <sys/kmem.h>
+#include <sys/kstat.h>
#include <sys/log.h>
#include <sys/modctl.h>
#include <sys/modhash.h>
@@ -2599,7 +2600,7 @@ rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,
zonep = p->p_zone;
}
- mutex_enter(&zonep->zone_rctl_lock);
+ mutex_enter(&zonep->zone_mem_lock);
e.rcep_p.proj = projp;
e.rcep_t = RCENTITY_PROJECT;
@@ -2627,7 +2628,7 @@ rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,
p->p_locked_mem += inc;
}
out:
- mutex_exit(&zonep->zone_rctl_lock);
+ mutex_exit(&zonep->zone_mem_lock);
if (proj != NULL)
zone_rele(zonep);
return (ret);
@@ -2661,7 +2662,7 @@ rctl_decr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,
zonep = p->p_zone;
}
- mutex_enter(&zonep->zone_rctl_lock);
+ mutex_enter(&zonep->zone_mem_lock);
zonep->zone_locked_mem -= inc;
projp->kpj_data.kpd_locked_mem -= inc;
if (creditproc != 0) {
@@ -2669,7 +2670,120 @@ rctl_decr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,
ASSERT(MUTEX_HELD(&p->p_lock));
p->p_locked_mem -= inc;
}
- mutex_exit(&zonep->zone_rctl_lock);
+ mutex_exit(&zonep->zone_mem_lock);
if (proj != NULL)
zone_rele(zonep);
}
+
+/*
+ * rctl_incr_swap(proc_t *, zone_t *, size_t)
+ *
+ * Overview
+ * Increments the swap charge on the specified zone.
+ *
+ * Return values
+ * 0 on success. EAGAIN if swap increment fails due to an rctl value
+ * on the zone.
+ *
+ * Callers context
+ * p_lock held on specified proc.
+ * swap must be even multiple of PAGESIZE
+ */
+int
+rctl_incr_swap(proc_t *proc, zone_t *zone, size_t swap)
+{
+ rctl_entity_p_t e;
+
+ ASSERT(MUTEX_HELD(&proc->p_lock));
+ ASSERT((swap & PAGEOFFSET) == 0);
+ e.rcep_p.zone = zone;
+ e.rcep_t = RCENTITY_ZONE;
+
+ mutex_enter(&zone->zone_mem_lock);
+
+ if ((zone->zone_max_swap + swap) >
+ zone->zone_max_swap_ctl) {
+
+ if (rctl_test_entity(rc_zone_max_swap, zone->zone_rctls,
+ proc, &e, swap, 0) & RCT_DENY) {
+ mutex_exit(&zone->zone_mem_lock);
+ return (EAGAIN);
+ }
+ }
+ zone->zone_max_swap += swap;
+ mutex_exit(&zone->zone_mem_lock);
+ return (0);
+}
+
+/*
+ * rctl_decr_swap(zone_t *, size_t)
+ *
+ * Overview
+ * Decrements the swap charge on the specified zone.
+ *
+ * Return values
+ * None
+ *
+ * Callers context
+ * swap must be even multiple of PAGESIZE
+ */
+void
+rctl_decr_swap(zone_t *zone, size_t swap)
+{
+ ASSERT((swap & PAGEOFFSET) == 0);
+ mutex_enter(&zone->zone_mem_lock);
+ ASSERT(zone->zone_max_swap >= swap);
+ zone->zone_max_swap -= swap;
+ mutex_exit(&zone->zone_mem_lock);
+}
+
+/*
+ * Create resource kstat
+ */
+static kstat_t *
+rctl_kstat_create_common(char *ks_name, int ks_instance, char *ks_class,
+ uchar_t ks_type, uint_t ks_ndata, uchar_t ks_flags, int ks_zoneid)
+{
+ kstat_t *ksp = NULL;
+ char name[KSTAT_STRLEN];
+
+ (void) snprintf(name, KSTAT_STRLEN, "%s_%d", ks_name, ks_instance);
+
+ if ((ksp = kstat_create_zone("caps", ks_zoneid,
+ name, ks_class, ks_type,
+ ks_ndata, ks_flags, ks_zoneid)) != NULL) {
+ if (ks_zoneid != GLOBAL_ZONEID)
+ kstat_zone_add(ksp, GLOBAL_ZONEID);
+ }
+ return (ksp);
+}
+
+/*
+ * Create zone-specific resource kstat
+ */
+kstat_t *
+rctl_kstat_create_zone(zone_t *zone, char *ks_name, uchar_t ks_type,
+ uint_t ks_ndata, uchar_t ks_flags)
+{
+ char name[KSTAT_STRLEN];
+
+ (void) snprintf(name, KSTAT_STRLEN, "%s_zone", ks_name);
+
+ return (rctl_kstat_create_common(name, zone->zone_id, "zone_caps",
+ ks_type, ks_ndata, ks_flags, zone->zone_id));
+}
+
+/*
+ * Create project-specific resource kstat
+ */
+kstat_t *
+rctl_kstat_create_project(kproject_t *kpj, char *ks_name, uchar_t ks_type,
+ uint_t ks_ndata, uchar_t ks_flags)
+{
+ char name[KSTAT_STRLEN];
+
+ (void) snprintf(name, KSTAT_STRLEN, "%s_project", ks_name);
+
+ return (rctl_kstat_create_common(name, kpj->kpj_id, "project_caps",
+ ks_type, ks_ndata, ks_flags, kpj->kpj_zoneid));
+}
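
rctl_kstat_create_common() publishes these counters as named kstats under the "caps" module, using the owning zone's ID as the instance and names of the form "<resource>_zone_<zoneid>" or "<resource>_project_<projid>". A hedged libkstat sketch reading the global zone's swap reservation (the kstat name and instance below are inferred from the naming code above, not taken verbatim from this diff):

/* Sketch: read the zone swap-reservation cap via libkstat. */
#include <kstat.h>
#include <stdio.h>

int
main(void)
{
	kstat_ctl_t *kc;
	kstat_t *ksp;
	kstat_named_t *kn;

	if ((kc = kstat_open()) == NULL) {
		perror("kstat_open");
		return (1);
	}
	/* "swapresv_zone_0": assumes zone ID 0, i.e. the global zone */
	if ((ksp = kstat_lookup(kc, "caps", 0, "swapresv_zone_0")) == NULL ||
	    kstat_read(kc, ksp, NULL) == -1) {
		(void) fprintf(stderr, "caps kstat not found\n");
		(void) kstat_close(kc);
		return (1);
	}
	if ((kn = kstat_data_lookup(ksp, "usage")) != NULL)
		(void) printf("swap reserved: %llu bytes\n",
		    (unsigned long long)kn->value.ui64);
	(void) kstat_close(kc);
	return (0);
}
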
diff --git a/usr/src/uts/common/os/schedctl.c b/usr/src/uts/common/os/schedctl.c
index 66aae7d2bc..62279e0777 100644
--- a/usr/src/uts/common/os/schedctl.c
+++ b/usr/src/uts/common/os/schedctl.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -542,13 +541,13 @@ schedctl_getpage(struct anon_map **newamp, caddr_t *newaddr)
* Set up anonymous memory struct. No swap reservation is
* needed since the page will be locked into memory.
*/
- amp = anonmap_alloc(PAGESIZE, PAGESIZE);
+ amp = anonmap_alloc(PAGESIZE, 0);
/*
* Allocate the page.
*/
- kaddr = segkp_get_withanonmap(segkp, PAGESIZE, KPD_LOCKED | KPD_ZERO,
- amp);
+ kaddr = segkp_get_withanonmap(segkp, PAGESIZE,
+ KPD_NO_ANON | KPD_LOCKED | KPD_ZERO, amp);
if (kaddr == NULL) {
amp->refcnt--;
anonmap_free(amp);
diff --git a/usr/src/uts/common/os/sysent.c b/usr/src/uts/common/os/sysent.c
index 9ada0aac18..a7ef99fddb 100644
--- a/usr/src/uts/common/os/sysent.c
+++ b/usr/src/uts/common/os/sysent.c
@@ -666,7 +666,7 @@ struct sysent sysent[NSYSCALL] =
/* 178 */ SYSENT_LOADABLE(), /* kaio */
/* 179 */ SYSENT_LOADABLE(), /* cpc */
/* 180 */ SYSENT_CI("lgrpsys", lgrpsys, 3),
- /* 181 */ SYSENT_CI("rusagesys", rusagesys, 2),
+ /* 181 */ SYSENT_CI("rusagesys", rusagesys, 5),
/* 182 */ SYSENT_LOADABLE(), /* portfs */
/* 183 */ SYSENT_CI("pollsys", pollsys, 4),
/* 184 */ SYSENT_CI("labelsys", labelsys, 5),
@@ -1044,7 +1044,7 @@ struct sysent sysent32[NSYSCALL] =
/* 178 */ SYSENT_LOADABLE32(), /* kaio */
/* 179 */ SYSENT_LOADABLE32(), /* cpc */
/* 180 */ SYSENT_CI("lgrpsys", lgrpsys, 3),
- /* 181 */ SYSENT_CI("rusagesys", rusagesys, 2),
+ /* 181 */ SYSENT_CI("rusagesys", rusagesys, 5),
/* 182 */ SYSENT_LOADABLE32(), /* portfs */
/* 183 */ SYSENT_CI("pollsys", pollsys, 4),
/* 184 */ SYSENT_CI("labelsys", labelsys, 5),
diff --git a/usr/src/uts/common/os/task.c b/usr/src/uts/common/os/task.c
index 562e3596b5..785f74c145 100644
--- a/usr/src/uts/common/os/task.c
+++ b/usr/src/uts/common/os/task.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -389,7 +388,7 @@ task_create(projid_t projid, zone_t *zone)
tk->tk_nlwps = 0;
tk->tk_nlwps_ctl = INT_MAX;
tk->tk_usage = tu;
- tk->tk_proj = project_hold_by_id(projid, zone->zone_id,
+ tk->tk_proj = project_hold_by_id(projid, zone,
PROJECT_HOLD_INSERT);
tk->tk_flags = TASK_NORMAL;
@@ -848,7 +847,7 @@ task_init(void)
task0p->tk_tkid = id_alloc(taskid_space);
task0p->tk_usage = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP);
- task0p->tk_proj = project_hold_by_id(0, GLOBAL_ZONEID,
+ task0p->tk_proj = project_hold_by_id(0, &zone0,
PROJECT_HOLD_INSERT);
task0p->tk_flags = TASK_NORMAL;
task0p->tk_nlwps = p->p_lwpcnt;
diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c
index 0fb2c2be55..19ea8b31f1 100644
--- a/usr/src/uts/common/os/zone.c
+++ b/usr/src/uts/common/os/zone.c
@@ -154,6 +154,10 @@
* zone_lock: This is a per-zone lock used to protect several fields of
* the zone_t (see <sys/zone.h> for details). In addition, holding
* this lock means that the zone cannot go away.
+ * zone_nlwps_lock: This is a per-zone lock used to protect the fields
+ * related to the zone.max-lwps rctl.
+ * zone_mem_lock: This is a per-zone lock used to protect the fields
+ * related to the zone.max-locked-memory and zone.max-swap rctls.
* zsd_key_lock: This is a global lock protecting the key state for ZSD.
* zone_deathrow_lock: This is a global lock protecting the "deathrow"
* list (a list of zones in the ZONE_IS_DEAD state).
@@ -162,6 +166,10 @@
* pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
* zone_lock --> zsd_key_lock --> pidlock --> p_lock
*
+ * When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
+ * zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
+ * zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
+ *
* Blocking memory allocations are permitted while holding any of the
* zone locks.
*
@@ -190,6 +198,7 @@
#include <sys/debug.h>
#include <sys/file.h>
#include <sys/kmem.h>
+#include <sys/kstat.h>
#include <sys/mutex.h>
#include <sys/note.h>
#include <sys/pathname.h>
@@ -232,6 +241,8 @@
#include <sys/zone.h>
#include <sys/tsol/label.h>
+#include <vm/seg.h>
+
/*
* cv used to signal that all references to the zone have been released. This
* needs to be global since there may be multiple waiters, and the first to
@@ -317,6 +328,7 @@ const char *zone_status_table[] = {
*/
rctl_hndl_t rc_zone_cpu_shares;
rctl_hndl_t rc_zone_locked_mem;
+rctl_hndl_t rc_zone_max_swap;
rctl_hndl_t rc_zone_nlwps;
rctl_hndl_t rc_zone_shmmax;
rctl_hndl_t rc_zone_shmmni;
@@ -1011,9 +1023,9 @@ zone_locked_mem_usage(rctl_t *rctl, struct proc *p)
{
rctl_qty_t q;
ASSERT(MUTEX_HELD(&p->p_lock));
- mutex_enter(&p->p_zone->zone_rctl_lock);
+ mutex_enter(&p->p_zone->zone_mem_lock);
q = p->p_zone->zone_locked_mem;
- mutex_exit(&p->p_zone->zone_rctl_lock);
+ mutex_exit(&p->p_zone->zone_mem_lock);
return (q);
}
@@ -1023,9 +1035,12 @@ zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
{
rctl_qty_t q;
+ zone_t *z;
+
+ z = e->rcep_p.zone;
ASSERT(MUTEX_HELD(&p->p_lock));
- ASSERT(MUTEX_HELD(&p->p_zone->zone_rctl_lock));
- q = p->p_zone->zone_locked_mem;
+ ASSERT(MUTEX_HELD(&z->zone_mem_lock));
+ q = z->zone_locked_mem;
if (q + incr > rcntl->rcv_value)
return (1);
return (0);
@@ -1051,6 +1066,57 @@ static rctl_ops_t zone_locked_mem_ops = {
zone_locked_mem_test
};
+/*ARGSUSED*/
+static rctl_qty_t
+zone_max_swap_usage(rctl_t *rctl, struct proc *p)
+{
+ rctl_qty_t q;
+ zone_t *z = p->p_zone;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ mutex_enter(&z->zone_mem_lock);
+ q = z->zone_max_swap;
+ mutex_exit(&z->zone_mem_lock);
+ return (q);
+}
+
+/*ARGSUSED*/
+static int
+zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
+ rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
+{
+ rctl_qty_t q;
+ zone_t *z;
+
+ z = e->rcep_p.zone;
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(MUTEX_HELD(&z->zone_mem_lock));
+ q = z->zone_max_swap;
+ if (q + incr > rcntl->rcv_value)
+ return (1);
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+ rctl_qty_t nv)
+{
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(e->rcep_t == RCENTITY_ZONE);
+ if (e->rcep_p.zone == NULL)
+ return (0);
+ e->rcep_p.zone->zone_max_swap_ctl = nv;
+ return (0);
+}
+
+static rctl_ops_t zone_max_swap_ops = {
+ rcop_no_action,
+ zone_max_swap_usage,
+ zone_max_swap_set,
+ zone_max_swap_test
+};
+
/*
* Helper function to brand the zone with a unique ID.
*/
@@ -1080,6 +1146,96 @@ zone_get_kcred(zoneid_t zoneid)
return (cr);
}
+static int
+zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
+{
+ zone_t *zone = ksp->ks_private;
+ zone_kstat_t *zk = ksp->ks_data;
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ zk->zk_usage.value.ui64 = zone->zone_locked_mem;
+ zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
+ return (0);
+}
+
+static int
+zone_swapresv_kstat_update(kstat_t *ksp, int rw)
+{
+ zone_t *zone = ksp->ks_private;
+ zone_kstat_t *zk = ksp->ks_data;
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ zk->zk_usage.value.ui64 = zone->zone_max_swap;
+ zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
+ return (0);
+}
+
+static void
+zone_kstat_create(zone_t *zone)
+{
+ kstat_t *ksp;
+ zone_kstat_t *zk;
+
+ ksp = rctl_kstat_create_zone(zone, "lockedmem", KSTAT_TYPE_NAMED,
+ sizeof (zone_kstat_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+
+ if (ksp == NULL)
+ return;
+
+ zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
+ ksp->ks_data_size += strlen(zone->zone_name) + 1;
+ kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
+ kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
+ kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
+ kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
+ ksp->ks_update = zone_lockedmem_kstat_update;
+ ksp->ks_private = zone;
+ kstat_install(ksp);
+
+ zone->zone_lockedmem_kstat = ksp;
+
+ ksp = rctl_kstat_create_zone(zone, "swapresv", KSTAT_TYPE_NAMED,
+ sizeof (zone_kstat_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+
+ if (ksp == NULL)
+ return;
+
+ zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
+ ksp->ks_data_size += strlen(zone->zone_name) + 1;
+ kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
+ kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
+ kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
+ kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
+ ksp->ks_update = zone_swapresv_kstat_update;
+ ksp->ks_private = zone;
+ kstat_install(ksp);
+
+ zone->zone_swapresv_kstat = ksp;
+}
+
+static void
+zone_kstat_delete(zone_t *zone)
+{
+ void *data;
+
+ if (zone->zone_lockedmem_kstat != NULL) {
+ data = zone->zone_lockedmem_kstat->ks_data;
+ kstat_delete(zone->zone_lockedmem_kstat);
+ kmem_free(data, sizeof (zone_kstat_t));
+ }
+ if (zone->zone_swapresv_kstat != NULL) {
+ data = zone->zone_swapresv_kstat->ks_data;
+ kstat_delete(zone->zone_swapresv_kstat);
+ kmem_free(data, sizeof (zone_kstat_t));
+ }
+}
+
/*
* Called very early on in boot to initialize the ZSD list so that
* zone_key_create() can be called before zone_init(). It also initializes
@@ -1101,8 +1257,14 @@ zone_zsd_init(void)
mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
zone0.zone_shares = 1;
+ zone0.zone_nlwps = 0;
zone0.zone_nlwps_ctl = INT_MAX;
+ zone0.zone_locked_mem = 0;
+ zone0.zone_locked_mem_ctl = UINT64_MAX;
+ ASSERT(zone0.zone_max_swap == 0);
+ zone0.zone_max_swap_ctl = UINT64_MAX;
zone0.zone_shmmax = 0;
zone0.zone_ipc.ipcq_shmmni = 0;
zone0.zone_ipc.ipcq_semmni = 0;
@@ -1120,6 +1282,8 @@ zone_zsd_init(void)
zone0.zone_ncpus_online = 0;
zone0.zone_proc_initpid = 1;
zone0.zone_initname = initname;
+ zone0.zone_lockedmem_kstat = NULL;
+ zone0.zone_swapresv_kstat = NULL;
list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
offsetof(struct zsd_entry, zsd_linkage));
list_insert_head(&zone_active, &zone0);
@@ -1259,6 +1423,12 @@ zone_init(void)
RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
&zone_locked_mem_ops);
+
+ rc_zone_max_swap = rctl_register("zone.max-swap",
+ RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
+ RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
+ &zone_max_swap_ops);
+
/*
* Initialize the ``global zone''.
*/
@@ -1277,9 +1447,14 @@ zone_init(void)
zone0.zone_brand = &native_brand;
rctl_prealloc_destroy(gp);
/*
- * pool_default hasn't been initialized yet, so we let pool_init() take
- * care of making the global zone is in the default pool.
+ * pool_default hasn't been initialized yet, so we let pool_init()
+ * take care of making sure the global zone is in the default pool.
+ */
+
+ /*
+ * Initialize global zone kstats
*/
+ zone_kstat_create(&zone0);
/*
* Initialize zone label.
@@ -1337,6 +1512,7 @@ zone_init(void)
if (res)
panic("Sysevent_evc_bind failed during zone setup.\n");
+
}
static void
@@ -1476,6 +1652,38 @@ zone_set_initname(zone_t *zone, const char *zone_initname)
return (0);
}
+static int
+zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
+{
+ uint64_t mcap;
+ int err = 0;
+
+ if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
+ zone->zone_phys_mcap = mcap;
+
+ return (err);
+}
+
+static int
+zone_set_sched_class(zone_t *zone, const char *new_class)
+{
+ char sched_class[PC_CLNMSZ];
+ id_t classid;
+ int err;
+
+ ASSERT(zone != global_zone);
+ if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
+ return (err); /* EFAULT or ENAMETOOLONG */
+
+ if (getcid(sched_class, &classid) != 0 || classid == syscid)
+ return (set_errno(EINVAL));
+ zone->zone_defaultcid = classid;
+ ASSERT(zone->zone_defaultcid > 0 &&
+ zone->zone_defaultcid < loaded_classes);
+
+ return (0);
+}
+
/*
* Block indefinitely waiting for (zone_status >= status)
*/
@@ -2510,10 +2718,10 @@ zsched(void *arg)
/*
* Decrement locked memory counts on old zone and project.
*/
- mutex_enter(&global_zone->zone_rctl_lock);
+ mutex_enter(&global_zone->zone_mem_lock);
global_zone->zone_locked_mem -= pp->p_locked_mem;
pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
- mutex_exit(&global_zone->zone_rctl_lock);
+ mutex_exit(&global_zone->zone_mem_lock);
/*
* Create and join a new task in project '0' of this zone.
@@ -2529,10 +2737,10 @@ zsched(void *arg)
pj = pp->p_task->tk_proj;
- mutex_enter(&zone->zone_rctl_lock);
+ mutex_enter(&zone->zone_mem_lock);
zone->zone_locked_mem += pp->p_locked_mem;
pj->kpj_data.kpd_locked_mem += pp->p_locked_mem;
- mutex_exit(&zone->zone_rctl_lock);
+ mutex_exit(&zone->zone_mem_lock);
/*
* add lwp counts to zsched's zone, and increment project's task count
@@ -2689,7 +2897,10 @@ zsched(void *arg)
* classid 'cid'.
*/
pool_lock();
- cid = pool_get_class(zone->zone_pool);
+ if (zone->zone_defaultcid > 0)
+ cid = zone->zone_defaultcid;
+ else
+ cid = pool_get_class(zone->zone_pool);
if (cid == -1)
cid = defaultcid;
@@ -3019,7 +3230,7 @@ zone_create(const char *zone_name, const char *zone_root,
zone->zone_initname = NULL;
mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&zone->zone_rctl_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
offsetof(struct zsd_entry, zsd_linkage));
@@ -3057,8 +3268,14 @@ zone_create(const char *zone_name, const char *zone_root,
zone->zone_initname =
kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
(void) strcpy(zone->zone_initname, zone_default_initname);
+ zone->zone_nlwps = 0;
+ zone->zone_nlwps_ctl = INT_MAX;
zone->zone_locked_mem = 0;
zone->zone_locked_mem_ctl = UINT64_MAX;
+ zone->zone_max_swap = 0;
+ zone->zone_max_swap_ctl = UINT64_MAX;
+ zone->zone_lockedmem_kstat = NULL;
+ zone->zone_swapresv_kstat = NULL;
/*
* Zsched initializes the rctls.
@@ -3233,6 +3450,11 @@ zone_create(const char *zone_name, const char *zone_root,
*/
/*
+ * Create zone kstats
+ */
+ zone_kstat_create(zone);
+
+ /*
* Let the other lwps continue.
*/
mutex_enter(&pp->p_lock);
@@ -3643,6 +3865,9 @@ zone_destroy(zoneid_t zoneid)
}
+ /* Get rid of the zone's kstats */
+ zone_kstat_delete(zone);
+
/*
* It is now safe to let the zone be recreated; remove it from the
* lists. The memory will not be freed until the last cred
@@ -3892,6 +4117,32 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
error = EFAULT;
}
break;
+ case ZONE_ATTR_PHYS_MCAP:
+ size = sizeof (zone->zone_phys_mcap);
+ if (bufsize > size)
+ bufsize = size;
+ if (buf != NULL &&
+ copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
+ error = EFAULT;
+ break;
+ case ZONE_ATTR_SCHED_CLASS:
+ mutex_enter(&class_lock);
+
+ if (zone->zone_defaultcid >= loaded_classes)
+ outstr = "";
+ else
+ outstr = sclass[zone->zone_defaultcid].cl_name;
+ size = strlen(outstr) + 1;
+ if (bufsize > size)
+ bufsize = size;
+ if (buf != NULL) {
+ err = copyoutstr(outstr, buf, bufsize, NULL);
+ if (err != 0 && err != ENAMETOOLONG)
+ error = EFAULT;
+ }
+
+ mutex_exit(&class_lock);
+ break;
default:
if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
size = bufsize;
@@ -3923,10 +4174,10 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
return (set_errno(EPERM));
/*
- * At present, attributes can only be set on non-running,
- * non-global zones.
+ * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
+ * global zone.
*/
- if (zoneid == GLOBAL_ZONEID) {
+ if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
return (set_errno(EINVAL));
}
@@ -3938,8 +4189,12 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
zone_hold(zone);
mutex_exit(&zonehash_lock);
+ /*
+ * At present most attributes can only be set on non-running,
+ * non-global zones.
+ */
zone_status = zone_status_get(zone);
- if (zone_status > ZONE_IS_READY)
+ if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY)
goto done;
switch (attr) {
@@ -3971,6 +4226,12 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
if (zone->zone_brand == NULL)
err = EINVAL;
break;
+ case ZONE_ATTR_PHYS_MCAP:
+ err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
+ break;
+ case ZONE_ATTR_SCHED_CLASS:
+ err = zone_set_sched_class(zone, (const char *)buf);
+ break;
default:
if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
@@ -3986,6 +4247,11 @@ done:
/*
* Return zero if the process has at least one vnode mapped in to its
* address space which shouldn't be allowed to change zones.
+ *
+ * Also return zero if the process has any shared mappings which reserve
+ * swap. This is because the counting for zone.max-swap does not allow swap
+ * reservation to be shared between zones. Zone swap reservation is counted
+ * on zone->zone_max_swap.
*/
static int
as_can_change_zones(void)
@@ -3997,8 +4263,17 @@ as_can_change_zones(void)
int allow = 1;
ASSERT(pp->p_as != &kas);
- AS_LOCK_ENTER(&as, &as->a_lock, RW_READER);
+ AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
+
+ /*
+ * Cannot enter zone with shared anon memory which
+ * reserves swap. See comment above.
+ */
+ if (seg_can_change_zones(seg) == B_FALSE) {
+ allow = 0;
+ break;
+ }
/*
* if we can't get a backing vnode for this segment then skip
* it.
@@ -4011,11 +4286,30 @@ as_can_change_zones(void)
break;
}
}
- AS_LOCK_EXIT(&as, &as->a_lock);
+ AS_LOCK_EXIT(as, &as->a_lock);
return (allow);
}
/*
+ * Count swap reserved by curproc's address space
+ */
+static size_t
+as_swresv(void)
+{
+ proc_t *pp = curproc;
+ struct seg *seg;
+ struct as *as = pp->p_as;
+ size_t swap = 0;
+
+ ASSERT(pp->p_as != &kas);
+ ASSERT(AS_WRITE_HELD(as, &as->a_lock));
+ for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
+ swap += seg_swresv(seg);
+
+ return (swap);
+}
+
+/*
* Systemcall entry point for zone_enter().
*
* The current process is injected into said zone. In the process
@@ -4043,6 +4337,7 @@ zone_enter(zoneid_t zoneid)
zone_status_t status;
int err = 0;
rctl_entity_p_t e;
+ size_t swap;
if (secpolicy_zone_config(CRED()) != 0)
return (set_errno(EPERM));
@@ -4205,6 +4500,15 @@ zone_enter(zoneid_t zoneid)
goto out;
}
+ /*
+ * a_lock must be held while transferring locked memory and swap
+ * reservation from the global zone to the non-global zone because
+ * asynchronous faults on the process's address space can lock
+ * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE
+ * segments respectively.
+ */
+ AS_LOCK_ENTER(pp->p_as, &pp->p_as->a_lock, RW_WRITER);
+ swap = as_swresv();
mutex_enter(&pp->p_lock);
zone_proj0 = zone->zone_zsched->p_task->tk_proj;
/* verify that we do not exceed and task or lwp limits */
@@ -4216,10 +4520,11 @@ zone_enter(zoneid_t zoneid)
zone_proj0->kpj_ntasks += 1;
mutex_exit(&zone->zone_nlwps_lock);
- mutex_enter(&zone->zone_rctl_lock);
+ mutex_enter(&zone->zone_mem_lock);
zone->zone_locked_mem += pp->p_locked_mem;
zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem;
- mutex_exit(&zone->zone_rctl_lock);
+ zone->zone_max_swap += swap;
+ mutex_exit(&zone->zone_mem_lock);
/* remove lwps from proc's old zone and old project */
mutex_enter(&pp->p_zone->zone_nlwps_lock);
@@ -4227,12 +4532,14 @@ zone_enter(zoneid_t zoneid)
pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
mutex_exit(&pp->p_zone->zone_nlwps_lock);
- mutex_enter(&pp->p_zone->zone_rctl_lock);
+ mutex_enter(&pp->p_zone->zone_mem_lock);
pp->p_zone->zone_locked_mem -= pp->p_locked_mem;
pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
- mutex_exit(&pp->p_zone->zone_rctl_lock);
+ pp->p_zone->zone_max_swap -= swap;
+ mutex_exit(&pp->p_zone->zone_mem_lock);
mutex_exit(&pp->p_lock);
+ AS_LOCK_EXIT(pp->p_as, &pp->p_as->a_lock);
/*
* Joining the zone cannot fail from now on.
@@ -4289,6 +4596,31 @@ zone_enter(zoneid_t zoneid)
sess_rele(pp->p_sessp, B_TRUE);
pp->p_sessp = sp;
pgjoin(pp, zone->zone_zsched->p_pidp);
+
+ /*
+ * If there is a default scheduling class for the zone and it is not
+ * the class we are currently in, change all of the threads in the
+ * process to the new class. We need to be holding pidlock & p_lock
+ * when we call parmsset so this is a good place to do it.
+ */
+ if (zone->zone_defaultcid > 0 &&
+ zone->zone_defaultcid != curthread->t_cid) {
+ pcparms_t pcparms;
+ kthread_id_t t;
+
+ pcparms.pc_cid = zone->zone_defaultcid;
+ pcparms.pc_clparms[0] = 0;
+
+ /*
+ * If setting the class fails, we still want to enter the zone.
+ */
+ if ((t = pp->p_tlist) != NULL) {
+ do {
+ (void) parmsset(&pcparms, t);
+ } while ((t = t->t_forw) != pp->p_tlist);
+ }
+ }
+
mutex_exit(&pp->p_lock);
mutex_exit(&pidlock);
diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile
index ab103ef4c7..4493f99454 100644
--- a/usr/src/uts/common/sys/Makefile
+++ b/usr/src/uts/common/sys/Makefile
@@ -544,6 +544,7 @@ CHKHDRS= \
visual_io.h \
vlan.h \
vm.h \
+ vm_usage.h \
vmem.h \
vmem_impl.h \
vmmeter.h \
diff --git a/usr/src/uts/common/sys/modhash_impl.h b/usr/src/uts/common/sys/modhash_impl.h
index 25e45cec23..a187eb68ee 100644
--- a/usr/src/uts/common/sys/modhash_impl.h
+++ b/usr/src/uts/common/sys/modhash_impl.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -93,6 +92,18 @@ struct mod_hash {
*/
void mod_hash_init(void);
+/*
+ * Internal routines. Use directly with care.
+ */
+uint_t i_mod_hash(mod_hash_t *, mod_hash_key_t);
+int i_mod_hash_insert_nosync(mod_hash_t *, mod_hash_key_t, mod_hash_val_t,
+ mod_hash_hndl_t);
+int i_mod_hash_remove_nosync(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *);
+int i_mod_hash_find_nosync(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *);
+void i_mod_hash_walk_nosync(mod_hash_t *, uint_t (*)(mod_hash_key_t,
+ mod_hash_val_t *, void *), void *);
+void i_mod_hash_clear_nosync(mod_hash_t *hash);
+
#endif /* _KERNEL */
#ifdef __cplusplus
diff --git a/usr/src/uts/common/sys/priocntl.h b/usr/src/uts/common/sys/priocntl.h
index ca1a92400a..6475ed0a4c 100644
--- a/usr/src/uts/common/sys/priocntl.h
+++ b/usr/src/uts/common/sys/priocntl.h
@@ -65,6 +65,7 @@ extern long priocntl(), priocntlset();
#define PC_SETXPARMS 7 /* Set extended scheduling parameters */
#define PC_GETXPARMS 8 /* Get extended scheduling parameters */
#define PC_SETDFLCL 9 /* Set default class, not for general use */
+#define PC_GETDFLCL 10 /* Get default class, not for general use */
#define PC_CLNULL -1
diff --git a/usr/src/uts/common/sys/proc.h b/usr/src/uts/common/sys/proc.h
index fcf953262c..9a0ba2cc37 100644
--- a/usr/src/uts/common/sys/proc.h
+++ b/usr/src/uts/common/sys/proc.h
@@ -613,6 +613,8 @@ extern proc_t *pgfind(pid_t);
extern proc_t *pgfind_zone(pid_t, zoneid_t);
extern proc_t *sprlock(pid_t);
extern proc_t *sprlock_zone(pid_t, zoneid_t);
+extern int sprtrylock_proc(proc_t *);
+extern void sprwaitlock_proc(proc_t *);
extern void sprlock_proc(proc_t *);
extern void sprunlock(proc_t *);
extern void pid_init(void);
diff --git a/usr/src/uts/common/sys/project.h b/usr/src/uts/common/sys/project.h
index 679c1eddc2..5018df8499 100644
--- a/usr/src/uts/common/sys/project.h
+++ b/usr/src/uts/common/sys/project.h
@@ -28,15 +28,24 @@
#pragma ident "%Z%%M% %I% %E% SMI"
+
#ifdef __cplusplus
extern "C" {
#endif
+
+#include <sys/kstat.h>
#include <sys/types.h>
#include <sys/mutex.h>
#include <sys/rctl.h>
#include <sys/ipc_rctl.h>
+typedef struct kproject_kstat {
+ kstat_named_t kpk_zonename;
+ kstat_named_t kpk_usage;
+ kstat_named_t kpk_value;
+} kproject_kstat_t;
+
typedef struct kproject_data { /* Datum protected by: */
rctl_qty_t kpd_shmmax; /* shm's ipcs_lock */
ipc_rqty_t kpd_ipc; /* shm|sem|msg's ipcs lock */
@@ -44,6 +53,7 @@ typedef struct kproject_data { /* Datum protected by: */
rctl_qty_t kpd_locked_mem_ctl; /* kpj_rctls->rcs_lock */
rctl_qty_t kpd_contract; /* contract_lock */
rctl_qty_t kpd_crypto_mem; /* crypto_rctl_lock */
+ kstat_t *kpd_lockedmem_kstat; /* locked memory kstat */
} kproject_data_t;
@@ -76,9 +86,11 @@ typedef struct kproject {
#define PROJECT_HOLD_FIND 1
#define PROJECT_HOLD_INSERT 2
+struct zone;
+
void project_init(void);
kproject_t *project_hold(kproject_t *);
-kproject_t *project_hold_by_id(projid_t, zoneid_t, int);
+kproject_t *project_hold_by_id(projid_t, struct zone *, int);
void project_rele(kproject_t *);
int project_walk_all(zoneid_t, int (*)(kproject_t *, void *), void *);
projid_t curprojid(void);
diff --git a/usr/src/uts/common/sys/rctl.h b/usr/src/uts/common/sys/rctl.h
index eb56fff9e5..a8480c2768 100644
--- a/usr/src/uts/common/sys/rctl.h
+++ b/usr/src/uts/common/sys/rctl.h
@@ -168,6 +168,7 @@ struct proc;
struct task;
struct kproject;
struct zone;
+struct kstat;
typedef struct rctl_entity_p_struct {
rctl_entity_t rcep_t;
@@ -324,6 +325,14 @@ int rctl_incr_locked_mem(struct proc *, struct kproject *, rctl_qty_t,
int);
void rctl_decr_locked_mem(struct proc *, struct kproject *, rctl_qty_t,
int);
+int rctl_incr_swap(struct proc *, struct zone *, size_t);
+void rctl_decr_swap(struct zone *, size_t);
+
+struct kstat *rctl_kstat_create_zone(struct zone *, char *, uchar_t, uint_t,
+ uchar_t);
+
+struct kstat *rctl_kstat_create_project(struct kproject *, char *, uchar_t,
+ uint_t, uchar_t);
#endif /* _KERNEL */
diff --git a/usr/src/uts/common/sys/resource.h b/usr/src/uts/common/sys/resource.h
index 86cc716d56..bf02808d4b 100644
--- a/usr/src/uts/common/sys/resource.h
+++ b/usr/src/uts/common/sys/resource.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -191,6 +190,7 @@ struct rusage {
#define _RUSAGESYS_GETRUSAGE 0 /* rusage process */
#define _RUSAGESYS_GETRUSAGE_CHLD 1 /* rusage child process */
#define _RUSAGESYS_GETRUSAGE_LWP 2 /* rusage lwp */
+#define _RUSAGESYS_GETVMUSAGE 3 /* getvmusage */
#if defined(_SYSCALL32)
diff --git a/usr/src/uts/common/sys/syscall.h b/usr/src/uts/common/sys/syscall.h
index 96cb967023..eedadfa0c0 100644
--- a/usr/src/uts/common/sys/syscall.h
+++ b/usr/src/uts/common/sys/syscall.h
@@ -384,7 +384,8 @@ extern "C" {
#define SYS_rusagesys 181
/*
* subcodes:
- * getrusage(...) :: rusagesys(RUSAGESYS_GETRUSAGE,...)
+ * getrusage(...) :: rusagesys(RUSAGESYS_GETRUSAGE, ...)
+ * getvmusage(...) :: rusagesys(RUSAGESYS_GETVMUSAGE, ...)
*/
#define SYS_port 182
/*
diff --git a/usr/src/uts/common/sys/vm_usage.h b/usr/src/uts/common/sys/vm_usage.h
new file mode 100644
index 0000000000..5f8c8b8fe5
--- /dev/null
+++ b/usr/src/uts/common/sys/vm_usage.h
@@ -0,0 +1,120 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_VM_USAGE_H
+#define _SYS_VM_USAGE_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * The flags passed to getvmusage() request how to aggregate rss/swap results.
+ * Results can be aggregated by zone, project, task, ruser, and/or euser.
+ *
+ * If VMUSAGE_ALL_* or VMUSAGE_COL_* are passed from a non-global-zone, the
+ * flag is treated as VMUSAGE_*. For example, VMUSAGE_ALL_ZONES would be
+ * treated as VMUSAGE_ZONE.
+ *
+ * If VMUSAGE_SYSTEM is passed from a non-global zone, a result of type
+ * VMUSAGE_SYSTEM will be returned, but it will only reflect the usage
+ * of the calling zone.
+ *
+ * VMUSAGE_* requests results for the calling zone.
+ * VMUSAGE_ALL_* requests results for all zones.
+ * VMUSAGE_COL_* requests results for all zones, but collapses out the zoneid.
+ * For example, VMUSAGE_COL_PROJECTS requests results for all
+ * projects in all zones, and project N in ANY zone is treated
+ * as the same project.
+ */
+#define VMUSAGE_SYSTEM 0x1 /* rss/swap for ALL processes */
+#define VMUSAGE_ZONE 0x2 /* rss/swap for caller's zone */
+#define VMUSAGE_PROJECTS 0x4 /* rss/swap for all projects in */
+ /* caller's zone */
+#define VMUSAGE_TASKS 0x8 /* rss/swap for all tasks in */
+ /* caller's zone */
+#define VMUSAGE_RUSERS 0x10 /* rss/swap for all users (by process */
+ /* ruser) in the caller's zone */
+#define VMUSAGE_EUSERS 0x20 /* same as VMUSAGE_RUSERS, but by */
+ /* euser */
+
+#define VMUSAGE_ALL_ZONES 0x40 /* rss/swap for all zones */
+#define VMUSAGE_ALL_PROJECTS 0x80 /* rss/swap for all projects in */
+ /* all zones */
+#define VMUSAGE_ALL_TASKS 0x100 /* rss/swap for all tasks in all */
+ /* zones */
+#define VMUSAGE_ALL_RUSERS 0x200 /* rss/swap for all users (by process */
+ /* ruser) in all zones */
+#define VMUSAGE_ALL_EUSERS 0x400 /* same as VMUSAGE_ALL_RUSERS, but by */
+ /* euser */
+
+#define VMUSAGE_COL_PROJECTS 0x800 /* rss/swap for all projects in */
+ /* all zones. Collapse zoneid. */
+#define VMUSAGE_COL_RUSERS 0x1000 /* rss/swap for all users (by process */
+ /* ruser), in all zones. Collapse */
+ /* zoneid */
+#define VMUSAGE_COL_EUSERS 0x2000 /* same as VMUSAGE_COL_RUSERS, but by */
+ /* euser */
+
+#define VMUSAGE_MASK 0x3fff /* all valid flags for getvmusage() */
+
+typedef struct vmusage {
+ id_t vmu_zoneid; /* zoneid, or ALL_ZONES for */
+ /* VMUSAGE_COL_* results */
+ /* ALL_ZONES means that the result */
+ /* reflects swap and rss usage for */
+ /* a projid/uid across all zones */
+ uint_t vmu_type; /* Entity type of result. One of: */
+ /* VMUSAGE_(SYSTEM|ZONE|PROJECTS| */
+ /* TASKS|RUSERS|EUSERS) */
+ id_t vmu_id; /* zoneid, projid, taskid, ... */
+ size_t vmu_rss_all; /* total resident memory of entity */
+ /* in bytes */
+ size_t vmu_rss_private; /* total resident private memory */
+ size_t vmu_rss_shared; /* total resident shared memory */
+ size_t vmu_swap_all; /* total swap reserved, in bytes */
+ size_t vmu_swap_private; /* swap reserved for private mappings */
+ size_t vmu_swap_shared; /* swap reserved for shared mappings */
+
+} vmusage_t;
+
+extern int getvmusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres);
+
+#ifdef _KERNEL
+
+int vm_getusage(uint_t, time_t, vmusage_t *, size_t *);
+void vm_usage_init();
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VM_USAGE_H */
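
The header above declares getvmusage(), which the sys/syscall.h comment earlier in this diff maps onto rusagesys() with the new _RUSAGESYS_GETVMUSAGE subcode; each result row is a vmusage_t naming the entity plus its rss/swap totals. A minimal userland sketch (the 256-entry buffer and the 60-second age are arbitrary example values, not requirements of the interface):

/* Sketch: per-project rss/swap summary for the caller's zone. */
#include <sys/vm_usage.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	size_t nres = 256;	/* assumed to be plenty for this sketch */
	vmusage_t *buf;
	size_t i;

	if ((buf = calloc(nres, sizeof (vmusage_t))) == NULL)
		return (1);

	/* age = 60: results cached within the last minute are acceptable */
	if (getvmusage(VMUSAGE_PROJECTS, 60, buf, &nres) != 0) {
		perror("getvmusage");
		return (1);
	}
	for (i = 0; i < nres; i++)
		(void) printf("project %d: rss=%lu swap=%lu\n",
		    (int)buf[i].vmu_id,
		    (unsigned long)buf[i].vmu_rss_all,
		    (unsigned long)buf[i].vmu_swap_all);
	free(buf);
	return (0);
}
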
diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h
index daccd16bdf..94646bc976 100644
--- a/usr/src/uts/common/sys/zone.h
+++ b/usr/src/uts/common/sys/zone.h
@@ -88,6 +88,8 @@ extern "C" {
#define ZONE_ATTR_INITNAME 9
#define ZONE_ATTR_BOOTARGS 10
#define ZONE_ATTR_BRAND 11
+#define ZONE_ATTR_PHYS_MCAP 12
+#define ZONE_ATTR_SCHED_CLASS 13
/* Start of the brand-specific attribute namespace */
#define ZONE_ATTR_BRAND_ATTRS 32768
@@ -280,6 +282,15 @@ typedef struct zone_dataset {
list_node_t zd_linkage;
} zone_dataset_t;
+/*
+ * structure for zone kstats
+ */
+typedef struct zone_kstat {
+ kstat_named_t zk_zonename;
+ kstat_named_t zk_usage;
+ kstat_named_t zk_value;
+} zone_kstat_t;
+
typedef struct zone {
/*
* zone_name is never modified once set.
@@ -326,14 +337,20 @@ typedef struct zone {
uint_t zone_rootpathlen; /* strlen(zone_rootpath) + 1 */
uint32_t zone_shares; /* FSS shares allocated to zone */
rctl_set_t *zone_rctls; /* zone-wide (zone.*) rctls */
- kmutex_t zone_rctl_lock; /* protects zone_locked_mem and */
+ kmutex_t zone_mem_lock; /* protects zone_locked_mem and */
/* kpd_locked_mem for all */
- /* projects in zone */
+ /* projects in zone. */
+ /* Also protects zone_max_swap */
/* grab after p_lock, before rcs_lock */
- rctl_qty_t zone_locked_mem; /* bytes of locked memory in zone */
- rctl_qty_t zone_locked_mem_ctl; /* current locked memory */
+ rctl_qty_t zone_locked_mem; /* bytes of locked memory in */
+ /* zone */
+ rctl_qty_t zone_locked_mem_ctl; /* Current locked memory */
/* limit. Protected by */
/* zone_rctls->rcs_lock */
+ rctl_qty_t zone_max_swap; /* bytes of swap reserved by zone */
+ rctl_qty_t zone_max_swap_ctl; /* current swap limit. */
+ /* Protected by */
+ /* zone_rctls->rcs_lock */
list_t zone_zsd; /* list of Zone-Specific Data values */
kcondvar_t zone_cv; /* used to signal state changes */
struct proc *zone_zsched; /* Dummy kernel "zsched" process */
@@ -341,6 +358,7 @@ typedef struct zone {
char *zone_initname; /* fs path to 'init' */
int zone_boot_err; /* for zone_boot() if boot fails */
char *zone_bootargs; /* arguments passed via zone_boot() */
+ uint64_t zone_phys_mcap; /* physical memory cap */
/*
* zone_kthreads is protected by zone_status_lock.
*/
@@ -376,6 +394,9 @@ typedef struct zone {
boolean_t zone_restart_init; /* Restart init if it dies? */
struct brand *zone_brand; /* zone's brand */
+ id_t zone_defaultcid; /* dflt scheduling class id */
+ kstat_t *zone_swapresv_kstat;
+ kstat_t *zone_lockedmem_kstat;
} zone_t;
/*
@@ -553,6 +574,7 @@ extern void mount_completed(void);
extern int zone_walk(int (*)(zone_t *, void *), void *);
extern rctl_hndl_t rc_zone_locked_mem;
+extern rctl_hndl_t rc_zone_max_swap;
#endif /* _KERNEL */
diff --git a/usr/src/uts/common/syscall/processor_bind.c b/usr/src/uts/common/syscall/processor_bind.c
index 10ca1178d5..bd416e43e6 100644
--- a/usr/src/uts/common/syscall/processor_bind.c
+++ b/usr/src/uts/common/syscall/processor_bind.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -285,9 +284,10 @@ processor_bind(idtype_t idtype, id_t id, processorid_t bind,
break;
case P_PROJID:
+ pp = curproc;
if (id == P_MYID)
id = curprojid();
- if ((kpj = project_hold_by_id(id, getzoneid(),
+ if ((kpj = project_hold_by_id(id, pp->p_zone,
PROJECT_HOLD_FIND)) == NULL) {
ret = ESRCH;
} else {
diff --git a/usr/src/uts/common/syscall/pset.c b/usr/src/uts/common/syscall/pset.c
index 5d3b7e6233..767529fc5d 100644
--- a/usr/src/uts/common/syscall/pset.c
+++ b/usr/src/uts/common/syscall/pset.c
@@ -542,9 +542,10 @@ pset_bind(psetid_t pset, idtype_t idtype, id_t id, psetid_t *opset)
break;
case P_PROJID:
+ pp = curproc;
if (id == P_MYID)
id = curprojid();
- if ((kpj = project_hold_by_id(id, getzoneid(),
+ if ((kpj = project_hold_by_id(id, pp->p_zone,
PROJECT_HOLD_FIND)) == NULL) {
error = ESRCH;
break;
diff --git a/usr/src/uts/common/syscall/rusagesys.c b/usr/src/uts/common/syscall/rusagesys.c
index 3e09643981..036500932f 100644
--- a/usr/src/uts/common/syscall/rusagesys.c
+++ b/usr/src/uts/common/syscall/rusagesys.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -35,6 +34,7 @@
#include <sys/time.h>
#include <sys/errno.h>
#include <sys/resource.h>
+#include <sys/vm_usage.h>
static int
getrusage(void *user_rusage)
@@ -246,16 +246,19 @@ getrusage_lwp(void *user_rusage)
}
int
-rusagesys(int code, void * arg)
+rusagesys(int code, void *arg1, void *arg2, void *arg3, void *arg4)
{
switch (code) {
case _RUSAGESYS_GETRUSAGE:
- return (getrusage(arg));
+ return (getrusage(arg1));
case _RUSAGESYS_GETRUSAGE_CHLD:
- return (getrusage_chld(arg));
+ return (getrusage_chld(arg1));
case _RUSAGESYS_GETRUSAGE_LWP:
- return (getrusage_lwp(arg));
+ return (getrusage_lwp(arg1));
+ case _RUSAGESYS_GETVMUSAGE:
+ return (vm_getusage((uint_t)(uintptr_t)arg1, (time_t)arg2,
+ (vmusage_t *)arg3, (size_t *)arg4));
default:
return (set_errno(EINVAL));
}
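The userland plumbing is not part of this diff; presumably the getvmusage() library routine reaches the new _RUSAGESYS_GETVMUSAGE case above through the rusagesys system call, roughly along the lines of the hypothetical shim below (the wrapper's location and the headers providing the subcode and syscall() are assumptions):

    #include <sys/syscall.h>
    #include <sys/types.h>
    #include <sys/resource.h>
    #include <sys/vm_usage.h>

    /* Hypothetical wrapper; the real one would live in libc, outside this diff. */
    int
    getvmusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres)
    {
        return (syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE,
            flags, age, buf, nres));
    }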
diff --git a/usr/src/uts/common/syscall/tasksys.c b/usr/src/uts/common/syscall/tasksys.c
index 705b543a37..bec091e61c 100644
--- a/usr/src/uts/common/syscall/tasksys.c
+++ b/usr/src/uts/common/syscall/tasksys.c
@@ -25,6 +25,7 @@
#pragma ident "%Z%%M% %I% %E% SMI"
+
/*
* System calls for creating and inquiring about tasks and projects
*/
@@ -102,7 +103,7 @@ tasksys_settaskid(projid_t projid, uint_t flags)
* Put a hold on our new project and make sure that nobody is
* trying to bind it to a pool while we're joining.
*/
- kpj = project_hold_by_id(projid, getzoneid(), PROJECT_HOLD_INSERT);
+ kpj = project_hold_by_id(projid, p->p_zone, PROJECT_HOLD_INSERT);
e.rcep_p.proj = kpj;
e.rcep_t = RCENTITY_PROJECT;
@@ -111,7 +112,7 @@ tasksys_settaskid(projid_t projid, uint_t flags)
zone = p->p_zone;
mutex_enter(&zone->zone_nlwps_lock);
- mutex_enter(&zone->zone_rctl_lock);
+ mutex_enter(&zone->zone_mem_lock);
if (kpj->kpj_nlwps + p->p_lwpcnt > kpj->kpj_nlwps_ctl)
if (rctl_test_entity(rc_project_nlwps, kpj->kpj_rctls, p, &e,
@@ -130,7 +131,7 @@ tasksys_settaskid(projid_t projid, uint_t flags)
rctlfail = 1;
if (rctlfail) {
- mutex_exit(&zone->zone_rctl_lock);
+ mutex_exit(&zone->zone_mem_lock);
mutex_exit(&zone->zone_nlwps_lock);
if (curthread != p->p_agenttp)
continuelwps(p);
@@ -144,7 +145,7 @@ tasksys_settaskid(projid_t projid, uint_t flags)
oldpj->kpj_data.kpd_locked_mem -= p->p_locked_mem;
oldpj->kpj_nlwps -= p->p_lwpcnt;
- mutex_exit(&zone->zone_rctl_lock);
+ mutex_exit(&zone->zone_mem_lock);
mutex_exit(&zone->zone_nlwps_lock);
mutex_exit(&p->p_lock);
diff --git a/usr/src/uts/common/vm/anon.h b/usr/src/uts/common/vm/anon.h
index 90f6e1e661..ed59ec590b 100644
--- a/usr/src/uts/common/vm/anon.h
+++ b/usr/src/uts/common/vm/anon.h
@@ -42,6 +42,7 @@
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/cred.h>
+#include <sys/zone.h>
#include <vm/seg.h>
#include <vm/vpage.h>
@@ -387,8 +388,8 @@ extern int anon_map_demotepages(struct anon_map *, ulong_t,
struct seg *, caddr_t, uint_t,
struct vpage [], struct cred *);
extern void anon_shmap_free_pages(struct anon_map *, ulong_t, size_t);
-extern int anon_resvmem(size_t, uint_t);
-extern void anon_unresv(size_t);
+extern int anon_resvmem(size_t, boolean_t, zone_t *);
+extern void anon_unresvmem(size_t, zone_t *);
extern struct anon_map *anonmap_alloc(size_t, size_t);
extern void anonmap_free(struct anon_map *);
extern void anon_decref(struct anon *);
@@ -416,9 +417,16 @@ extern void anon_array_exit(anon_sync_obj_t *);
* request and if so, reserves the appropriate anonymous memory resources.
* anon_checkspace just checks to see if there is space to fulfill the request,
* without taking any resources. Both return 1 if successful and 0 if not.
+ *
+ * Macros are provided as anon reservation is usually charged to the zone of
+ * the current process. In some cases (such as anon reserved by tmpfs), a
+ * zone pointer is needed to charge the appropriate zone.
*/
-#define anon_resv(size) anon_resvmem((size), 1)
-#define anon_checkspace(size) anon_resvmem((size), 0)
+#define anon_unresv(size) anon_unresvmem(size, curproc->p_zone)
+#define anon_unresv_zone(size, zone) anon_unresvmem(size, zone)
+#define anon_resv(size) anon_resvmem((size), 1, curproc->p_zone)
+#define anon_resv_zone(size, zone) anon_resvmem((size), 1, zone)
+#define anon_checkspace(size, zone) anon_resvmem((size), 0, zone)
/*
* Flags to anon_private
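As a rough illustration of the zone-aware variants added above (a sketch with hypothetical names, not code from this change): a filesystem such as tmpfs, which knows which zone should be charged, reserves and releases swap against that zone explicitly rather than against curproc's zone:

    /* zp and bytes are hypothetical placeholders */
    if (anon_resv_zone(bytes, zp) == 0)
        return (ENOSPC);    /* swap or zone.max-swap exhausted */

    /* ... later, when the reservation is no longer needed ... */
    anon_unresv_zone(bytes, zp);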
diff --git a/usr/src/uts/common/vm/seg.h b/usr/src/uts/common/vm/seg.h
index 0ee7d62ce1..a9683c0e54 100644
--- a/usr/src/uts/common/vm/seg.h
+++ b/usr/src/uts/common/vm/seg.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -245,6 +244,9 @@ uint_t seg_pages(struct seg *);
#endif /* VMDEBUG */
+boolean_t seg_can_change_zones(struct seg *);
+size_t seg_swresv(struct seg *);
+
#endif /* _KERNEL */
#ifdef __cplusplus
diff --git a/usr/src/uts/common/vm/seg_kp.c b/usr/src/uts/common/vm/seg_kp.c
index ff9c47e0ff..d58e873a19 100644
--- a/usr/src/uts/common/vm/seg_kp.c
+++ b/usr/src/uts/common/vm/seg_kp.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -147,6 +146,7 @@ uint32_t red_closest = UINT_MAX;
uint32_t red_ndoubles;
pgcnt_t anon_segkp_pages_locked; /* See vm/anon.h */
+pgcnt_t anon_segkp_pages_resv; /* anon reserved by seg_kp */
static struct seg_ops segkp_ops = {
SEGKP_BADOP(int), /* dup */
@@ -448,8 +448,10 @@ segkp_get_internal(
* Note that we don't need swap space for the red zone page.
*/
if (amp != NULL) {
- ASSERT((flags & KPD_NO_ANON) == 0);
- /* The reserve has been done and the anon_hdr is separate. */
+ /*
+ * The swap reservation has been done, if required, and the
+ * anon_hdr is separate.
+ */
anon_idx = 0;
kpd->kp_anon_idx = anon_idx;
kpd->kp_anon = amp->ahp;
@@ -458,7 +460,7 @@ segkp_get_internal(
kpd, vbase, len, flags, 1);
} else if ((flags & KPD_NO_ANON) == 0) {
- if (anon_resv(SEGKP_MAPLEN(len, flags)) == 0) {
+ if (anon_resv_zone(SEGKP_MAPLEN(len, flags), NULL) == 0) {
if (flags & KPD_LOCKED) {
atomic_add_long(&anon_segkp_pages_locked,
-pages);
@@ -468,6 +470,8 @@ segkp_get_internal(
kmem_free(kpd, sizeof (struct segkp_data));
return (NULL);
}
+ atomic_add_long(&anon_segkp_pages_resv,
+ btop(SEGKP_MAPLEN(len, flags)));
anon_idx = ((uintptr_t)(vbase - s_base)) >> PAGESHIFT;
kpd->kp_anon_idx = anon_idx;
kpd->kp_anon = kpsd->kpsd_anon;
@@ -704,7 +708,9 @@ segkp_release_internal(struct seg *seg, struct segkp_data *kpd, size_t len)
if ((kpd->kp_flags & KPD_HASAMP) == 0) {
anon_free(kpd->kp_anon, kpd->kp_anon_idx + i,
PAGESIZE);
- anon_unresv(PAGESIZE);
+ anon_unresv_zone(PAGESIZE, NULL);
+ atomic_add_long(&anon_segkp_pages_resv,
+ -1);
}
TRACE_5(TR_FAC_VM,
TR_ANON_SEGKP, "anon segkp:%p %p %lu %u %u",
diff --git a/usr/src/uts/common/vm/seg_vn.c b/usr/src/uts/common/vm/seg_vn.c
index f48db44acc..e2069b27c6 100644
--- a/usr/src/uts/common/vm/seg_vn.c
+++ b/usr/src/uts/common/vm/seg_vn.c
@@ -2323,8 +2323,9 @@ segvn_faultpage(
* zeroes. If no advance reservations, reserve now.
*/
if (svd->flags & MAP_NORESERVE) {
- if (anon_resv(ptob(1))) {
- svd->swresv += ptob(1);
+ if (anon_resv_zone(ptob(1),
+ seg->s_as->a_proc->p_zone)) {
+ atomic_add_long(&svd->swresv, ptob(1));
} else {
err = ENOMEM;
goto out;
diff --git a/usr/src/uts/common/vm/vm_anon.c b/usr/src/uts/common/vm/vm_anon.c
index 0cad34257c..3f225a345a 100644
--- a/usr/src/uts/common/vm/vm_anon.c
+++ b/usr/src/uts/common/vm/vm_anon.c
@@ -113,6 +113,7 @@
#include <sys/policy.h>
#include <sys/condvar_impl.h>
#include <sys/mutex_impl.h>
+#include <sys/rctl.h>
#include <vm/as.h>
#include <vm/hat.h>
@@ -729,12 +730,22 @@ set_anoninfo(void)
* Return non-zero on success.
*/
int
-anon_resvmem(size_t size, uint_t takemem)
+anon_resvmem(size_t size, boolean_t takemem, zone_t *zone)
{
pgcnt_t npages = btopr(size);
pgcnt_t mswap_pages = 0;
pgcnt_t pswap_pages = 0;
+ proc_t *p = curproc;
+ if (zone != NULL && takemem) {
+ /* test zone.max-swap resource control */
+ mutex_enter(&p->p_lock);
+ if (rctl_incr_swap(p, zone, ptob(npages)) != 0) {
+ mutex_exit(&p->p_lock);
+ return (0);
+ }
+ mutex_exit(&p->p_lock);
+ }
mutex_enter(&anoninfo_lock);
/*
@@ -834,16 +845,17 @@ anon_resvmem(size_t size, uint_t takemem)
mutex_exit(&anoninfo_lock);
ANON_PRINT(A_RESV,
("anon_resvmem: not enough space from swapfs\n"));
+ if (zone != NULL && takemem)
+ rctl_decr_swap(zone, ptob(npages));
return (0);
}
}
-
/*
* Give back an anon reservation.
*/
void
-anon_unresv(size_t size)
+anon_unresvmem(size_t size, zone_t *zone)
{
pgcnt_t npages = btopr(size);
spgcnt_t mem_free_pages = 0;
@@ -851,6 +863,8 @@ anon_unresv(size_t size)
#ifdef ANON_DEBUG
pgcnt_t mem_resv;
#endif
+ if (zone != NULL)
+ rctl_decr_swap(zone, size);
mutex_enter(&anoninfo_lock);
diff --git a/usr/src/uts/common/vm/vm_page.c b/usr/src/uts/common/vm/vm_page.c
index 05bfe662be..adac07b766 100644
--- a/usr/src/uts/common/vm/vm_page.c
+++ b/usr/src/uts/common/vm/vm_page.c
@@ -77,7 +77,7 @@
#include <vm/pvn.h>
#include <vm/seg_kmem.h>
#include <vm/vm_dep.h>
-
+#include <sys/vm_usage.h>
#include <fs/fs_subr.h>
static int nopageage = 0;
@@ -343,6 +343,7 @@ vm_init(void)
(void) callb_add(callb_vm_cpr, 0, CB_CL_CPR_VM, "vm");
page_init_mem_config();
page_retire_init();
+ vm_usage_init();
}
/*
diff --git a/usr/src/uts/common/vm/vm_seg.c b/usr/src/uts/common/vm/vm_seg.c
index 50cc21cdf7..aed892969d 100644
--- a/usr/src/uts/common/vm/vm_seg.c
+++ b/usr/src/uts/common/vm/vm_seg.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -54,12 +53,14 @@
#include <sys/cmn_err.h>
#include <sys/callb.h>
#include <sys/mem_config.h>
+#include <sys/mman.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
-
+#include <vm/seg_spt.h>
+#include <vm/seg_vn.h>
/*
* kstats for segment advise
*/
@@ -950,3 +951,48 @@ seg_pinit_mem_config(void)
*/
ASSERT(ret == 0);
}
+
+extern struct seg_ops segvn_ops;
+extern struct seg_ops segspt_shmops;
+
+/*
+ * Verify that segment is not a shared anonymous segment which reserves
+ * swap. zone.max-swap accounting (zone->zone_max_swap) cannot be transfered
+ * from one zone to another if any segments are shared. This is because the
+ * last process to exit will credit the swap reservation. This could lead
+ * to the swap being reserved by one zone, and credited to another.
+ */
+boolean_t
+seg_can_change_zones(struct seg *seg)
+{
+ struct segvn_data *svd;
+
+ if (seg->s_ops == &segspt_shmops)
+ return (B_FALSE);
+
+ if (seg->s_ops == &segvn_ops) {
+ svd = (struct segvn_data *)seg->s_data;
+ if (svd->type == MAP_SHARED &&
+ svd->amp != NULL &&
+ svd->amp->swresv > 0)
+ return (B_FALSE);
+ }
+ return (B_TRUE);
+}
+
+/*
+ * Return swap reserved by a segment backing a private mapping.
+ */
+size_t
+seg_swresv(struct seg *seg)
+{
+ struct segvn_data *svd;
+ size_t swap = 0;
+
+ if (seg->s_ops == &segvn_ops) {
+ svd = (struct segvn_data *)seg->s_data;
+ if (svd->type == MAP_PRIVATE && svd->swresv > 0)
+ swap = svd->swresv;
+ }
+ return (swap);
+}
diff --git a/usr/src/uts/common/vm/vm_usage.c b/usr/src/uts/common/vm/vm_usage.c
new file mode 100644
index 0000000000..32a8811e10
--- /dev/null
+++ b/usr/src/uts/common/vm/vm_usage.c
@@ -0,0 +1,1978 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * vm_usage
+ *
+ * This file implements the getvmusage() private system call.
+ * getvmusage() counts the amount of resident memory pages and swap
+ * reserved by the specified process collective. A "process collective" is
+ * the set of processes owned by a particular zone, project, task, or user.
+ *
+ * rss and swap are counted so that for a given process collective, a page is
+ * only counted once. For example, this means that if multiple processes in
+ * the same project map the same page, then the project will only be charged
+ * once for that page. On the other hand, if two processes in different
+ * projects map the same page, then both projects will be charged
+ * for the page.
+ *
+ * The vm_getusage() calculation is implemented so that the first thread
+ * performs the rss/swap counting. Other callers will wait for that thread to
+ * finish, copying the results. This enables multiple rcapds and prstats to
+ * consume data from the same calculation. The results are also cached so that
+ * a caller interested in recent results can just copy them instead of starting
+ * a new calculation. The caller passes the maximum age (in seconds) of the
+ * data. If the cached data is young enough, the cache is copied, otherwise,
+ * a new calculation is executed and the cache is replaced with the new
+ * data.
+ *
+ * The rss calculation for each process collective is as follows:
+ *
+ * - Inspect flags, determine if counting rss for zones, projects, tasks,
+ * and/or users.
+ * - For each proc:
+ * - Figure out proc's collectives (zone, project, task, and/or user).
+ * - For each seg in proc's address space:
+ * - If seg is private:
+ * - Lookup anons in the amp.
+ * - For incore pages not previously visited for each of the
+ * proc's collectives, add incore pagesize to each
+ * collective.
+ * Anons with a refcnt of 1 can be assumed to be not
+ * previously visited.
+ * - For address ranges without anons in the amp:
+ * - Lookup pages in underlying vnode.
+ * - For incore pages not previously visited for
+ * each of the proc's collectives, add incore
+ * pagesize to each collective.
+ * - If seg is shared:
+ * - Lookup pages in the shared amp or vnode.
+ * - For incore pages not previously visited for each of
+ * the proc's collectives, add incore pagesize to each
+ * collective.
+ *
+ * Swap is reserved by private segments, and shared anonymous segments.
+ * The only shared anon segments which do not reserve swap are ISM segments
+ * and schedctl segments, both of which can be identified by having
+ * amp->swresv == 0.
+ *
+ * The swap calculation for each collective is as follows:
+ *
+ * - Inspect flags, determine if counting swap for zones, projects, tasks,
+ * and/or users.
+ * - For each proc:
+ * - Figure out proc's collectives (zone, project, task, and/or user).
+ * - For each seg in proc's address space:
+ * - If seg is private:
+ * - Add svd->swresv pages to swap count for each of the
+ * proc's collectives.
+ * - If seg is anon, shared, and amp->swresv != 0
+ * - For address ranges in amp not previously visited for
+ * each of the proc's collectives, add size of address
+ * range to the swap count for each collective.
+ *
+ * These two calculations are done simultaneously, with most of the work
+ * being done in vmu_calculate_seg(). The results of the calculation are
+ * copied into "vmu_data.vmu_cache_results".
+ *
+ * To perform the calculation, various things are tracked and cached:
+ *
+ * - incore/not-incore page ranges for all vnodes.
+ * (vmu_data.vmu_all_vnodes_hash)
+ * This eliminates looking up the same page more than once.
+ *
+ * - incore/not-incore page ranges for all shared amps.
+ * (vmu_data.vmu_all_amps_hash)
+ * This eliminates looking up the same page more than once.
+ *
+ * - visited page ranges for each collective.
+ * - per vnode (entity->vme_vnode_hash)
+ * - per shared amp (entity->vme_amp_hash)
+ * For accurate counting of map-shared and cow-shared pages.
+ *
+ * - visited private anons (refcnt > 1) for each collective.
+ * (entity->vme_anon_hash)
+ * For accurate counting of cow-shared pages.
+ *
+ * The common accounting structure is the vmu_entity_t, which represents
+ * collectives:
+ *
+ * - A zone.
+ * - A project, task, or user within a zone.
+ * - The entire system (vmu_data.vmu_system).
+ * - Each collapsed (col) project and user. This means a given projid or
+ * uid, regardless of which zone the process is in. For instance,
+ *   project 0 in the global zone and project 0 in a non-global zone are
+ * the same collapsed project.
+ *
+ * Each entity structure tracks which pages have been already visited for
+ * that entity (via previously inspected processes) so that these pages are
+ * not double counted.
+ */
+
+#include <sys/errno.h>
+#include <sys/types.h>
+#include <sys/zone.h>
+#include <sys/proc.h>
+#include <sys/project.h>
+#include <sys/task.h>
+#include <sys/thread.h>
+#include <sys/time.h>
+#include <sys/mman.h>
+#include <sys/modhash.h>
+#include <sys/modhash_impl.h>
+#include <sys/shm.h>
+#include <sys/swap.h>
+#include <sys/synch.h>
+#include <sys/systm.h>
+#include <sys/var.h>
+#include <sys/vm_usage.h>
+#include <sys/zone.h>
+#include <vm/anon.h>
+#include <vm/as.h>
+#include <vm/seg_vn.h>
+#include <vm/seg_spt.h>
+
+#define VMUSAGE_HASH_SIZE 512
+
+#define VMUSAGE_TYPE_VNODE 1
+#define VMUSAGE_TYPE_AMP 2
+#define VMUSAGE_TYPE_ANON 3
+
+#define VMUSAGE_BOUND_UNKNOWN 0
+#define VMUSAGE_BOUND_INCORE 1
+#define VMUSAGE_BOUND_NOT_INCORE 2
+
+/*
+ * Bounds for vnodes and shared amps.
+ * Each bound is either entirely incore, entirely not incore, or
+ * entirely unknown. Bounds are stored in order by offset.
+ */
+typedef struct vmu_bound {
+ struct vmu_bound *vmb_next;
+ pgcnt_t vmb_start; /* page offset in vnode/amp on which bound starts */
+ pgcnt_t vmb_end; /* page offset in vnode/amp on which bound ends */
+ char vmb_type; /* One of VMUSAGE_BOUND_* */
+} vmu_bound_t;
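As an illustration of how these bounds encode residency (hypothetical values, reusing the definitions above): a vnode or amp spanning pages 0-9, of which only pages 0-3 are resident, would be described by two ordered bounds:

    /* Illustrative only: pages 0-3 resident, pages 4-9 not resident. */
    vmu_bound_t not_incore = { NULL, 4, 9, VMUSAGE_BOUND_NOT_INCORE };
    vmu_bound_t incore = { &not_incore, 0, 3, VMUSAGE_BOUND_INCORE };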
+
+/*
+ * Hash of visited objects (vnodes or shared amps).
+ * The key is the address of the vnode or amp. The bounds list tracks known
+ * incore/not-incore bounds for the vnode/amp.
+ */
+typedef struct vmu_object {
+ struct vmu_object *vmo_next; /* free list */
+ caddr_t vmo_key;
+ short vmo_type;
+ vmu_bound_t *vmo_bounds;
+} vmu_object_t;
+
+/*
+ * Entity by which to count results.
+ *
+ * The entity structure keeps the current rss/swap counts for each entity
+ * (zone, project, etc), and hashes of vm structures that have already
+ * been visited for the entity.
+ *
+ * vme_next: links the list of all entities currently being counted by
+ * vmu_calculate().
+ *
+ * vme_next_calc: links the list of entities related to the current process
+ * being counted by vmu_calculate_proc().
+ *
+ * vmu_calculate() walks all processes. For each process,
+ * vmu_calculate_proc() makes a list of the entities related to that
+ * process using vme_next_calc. This list changes each time
+ * vmu_calculate_proc() is called.
+ *
+ */
+typedef struct vmu_entity {
+ struct vmu_entity *vme_next;
+ struct vmu_entity *vme_next_calc;
+ mod_hash_t *vme_vnode_hash; /* vnodes visited for entity */
+ mod_hash_t *vme_amp_hash; /* shared amps visited for entity */
+ mod_hash_t *vme_anon_hash; /* cow anons visited for entity */
+ vmusage_t vme_result; /* identifies entity and results */
+} vmu_entity_t;
+
+/*
+ * Hash of entities visited within a zone, and an entity for the zone
+ * itself.
+ */
+typedef struct vmu_zone {
+ struct vmu_zone *vmz_next; /* free list */
+ id_t vmz_id;
+ vmu_entity_t *vmz_zone;
+ mod_hash_t *vmz_projects_hash;
+ mod_hash_t *vmz_tasks_hash;
+ mod_hash_t *vmz_rusers_hash;
+ mod_hash_t *vmz_eusers_hash;
+} vmu_zone_t;
+
+/*
+ * Cache of results from last calculation
+ */
+typedef struct vmu_cache {
+ vmusage_t *vmc_results; /* Results from last call to */
+ /* vm_getusage(). */
+ uint64_t vmc_nresults; /* Count of cached results */
+ uint64_t vmc_refcnt; /* refcnt for free */
+ uint_t vmc_flags; /* Flags for vm_getusage() */
+ hrtime_t vmc_timestamp; /* when cache was created */
+} vmu_cache_t;
+
+/*
+ * top level rss info for the system
+ */
+typedef struct vmu_data {
+ kmutex_t vmu_lock; /* Protects vmu_data */
+ kcondvar_t vmu_cv; /* Used to signal threads */
+ /* waiting for the calc */
+ /* thread to finish */
+ vmu_entity_t *vmu_system; /* Entity for tracking */
+ /* rss/swap for all processes */
+ /* in all zones */
+ mod_hash_t *vmu_zones_hash; /* Zones visited */
+ mod_hash_t *vmu_projects_col_hash; /* These *_col_hash hashes */
+ mod_hash_t *vmu_rusers_col_hash; /* keep track of entities, */
+ mod_hash_t *vmu_eusers_col_hash; /* ignoring zoneid, in order */
+ /* to implement VMUSAGE_COL_* */
+ /* flags, which aggregate by */
+ /* project or user regardless */
+ /* of zoneid. */
+ mod_hash_t *vmu_all_vnodes_hash; /* System wide visited vnodes */
+ /* to track incore/not-incore */
+ mod_hash_t *vmu_all_amps_hash; /* System wide visited shared */
+ /* amps to track incore/not- */
+ /* incore */
+ vmu_entity_t *vmu_entities; /* Linked list of entities */
+ size_t vmu_nentities; /* Count of entities in list */
+ vmu_cache_t *vmu_cache; /* Cached results */
+ kthread_t *vmu_calc_thread; /* NULL, or thread running */
+ /* vmu_calculate() */
+ uint_t vmu_calc_flags; /* Flags being used by */
+ /* currently running calc */
+ /* thread */
+ uint_t vmu_pending_flags; /* Flags of vm_getusage() */
+ /* threads waiting for */
+ /* calc thread to finish */
+ uint_t vmu_pending_waiters; /* Number of threads waiting */
+ /* for calc thread */
+ vmu_bound_t *vmu_free_bounds;
+ vmu_object_t *vmu_free_objects;
+ vmu_entity_t *vmu_free_entities;
+ vmu_zone_t *vmu_free_zones;
+} vmu_data_t;
+
+extern struct as kas;
+extern proc_t *practive;
+extern zone_t *global_zone;
+extern struct seg_ops segvn_ops;
+extern struct seg_ops segspt_shmops;
+
+static vmu_data_t vmu_data;
+static kmem_cache_t *vmu_bound_cache;
+static kmem_cache_t *vmu_object_cache;
+
+/*
+ * Save a bound on the free list
+ */
+static void
+vmu_free_bound(vmu_bound_t *bound)
+{
+ bound->vmb_next = vmu_data.vmu_free_bounds;
+ vmu_data.vmu_free_bounds = bound;
+}
+
+/*
+ * Free an object, and all visited bound info.
+ */
+static void
+vmu_free_object(mod_hash_val_t val)
+{
+ vmu_object_t *obj = (vmu_object_t *)val;
+ vmu_bound_t *bound = obj->vmo_bounds;
+ vmu_bound_t *tmp;
+
+ while (bound != NULL) {
+ tmp = bound;
+ bound = bound->vmb_next;
+ vmu_free_bound(tmp);
+ }
+ obj->vmo_next = vmu_data.vmu_free_objects;
+ vmu_data.vmu_free_objects = obj;
+}
+
+/*
+ * Free an entity, and hashes of visited objects for that entity.
+ */
+static void
+vmu_free_entity(mod_hash_val_t val)
+{
+ vmu_entity_t *entity = (vmu_entity_t *)val;
+
+ if (entity->vme_vnode_hash != NULL)
+ i_mod_hash_clear_nosync(entity->vme_vnode_hash);
+ if (entity->vme_amp_hash != NULL)
+ i_mod_hash_clear_nosync(entity->vme_amp_hash);
+ if (entity->vme_anon_hash != NULL)
+ i_mod_hash_clear_nosync(entity->vme_anon_hash);
+
+ entity->vme_next = vmu_data.vmu_free_entities;
+ vmu_data.vmu_free_entities = entity;
+}
+
+/*
+ * Free zone entity, and all hashes of entities inside that zone,
+ * which are projects, tasks, and users.
+ */
+static void
+vmu_free_zone(mod_hash_val_t val)
+{
+ vmu_zone_t *zone = (vmu_zone_t *)val;
+
+ if (zone->vmz_zone != NULL) {
+ vmu_free_entity((mod_hash_val_t)zone->vmz_zone);
+ zone->vmz_zone = NULL;
+ }
+ if (zone->vmz_projects_hash != NULL)
+ i_mod_hash_clear_nosync(zone->vmz_projects_hash);
+ if (zone->vmz_tasks_hash != NULL)
+ i_mod_hash_clear_nosync(zone->vmz_tasks_hash);
+ if (zone->vmz_rusers_hash != NULL)
+ i_mod_hash_clear_nosync(zone->vmz_rusers_hash);
+ if (zone->vmz_eusers_hash != NULL)
+ i_mod_hash_clear_nosync(zone->vmz_eusers_hash);
+ zone->vmz_next = vmu_data.vmu_free_zones;
+ vmu_data.vmu_free_zones = zone;
+}
+
+/*
+ * Initialize synchronization primitives and hashes for system-wide tracking
+ * of visited vnodes and shared amps. Initialize results cache.
+ */
+void
+vm_usage_init()
+{
+ mutex_init(&vmu_data.vmu_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&vmu_data.vmu_cv, NULL, CV_DEFAULT, NULL);
+
+ vmu_data.vmu_system = NULL;
+ vmu_data.vmu_zones_hash = NULL;
+ vmu_data.vmu_projects_col_hash = NULL;
+ vmu_data.vmu_rusers_col_hash = NULL;
+ vmu_data.vmu_eusers_col_hash = NULL;
+
+ vmu_data.vmu_free_bounds = NULL;
+ vmu_data.vmu_free_objects = NULL;
+ vmu_data.vmu_free_entities = NULL;
+ vmu_data.vmu_free_zones = NULL;
+
+ vmu_data.vmu_all_vnodes_hash = mod_hash_create_ptrhash(
+ "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
+ sizeof (vnode_t));
+ vmu_data.vmu_all_amps_hash = mod_hash_create_ptrhash(
+ "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
+ sizeof (struct anon_map));
+ vmu_data.vmu_projects_col_hash = mod_hash_create_idhash(
+ "vmusage collapsed project hash", VMUSAGE_HASH_SIZE,
+ vmu_free_entity);
+ vmu_data.vmu_rusers_col_hash = mod_hash_create_idhash(
+ "vmusage collapsed ruser hash", VMUSAGE_HASH_SIZE,
+ vmu_free_entity);
+ vmu_data.vmu_eusers_col_hash = mod_hash_create_idhash(
+ "vmusage collapsed euser hash", VMUSAGE_HASH_SIZE,
+ vmu_free_entity);
+ vmu_data.vmu_zones_hash = mod_hash_create_idhash(
+ "vmusage zone hash", VMUSAGE_HASH_SIZE, vmu_free_zone);
+
+ vmu_bound_cache = kmem_cache_create("vmu_bound_cache",
+ sizeof (vmu_bound_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+ vmu_object_cache = kmem_cache_create("vmu_object_cache",
+ sizeof (vmu_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+ vmu_data.vmu_entities = NULL;
+ vmu_data.vmu_nentities = 0;
+
+ vmu_data.vmu_cache = NULL;
+ vmu_data.vmu_calc_thread = NULL;
+ vmu_data.vmu_calc_flags = 0;
+ vmu_data.vmu_pending_flags = 0;
+ vmu_data.vmu_pending_waiters = 0;
+}
+
+/*
+ * Allocate hashes for tracking vm objects visited for an entity.
+ * Update list of entities.
+ */
+static vmu_entity_t *
+vmu_alloc_entity(id_t id, int type, id_t zoneid)
+{
+ vmu_entity_t *entity;
+
+ if (vmu_data.vmu_free_entities != NULL) {
+ entity = vmu_data.vmu_free_entities;
+ vmu_data.vmu_free_entities =
+ vmu_data.vmu_free_entities->vme_next;
+ bzero(&entity->vme_result, sizeof (vmusage_t));
+ } else {
+ entity = kmem_zalloc(sizeof (vmu_entity_t), KM_SLEEP);
+ }
+ entity->vme_result.vmu_id = id;
+ entity->vme_result.vmu_zoneid = zoneid;
+ entity->vme_result.vmu_type = type;
+
+ if (entity->vme_vnode_hash == NULL)
+ entity->vme_vnode_hash = mod_hash_create_ptrhash(
+ "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
+ sizeof (vnode_t));
+
+ if (entity->vme_amp_hash == NULL)
+ entity->vme_amp_hash = mod_hash_create_ptrhash(
+ "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
+ sizeof (struct anon_map));
+
+ if (entity->vme_anon_hash == NULL)
+ entity->vme_anon_hash = mod_hash_create_ptrhash(
+ "vmusage anon hash", VMUSAGE_HASH_SIZE,
+ mod_hash_null_valdtor, sizeof (struct anon));
+
+ entity->vme_next = vmu_data.vmu_entities;
+ vmu_data.vmu_entities = entity;
+ vmu_data.vmu_nentities++;
+
+ return (entity);
+}
+
+/*
+ * Allocate a zone entity, and hashes for tracking visited vm objects
+ * for projects, tasks, and users within that zone.
+ */
+static vmu_zone_t *
+vmu_alloc_zone(id_t id)
+{
+ vmu_zone_t *zone;
+
+ if (vmu_data.vmu_free_zones != NULL) {
+ zone = vmu_data.vmu_free_zones;
+ vmu_data.vmu_free_zones =
+ vmu_data.vmu_free_zones->vmz_next;
+ zone->vmz_next = NULL;
+ zone->vmz_zone = NULL;
+ } else {
+ zone = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP);
+ }
+
+ zone->vmz_id = id;
+
+ if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0)
+ zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id);
+
+ if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS |
+ VMUSAGE_ALL_PROJECTS)) != 0 && zone->vmz_projects_hash == NULL)
+ zone->vmz_projects_hash = mod_hash_create_idhash(
+ "vmusage project hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
+
+ if ((vmu_data.vmu_calc_flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
+ != 0 && zone->vmz_tasks_hash == NULL)
+ zone->vmz_tasks_hash = mod_hash_create_idhash(
+ "vmusage task hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
+
+ if ((vmu_data.vmu_calc_flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS))
+ != 0 && zone->vmz_rusers_hash == NULL)
+ zone->vmz_rusers_hash = mod_hash_create_idhash(
+ "vmusage ruser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
+
+ if ((vmu_data.vmu_calc_flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS))
+ != 0 && zone->vmz_eusers_hash == NULL)
+ zone->vmz_eusers_hash = mod_hash_create_idhash(
+ "vmusage euser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
+
+ return (zone);
+}
+
+/*
+ * Allocate a structure for tracking visited bounds for a vm object.
+ */
+static vmu_object_t *
+vmu_alloc_object(caddr_t key, int type)
+{
+ vmu_object_t *object;
+
+ if (vmu_data.vmu_free_objects != NULL) {
+ object = vmu_data.vmu_free_objects;
+ vmu_data.vmu_free_objects =
+ vmu_data.vmu_free_objects->vmo_next;
+ } else {
+ object = kmem_cache_alloc(vmu_object_cache, KM_SLEEP);
+ }
+
+ object->vmo_key = key;
+ object->vmo_type = type;
+ object->vmo_bounds = NULL;
+
+ return (object);
+}
+
+/*
+ * Allocate and return a bound structure.
+ */
+static vmu_bound_t *
+vmu_alloc_bound()
+{
+ vmu_bound_t *bound;
+
+ if (vmu_data.vmu_free_bounds != NULL) {
+ bound = vmu_data.vmu_free_bounds;
+ vmu_data.vmu_free_bounds =
+ vmu_data.vmu_free_bounds->vmb_next;
+ bzero(bound, sizeof (vmu_bound_t));
+ } else {
+ bound = kmem_cache_alloc(vmu_bound_cache, KM_SLEEP);
+ bzero(bound, sizeof (vmu_bound_t));
+ }
+ return (bound);
+}
+
+/*
+ * vmu_find_insert_* functions implement hash lookup or allocate and
+ * insert operations.
+ */
+static vmu_object_t *
+vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type)
+{
+ int ret;
+ vmu_object_t *object;
+
+ ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
+ (mod_hash_val_t *)&object);
+ if (ret != 0) {
+ object = vmu_alloc_object(key, type);
+ ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
+ (mod_hash_val_t)object, (mod_hash_hndl_t)0);
+ ASSERT(ret == 0);
+ }
+ return (object);
+}
+
+static int
+vmu_find_insert_anon(mod_hash_t *hash, caddr_t key)
+{
+ int ret;
+ caddr_t val;
+
+ ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
+ (mod_hash_val_t *)&val);
+
+ if (ret == 0)
+ return (0);
+
+ ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
+ (mod_hash_val_t)key, (mod_hash_hndl_t)0);
+
+ ASSERT(ret == 0);
+
+ return (1);
+}
+
+static vmu_entity_t *
+vmu_find_insert_entity(mod_hash_t *hash, id_t id, uint_t type, id_t zoneid)
+{
+ int ret;
+ vmu_entity_t *entity;
+
+ ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)(uintptr_t)id,
+ (mod_hash_val_t *)&entity);
+ if (ret != 0) {
+ entity = vmu_alloc_entity(id, type, zoneid);
+ ret = i_mod_hash_insert_nosync(hash,
+ (mod_hash_key_t)(uintptr_t)id, (mod_hash_val_t)entity,
+ (mod_hash_hndl_t)0);
+ ASSERT(ret == 0);
+ }
+ return (entity);
+}
+
+/*
+ * Returns, via first and last, the list of object bounds between start and
+ * end. New bounds inserted by this call are given the specified type.
+ *
+ * Returns the number of pages covered by newly created bounds. Returns 0
+ * if the region between start and end is already covered by existing bounds.
+ */
+static pgcnt_t
+vmu_insert_lookup_object_bounds(vmu_object_t *ro, pgcnt_t start, pgcnt_t
+ end, char type, vmu_bound_t **first, vmu_bound_t **last)
+{
+ vmu_bound_t *next;
+ vmu_bound_t *prev = NULL;
+ vmu_bound_t *tmp = NULL;
+ pgcnt_t ret = 0;
+
+ *first = *last = NULL;
+
+ for (next = ro->vmo_bounds; next != NULL; next = next->vmb_next) {
+ /*
+ * Find bounds overlapping or overlapped by range [start,end].
+ */
+ if (start > next->vmb_end) {
+ /* bound is before new bound */
+ prev = next;
+ continue;
+ }
+ if (next->vmb_start > end) {
+ /* bound is after new bound */
+ break;
+ }
+ if (*first == NULL)
+ *first = next;
+ *last = next;
+ }
+
+ if (*first == NULL) {
+ ASSERT(*last == NULL);
+ /*
+ * No bounds overlapping range [start,end], so create new
+ * bound
+ */
+ tmp = vmu_alloc_bound();
+ tmp->vmb_start = start;
+ tmp->vmb_end = end;
+ tmp->vmb_type = type;
+ if (prev == NULL) {
+ tmp->vmb_next = ro->vmo_bounds;
+ ro->vmo_bounds = tmp;
+ } else {
+ tmp->vmb_next = prev->vmb_next;
+ prev->vmb_next = tmp;
+ }
+ *first = tmp;
+ *last = tmp;
+ ASSERT(tmp->vmb_end >= tmp->vmb_start);
+ ret = tmp->vmb_end - tmp->vmb_start + 1;
+ return (ret);
+ }
+
+ /* Check to see if start is before first known bound */
+ ASSERT(first != NULL && last != NULL);
+ next = (*first);
+ if (start < (*first)->vmb_start) {
+ /* Create new bound before first bound */
+ tmp = vmu_alloc_bound();
+ tmp->vmb_start = start;
+ tmp->vmb_end = (*first)->vmb_start - 1;
+ tmp->vmb_type = type;
+ tmp->vmb_next = *first;
+ if (*first == ro->vmo_bounds)
+ ro->vmo_bounds = tmp;
+ if (prev != NULL)
+ prev->vmb_next = tmp;
+ ASSERT(tmp->vmb_end >= tmp->vmb_start);
+ ret += tmp->vmb_end - tmp->vmb_start + 1;
+ *first = tmp;
+ }
+ /*
+ * Between start and end, search for gaps between and after existing
+ * bounds. Create new bounds to fill gaps if they exist.
+ */
+ while (end > next->vmb_end) {
+ /*
+ * Check for gap between bound and next bound. If no gap,
+ * continue.
+ */
+ if ((next != *last) &&
+ ((next->vmb_end + 1) == next->vmb_next->vmb_start)) {
+ next = next->vmb_next;
+ continue;
+ }
+ /*
+ * Insert new bound in gap after bound, and before next
+ * bound if next bound exists.
+ */
+ tmp = vmu_alloc_bound();
+ tmp->vmb_type = type;
+ tmp->vmb_next = next->vmb_next;
+ tmp->vmb_start = next->vmb_end + 1;
+
+ if (next != *last) {
+ tmp->vmb_end = next->vmb_next->vmb_start - 1;
+ ASSERT(tmp->vmb_end >= tmp->vmb_start);
+ ret += tmp->vmb_end - tmp->vmb_start + 1;
+ next->vmb_next = tmp;
+ next = tmp->vmb_next;
+ } else {
+ tmp->vmb_end = end;
+ ASSERT(tmp->vmb_end >= tmp->vmb_start);
+ ret += tmp->vmb_end - tmp->vmb_start + 1;
+ next->vmb_next = tmp;
+ *last = tmp;
+ break;
+ }
+ }
+ return (ret);
+}
+
+/*
+ * vmu_update_bounds()
+ *
+ * first, last: list of contiguous bounds, of which zero or more are of
+ * type VMUSAGE_BOUND_UNKNOWN.
+ *
+ * new_first, new_last: list of contiguous bounds, of which none are of
+ * type VMUSAGE_BOUND_UNKNOWN. These bounds are used to
+ * update the types of bounds in (first,last) with
+ * type VMUSAGE_BOUND_UNKNOWN.
+ *
+ * For the list of bounds (first,last), this function updates any bounds
+ * with type VMUSAGE_BOUND_UNKNOWN using the type of the corresponding bound in
+ * the list (new_first, new_last).
+ *
+ * If a bound of type VMUSAGE_BOUND_UNKNOWN spans multiple bounds in the list
+ * (new_first, new_last), it will be split into multiple bounds.
+ *
+ * Return value:
+ * The number of pages in the list of bounds (first,last) that were of
+ * type VMUSAGE_BOUND_UNKNOWN, which have been updated to be of type
+ * VMUSAGE_BOUND_INCORE.
+ *
+ */
+static pgcnt_t
+vmu_update_bounds(vmu_bound_t **first, vmu_bound_t **last,
+ vmu_bound_t *new_first, vmu_bound_t *new_last)
+{
+ vmu_bound_t *next, *new_next, *tmp;
+ pgcnt_t rss = 0;
+
+ next = *first;
+ new_next = new_first;
+
+ /* verify bounds span same pages */
+ ASSERT((*first)->vmb_start >= new_next->vmb_start);
+ ASSERT((*last)->vmb_end <= new_last->vmb_end);
+ for (;;) {
+ /* If bound already has type, proceed to next bound */
+ if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
+ if (next == *last)
+ break;
+ next = next->vmb_next;
+ continue;
+ }
+ while (new_next->vmb_end < next->vmb_start)
+ new_next = new_next->vmb_next;
+ ASSERT(new_next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
+ next->vmb_type = new_next->vmb_type;
+ if (new_next->vmb_end < next->vmb_end) {
+ /* need to split bound */
+ tmp = vmu_alloc_bound();
+ tmp->vmb_type = VMUSAGE_BOUND_UNKNOWN;
+ tmp->vmb_start = new_next->vmb_end + 1;
+ tmp->vmb_end = next->vmb_end;
+ tmp->vmb_next = next->vmb_next;
+ next->vmb_end = new_next->vmb_end;
+ next->vmb_next = tmp;
+ if (*last == next)
+ *last = tmp;
+ if (next->vmb_type == VMUSAGE_BOUND_INCORE)
+ rss += next->vmb_end - next->vmb_start + 1;
+ next = tmp;
+ } else {
+ if (next->vmb_type == VMUSAGE_BOUND_INCORE)
+ rss += next->vmb_end - next->vmb_start + 1;
+ if (next == *last)
+ break;
+ next = next->vmb_next;
+ }
+ }
+ return (rss);
+}
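As a worked example with hypothetical page numbers: if (first,last) arrives as a single UNKNOWN bound covering pages 0-9, and (new_first,new_last) is INCORE [0,3] followed by NOT_INCORE [4,9], the UNKNOWN bound is split in place and the function reports the four pages that became known to be resident:

    before: (first,last)         = { [0,9] UNKNOWN }
            (new_first,new_last) = { [0,3] INCORE, [4,9] NOT_INCORE }
    after:  (first,last)         = { [0,3] INCORE, [4,9] NOT_INCORE }
    return: 4   (pages updated to VMUSAGE_BOUND_INCORE)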
+
+/*
+ * merges adjacent bounds with same type between first and last bound.
+ * After merge, last pointer is no longer valid, as last bound may be
+ * merged away.
+ */
+static void
+vmu_merge_bounds(vmu_bound_t **first, vmu_bound_t **last)
+{
+ vmu_bound_t *next;
+ vmu_bound_t *tmp;
+
+ ASSERT(*first != NULL);
+ ASSERT(*last != NULL);
+
+ next = *first;
+ while (next != *last) {
+
+ /* If bounds are adjacent and have same type, merge them */
+ if (((next->vmb_end + 1) == next->vmb_next->vmb_start) &&
+ (next->vmb_type == next->vmb_next->vmb_type)) {
+ tmp = next->vmb_next;
+ next->vmb_end = tmp->vmb_end;
+ next->vmb_next = tmp->vmb_next;
+ vmu_free_bound(tmp);
+ if (tmp == *last)
+ *last = next;
+ } else {
+ next = next->vmb_next;
+ }
+ }
+}
+
+/*
+ * Given an amp and a list of bounds, updates each bound's type with
+ * VMUSAGE_BOUND_INCORE or VMUSAGE_BOUND_NOT_INCORE.
+ *
+ * If a bound is partially incore, it will be split into two bounds.
+ * first and last may be modified, as bounds may be split into multiple
+ * bounds if they are partially incore/not-incore.
+ *
+ * Set incore to B_TRUE if the bounds are already known to be incore.
+ *
+ */
+static void
+vmu_amp_update_incore_bounds(struct anon_map *amp, vmu_bound_t **first,
+ vmu_bound_t **last, boolean_t incore)
+{
+ vmu_bound_t *next;
+ vmu_bound_t *tmp;
+ pgcnt_t index;
+ short bound_type;
+ short page_type;
+ vnode_t *vn;
+ anoff_t off;
+ struct anon *ap;
+
+ next = *first;
+ /* Shared anon slots don't change once set */
+ ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
+ for (;;) {
+ if (incore == B_TRUE)
+ next->vmb_type = VMUSAGE_BOUND_INCORE;
+
+ if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
+ if (next == *last)
+ break;
+ next = next->vmb_next;
+ continue;
+ }
+ bound_type = next->vmb_type;
+ index = next->vmb_start;
+ while (index <= next->vmb_end) {
+
+ /*
+ * These are used to determine how much to increment
+ * index when a large page is found.
+ */
+ page_t *page;
+ pgcnt_t pgcnt = 1;
+ uint_t pgshft;
+ pgcnt_t pgmsk;
+
+ ap = anon_get_ptr(amp->ahp, index);
+ if (ap != NULL)
+ swap_xlate(ap, &vn, &off);
+
+ if (ap != NULL && vn != NULL && vn->v_pages != NULL &&
+ (page = page_exists(vn, off)) != NULL) {
+ page_type = VMUSAGE_BOUND_INCORE;
+ if (page->p_szc > 0) {
+ pgcnt = page_get_pagecnt(page->p_szc);
+ pgshft = page_get_shift(page->p_szc);
+ pgmsk = (0x1 << (pgshft - PAGESHIFT))
+ - 1;
+ }
+ } else {
+ page_type = VMUSAGE_BOUND_NOT_INCORE;
+ }
+ if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
+ next->vmb_type = page_type;
+ } else if (next->vmb_type != page_type) {
+ /*
+ * if current bound type does not match page
+ * type, need to split off new bound.
+ */
+ tmp = vmu_alloc_bound();
+ tmp->vmb_type = page_type;
+ tmp->vmb_start = index;
+ tmp->vmb_end = next->vmb_end;
+ tmp->vmb_next = next->vmb_next;
+ next->vmb_end = index - 1;
+ next->vmb_next = tmp;
+ if (*last == next)
+ *last = tmp;
+ next = tmp;
+ }
+ if (pgcnt > 1) {
+ /*
+ * If inside large page, jump to next large
+ * page
+ */
+ index = (index & ~pgmsk) + pgcnt;
+ } else {
+ index++;
+ }
+ }
+ if (next == *last) {
+ ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
+ break;
+ } else
+ next = next->vmb_next;
+ }
+ ANON_LOCK_EXIT(&amp->a_rwlock);
+}
+
+/*
+ * Same as vmu_amp_update_incore_bounds(), except for tracking
+ * incore-/not-incore for vnodes.
+ */
+static void
+vmu_vnode_update_incore_bounds(vnode_t *vnode, vmu_bound_t **first,
+ vmu_bound_t **last)
+{
+ vmu_bound_t *next;
+ vmu_bound_t *tmp;
+ pgcnt_t index;
+ short bound_type;
+ short page_type;
+
+ next = *first;
+ for (;;) {
+ if (vnode->v_pages == NULL)
+ next->vmb_type = VMUSAGE_BOUND_NOT_INCORE;
+
+ if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
+ if (next == *last)
+ break;
+ next = next->vmb_next;
+ continue;
+ }
+
+ bound_type = next->vmb_type;
+ index = next->vmb_start;
+ while (index <= next->vmb_end) {
+
+ /*
+ * These are used to determine how much to increment
+ * index when a large page is found.
+ */
+ page_t *page;
+ pgcnt_t pgcnt = 1;
+ uint_t pgshft;
+ pgcnt_t pgmsk;
+
+ if (vnode->v_pages != NULL &&
+ (page = page_exists(vnode, ptob(index))) != NULL) {
+ page_type = VMUSAGE_BOUND_INCORE;
+ if (page->p_szc > 0) {
+ pgcnt = page_get_pagecnt(page->p_szc);
+ pgshft = page_get_shift(page->p_szc);
+ pgmsk = (0x1 << (pgshft - PAGESHIFT))
+ - 1;
+ }
+ } else {
+ page_type = VMUSAGE_BOUND_NOT_INCORE;
+ }
+ if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
+ next->vmb_type = page_type;
+ } else if (next->vmb_type != page_type) {
+ /*
+ * if current bound type does not match page
+ * type, need to split off new bound.
+ */
+ tmp = vmu_alloc_bound();
+ tmp->vmb_type = page_type;
+ tmp->vmb_start = index;
+ tmp->vmb_end = next->vmb_end;
+ tmp->vmb_next = next->vmb_next;
+ next->vmb_end = index - 1;
+ next->vmb_next = tmp;
+ if (*last == next)
+ *last = tmp;
+ next = tmp;
+ }
+ if (pgcnt > 1) {
+ /*
+ * If inside large page, jump to next large
+ * page
+ */
+ index = (index & ~pgmsk) + pgcnt;
+ } else {
+ index++;
+ }
+ }
+ if (next == *last) {
+ ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
+ break;
+ } else
+ next = next->vmb_next;
+ }
+}
+
+/*
+ * Calculate the rss and swap consumed by a segment. vmu_entities is the
+ * list of entities to visit. For shared segments, the vnode or amp
+ * is looked up in each entity to see if it has already been counted. Private
+ * anon pages are checked per entity to ensure that cow pages are not
+ * double counted.
+ *
+ * For privately mapped files, the amp is first checked for private pages.
+ * Bounds not backed by the amp are looked up in the vnode for each entity
+ * to avoid double counting of private COW vnode pages.
+ */
+static void
+vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg)
+{
+ struct segvn_data *svd;
+ struct shm_data *shmd;
+ struct spt_data *sptd;
+ vmu_object_t *shared_object = NULL;
+ vmu_object_t *entity_object = NULL;
+ vmu_entity_t *entity;
+ vmusage_t *result;
+ vmu_bound_t *first = NULL;
+ vmu_bound_t *last = NULL;
+ vmu_bound_t *cur = NULL;
+ vmu_bound_t *e_first = NULL;
+ vmu_bound_t *e_last = NULL;
+ vmu_bound_t *tmp;
+ pgcnt_t p_index, s_index, p_start, p_end, s_start, s_end, rss, virt;
+ struct anon_map *private_amp = NULL;
+ boolean_t incore = B_FALSE;
+ boolean_t shared = B_FALSE;
+ int file = 0;
+ pgcnt_t swresv = 0;
+ pgcnt_t panon = 0;
+
+ /* Can zero-length segments exist? Not sure, so paranoia */
+ if (seg->s_size <= 0)
+ return;
+
+ /*
+ * Figure out if there is a shared object (such as a named vnode or
+ * a shared amp), then figure out if there is a private amp, which
+ * identifies private pages.
+ */
+ if (seg->s_ops == &segvn_ops) {
+ svd = (struct segvn_data *)seg->s_data;
+ if (svd->type == MAP_SHARED)
+ shared = B_TRUE;
+ else
+ swresv = svd->swresv;
+
+ if (svd->vp != NULL) {
+ file = 1;
+ shared_object = vmu_find_insert_object(
+ vmu_data.vmu_all_vnodes_hash, (caddr_t)svd->vp,
+ VMUSAGE_TYPE_VNODE);
+ s_start = btop(svd->offset);
+ s_end = btop(svd->offset + seg->s_size) - 1;
+ }
+ if (svd->amp != NULL && svd->type == MAP_SHARED) {
+ ASSERT(shared_object == NULL);
+ shared_object = vmu_find_insert_object(
+ vmu_data.vmu_all_amps_hash, (caddr_t)svd->amp,
+ VMUSAGE_TYPE_AMP);
+ s_start = svd->anon_index;
+ s_end = svd->anon_index + btop(seg->s_size) - 1;
+ /* schedctl mappings are always in core */
+ if (svd->amp->swresv == 0)
+ incore = B_TRUE;
+ }
+ if (svd->amp != NULL && svd->type == MAP_PRIVATE) {
+ private_amp = svd->amp;
+ p_start = svd->anon_index;
+ p_end = svd->anon_index + btop(seg->s_size) - 1;
+ }
+ } else if (seg->s_ops == &segspt_shmops) {
+ shared = B_TRUE;
+ shmd = (struct shm_data *)seg->s_data;
+ shared_object = vmu_find_insert_object(
+ vmu_data.vmu_all_amps_hash, (caddr_t)shmd->shm_amp,
+ VMUSAGE_TYPE_AMP);
+ s_start = 0;
+ s_end = btop(seg->s_size) - 1;
+ sptd = shmd->shm_sptseg->s_data;
+
+ /* ism segments are always incore and do not reserve swap */
+ if (sptd->spt_flags & SHM_SHARE_MMU)
+ incore = B_TRUE;
+
+ } else {
+ return;
+ }
+
+ /*
+ * If there is a private amp, count anon pages that exist. If an
+ * anon has a refcnt > 1 (cow sharing), then save the anon in a
+ * hash so that it is not double counted.
+ *
+ * If there is also a shared object, then figure out the bounds
+ * which are not mapped by the private amp.
+ */
+ if (private_amp != NULL) {
+
+ /* Enter as writer to prevent cow anons from being freed */
+ ANON_LOCK_ENTER(&private_amp->a_rwlock, RW_WRITER);
+
+ p_index = p_start;
+ s_index = s_start;
+
+ while (p_index <= p_end) {
+
+ pgcnt_t p_index_next;
+ pgcnt_t p_bound_size;
+ int cnt;
+ anoff_t off;
+ struct vnode *vn;
+ struct anon *ap;
+ page_t *page; /* For handling of large */
+ pgcnt_t pgcnt = 1; /* pages */
+ pgcnt_t pgstart;
+ pgcnt_t pgend;
+ uint_t pgshft;
+ pgcnt_t pgmsk;
+
+ p_index_next = p_index;
+ ap = anon_get_next_ptr(private_amp->ahp,
+ &p_index_next);
+
+ /*
+ * If next anon is past end of mapping, simulate
+ * end of anon so loop terminates.
+ */
+ if (p_index_next > p_end) {
+ p_index_next = p_end + 1;
+ ap = NULL;
+ }
+ /*
+ * For cow segments, keep track of bounds not
+ * backed by private amp so they can be looked
+ * up in the backing vnode
+ */
+ if (p_index_next != p_index) {
+
+ /*
+ * Compute index difference between anon and
+ * previous anon.
+ */
+ p_bound_size = p_index_next - p_index - 1;
+
+ if (shared_object != NULL) {
+ cur = vmu_alloc_bound();
+ cur->vmb_next = NULL;
+ cur->vmb_start = s_index;
+ cur->vmb_end = s_index + p_bound_size;
+ cur->vmb_type = VMUSAGE_BOUND_UNKNOWN;
+ if (first == NULL) {
+ first = cur;
+ last = cur;
+ } else {
+ last->vmb_next = cur;
+ last = cur;
+ }
+ }
+ p_index = p_index + p_bound_size + 1;
+ s_index = s_index + p_bound_size + 1;
+ }
+
+ /* Detect end of anons in amp */
+ if (ap == NULL)
+ break;
+
+ cnt = ap->an_refcnt;
+ swap_xlate(ap, &vn, &off);
+
+ if (vn == NULL || vn->v_pages == NULL ||
+ (page = page_exists(vn, off)) == NULL) {
+ p_index++;
+ s_index++;
+ continue;
+ }
+
+ /*
+ * If large page is found, compute portion of large
+ * page in mapping, and increment indices to the next
+ * large page.
+ */
+ if (page->p_szc > 0) {
+
+ pgcnt = page_get_pagecnt(page->p_szc);
+ pgshft = page_get_shift(page->p_szc);
+ pgmsk = (0x1 << (pgshft - PAGESHIFT)) - 1;
+
+ /* First page in large page */
+ pgstart = p_index & ~pgmsk;
+ /* Last page in large page */
+ pgend = pgstart + pgcnt - 1;
+ /*
+ * Artificially end page if page extends past
+ * end of mapping.
+ */
+ if (pgend > p_end)
+ pgend = p_end;
+
+ /*
+ * Compute number of pages from large page
+ * which are mapped.
+ */
+ pgcnt = pgend - p_index + 1;
+
+ /*
+ * Point indices at page after large page,
+ * or at page after end of mapping.
+ */
+ p_index += pgcnt;
+ s_index += pgcnt;
+ } else {
+ p_index++;
+ s_index++;
+ }
+
+ /*
+ * Assume anon structs with a refcnt
+ * of 1 are not cow shared, so there
+ * is no reason to track them per entity.
+ */
+ if (cnt == 1) {
+ panon += pgcnt;
+ continue;
+ }
+ for (entity = vmu_entities; entity != NULL;
+ entity = entity->vme_next_calc) {
+
+ result = &entity->vme_result;
+ /*
+ * Track cow anons per entity so
+ * they are not double counted.
+ */
+ if (vmu_find_insert_anon(entity->vme_anon_hash,
+ (caddr_t)ap) == 0)
+ continue;
+
+ result->vmu_rss_all += (pgcnt << PAGESHIFT);
+ result->vmu_rss_private +=
+ (pgcnt << PAGESHIFT);
+ }
+ }
+ ANON_LOCK_EXIT(&private_amp->a_rwlock);
+ }
+
+ /* Add up resident anon and swap reserved for private mappings */
+ if (swresv > 0 || panon > 0) {
+ for (entity = vmu_entities; entity != NULL;
+ entity = entity->vme_next_calc) {
+ result = &entity->vme_result;
+ result->vmu_swap_all += swresv;
+ result->vmu_swap_private += swresv;
+ result->vmu_rss_all += (panon << PAGESHIFT);
+ result->vmu_rss_private += (panon << PAGESHIFT);
+ }
+ }
+
+ /* Compute resident pages backing shared amp or named vnode */
+ if (shared_object != NULL) {
+ if (first == NULL) {
+ /*
+ * No private amp, or private amp has no anon
+ * structs. This means entire segment is backed by
+ * the shared object.
+ */
+ first = vmu_alloc_bound();
+ first->vmb_next = NULL;
+ first->vmb_start = s_start;
+ first->vmb_end = s_end;
+ first->vmb_type = VMUSAGE_BOUND_UNKNOWN;
+ }
+ /*
+ * Iterate bounds not backed by private amp, and compute
+ * resident pages.
+ */
+ cur = first;
+ while (cur != NULL) {
+
+ if (vmu_insert_lookup_object_bounds(shared_object,
+ cur->vmb_start, cur->vmb_end, VMUSAGE_BOUND_UNKNOWN,
+ &first, &last) > 0) {
+ /* new bounds, find incore/not-incore */
+ if (shared_object->vmo_type ==
+ VMUSAGE_TYPE_VNODE)
+ vmu_vnode_update_incore_bounds(
+ (vnode_t *)
+ shared_object->vmo_key, &first,
+ &last);
+ else
+ vmu_amp_update_incore_bounds(
+ (struct anon_map *)
+ shared_object->vmo_key, &first,
+ &last, incore);
+ vmu_merge_bounds(&first, &last);
+ }
+ for (entity = vmu_entities; entity != NULL;
+ entity = entity->vme_next_calc) {
+
+ result = &entity->vme_result;
+
+ entity_object = vmu_find_insert_object(
+ shared_object->vmo_type ==
+ VMUSAGE_TYPE_VNODE ? entity->vme_vnode_hash:
+ entity->vme_amp_hash,
+ shared_object->vmo_key,
+ shared_object->vmo_type);
+
+ virt = vmu_insert_lookup_object_bounds(
+ entity_object, cur->vmb_start, cur->vmb_end,
+ VMUSAGE_BOUND_UNKNOWN, &e_first, &e_last);
+
+ if (virt == 0)
+ continue;
+ /*
+ * Range visited for this entity
+ */
+ rss = vmu_update_bounds(&e_first,
+ &e_last, first, last);
+ result->vmu_rss_all += (rss << PAGESHIFT);
+ if (shared == B_TRUE && file == B_FALSE) {
+ /* shared anon mapping */
+ result->vmu_swap_all +=
+ (virt << PAGESHIFT);
+ result->vmu_swap_shared +=
+ (virt << PAGESHIFT);
+ result->vmu_rss_shared +=
+ (rss << PAGESHIFT);
+ } else if (shared == B_TRUE && file == B_TRUE) {
+ /* shared file mapping */
+ result->vmu_rss_shared +=
+ (rss << PAGESHIFT);
+ } else if (shared == B_FALSE &&
+ file == B_TRUE) {
+ /* private file mapping */
+ result->vmu_rss_private +=
+ (rss << PAGESHIFT);
+ }
+ vmu_merge_bounds(&e_first, &e_last);
+ }
+ tmp = cur;
+ cur = cur->vmb_next;
+ vmu_free_bound(tmp);
+ }
+ }
+}
+
+/*
+ * Based on the current calculation flags, find the entities which are
+ * relevant to the process. Then calculate each segment in the
+ * process's address space for each relevant entity.
+ */
+static void
+vmu_calculate_proc(proc_t *p)
+{
+ vmu_entity_t *entities = NULL;
+ vmu_zone_t *zone;
+ vmu_entity_t *tmp;
+ struct as *as;
+ struct seg *seg;
+ int ret;
+
+ /* Figure out which entities are being computed */
+ if ((vmu_data.vmu_system) != NULL) {
+ tmp = vmu_data.vmu_system;
+ tmp->vme_next_calc = entities;
+ entities = tmp;
+ }
+ if (vmu_data.vmu_calc_flags &
+ (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS |
+ VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
+ VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS |
+ VMUSAGE_ALL_EUSERS)) {
+ ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
+ (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
+ (mod_hash_val_t *)&zone);
+ if (ret != 0) {
+ zone = vmu_alloc_zone(p->p_zone->zone_id);
+ ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash,
+ (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
+ (mod_hash_val_t)zone, (mod_hash_hndl_t)0);
+ ASSERT(ret == 0);
+ }
+ if (zone->vmz_zone != NULL) {
+ tmp = zone->vmz_zone;
+ tmp->vme_next_calc = entities;
+ entities = tmp;
+ }
+ if (vmu_data.vmu_calc_flags &
+ (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS)) {
+ tmp = vmu_find_insert_entity(zone->vmz_projects_hash,
+ p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS,
+ zone->vmz_id);
+ tmp->vme_next_calc = entities;
+ entities = tmp;
+ }
+ if (vmu_data.vmu_calc_flags &
+ (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) {
+ tmp = vmu_find_insert_entity(zone->vmz_tasks_hash,
+ p->p_task->tk_tkid, VMUSAGE_TASKS, zone->vmz_id);
+ tmp->vme_next_calc = entities;
+ entities = tmp;
+ }
+ if (vmu_data.vmu_calc_flags &
+ (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) {
+ tmp = vmu_find_insert_entity(zone->vmz_rusers_hash,
+ crgetruid(p->p_cred), VMUSAGE_RUSERS, zone->vmz_id);
+ tmp->vme_next_calc = entities;
+ entities = tmp;
+ }
+ if (vmu_data.vmu_calc_flags &
+ (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) {
+ tmp = vmu_find_insert_entity(zone->vmz_eusers_hash,
+ crgetuid(p->p_cred), VMUSAGE_EUSERS, zone->vmz_id);
+ tmp->vme_next_calc = entities;
+ entities = tmp;
+ }
+ }
+ /* Entities which collapse projects and users for all zones */
+ if (vmu_data.vmu_calc_flags & VMUSAGE_COL_PROJECTS) {
+ tmp = vmu_find_insert_entity(vmu_data.vmu_projects_col_hash,
+ p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS, ALL_ZONES);
+ tmp->vme_next_calc = entities;
+ entities = tmp;
+ }
+ if (vmu_data.vmu_calc_flags & VMUSAGE_COL_RUSERS) {
+ tmp = vmu_find_insert_entity(vmu_data.vmu_rusers_col_hash,
+ crgetruid(p->p_cred), VMUSAGE_RUSERS, ALL_ZONES);
+ tmp->vme_next_calc = entities;
+ entities = tmp;
+ }
+ if (vmu_data.vmu_calc_flags & VMUSAGE_COL_EUSERS) {
+ tmp = vmu_find_insert_entity(vmu_data.vmu_eusers_col_hash,
+ crgetuid(p->p_cred), VMUSAGE_EUSERS, ALL_ZONES);
+ tmp->vme_next_calc = entities;
+ entities = tmp;
+ }
+
+ ASSERT(entities != NULL);
+ /* process all segs in process's address space */
+ as = p->p_as;
+ AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
+ for (seg = AS_SEGFIRST(as); seg != NULL;
+ seg = AS_SEGNEXT(as, seg)) {
+ vmu_calculate_seg(entities, seg);
+ }
+ AS_LOCK_EXIT(as, &as->a_lock);
+}
+
+/*
+ * Free data created by the previous call to vmu_calculate().
+ */
+static void
+vmu_clear_calc()
+{
+ if (vmu_data.vmu_system != NULL)
+ vmu_free_entity(vmu_data.vmu_system);
+ vmu_data.vmu_system = NULL;
+ if (vmu_data.vmu_zones_hash != NULL)
+ i_mod_hash_clear_nosync(vmu_data.vmu_zones_hash);
+ if (vmu_data.vmu_projects_col_hash != NULL)
+ i_mod_hash_clear_nosync(vmu_data.vmu_projects_col_hash);
+ if (vmu_data.vmu_rusers_col_hash != NULL)
+ i_mod_hash_clear_nosync(vmu_data.vmu_rusers_col_hash);
+ if (vmu_data.vmu_eusers_col_hash != NULL)
+ i_mod_hash_clear_nosync(vmu_data.vmu_eusers_col_hash);
+
+ i_mod_hash_clear_nosync(vmu_data.vmu_all_vnodes_hash);
+ i_mod_hash_clear_nosync(vmu_data.vmu_all_amps_hash);
+}
+
+/*
+ * Free unused data structures. These can result if the system workload
+ * decreases between calculations.
+ */
+static void
+vmu_free_extra()
+{
+ vmu_bound_t *tb;
+ vmu_object_t *to;
+ vmu_entity_t *te;
+ vmu_zone_t *tz;
+
+ while (vmu_data.vmu_free_bounds != NULL) {
+ tb = vmu_data.vmu_free_bounds;
+ vmu_data.vmu_free_bounds = vmu_data.vmu_free_bounds->vmb_next;
+ kmem_cache_free(vmu_bound_cache, tb);
+ }
+ while (vmu_data.vmu_free_objects != NULL) {
+ to = vmu_data.vmu_free_objects;
+ vmu_data.vmu_free_objects =
+ vmu_data.vmu_free_objects->vmo_next;
+ kmem_cache_free(vmu_object_cache, to);
+ }
+ while (vmu_data.vmu_free_entities != NULL) {
+ te = vmu_data.vmu_free_entities;
+ vmu_data.vmu_free_entities =
+ vmu_data.vmu_free_entities->vme_next;
+ if (te->vme_vnode_hash != NULL)
+ mod_hash_destroy_hash(te->vme_vnode_hash);
+ if (te->vme_amp_hash != NULL)
+ mod_hash_destroy_hash(te->vme_amp_hash);
+ if (te->vme_anon_hash != NULL)
+ mod_hash_destroy_hash(te->vme_anon_hash);
+ kmem_free(te, sizeof (vmu_entity_t));
+ }
+ while (vmu_data.vmu_free_zones != NULL) {
+ tz = vmu_data.vmu_free_zones;
+ vmu_data.vmu_free_zones =
+ vmu_data.vmu_free_zones->vmz_next;
+ if (tz->vmz_projects_hash != NULL)
+ mod_hash_destroy_hash(tz->vmz_projects_hash);
+ if (tz->vmz_tasks_hash != NULL)
+ mod_hash_destroy_hash(tz->vmz_tasks_hash);
+ if (tz->vmz_rusers_hash != NULL)
+ mod_hash_destroy_hash(tz->vmz_rusers_hash);
+ if (tz->vmz_eusers_hash != NULL)
+ mod_hash_destroy_hash(tz->vmz_eusers_hash);
+ kmem_free(tz, sizeof (vmu_zone_t));
+ }
+}
+
+extern kcondvar_t *pr_pid_cv;
+
+/*
+ * Determine which entity types are relevant and allocate the hashes to
+ * track them. Then walk the process table and count rss and swap
+ * for each process's address space. Address space objects such as
+ * vnodes, amps, and anons are tracked per entity, so that they are
+ * not double counted in the results.
+ */
+static void
+vmu_calculate()
+{
+ int i = 0;
+ int ret;
+ proc_t *p;
+
+ vmu_clear_calc();
+
+ if (vmu_data.vmu_calc_flags & VMUSAGE_SYSTEM)
+ vmu_data.vmu_system = vmu_alloc_entity(0, VMUSAGE_SYSTEM,
+ ALL_ZONES);
+
+ /*
+ * Walk process table and calculate rss of each proc.
+ *
+ * Pidlock and p_lock cannot be held while doing the rss calculation.
+ * This is because:
+ * 1. The calculation allocates using KM_SLEEP.
+ * 2. The calculation grabs a_lock, which cannot be grabbed
+ * after p_lock.
+ *
+ * Since pidlock must be dropped, we cannot simply walk the
+ * practive list. Instead, we walk the process table, and sprlock
+ * each process to ensure that it does not exit during the
+ * calculation.
+ */
+
+ mutex_enter(&pidlock);
+ for (i = 0; i < v.v_proc; i++) {
+again:
+ p = pid_entry(i);
+ if (p == NULL)
+ continue;
+
+ mutex_enter(&p->p_lock);
+ mutex_exit(&pidlock);
+
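+		/* Give up immediately if the system has panicked. */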
+ if (panicstr) {
+ mutex_exit(&p->p_lock);
+ return;
+ }
+
+ /* Try to set P_PR_LOCK */
+ ret = sprtrylock_proc(p);
+ if (ret == -1) {
+ /* Process in invalid state */
+ mutex_exit(&p->p_lock);
+ mutex_enter(&pidlock);
+ continue;
+ } else if (ret == 1) {
+ /*
+ * P_PR_LOCK is already set. Wait and try again.
+ * This also drops p_lock.
+ */
+ sprwaitlock_proc(p);
+ mutex_enter(&pidlock);
+ goto again;
+ }
+ mutex_exit(&p->p_lock);
+
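+		/*
+		 * The process is now P_PR_LOCK'd, so it cannot exit while
+		 * its address space is being examined with p_lock dropped.
+		 */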
+ vmu_calculate_proc(p);
+
+ mutex_enter(&p->p_lock);
+ sprunlock(p);
+ mutex_enter(&pidlock);
+ }
+ mutex_exit(&pidlock);
+
+ vmu_free_extra();
+}
+
+/*
+ * Allocate a new cache for nres results satisfying flags.
+ */
+vmu_cache_t *
+vmu_cache_alloc(size_t nres, uint_t flags)
+{
+ vmu_cache_t *cache;
+
+ cache = kmem_zalloc(sizeof (vmu_cache_t), KM_SLEEP);
+ cache->vmc_results = kmem_zalloc(sizeof (vmusage_t) * nres, KM_SLEEP);
+ cache->vmc_nresults = nres;
+ cache->vmc_flags = flags;
+ cache->vmc_refcnt = 1;
+ return (cache);
+}
+
+/*
+ * Take a hold on the cache so that the cached results are not freed.
+ */
+static void
+vmu_cache_hold(vmu_cache_t *cache)
+{
+ ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
+ cache->vmc_refcnt++;
+}
+
+/*
+ * Release a hold on the cache; free the cache when the last hold is dropped.
+ */
+static void
+vmu_cache_rele(vmu_cache_t *cache)
+{
+ ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
+ ASSERT(cache->vmc_refcnt > 0);
+ cache->vmc_refcnt--;
+ if (cache->vmc_refcnt == 0) {
+ kmem_free(cache->vmc_results, sizeof (vmusage_t) *
+ cache->vmc_nresults);
+ kmem_free(cache, sizeof (vmu_cache_t));
+ }
+}
+
+/*
+ * Copy out the cached results to a caller. Inspect the caller's flags
+ * and zone to determine which cached results should be copied.
+ */
+static int
+vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
+ uint_t flags)
+{
+ vmusage_t *result, *out_result;
+ vmusage_t dummy;
+ size_t i, count = 0;
+ size_t bufsize;
+ int ret = 0;
+ uint_t types = 0;
+
+ if (nres != NULL) {
+ if (copyin((caddr_t)nres, &bufsize, sizeof (size_t)))
+ return (set_errno(EFAULT));
+ } else {
+ bufsize = 0;
+ }
+
+ /* figure out what results the caller is interested in. */
+ if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone)
+ types |= VMUSAGE_SYSTEM;
+ if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES))
+ types |= VMUSAGE_ZONE;
+ if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
+ VMUSAGE_COL_PROJECTS))
+ types |= VMUSAGE_PROJECTS;
+ if (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
+ types |= VMUSAGE_TASKS;
+ if (flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS))
+ types |= VMUSAGE_RUSERS;
+ if (flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS))
+ types |= VMUSAGE_EUSERS;
+
+ /* count results for current zone */
+ out_result = buf;
+ for (result = cache->vmc_results, i = 0;
+ i < cache->vmc_nresults; result++, i++) {
+
+ /* Do not return "other-zone" results to non-global zones */
+ if (curproc->p_zone != global_zone &&
+ curproc->p_zone->zone_id != result->vmu_zoneid)
+ continue;
+
+ /*
+		 * If a non-global zone requests VMUSAGE_SYSTEM, fake up
+		 * its VMUSAGE_ZONE result as the VMUSAGE_SYSTEM result.
+ */
+ if (curproc->p_zone != global_zone &&
+ (flags & VMUSAGE_SYSTEM) != 0 &&
+ result->vmu_type == VMUSAGE_ZONE) {
+ count++;
+ if (out_result != NULL) {
+ if (bufsize < count) {
+ ret = set_errno(EOVERFLOW);
+ } else {
+ dummy = *result;
+ dummy.vmu_zoneid = ALL_ZONES;
+ dummy.vmu_id = 0;
+ dummy.vmu_type = VMUSAGE_SYSTEM;
+ if (copyout(&dummy, out_result,
+ sizeof (vmusage_t)))
+ return (set_errno(
+ EFAULT));
+ out_result++;
+ }
+ }
+ }
+
+ /* Skip results that do not match requested type */
+ if ((result->vmu_type & types) == 0)
+ continue;
+
+ /* Skip collated results if not requested */
+ if (result->vmu_zoneid == ALL_ZONES) {
+ if (result->vmu_type == VMUSAGE_PROJECTS &&
+ (flags & VMUSAGE_COL_PROJECTS) == 0)
+ continue;
+ if (result->vmu_type == VMUSAGE_EUSERS &&
+ (flags & VMUSAGE_COL_EUSERS) == 0)
+ continue;
+ if (result->vmu_type == VMUSAGE_RUSERS &&
+ (flags & VMUSAGE_COL_RUSERS) == 0)
+ continue;
+ }
+
+ /* Skip "other zone" results if not requested */
+ if (result->vmu_zoneid != curproc->p_zone->zone_id) {
+ if (result->vmu_type == VMUSAGE_ZONE &&
+ (flags & VMUSAGE_ALL_ZONES) == 0)
+ continue;
+ if (result->vmu_type == VMUSAGE_PROJECTS &&
+ (flags & (VMUSAGE_ALL_PROJECTS |
+ VMUSAGE_COL_PROJECTS)) == 0)
+ continue;
+ if (result->vmu_type == VMUSAGE_TASKS &&
+ (flags & VMUSAGE_ALL_TASKS) == 0)
+ continue;
+ if (result->vmu_type == VMUSAGE_RUSERS &&
+ (flags & (VMUSAGE_ALL_RUSERS |
+ VMUSAGE_COL_RUSERS)) == 0)
+ continue;
+ if (result->vmu_type == VMUSAGE_EUSERS &&
+ (flags & (VMUSAGE_ALL_EUSERS |
+ VMUSAGE_COL_EUSERS)) == 0)
+ continue;
+ }
+ count++;
+ if (out_result != NULL) {
+ if (bufsize < count) {
+ ret = set_errno(EOVERFLOW);
+ } else {
+ if (copyout(result, out_result,
+ sizeof (vmusage_t)))
+ return (set_errno(EFAULT));
+ out_result++;
+ }
+ }
+ }
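+	/* Report the number of results found (or required) to the caller. */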
+ if (nres != NULL)
+ if (copyout(&count, (void *)nres, sizeof (size_t)))
+ return (set_errno(EFAULT));
+
+ return (ret);
+}
+
+/*
+ * vm_getusage()
+ *
+ * Counts rss and swap by zone, project, task, and/or user. The flags argument
+ * determines the type of result structures returned. Flags requesting
+ * results from more than one zone are "flattened" to the local zone if the
+ * caller is not in the global zone.
+ *
+ * args:
+ *	flags:	bitmap consisting of one or more of VMUSAGE_*.
+ *	age:	maximum allowable age (time since counting was done) in
+ *		seconds of the results. Results from previous callers are
+ *		cached in the kernel.
+ *	buf:	pointer to buffer array of vmusage_t. If NULL, then only nres
+ *		is set on success.
+ *	nres:	Set by the caller to the number of vmusage_t structures
+ *		pointed to by buf before calling vm_getusage().
+ *		On return of 0 (success) or EOVERFLOW, it is set to the number
+ *		of result structures returned or attempted to return.
+ *
+ * returns 0 on success, -1 on failure:
+ *	EINTR (interrupted)
+ *	EOVERFLOW (nres too small for results; nres set to the needed value)
+ *	EINVAL (flags invalid)
+ *	EFAULT (bad address for buf or nres)
+ */
+int
+vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres)
+{
+ vmu_entity_t *entity;
+ vmusage_t *result;
+ int ret = 0;
+ int cacherecent = 0;
+ hrtime_t now;
+ uint_t flags_orig;
+
+ /*
+	 * Non-global zones cannot request system-wide or collated results,
+	 * nor the VMUSAGE_SYSTEM result, so munge the flags accordingly.
+ */
+ flags_orig = flags;
+ if (curproc->p_zone != global_zone) {
+ if (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) {
+ flags &= ~(VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS);
+ flags |= VMUSAGE_PROJECTS;
+ }
+ if (flags & (VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) {
+ flags &= ~(VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS);
+ flags |= VMUSAGE_RUSERS;
+ }
+ if (flags & (VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) {
+ flags &= ~(VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS);
+ flags |= VMUSAGE_EUSERS;
+ }
+ if (flags & VMUSAGE_SYSTEM) {
+ flags &= ~VMUSAGE_SYSTEM;
+ flags |= VMUSAGE_ZONE;
+ }
+ }
+
+ /* Check for unknown flags */
+ if ((flags & (~VMUSAGE_MASK)) != 0)
+ return (set_errno(EINVAL));
+
+ /* Check for no flags */
+ if ((flags & VMUSAGE_MASK) == 0)
+ return (set_errno(EINVAL));
+
+ mutex_enter(&vmu_data.vmu_lock);
+ now = gethrtime();
+
+start:
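+	/*
+	 * If a cached result is recent enough and already covers all of
+	 * the requested flags, copy it out directly.
+	 */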
+ if (vmu_data.vmu_cache != NULL) {
+
+ vmu_cache_t *cache;
+
+ if ((vmu_data.vmu_cache->vmc_timestamp +
+ ((hrtime_t)age * NANOSEC)) > now)
+ cacherecent = 1;
+
+ if ((vmu_data.vmu_cache->vmc_flags & flags) == flags &&
+ cacherecent == 1) {
+ cache = vmu_data.vmu_cache;
+ vmu_cache_hold(cache);
+ mutex_exit(&vmu_data.vmu_lock);
+
+ ret = vmu_copyout_results(cache, buf, nres, flags_orig);
+ mutex_enter(&vmu_data.vmu_lock);
+ vmu_cache_rele(cache);
+ if (vmu_data.vmu_pending_waiters > 0)
+ cv_broadcast(&vmu_data.vmu_cv);
+ mutex_exit(&vmu_data.vmu_lock);
+ return (ret);
+ }
+ /*
+ * If the cache is recent, it is likely that there are other
+ * consumers of vm_getusage running, so add their flags to the
+ * desired flags for the calculation.
+ */
+ if (cacherecent == 1)
+ flags = vmu_data.vmu_cache->vmc_flags | flags;
+ }
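+	/*
+	 * No calculation is in progress, so this thread performs a fresh
+	 * calculation and caches the results.
+	 */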
+ if (vmu_data.vmu_calc_thread == NULL) {
+
+ vmu_cache_t *cache;
+
+ vmu_data.vmu_calc_thread = curthread;
+ vmu_data.vmu_calc_flags = flags;
+ vmu_data.vmu_entities = NULL;
+ vmu_data.vmu_nentities = 0;
+ if (vmu_data.vmu_pending_waiters > 0)
+ vmu_data.vmu_calc_flags |=
+ vmu_data.vmu_pending_flags;
+
+ vmu_data.vmu_pending_flags = 0;
+ mutex_exit(&vmu_data.vmu_lock);
+ vmu_calculate();
+ mutex_enter(&vmu_data.vmu_lock);
+ /* copy results to cache */
+ if (vmu_data.vmu_cache != NULL)
+ vmu_cache_rele(vmu_data.vmu_cache);
+ cache = vmu_data.vmu_cache =
+ vmu_cache_alloc(vmu_data.vmu_nentities,
+ vmu_data.vmu_calc_flags);
+
+ result = cache->vmc_results;
+ for (entity = vmu_data.vmu_entities; entity != NULL;
+ entity = entity->vme_next) {
+ *result = entity->vme_result;
+ result++;
+ }
+ cache->vmc_timestamp = gethrtime();
+ vmu_cache_hold(cache);
+
+ vmu_data.vmu_calc_flags = 0;
+ vmu_data.vmu_calc_thread = NULL;
+
+ if (vmu_data.vmu_pending_waiters > 0)
+ cv_broadcast(&vmu_data.vmu_cv);
+
+ mutex_exit(&vmu_data.vmu_lock);
+
+ /* copy cache */
+ ret = vmu_copyout_results(cache, buf, nres, flags_orig);
+ mutex_enter(&vmu_data.vmu_lock);
+ vmu_cache_rele(cache);
+ mutex_exit(&vmu_data.vmu_lock);
+
+ return (ret);
+ }
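+	/*
+	 * Another thread is already calculating; record the requested flags
+	 * and wait for it to finish before retrying.
+	 */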
+ vmu_data.vmu_pending_flags |= flags;
+ vmu_data.vmu_pending_waiters++;
+ while (vmu_data.vmu_calc_thread != NULL) {
+ if (cv_wait_sig(&vmu_data.vmu_cv,
+ &vmu_data.vmu_lock) == 0) {
+ vmu_data.vmu_pending_waiters--;
+ mutex_exit(&vmu_data.vmu_lock);
+ return (set_errno(EINTR));
+ }
+ }
+ vmu_data.vmu_pending_waiters--;
+ goto start;
+}
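For reference, a minimal user-level sketch of how this interface is consumed. It assumes the getvmusage(2) libc wrapper that accompanies this syscall (the wrapper is not part of this diff), and simply prints per-zone rss and swap; the VMUSAGE_ZONE flag and the 5-second age are illustrative only.

	#include <sys/types.h>
	#include <sys/vm_usage.h>
	#include <stdio.h>
	#include <stdlib.h>

	int
	main(void)
	{
		size_t nres = 0;
		size_t i;
		vmusage_t *buf;

		/* First call with a NULL buffer: learn how many results exist. */
		if (getvmusage(VMUSAGE_ZONE, 5, NULL, &nres) != 0) {
			perror("getvmusage");
			return (1);
		}

		if ((buf = calloc(nres, sizeof (vmusage_t))) == NULL)
			return (1);

		/* Second call: fetch results no older than 5 seconds. */
		if (getvmusage(VMUSAGE_ZONE, 5, buf, &nres) != 0) {
			perror("getvmusage");
			return (1);
		}

		for (i = 0; i < nres; i++)
			(void) printf("zone %d: rss %llu swap %llu\n",
			    (int)buf[i].vmu_zoneid,
			    (u_longlong_t)buf[i].vmu_rss_all,
			    (u_longlong_t)buf[i].vmu_swap_all);

		free(buf);
		return (0);
	}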