| author | gjelinek <none@none> | 2006-12-14 13:35:17 -0800 |
|---|---|---|
| committer | gjelinek <none@none> | 2006-12-14 13:35:17 -0800 |
| commit | 0209230bf1261579beab4f55226bb509e6b850cb | |
| tree | c605b4105191d5a10962c524ad08019742cd52cb /usr/src/uts | |
| parent | 780774645a5b1b0176916fc66312dc1d9b4d14b4 | |
| download | illumos-joyent-0209230bf1261579beab4f55226bb509e6b850cb.tar.gz | |
PSARC 2006/496 Improved Zones/RM Integration
PSARC 2006/598 Swap resource control; locked memory RM improvements
PSARC 2006/660 rcapadm zone option
4754856 *prstat* prstat -atJTZ should count shared segments only once
4970603 RFE: should be able to persistently specify global zone's cpu shares
5026227 RFE: ability to rcap zones from global zone
5103071 RFE: local zones can run the global zone out of swap
6222025 RFE: simplify rctl syntax and improve cpu-shares/FSS interaction
6420985 rcapstat is broken on amd64
6421202 RFE: simplify and improve zones/pool integration
6442252 zonecfg's "unset" syntax is not documented and confusing
6490516 schedctl pages should not reserve swap
6490938 setproject can bind to the wrong pool
6498635 zone attach failure leaves zone in installed state
6500877 tmpfs syslogs incorrect path when non-global zone tmpfs mounts become full
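Several of these fixes (4754856, 5026227) hinge on one new primitive: a kernel walk of all address spaces that reports rss and swap per zone, project, task, or user, reachable from userland as getvmusage() (declared in the new sys/vm_usage.h in the diff below). Here is a hedged sketch of a caller in the global zone; the convention that a NULL buffer simply returns the needed result count through nres is an assumption, not something this page shows:

#include <sys/vm_usage.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	vmusage_t *buf;
	size_t i, nres = 0;

	/*
	 * First pass sizes the result set (results may be up to 5
	 * seconds old); assumed to update nres even when buf is NULL.
	 */
	(void) getvmusage(VMUSAGE_ALL_ZONES, 5, NULL, &nres);
	if (nres == 0)
		return (1);

	buf = malloc(nres * sizeof (vmusage_t));
	if (buf == NULL ||
	    getvmusage(VMUSAGE_ALL_ZONES, 5, buf, &nres) != 0) {
		perror("getvmusage");
		return (1);
	}
	for (i = 0; i < nres; i++) {
		(void) printf("zone %d: rss %zu shared %zu swap %zu\n",
		    (int)buf[i].vmu_id, buf[i].vmu_rss_all,
		    buf[i].vmu_rss_shared, buf[i].vmu_swap_all);
	}
	return (0);
}

Because the walk distinguishes vmu_rss_private from vmu_rss_shared, a consumer like prstat can count shared segments a single time per entity, which is the substance of 4754856.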
Diffstat (limited to 'usr/src/uts')
35 files changed, 2984 insertions, 159 deletions
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index 32a63d6c22..b2bbcbc8c3 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -334,6 +334,7 @@ GENUNIX_OBJS += \ vm_seg.o \ vm_subr.o \ vm_swap.o \ + vm_usage.o \ vnode.o \ vuid_queue.o \ vuid_store.o \ diff --git a/usr/src/uts/common/disp/priocntl.c b/usr/src/uts/common/disp/priocntl.c index 3bb90cf1fa..9197dc815b 100644 --- a/usr/src/uts/common/disp/priocntl.c +++ b/usr/src/uts/common/disp/priocntl.c @@ -136,6 +136,7 @@ priocntl_common(int pc_version, procset_t *psp, int cmd, caddr_t arg, struct pcmpargs pcmpargs; pc_vaparms_t vaparms; char clname[PC_CLNMSZ]; + char *outstr; int count; kthread_id_t retthreadp; proc_t *initpp; @@ -145,6 +146,7 @@ priocntl_common(int pc_version, procset_t *psp, int cmd, caddr_t arg, int rv = 0; pid_t saved_pid; id_t classid; + int size; int (*copyinfn)(const void *, void *, size_t); int (*copyoutfn)(const void *, void *, size_t); @@ -692,6 +694,21 @@ priocntl_common(int pc_version, procset_t *psp, int cmd, caddr_t arg, ASSERT(defaultcid > 0 && defaultcid < loaded_classes); break; + case PC_GETDFLCL: + mutex_enter(&class_lock); + + if (defaultcid >= loaded_classes) + outstr = ""; + else + outstr = sclass[defaultcid].cl_name; + size = strlen(outstr) + 1; + if (arg != NULL) + if ((*copyoutfn)(outstr, arg, size) != 0) + error = EFAULT; + + mutex_exit(&class_lock); + break; + default: error = EINVAL; break; diff --git a/usr/src/uts/common/fs/tmpfs/tmp_tnode.c b/usr/src/uts/common/fs/tmpfs/tmp_tnode.c index 5a7000c242..c5145cccf0 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_tnode.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_tnode.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -67,6 +66,7 @@ tmp_resv( int pagecreate) /* call anon_resv if set */ { pgcnt_t pages = btopr(delta); + zone_t *zone; ASSERT(RW_WRITE_HELD(&tp->tn_rwlock)); ASSERT(tp->tn_type == VREG); @@ -79,9 +79,10 @@ tmp_resv( * * Deny if trying to reserve more than tmpfs can allocate */ + zone = tm->tm_vfsp->vfs_zone; if (pagecreate && ((tm->tm_anonmem + pages > tm->tm_anonmax) || - (!anon_checkspace(ptob(pages + tmpfs_minfree))) || - (anon_resv(delta) == 0))) { + (!anon_checkspace(ptob(pages + tmpfs_minfree), zone)) || + (anon_resv_zone(delta, zone) == 0))) { return (1); } @@ -114,7 +115,7 @@ tmp_unresv( ASSERT(RW_WRITE_HELD(&tp->tn_rwlock)); ASSERT(tp->tn_type == VREG); - anon_unresv(delta); + anon_unresv_zone(delta, tm->tm_vfsp->vfs_zone); mutex_enter(&tm->tm_contents); tm->tm_anonmem -= btopr(delta); diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c index d623dce3f7..aa870b124a 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c @@ -215,9 +215,26 @@ wrtmp( if (delta > 0) { pagecreate = 1; if (tmp_resv(tm, tp, delta, pagecreate)) { - cmn_err(CE_WARN, - "%s: File system full, swap space limit exceeded", + /* + * Log file system full in the zone that owns + * the tmpfs mount, as well as in the global + * zone if necessary. + */ + zcmn_err(tm->tm_vfsp->vfs_zone->zone_id, + CE_WARN, "%s: File system full, " + "swap space limit exceeded", tm->tm_mntpath); + + if (tm->tm_vfsp->vfs_zone->zone_id != + GLOBAL_ZONEID) { + + vfs_t *vfs = tm->tm_vfsp; + + zcmn_err(GLOBAL_ZONEID, + CE_WARN, "%s: File system full, " + "swap space limit exceeded", + vfs->vfs_vnodecovered->v_path); + } error = ENOSPC; break; } diff --git a/usr/src/uts/common/os/modhash.c b/usr/src/uts/common/os/modhash.c index 19700ce685..3c63231253 100644 --- a/usr/src/uts/common/os/modhash.c +++ b/usr/src/uts/common/os/modhash.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -165,15 +164,6 @@ */ #define MH_KEYCMP(hash, key1, key2) ((hash->mh_keycmp)(key1, key2)) -static void i_mod_hash_clear_nosync(mod_hash_t *); -static int i_mod_hash_find_nosync(mod_hash_t *, mod_hash_key_t, - mod_hash_val_t *); -static int i_mod_hash_insert_nosync(mod_hash_t *, mod_hash_key_t, - mod_hash_val_t, mod_hash_hndl_t); -static int i_mod_hash_remove_nosync(mod_hash_t *, mod_hash_key_t, - mod_hash_val_t *); -static uint_t i_mod_hash(mod_hash_t *, mod_hash_key_t); - /* * Cache for struct mod_hash_entry */ @@ -522,7 +512,7 @@ mod_hash_destroy_hash(mod_hash_t *hash) * i_mod_hash() * Call the hashing algorithm for this hash table, with the given key. */ -static uint_t +uint_t i_mod_hash(mod_hash_t *hash, mod_hash_key_t key) { uint_t h; @@ -778,7 +768,7 @@ mod_hash_destroy(mod_hash_t *hash, mod_hash_key_t key) * mod_hash_find() * Find a value in the hash table corresponding to the given key. 
*/ -static int +int i_mod_hash_find_nosync(mod_hash_t *hash, mod_hash_key_t key, mod_hash_val_t *val) { @@ -826,7 +816,7 @@ mod_hash_find_cb(mod_hash_t *hash, mod_hash_key_t key, mod_hash_val_t *val, return (res); } -static void +void i_mod_hash_walk_nosync(mod_hash_t *hash, uint_t (*callback)(mod_hash_key_t, mod_hash_val_t *, void *), void *arg) { @@ -870,7 +860,7 @@ mod_hash_walk(mod_hash_t *hash, * Clears the given hash table by calling the destructor of every hash * element and freeing up all mod_hash_entry's. */ -static void +void i_mod_hash_clear_nosync(mod_hash_t *hash) { int i; diff --git a/usr/src/uts/common/os/pid.c b/usr/src/uts/common/os/pid.c index 88b0258afe..fecc4a6c45 100644 --- a/usr/src/uts/common/os/pid.c +++ b/usr/src/uts/common/os/pid.c @@ -385,6 +385,56 @@ pgfind(pid_t pgid) } /* + * Sets P_PR_LOCK on a non-system process. Process must be fully created + * and not exiting to succeed. + * + * Returns 0 on success. + * Returns 1 if P_PR_LOCK is set. + * Returns -1 if proc is in invalid state. + */ +int +sprtrylock_proc(proc_t *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + + /* skip system and incomplete processes */ + if (p->p_stat == SIDL || p->p_stat == SZOMB || + (p->p_flag & (SSYS | SEXITING | SEXITLWPS))) { + return (-1); + } + + if (p->p_proc_flag & P_PR_LOCK) + return (1); + + p->p_proc_flag |= P_PR_LOCK; + THREAD_KPRI_REQUEST(); + + return (0); +} + +/* + * Wait for P_PR_LOCK to become clear. Returns with p_lock dropped, + * and the proc pointer no longer valid, as the proc may have exited. + */ +void +sprwaitlock_proc(proc_t *p) +{ + kmutex_t *mp; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(p->p_proc_flag & P_PR_LOCK); + + /* + * p_lock is persistent, but p itself is not -- it could + * vanish during cv_wait(). Load p->p_lock now so we can + * drop it after cv_wait() without referencing p. + */ + mp = &p->p_lock; + cv_wait(&pr_pid_cv[p->p_slot], mp); + mutex_exit(mp); +} + +/* * If pid exists, find its proc, acquire its p_lock and mark it P_PR_LOCK. * Returns the proc pointer on success, NULL on failure. sprlock() is * really just a stripped-down version of pr_p_lock() to allow practive @@ -394,7 +444,7 @@ proc_t * sprlock_zone(pid_t pid, zoneid_t zoneid) { proc_t *p; - kmutex_t *mp; + int ret; for (;;) { mutex_enter(&pidlock); @@ -402,31 +452,21 @@ sprlock_zone(pid_t pid, zoneid_t zoneid) mutex_exit(&pidlock); return (NULL); } - /* - * p_lock is persistent, but p itself is not -- it could - * vanish during cv_wait(). Load p->p_lock now so we can - * drop it after cv_wait() without referencing p. - */ - mp = &p->p_lock; - mutex_enter(mp); + mutex_enter(&p->p_lock); mutex_exit(&pidlock); - /* - * If the process is in some half-baked state, fail. 
- */ - if (p->p_stat == SZOMB || p->p_stat == SIDL || - (p->p_flag & (SEXITING | SEXITLWPS))) { - mutex_exit(mp); - return (NULL); - } + if (panicstr) return (p); - if (!(p->p_proc_flag & P_PR_LOCK)) + + ret = sprtrylock_proc(p); + if (ret == -1) { + mutex_exit(&p->p_lock); + return (NULL); + } else if (ret == 0) { break; - cv_wait(&pr_pid_cv[p->p_slot], mp); - mutex_exit(mp); + } + sprwaitlock_proc(p); } - p->p_proc_flag |= P_PR_LOCK; - THREAD_KPRI_REQUEST(); return (p); } diff --git a/usr/src/uts/common/os/pool.c b/usr/src/uts/common/os/pool.c index ceb90850fa..818bb54701 100644 --- a/usr/src/uts/common/os/pool.c +++ b/usr/src/uts/common/os/pool.c @@ -293,6 +293,8 @@ pool_enable(void) (void) nvlist_add_string(pool_sys_prop, "system.comment", ""); (void) nvlist_add_int64(pool_sys_prop, "system.version", 1); (void) nvlist_add_byte(pool_sys_prop, "system.bind-default", 1); + (void) nvlist_add_string(pool_sys_prop, "system.poold.objectives", + "wt-load"); (void) nvlist_alloc(&pool_default->pool_props, NV_UNIQUE_NAME, KM_SLEEP); @@ -1309,7 +1311,7 @@ pool_do_bind(pool_t *pool, idtype_t idtype, id_t id, int flags) } if (idtype == P_PROJID) { - kpj = project_hold_by_id(id, GLOBAL_ZONEID, PROJECT_HOLD_FIND); + kpj = project_hold_by_id(id, global_zone, PROJECT_HOLD_FIND); if (kpj == NULL) return (ESRCH); mutex_enter(&kpj->kpj_poolbind); diff --git a/usr/src/uts/common/os/project.c b/usr/src/uts/common/os/project.c index 6c266c0ca3..d75b60f6e9 100644 --- a/usr/src/uts/common/os/project.c +++ b/usr/src/uts/common/os/project.c @@ -29,6 +29,7 @@ #include <sys/modhash.h> #include <sys/modctl.h> #include <sys/kmem.h> +#include <sys/kstat.h> #include <sys/atomic.h> #include <sys/cmn_err.h> #include <sys/proc.h> @@ -103,6 +104,8 @@ struct project_zone { * acquired, the hash lock is to be acquired first. */ +static kstat_t *project_kstat_create(kproject_t *pj, zone_t *zone); +static void project_kstat_delete(kproject_t *pj); static void project_data_init(kproject_data_t *data) @@ -118,6 +121,7 @@ project_data_init(kproject_data_t *data) data->kpd_locked_mem_ctl = UINT64_MAX; data->kpd_contract = 0; data->kpd_crypto_mem = 0; + data->kpd_lockedmem_kstat = NULL; } /*ARGSUSED*/ @@ -179,11 +183,11 @@ project_hold(kproject_t *p) } /* - * kproject_t *project_hold_by_id(projid_t, zoneid_t, int) + * kproject_t *project_hold_by_id(projid_t, zone_t *, int) * * Overview * project_hold_by_id() performs a look-up in the dictionary of projects - * active on the system by specified project ID + zone ID and puts a hold on + * active on the system by specified project ID + zone and puts a hold on * it. The third argument defines the desired behavior in the case when * project with given project ID cannot be found: * @@ -202,7 +206,7 @@ project_hold(kproject_t *p) * Caller must be in a context suitable for KM_SLEEP allocations. 
*/ kproject_t * -project_hold_by_id(projid_t id, zoneid_t zoneid, int flag) +project_hold_by_id(projid_t id, zone_t *zone, int flag) { kproject_t *spare_p; kproject_t *p; @@ -211,9 +215,11 @@ project_hold_by_id(projid_t id, zoneid_t zoneid, int flag) rctl_alloc_gp_t *gp; rctl_entity_p_t e; struct project_zone pz; + boolean_t create = B_FALSE; + kstat_t *ksp; pz.kpj_id = id; - pz.kpj_zoneid = zoneid; + pz.kpj_zoneid = zone->zone_id; if (flag == PROJECT_HOLD_FIND) { mutex_enter(&project_hash_lock); @@ -241,9 +247,10 @@ project_hold_by_id(projid_t id, zoneid_t zoneid, int flag) mutex_enter(&project_hash_lock); if (mod_hash_find(projects_hash, (mod_hash_key_t)&pz, (mod_hash_val_t *)&p) == MH_ERR_NOTFOUND) { + p = spare_p; p->kpj_id = id; - p->kpj_zoneid = zoneid; + p->kpj_zoneid = zone->zone_id; p->kpj_count = 0; p->kpj_shares = 1; p->kpj_nlwps = 0; @@ -265,7 +272,7 @@ project_hold_by_id(projid_t id, zoneid_t zoneid, int flag) * Insert project into global project list. */ mutex_enter(&projects_list_lock); - if (id != 0 || zoneid != GLOBAL_ZONEID) { + if (id != 0 || zone != &zone0) { p->kpj_next = projects_list; p->kpj_prev = projects_list->kpj_prev; p->kpj_prev->kpj_next = p; @@ -279,6 +286,7 @@ project_hold_by_id(projid_t id, zoneid_t zoneid, int flag) projects_list = p; } mutex_exit(&projects_list_lock); + create = B_TRUE; } else { mutex_exit(&curproc->p_lock); mod_hash_cancel(projects_hash, &hndl); @@ -290,10 +298,20 @@ project_hold_by_id(projid_t id, zoneid_t zoneid, int flag) p->kpj_count++; mutex_exit(&project_hash_lock); + /* + * The kstat stores the project's zone name, as zoneid's may change + * across reboots. + */ + if (create == B_TRUE) { + ksp = project_kstat_create(p, zone); + mutex_enter(&project_hash_lock); + ASSERT(p->kpj_data.kpd_lockedmem_kstat == NULL); + p->kpj_data.kpd_lockedmem_kstat = ksp; + mutex_exit(&project_hash_lock); + } return (p); } - /* * void project_rele(kproject_t *) * @@ -325,6 +343,7 @@ project_rele(kproject_t *p) mutex_exit(&projects_list_lock); rctl_set_free(p->kpj_rctls); + project_kstat_delete(p); if (mod_hash_destroy(projects_hash, (mod_hash_key_t)p)) panic("unable to delete project %d zone %d", p->kpj_id, @@ -636,9 +655,9 @@ project_locked_mem_usage(rctl_t *rctl, struct proc *p) { rctl_qty_t q; ASSERT(MUTEX_HELD(&p->p_lock)); - mutex_enter(&p->p_zone->zone_rctl_lock); + mutex_enter(&p->p_zone->zone_mem_lock); q = p->p_task->tk_proj->kpj_data.kpd_locked_mem; - mutex_exit(&p->p_zone->zone_rctl_lock); + mutex_exit(&p->p_zone->zone_mem_lock); return (q); } @@ -649,7 +668,7 @@ project_locked_mem_test(struct rctl *rctl, struct proc *p, rctl_entity_p_t *e, { rctl_qty_t q; ASSERT(MUTEX_HELD(&p->p_lock)); - ASSERT(MUTEX_HELD(&p->p_zone->zone_rctl_lock)); + ASSERT(MUTEX_HELD(&p->p_zone->zone_mem_lock)); q = p->p_task->tk_proj->kpj_data.kpd_locked_mem; if (q + inc > rval->rcv_value) return (1); @@ -868,7 +887,7 @@ project_init(void) rctl_add_default_limit("project.max-contracts", 10000, RCPRIV_PRIVILEGED, RCTL_LOCAL_DENY); - t0.t_proj = proj0p = project_hold_by_id(0, GLOBAL_ZONEID, + t0.t_proj = proj0p = project_hold_by_id(0, &zone0, PROJECT_HOLD_INSERT); mutex_enter(&p0.p_lock); @@ -876,3 +895,57 @@ project_init(void) mutex_exit(&p0.p_lock); proj0p->kpj_ntasks = 1; } + +static int +project_lockedmem_kstat_update(kstat_t *ksp, int rw) +{ + kproject_t *pj = ksp->ks_private; + kproject_kstat_t *kpk = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + + kpk->kpk_usage.value.ui64 = pj->kpj_data.kpd_locked_mem; + kpk->kpk_value.value.ui64 = 
pj->kpj_data.kpd_locked_mem_ctl; + return (0); +} + +static kstat_t * +project_kstat_create(kproject_t *pj, zone_t *zone) +{ + kstat_t *ksp; + kproject_kstat_t *kpk; + char *zonename = zone->zone_name; + + ksp = rctl_kstat_create_project(pj, "lockedmem", KSTAT_TYPE_NAMED, + sizeof (kproject_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (ksp == NULL) + return (NULL); + + kpk = ksp->ks_data = kmem_alloc(sizeof (kproject_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zonename) + 1; + kstat_named_init(&kpk->kpk_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&kpk->kpk_zonename, zonename); + kstat_named_init(&kpk->kpk_usage, "usage", KSTAT_DATA_UINT64); + kstat_named_init(&kpk->kpk_value, "value", KSTAT_DATA_UINT64); + ksp->ks_update = project_lockedmem_kstat_update; + ksp->ks_private = pj; + kstat_install(ksp); + + return (ksp); +} + +static void +project_kstat_delete(kproject_t *pj) +{ + void *data; + + if (pj->kpj_data.kpd_lockedmem_kstat != NULL) { + data = pj->kpj_data.kpd_lockedmem_kstat->ks_data; + kstat_delete(pj->kpj_data.kpd_lockedmem_kstat); + kmem_free(data, sizeof (zone_kstat_t)); + } + pj->kpj_data.kpd_lockedmem_kstat = NULL; +} diff --git a/usr/src/uts/common/os/rctl.c b/usr/src/uts/common/os/rctl.c index 4de4c74fe8..c0479005ea 100644 --- a/usr/src/uts/common/os/rctl.c +++ b/usr/src/uts/common/os/rctl.c @@ -29,6 +29,7 @@ #include <sys/cmn_err.h> #include <sys/id_space.h> #include <sys/kmem.h> +#include <sys/kstat.h> #include <sys/log.h> #include <sys/modctl.h> #include <sys/modhash.h> @@ -2599,7 +2600,7 @@ rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc, zonep = p->p_zone; } - mutex_enter(&zonep->zone_rctl_lock); + mutex_enter(&zonep->zone_mem_lock); e.rcep_p.proj = projp; e.rcep_t = RCENTITY_PROJECT; @@ -2627,7 +2628,7 @@ rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc, p->p_locked_mem += inc; } out: - mutex_exit(&zonep->zone_rctl_lock); + mutex_exit(&zonep->zone_mem_lock); if (proj != NULL) zone_rele(zonep); return (ret); @@ -2661,7 +2662,7 @@ rctl_decr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc, zonep = p->p_zone; } - mutex_enter(&zonep->zone_rctl_lock); + mutex_enter(&zonep->zone_mem_lock); zonep->zone_locked_mem -= inc; projp->kpj_data.kpd_locked_mem -= inc; if (creditproc != 0) { @@ -2669,7 +2670,120 @@ rctl_decr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc, ASSERT(MUTEX_HELD(&p->p_lock)); p->p_locked_mem -= inc; } - mutex_exit(&zonep->zone_rctl_lock); + mutex_exit(&zonep->zone_mem_lock); if (proj != NULL) zone_rele(zonep); } + +/* + * rctl_incr_swap(proc_t *, zone_t *, size_t) + * + * Overview + * Increments the swap charge on the specified zone. + * + * Return values + * 0 on success. EAGAIN if swap increment fails due an rctl value + * on the zone. + * + * Callers context + * p_lock held on specified proc. 
+ * swap must be even multiple of PAGESIZE + */ +int +rctl_incr_swap(proc_t *proc, zone_t *zone, size_t swap) +{ + rctl_entity_p_t e; + + ASSERT(MUTEX_HELD(&proc->p_lock)); + ASSERT((swap & PAGEOFFSET) == 0); + e.rcep_p.zone = zone; + e.rcep_t = RCENTITY_ZONE; + + mutex_enter(&zone->zone_mem_lock); + + if ((zone->zone_max_swap + swap) > + zone->zone_max_swap_ctl) { + + if (rctl_test_entity(rc_zone_max_swap, zone->zone_rctls, + proc, &e, swap, 0) & RCT_DENY) { + mutex_exit(&zone->zone_mem_lock); + return (EAGAIN); + } + } + zone->zone_max_swap += swap; + mutex_exit(&zone->zone_mem_lock); + return (0); +} + +/* + * rctl_decr_swap(zone_t *, size_t) + * + * Overview + * Decrements the swap charge on the specified zone. + * + * Return values + * None + * + * Callers context + * swap must be even multiple of PAGESIZE + */ +void +rctl_decr_swap(zone_t *zone, size_t swap) +{ + ASSERT((swap & PAGEOFFSET) == 0); + mutex_enter(&zone->zone_mem_lock); + ASSERT(zone->zone_max_swap >= swap); + zone->zone_max_swap -= swap; + mutex_exit(&zone->zone_mem_lock); +} + +/* + * Create resource kstat + */ +static kstat_t * +rctl_kstat_create_common(char *ks_name, int ks_instance, char *ks_class, + uchar_t ks_type, uint_t ks_ndata, uchar_t ks_flags, int ks_zoneid) +{ + kstat_t *ksp = NULL; + char name[KSTAT_STRLEN]; + + (void) snprintf(name, KSTAT_STRLEN, "%s_%d", ks_name, ks_instance); + + if ((ksp = kstat_create_zone("caps", ks_zoneid, + name, ks_class, ks_type, + ks_ndata, ks_flags, ks_zoneid)) != NULL) { + if (ks_zoneid != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + } + return (ksp); +} + +/* + * Create zone-specific resource kstat + */ +kstat_t * +rctl_kstat_create_zone(zone_t *zone, char *ks_name, uchar_t ks_type, + uint_t ks_ndata, uchar_t ks_flags) +{ + char name[KSTAT_STRLEN]; + + (void) snprintf(name, KSTAT_STRLEN, "%s_zone", ks_name); + + return (rctl_kstat_create_common(name, zone->zone_id, "zone_caps", + ks_type, ks_ndata, ks_flags, zone->zone_id)); +} + +/* + * Create project-specific resource kstat + */ +kstat_t * +rctl_kstat_create_project(kproject_t *kpj, char *ks_name, uchar_t ks_type, + uint_t ks_ndata, uchar_t ks_flags) +{ + char name[KSTAT_STRLEN]; + + (void) snprintf(name, KSTAT_STRLEN, "%s_project", ks_name); + + return (rctl_kstat_create_common(name, kpj->kpj_id, "project_caps", + ks_type, ks_ndata, ks_flags, kpj->kpj_zoneid)); +} diff --git a/usr/src/uts/common/os/schedctl.c b/usr/src/uts/common/os/schedctl.c index 66aae7d2bc..62279e0777 100644 --- a/usr/src/uts/common/os/schedctl.c +++ b/usr/src/uts/common/os/schedctl.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -542,13 +541,13 @@ schedctl_getpage(struct anon_map **newamp, caddr_t *newaddr) * Set up anonymous memory struct. No swap reservation is * needed since the page will be locked into memory. 
*/ - amp = anonmap_alloc(PAGESIZE, PAGESIZE); + amp = anonmap_alloc(PAGESIZE, 0); /* * Allocate the page. */ - kaddr = segkp_get_withanonmap(segkp, PAGESIZE, KPD_LOCKED | KPD_ZERO, - amp); + kaddr = segkp_get_withanonmap(segkp, PAGESIZE, + KPD_NO_ANON | KPD_LOCKED | KPD_ZERO, amp); if (kaddr == NULL) { amp->refcnt--; anonmap_free(amp); diff --git a/usr/src/uts/common/os/sysent.c b/usr/src/uts/common/os/sysent.c index 9ada0aac18..a7ef99fddb 100644 --- a/usr/src/uts/common/os/sysent.c +++ b/usr/src/uts/common/os/sysent.c @@ -666,7 +666,7 @@ struct sysent sysent[NSYSCALL] = /* 178 */ SYSENT_LOADABLE(), /* kaio */ /* 179 */ SYSENT_LOADABLE(), /* cpc */ /* 180 */ SYSENT_CI("lgrpsys", lgrpsys, 3), - /* 181 */ SYSENT_CI("rusagesys", rusagesys, 2), + /* 181 */ SYSENT_CI("rusagesys", rusagesys, 5), /* 182 */ SYSENT_LOADABLE(), /* portfs */ /* 183 */ SYSENT_CI("pollsys", pollsys, 4), /* 184 */ SYSENT_CI("labelsys", labelsys, 5), @@ -1044,7 +1044,7 @@ struct sysent sysent32[NSYSCALL] = /* 178 */ SYSENT_LOADABLE32(), /* kaio */ /* 179 */ SYSENT_LOADABLE32(), /* cpc */ /* 180 */ SYSENT_CI("lgrpsys", lgrpsys, 3), - /* 181 */ SYSENT_CI("rusagesys", rusagesys, 2), + /* 181 */ SYSENT_CI("rusagesys", rusagesys, 5), /* 182 */ SYSENT_LOADABLE32(), /* portfs */ /* 183 */ SYSENT_CI("pollsys", pollsys, 4), /* 184 */ SYSENT_CI("labelsys", labelsys, 5), diff --git a/usr/src/uts/common/os/task.c b/usr/src/uts/common/os/task.c index 562e3596b5..785f74c145 100644 --- a/usr/src/uts/common/os/task.c +++ b/usr/src/uts/common/os/task.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -389,7 +388,7 @@ task_create(projid_t projid, zone_t *zone) tk->tk_nlwps = 0; tk->tk_nlwps_ctl = INT_MAX; tk->tk_usage = tu; - tk->tk_proj = project_hold_by_id(projid, zone->zone_id, + tk->tk_proj = project_hold_by_id(projid, zone, PROJECT_HOLD_INSERT); tk->tk_flags = TASK_NORMAL; @@ -848,7 +847,7 @@ task_init(void) task0p->tk_tkid = id_alloc(taskid_space); task0p->tk_usage = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP); - task0p->tk_proj = project_hold_by_id(0, GLOBAL_ZONEID, + task0p->tk_proj = project_hold_by_id(0, &zone0, PROJECT_HOLD_INSERT); task0p->tk_flags = TASK_NORMAL; task0p->tk_nlwps = p->p_lwpcnt; diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c index 0fb2c2be55..19ea8b31f1 100644 --- a/usr/src/uts/common/os/zone.c +++ b/usr/src/uts/common/os/zone.c @@ -154,6 +154,10 @@ * zone_lock: This is a per-zone lock used to protect several fields of * the zone_t (see <sys/zone.h> for details). In addition, holding * this lock means that the zone cannot go away. + * zone_nlwps_lock: This is a per-zone lock used to protect the fields + * related to the zone.max-lwps rctl. + * zone_mem_lock: This is a per-zone lock used to protect the fields + * related to the zone.max-locked-memory and zone.max-swap rctls. 
* zsd_key_lock: This is a global lock protecting the key state for ZSD. * zone_deathrow_lock: This is a global lock protecting the "deathrow" * list (a list of zones in the ZONE_IS_DEAD state). * * Ordering requirements: * pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock --> * zone_lock --> zsd_key_lock --> pidlock --> p_lock * + * When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is: + * zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock + * zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock + * * Blocking memory allocations are permitted while holding any of the * zone locks. * @@ -190,6 +198,7 @@ #include <sys/debug.h> #include <sys/file.h> #include <sys/kmem.h> +#include <sys/kstat.h> #include <sys/mutex.h> #include <sys/note.h> #include <sys/pathname.h> @@ -232,6 +241,8 @@ #include <sys/zone.h> #include <sys/tsol/label.h> +#include <vm/seg.h> + /* * cv used to signal that all references to the zone have been released. This * needs to be global since there may be multiple waiters, and the first to @@ -317,6 +328,7 @@ const char *zone_status_table[] = { */ rctl_hndl_t rc_zone_cpu_shares; rctl_hndl_t rc_zone_locked_mem; +rctl_hndl_t rc_zone_max_swap; rctl_hndl_t rc_zone_nlwps; rctl_hndl_t rc_zone_shmmax; rctl_hndl_t rc_zone_shmmni; @@ -1011,9 +1023,9 @@ zone_locked_mem_usage(rctl_t *rctl, struct proc *p) { rctl_qty_t q; ASSERT(MUTEX_HELD(&p->p_lock)); - mutex_enter(&p->p_zone->zone_rctl_lock); + mutex_enter(&p->p_zone->zone_mem_lock); q = p->p_zone->zone_locked_mem; - mutex_exit(&p->p_zone->zone_rctl_lock); + mutex_exit(&p->p_zone->zone_mem_lock); return (q); } @@ -1023,9 +1035,12 @@ zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags) { rctl_qty_t q; + zone_t *z; + + z = e->rcep_p.zone; ASSERT(MUTEX_HELD(&p->p_lock)); - ASSERT(MUTEX_HELD(&p->p_zone->zone_rctl_lock)); - q = p->p_zone->zone_locked_mem; + ASSERT(MUTEX_HELD(&z->zone_mem_lock)); + q = z->zone_locked_mem; if (q + incr > rcntl->rcv_value) return (1); return (0); } @@ -1051,6 +1066,57 @@ static rctl_ops_t zone_locked_mem_ops = { zone_locked_mem_test }; +/*ARGSUSED*/ +static rctl_qty_t +zone_max_swap_usage(rctl_t *rctl, struct proc *p) +{ + rctl_qty_t q; + zone_t *z = p->p_zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + mutex_enter(&z->zone_mem_lock); + q = z->zone_max_swap; + mutex_exit(&z->zone_mem_lock); + return (q); +} + +/*ARGSUSED*/ +static int +zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, + rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags) +{ + rctl_qty_t q; + zone_t *z; + + z = e->rcep_p.zone; + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(MUTEX_HELD(&z->zone_mem_lock)); + q = z->zone_max_swap; + if (q + incr > rcntl->rcv_value) + return (1); + return (0); +} + +/*ARGSUSED*/ +static int +zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + if (e->rcep_p.zone == NULL) + return (0); + e->rcep_p.zone->zone_max_swap_ctl = nv; + return (0); +} + +static rctl_ops_t zone_max_swap_ops = { + rcop_no_action, + zone_max_swap_usage, + zone_max_swap_set, + zone_max_swap_test +}; + /* * Helper function to brand the zone with a unique ID. 
*/ @@ -1080,6 +1146,96 @@ zone_get_kcred(zoneid_t zoneid) return (cr); } +static int +zone_lockedmem_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_kstat_t *zk = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + + zk->zk_usage.value.ui64 = zone->zone_locked_mem; + zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl; + return (0); +} + +static int +zone_swapresv_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_kstat_t *zk = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + + zk->zk_usage.value.ui64 = zone->zone_max_swap; + zk->zk_value.value.ui64 = zone->zone_max_swap_ctl; + return (0); +} + +static void +zone_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_kstat_t *zk; + + ksp = rctl_kstat_create_zone(zone, "lockedmem", KSTAT_TYPE_NAMED, + sizeof (zone_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (ksp == NULL) + return; + + zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zk->zk_zonename, zone->zone_name); + kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64); + kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64); + ksp->ks_update = zone_lockedmem_kstat_update; + ksp->ks_private = zone; + kstat_install(ksp); + + zone->zone_lockedmem_kstat = ksp; + + ksp = rctl_kstat_create_zone(zone, "swapresv", KSTAT_TYPE_NAMED, + sizeof (zone_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (ksp == NULL) + return; + + zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zk->zk_zonename, zone->zone_name); + kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64); + kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64); + ksp->ks_update = zone_swapresv_kstat_update; + ksp->ks_private = zone; + kstat_install(ksp); + + zone->zone_swapresv_kstat = ksp; +} + +static void +zone_kstat_delete(zone_t *zone) +{ + void *data; + + if (zone->zone_lockedmem_kstat != NULL) { + data = zone->zone_lockedmem_kstat->ks_data; + kstat_delete(zone->zone_lockedmem_kstat); + kmem_free(data, sizeof (zone_kstat_t)); + } + if (zone->zone_swapresv_kstat != NULL) { + data = zone->zone_swapresv_kstat->ks_data; + kstat_delete(zone->zone_swapresv_kstat); + kmem_free(data, sizeof (zone_kstat_t)); + } +} + /* * Called very early on in boot to initialize the ZSD list so that * zone_key_create() can be called before zone_init(). 
It also initializes @@ -1101,8 +1257,14 @@ zone_zsd_init(void) mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL); zone0.zone_shares = 1; + zone0.zone_nlwps = 0; zone0.zone_nlwps_ctl = INT_MAX; + zone0.zone_locked_mem = 0; + zone0.zone_locked_mem_ctl = UINT64_MAX; + ASSERT(zone0.zone_max_swap == 0); + zone0.zone_max_swap_ctl = UINT64_MAX; zone0.zone_shmmax = 0; zone0.zone_ipc.ipcq_shmmni = 0; zone0.zone_ipc.ipcq_semmni = 0; @@ -1120,6 +1282,8 @@ zone_zsd_init(void) zone0.zone_ncpus_online = 0; zone0.zone_proc_initpid = 1; zone0.zone_initname = initname; + zone0.zone_lockedmem_kstat = NULL; + zone0.zone_swapresv_kstat = NULL; list_create(&zone0.zone_zsd, sizeof (struct zsd_entry), offsetof(struct zsd_entry, zsd_linkage)); list_insert_head(&zone_active, &zone0); @@ -1259,6 +1423,12 @@ zone_init(void) RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, &zone_locked_mem_ops); + + rc_zone_max_swap = rctl_register("zone.max-swap", + RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | + RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, + &zone_max_swap_ops); + /* * Initialize the ``global zone''. */ @@ -1277,9 +1447,14 @@ zone_init(void) zone0.zone_brand = &native_brand; rctl_prealloc_destroy(gp); /* - * pool_default hasn't been initialized yet, so we let pool_init() take - * care of making the global zone is in the default pool. + * pool_default hasn't been initialized yet, so we let pool_init() + * take care of making sure the global zone is in the default pool. + */ + + /* + * Initialize global zone kstats */ + zone_kstat_create(&zone0); /* * Initialize zone label. @@ -1337,6 +1512,7 @@ zone_init(void) if (res) panic("Sysevent_evc_bind failed during zone setup.\n"); + } static void @@ -1476,6 +1652,38 @@ zone_set_initname(zone_t *zone, const char *zone_initname) return (0); } +static int +zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap) +{ + uint64_t mcap; + int err = 0; + + if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0) + zone->zone_phys_mcap = mcap; + + return (err); +} + +static int +zone_set_sched_class(zone_t *zone, const char *new_class) +{ + char sched_class[PC_CLNMSZ]; + id_t classid; + int err; + + ASSERT(zone != global_zone); + if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0) + return (err); /* EFAULT or ENAMETOOLONG */ + + if (getcid(sched_class, &classid) != 0 || classid == syscid) + return (set_errno(EINVAL)); + zone->zone_defaultcid = classid; + ASSERT(zone->zone_defaultcid > 0 && + zone->zone_defaultcid < loaded_classes); + + return (0); +} + /* * Block indefinitely waiting for (zone_status >= status) */ @@ -2510,10 +2718,10 @@ zsched(void *arg) /* * Decrement locked memory counts on old zone and project. */ - mutex_enter(&global_zone->zone_rctl_lock); + mutex_enter(&global_zone->zone_mem_lock); global_zone->zone_locked_mem -= pp->p_locked_mem; pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem; - mutex_exit(&global_zone->zone_rctl_lock); + mutex_exit(&global_zone->zone_mem_lock); /* * Create and join a new task in project '0' of this zone. 
@@ -2529,10 +2737,10 @@ zsched(void *arg) pj = pp->p_task->tk_proj; - mutex_enter(&zone->zone_rctl_lock); + mutex_enter(&zone->zone_mem_lock); zone->zone_locked_mem += pp->p_locked_mem; pj->kpj_data.kpd_locked_mem += pp->p_locked_mem; - mutex_exit(&zone->zone_rctl_lock); + mutex_exit(&zone->zone_mem_lock); /* * add lwp counts to zsched's zone, and increment project's task count @@ -2689,7 +2897,10 @@ zsched(void *arg) * classid 'cid'. */ pool_lock(); - cid = pool_get_class(zone->zone_pool); + if (zone->zone_defaultcid > 0) + cid = zone->zone_defaultcid; + else + cid = pool_get_class(zone->zone_pool); if (cid == -1) cid = defaultcid; @@ -3019,7 +3230,7 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_initname = NULL; mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&zone->zone_rctl_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL); list_create(&zone->zone_zsd, sizeof (struct zsd_entry), offsetof(struct zsd_entry, zsd_linkage)); @@ -3057,8 +3268,14 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_initname = kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP); (void) strcpy(zone->zone_initname, zone_default_initname); + zone->zone_nlwps = 0; + zone->zone_nlwps_ctl = INT_MAX; zone->zone_locked_mem = 0; zone->zone_locked_mem_ctl = UINT64_MAX; + zone->zone_max_swap = 0; + zone->zone_max_swap_ctl = UINT64_MAX; + zone->zone_lockedmem_kstat = NULL; + zone->zone_swapresv_kstat = NULL; /* * Zsched initializes the rctls. @@ -3233,6 +3450,11 @@ zone_create(const char *zone_name, const char *zone_root, */ /* + * Create zone kstats + */ + zone_kstat_create(zone); + + /* * Let the other lwps continue. */ mutex_enter(&pp->p_lock); @@ -3643,6 +3865,9 @@ zone_destroy(zoneid_t zoneid) } + /* Get rid of the zone's kstats */ + zone_kstat_delete(zone); + /* * It is now safe to let the zone be recreated; remove it from the * lists. The memory will not be freed until the last cred @@ -3892,6 +4117,32 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) error = EFAULT; } break; + case ZONE_ATTR_PHYS_MCAP: + size = sizeof (zone->zone_phys_mcap); + if (bufsize > size) + bufsize = size; + if (buf != NULL && + copyout(&zone->zone_phys_mcap, buf, bufsize) != 0) + error = EFAULT; + break; + case ZONE_ATTR_SCHED_CLASS: + mutex_enter(&class_lock); + + if (zone->zone_defaultcid >= loaded_classes) + outstr = ""; + else + outstr = sclass[zone->zone_defaultcid].cl_name; + size = strlen(outstr) + 1; + if (bufsize > size) + bufsize = size; + if (buf != NULL) { + err = copyoutstr(outstr, buf, bufsize, NULL); + if (err != 0 && err != ENAMETOOLONG) + error = EFAULT; + } + + mutex_exit(&class_lock); + break; default: if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) { size = bufsize; @@ -3923,10 +4174,10 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) return (set_errno(EPERM)); /* - * At present, attributes can only be set on non-running, - * non-global zones. + * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the + * global zone. 
*/ - if (zoneid == GLOBAL_ZONEID) { + if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) { return (set_errno(EINVAL)); } @@ -3938,8 +4189,12 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) zone_hold(zone); mutex_exit(&zonehash_lock); + /* + * At present most attributes can only be set on non-running, + * non-global zones. + */ zone_status = zone_status_get(zone); - if (zone_status > ZONE_IS_READY) + if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) goto done; switch (attr) { @@ -3971,6 +4226,12 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) if (zone->zone_brand == NULL) err = EINVAL; break; + case ZONE_ATTR_PHYS_MCAP: + err = zone_set_phys_mcap(zone, (const uint64_t *)buf); + break; + case ZONE_ATTR_SCHED_CLASS: + err = zone_set_sched_class(zone, (const char *)buf); + break; default: if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize); @@ -3986,6 +4247,11 @@ done: /* * Return zero if the process has at least one vnode mapped in to its * address space which shouldn't be allowed to change zones. + * + * Also return zero if the process has any shared mappings which reserve + * swap. This is because the counting for zone.max-swap does not allow swap + * revervation to be shared between zones. zone swap reservation is counted + * on zone->zone_max_swap. */ static int as_can_change_zones(void) @@ -3997,8 +4263,17 @@ as_can_change_zones(void) int allow = 1; ASSERT(pp->p_as != &kas); - AS_LOCK_ENTER(&as, &as->a_lock, RW_READER); + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { + + /* + * Cannot enter zone with shared anon memory which + * reserves swap. See comment above. + */ + if (seg_can_change_zones(seg) == B_FALSE) { + allow = 0; + break; + } /* * if we can't get a backing vnode for this segment then skip * it. @@ -4011,11 +4286,30 @@ as_can_change_zones(void) break; } } - AS_LOCK_EXIT(&as, &as->a_lock); + AS_LOCK_EXIT(as, &as->a_lock); return (allow); } /* + * Count swap reserved by curproc's address space + */ +static size_t +as_swresv(void) +{ + proc_t *pp = curproc; + struct seg *seg; + struct as *as = pp->p_as; + size_t swap = 0; + + ASSERT(pp->p_as != &kas); + ASSERT(AS_WRITE_HELD(as, &as->a_lock)); + for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) + swap += seg_swresv(seg); + + return (swap); +} + +/* * Systemcall entry point for zone_enter(). * * The current process is injected into said zone. In the process @@ -4043,6 +4337,7 @@ zone_enter(zoneid_t zoneid) zone_status_t status; int err = 0; rctl_entity_p_t e; + size_t swap; if (secpolicy_zone_config(CRED()) != 0) return (set_errno(EPERM)); @@ -4205,6 +4500,15 @@ zone_enter(zoneid_t zoneid) goto out; } + /* + * a_lock must be held while transfering locked memory and swap + * reservation from the global zone to the non global zone because + * asynchronous faults on the processes' address space can lock + * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE + * segments respectively. 
+ */ + AS_LOCK_ENTER(pp->as, &pp->p_as->a_lock, RW_WRITER); + swap = as_swresv(); mutex_enter(&pp->p_lock); zone_proj0 = zone->zone_zsched->p_task->tk_proj; /* verify that we do not exceed and task or lwp limits */ @@ -4216,10 +4520,11 @@ zone_enter(zoneid_t zoneid) zone_proj0->kpj_ntasks += 1; mutex_exit(&zone->zone_nlwps_lock); - mutex_enter(&zone->zone_rctl_lock); + mutex_enter(&zone->zone_mem_lock); zone->zone_locked_mem += pp->p_locked_mem; zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem; - mutex_exit(&zone->zone_rctl_lock); + zone->zone_max_swap += swap; + mutex_exit(&zone->zone_mem_lock); /* remove lwps from proc's old zone and old project */ mutex_enter(&pp->p_zone->zone_nlwps_lock); @@ -4227,12 +4532,14 @@ zone_enter(zoneid_t zoneid) pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt; mutex_exit(&pp->p_zone->zone_nlwps_lock); - mutex_enter(&pp->p_zone->zone_rctl_lock); + mutex_enter(&pp->p_zone->zone_mem_lock); pp->p_zone->zone_locked_mem -= pp->p_locked_mem; pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem; - mutex_exit(&pp->p_zone->zone_rctl_lock); + pp->p_zone->zone_max_swap -= swap; + mutex_exit(&pp->p_zone->zone_mem_lock); mutex_exit(&pp->p_lock); + AS_LOCK_EXIT(pp->p_as, &pp->p_as->a_lock); /* * Joining the zone cannot fail from now on. @@ -4289,6 +4596,31 @@ zone_enter(zoneid_t zoneid) sess_rele(pp->p_sessp, B_TRUE); pp->p_sessp = sp; pgjoin(pp, zone->zone_zsched->p_pidp); + + /* + * If there is a default scheduling class for the zone and it is not + * the class we are currently in, change all of the threads in the + * process to the new class. We need to be holding pidlock & p_lock + * when we call parmsset so this is a good place to do it. + */ + if (zone->zone_defaultcid > 0 && + zone->zone_defaultcid != curthread->t_cid) { + pcparms_t pcparms; + kthread_id_t t; + + pcparms.pc_cid = zone->zone_defaultcid; + pcparms.pc_clparms[0] = 0; + + /* + * If setting the class fails, we still want to enter the zone. + */ + if ((t = pp->p_tlist) != NULL) { + do { + (void) parmsset(&pcparms, t); + } while ((t = t->t_forw) != pp->p_tlist); + } + } + mutex_exit(&pp->p_lock); mutex_exit(&pidlock); diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile index ab103ef4c7..4493f99454 100644 --- a/usr/src/uts/common/sys/Makefile +++ b/usr/src/uts/common/sys/Makefile @@ -544,6 +544,7 @@ CHKHDRS= \ visual_io.h \ vlan.h \ vm.h \ + vm_usage.h \ vmem.h \ vmem_impl.h \ vmmeter.h \ diff --git a/usr/src/uts/common/sys/modhash_impl.h b/usr/src/uts/common/sys/modhash_impl.h index 25e45cec23..a187eb68ee 100644 --- a/usr/src/uts/common/sys/modhash_impl.h +++ b/usr/src/uts/common/sys/modhash_impl.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -93,6 +92,18 @@ struct mod_hash { */ void mod_hash_init(void); +/* + * Internal routines. Use directly with care. 
+ */ +uint_t i_mod_hash(mod_hash_t *, mod_hash_key_t); +int i_mod_hash_insert_nosync(mod_hash_t *, mod_hash_key_t, mod_hash_val_t, + mod_hash_hndl_t); +int i_mod_hash_remove_nosync(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *); +int i_mod_hash_find_nosync(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *); +void i_mod_hash_walk_nosync(mod_hash_t *, uint_t (*)(mod_hash_key_t, + mod_hash_val_t *, void *), void *); +void i_mod_hash_clear_nosync(mod_hash_t *hash); + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/sys/priocntl.h b/usr/src/uts/common/sys/priocntl.h index ca1a92400a..6475ed0a4c 100644 --- a/usr/src/uts/common/sys/priocntl.h +++ b/usr/src/uts/common/sys/priocntl.h @@ -65,6 +65,7 @@ extern long priocntl(), priocntlset(); #define PC_SETXPARMS 7 /* Set extended scheduling parameters */ #define PC_GETXPARMS 8 /* Get extended scheduling parameters */ #define PC_SETDFLCL 9 /* Set default class, not for general use */ +#define PC_GETDFLCL 10 /* Get default class, not for general use */ #define PC_CLNULL -1 diff --git a/usr/src/uts/common/sys/proc.h b/usr/src/uts/common/sys/proc.h index fcf953262c..9a0ba2cc37 100644 --- a/usr/src/uts/common/sys/proc.h +++ b/usr/src/uts/common/sys/proc.h @@ -613,6 +613,8 @@ extern proc_t *pgfind(pid_t); extern proc_t *pgfind_zone(pid_t, zoneid_t); extern proc_t *sprlock(pid_t); extern proc_t *sprlock_zone(pid_t, zoneid_t); +extern int sprtrylock_proc(proc_t *); +extern void sprwaitlock_proc(proc_t *); extern void sprlock_proc(proc_t *); extern void sprunlock(proc_t *); extern void pid_init(void); diff --git a/usr/src/uts/common/sys/project.h b/usr/src/uts/common/sys/project.h index 679c1eddc2..5018df8499 100644 --- a/usr/src/uts/common/sys/project.h +++ b/usr/src/uts/common/sys/project.h @@ -28,15 +28,24 @@ #pragma ident "%Z%%M% %I% %E% SMI" + #ifdef __cplusplus extern "C" { #endif + +#include <sys/kstat.h> #include <sys/types.h> #include <sys/mutex.h> #include <sys/rctl.h> #include <sys/ipc_rctl.h> +typedef struct kproject_kstat { + kstat_named_t kpk_zonename; + kstat_named_t kpk_usage; + kstat_named_t kpk_value; +} kproject_kstat_t; + typedef struct kproject_data { /* Datum protected by: */ rctl_qty_t kpd_shmmax; /* shm's ipcs_lock */ ipc_rqty_t kpd_ipc; /* shm|sem|msg's ipcs lock */ @@ -44,6 +53,7 @@ typedef struct kproject_data { /* Datum protected by: */ rctl_qty_t kpd_locked_mem_ctl; /* kpj_rctls->rcs_lock */ rctl_qty_t kpd_contract; /* contract_lock */ rctl_qty_t kpd_crypto_mem; /* crypto_rctl_lock */ + kstat_t *kpd_lockedmem_kstat; /* locked memory kstat */ } kproject_data_t; @@ -76,9 +86,11 @@ typedef struct kproject { #define PROJECT_HOLD_FIND 1 #define PROJECT_HOLD_INSERT 2 +struct zone; + void project_init(void); kproject_t *project_hold(kproject_t *); -kproject_t *project_hold_by_id(projid_t, zoneid_t, int); +kproject_t *project_hold_by_id(projid_t, struct zone *, int); void project_rele(kproject_t *); int project_walk_all(zoneid_t, int (*)(kproject_t *, void *), void *); projid_t curprojid(void); diff --git a/usr/src/uts/common/sys/rctl.h b/usr/src/uts/common/sys/rctl.h index eb56fff9e5..a8480c2768 100644 --- a/usr/src/uts/common/sys/rctl.h +++ b/usr/src/uts/common/sys/rctl.h @@ -168,6 +168,7 @@ struct proc; struct task; struct kproject; struct zone; +struct kstat; typedef struct rctl_entity_p_struct { rctl_entity_t rcep_t; @@ -324,6 +325,14 @@ int rctl_incr_locked_mem(struct proc *, struct kproject *, rctl_qty_t, int); void rctl_decr_locked_mem(struct proc *, struct kproject *, rctl_qty_t, int); +int 
rctl_incr_swap(struct proc *, struct zone *, size_t); +void rctl_decr_swap(struct zone *, size_t); + +struct kstat *rctl_kstat_create_zone(struct zone *, char *, uchar_t, uint_t, + uchar_t); + +struct kstat *rctl_kstat_create_project(struct kproject *, char *, uchar_t, + uint_t, uchar_t); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/sys/resource.h b/usr/src/uts/common/sys/resource.h index 86cc716d56..bf02808d4b 100644 --- a/usr/src/uts/common/sys/resource.h +++ b/usr/src/uts/common/sys/resource.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -191,6 +190,7 @@ struct rusage { #define _RUSAGESYS_GETRUSAGE 0 /* rusage process */ #define _RUSAGESYS_GETRUSAGE_CHLD 1 /* rusage child process */ #define _RUSAGESYS_GETRUSAGE_LWP 2 /* rusage lwp */ +#define _RUSAGESYS_GETVMUSAGE 3 /* getvmusage */ #if defined(_SYSCALL32) diff --git a/usr/src/uts/common/sys/syscall.h b/usr/src/uts/common/sys/syscall.h index 96cb967023..eedadfa0c0 100644 --- a/usr/src/uts/common/sys/syscall.h +++ b/usr/src/uts/common/sys/syscall.h @@ -384,7 +384,8 @@ extern "C" { #define SYS_rusagesys 181 /* * subcodes: - * getrusage(...) :: rusagesys(RUSAGESYS_GETRUSAGE,...) + * getrusage(...) :: rusagesys(RUSAGESYS_GETRUSAGE, ...) + * getvmusage(...) :: rusagesys(RUSAGESYS_GETVMUSAGE, ...) */ #define SYS_port 182 /* diff --git a/usr/src/uts/common/sys/vm_usage.h b/usr/src/uts/common/sys/vm_usage.h new file mode 100644 index 0000000000..5f8c8b8fe5 --- /dev/null +++ b/usr/src/uts/common/sys/vm_usage.h @@ -0,0 +1,120 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_VM_USAGE_H +#define _SYS_VM_USAGE_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The flags passed to getvmusage() request how to aggregate rss/swap results. + * Results can be aggregated by zone, project, task, ruser, and/or euser. 
+ * + * If VMUSAGE_ALL_* or VMUSAGE_COL_* are passed from a non-global-zone, the + * flag is treated as VMUSAGE_*. For example, VMUSAGE_ALL_ZONES would be + * treated as VMUSAGE_ZONE. + * + * If VMUSAGE_SYSTEM is passed from a non-global zone, a result of type + * VMUSAGE_SYSTEM will be returned, but it will only reflect the usage + * of the calling zone. + * + * VMUSAGE_* requests results for the calling zone. + * VMUSAGE_ALL_* requests results for all zones. + * VMUSAGE_COL_* requests results for all zones, but collapses out the zoneid. + * For example, VMUSAGE_COL_PROJECTS requests results for all + * projects in all zones, and project N in ANY zone is treated + * as the same project. + */ +#define VMUSAGE_SYSTEM 0x1 /* rss/swap for ALL processes */ +#define VMUSAGE_ZONE 0x2 /* rss/swap for caller's zone */ +#define VMUSAGE_PROJECTS 0x4 /* rss/swap for all projects in */ + /* caller's zone */ +#define VMUSAGE_TASKS 0x8 /* rss/swap for all tasks in */ + /* caller's zones */ +#define VMUSAGE_RUSERS 0x10 /* rss/swap for all users (by process */ + /* ruser) in the caller's zone */ +#define VMUSAGE_EUSERS 0x20 /* same as VMUSAGE_RUSERS, but by */ + /* euser */ + +#define VMUSAGE_ALL_ZONES 0x40 /* rss/swap for all zones */ +#define VMUSAGE_ALL_PROJECTS 0x80 /* rss/swap for all projects in */ + /* all zones */ +#define VMUSAGE_ALL_TASKS 0x100 /* rss/swap for all tasks in all */ + /* zones */ +#define VMUSAGE_ALL_RUSERS 0x200 /* rss/swap for all users (by process */ + /* ruser) in all zones */ +#define VMUSAGE_ALL_EUSERS 0x400 /* same as VMUSAGE_ALL_RUSERS, but by */ + /* euser */ + +#define VMUSAGE_COL_PROJECTS 0x800 /* rss/swap for all projects in */ + /* all zones. Collapse zoneid. */ +#define VMUSAGE_COL_RUSERS 0x1000 /* rss/swap for all users (by process */ + /* ruser), in all zones. Collapse */ + /* zoneid */ +#define VMUSAGE_COL_EUSERS 0x2000 /* same as VMUSAGE_COL_RUSERS, but by */ + /* euser */ + +#define VMUSAGE_MASK 0x3fff /* all valid flags for getvmusage() */ + +typedef struct vmusage { + id_t vmu_zoneid; /* zoneid, or ALL_ZONES for */ + /* VMUSAGE_COL_* results */ + /* ALL_ZONES means that the result */ + /* reflects swap and rss usage for */ + /* a projid/uid across all zones */ + uint_t vmu_type; /* Entity type of result. One of: */ + /* VMUSAGE_(SYSTEM|ZONE|PROJECTS| */ + /* TASKS|RUSERS|EUSERS) */ + id_t vmu_id; /* zoneid, projid, taskid, ... 
*/ + size_t vmu_rss_all; /* total resident memory of entity */ + /* in bytes */ + size_t vmu_rss_private; /* total resident private memory */ + size_t vmu_rss_shared; /* total resident shared memory */ + size_t vmu_swap_all; /* total swap reserved, in bytes */ + size_t vmu_swap_private; /* swap reserved for private mappings */ + size_t vmu_swap_shared; /* swap reserved for shared mappings */ + +} vmusage_t; + +extern int getvmusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres); + +#ifdef _KERNEL + +int vm_getusage(uint_t, time_t, vmusage_t *, size_t *); +void vm_usage_init(); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VM_USAGE_H */ diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h index daccd16bdf..94646bc976 100644 --- a/usr/src/uts/common/sys/zone.h +++ b/usr/src/uts/common/sys/zone.h @@ -88,6 +88,8 @@ extern "C" { #define ZONE_ATTR_INITNAME 9 #define ZONE_ATTR_BOOTARGS 10 #define ZONE_ATTR_BRAND 11 +#define ZONE_ATTR_PHYS_MCAP 12 +#define ZONE_ATTR_SCHED_CLASS 13 /* Start of the brand-specific attribute namespace */ #define ZONE_ATTR_BRAND_ATTRS 32768 @@ -280,6 +282,15 @@ typedef struct zone_dataset { list_node_t zd_linkage; } zone_dataset_t; +/* + * structure for zone kstats + */ +typedef struct zone_kstat { + kstat_named_t zk_zonename; + kstat_named_t zk_usage; + kstat_named_t zk_value; +} zone_kstat_t; + typedef struct zone { /* * zone_name is never modified once set. @@ -326,14 +337,20 @@ typedef struct zone { uint_t zone_rootpathlen; /* strlen(zone_rootpath) + 1 */ uint32_t zone_shares; /* FSS shares allocated to zone */ rctl_set_t *zone_rctls; /* zone-wide (zone.*) rctls */ - kmutex_t zone_rctl_lock; /* protects zone_locked_mem and */ + kmutex_t zone_mem_lock; /* protects zone_locked_mem and */ /* kpd_locked_mem for all */ - /* projects in zone */ + /* projects in zone. */ + /* Also protects zone_max_swap */ /* grab after p_lock, before rcs_lock */ - rctl_qty_t zone_locked_mem; /* bytes of locked memory in zone */ - rctl_qty_t zone_locked_mem_ctl; /* current locked memory */ + rctl_qty_t zone_locked_mem; /* bytes of locked memory in */ + /* zone */ + rctl_qty_t zone_locked_mem_ctl; /* Current locked memory */ /* limit. Protected by */ /* zone_rctls->rcs_lock */ + rctl_qty_t zone_max_swap; /* bytes of swap reserved by zone */ + rctl_qty_t zone_max_swap_ctl; /* current swap limit. */ + /* Protected by */ + /* zone_rctls->rcs_lock */ list_t zone_zsd; /* list of Zone-Specific Data values */ kcondvar_t zone_cv; /* used to signal state changes */ struct proc *zone_zsched; /* Dummy kernel "zsched" process */ @@ -341,6 +358,7 @@ typedef struct zone { char *zone_initname; /* fs path to 'init' */ int zone_boot_err; /* for zone_boot() if boot fails */ char *zone_bootargs; /* arguments passed via zone_boot() */ + uint64_t zone_phys_mcap; /* physical memory cap */ /* * zone_kthreads is protected by zone_status_lock. */ @@ -376,6 +394,9 @@ typedef struct zone { boolean_t zone_restart_init; /* Restart init if it dies? 
*/ struct brand *zone_brand; /* zone's brand */ + id_t zone_defaultcid; /* dflt scheduling class id */ + kstat_t *zone_swapresv_kstat; + kstat_t *zone_lockedmem_kstat; } zone_t; /* @@ -553,6 +574,7 @@ extern void mount_completed(void); extern int zone_walk(int (*)(zone_t *, void *), void *); extern rctl_hndl_t rc_zone_locked_mem; +extern rctl_hndl_t rc_zone_max_swap; #endif /* _KERNEL */ diff --git a/usr/src/uts/common/syscall/processor_bind.c b/usr/src/uts/common/syscall/processor_bind.c index 10ca1178d5..bd416e43e6 100644 --- a/usr/src/uts/common/syscall/processor_bind.c +++ b/usr/src/uts/common/syscall/processor_bind.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -285,9 +284,10 @@ processor_bind(idtype_t idtype, id_t id, processorid_t bind, break; case P_PROJID: + pp = curproc; if (id == P_MYID) id = curprojid(); - if ((kpj = project_hold_by_id(id, getzoneid(), + if ((kpj = project_hold_by_id(id, pp->p_zone, PROJECT_HOLD_FIND)) == NULL) { ret = ESRCH; } else { diff --git a/usr/src/uts/common/syscall/pset.c b/usr/src/uts/common/syscall/pset.c index 5d3b7e6233..767529fc5d 100644 --- a/usr/src/uts/common/syscall/pset.c +++ b/usr/src/uts/common/syscall/pset.c @@ -542,9 +542,10 @@ pset_bind(psetid_t pset, idtype_t idtype, id_t id, psetid_t *opset) break; case P_PROJID: + pp = curproc; if (id == P_MYID) id = curprojid(); - if ((kpj = project_hold_by_id(id, getzoneid(), + if ((kpj = project_hold_by_id(id, pp->p_zone, PROJECT_HOLD_FIND)) == NULL) { error = ESRCH; break; diff --git a/usr/src/uts/common/syscall/rusagesys.c b/usr/src/uts/common/syscall/rusagesys.c index 3e09643981..036500932f 100644 --- a/usr/src/uts/common/syscall/rusagesys.c +++ b/usr/src/uts/common/syscall/rusagesys.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -35,6 +34,7 @@ #include <sys/time.h> #include <sys/errno.h> #include <sys/resource.h> +#include <sys/vm_usage.h> static int getrusage(void *user_rusage) @@ -246,16 +246,19 @@ getrusage_lwp(void *user_rusage) } int -rusagesys(int code, void * arg) +rusagesys(int code, void *arg1, void *arg2, void *arg3, void *arg4) { switch (code) { case _RUSAGESYS_GETRUSAGE: - return (getrusage(arg)); + return (getrusage(arg1)); case _RUSAGESYS_GETRUSAGE_CHLD: - return (getrusage_chld(arg)); + return (getrusage_chld(arg1)); case _RUSAGESYS_GETRUSAGE_LWP: - return (getrusage_lwp(arg)); + return (getrusage_lwp(arg1)); + case _RUSAGESYS_GETVMUSAGE: + return (vm_getusage((uint_t)(uintptr_t)arg1, (time_t)arg2, + (vmusage_t *)arg3, (size_t *)arg4)); default: return (set_errno(EINVAL)); } diff --git a/usr/src/uts/common/syscall/tasksys.c b/usr/src/uts/common/syscall/tasksys.c index 705b543a37..bec091e61c 100644 --- a/usr/src/uts/common/syscall/tasksys.c +++ b/usr/src/uts/common/syscall/tasksys.c @@ -25,6 +25,7 @@ #pragma ident "%Z%%M% %I% %E% SMI" + /* * System calls for creating and inquiring about tasks and projects */ @@ -102,7 +103,7 @@ tasksys_settaskid(projid_t projid, uint_t flags) * Put a hold on our new project and make sure that nobody is * trying to bind it to a pool while we're joining. */ - kpj = project_hold_by_id(projid, getzoneid(), PROJECT_HOLD_INSERT); + kpj = project_hold_by_id(projid, p->p_zone, PROJECT_HOLD_INSERT); e.rcep_p.proj = kpj; e.rcep_t = RCENTITY_PROJECT; @@ -111,7 +112,7 @@ tasksys_settaskid(projid_t projid, uint_t flags) zone = p->p_zone; mutex_enter(&zone->zone_nlwps_lock); - mutex_enter(&zone->zone_rctl_lock); + mutex_enter(&zone->zone_mem_lock); if (kpj->kpj_nlwps + p->p_lwpcnt > kpj->kpj_nlwps_ctl) if (rctl_test_entity(rc_project_nlwps, kpj->kpj_rctls, p, &e, @@ -130,7 +131,7 @@ tasksys_settaskid(projid_t projid, uint_t flags) rctlfail = 1; if (rctlfail) { - mutex_exit(&zone->zone_rctl_lock); + mutex_exit(&zone->zone_mem_lock); mutex_exit(&zone->zone_nlwps_lock); if (curthread != p->p_agenttp) continuelwps(p); @@ -144,7 +145,7 @@ tasksys_settaskid(projid_t projid, uint_t flags) oldpj->kpj_data.kpd_locked_mem -= p->p_locked_mem; oldpj->kpj_nlwps -= p->p_lwpcnt; - mutex_exit(&zone->zone_rctl_lock); + mutex_exit(&zone->zone_mem_lock); mutex_exit(&zone->zone_nlwps_lock); mutex_exit(&p->p_lock); diff --git a/usr/src/uts/common/vm/anon.h b/usr/src/uts/common/vm/anon.h index 90f6e1e661..ed59ec590b 100644 --- a/usr/src/uts/common/vm/anon.h +++ b/usr/src/uts/common/vm/anon.h @@ -42,6 +42,7 @@ #pragma ident "%Z%%M% %I% %E% SMI" #include <sys/cred.h> +#include <sys/zone.h> #include <vm/seg.h> #include <vm/vpage.h> @@ -387,8 +388,8 @@ extern int anon_map_demotepages(struct anon_map *, ulong_t, struct seg *, caddr_t, uint_t, struct vpage [], struct cred *); extern void anon_shmap_free_pages(struct anon_map *, ulong_t, size_t); -extern int anon_resvmem(size_t, uint_t); -extern void anon_unresv(size_t); +extern int anon_resvmem(size_t, boolean_t, zone_t *); +extern void anon_unresvmem(size_t, zone_t *); extern struct anon_map *anonmap_alloc(size_t, size_t); extern void anonmap_free(struct anon_map *); extern void anon_decref(struct anon *); @@ -416,9 +417,16 @@ extern void anon_array_exit(anon_sync_obj_t *); * request and if so, reserves the appropriate anonymous memory resources. * anon_checkspace just checks to see if there is space to fulfill the request, * without taking any resources. Both return 1 if successful and 0 if not. 
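+ *
+ * A minimal sketch of the new zone-aware reservation pattern (illustrative
+ * only; "zp" stands for whatever zone_t pointer the caller holds, and a
+ * NULL zone pointer bypasses the zone.max-swap accounting):
+ *
+ *	if (anon_resvmem(size, B_TRUE, zp) == 0)
+ *		return (ENOMEM);
+ *	...
+ *	anon_unresvmem(size, zp);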
+ * + * Macros are provided as anon reservation is usually charged to the zone of + * the current process. In some cases (such as anon reserved by tmpfs), a + * zone pointer is needed to charge the appropriate zone. */ -#define anon_resv(size) anon_resvmem((size), 1) -#define anon_checkspace(size) anon_resvmem((size), 0) +#define anon_unresv(size) anon_unresvmem(size, curproc->p_zone) +#define anon_unresv_zone(size, zone) anon_unresvmem(size, zone) +#define anon_resv(size) anon_resvmem((size), 1, curproc->p_zone) +#define anon_resv_zone(size, zone) anon_resvmem((size), 1, zone) +#define anon_checkspace(size, zone) anon_resvmem((size), 0, zone) /* * Flags to anon_private diff --git a/usr/src/uts/common/vm/seg.h b/usr/src/uts/common/vm/seg.h index 0ee7d62ce1..a9683c0e54 100644 --- a/usr/src/uts/common/vm/seg.h +++ b/usr/src/uts/common/vm/seg.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -245,6 +244,9 @@ uint_t seg_pages(struct seg *); #endif /* VMDEBUG */ +boolean_t seg_can_change_zones(struct seg *); +size_t seg_swresv(struct seg *); + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/vm/seg_kp.c b/usr/src/uts/common/vm/seg_kp.c index ff9c47e0ff..d58e873a19 100644 --- a/usr/src/uts/common/vm/seg_kp.c +++ b/usr/src/uts/common/vm/seg_kp.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -147,6 +146,7 @@ uint32_t red_closest = UINT_MAX; uint32_t red_ndoubles; pgcnt_t anon_segkp_pages_locked; /* See vm/anon.h */ +pgcnt_t anon_segkp_pages_resv; /* anon reserved by seg_kp */ static struct seg_ops segkp_ops = { SEGKP_BADOP(int), /* dup */ @@ -448,8 +448,10 @@ segkp_get_internal( * Note that we don't need swap space for the red zone page. */ if (amp != NULL) { - ASSERT((flags & KPD_NO_ANON) == 0); - /* The reserve has been done and the anon_hdr is separate. */ + /* + * The swap reservation has been done, if required, and the + * anon_hdr is separate. 
+ */ anon_idx = 0; kpd->kp_anon_idx = anon_idx; kpd->kp_anon = amp->ahp; @@ -458,7 +460,7 @@ segkp_get_internal( kpd, vbase, len, flags, 1); } else if ((flags & KPD_NO_ANON) == 0) { - if (anon_resv(SEGKP_MAPLEN(len, flags)) == 0) { + if (anon_resv_zone(SEGKP_MAPLEN(len, flags), NULL) == 0) { if (flags & KPD_LOCKED) { atomic_add_long(&anon_segkp_pages_locked, -pages); @@ -468,6 +470,8 @@ segkp_get_internal( kmem_free(kpd, sizeof (struct segkp_data)); return (NULL); } + atomic_add_long(&anon_segkp_pages_resv, + btop(SEGKP_MAPLEN(len, flags))); anon_idx = ((uintptr_t)(vbase - s_base)) >> PAGESHIFT; kpd->kp_anon_idx = anon_idx; kpd->kp_anon = kpsd->kpsd_anon; @@ -704,7 +708,9 @@ segkp_release_internal(struct seg *seg, struct segkp_data *kpd, size_t len) if ((kpd->kp_flags & KPD_HASAMP) == 0) { anon_free(kpd->kp_anon, kpd->kp_anon_idx + i, PAGESIZE); - anon_unresv(PAGESIZE); + anon_unresv_zone(PAGESIZE, NULL); + atomic_add_long(&anon_segkp_pages_resv, + -1); } TRACE_5(TR_FAC_VM, TR_ANON_SEGKP, "anon segkp:%p %p %lu %u %u", diff --git a/usr/src/uts/common/vm/seg_vn.c b/usr/src/uts/common/vm/seg_vn.c index f48db44acc..e2069b27c6 100644 --- a/usr/src/uts/common/vm/seg_vn.c +++ b/usr/src/uts/common/vm/seg_vn.c @@ -2323,8 +2323,9 @@ segvn_faultpage( * zeroes. If no advance reservations, reserve now. */ if (svd->flags & MAP_NORESERVE) { - if (anon_resv(ptob(1))) { - svd->swresv += ptob(1); + if (anon_resv_zone(ptob(1), + seg->s_as->a_proc->p_zone)) { + atomic_add_long(&svd->swresv, ptob(1)); } else { err = ENOMEM; goto out; diff --git a/usr/src/uts/common/vm/vm_anon.c b/usr/src/uts/common/vm/vm_anon.c index 0cad34257c..3f225a345a 100644 --- a/usr/src/uts/common/vm/vm_anon.c +++ b/usr/src/uts/common/vm/vm_anon.c @@ -113,6 +113,7 @@ #include <sys/policy.h> #include <sys/condvar_impl.h> #include <sys/mutex_impl.h> +#include <sys/rctl.h> #include <vm/as.h> #include <vm/hat.h> @@ -729,12 +730,22 @@ set_anoninfo(void) * Return non-zero on success. */ int -anon_resvmem(size_t size, uint_t takemem) +anon_resvmem(size_t size, boolean_t takemem, zone_t *zone) { pgcnt_t npages = btopr(size); pgcnt_t mswap_pages = 0; pgcnt_t pswap_pages = 0; + proc_t *p = curproc; + if (zone != NULL && takemem) { + /* test zone.max-swap resource control */ + mutex_enter(&p->p_lock); + if (rctl_incr_swap(p, zone, ptob(npages)) != 0) { + mutex_exit(&p->p_lock); + return (0); + } + mutex_exit(&p->p_lock); + } mutex_enter(&anoninfo_lock); /* @@ -834,16 +845,17 @@ anon_resvmem(size_t size, uint_t takemem) mutex_exit(&anoninfo_lock); ANON_PRINT(A_RESV, ("anon_resvmem: not enough space from swapfs\n")); + if (zone != NULL && takemem) + rctl_decr_swap(zone, ptob(npages)); return (0); } } - /* * Give back an anon reservation. 
 */
 void
-anon_unresv(size_t size)
+anon_unresvmem(size_t size, zone_t *zone)
 {
 	pgcnt_t npages = btopr(size);
 	spgcnt_t mem_free_pages = 0;
@@ -851,6 +863,8 @@
 #ifdef	ANON_DEBUG
 	pgcnt_t mem_resv;
 #endif
+	if (zone != NULL)
+		rctl_decr_swap(zone, ptob(npages));
 
 	mutex_enter(&anoninfo_lock);
diff --git a/usr/src/uts/common/vm/vm_page.c b/usr/src/uts/common/vm/vm_page.c
index 05bfe662be..adac07b766 100644
--- a/usr/src/uts/common/vm/vm_page.c
+++ b/usr/src/uts/common/vm/vm_page.c
@@ -77,7 +77,7 @@
 #include <vm/pvn.h>
 #include <vm/seg_kmem.h>
 #include <vm/vm_dep.h>
-
+#include <sys/vm_usage.h>
 #include <fs/fs_subr.h>
 
 static int nopageage = 0;
@@ -343,6 +343,7 @@ vm_init(void)
 	(void) callb_add(callb_vm_cpr, 0, CB_CL_CPR_VM, "vm");
 	page_init_mem_config();
 	page_retire_init();
+	vm_usage_init();
 }
 
 /*
diff --git a/usr/src/uts/common/vm/vm_seg.c b/usr/src/uts/common/vm/vm_seg.c
index 50cc21cdf7..aed892969d 100644
--- a/usr/src/uts/common/vm/vm_seg.c
+++ b/usr/src/uts/common/vm/vm_seg.c
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */
@@ -54,12 +53,14 @@
 #include <sys/cmn_err.h>
 #include <sys/callb.h>
 #include <sys/mem_config.h>
+#include <sys/mman.h>
 
 #include <vm/hat.h>
 #include <vm/as.h>
 #include <vm/seg.h>
 #include <vm/seg_kmem.h>
-
+#include <vm/seg_spt.h>
+#include <vm/seg_vn.h>
 /*
  * kstats for segment advise
  */
@@ -950,3 +951,48 @@ seg_pinit_mem_config(void)
 	 */
 	ASSERT(ret == 0);
 }
+
+extern struct seg_ops segvn_ops;
+extern struct seg_ops segspt_shmops;
+
+/*
+ * Verify that segment is not a shared anonymous segment which reserves
+ * swap. zone.max-swap accounting (zone->zone_max_swap) cannot be transferred
+ * from one zone to another if any segments are shared. This is because the
+ * last process to exit will credit the swap reservation. This could lead
+ * to the swap being reserved by one zone, and credited to another.
+ */
+boolean_t
+seg_can_change_zones(struct seg *seg)
+{
+	struct segvn_data *svd;
+
+	if (seg->s_ops == &segspt_shmops)
+		return (B_FALSE);
+
+	if (seg->s_ops == &segvn_ops) {
+		svd = (struct segvn_data *)seg->s_data;
+		if (svd->type == MAP_SHARED &&
+		    svd->amp != NULL &&
+		    svd->amp->swresv > 0)
+			return (B_FALSE);
+	}
+	return (B_TRUE);
+}
+
+/*
+ * Return swap reserved by a segment backing a private mapping.
+ */
+size_t
+seg_swresv(struct seg *seg)
+{
+	struct segvn_data *svd;
+	size_t swap = 0;
+
+	if (seg->s_ops == &segvn_ops) {
+		svd = (struct segvn_data *)seg->s_data;
+		if (svd->type == MAP_PRIVATE && svd->swresv > 0)
+			swap = svd->swresv;
+	}
+	return (swap);
+}
diff --git a/usr/src/uts/common/vm/vm_usage.c b/usr/src/uts/common/vm/vm_usage.c
new file mode 100644
index 0000000000..32a8811e10
--- /dev/null
+++ b/usr/src/uts/common/vm/vm_usage.c
@@ -0,0 +1,1978 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * vm_usage
+ *
+ * This file implements the getvmusage() private system call.
+ * getvmusage() counts the amount of resident memory pages and swap
+ * reserved by the specified process collective. A "process collective" is
+ * the set of processes owned by a particular zone, project, task, or user.
+ *
+ * rss and swap are counted so that for a given process collective, a page is
+ * only counted once. For example, this means that if multiple processes in
+ * the same project map the same page, then the project will only be charged
+ * once for that page. On the other hand, if two processes in different
+ * projects map the same page, then both projects will be charged
+ * for the page.
+ *
+ * The vm_getusage() calculation is implemented so that the first thread
+ * performs the rss/swap counting. Other callers will wait for that thread to
+ * finish, copying the results. This enables multiple rcapds and prstats to
+ * consume data from the same calculation. The results are also cached so that
+ * a caller interested in recent results can just copy them instead of starting
+ * a new calculation. The caller passes the maximum age (in seconds) of the
+ * data. If the cached data is young enough, the cache is copied, otherwise,
+ * a new calculation is executed and the cache is replaced with the new
+ * data.
+ *
+ * The rss calculation for each process collective is as follows:
+ *
+ *   - Inspect flags, determine if counting rss for zones, projects, tasks,
+ *     and/or users.
+ *   - For each proc:
+ *	- Figure out proc's collectives (zone, project, task, and/or user).
+ *	- For each seg in proc's address space:
+ *		- If seg is private:
+ *			- Lookup anons in the amp.
+ *			- For incore pages not previously visited for each of
+ *			  the proc's collectives, add incore pagesize to each
+ *			  collective.
+ *			  Anons with a refcnt of 1 can be assumed to be not
+ *			  previously visited.
+ *			- For address ranges without anons in the amp:
+ *				- Lookup pages in underlying vnode.
+ *				- For incore pages not previously visited for
+ *				  each of the proc's collectives, add incore
+ *				  pagesize to each collective.
+ *		- If seg is shared:
+ *			- Lookup pages in the shared amp or vnode.
+ *			- For incore pages not previously visited for each of
+ *			  the proc's collectives, add incore pagesize to each
+ *			  collective.
+ *
+ * Swap is reserved by private segments and shared anonymous segments.
+ * The only shared anon segments which do not reserve swap are ISM segments
+ * and schedctl segments, both of which can be identified by having
+ * amp->swresv == 0.
+ *
+ * The swap calculation for each collective is as follows:
+ *
+ *   - Inspect flags, determine if counting swap for zones, projects, tasks,
+ *     and/or users.
+ *   - For each proc:
+ *	- Figure out proc's collectives (zone, project, task, and/or user).
+ *	- For each seg in proc's address space:
+ *		- If seg is private:
+ *			- Add svd->swresv pages to swap count for each of the
+ *			  proc's collectives.
+ *		- If seg is anon, shared, and amp->swresv != 0
+ *			- For address ranges in amp not previously visited for
+ *			  each of the proc's collectives, add size of address
+ *			  range to the swap count for each collective.
+ *
+ * These two calculations are done simultaneously, with most of the work
+ * being done in vmu_calculate_seg(). The results of the calculation are
+ * copied into "vmu_data.vmu_cache_results".
+ *
+ * To perform the calculation, various things are tracked and cached:
+ *
+ *    - incore/not-incore page ranges for all vnodes.
+ *	(vmu_data.vmu_all_vnodes_hash)
+ *	This eliminates looking up the same page more than once.
+ *
+ *    - incore/not-incore page ranges for all shared amps.
+ *	(vmu_data.vmu_all_amps_hash)
+ *	This eliminates looking up the same page more than once.
+ *
+ *    - visited page ranges for each collective.
+ *	   - per vnode (entity->vme_vnode_hash)
+ *	   - per shared amp (entity->vme_amp_hash)
+ *	For accurate counting of map-shared and cow-shared pages.
+ *
+ *    - visited private anons (refcnt > 1) for each collective.
+ *	(entity->vme_anon_hash)
+ *	For accurate counting of cow-shared pages.
+ *
+ * The common accounting structure is the vmu_entity_t, which represents
+ * collectives:
+ *
+ *    - A zone.
+ *    - A project, task, or user within a zone.
+ *    - The entire system (vmu_data.vmu_system).
+ *    - Each collapsed (col) project and user. This means a given projid or
+ *	uid, regardless of which zone the process is in. For instance,
+ *	project 0 in the global zone and project 0 in a non-global zone are
+ *	the same collapsed project.
+ *
+ * Each entity structure tracks which pages have already been visited for
+ * that entity (via previously inspected processes) so that these pages are
+ * not double counted.
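+ *
+ * A minimal usage sketch (hypothetical caller; the flag combination, age,
+ * and buffer size are illustrative only):
+ *
+ *	vmusage_t results[32];
+ *	size_t nres = 32;
+ *
+ *	ret = getvmusage(VMUSAGE_ZONE | VMUSAGE_PROJECTS, 5, results, &nres);
+ *
+ * Here results no older than 5 seconds are acceptable, so several consumers
+ * polling at that period can share a single calculation; on return, nres
+ * reflects the number of results available.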
+ */
+
+#include <sys/errno.h>
+#include <sys/types.h>
+#include <sys/zone.h>
+#include <sys/proc.h>
+#include <sys/project.h>
+#include <sys/task.h>
+#include <sys/thread.h>
+#include <sys/time.h>
+#include <sys/mman.h>
+#include <sys/modhash.h>
+#include <sys/modhash_impl.h>
+#include <sys/shm.h>
+#include <sys/swap.h>
+#include <sys/synch.h>
+#include <sys/systm.h>
+#include <sys/var.h>
+#include <sys/vm_usage.h>
+#include <vm/anon.h>
+#include <vm/as.h>
+#include <vm/seg_vn.h>
+#include <vm/seg_spt.h>
+
+#define	VMUSAGE_HASH_SIZE		512
+
+#define	VMUSAGE_TYPE_VNODE		1
+#define	VMUSAGE_TYPE_AMP		2
+#define	VMUSAGE_TYPE_ANON		3
+
+#define	VMUSAGE_BOUND_UNKNOWN		0
+#define	VMUSAGE_BOUND_INCORE		1
+#define	VMUSAGE_BOUND_NOT_INCORE	2
+
+/*
+ * bounds for vnodes and shared amps
+ * Each bound is either entirely incore, entirely not in core, or
+ * entirely unknown. Bounds are stored in order by offset.
+ */
+typedef struct vmu_bound {
+	struct vmu_bound *vmb_next;
+	pgcnt_t vmb_start;  /* page offset in vnode/amp on which bound starts */
+	pgcnt_t	vmb_end;    /* page offset in vnode/amp on which bound ends */
+	char	vmb_type;   /* One of VMUSAGE_BOUND_* */
+} vmu_bound_t;
+
+/*
+ * hash of visited objects (vnodes or shared amps)
+ * key is address of vnode or amp. Bounds lists known incore/non-incore
+ * bounds for vnode/amp.
+ */
+typedef struct vmu_object {
+	struct vmu_object	*vmo_next;	/* free list */
+	caddr_t		vmo_key;
+	short		vmo_type;
+	vmu_bound_t	*vmo_bounds;
+} vmu_object_t;
+
+/*
+ * Entity by which to count results.
+ *
+ * The entity structure keeps the current rss/swap counts for each entity
+ * (zone, project, etc), and hashes of vm structures that have already
+ * been visited for the entity.
+ *
+ * vme_next:	links the list of all entities currently being counted by
+ *		vmu_calculate().
+ *
+ * vme_next_calc: links the list of entities related to the current process
+ *		 being counted by vmu_calculate_proc().
+ *
+ * vmu_calculate_proc() walks all processes. For each process, it makes a
+ * list of the entities related to that process using vme_next_calc. This
+ * list changes each time vmu_calculate_proc() is called.
+ *
+ */
+typedef struct vmu_entity {
+	struct vmu_entity *vme_next;
+	struct vmu_entity *vme_next_calc;
+	mod_hash_t	*vme_vnode_hash; /* vnodes visited for entity */
+	mod_hash_t	*vme_amp_hash;	 /* shared amps visited for entity */
+	mod_hash_t	*vme_anon_hash;	 /* cow anons visited for entity */
+	vmusage_t	vme_result;	 /* identifies entity and results */
+} vmu_entity_t;
+
+/*
+ * Hash of entities visited within a zone, and an entity for the zone
+ * itself.
+ */
+typedef struct vmu_zone {
+	struct vmu_zone	*vmz_next;	/* free list */
+	id_t		vmz_id;
+	vmu_entity_t	*vmz_zone;
+	mod_hash_t	*vmz_projects_hash;
+	mod_hash_t	*vmz_tasks_hash;
+	mod_hash_t	*vmz_rusers_hash;
+	mod_hash_t	*vmz_eusers_hash;
+} vmu_zone_t;
+
+/*
+ * Cache of results from last calculation
+ */
+typedef struct vmu_cache {
+	vmusage_t	*vmc_results;	/* Results from last call to */
+					/* vm_getusage(). */
+	uint64_t	vmc_nresults;	/* Count of cached results */
+	uint64_t	vmc_refcnt;	/* refcnt for free */
+	uint_t		vmc_flags;	/* Flags for vm_getusage() */
+	hrtime_t	vmc_timestamp;	/* when cache was created */
+} vmu_cache_t;
+
+/*
+ * top level rss info for the system
+ */
+typedef struct vmu_data {
+	kmutex_t	vmu_lock;		/* Protects vmu_data */
+	kcondvar_t	vmu_cv;			/* Used to signal threads */
+						/* waiting for the calc */
+						/* thread to finish */
+	vmu_entity_t	*vmu_system;		/* Entity for tracking */
+						/* rss/swap for all processes */
+						/* in all zones */
+	mod_hash_t	*vmu_zones_hash;	/* Zones visited */
+	mod_hash_t	*vmu_projects_col_hash;	/* These *_col_hash hashes */
+	mod_hash_t	*vmu_rusers_col_hash;	/* keep track of entities, */
+	mod_hash_t	*vmu_eusers_col_hash;	/* ignoring zoneid, in order */
+						/* to implement VMUSAGE_COL_* */
+						/* flags, which aggregate by */
+						/* project or user regardless */
+						/* of zoneid. */
+	mod_hash_t	*vmu_all_vnodes_hash;	/* System wide visited vnodes */
+						/* to track incore/not-incore */
+	mod_hash_t	*vmu_all_amps_hash;	/* System wide visited shared */
+						/* amps to track incore/not- */
+						/* incore */
+	vmu_entity_t	*vmu_entities;		/* Linked list of entities */
+	size_t		vmu_nentities;		/* Count of entities in list */
+	vmu_cache_t	*vmu_cache;		/* Cached results */
+	kthread_t	*vmu_calc_thread;	/* NULL, or thread running */
+						/* vmu_calculate() */
+	uint_t		vmu_calc_flags;		/* Flags being used by */
+						/* currently running calc */
+						/* thread */
+	uint_t		vmu_pending_flags;	/* Flags of vm_getusage() */
+						/* threads waiting for */
+						/* calc thread to finish */
+	uint_t		vmu_pending_waiters;	/* Number of threads waiting */
+						/* for calc thread */
+	vmu_bound_t	*vmu_free_bounds;
+	vmu_object_t	*vmu_free_objects;
+	vmu_entity_t	*vmu_free_entities;
+	vmu_zone_t	*vmu_free_zones;
+} vmu_data_t;
+
+extern struct as kas;
+extern proc_t *practive;
+extern zone_t *global_zone;
+extern struct seg_ops segvn_ops;
+extern struct seg_ops segspt_shmops;
+
+static vmu_data_t vmu_data;
+static kmem_cache_t *vmu_bound_cache;
+static kmem_cache_t *vmu_object_cache;
+
+/*
+ * Save a bound on the free list
+ */
+static void
+vmu_free_bound(vmu_bound_t *bound)
+{
+	bound->vmb_next = vmu_data.vmu_free_bounds;
+	vmu_data.vmu_free_bounds = bound;
+}
+
+/*
+ * Free an object, and all visited bound info.
+ */
+static void
+vmu_free_object(mod_hash_val_t val)
+{
+	vmu_object_t *obj = (vmu_object_t *)val;
+	vmu_bound_t *bound = obj->vmo_bounds;
+	vmu_bound_t *tmp;
+
+	while (bound != NULL) {
+		tmp = bound;
+		bound = bound->vmb_next;
+		vmu_free_bound(tmp);
+	}
+	obj->vmo_next = vmu_data.vmu_free_objects;
+	vmu_data.vmu_free_objects = obj;
+}
+
+/*
+ * Free an entity, and hashes of visited objects for that entity.
+ */
+static void
+vmu_free_entity(mod_hash_val_t val)
+{
+	vmu_entity_t *entity = (vmu_entity_t *)val;
+
+	if (entity->vme_vnode_hash != NULL)
+		i_mod_hash_clear_nosync(entity->vme_vnode_hash);
+	if (entity->vme_amp_hash != NULL)
+		i_mod_hash_clear_nosync(entity->vme_amp_hash);
+	if (entity->vme_anon_hash != NULL)
+		i_mod_hash_clear_nosync(entity->vme_anon_hash);
+
+	entity->vme_next = vmu_data.vmu_free_entities;
+	vmu_data.vmu_free_entities = entity;
+}
+
+/*
+ * Free zone entity, and all hashes of entities inside that zone,
+ * which are projects, tasks, and users.
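+ *
+ * Note that clearing each hash runs its value destructor
+ * (vmu_free_entity), so the zone's entities are recycled onto
+ * vmu_data.vmu_free_entities rather than freed back to the allocator.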
+ */
+static void
+vmu_free_zone(mod_hash_val_t val)
+{
+	vmu_zone_t *zone = (vmu_zone_t *)val;
+
+	if (zone->vmz_zone != NULL) {
+		vmu_free_entity((mod_hash_val_t)zone->vmz_zone);
+		zone->vmz_zone = NULL;
+	}
+	if (zone->vmz_projects_hash != NULL)
+		i_mod_hash_clear_nosync(zone->vmz_projects_hash);
+	if (zone->vmz_tasks_hash != NULL)
+		i_mod_hash_clear_nosync(zone->vmz_tasks_hash);
+	if (zone->vmz_rusers_hash != NULL)
+		i_mod_hash_clear_nosync(zone->vmz_rusers_hash);
+	if (zone->vmz_eusers_hash != NULL)
+		i_mod_hash_clear_nosync(zone->vmz_eusers_hash);
+	zone->vmz_next = vmu_data.vmu_free_zones;
+	vmu_data.vmu_free_zones = zone;
+}
+
+/*
+ * Initialize synchronization primitives and hashes for system-wide tracking
+ * of visited vnodes and shared amps. Initialize results cache.
+ */
+void
+vm_usage_init()
+{
+	mutex_init(&vmu_data.vmu_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&vmu_data.vmu_cv, NULL, CV_DEFAULT, NULL);
+
+	vmu_data.vmu_system = NULL;
+	vmu_data.vmu_zones_hash = NULL;
+	vmu_data.vmu_projects_col_hash = NULL;
+	vmu_data.vmu_rusers_col_hash = NULL;
+	vmu_data.vmu_eusers_col_hash = NULL;
+
+	vmu_data.vmu_free_bounds = NULL;
+	vmu_data.vmu_free_objects = NULL;
+	vmu_data.vmu_free_entities = NULL;
+	vmu_data.vmu_free_zones = NULL;
+
+	vmu_data.vmu_all_vnodes_hash = mod_hash_create_ptrhash(
+	    "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
+	    sizeof (vnode_t));
+	vmu_data.vmu_all_amps_hash = mod_hash_create_ptrhash(
+	    "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
+	    sizeof (struct anon_map));
+	vmu_data.vmu_projects_col_hash = mod_hash_create_idhash(
+	    "vmusage collapsed project hash", VMUSAGE_HASH_SIZE,
+	    vmu_free_entity);
+	vmu_data.vmu_rusers_col_hash = mod_hash_create_idhash(
+	    "vmusage collapsed ruser hash", VMUSAGE_HASH_SIZE,
+	    vmu_free_entity);
+	vmu_data.vmu_eusers_col_hash = mod_hash_create_idhash(
+	    "vmusage collapsed euser hash", VMUSAGE_HASH_SIZE,
+	    vmu_free_entity);
+	vmu_data.vmu_zones_hash = mod_hash_create_idhash(
+	    "vmusage zone hash", VMUSAGE_HASH_SIZE, vmu_free_zone);
+
+	vmu_bound_cache = kmem_cache_create("vmu_bound_cache",
+	    sizeof (vmu_bound_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+	vmu_object_cache = kmem_cache_create("vmu_object_cache",
+	    sizeof (vmu_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+	vmu_data.vmu_entities = NULL;
+	vmu_data.vmu_nentities = 0;
+
+	vmu_data.vmu_cache = NULL;
+	vmu_data.vmu_calc_thread = NULL;
+	vmu_data.vmu_calc_flags = 0;
+	vmu_data.vmu_pending_flags = 0;
+	vmu_data.vmu_pending_waiters = 0;
+}
+
+/*
+ * Allocate hashes for tracking vm objects visited for an entity.
+ * Update list of entities.
+ */ +static vmu_entity_t * +vmu_alloc_entity(id_t id, int type, id_t zoneid) +{ + vmu_entity_t *entity; + + if (vmu_data.vmu_free_entities != NULL) { + entity = vmu_data.vmu_free_entities; + vmu_data.vmu_free_entities = + vmu_data.vmu_free_entities->vme_next; + bzero(&entity->vme_result, sizeof (vmusage_t)); + } else { + entity = kmem_zalloc(sizeof (vmu_entity_t), KM_SLEEP); + } + entity->vme_result.vmu_id = id; + entity->vme_result.vmu_zoneid = zoneid; + entity->vme_result.vmu_type = type; + + if (entity->vme_vnode_hash == NULL) + entity->vme_vnode_hash = mod_hash_create_ptrhash( + "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object, + sizeof (vnode_t)); + + if (entity->vme_amp_hash == NULL) + entity->vme_amp_hash = mod_hash_create_ptrhash( + "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object, + sizeof (struct anon_map)); + + if (entity->vme_anon_hash == NULL) + entity->vme_anon_hash = mod_hash_create_ptrhash( + "vmusage anon hash", VMUSAGE_HASH_SIZE, + mod_hash_null_valdtor, sizeof (struct anon)); + + entity->vme_next = vmu_data.vmu_entities; + vmu_data.vmu_entities = entity; + vmu_data.vmu_nentities++; + + return (entity); +} + +/* + * Allocate a zone entity, and hashes for tracking visited vm objects + * for projects, tasks, and users within that zone. + */ +static vmu_zone_t * +vmu_alloc_zone(id_t id) +{ + vmu_zone_t *zone; + + if (vmu_data.vmu_free_zones != NULL) { + zone = vmu_data.vmu_free_zones; + vmu_data.vmu_free_zones = + vmu_data.vmu_free_zones->vmz_next; + zone->vmz_next = NULL; + zone->vmz_zone = NULL; + } else { + zone = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP); + } + + zone->vmz_id = id; + + if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0) + zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id); + + if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS | + VMUSAGE_ALL_PROJECTS)) != 0 && zone->vmz_projects_hash == NULL) + zone->vmz_projects_hash = mod_hash_create_idhash( + "vmusage project hash", VMUSAGE_HASH_SIZE, vmu_free_entity); + + if ((vmu_data.vmu_calc_flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) + != 0 && zone->vmz_tasks_hash == NULL) + zone->vmz_tasks_hash = mod_hash_create_idhash( + "vmusage task hash", VMUSAGE_HASH_SIZE, vmu_free_entity); + + if ((vmu_data.vmu_calc_flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) + != 0 && zone->vmz_rusers_hash == NULL) + zone->vmz_rusers_hash = mod_hash_create_idhash( + "vmusage ruser hash", VMUSAGE_HASH_SIZE, vmu_free_entity); + + if ((vmu_data.vmu_calc_flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) + != 0 && zone->vmz_eusers_hash == NULL) + zone->vmz_eusers_hash = mod_hash_create_idhash( + "vmusage euser hash", VMUSAGE_HASH_SIZE, vmu_free_entity); + + return (zone); +} + +/* + * Allocate a structure for tracking visited bounds for a vm object. + */ +static vmu_object_t * +vmu_alloc_object(caddr_t key, int type) +{ + vmu_object_t *object; + + if (vmu_data.vmu_free_objects != NULL) { + object = vmu_data.vmu_free_objects; + vmu_data.vmu_free_objects = + vmu_data.vmu_free_objects->vmo_next; + } else { + object = kmem_cache_alloc(vmu_object_cache, KM_SLEEP); + } + + object->vmo_key = key; + object->vmo_type = type; + object->vmo_bounds = NULL; + + return (object); +} + +/* + * Allocate and return a bound structure. 
+ */ +static vmu_bound_t * +vmu_alloc_bound() +{ + vmu_bound_t *bound; + + if (vmu_data.vmu_free_bounds != NULL) { + bound = vmu_data.vmu_free_bounds; + vmu_data.vmu_free_bounds = + vmu_data.vmu_free_bounds->vmb_next; + bzero(bound, sizeof (vmu_bound_t)); + } else { + bound = kmem_cache_alloc(vmu_bound_cache, KM_SLEEP); + bzero(bound, sizeof (vmu_bound_t)); + } + return (bound); +} + +/* + * vmu_find_insert_* functions implement hash lookup or allocate and + * insert operations. + */ +static vmu_object_t * +vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type) +{ + int ret; + vmu_object_t *object; + + ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key, + (mod_hash_val_t *)&object); + if (ret != 0) { + object = vmu_alloc_object(key, type); + ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key, + (mod_hash_val_t)object, (mod_hash_hndl_t)0); + ASSERT(ret == 0); + } + return (object); +} + +static int +vmu_find_insert_anon(mod_hash_t *hash, caddr_t key) +{ + int ret; + caddr_t val; + + ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key, + (mod_hash_val_t *)&val); + + if (ret == 0) + return (0); + + ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key, + (mod_hash_val_t)key, (mod_hash_hndl_t)0); + + ASSERT(ret == 0); + + return (1); +} + +static vmu_entity_t * +vmu_find_insert_entity(mod_hash_t *hash, id_t id, uint_t type, id_t zoneid) +{ + int ret; + vmu_entity_t *entity; + + ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)(uintptr_t)id, + (mod_hash_val_t *)&entity); + if (ret != 0) { + entity = vmu_alloc_entity(id, type, zoneid); + ret = i_mod_hash_insert_nosync(hash, + (mod_hash_key_t)(uintptr_t)id, (mod_hash_val_t)entity, + (mod_hash_hndl_t)0); + ASSERT(ret == 0); + } + return (entity); +} + + + + +/* + * Returns list of object bounds between start and end. New bounds inserted + * by this call are given type. + * + * Returns the number of pages covered if new bounds are created. Returns 0 + * if region between start/end consists of all existing bounds. + */ +static pgcnt_t +vmu_insert_lookup_object_bounds(vmu_object_t *ro, pgcnt_t start, pgcnt_t + end, char type, vmu_bound_t **first, vmu_bound_t **last) +{ + vmu_bound_t *next; + vmu_bound_t *prev = NULL; + vmu_bound_t *tmp = NULL; + pgcnt_t ret = 0; + + *first = *last = NULL; + + for (next = ro->vmo_bounds; next != NULL; next = next->vmb_next) { + /* + * Find bounds overlapping or overlapped by range [start,end]. 
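+		 *
+		 * For example (hypothetical bounds list): with existing
+		 * bounds [0,5] and [8,9], a lookup of [3,12] keeps both
+		 * bounds and creates [6,7] and [10,12] to fill the gaps,
+		 * returning 5, the number of newly covered pages.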
+		 */
+		if (start > next->vmb_end) {
+			/* bound is before new bound */
+			prev = next;
+			continue;
+		}
+		if (next->vmb_start > end) {
+			/* bound is after new bound */
+			break;
+		}
+		if (*first == NULL)
+			*first = next;
+		*last = next;
+	}
+
+	if (*first == NULL) {
+		ASSERT(*last == NULL);
+		/*
+		 * No bounds overlapping range [start,end], so create new
+		 * bound
+		 */
+		tmp = vmu_alloc_bound();
+		tmp->vmb_start = start;
+		tmp->vmb_end = end;
+		tmp->vmb_type = type;
+		if (prev == NULL) {
+			tmp->vmb_next = ro->vmo_bounds;
+			ro->vmo_bounds = tmp;
+		} else {
+			tmp->vmb_next = prev->vmb_next;
+			prev->vmb_next = tmp;
+		}
+		*first = tmp;
+		*last = tmp;
+		ASSERT(tmp->vmb_end >= tmp->vmb_start);
+		ret = tmp->vmb_end - tmp->vmb_start + 1;
+		return (ret);
+	}
+
+	/* Check to see if start is before first known bound */
+	ASSERT(first != NULL && last != NULL);
+	next = (*first);
+	if (start < (*first)->vmb_start) {
+		/* Create new bound before first bound */
+		tmp = vmu_alloc_bound();
+		tmp->vmb_start = start;
+		tmp->vmb_end = (*first)->vmb_start - 1;
+		tmp->vmb_type = type;
+		tmp->vmb_next = *first;
+		if (*first == ro->vmo_bounds)
+			ro->vmo_bounds = tmp;
+		if (prev != NULL)
+			prev->vmb_next = tmp;
+		ASSERT(tmp->vmb_end >= tmp->vmb_start);
+		ret += tmp->vmb_end - tmp->vmb_start + 1;
+		*first = tmp;
+	}
+	/*
+	 * Between start and end, search for gaps between and after existing
+	 * bounds. Create new bounds to fill gaps if they exist.
+	 */
+	while (end > next->vmb_end) {
+		/*
+		 * Check for gap between bound and next bound. If no gap,
+		 * continue.
+		 */
+		if ((next != *last) &&
+		    ((next->vmb_end + 1) == next->vmb_next->vmb_start)) {
+			next = next->vmb_next;
+			continue;
+		}
+		/*
+		 * Insert new bound in gap after bound, and before next
+		 * bound if next bound exists.
+		 */
+		tmp = vmu_alloc_bound();
+		tmp->vmb_type = type;
+		tmp->vmb_next = next->vmb_next;
+		tmp->vmb_start = next->vmb_end + 1;
+
+		if (next != *last) {
+			tmp->vmb_end = next->vmb_next->vmb_start - 1;
+			ASSERT(tmp->vmb_end >= tmp->vmb_start);
+			ret += tmp->vmb_end - tmp->vmb_start + 1;
+			next->vmb_next = tmp;
+			next = tmp->vmb_next;
+		} else {
+			tmp->vmb_end = end;
+			ASSERT(tmp->vmb_end >= tmp->vmb_start);
+			ret += tmp->vmb_end - tmp->vmb_start + 1;
+			next->vmb_next = tmp;
+			*last = tmp;
+			break;
+		}
+	}
+	return (ret);
+}
+
+/*
+ * vmu_update_bounds()
+ *
+ * first, last:	list of continuous bounds, of which zero or more are of
+ *		type VMUSAGE_BOUND_UNKNOWN.
+ *
+ * new_first, new_last:	list of continuous bounds, of which none are of
+ *			type VMUSAGE_BOUND_UNKNOWN. These bounds are used to
+ *			update the types of bounds in (first,last) with
+ *			type VMUSAGE_BOUND_UNKNOWN.
+ *
+ * For the list of bounds (first,last), this function updates any bounds
+ * with type VMUSAGE_BOUND_UNKNOWN using the type of the corresponding bound in
+ * the list (new_first, new_last).
+ *
+ * If a bound of type VMUSAGE_BOUND_UNKNOWN spans multiple bounds in the list
+ * (new_first, new_last), it will be split into multiple bounds.
+ *
+ * Return value:
+ *	The number of pages in the list of bounds (first,last) that were of
+ *	type VMUSAGE_BOUND_UNKNOWN, which have been updated to be of type
+ *	VMUSAGE_BOUND_INCORE.
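+ *
+ * For example (hypothetical lists): if (first,last) is the single bound
+ * [0,9] UNKNOWN, and (new_first,new_last) is [0,3] INCORE followed by
+ * [4,9] NOT_INCORE, the bound is split into [0,3] INCORE and [4,9]
+ * NOT_INCORE, and 4 is returned for the pages now known to be incore.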
+ *
+ */
+static pgcnt_t
+vmu_update_bounds(vmu_bound_t **first, vmu_bound_t **last,
+    vmu_bound_t *new_first, vmu_bound_t *new_last)
+{
+	vmu_bound_t *next, *new_next, *tmp;
+	pgcnt_t rss = 0;
+
+	next = *first;
+	new_next = new_first;
+
+	/* verify bounds span same pages */
+	ASSERT((*first)->vmb_start >= new_next->vmb_start);
+	ASSERT((*last)->vmb_end <= new_last->vmb_end);
+	for (;;) {
+		/* If bound already has type, proceed to next bound */
+		if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
+			if (next == *last)
+				break;
+			next = next->vmb_next;
+			continue;
+		}
+		while (new_next->vmb_end < next->vmb_start)
+			new_next = new_next->vmb_next;
+		ASSERT(new_next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
+		next->vmb_type = new_next->vmb_type;
+		if (new_next->vmb_end < next->vmb_end) {
+			/* need to split bound */
+			tmp = vmu_alloc_bound();
+			tmp->vmb_type = VMUSAGE_BOUND_UNKNOWN;
+			tmp->vmb_start = new_next->vmb_end + 1;
+			tmp->vmb_end = next->vmb_end;
+			tmp->vmb_next = next->vmb_next;
+			next->vmb_end = new_next->vmb_end;
+			next->vmb_next = tmp;
+			if (*last == next)
+				*last = tmp;
+			if (next->vmb_type == VMUSAGE_BOUND_INCORE)
+				rss += next->vmb_end - next->vmb_start + 1;
+			next = tmp;
+		} else {
+			if (next->vmb_type == VMUSAGE_BOUND_INCORE)
+				rss += next->vmb_end - next->vmb_start + 1;
+			if (next == *last)
+				break;
+			next = next->vmb_next;
+		}
+	}
+	return (rss);
+}
+
+/*
+ * Merges adjacent bounds with same type between first and last bound.
+ * After merge, last pointer is no longer valid, as last bound may be
+ * merged away.
+ */
+static void
+vmu_merge_bounds(vmu_bound_t **first, vmu_bound_t **last)
+{
+	vmu_bound_t *next;
+	vmu_bound_t *tmp;
+
+	ASSERT(*first != NULL);
+	ASSERT(*last != NULL);
+
+	next = *first;
+	while (next != *last) {
+
+		/* If bounds are adjacent and have same type, merge them */
+		if (((next->vmb_end + 1) == next->vmb_next->vmb_start) &&
+		    (next->vmb_type == next->vmb_next->vmb_type)) {
+			tmp = next->vmb_next;
+			next->vmb_end = tmp->vmb_end;
+			next->vmb_next = tmp->vmb_next;
+			vmu_free_bound(tmp);
+			if (tmp == *last)
+				*last = next;
+		} else {
+			next = next->vmb_next;
+		}
+	}
+}
+
+/*
+ * Given an amp and a list of bounds, updates each bound's type with
+ * VMUSAGE_BOUND_INCORE or VMUSAGE_BOUND_NOT_INCORE.
+ *
+ * If a bound is partially incore, it will be split into two bounds.
+ * first and last may be modified, as bounds may be split into multiple
+ * bounds if they are partially incore/not-incore.
+ *
+ * Set incore to B_TRUE if bounds are already known to be incore.
+ *
+ */
+static void
+vmu_amp_update_incore_bounds(struct anon_map *amp, vmu_bound_t **first,
+    vmu_bound_t **last, boolean_t incore)
+{
+	vmu_bound_t *next;
+	vmu_bound_t *tmp;
+	pgcnt_t index;
+	short bound_type;
+	short page_type;
+	vnode_t *vn;
+	anoff_t off;
+	struct anon *ap;
+
+	next = *first;
+	/* Shared anon slots don't change once set */
+	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
+	for (;;) {
+		if (incore == B_TRUE)
+			next->vmb_type = VMUSAGE_BOUND_INCORE;
+
+		if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
+			if (next == *last)
+				break;
+			next = next->vmb_next;
+			continue;
+		}
+		bound_type = next->vmb_type;
+		index = next->vmb_start;
+		while (index <= next->vmb_end) {
+
+			/*
+			 * These are used to determine how much to increment
+			 * index when a large page is found.
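+			 *
+			 * For example, with 4K base pages and a 64K large
+			 * page (a hypothetical configuration), pgcnt is 16
+			 * and pgmsk is 0xf, so an index of 0x13 advances to
+			 * (0x13 & ~0xf) + 16 = 0x20, the first page past
+			 * the large page.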
+			 */
+			page_t *page;
+			pgcnt_t pgcnt = 1;
+			uint_t pgshft;
+			pgcnt_t pgmsk;
+
+			ap = anon_get_ptr(amp->ahp, index);
+			if (ap != NULL)
+				swap_xlate(ap, &vn, &off);
+
+			if (ap != NULL && vn != NULL && vn->v_pages != NULL &&
+			    (page = page_exists(vn, off)) != NULL) {
+				page_type = VMUSAGE_BOUND_INCORE;
+				if (page->p_szc > 0) {
+					pgcnt = page_get_pagecnt(page->p_szc);
+					pgshft = page_get_shift(page->p_szc);
+					pgmsk = (0x1 << (pgshft - PAGESHIFT))
+					    - 1;
+				}
+			} else {
+				page_type = VMUSAGE_BOUND_NOT_INCORE;
+			}
+			if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
+				next->vmb_type = page_type;
+			} else if (next->vmb_type != page_type) {
+				/*
+				 * if current bound type does not match page
+				 * type, need to split off new bound.
+				 */
+				tmp = vmu_alloc_bound();
+				tmp->vmb_type = page_type;
+				tmp->vmb_start = index;
+				tmp->vmb_end = next->vmb_end;
+				tmp->vmb_next = next->vmb_next;
+				next->vmb_end = index - 1;
+				next->vmb_next = tmp;
+				if (*last == next)
+					*last = tmp;
+				next = tmp;
+			}
+			if (pgcnt > 1) {
+				/*
+				 * If inside large page, jump to next large
+				 * page
+				 */
+				index = (index & ~pgmsk) + pgcnt;
+			} else {
+				index++;
+			}
+		}
+		if (next == *last) {
+			ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
+			break;
+		} else
+			next = next->vmb_next;
+	}
+	ANON_LOCK_EXIT(&amp->a_rwlock);
+}
+
+/*
+ * Same as vmu_amp_update_incore_bounds(), except for tracking
+ * incore/not-incore for vnodes.
+ */
+static void
+vmu_vnode_update_incore_bounds(vnode_t *vnode, vmu_bound_t **first,
+    vmu_bound_t **last)
+{
+	vmu_bound_t *next;
+	vmu_bound_t *tmp;
+	pgcnt_t index;
+	short bound_type;
+	short page_type;
+
+	next = *first;
+	for (;;) {
+		if (vnode->v_pages == NULL)
+			next->vmb_type = VMUSAGE_BOUND_NOT_INCORE;
+
+		if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
+			if (next == *last)
+				break;
+			next = next->vmb_next;
+			continue;
+		}
+
+		bound_type = next->vmb_type;
+		index = next->vmb_start;
+		while (index <= next->vmb_end) {
+
+			/*
+			 * These are used to determine how much to increment
+			 * index when a large page is found.
+			 */
+			page_t *page;
+			pgcnt_t pgcnt = 1;
+			uint_t pgshft;
+			pgcnt_t pgmsk;
+
+			if (vnode->v_pages != NULL &&
+			    (page = page_exists(vnode, ptob(index))) != NULL) {
+				page_type = VMUSAGE_BOUND_INCORE;
+				if (page->p_szc > 0) {
+					pgcnt = page_get_pagecnt(page->p_szc);
+					pgshft = page_get_shift(page->p_szc);
+					pgmsk = (0x1 << (pgshft - PAGESHIFT))
+					    - 1;
+				}
+			} else {
+				page_type = VMUSAGE_BOUND_NOT_INCORE;
+			}
+			if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
+				next->vmb_type = page_type;
+			} else if (next->vmb_type != page_type) {
+				/*
+				 * if current bound type does not match page
+				 * type, need to split off new bound.
+				 */
+				tmp = vmu_alloc_bound();
+				tmp->vmb_type = page_type;
+				tmp->vmb_start = index;
+				tmp->vmb_end = next->vmb_end;
+				tmp->vmb_next = next->vmb_next;
+				next->vmb_end = index - 1;
+				next->vmb_next = tmp;
+				if (*last == next)
+					*last = tmp;
+				next = tmp;
+			}
+			if (pgcnt > 1) {
+				/*
+				 * If inside large page, jump to next large
+				 * page
+				 */
+				index = (index & ~pgmsk) + pgcnt;
+			} else {
+				index++;
+			}
+		}
+		if (next == *last) {
+			ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
+			break;
+		} else
+			next = next->vmb_next;
+	}
+}
+
+/*
+ * Calculate the rss and swap consumed by a segment. vmu_entities is the
+ * list of entities to visit. For shared segments, the vnode or amp
+ * is looked up in each entity to see if it has already been counted.
+ * Private anon pages are checked per entity to ensure that cow pages are
+ * not double counted.
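+ *
+ * For example, a cow page shared by two processes in the same project
+ * is counted once for that project (the anon is remembered in
+ * vme_anon_hash), while the same page shared across two different
+ * projects is counted once for each project.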
+ *
+ * For private mapped files, first the amp is checked for private pages.
+ * Bounds not backed by the amp are looked up in the vnode for each entity
+ * to avoid double counting of private COW vnode pages.
+ */
+static void
+vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg)
+{
+	struct segvn_data *svd;
+	struct shm_data *shmd;
+	struct spt_data *sptd;
+	vmu_object_t *shared_object = NULL;
+	vmu_object_t *entity_object = NULL;
+	vmu_entity_t *entity;
+	vmusage_t *result;
+	vmu_bound_t *first = NULL;
+	vmu_bound_t *last = NULL;
+	vmu_bound_t *cur = NULL;
+	vmu_bound_t *e_first = NULL;
+	vmu_bound_t *e_last = NULL;
+	vmu_bound_t *tmp;
+	pgcnt_t p_index, s_index, p_start, p_end, s_start, s_end, rss, virt;
+	struct anon_map *private_amp = NULL;
+	boolean_t incore = B_FALSE;
+	boolean_t shared = B_FALSE;
+	int file = 0;
+	pgcnt_t swresv = 0;
+	pgcnt_t panon = 0;
+
+	/* Can zero-length segments exist? Not sure, so paranoia */
+	if (seg->s_size <= 0)
+		return;
+
+	/*
+	 * Figure out if there is a shared object (such as a named vnode or
+	 * a shared amp), then figure out if there is a private amp, which
+	 * identifies private pages.
+	 */
+	if (seg->s_ops == &segvn_ops) {
+		svd = (struct segvn_data *)seg->s_data;
+		if (svd->type == MAP_SHARED)
+			shared = B_TRUE;
+		else
+			swresv = svd->swresv;
+
+		if (svd->vp != NULL) {
+			file = 1;
+			shared_object = vmu_find_insert_object(
+			    vmu_data.vmu_all_vnodes_hash, (caddr_t)svd->vp,
+			    VMUSAGE_TYPE_VNODE);
+			s_start = btop(svd->offset);
+			s_end = btop(svd->offset + seg->s_size) - 1;
+		}
+		if (svd->amp != NULL && svd->type == MAP_SHARED) {
+			ASSERT(shared_object == NULL);
+			shared_object = vmu_find_insert_object(
+			    vmu_data.vmu_all_amps_hash, (caddr_t)svd->amp,
+			    VMUSAGE_TYPE_AMP);
+			s_start = svd->anon_index;
+			s_end = svd->anon_index + btop(seg->s_size) - 1;
+			/* schedctl mappings are always in core */
+			if (svd->amp->swresv == 0)
+				incore = B_TRUE;
+		}
+		if (svd->amp != NULL && svd->type == MAP_PRIVATE) {
+			private_amp = svd->amp;
+			p_start = svd->anon_index;
+			p_end = svd->anon_index + btop(seg->s_size) - 1;
+		}
+	} else if (seg->s_ops == &segspt_shmops) {
+		shared = B_TRUE;
+		shmd = (struct shm_data *)seg->s_data;
+		shared_object = vmu_find_insert_object(
+		    vmu_data.vmu_all_amps_hash, (caddr_t)shmd->shm_amp,
+		    VMUSAGE_TYPE_AMP);
+		s_start = 0;
+		s_end = btop(seg->s_size) - 1;
+		sptd = shmd->shm_sptseg->s_data;
+
+		/* ism segments are always incore and do not reserve swap */
+		if (sptd->spt_flags & SHM_SHARE_MMU)
+			incore = B_TRUE;
+
+	} else {
+		return;
+	}
+
+	/*
+	 * If there is a private amp, count anon pages that exist. If an
+	 * anon has a refcnt > 1 (cow sharing), then save the anon in a
+	 * hash so that it is not double counted.
+	 *
+	 * If there is also a shared object, then figure out the bounds
+	 * which are not mapped by the private amp.
+	 */
+	if (private_amp != NULL) {
+
+		/* Enter as writer to prevent cow anons from being freed */
+		ANON_LOCK_ENTER(&private_amp->a_rwlock, RW_WRITER);
+
+		p_index = p_start;
+		s_index = s_start;
+
+		while (p_index <= p_end) {
+
+			pgcnt_t p_index_next;
+			pgcnt_t p_bound_size;
+			int cnt;
+			anoff_t off;
+			struct vnode *vn;
+			struct anon *ap;
+			page_t *page;		/* For handling of large */
+			pgcnt_t pgcnt = 1;	/* pages */
+			pgcnt_t pgstart;
+			pgcnt_t pgend;
+			uint_t pgshft;
+			pgcnt_t pgmsk;
+
+			p_index_next = p_index;
+			ap = anon_get_next_ptr(private_amp->ahp,
+			    &p_index_next);
+
+			/*
+			 * If next anon is past end of mapping, simulate
+			 * end of anon so loop terminates.
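+			 *
+			 * For example (hypothetical indices): if p_index
+			 * is 90, p_end is 100, and the next anon lives at
+			 * index 120, p_index_next is clamped to 101 and ap
+			 * is cleared, so the 11-page tail not backed by
+			 * anons is still recorded (at the corresponding
+			 * s_index offsets) for lookup in the backing
+			 * object before the loop exits.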
+			 */
+			if (p_index_next > p_end) {
+				p_index_next = p_end + 1;
+				ap = NULL;
+			}
+			/*
+			 * For cow segments, keep track of bounds not
+			 * backed by private amp so they can be looked
+			 * up in the backing vnode
+			 */
+			if (p_index_next != p_index) {
+
+				/*
+				 * Compute index difference between anon and
+				 * previous anon.
+				 */
+				p_bound_size = p_index_next - p_index - 1;
+
+				if (shared_object != NULL) {
+					cur = vmu_alloc_bound();
+					cur->vmb_next = NULL;
+					cur->vmb_start = s_index;
+					cur->vmb_end = s_index + p_bound_size;
+					cur->vmb_type = VMUSAGE_BOUND_UNKNOWN;
+					if (first == NULL) {
+						first = cur;
+						last = cur;
+					} else {
+						last->vmb_next = cur;
+						last = cur;
+					}
+				}
+				p_index = p_index + p_bound_size + 1;
+				s_index = s_index + p_bound_size + 1;
+			}
+
+			/* Detect end of anons in amp */
+			if (ap == NULL)
+				break;
+
+			cnt = ap->an_refcnt;
+			swap_xlate(ap, &vn, &off);
+
+			if (vn == NULL || vn->v_pages == NULL ||
+			    (page = page_exists(vn, off)) == NULL) {
+				p_index++;
+				s_index++;
+				continue;
+			}
+
+			/*
+			 * If large page is found, compute portion of large
+			 * page in mapping, and increment indices to the next
+			 * large page.
+			 */
+			if (page->p_szc > 0) {
+
+				pgcnt = page_get_pagecnt(page->p_szc);
+				pgshft = page_get_shift(page->p_szc);
+				pgmsk = (0x1 << (pgshft - PAGESHIFT)) - 1;
+
+				/* First page in large page */
+				pgstart = p_index & ~pgmsk;
+				/* Last page in large page */
+				pgend = pgstart + pgcnt - 1;
+				/*
+				 * Artificially end page if page extends past
+				 * end of mapping.
+				 */
+				if (pgend > p_end)
+					pgend = p_end;
+
+				/*
+				 * Compute number of pages from large page
+				 * which are mapped.
+				 */
+				pgcnt = pgend - p_index + 1;
+
+				/*
+				 * Point indices at page after large page,
+				 * or at page after end of mapping.
+				 */
+				p_index += pgcnt;
+				s_index += pgcnt;
+			} else {
+				p_index++;
+				s_index++;
+			}
+
+			/*
+			 * Assume anon structs with a refcnt
+			 * of 1 are not cow shared, so there
+			 * is no reason to track them per entity.
+			 */
+			if (cnt == 1) {
+				panon += pgcnt;
+				continue;
+			}
+			for (entity = vmu_entities; entity != NULL;
+			    entity = entity->vme_next_calc) {
+
+				result = &entity->vme_result;
+				/*
+				 * Track cow anons per entity so
+				 * they are not double counted.
+				 */
+				if (vmu_find_insert_anon(entity->vme_anon_hash,
+				    (caddr_t)ap) == 0)
+					continue;
+
+				result->vmu_rss_all += (pgcnt << PAGESHIFT);
+				result->vmu_rss_private +=
+				    (pgcnt << PAGESHIFT);
+			}
+		}
+		ANON_LOCK_EXIT(&private_amp->a_rwlock);
+	}
+
+	/* Add up resident anon and swap reserved for private mappings */
+	if (swresv > 0 || panon > 0) {
+		for (entity = vmu_entities; entity != NULL;
+		    entity = entity->vme_next_calc) {
+			result = &entity->vme_result;
+			result->vmu_swap_all += swresv;
+			result->vmu_swap_private += swresv;
+			result->vmu_rss_all += (panon << PAGESHIFT);
+			result->vmu_rss_private += (panon << PAGESHIFT);
+		}
+	}
+
+	/* Compute resident pages backing shared amp or named vnode */
+	if (shared_object != NULL) {
+		if (first == NULL) {
+			/*
+			 * No private amp, or private amp has no anon
+			 * structs. This means entire segment is backed by
+			 * the shared object.
+			 */
+			first = vmu_alloc_bound();
+			first->vmb_next = NULL;
+			first->vmb_start = s_start;
+			first->vmb_end = s_end;
+			first->vmb_type = VMUSAGE_BOUND_UNKNOWN;
+		}
+		/*
+		 * Iterate bounds not backed by private amp, and compute
+		 * resident pages.
+		 */
+		cur = first;
+		while (cur != NULL) {
+
+			if (vmu_insert_lookup_object_bounds(shared_object,
+			    cur->vmb_start, cur->vmb_end, VMUSAGE_BOUND_UNKNOWN,
+			    &first, &last) > 0) {
+				/* new bounds, find incore/not-incore */
+				if (shared_object->vmo_type ==
+				    VMUSAGE_TYPE_VNODE)
+					vmu_vnode_update_incore_bounds(
+					    (vnode_t *)
+					    shared_object->vmo_key, &first,
+					    &last);
+				else
+					vmu_amp_update_incore_bounds(
+					    (struct anon_map *)
+					    shared_object->vmo_key, &first,
+					    &last, incore);
+				vmu_merge_bounds(&first, &last);
+			}
+			for (entity = vmu_entities; entity != NULL;
+			    entity = entity->vme_next_calc) {
+
+				result = &entity->vme_result;
+
+				entity_object = vmu_find_insert_object(
+				    shared_object->vmo_type ==
+				    VMUSAGE_TYPE_VNODE ? entity->vme_vnode_hash:
+				    entity->vme_amp_hash,
+				    shared_object->vmo_key,
+				    shared_object->vmo_type);
+
+				virt = vmu_insert_lookup_object_bounds(
+				    entity_object, cur->vmb_start, cur->vmb_end,
+				    VMUSAGE_BOUND_UNKNOWN, &e_first, &e_last);
+
+				if (virt == 0)
+					continue;
+				/*
+				 * Range visited for this entity
+				 */
+				rss = vmu_update_bounds(&e_first,
+				    &e_last, first, last);
+				result->vmu_rss_all += (rss << PAGESHIFT);
+				if (shared == B_TRUE && file == B_FALSE) {
+					/* shared anon mapping */
+					result->vmu_swap_all +=
+					    (virt << PAGESHIFT);
+					result->vmu_swap_shared +=
+					    (virt << PAGESHIFT);
+					result->vmu_rss_shared +=
+					    (rss << PAGESHIFT);
+				} else if (shared == B_TRUE && file == B_TRUE) {
+					/* shared file mapping */
+					result->vmu_rss_shared +=
+					    (rss << PAGESHIFT);
+				} else if (shared == B_FALSE &&
+				    file == B_TRUE) {
+					/* private file mapping */
+					result->vmu_rss_private +=
+					    (rss << PAGESHIFT);
+				}
+				vmu_merge_bounds(&e_first, &e_last);
+			}
+			tmp = cur;
+			cur = cur->vmb_next;
+			vmu_free_bound(tmp);
+		}
+	}
+}
+
+/*
+ * Based on the current calculation flags, find the entities which are
+ * relevant to the process. Then calculate each segment
+ * in the process's address space for each relevant entity.
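+ *
+ * For example (hypothetical ids): with VMUSAGE_ALL_ZONES |
+ * VMUSAGE_ALL_PROJECTS in effect, a process in zone 2, project 10
+ * contributes to two entities, the zone 2 entity and the
+ * (zone 2, project 10) entity, linked through vme_next_calc.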
+ */ +static void +vmu_calculate_proc(proc_t *p) +{ + vmu_entity_t *entities = NULL; + vmu_zone_t *zone; + vmu_entity_t *tmp; + struct as *as; + struct seg *seg; + int ret; + + /* Figure out which entities are being computed */ + if ((vmu_data.vmu_system) != NULL) { + tmp = vmu_data.vmu_system; + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & + (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS | + VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS | + VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS | + VMUSAGE_ALL_EUSERS)) { + ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash, + (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id, + (mod_hash_val_t *)&zone); + if (ret != 0) { + zone = vmu_alloc_zone(p->p_zone->zone_id); + ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash, + (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id, + (mod_hash_val_t)zone, (mod_hash_hndl_t)0); + ASSERT(ret == 0); + } + if (zone->vmz_zone != NULL) { + tmp = zone->vmz_zone; + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & + (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS)) { + tmp = vmu_find_insert_entity(zone->vmz_projects_hash, + p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS, + zone->vmz_id); + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & + (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) { + tmp = vmu_find_insert_entity(zone->vmz_tasks_hash, + p->p_task->tk_tkid, VMUSAGE_TASKS, zone->vmz_id); + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & + (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) { + tmp = vmu_find_insert_entity(zone->vmz_rusers_hash, + crgetruid(p->p_cred), VMUSAGE_RUSERS, zone->vmz_id); + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & + (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) { + tmp = vmu_find_insert_entity(zone->vmz_eusers_hash, + crgetuid(p->p_cred), VMUSAGE_EUSERS, zone->vmz_id); + tmp->vme_next_calc = entities; + entities = tmp; + } + } + /* Entities which collapse projects and users for all zones */ + if (vmu_data.vmu_calc_flags & VMUSAGE_COL_PROJECTS) { + tmp = vmu_find_insert_entity(vmu_data.vmu_projects_col_hash, + p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS, ALL_ZONES); + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & VMUSAGE_COL_RUSERS) { + tmp = vmu_find_insert_entity(vmu_data.vmu_rusers_col_hash, + crgetruid(p->p_cred), VMUSAGE_RUSERS, ALL_ZONES); + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & VMUSAGE_COL_EUSERS) { + tmp = vmu_find_insert_entity(vmu_data.vmu_eusers_col_hash, + crgetuid(p->p_cred), VMUSAGE_EUSERS, ALL_ZONES); + tmp->vme_next_calc = entities; + entities = tmp; + } + + ASSERT(entities != NULL); + /* process all segs in process's address space */ + as = p->p_as; + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + for (seg = AS_SEGFIRST(as); seg != NULL; + seg = AS_SEGNEXT(as, seg)) { + vmu_calculate_seg(entities, seg); + } + AS_LOCK_EXIT(as, &as->a_lock); +} + +/* + * Free data created by previous call to vmu_calculate(). 
+
+/*
+ * Free data created by previous call to vmu_calculate().
+ */
+static void
+vmu_clear_calc()
+{
+	if (vmu_data.vmu_system != NULL)
+		vmu_free_entity(vmu_data.vmu_system);
+	vmu_data.vmu_system = NULL;
+	if (vmu_data.vmu_zones_hash != NULL)
+		i_mod_hash_clear_nosync(vmu_data.vmu_zones_hash);
+	if (vmu_data.vmu_projects_col_hash != NULL)
+		i_mod_hash_clear_nosync(vmu_data.vmu_projects_col_hash);
+	if (vmu_data.vmu_rusers_col_hash != NULL)
+		i_mod_hash_clear_nosync(vmu_data.vmu_rusers_col_hash);
+	if (vmu_data.vmu_eusers_col_hash != NULL)
+		i_mod_hash_clear_nosync(vmu_data.vmu_eusers_col_hash);
+
+	i_mod_hash_clear_nosync(vmu_data.vmu_all_vnodes_hash);
+	i_mod_hash_clear_nosync(vmu_data.vmu_all_amps_hash);
+}
+
+/*
+ * Free unused data structures.  These can result if the system workload
+ * decreases between calculations.
+ */
+static void
+vmu_free_extra()
+{
+	vmu_bound_t *tb;
+	vmu_object_t *to;
+	vmu_entity_t *te;
+	vmu_zone_t *tz;
+
+	while (vmu_data.vmu_free_bounds != NULL) {
+		tb = vmu_data.vmu_free_bounds;
+		vmu_data.vmu_free_bounds = vmu_data.vmu_free_bounds->vmb_next;
+		kmem_cache_free(vmu_bound_cache, tb);
+	}
+	while (vmu_data.vmu_free_objects != NULL) {
+		to = vmu_data.vmu_free_objects;
+		vmu_data.vmu_free_objects =
+		    vmu_data.vmu_free_objects->vmo_next;
+		kmem_cache_free(vmu_object_cache, to);
+	}
+	while (vmu_data.vmu_free_entities != NULL) {
+		te = vmu_data.vmu_free_entities;
+		vmu_data.vmu_free_entities =
+		    vmu_data.vmu_free_entities->vme_next;
+		if (te->vme_vnode_hash != NULL)
+			mod_hash_destroy_hash(te->vme_vnode_hash);
+		if (te->vme_amp_hash != NULL)
+			mod_hash_destroy_hash(te->vme_amp_hash);
+		if (te->vme_anon_hash != NULL)
+			mod_hash_destroy_hash(te->vme_anon_hash);
+		kmem_free(te, sizeof (vmu_entity_t));
+	}
+	while (vmu_data.vmu_free_zones != NULL) {
+		tz = vmu_data.vmu_free_zones;
+		vmu_data.vmu_free_zones =
+		    vmu_data.vmu_free_zones->vmz_next;
+		if (tz->vmz_projects_hash != NULL)
+			mod_hash_destroy_hash(tz->vmz_projects_hash);
+		if (tz->vmz_tasks_hash != NULL)
+			mod_hash_destroy_hash(tz->vmz_tasks_hash);
+		if (tz->vmz_rusers_hash != NULL)
+			mod_hash_destroy_hash(tz->vmz_rusers_hash);
+		if (tz->vmz_eusers_hash != NULL)
+			mod_hash_destroy_hash(tz->vmz_eusers_hash);
+		kmem_free(tz, sizeof (vmu_zone_t));
+	}
+}
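Bounds, objects, entities, and zone structures cleared between scans land on the vmu_free_* lists so the next scan can reuse them; vmu_free_extra() trims whatever the latest scan did not consume. A minimal user-level sketch of the same free-list caching pattern (hypothetical node type, not from this patch):

	#include <stdlib.h>

	/* A node that can sit on a singly linked free list. */
	struct node {
		struct node *next;
		/* ...payload... */
	};

	static struct node *free_list;	/* cached, currently unused nodes */

	/* Allocate, preferring a cached node over a fresh malloc(). */
	static struct node *
	node_alloc(void)
	{
		struct node *n = free_list;

		if (n != NULL)
			free_list = n->next;
		else
			n = malloc(sizeof (*n));
		return (n);
	}

	/* Return a node to the cache instead of freeing it. */
	static void
	node_rele(struct node *n)
	{
		n->next = free_list;
		free_list = n;
	}

	/* Trim the cache when the workload shrinks (cf. vmu_free_extra()). */
	static void
	node_trim(void)
	{
		struct node *n;

		while ((n = free_list) != NULL) {
			free_list = n->next;
			free(n);
		}
	}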
+
+extern kcondvar_t *pr_pid_cv;
+
+/*
+ * Determine which entity types are relevant and allocate the hashes to
+ * track them.  Then walk the process table and count rss and swap
+ * for each process's address space.  Address space objects such as
+ * vnodes, amps, and anons are tracked per entity, so that they are
+ * not double counted in the results.
+ */
+static void
+vmu_calculate()
+{
+	int i = 0;
+	int ret;
+	proc_t *p;
+
+	vmu_clear_calc();
+
+	if (vmu_data.vmu_calc_flags & VMUSAGE_SYSTEM)
+		vmu_data.vmu_system = vmu_alloc_entity(0, VMUSAGE_SYSTEM,
+		    ALL_ZONES);
+
+	/*
+	 * Walk process table and calculate rss of each proc.
+	 *
+	 * pidlock and p_lock cannot be held while doing the rss calculation.
+	 * This is because:
+	 * 1.  The calculation allocates using KM_SLEEP.
+	 * 2.  The calculation grabs a_lock, which cannot be grabbed
+	 *     after p_lock.
+	 *
+	 * Since pidlock must be dropped, we cannot simply walk the
+	 * practive list.  Instead, we walk the process table, and sprlock
+	 * each process to ensure that it does not exit during the
+	 * calculation.
+	 */
+
+	mutex_enter(&pidlock);
+	for (i = 0; i < v.v_proc; i++) {
+again:
+		p = pid_entry(i);
+		if (p == NULL)
+			continue;
+
+		mutex_enter(&p->p_lock);
+		mutex_exit(&pidlock);
+
+		if (panicstr) {
+			mutex_exit(&p->p_lock);
+			return;
+		}
+
+		/* Try to set P_PR_LOCK */
+		ret = sprtrylock_proc(p);
+		if (ret == -1) {
+			/* Process in invalid state */
+			mutex_exit(&p->p_lock);
+			mutex_enter(&pidlock);
+			continue;
+		} else if (ret == 1) {
+			/*
+			 * P_PR_LOCK is already set.  Wait and try again.
+			 * This also drops p_lock.
+			 */
+			sprwaitlock_proc(p);
+			mutex_enter(&pidlock);
+			goto again;
+		}
+		mutex_exit(&p->p_lock);
+
+		vmu_calculate_proc(p);
+
+		mutex_enter(&p->p_lock);
+		sprunlock(p);
+		mutex_enter(&pidlock);
+	}
+	mutex_exit(&pidlock);
+
+	vmu_free_extra();
+}
+
+/*
+ * Allocate a new cache for N results satisfying flags.
+ */
+vmu_cache_t *
+vmu_cache_alloc(size_t nres, uint_t flags)
+{
+	vmu_cache_t *cache;
+
+	cache = kmem_zalloc(sizeof (vmu_cache_t), KM_SLEEP);
+	cache->vmc_results = kmem_zalloc(sizeof (vmusage_t) * nres, KM_SLEEP);
+	cache->vmc_nresults = nres;
+	cache->vmc_flags = flags;
+	cache->vmc_refcnt = 1;
+	return (cache);
+}
+
+/*
+ * Make sure cached results are not freed.
+ */
+static void
+vmu_cache_hold(vmu_cache_t *cache)
+{
+	ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
+	cache->vmc_refcnt++;
+}
+
+/*
+ * Free cache data.
+ */
+static void
+vmu_cache_rele(vmu_cache_t *cache)
+{
+	ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
+	ASSERT(cache->vmc_refcnt > 0);
+	cache->vmc_refcnt--;
+	if (cache->vmc_refcnt == 0) {
+		kmem_free(cache->vmc_results, sizeof (vmusage_t) *
+		    cache->vmc_nresults);
+		kmem_free(cache, sizeof (vmu_cache_t));
+	}
+}
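The refcount on vmu_cache_t exists so vm_getusage() (below) can copy results out to userland without holding vmu_lock across a fault-prone copyout(): hold the cache under the lock, drop the lock for the copy, then re-enter and release. A self-contained user-level analogue of that snapshot-pinning pattern, using pthreads instead of kmutex and hypothetical names (it assumes a cache has already been published):

	#include <pthread.h>
	#include <stdlib.h>

	/* Illustrative user-level analogue of vmu_cache_t. */
	struct cache {
		int *results;
		size_t nresults;
		int refcnt;		/* protected by cache_lock */
	};

	static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;
	static struct cache *cur_cache;

	static void
	cache_hold(struct cache *c)	/* caller holds cache_lock */
	{
		c->refcnt++;
	}

	static void
	cache_rele(struct cache *c)	/* caller holds cache_lock */
	{
		if (--c->refcnt == 0) {
			free(c->results);
			free(c);
		}
	}

	/* Copy a stable snapshot without holding the lock during the copy. */
	static size_t
	cache_read(int *buf, size_t n)
	{
		struct cache *c;
		size_t i, count;

		pthread_mutex_lock(&cache_lock);
		c = cur_cache;
		cache_hold(c);		/* pin: a publisher may swap caches */
		pthread_mutex_unlock(&cache_lock);

		count = (n < c->nresults) ? n : c->nresults;
		for (i = 0; i < count; i++)	/* slow copy, lock not held */
			buf[i] = c->results[i];

		pthread_mutex_lock(&cache_lock);
		cache_rele(c);		/* frees it if the publisher moved on */
		pthread_mutex_unlock(&cache_lock);
		return (count);
	}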
+
+/*
+ * Copy out the cached results to a caller.  Inspect the caller's flags
+ * and zone to determine which cached results should be copied.
+ */
+static int
+vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
+    uint_t flags)
+{
+	vmusage_t *result, *out_result;
+	vmusage_t dummy;
+	size_t i, count = 0;
+	size_t bufsize;
+	int ret = 0;
+	uint_t types = 0;
+
+	if (nres != NULL) {
+		if (copyin((caddr_t)nres, &bufsize, sizeof (size_t)))
+			return (set_errno(EFAULT));
+	} else {
+		bufsize = 0;
+	}
+
+	/* Figure out which results the caller is interested in. */
+	if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone)
+		types |= VMUSAGE_SYSTEM;
+	if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES))
+		types |= VMUSAGE_ZONE;
+	if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
+	    VMUSAGE_COL_PROJECTS))
+		types |= VMUSAGE_PROJECTS;
+	if (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
+		types |= VMUSAGE_TASKS;
+	if (flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS))
+		types |= VMUSAGE_RUSERS;
+	if (flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS))
+		types |= VMUSAGE_EUSERS;
+
+	/* Count and copy out results for the caller's zone. */
+	out_result = buf;
+	for (result = cache->vmc_results, i = 0;
+	    i < cache->vmc_nresults; result++, i++) {
+
+		/* Do not return "other-zone" results to non-global zones */
+		if (curproc->p_zone != global_zone &&
+		    curproc->p_zone->zone_id != result->vmu_zoneid)
+			continue;
+
+		/*
+		 * If a non-global zone requested VMUSAGE_SYSTEM, fake
+		 * up the VMUSAGE_ZONE result as a VMUSAGE_SYSTEM result.
+		 */
+		if (curproc->p_zone != global_zone &&
+		    (flags & VMUSAGE_SYSTEM) != 0 &&
+		    result->vmu_type == VMUSAGE_ZONE) {
+			count++;
+			if (out_result != NULL) {
+				if (bufsize < count) {
+					ret = set_errno(EOVERFLOW);
+				} else {
+					dummy = *result;
+					dummy.vmu_zoneid = ALL_ZONES;
+					dummy.vmu_id = 0;
+					dummy.vmu_type = VMUSAGE_SYSTEM;
+					if (copyout(&dummy, out_result,
+					    sizeof (vmusage_t)))
+						return (set_errno(EFAULT));
+					out_result++;
+				}
+			}
+		}
+
+		/* Skip results that do not match requested type */
+		if ((result->vmu_type & types) == 0)
+			continue;
+
+		/* Skip collated results if not requested */
+		if (result->vmu_zoneid == ALL_ZONES) {
+			if (result->vmu_type == VMUSAGE_PROJECTS &&
+			    (flags & VMUSAGE_COL_PROJECTS) == 0)
+				continue;
+			if (result->vmu_type == VMUSAGE_EUSERS &&
+			    (flags & VMUSAGE_COL_EUSERS) == 0)
+				continue;
+			if (result->vmu_type == VMUSAGE_RUSERS &&
+			    (flags & VMUSAGE_COL_RUSERS) == 0)
+				continue;
+		}
+
+		/* Skip "other zone" results if not requested */
+		if (result->vmu_zoneid != curproc->p_zone->zone_id) {
+			if (result->vmu_type == VMUSAGE_ZONE &&
+			    (flags & VMUSAGE_ALL_ZONES) == 0)
+				continue;
+			if (result->vmu_type == VMUSAGE_PROJECTS &&
+			    (flags & (VMUSAGE_ALL_PROJECTS |
+			    VMUSAGE_COL_PROJECTS)) == 0)
+				continue;
+			if (result->vmu_type == VMUSAGE_TASKS &&
+			    (flags & VMUSAGE_ALL_TASKS) == 0)
+				continue;
+			if (result->vmu_type == VMUSAGE_RUSERS &&
+			    (flags & (VMUSAGE_ALL_RUSERS |
+			    VMUSAGE_COL_RUSERS)) == 0)
+				continue;
+			if (result->vmu_type == VMUSAGE_EUSERS &&
+			    (flags & (VMUSAGE_ALL_EUSERS |
+			    VMUSAGE_COL_EUSERS)) == 0)
+				continue;
+		}
+		count++;
+		if (out_result != NULL) {
+			if (bufsize < count) {
+				ret = set_errno(EOVERFLOW);
+			} else {
+				if (copyout(result, out_result,
+				    sizeof (vmusage_t)))
+					return (set_errno(EFAULT));
+				out_result++;
+			}
+		}
+	}
+	if (nres != NULL)
+		if (copyout(&count, (void *)nres, sizeof (size_t)))
+			return (set_errno(EFAULT));
+
+	return (ret);
+}
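Given the count/EOVERFLOW protocol above, the natural caller-side idiom is two calls: one with buf == NULL to learn the result count, then one with a right-sized buffer. A sketch using the getvmusage(2) wrapper delivered with this project (the wrapper itself is outside this uts diff), run from the global zone:

	#include <sys/vm_usage.h>
	#include <errno.h>
	#include <stdio.h>
	#include <stdlib.h>

	int
	main(void)
	{
		size_t nres = 0;
		vmusage_t *buf;

		/* First call: buf == NULL, only the result count comes back. */
		if (getvmusage(VMUSAGE_ALL_ZONES, 30, NULL, &nres) != 0) {
			perror("getvmusage");
			return (1);
		}

		buf = calloc(nres, sizeof (vmusage_t));
		if (buf == NULL)
			return (1);

		/* Second call: nres is buffer size in, result count out. */
		if (getvmusage(VMUSAGE_ALL_ZONES, 30, buf, &nres) != 0 &&
		    errno != EOVERFLOW)	/* a zone may have booted in between */
			return (1);

		(void) printf("%zu results\n", nres);
		free(buf);
		return (0);
	}

With age set to 30, the second call will typically be satisfied from the cache populated by the first, without a second process-table scan.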
+
+/*
+ * vm_getusage()
+ *
+ * Counts rss and swap by zone, project, task, and/or user.  The flags argument
+ * determines the type of results structures returned.  Flags requesting
+ * results from more than one zone are "flattened" to the local zone if the
+ * caller is not the global zone.
+ *
+ * args:
+ *	flags:	bitmap consisting of one or more of VMUSAGE_*.
+ *	age:	maximum allowable age (time since counting was done) of the
+ *		results, in seconds.  Results from previous callers are
+ *		cached in the kernel.
+ *	buf:	pointer to buffer array of vmusage_t.  If NULL, then only
+ *		the number of results is set in nres on success.
+ *	nres:	Before the call, set to the number of vmusage_t structures
+ *		that buf points to.  On return 0 (success) or EOVERFLOW,
+ *		set to the number of result structures returned or
+ *		attempted to be returned.
+ *
+ * returns 0 on success, -1 on failure:
+ *	EINTR (interrupted)
+ *	EOVERFLOW (nres too small for results; nres set to the value needed
+ *	    for success)
+ *	EINVAL (flags invalid)
+ *	EFAULT (bad address for buf or nres)
+ */
+int
+vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres)
+{
+	vmu_entity_t *entity;
+	vmusage_t *result;
+	int ret = 0;
+	int cacherecent = 0;
+	hrtime_t now;
+	uint_t flags_orig;
+
+	/*
+	 * Non-global zones cannot request system-wide and/or collated
+	 * results, or the system result, so munge the flags accordingly.
+	 */
+	flags_orig = flags;
+	if (curproc->p_zone != global_zone) {
+		if (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) {
+			flags &= ~(VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS);
+			flags |= VMUSAGE_PROJECTS;
+		}
+		if (flags & (VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) {
+			flags &= ~(VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS);
+			flags |= VMUSAGE_RUSERS;
+		}
+		if (flags & (VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) {
+			flags &= ~(VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS);
+			flags |= VMUSAGE_EUSERS;
+		}
+		if (flags & VMUSAGE_SYSTEM) {
+			flags &= ~VMUSAGE_SYSTEM;
+			flags |= VMUSAGE_ZONE;
+		}
+	}
+
+	/* Check for unknown flags */
+	if ((flags & (~VMUSAGE_MASK)) != 0)
+		return (set_errno(EINVAL));
+
+	/* Check for no flags */
+	if ((flags & VMUSAGE_MASK) == 0)
+		return (set_errno(EINVAL));
+
+	mutex_enter(&vmu_data.vmu_lock);
+	now = gethrtime();
+
+start:
+	if (vmu_data.vmu_cache != NULL) {
+
+		vmu_cache_t *cache;
+
+		if ((vmu_data.vmu_cache->vmc_timestamp +
+		    ((hrtime_t)age * NANOSEC)) > now)
+			cacherecent = 1;
+
+		if ((vmu_data.vmu_cache->vmc_flags & flags) == flags &&
+		    cacherecent == 1) {
+			cache = vmu_data.vmu_cache;
+			vmu_cache_hold(cache);
+			mutex_exit(&vmu_data.vmu_lock);
+
+			ret = vmu_copyout_results(cache, buf, nres, flags_orig);
+			mutex_enter(&vmu_data.vmu_lock);
+			vmu_cache_rele(cache);
+			if (vmu_data.vmu_pending_waiters > 0)
+				cv_broadcast(&vmu_data.vmu_cv);
+			mutex_exit(&vmu_data.vmu_lock);
+			return (ret);
+		}
+		/*
+		 * If the cache is recent, it is likely that there are other
+		 * consumers of vm_getusage running, so add their flags to the
+		 * desired flags for the calculation.
+		 */
+		if (cacherecent == 1)
+			flags = vmu_data.vmu_cache->vmc_flags | flags;
+	}
+	if (vmu_data.vmu_calc_thread == NULL) {
+
+		vmu_cache_t *cache;
+
+		vmu_data.vmu_calc_thread = curthread;
+		vmu_data.vmu_calc_flags = flags;
+		vmu_data.vmu_entities = NULL;
+		vmu_data.vmu_nentities = 0;
+		if (vmu_data.vmu_pending_waiters > 0)
+			vmu_data.vmu_calc_flags |=
+			    vmu_data.vmu_pending_flags;
+
+		vmu_data.vmu_pending_flags = 0;
+		mutex_exit(&vmu_data.vmu_lock);
+		vmu_calculate();
+		mutex_enter(&vmu_data.vmu_lock);
+		/* Copy results to cache. */
+		if (vmu_data.vmu_cache != NULL)
+			vmu_cache_rele(vmu_data.vmu_cache);
+		cache = vmu_data.vmu_cache =
+		    vmu_cache_alloc(vmu_data.vmu_nentities,
+		    vmu_data.vmu_calc_flags);
+
+		result = cache->vmc_results;
+		for (entity = vmu_data.vmu_entities; entity != NULL;
+		    entity = entity->vme_next) {
+			*result = entity->vme_result;
+			result++;
+		}
+		cache->vmc_timestamp = gethrtime();
+		vmu_cache_hold(cache);
+
+		vmu_data.vmu_calc_flags = 0;
+		vmu_data.vmu_calc_thread = NULL;
+
+		if (vmu_data.vmu_pending_waiters > 0)
+			cv_broadcast(&vmu_data.vmu_cv);
+
+		mutex_exit(&vmu_data.vmu_lock);
+
+		/* Copy out the results from the new cache. */
+		ret = vmu_copyout_results(cache, buf, nres, flags_orig);
+		mutex_enter(&vmu_data.vmu_lock);
+		vmu_cache_rele(cache);
+		mutex_exit(&vmu_data.vmu_lock);
+
+		return (ret);
+	}
+	vmu_data.vmu_pending_flags |= flags;
+	vmu_data.vmu_pending_waiters++;
+	while (vmu_data.vmu_calc_thread != NULL) {
+		if (cv_wait_sig(&vmu_data.vmu_cv,
+		    &vmu_data.vmu_lock) == 0) {
+			vmu_data.vmu_pending_waiters--;
+			mutex_exit(&vmu_data.vmu_lock);
+			return (set_errno(EINTR));
+		}
+	}
+	vmu_data.vmu_pending_waiters--;
+	goto start;
+}
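Because results are cached with a timestamp, cooperating consumers that can tolerate slightly stale data should pass a nonzero age so they share one process-table scan instead of each forcing a recalculation. A sketch of a per-zone monitoring loop under that assumption (getvmusage(2) wrapper again assumed; MAX_ZONES is an arbitrary cap chosen for this sketch):

	#include <sys/vm_usage.h>
	#include <errno.h>
	#include <stdio.h>
	#include <unistd.h>

	#define	MAX_ZONES	64

	int
	main(void)
	{
		vmusage_t res[MAX_ZONES];

		for (;;) {
			size_t nres = MAX_ZONES;
			size_t i;

			/* age == 5: reuse any scan newer than 5 seconds. */
			if (getvmusage(VMUSAGE_ALL_ZONES, 5, res, &nres) != 0) {
				if (errno == EOVERFLOW) {
					(void) fprintf(stderr,
					    "need room for %lu results\n",
					    (unsigned long)nres);
					return (1);
				}
				perror("getvmusage");
				return (1);
			}
			for (i = 0; i < nres; i++)
				(void) printf("zone %d: rss %llu swap %llu\n",
				    (int)res[i].vmu_id,
				    (unsigned long long)res[i].vmu_rss_all,
				    (unsigned long long)res[i].vmu_swap_all);
			(void) sleep(5);
		}
		/* NOTREACHED */
	}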
