Diffstat (limited to 'usr/src/uts')
35 files changed, 2984 insertions, 159 deletions
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index 32a63d6c22..b2bbcbc8c3 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -334,6 +334,7 @@ GENUNIX_OBJS += \ vm_seg.o \ vm_subr.o \ vm_swap.o \ + vm_usage.o \ vnode.o \ vuid_queue.o \ vuid_store.o \ diff --git a/usr/src/uts/common/disp/priocntl.c b/usr/src/uts/common/disp/priocntl.c index 3bb90cf1fa..9197dc815b 100644 --- a/usr/src/uts/common/disp/priocntl.c +++ b/usr/src/uts/common/disp/priocntl.c @@ -136,6 +136,7 @@ priocntl_common(int pc_version, procset_t *psp, int cmd, caddr_t arg, struct pcmpargs pcmpargs; pc_vaparms_t vaparms; char clname[PC_CLNMSZ]; + char *outstr; int count; kthread_id_t retthreadp; proc_t *initpp; @@ -145,6 +146,7 @@ priocntl_common(int pc_version, procset_t *psp, int cmd, caddr_t arg, int rv = 0; pid_t saved_pid; id_t classid; + int size; int (*copyinfn)(const void *, void *, size_t); int (*copyoutfn)(const void *, void *, size_t); @@ -692,6 +694,21 @@ priocntl_common(int pc_version, procset_t *psp, int cmd, caddr_t arg, ASSERT(defaultcid > 0 && defaultcid < loaded_classes); break; + case PC_GETDFLCL: + mutex_enter(&class_lock); + + if (defaultcid >= loaded_classes) + outstr = ""; + else + outstr = sclass[defaultcid].cl_name; + size = strlen(outstr) + 1; + if (arg != NULL) + if ((*copyoutfn)(outstr, arg, size) != 0) + error = EFAULT; + + mutex_exit(&class_lock); + break; + default: error = EINVAL; break; diff --git a/usr/src/uts/common/fs/tmpfs/tmp_tnode.c b/usr/src/uts/common/fs/tmpfs/tmp_tnode.c index 5a7000c242..c5145cccf0 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_tnode.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_tnode.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -67,6 +66,7 @@ tmp_resv( int pagecreate) /* call anon_resv if set */ { pgcnt_t pages = btopr(delta); + zone_t *zone; ASSERT(RW_WRITE_HELD(&tp->tn_rwlock)); ASSERT(tp->tn_type == VREG); @@ -79,9 +79,10 @@ tmp_resv( * * Deny if trying to reserve more than tmpfs can allocate */ + zone = tm->tm_vfsp->vfs_zone; if (pagecreate && ((tm->tm_anonmem + pages > tm->tm_anonmax) || - (!anon_checkspace(ptob(pages + tmpfs_minfree))) || - (anon_resv(delta) == 0))) { + (!anon_checkspace(ptob(pages + tmpfs_minfree), zone)) || + (anon_resv_zone(delta, zone) == 0))) { return (1); } @@ -114,7 +115,7 @@ tmp_unresv( ASSERT(RW_WRITE_HELD(&tp->tn_rwlock)); ASSERT(tp->tn_type == VREG); - anon_unresv(delta); + anon_unresv_zone(delta, tm->tm_vfsp->vfs_zone); mutex_enter(&tm->tm_contents); tm->tm_anonmem -= btopr(delta); diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c index d623dce3f7..aa870b124a 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c @@ -215,9 +215,26 @@ wrtmp( if (delta > 0) { pagecreate = 1; if (tmp_resv(tm, tp, delta, pagecreate)) { - cmn_err(CE_WARN, - "%s: File system full, swap space limit exceeded", + /* + * Log file system full in the zone that owns + * the tmpfs mount, as well as in the global + * zone if necessary. + */ + zcmn_err(tm->tm_vfsp->vfs_zone->zone_id, + CE_WARN, "%s: File system full, " + "swap space limit exceeded", tm->tm_mntpath); + + if (tm->tm_vfsp->vfs_zone->zone_id != + GLOBAL_ZONEID) { + + vfs_t *vfs = tm->tm_vfsp; + + zcmn_err(GLOBAL_ZONEID, + CE_WARN, "%s: File system full, " + "swap space limit exceeded", + vfs->vfs_vnodecovered->v_path); + } error = ENOSPC; break; } diff --git a/usr/src/uts/common/os/modhash.c b/usr/src/uts/common/os/modhash.c index 19700ce685..3c63231253 100644 --- a/usr/src/uts/common/os/modhash.c +++ b/usr/src/uts/common/os/modhash.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -165,15 +164,6 @@ */ #define MH_KEYCMP(hash, key1, key2) ((hash->mh_keycmp)(key1, key2)) -static void i_mod_hash_clear_nosync(mod_hash_t *); -static int i_mod_hash_find_nosync(mod_hash_t *, mod_hash_key_t, - mod_hash_val_t *); -static int i_mod_hash_insert_nosync(mod_hash_t *, mod_hash_key_t, - mod_hash_val_t, mod_hash_hndl_t); -static int i_mod_hash_remove_nosync(mod_hash_t *, mod_hash_key_t, - mod_hash_val_t *); -static uint_t i_mod_hash(mod_hash_t *, mod_hash_key_t); - /* * Cache for struct mod_hash_entry */ @@ -522,7 +512,7 @@ mod_hash_destroy_hash(mod_hash_t *hash) * i_mod_hash() * Call the hashing algorithm for this hash table, with the given key. */ -static uint_t +uint_t i_mod_hash(mod_hash_t *hash, mod_hash_key_t key) { uint_t h; @@ -778,7 +768,7 @@ mod_hash_destroy(mod_hash_t *hash, mod_hash_key_t key) * mod_hash_find() * Find a value in the hash table corresponding to the given key. 
*/ -static int +int i_mod_hash_find_nosync(mod_hash_t *hash, mod_hash_key_t key, mod_hash_val_t *val) { @@ -826,7 +816,7 @@ mod_hash_find_cb(mod_hash_t *hash, mod_hash_key_t key, mod_hash_val_t *val, return (res); } -static void +void i_mod_hash_walk_nosync(mod_hash_t *hash, uint_t (*callback)(mod_hash_key_t, mod_hash_val_t *, void *), void *arg) { @@ -870,7 +860,7 @@ mod_hash_walk(mod_hash_t *hash, * Clears the given hash table by calling the destructor of every hash * element and freeing up all mod_hash_entry's. */ -static void +void i_mod_hash_clear_nosync(mod_hash_t *hash) { int i; diff --git a/usr/src/uts/common/os/pid.c b/usr/src/uts/common/os/pid.c index 88b0258afe..fecc4a6c45 100644 --- a/usr/src/uts/common/os/pid.c +++ b/usr/src/uts/common/os/pid.c @@ -385,6 +385,56 @@ pgfind(pid_t pgid) } /* + * Sets P_PR_LOCK on a non-system process. Process must be fully created + * and not exiting to succeed. + * + * Returns 0 on success. + * Returns 1 if P_PR_LOCK is set. + * Returns -1 if proc is in invalid state. + */ +int +sprtrylock_proc(proc_t *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + + /* skip system and incomplete processes */ + if (p->p_stat == SIDL || p->p_stat == SZOMB || + (p->p_flag & (SSYS | SEXITING | SEXITLWPS))) { + return (-1); + } + + if (p->p_proc_flag & P_PR_LOCK) + return (1); + + p->p_proc_flag |= P_PR_LOCK; + THREAD_KPRI_REQUEST(); + + return (0); +} + +/* + * Wait for P_PR_LOCK to become clear. Returns with p_lock dropped, + * and the proc pointer no longer valid, as the proc may have exited. + */ +void +sprwaitlock_proc(proc_t *p) +{ + kmutex_t *mp; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(p->p_proc_flag & P_PR_LOCK); + + /* + * p_lock is persistent, but p itself is not -- it could + * vanish during cv_wait(). Load p->p_lock now so we can + * drop it after cv_wait() without referencing p. + */ + mp = &p->p_lock; + cv_wait(&pr_pid_cv[p->p_slot], mp); + mutex_exit(mp); +} + +/* * If pid exists, find its proc, acquire its p_lock and mark it P_PR_LOCK. * Returns the proc pointer on success, NULL on failure. sprlock() is * really just a stripped-down version of pr_p_lock() to allow practive @@ -394,7 +444,7 @@ proc_t * sprlock_zone(pid_t pid, zoneid_t zoneid) { proc_t *p; - kmutex_t *mp; + int ret; for (;;) { mutex_enter(&pidlock); @@ -402,31 +452,21 @@ sprlock_zone(pid_t pid, zoneid_t zoneid) mutex_exit(&pidlock); return (NULL); } - /* - * p_lock is persistent, but p itself is not -- it could - * vanish during cv_wait(). Load p->p_lock now so we can - * drop it after cv_wait() without referencing p. - */ - mp = &p->p_lock; - mutex_enter(mp); + mutex_enter(&p->p_lock); mutex_exit(&pidlock); - /* - * If the process is in some half-baked state, fail. 
- */ - if (p->p_stat == SZOMB || p->p_stat == SIDL || - (p->p_flag & (SEXITING | SEXITLWPS))) { - mutex_exit(mp); - return (NULL); - } + if (panicstr) return (p); - if (!(p->p_proc_flag & P_PR_LOCK)) + + ret = sprtrylock_proc(p); + if (ret == -1) { + mutex_exit(&p->p_lock); + return (NULL); + } else if (ret == 0) { break; - cv_wait(&pr_pid_cv[p->p_slot], mp); - mutex_exit(mp); + } + sprwaitlock_proc(p); } - p->p_proc_flag |= P_PR_LOCK; - THREAD_KPRI_REQUEST(); return (p); } diff --git a/usr/src/uts/common/os/pool.c b/usr/src/uts/common/os/pool.c index ceb90850fa..818bb54701 100644 --- a/usr/src/uts/common/os/pool.c +++ b/usr/src/uts/common/os/pool.c @@ -293,6 +293,8 @@ pool_enable(void) (void) nvlist_add_string(pool_sys_prop, "system.comment", ""); (void) nvlist_add_int64(pool_sys_prop, "system.version", 1); (void) nvlist_add_byte(pool_sys_prop, "system.bind-default", 1); + (void) nvlist_add_string(pool_sys_prop, "system.poold.objectives", + "wt-load"); (void) nvlist_alloc(&pool_default->pool_props, NV_UNIQUE_NAME, KM_SLEEP); @@ -1309,7 +1311,7 @@ pool_do_bind(pool_t *pool, idtype_t idtype, id_t id, int flags) } if (idtype == P_PROJID) { - kpj = project_hold_by_id(id, GLOBAL_ZONEID, PROJECT_HOLD_FIND); + kpj = project_hold_by_id(id, global_zone, PROJECT_HOLD_FIND); if (kpj == NULL) return (ESRCH); mutex_enter(&kpj->kpj_poolbind); diff --git a/usr/src/uts/common/os/project.c b/usr/src/uts/common/os/project.c index 6c266c0ca3..d75b60f6e9 100644 --- a/usr/src/uts/common/os/project.c +++ b/usr/src/uts/common/os/project.c @@ -29,6 +29,7 @@ #include <sys/modhash.h> #include <sys/modctl.h> #include <sys/kmem.h> +#include <sys/kstat.h> #include <sys/atomic.h> #include <sys/cmn_err.h> #include <sys/proc.h> @@ -103,6 +104,8 @@ struct project_zone { * acquired, the hash lock is to be acquired first. */ +static kstat_t *project_kstat_create(kproject_t *pj, zone_t *zone); +static void project_kstat_delete(kproject_t *pj); static void project_data_init(kproject_data_t *data) @@ -118,6 +121,7 @@ project_data_init(kproject_data_t *data) data->kpd_locked_mem_ctl = UINT64_MAX; data->kpd_contract = 0; data->kpd_crypto_mem = 0; + data->kpd_lockedmem_kstat = NULL; } /*ARGSUSED*/ @@ -179,11 +183,11 @@ project_hold(kproject_t *p) } /* - * kproject_t *project_hold_by_id(projid_t, zoneid_t, int) + * kproject_t *project_hold_by_id(projid_t, zone_t *, int) * * Overview * project_hold_by_id() performs a look-up in the dictionary of projects - * active on the system by specified project ID + zone ID and puts a hold on + * active on the system by specified project ID + zone and puts a hold on * it. The third argument defines the desired behavior in the case when * project with given project ID cannot be found: * @@ -202,7 +206,7 @@ project_hold(kproject_t *p) * Caller must be in a context suitable for KM_SLEEP allocations. 
*/ kproject_t * -project_hold_by_id(projid_t id, zoneid_t zoneid, int flag) +project_hold_by_id(projid_t id, zone_t *zone, int flag) { kproject_t *spare_p; kproject_t *p; @@ -211,9 +215,11 @@ project_hold_by_id(projid_t id, zoneid_t zoneid, int flag) rctl_alloc_gp_t *gp; rctl_entity_p_t e; struct project_zone pz; + boolean_t create = B_FALSE; + kstat_t *ksp; pz.kpj_id = id; - pz.kpj_zoneid = zoneid; + pz.kpj_zoneid = zone->zone_id; if (flag == PROJECT_HOLD_FIND) { mutex_enter(&project_hash_lock); @@ -241,9 +247,10 @@ project_hold_by_id(projid_t id, zoneid_t zoneid, int flag) mutex_enter(&project_hash_lock); if (mod_hash_find(projects_hash, (mod_hash_key_t)&pz, (mod_hash_val_t *)&p) == MH_ERR_NOTFOUND) { + p = spare_p; p->kpj_id = id; - p->kpj_zoneid = zoneid; + p->kpj_zoneid = zone->zone_id; p->kpj_count = 0; p->kpj_shares = 1; p->kpj_nlwps = 0; @@ -265,7 +272,7 @@ project_hold_by_id(projid_t id, zoneid_t zoneid, int flag) * Insert project into global project list. */ mutex_enter(&projects_list_lock); - if (id != 0 || zoneid != GLOBAL_ZONEID) { + if (id != 0 || zone != &zone0) { p->kpj_next = projects_list; p->kpj_prev = projects_list->kpj_prev; p->kpj_prev->kpj_next = p; @@ -279,6 +286,7 @@ project_hold_by_id(projid_t id, zoneid_t zoneid, int flag) projects_list = p; } mutex_exit(&projects_list_lock); + create = B_TRUE; } else { mutex_exit(&curproc->p_lock); mod_hash_cancel(projects_hash, &hndl); @@ -290,10 +298,20 @@ project_hold_by_id(projid_t id, zoneid_t zoneid, int flag) p->kpj_count++; mutex_exit(&project_hash_lock); + /* + * The kstat stores the project's zone name, as zoneid's may change + * across reboots. + */ + if (create == B_TRUE) { + ksp = project_kstat_create(p, zone); + mutex_enter(&project_hash_lock); + ASSERT(p->kpj_data.kpd_lockedmem_kstat == NULL); + p->kpj_data.kpd_lockedmem_kstat = ksp; + mutex_exit(&project_hash_lock); + } return (p); } - /* * void project_rele(kproject_t *) * @@ -325,6 +343,7 @@ project_rele(kproject_t *p) mutex_exit(&projects_list_lock); rctl_set_free(p->kpj_rctls); + project_kstat_delete(p); if (mod_hash_destroy(projects_hash, (mod_hash_key_t)p)) panic("unable to delete project %d zone %d", p->kpj_id, @@ -636,9 +655,9 @@ project_locked_mem_usage(rctl_t *rctl, struct proc *p) { rctl_qty_t q; ASSERT(MUTEX_HELD(&p->p_lock)); - mutex_enter(&p->p_zone->zone_rctl_lock); + mutex_enter(&p->p_zone->zone_mem_lock); q = p->p_task->tk_proj->kpj_data.kpd_locked_mem; - mutex_exit(&p->p_zone->zone_rctl_lock); + mutex_exit(&p->p_zone->zone_mem_lock); return (q); } @@ -649,7 +668,7 @@ project_locked_mem_test(struct rctl *rctl, struct proc *p, rctl_entity_p_t *e, { rctl_qty_t q; ASSERT(MUTEX_HELD(&p->p_lock)); - ASSERT(MUTEX_HELD(&p->p_zone->zone_rctl_lock)); + ASSERT(MUTEX_HELD(&p->p_zone->zone_mem_lock)); q = p->p_task->tk_proj->kpj_data.kpd_locked_mem; if (q + inc > rval->rcv_value) return (1); @@ -868,7 +887,7 @@ project_init(void) rctl_add_default_limit("project.max-contracts", 10000, RCPRIV_PRIVILEGED, RCTL_LOCAL_DENY); - t0.t_proj = proj0p = project_hold_by_id(0, GLOBAL_ZONEID, + t0.t_proj = proj0p = project_hold_by_id(0, &zone0, PROJECT_HOLD_INSERT); mutex_enter(&p0.p_lock); @@ -876,3 +895,57 @@ project_init(void) mutex_exit(&p0.p_lock); proj0p->kpj_ntasks = 1; } + +static int +project_lockedmem_kstat_update(kstat_t *ksp, int rw) +{ + kproject_t *pj = ksp->ks_private; + kproject_kstat_t *kpk = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + + kpk->kpk_usage.value.ui64 = pj->kpj_data.kpd_locked_mem; + kpk->kpk_value.value.ui64 = 
pj->kpj_data.kpd_locked_mem_ctl; + return (0); +} + +static kstat_t * +project_kstat_create(kproject_t *pj, zone_t *zone) +{ + kstat_t *ksp; + kproject_kstat_t *kpk; + char *zonename = zone->zone_name; + + ksp = rctl_kstat_create_project(pj, "lockedmem", KSTAT_TYPE_NAMED, + sizeof (kproject_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (ksp == NULL) + return (NULL); + + kpk = ksp->ks_data = kmem_alloc(sizeof (kproject_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zonename) + 1; + kstat_named_init(&kpk->kpk_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&kpk->kpk_zonename, zonename); + kstat_named_init(&kpk->kpk_usage, "usage", KSTAT_DATA_UINT64); + kstat_named_init(&kpk->kpk_value, "value", KSTAT_DATA_UINT64); + ksp->ks_update = project_lockedmem_kstat_update; + ksp->ks_private = pj; + kstat_install(ksp); + + return (ksp); +} + +static void +project_kstat_delete(kproject_t *pj) +{ + void *data; + + if (pj->kpj_data.kpd_lockedmem_kstat != NULL) { + data = pj->kpj_data.kpd_lockedmem_kstat->ks_data; + kstat_delete(pj->kpj_data.kpd_lockedmem_kstat); + kmem_free(data, sizeof (zone_kstat_t)); + } + pj->kpj_data.kpd_lockedmem_kstat = NULL; +} diff --git a/usr/src/uts/common/os/rctl.c b/usr/src/uts/common/os/rctl.c index 4de4c74fe8..c0479005ea 100644 --- a/usr/src/uts/common/os/rctl.c +++ b/usr/src/uts/common/os/rctl.c @@ -29,6 +29,7 @@ #include <sys/cmn_err.h> #include <sys/id_space.h> #include <sys/kmem.h> +#include <sys/kstat.h> #include <sys/log.h> #include <sys/modctl.h> #include <sys/modhash.h> @@ -2599,7 +2600,7 @@ rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc, zonep = p->p_zone; } - mutex_enter(&zonep->zone_rctl_lock); + mutex_enter(&zonep->zone_mem_lock); e.rcep_p.proj = projp; e.rcep_t = RCENTITY_PROJECT; @@ -2627,7 +2628,7 @@ rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc, p->p_locked_mem += inc; } out: - mutex_exit(&zonep->zone_rctl_lock); + mutex_exit(&zonep->zone_mem_lock); if (proj != NULL) zone_rele(zonep); return (ret); @@ -2661,7 +2662,7 @@ rctl_decr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc, zonep = p->p_zone; } - mutex_enter(&zonep->zone_rctl_lock); + mutex_enter(&zonep->zone_mem_lock); zonep->zone_locked_mem -= inc; projp->kpj_data.kpd_locked_mem -= inc; if (creditproc != 0) { @@ -2669,7 +2670,120 @@ rctl_decr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc, ASSERT(MUTEX_HELD(&p->p_lock)); p->p_locked_mem -= inc; } - mutex_exit(&zonep->zone_rctl_lock); + mutex_exit(&zonep->zone_mem_lock); if (proj != NULL) zone_rele(zonep); } + +/* + * rctl_incr_swap(proc_t *, zone_t *, size_t) + * + * Overview + * Increments the swap charge on the specified zone. + * + * Return values + * 0 on success. EAGAIN if swap increment fails due an rctl value + * on the zone. + * + * Callers context + * p_lock held on specified proc. 
+ * swap must be even multiple of PAGESIZE + */ +int +rctl_incr_swap(proc_t *proc, zone_t *zone, size_t swap) +{ + rctl_entity_p_t e; + + ASSERT(MUTEX_HELD(&proc->p_lock)); + ASSERT((swap & PAGEOFFSET) == 0); + e.rcep_p.zone = zone; + e.rcep_t = RCENTITY_ZONE; + + mutex_enter(&zone->zone_mem_lock); + + if ((zone->zone_max_swap + swap) > + zone->zone_max_swap_ctl) { + + if (rctl_test_entity(rc_zone_max_swap, zone->zone_rctls, + proc, &e, swap, 0) & RCT_DENY) { + mutex_exit(&zone->zone_mem_lock); + return (EAGAIN); + } + } + zone->zone_max_swap += swap; + mutex_exit(&zone->zone_mem_lock); + return (0); +} + +/* + * rctl_decr_swap(zone_t *, size_t) + * + * Overview + * Decrements the swap charge on the specified zone. + * + * Return values + * None + * + * Callers context + * swap must be even multiple of PAGESIZE + */ +void +rctl_decr_swap(zone_t *zone, size_t swap) +{ + ASSERT((swap & PAGEOFFSET) == 0); + mutex_enter(&zone->zone_mem_lock); + ASSERT(zone->zone_max_swap >= swap); + zone->zone_max_swap -= swap; + mutex_exit(&zone->zone_mem_lock); +} + +/* + * Create resource kstat + */ +static kstat_t * +rctl_kstat_create_common(char *ks_name, int ks_instance, char *ks_class, + uchar_t ks_type, uint_t ks_ndata, uchar_t ks_flags, int ks_zoneid) +{ + kstat_t *ksp = NULL; + char name[KSTAT_STRLEN]; + + (void) snprintf(name, KSTAT_STRLEN, "%s_%d", ks_name, ks_instance); + + if ((ksp = kstat_create_zone("caps", ks_zoneid, + name, ks_class, ks_type, + ks_ndata, ks_flags, ks_zoneid)) != NULL) { + if (ks_zoneid != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + } + return (ksp); +} + +/* + * Create zone-specific resource kstat + */ +kstat_t * +rctl_kstat_create_zone(zone_t *zone, char *ks_name, uchar_t ks_type, + uint_t ks_ndata, uchar_t ks_flags) +{ + char name[KSTAT_STRLEN]; + + (void) snprintf(name, KSTAT_STRLEN, "%s_zone", ks_name); + + return (rctl_kstat_create_common(name, zone->zone_id, "zone_caps", + ks_type, ks_ndata, ks_flags, zone->zone_id)); +} + +/* + * Create project-specific resource kstat + */ +kstat_t * +rctl_kstat_create_project(kproject_t *kpj, char *ks_name, uchar_t ks_type, + uint_t ks_ndata, uchar_t ks_flags) +{ + char name[KSTAT_STRLEN]; + + (void) snprintf(name, KSTAT_STRLEN, "%s_project", ks_name); + + return (rctl_kstat_create_common(name, kpj->kpj_id, "project_caps", + ks_type, ks_ndata, ks_flags, kpj->kpj_zoneid)); +} diff --git a/usr/src/uts/common/os/schedctl.c b/usr/src/uts/common/os/schedctl.c index 66aae7d2bc..62279e0777 100644 --- a/usr/src/uts/common/os/schedctl.c +++ b/usr/src/uts/common/os/schedctl.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -542,13 +541,13 @@ schedctl_getpage(struct anon_map **newamp, caddr_t *newaddr) * Set up anonymous memory struct. No swap reservation is * needed since the page will be locked into memory. 
*/ - amp = anonmap_alloc(PAGESIZE, PAGESIZE); + amp = anonmap_alloc(PAGESIZE, 0); /* * Allocate the page. */ - kaddr = segkp_get_withanonmap(segkp, PAGESIZE, KPD_LOCKED | KPD_ZERO, - amp); + kaddr = segkp_get_withanonmap(segkp, PAGESIZE, + KPD_NO_ANON | KPD_LOCKED | KPD_ZERO, amp); if (kaddr == NULL) { amp->refcnt--; anonmap_free(amp); diff --git a/usr/src/uts/common/os/sysent.c b/usr/src/uts/common/os/sysent.c index 9ada0aac18..a7ef99fddb 100644 --- a/usr/src/uts/common/os/sysent.c +++ b/usr/src/uts/common/os/sysent.c @@ -666,7 +666,7 @@ struct sysent sysent[NSYSCALL] = /* 178 */ SYSENT_LOADABLE(), /* kaio */ /* 179 */ SYSENT_LOADABLE(), /* cpc */ /* 180 */ SYSENT_CI("lgrpsys", lgrpsys, 3), - /* 181 */ SYSENT_CI("rusagesys", rusagesys, 2), + /* 181 */ SYSENT_CI("rusagesys", rusagesys, 5), /* 182 */ SYSENT_LOADABLE(), /* portfs */ /* 183 */ SYSENT_CI("pollsys", pollsys, 4), /* 184 */ SYSENT_CI("labelsys", labelsys, 5), @@ -1044,7 +1044,7 @@ struct sysent sysent32[NSYSCALL] = /* 178 */ SYSENT_LOADABLE32(), /* kaio */ /* 179 */ SYSENT_LOADABLE32(), /* cpc */ /* 180 */ SYSENT_CI("lgrpsys", lgrpsys, 3), - /* 181 */ SYSENT_CI("rusagesys", rusagesys, 2), + /* 181 */ SYSENT_CI("rusagesys", rusagesys, 5), /* 182 */ SYSENT_LOADABLE32(), /* portfs */ /* 183 */ SYSENT_CI("pollsys", pollsys, 4), /* 184 */ SYSENT_CI("labelsys", labelsys, 5), diff --git a/usr/src/uts/common/os/task.c b/usr/src/uts/common/os/task.c index 562e3596b5..785f74c145 100644 --- a/usr/src/uts/common/os/task.c +++ b/usr/src/uts/common/os/task.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -389,7 +388,7 @@ task_create(projid_t projid, zone_t *zone) tk->tk_nlwps = 0; tk->tk_nlwps_ctl = INT_MAX; tk->tk_usage = tu; - tk->tk_proj = project_hold_by_id(projid, zone->zone_id, + tk->tk_proj = project_hold_by_id(projid, zone, PROJECT_HOLD_INSERT); tk->tk_flags = TASK_NORMAL; @@ -848,7 +847,7 @@ task_init(void) task0p->tk_tkid = id_alloc(taskid_space); task0p->tk_usage = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP); - task0p->tk_proj = project_hold_by_id(0, GLOBAL_ZONEID, + task0p->tk_proj = project_hold_by_id(0, &zone0, PROJECT_HOLD_INSERT); task0p->tk_flags = TASK_NORMAL; task0p->tk_nlwps = p->p_lwpcnt; diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c index 0fb2c2be55..19ea8b31f1 100644 --- a/usr/src/uts/common/os/zone.c +++ b/usr/src/uts/common/os/zone.c @@ -154,6 +154,10 @@ * zone_lock: This is a per-zone lock used to protect several fields of * the zone_t (see <sys/zone.h> for details). In addition, holding * this lock means that the zone cannot go away. + * zone_nlwps_lock: This is a per-zone lock used to protect the fields + * related to the zone.max-lwps rctl. + * zone_mem_lock: This is a per-zone lock used to protect the fields + * related to the zone.max-locked-memory and zone.max-swap rctls. 
* zsd_key_lock: This is a global lock protecting the key state for ZSD. * zone_deathrow_lock: This is a global lock protecting the "deathrow" * list (a list of zones in the ZONE_IS_DEAD state). @@ -162,6 +166,10 @@ * pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock --> * zone_lock --> zsd_key_lock --> pidlock --> p_lock * + * When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is: + * zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock + * zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock + * * Blocking memory allocations are permitted while holding any of the * zone locks. * @@ -190,6 +198,7 @@ #include <sys/debug.h> #include <sys/file.h> #include <sys/kmem.h> +#include <sys/kstat.h> #include <sys/mutex.h> #include <sys/note.h> #include <sys/pathname.h> @@ -232,6 +241,8 @@ #include <sys/zone.h> #include <sys/tsol/label.h> +#include <vm/seg.h> + /* * cv used to signal that all references to the zone have been released. This * needs to be global since there may be multiple waiters, and the first to @@ -317,6 +328,7 @@ const char *zone_status_table[] = { */ rctl_hndl_t rc_zone_cpu_shares; rctl_hndl_t rc_zone_locked_mem; +rctl_hndl_t rc_zone_max_swap; rctl_hndl_t rc_zone_nlwps; rctl_hndl_t rc_zone_shmmax; rctl_hndl_t rc_zone_shmmni; @@ -1011,9 +1023,9 @@ zone_locked_mem_usage(rctl_t *rctl, struct proc *p) { rctl_qty_t q; ASSERT(MUTEX_HELD(&p->p_lock)); - mutex_enter(&p->p_zone->zone_rctl_lock); + mutex_enter(&p->p_zone->zone_mem_lock); q = p->p_zone->zone_locked_mem; - mutex_exit(&p->p_zone->zone_rctl_lock); + mutex_exit(&p->p_zone->zone_mem_lock); return (q); } @@ -1023,9 +1035,12 @@ zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags) { rctl_qty_t q; + zone_t *z; + + z = e->rcep_p.zone; ASSERT(MUTEX_HELD(&p->p_lock)); - ASSERT(MUTEX_HELD(&p->p_zone->zone_rctl_lock)); - q = p->p_zone->zone_locked_mem; + ASSERT(MUTEX_HELD(&z->zone_mem_lock)); + q = z->zone_locked_mem; if (q + incr > rcntl->rcv_value) return (1); return (0); @@ -1051,6 +1066,57 @@ static rctl_ops_t zone_locked_mem_ops = { zone_locked_mem_test }; +/*ARGSUSED*/ +static rctl_qty_t +zone_max_swap_usage(rctl_t *rctl, struct proc *p) +{ + rctl_qty_t q; + zone_t *z = p->p_zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + mutex_enter(&z->zone_mem_lock); + q = z->zone_max_swap; + mutex_exit(&z->zone_mem_lock); + return (q); +} + +/*ARGSUSED*/ +static int +zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, + rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags) +{ + rctl_qty_t q; + zone_t *z; + + z = e->rcep_p.zone; + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(MUTEX_HELD(&z->zone_mem_lock)); + q = z->zone_max_swap; + if (q + incr > rcntl->rcv_value) + return (1); + return (0); +} + +/*ARGSUSED*/ +static int +zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + if (e->rcep_p.zone == NULL) + return (0); + e->rcep_p.zone->zone_max_swap_ctl = nv; + return (0); +} + +static rctl_ops_t zone_max_swap_ops = { + rcop_no_action, + zone_max_swap_usage, + zone_max_swap_set, + zone_max_swap_test +}; + /* * Helper function to brand the zone with a unique ID. 
*/ @@ -1080,6 +1146,96 @@ zone_get_kcred(zoneid_t zoneid) return (cr); } +static int +zone_lockedmem_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_kstat_t *zk = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + + zk->zk_usage.value.ui64 = zone->zone_locked_mem; + zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl; + return (0); +} + +static int +zone_swapresv_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_kstat_t *zk = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + + zk->zk_usage.value.ui64 = zone->zone_max_swap; + zk->zk_value.value.ui64 = zone->zone_max_swap_ctl; + return (0); +} + +static void +zone_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_kstat_t *zk; + + ksp = rctl_kstat_create_zone(zone, "lockedmem", KSTAT_TYPE_NAMED, + sizeof (zone_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (ksp == NULL) + return; + + zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zk->zk_zonename, zone->zone_name); + kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64); + kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64); + ksp->ks_update = zone_lockedmem_kstat_update; + ksp->ks_private = zone; + kstat_install(ksp); + + zone->zone_lockedmem_kstat = ksp; + + ksp = rctl_kstat_create_zone(zone, "swapresv", KSTAT_TYPE_NAMED, + sizeof (zone_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (ksp == NULL) + return; + + zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zk->zk_zonename, zone->zone_name); + kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64); + kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64); + ksp->ks_update = zone_swapresv_kstat_update; + ksp->ks_private = zone; + kstat_install(ksp); + + zone->zone_swapresv_kstat = ksp; +} + +static void +zone_kstat_delete(zone_t *zone) +{ + void *data; + + if (zone->zone_lockedmem_kstat != NULL) { + data = zone->zone_lockedmem_kstat->ks_data; + kstat_delete(zone->zone_lockedmem_kstat); + kmem_free(data, sizeof (zone_kstat_t)); + } + if (zone->zone_swapresv_kstat != NULL) { + data = zone->zone_swapresv_kstat->ks_data; + kstat_delete(zone->zone_swapresv_kstat); + kmem_free(data, sizeof (zone_kstat_t)); + } +} + /* * Called very early on in boot to initialize the ZSD list so that * zone_key_create() can be called before zone_init(). 
It also initializes @@ -1101,8 +1257,14 @@ zone_zsd_init(void) mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL); zone0.zone_shares = 1; + zone0.zone_nlwps = 0; zone0.zone_nlwps_ctl = INT_MAX; + zone0.zone_locked_mem = 0; + zone0.zone_locked_mem_ctl = UINT64_MAX; + ASSERT(zone0.zone_max_swap == 0); + zone0.zone_max_swap_ctl = UINT64_MAX; zone0.zone_shmmax = 0; zone0.zone_ipc.ipcq_shmmni = 0; zone0.zone_ipc.ipcq_semmni = 0; @@ -1120,6 +1282,8 @@ zone_zsd_init(void) zone0.zone_ncpus_online = 0; zone0.zone_proc_initpid = 1; zone0.zone_initname = initname; + zone0.zone_lockedmem_kstat = NULL; + zone0.zone_swapresv_kstat = NULL; list_create(&zone0.zone_zsd, sizeof (struct zsd_entry), offsetof(struct zsd_entry, zsd_linkage)); list_insert_head(&zone_active, &zone0); @@ -1259,6 +1423,12 @@ zone_init(void) RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, &zone_locked_mem_ops); + + rc_zone_max_swap = rctl_register("zone.max-swap", + RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | + RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, + &zone_max_swap_ops); + /* * Initialize the ``global zone''. */ @@ -1277,9 +1447,14 @@ zone_init(void) zone0.zone_brand = &native_brand; rctl_prealloc_destroy(gp); /* - * pool_default hasn't been initialized yet, so we let pool_init() take - * care of making the global zone is in the default pool. + * pool_default hasn't been initialized yet, so we let pool_init() + * take care of making sure the global zone is in the default pool. + */ + + /* + * Initialize global zone kstats */ + zone_kstat_create(&zone0); /* * Initialize zone label. @@ -1337,6 +1512,7 @@ zone_init(void) if (res) panic("Sysevent_evc_bind failed during zone setup.\n"); + } static void @@ -1476,6 +1652,38 @@ zone_set_initname(zone_t *zone, const char *zone_initname) return (0); } +static int +zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap) +{ + uint64_t mcap; + int err = 0; + + if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0) + zone->zone_phys_mcap = mcap; + + return (err); +} + +static int +zone_set_sched_class(zone_t *zone, const char *new_class) +{ + char sched_class[PC_CLNMSZ]; + id_t classid; + int err; + + ASSERT(zone != global_zone); + if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0) + return (err); /* EFAULT or ENAMETOOLONG */ + + if (getcid(sched_class, &classid) != 0 || classid == syscid) + return (set_errno(EINVAL)); + zone->zone_defaultcid = classid; + ASSERT(zone->zone_defaultcid > 0 && + zone->zone_defaultcid < loaded_classes); + + return (0); +} + /* * Block indefinitely waiting for (zone_status >= status) */ @@ -2510,10 +2718,10 @@ zsched(void *arg) /* * Decrement locked memory counts on old zone and project. */ - mutex_enter(&global_zone->zone_rctl_lock); + mutex_enter(&global_zone->zone_mem_lock); global_zone->zone_locked_mem -= pp->p_locked_mem; pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem; - mutex_exit(&global_zone->zone_rctl_lock); + mutex_exit(&global_zone->zone_mem_lock); /* * Create and join a new task in project '0' of this zone. 
@@ -2529,10 +2737,10 @@ zsched(void *arg) pj = pp->p_task->tk_proj; - mutex_enter(&zone->zone_rctl_lock); + mutex_enter(&zone->zone_mem_lock); zone->zone_locked_mem += pp->p_locked_mem; pj->kpj_data.kpd_locked_mem += pp->p_locked_mem; - mutex_exit(&zone->zone_rctl_lock); + mutex_exit(&zone->zone_mem_lock); /* * add lwp counts to zsched's zone, and increment project's task count @@ -2689,7 +2897,10 @@ zsched(void *arg) * classid 'cid'. */ pool_lock(); - cid = pool_get_class(zone->zone_pool); + if (zone->zone_defaultcid > 0) + cid = zone->zone_defaultcid; + else + cid = pool_get_class(zone->zone_pool); if (cid == -1) cid = defaultcid; @@ -3019,7 +3230,7 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_initname = NULL; mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&zone->zone_rctl_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL); list_create(&zone->zone_zsd, sizeof (struct zsd_entry), offsetof(struct zsd_entry, zsd_linkage)); @@ -3057,8 +3268,14 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_initname = kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP); (void) strcpy(zone->zone_initname, zone_default_initname); + zone->zone_nlwps = 0; + zone->zone_nlwps_ctl = INT_MAX; zone->zone_locked_mem = 0; zone->zone_locked_mem_ctl = UINT64_MAX; + zone->zone_max_swap = 0; + zone->zone_max_swap_ctl = UINT64_MAX; + zone0.zone_lockedmem_kstat = NULL; + zone0.zone_swapresv_kstat = NULL; /* * Zsched initializes the rctls. @@ -3233,6 +3450,11 @@ zone_create(const char *zone_name, const char *zone_root, */ /* + * Create zone kstats + */ + zone_kstat_create(zone); + + /* * Let the other lwps continue. */ mutex_enter(&pp->p_lock); @@ -3643,6 +3865,9 @@ zone_destroy(zoneid_t zoneid) } + /* Get rid of the zone's kstats */ + zone_kstat_delete(zone); + /* * It is now safe to let the zone be recreated; remove it from the * lists. The memory will not be freed until the last cred @@ -3892,6 +4117,32 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) error = EFAULT; } break; + case ZONE_ATTR_PHYS_MCAP: + size = sizeof (zone->zone_phys_mcap); + if (bufsize > size) + bufsize = size; + if (buf != NULL && + copyout(&zone->zone_phys_mcap, buf, bufsize) != 0) + error = EFAULT; + break; + case ZONE_ATTR_SCHED_CLASS: + mutex_enter(&class_lock); + + if (zone->zone_defaultcid >= loaded_classes) + outstr = ""; + else + outstr = sclass[zone->zone_defaultcid].cl_name; + size = strlen(outstr) + 1; + if (bufsize > size) + bufsize = size; + if (buf != NULL) { + err = copyoutstr(outstr, buf, bufsize, NULL); + if (err != 0 && err != ENAMETOOLONG) + error = EFAULT; + } + + mutex_exit(&class_lock); + break; default: if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) { size = bufsize; @@ -3923,10 +4174,10 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) return (set_errno(EPERM)); /* - * At present, attributes can only be set on non-running, - * non-global zones. + * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the + * global zone. 
*/ - if (zoneid == GLOBAL_ZONEID) { + if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) { return (set_errno(EINVAL)); } @@ -3938,8 +4189,12 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) zone_hold(zone); mutex_exit(&zonehash_lock); + /* + * At present most attributes can only be set on non-running, + * non-global zones. + */ zone_status = zone_status_get(zone); - if (zone_status > ZONE_IS_READY) + if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) goto done; switch (attr) { @@ -3971,6 +4226,12 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) if (zone->zone_brand == NULL) err = EINVAL; break; + case ZONE_ATTR_PHYS_MCAP: + err = zone_set_phys_mcap(zone, (const uint64_t *)buf); + break; + case ZONE_ATTR_SCHED_CLASS: + err = zone_set_sched_class(zone, (const char *)buf); + break; default: if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize); @@ -3986,6 +4247,11 @@ done: /* * Return zero if the process has at least one vnode mapped in to its * address space which shouldn't be allowed to change zones. + * + * Also return zero if the process has any shared mappings which reserve + * swap. This is because the counting for zone.max-swap does not allow swap + * revervation to be shared between zones. zone swap reservation is counted + * on zone->zone_max_swap. */ static int as_can_change_zones(void) @@ -3997,8 +4263,17 @@ as_can_change_zones(void) int allow = 1; ASSERT(pp->p_as != &kas); - AS_LOCK_ENTER(&as, &as->a_lock, RW_READER); + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { + + /* + * Cannot enter zone with shared anon memory which + * reserves swap. See comment above. + */ + if (seg_can_change_zones(seg) == B_FALSE) { + allow = 0; + break; + } /* * if we can't get a backing vnode for this segment then skip * it. @@ -4011,11 +4286,30 @@ as_can_change_zones(void) break; } } - AS_LOCK_EXIT(&as, &as->a_lock); + AS_LOCK_EXIT(as, &as->a_lock); return (allow); } /* + * Count swap reserved by curproc's address space + */ +static size_t +as_swresv(void) +{ + proc_t *pp = curproc; + struct seg *seg; + struct as *as = pp->p_as; + size_t swap = 0; + + ASSERT(pp->p_as != &kas); + ASSERT(AS_WRITE_HELD(as, &as->a_lock)); + for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) + swap += seg_swresv(seg); + + return (swap); +} + +/* * Systemcall entry point for zone_enter(). * * The current process is injected into said zone. In the process @@ -4043,6 +4337,7 @@ zone_enter(zoneid_t zoneid) zone_status_t status; int err = 0; rctl_entity_p_t e; + size_t swap; if (secpolicy_zone_config(CRED()) != 0) return (set_errno(EPERM)); @@ -4205,6 +4500,15 @@ zone_enter(zoneid_t zoneid) goto out; } + /* + * a_lock must be held while transfering locked memory and swap + * reservation from the global zone to the non global zone because + * asynchronous faults on the processes' address space can lock + * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE + * segments respectively. 
+ */ + AS_LOCK_ENTER(pp->as, &pp->p_as->a_lock, RW_WRITER); + swap = as_swresv(); mutex_enter(&pp->p_lock); zone_proj0 = zone->zone_zsched->p_task->tk_proj; /* verify that we do not exceed and task or lwp limits */ @@ -4216,10 +4520,11 @@ zone_enter(zoneid_t zoneid) zone_proj0->kpj_ntasks += 1; mutex_exit(&zone->zone_nlwps_lock); - mutex_enter(&zone->zone_rctl_lock); + mutex_enter(&zone->zone_mem_lock); zone->zone_locked_mem += pp->p_locked_mem; zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem; - mutex_exit(&zone->zone_rctl_lock); + zone->zone_max_swap += swap; + mutex_exit(&zone->zone_mem_lock); /* remove lwps from proc's old zone and old project */ mutex_enter(&pp->p_zone->zone_nlwps_lock); @@ -4227,12 +4532,14 @@ zone_enter(zoneid_t zoneid) pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt; mutex_exit(&pp->p_zone->zone_nlwps_lock); - mutex_enter(&pp->p_zone->zone_rctl_lock); + mutex_enter(&pp->p_zone->zone_mem_lock); pp->p_zone->zone_locked_mem -= pp->p_locked_mem; pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem; - mutex_exit(&pp->p_zone->zone_rctl_lock); + pp->p_zone->zone_max_swap -= swap; + mutex_exit(&pp->p_zone->zone_mem_lock); mutex_exit(&pp->p_lock); + AS_LOCK_EXIT(pp->p_as, &pp->p_as->a_lock); /* * Joining the zone cannot fail from now on. @@ -4289,6 +4596,31 @@ zone_enter(zoneid_t zoneid) sess_rele(pp->p_sessp, B_TRUE); pp->p_sessp = sp; pgjoin(pp, zone->zone_zsched->p_pidp); + + /* + * If there is a default scheduling class for the zone and it is not + * the class we are currently in, change all of the threads in the + * process to the new class. We need to be holding pidlock & p_lock + * when we call parmsset so this is a good place to do it. + */ + if (zone->zone_defaultcid > 0 && + zone->zone_defaultcid != curthread->t_cid) { + pcparms_t pcparms; + kthread_id_t t; + + pcparms.pc_cid = zone->zone_defaultcid; + pcparms.pc_clparms[0] = 0; + + /* + * If setting the class fails, we still want to enter the zone. + */ + if ((t = pp->p_tlist) != NULL) { + do { + (void) parmsset(&pcparms, t); + } while ((t = t->t_forw) != pp->p_tlist); + } + } + mutex_exit(&pp->p_lock); mutex_exit(&pidlock); diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile index ab103ef4c7..4493f99454 100644 --- a/usr/src/uts/common/sys/Makefile +++ b/usr/src/uts/common/sys/Makefile @@ -544,6 +544,7 @@ CHKHDRS= \ visual_io.h \ vlan.h \ vm.h \ + vm_usage.h \ vmem.h \ vmem_impl.h \ vmmeter.h \ diff --git a/usr/src/uts/common/sys/modhash_impl.h b/usr/src/uts/common/sys/modhash_impl.h index 25e45cec23..a187eb68ee 100644 --- a/usr/src/uts/common/sys/modhash_impl.h +++ b/usr/src/uts/common/sys/modhash_impl.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -93,6 +92,18 @@ struct mod_hash { */ void mod_hash_init(void); +/* + * Internal routines. Use directly with care. 
+ */ +uint_t i_mod_hash(mod_hash_t *, mod_hash_key_t); +int i_mod_hash_insert_nosync(mod_hash_t *, mod_hash_key_t, mod_hash_val_t, + mod_hash_hndl_t); +int i_mod_hash_remove_nosync(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *); +int i_mod_hash_find_nosync(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *); +void i_mod_hash_walk_nosync(mod_hash_t *, uint_t (*)(mod_hash_key_t, + mod_hash_val_t *, void *), void *); +void i_mod_hash_clear_nosync(mod_hash_t *hash); + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/sys/priocntl.h b/usr/src/uts/common/sys/priocntl.h index ca1a92400a..6475ed0a4c 100644 --- a/usr/src/uts/common/sys/priocntl.h +++ b/usr/src/uts/common/sys/priocntl.h @@ -65,6 +65,7 @@ extern long priocntl(), priocntlset(); #define PC_SETXPARMS 7 /* Set extended scheduling parameters */ #define PC_GETXPARMS 8 /* Get extended scheduling parameters */ #define PC_SETDFLCL 9 /* Set default class, not for general use */ +#define PC_GETDFLCL 10 /* Get default class, not for general use */ #define PC_CLNULL -1 diff --git a/usr/src/uts/common/sys/proc.h b/usr/src/uts/common/sys/proc.h index fcf953262c..9a0ba2cc37 100644 --- a/usr/src/uts/common/sys/proc.h +++ b/usr/src/uts/common/sys/proc.h @@ -613,6 +613,8 @@ extern proc_t *pgfind(pid_t); extern proc_t *pgfind_zone(pid_t, zoneid_t); extern proc_t *sprlock(pid_t); extern proc_t *sprlock_zone(pid_t, zoneid_t); +extern int sprtrylock_proc(proc_t *); +extern void sprwaitlock_proc(proc_t *); extern void sprlock_proc(proc_t *); extern void sprunlock(proc_t *); extern void pid_init(void); diff --git a/usr/src/uts/common/sys/project.h b/usr/src/uts/common/sys/project.h index 679c1eddc2..5018df8499 100644 --- a/usr/src/uts/common/sys/project.h +++ b/usr/src/uts/common/sys/project.h @@ -28,15 +28,24 @@ #pragma ident "%Z%%M% %I% %E% SMI" + #ifdef __cplusplus extern "C" { #endif + +#include <sys/kstat.h> #include <sys/types.h> #include <sys/mutex.h> #include <sys/rctl.h> #include <sys/ipc_rctl.h> +typedef struct kproject_kstat { + kstat_named_t kpk_zonename; + kstat_named_t kpk_usage; + kstat_named_t kpk_value; +} kproject_kstat_t; + typedef struct kproject_data { /* Datum protected by: */ rctl_qty_t kpd_shmmax; /* shm's ipcs_lock */ ipc_rqty_t kpd_ipc; /* shm|sem|msg's ipcs lock */ @@ -44,6 +53,7 @@ typedef struct kproject_data { /* Datum protected by: */ rctl_qty_t kpd_locked_mem_ctl; /* kpj_rctls->rcs_lock */ rctl_qty_t kpd_contract; /* contract_lock */ rctl_qty_t kpd_crypto_mem; /* crypto_rctl_lock */ + kstat_t *kpd_lockedmem_kstat; /* locked memory kstat */ } kproject_data_t; @@ -76,9 +86,11 @@ typedef struct kproject { #define PROJECT_HOLD_FIND 1 #define PROJECT_HOLD_INSERT 2 +struct zone; + void project_init(void); kproject_t *project_hold(kproject_t *); -kproject_t *project_hold_by_id(projid_t, zoneid_t, int); +kproject_t *project_hold_by_id(projid_t, struct zone *, int); void project_rele(kproject_t *); int project_walk_all(zoneid_t, int (*)(kproject_t *, void *), void *); projid_t curprojid(void); diff --git a/usr/src/uts/common/sys/rctl.h b/usr/src/uts/common/sys/rctl.h index eb56fff9e5..a8480c2768 100644 --- a/usr/src/uts/common/sys/rctl.h +++ b/usr/src/uts/common/sys/rctl.h @@ -168,6 +168,7 @@ struct proc; struct task; struct kproject; struct zone; +struct kstat; typedef struct rctl_entity_p_struct { rctl_entity_t rcep_t; @@ -324,6 +325,14 @@ int rctl_incr_locked_mem(struct proc *, struct kproject *, rctl_qty_t, int); void rctl_decr_locked_mem(struct proc *, struct kproject *, rctl_qty_t, int); +int 
rctl_incr_swap(struct proc *, struct zone *, size_t); +void rctl_decr_swap(struct zone *, size_t); + +struct kstat *rctl_kstat_create_zone(struct zone *, char *, uchar_t, uint_t, + uchar_t); + +struct kstat *rctl_kstat_create_project(struct kproject *, char *, uchar_t, + uint_t, uchar_t); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/sys/resource.h b/usr/src/uts/common/sys/resource.h index 86cc716d56..bf02808d4b 100644 --- a/usr/src/uts/common/sys/resource.h +++ b/usr/src/uts/common/sys/resource.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -191,6 +190,7 @@ struct rusage { #define _RUSAGESYS_GETRUSAGE 0 /* rusage process */ #define _RUSAGESYS_GETRUSAGE_CHLD 1 /* rusage child process */ #define _RUSAGESYS_GETRUSAGE_LWP 2 /* rusage lwp */ +#define _RUSAGESYS_GETVMUSAGE 3 /* getvmusage */ #if defined(_SYSCALL32) diff --git a/usr/src/uts/common/sys/syscall.h b/usr/src/uts/common/sys/syscall.h index 96cb967023..eedadfa0c0 100644 --- a/usr/src/uts/common/sys/syscall.h +++ b/usr/src/uts/common/sys/syscall.h @@ -384,7 +384,8 @@ extern "C" { #define SYS_rusagesys 181 /* * subcodes: - * getrusage(...) :: rusagesys(RUSAGESYS_GETRUSAGE,...) + * getrusage(...) :: rusagesys(RUSAGESYS_GETRUSAGE, ...) + * getvmusage(...) :: rusagesys(RUSAGESYS_GETVMUSAGE, ...) */ #define SYS_port 182 /* diff --git a/usr/src/uts/common/sys/vm_usage.h b/usr/src/uts/common/sys/vm_usage.h new file mode 100644 index 0000000000..5f8c8b8fe5 --- /dev/null +++ b/usr/src/uts/common/sys/vm_usage.h @@ -0,0 +1,120 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_VM_USAGE_H +#define _SYS_VM_USAGE_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The flags passed to getvmusage() request how to aggregate rss/swap results. + * Results can be aggregated by zone, project, task, ruser, and/or euser. 
+ * + * If VMUSAGE_ALL_* or VMUSAGE_COL_* are passed from a non-global-zone, the + * flag is treated as VMUSAGE_*. For example, VMUSAGE_ALL_ZONES would be + * treated as VMUSAGE_ZONE. + * + * If VMUSAGE_SYSTEM is passed from a non-global zone, a result of type + * VMUSAGE_SYSTEM will be returned, but it will only reflect the usage + * of the calling zone. + * + * VMUSAGE_* requests results for the calling zone. + * VMUSAGE_ALL_* requests results for all zones. + * VMUSAGE_COL_* requests results for all zones, but collapses out the zoneid. + * For example, VMUSAGE_COL_PROJECTS requests results for all + * projects in all zones, and project N in ANY zone is treated + * as the same project. + */ +#define VMUSAGE_SYSTEM 0x1 /* rss/swap for ALL processes */ +#define VMUSAGE_ZONE 0x2 /* rss/swap for caller's zone */ +#define VMUSAGE_PROJECTS 0x4 /* rss/swap for all projects in */ + /* caller's zone */ +#define VMUSAGE_TASKS 0x8 /* rss/swap for all tasks in */ + /* caller's zones */ +#define VMUSAGE_RUSERS 0x10 /* rss/swap for all users (by process */ + /* ruser) in the caller's zone */ +#define VMUSAGE_EUSERS 0x20 /* same as VMUSAGE_RUSERS, but by */ + /* euser */ + +#define VMUSAGE_ALL_ZONES 0x40 /* rss/swap for all zones */ +#define VMUSAGE_ALL_PROJECTS 0x80 /* rss/swap for all projects in */ + /* all zones */ +#define VMUSAGE_ALL_TASKS 0x100 /* rss/swap for all tasks in all */ + /* zones */ +#define VMUSAGE_ALL_RUSERS 0x200 /* rss/swap for all users (by process */ + /* ruser) in all zones */ +#define VMUSAGE_ALL_EUSERS 0x400 /* same as VMUSAGE_ALL_RUSERS, but by */ + /* euser */ + +#define VMUSAGE_COL_PROJECTS 0x800 /* rss/swap for all projects in */ + /* all zones. Collapse zoneid. */ +#define VMUSAGE_COL_RUSERS 0x1000 /* rss/swap for all users (by process */ + /* ruser), in all zones. Collapse */ + /* zoneid */ +#define VMUSAGE_COL_EUSERS 0x2000 /* same as VMUSAGE_COL_RUSERS, but by */ + /* euser */ + +#define VMUSAGE_MASK 0x3fff /* all valid flags for getvmusage() */ + +typedef struct vmusage { + id_t vmu_zoneid; /* zoneid, or ALL_ZONES for */ + /* VMUSAGE_COL_* results */ + /* ALL_ZONES means that the result */ + /* reflects swap and rss usage for */ + /* a projid/uid across all zones */ + uint_t vmu_type; /* Entity type of result. One of: */ + /* VMUSAGE_(SYSTEM|ZONE|PROJECTS| */ + /* TASKS|RUSERS|EUSERS) */ + id_t vmu_id; /* zoneid, projid, taskid, ... 
*/ + size_t vmu_rss_all; /* total resident memory of entity */ + /* in bytes */ + size_t vmu_rss_private; /* total resident private memory */ + size_t vmu_rss_shared; /* total resident shared memory */ + size_t vmu_swap_all; /* total swap reserved, in bytes */ + size_t vmu_swap_private; /* swap reserved for private mappings */ + size_t vmu_swap_shared; /* swap reserved for shared mappings */ + +} vmusage_t; + +extern int getvmusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres); + +#ifdef _KERNEL + +int vm_getusage(uint_t, time_t, vmusage_t *, size_t *); +void vm_usage_init(); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VM_USAGE_H */ diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h index daccd16bdf..94646bc976 100644 --- a/usr/src/uts/common/sys/zone.h +++ b/usr/src/uts/common/sys/zone.h @@ -88,6 +88,8 @@ extern "C" { #define ZONE_ATTR_INITNAME 9 #define ZONE_ATTR_BOOTARGS 10 #define ZONE_ATTR_BRAND 11 +#define ZONE_ATTR_PHYS_MCAP 12 +#define ZONE_ATTR_SCHED_CLASS 13 /* Start of the brand-specific attribute namespace */ #define ZONE_ATTR_BRAND_ATTRS 32768 @@ -280,6 +282,15 @@ typedef struct zone_dataset { list_node_t zd_linkage; } zone_dataset_t; +/* + * structure for zone kstats + */ +typedef struct zone_kstat { + kstat_named_t zk_zonename; + kstat_named_t zk_usage; + kstat_named_t zk_value; +} zone_kstat_t; + typedef struct zone { /* * zone_name is never modified once set. @@ -326,14 +337,20 @@ typedef struct zone { uint_t zone_rootpathlen; /* strlen(zone_rootpath) + 1 */ uint32_t zone_shares; /* FSS shares allocated to zone */ rctl_set_t *zone_rctls; /* zone-wide (zone.*) rctls */ - kmutex_t zone_rctl_lock; /* protects zone_locked_mem and */ + kmutex_t zone_mem_lock; /* protects zone_locked_mem and */ /* kpd_locked_mem for all */ - /* projects in zone */ + /* projects in zone. */ + /* Also protects zone_max_swap */ /* grab after p_lock, before rcs_lock */ - rctl_qty_t zone_locked_mem; /* bytes of locked memory in zone */ - rctl_qty_t zone_locked_mem_ctl; /* current locked memory */ + rctl_qty_t zone_locked_mem; /* bytes of locked memory in */ + /* zone */ + rctl_qty_t zone_locked_mem_ctl; /* Current locked memory */ /* limit. Protected by */ /* zone_rctls->rcs_lock */ + rctl_qty_t zone_max_swap; /* bytes of swap reserved by zone */ + rctl_qty_t zone_max_swap_ctl; /* current swap limit. */ + /* Protected by */ + /* zone_rctls->rcs_lock */ list_t zone_zsd; /* list of Zone-Specific Data values */ kcondvar_t zone_cv; /* used to signal state changes */ struct proc *zone_zsched; /* Dummy kernel "zsched" process */ @@ -341,6 +358,7 @@ typedef struct zone { char *zone_initname; /* fs path to 'init' */ int zone_boot_err; /* for zone_boot() if boot fails */ char *zone_bootargs; /* arguments passed via zone_boot() */ + uint64_t zone_phys_mcap; /* physical memory cap */ /* * zone_kthreads is protected by zone_status_lock. */ @@ -376,6 +394,9 @@ typedef struct zone { boolean_t zone_restart_init; /* Restart init if it dies? 
*/ struct brand *zone_brand; /* zone's brand */ + id_t zone_defaultcid; /* dflt scheduling class id */ + kstat_t *zone_swapresv_kstat; + kstat_t *zone_lockedmem_kstat; } zone_t; /* @@ -553,6 +574,7 @@ extern void mount_completed(void); extern int zone_walk(int (*)(zone_t *, void *), void *); extern rctl_hndl_t rc_zone_locked_mem; +extern rctl_hndl_t rc_zone_max_swap; #endif /* _KERNEL */ diff --git a/usr/src/uts/common/syscall/processor_bind.c b/usr/src/uts/common/syscall/processor_bind.c index 10ca1178d5..bd416e43e6 100644 --- a/usr/src/uts/common/syscall/processor_bind.c +++ b/usr/src/uts/common/syscall/processor_bind.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -285,9 +284,10 @@ processor_bind(idtype_t idtype, id_t id, processorid_t bind, break; case P_PROJID: + pp = curproc; if (id == P_MYID) id = curprojid(); - if ((kpj = project_hold_by_id(id, getzoneid(), + if ((kpj = project_hold_by_id(id, pp->p_zone, PROJECT_HOLD_FIND)) == NULL) { ret = ESRCH; } else { diff --git a/usr/src/uts/common/syscall/pset.c b/usr/src/uts/common/syscall/pset.c index 5d3b7e6233..767529fc5d 100644 --- a/usr/src/uts/common/syscall/pset.c +++ b/usr/src/uts/common/syscall/pset.c @@ -542,9 +542,10 @@ pset_bind(psetid_t pset, idtype_t idtype, id_t id, psetid_t *opset) break; case P_PROJID: + pp = curproc; if (id == P_MYID) id = curprojid(); - if ((kpj = project_hold_by_id(id, getzoneid(), + if ((kpj = project_hold_by_id(id, pp->p_zone, PROJECT_HOLD_FIND)) == NULL) { error = ESRCH; break; diff --git a/usr/src/uts/common/syscall/rusagesys.c b/usr/src/uts/common/syscall/rusagesys.c index 3e09643981..036500932f 100644 --- a/usr/src/uts/common/syscall/rusagesys.c +++ b/usr/src/uts/common/syscall/rusagesys.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -35,6 +34,7 @@ #include <sys/time.h> #include <sys/errno.h> #include <sys/resource.h> +#include <sys/vm_usage.h> static int getrusage(void *user_rusage) @@ -246,16 +246,19 @@ getrusage_lwp(void *user_rusage) } int -rusagesys(int code, void * arg) +rusagesys(int code, void *arg1, void *arg2, void *arg3, void *arg4) { switch (code) { case _RUSAGESYS_GETRUSAGE: - return (getrusage(arg)); + return (getrusage(arg1)); case _RUSAGESYS_GETRUSAGE_CHLD: - return (getrusage_chld(arg)); + return (getrusage_chld(arg1)); case _RUSAGESYS_GETRUSAGE_LWP: - return (getrusage_lwp(arg)); + return (getrusage_lwp(arg1)); + case _RUSAGESYS_GETVMUSAGE: + return (vm_getusage((uint_t)(uintptr_t)arg1, (time_t)arg2, + (vmusage_t *)arg3, (size_t *)arg4)); default: return (set_errno(EINVAL)); } diff --git a/usr/src/uts/common/syscall/tasksys.c b/usr/src/uts/common/syscall/tasksys.c index 705b543a37..bec091e61c 100644 --- a/usr/src/uts/common/syscall/tasksys.c +++ b/usr/src/uts/common/syscall/tasksys.c @@ -25,6 +25,7 @@ #pragma ident "%Z%%M% %I% %E% SMI" + /* * System calls for creating and inquiring about tasks and projects */ @@ -102,7 +103,7 @@ tasksys_settaskid(projid_t projid, uint_t flags) * Put a hold on our new project and make sure that nobody is * trying to bind it to a pool while we're joining. */ - kpj = project_hold_by_id(projid, getzoneid(), PROJECT_HOLD_INSERT); + kpj = project_hold_by_id(projid, p->p_zone, PROJECT_HOLD_INSERT); e.rcep_p.proj = kpj; e.rcep_t = RCENTITY_PROJECT; @@ -111,7 +112,7 @@ tasksys_settaskid(projid_t projid, uint_t flags) zone = p->p_zone; mutex_enter(&zone->zone_nlwps_lock); - mutex_enter(&zone->zone_rctl_lock); + mutex_enter(&zone->zone_mem_lock); if (kpj->kpj_nlwps + p->p_lwpcnt > kpj->kpj_nlwps_ctl) if (rctl_test_entity(rc_project_nlwps, kpj->kpj_rctls, p, &e, @@ -130,7 +131,7 @@ tasksys_settaskid(projid_t projid, uint_t flags) rctlfail = 1; if (rctlfail) { - mutex_exit(&zone->zone_rctl_lock); + mutex_exit(&zone->zone_mem_lock); mutex_exit(&zone->zone_nlwps_lock); if (curthread != p->p_agenttp) continuelwps(p); @@ -144,7 +145,7 @@ tasksys_settaskid(projid_t projid, uint_t flags) oldpj->kpj_data.kpd_locked_mem -= p->p_locked_mem; oldpj->kpj_nlwps -= p->p_lwpcnt; - mutex_exit(&zone->zone_rctl_lock); + mutex_exit(&zone->zone_mem_lock); mutex_exit(&zone->zone_nlwps_lock); mutex_exit(&p->p_lock); diff --git a/usr/src/uts/common/vm/anon.h b/usr/src/uts/common/vm/anon.h index 90f6e1e661..ed59ec590b 100644 --- a/usr/src/uts/common/vm/anon.h +++ b/usr/src/uts/common/vm/anon.h @@ -42,6 +42,7 @@ #pragma ident "%Z%%M% %I% %E% SMI" #include <sys/cred.h> +#include <sys/zone.h> #include <vm/seg.h> #include <vm/vpage.h> @@ -387,8 +388,8 @@ extern int anon_map_demotepages(struct anon_map *, ulong_t, struct seg *, caddr_t, uint_t, struct vpage [], struct cred *); extern void anon_shmap_free_pages(struct anon_map *, ulong_t, size_t); -extern int anon_resvmem(size_t, uint_t); -extern void anon_unresv(size_t); +extern int anon_resvmem(size_t, boolean_t, zone_t *); +extern void anon_unresvmem(size_t, zone_t *); extern struct anon_map *anonmap_alloc(size_t, size_t); extern void anonmap_free(struct anon_map *); extern void anon_decref(struct anon *); @@ -416,9 +417,16 @@ extern void anon_array_exit(anon_sync_obj_t *); * request and if so, reserves the appropriate anonymous memory resources. * anon_checkspace just checks to see if there is space to fulfill the request, * without taking any resources. Both return 1 if successful and 0 if not. 
+ * + * Macros are provided as anon reservation is usually charged to the zone of + * the current process. In some cases (such as anon reserved by tmpfs), a + * zone pointer is needed to charge the appropriate zone. */ -#define anon_resv(size) anon_resvmem((size), 1) -#define anon_checkspace(size) anon_resvmem((size), 0) +#define anon_unresv(size) anon_unresvmem(size, curproc->p_zone) +#define anon_unresv_zone(size, zone) anon_unresvmem(size, zone) +#define anon_resv(size) anon_resvmem((size), 1, curproc->p_zone) +#define anon_resv_zone(size, zone) anon_resvmem((size), 1, zone) +#define anon_checkspace(size, zone) anon_resvmem((size), 0, zone) /* * Flags to anon_private diff --git a/usr/src/uts/common/vm/seg.h b/usr/src/uts/common/vm/seg.h index 0ee7d62ce1..a9683c0e54 100644 --- a/usr/src/uts/common/vm/seg.h +++ b/usr/src/uts/common/vm/seg.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -245,6 +244,9 @@ uint_t seg_pages(struct seg *); #endif /* VMDEBUG */ +boolean_t seg_can_change_zones(struct seg *); +size_t seg_swresv(struct seg *); + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/vm/seg_kp.c b/usr/src/uts/common/vm/seg_kp.c index ff9c47e0ff..d58e873a19 100644 --- a/usr/src/uts/common/vm/seg_kp.c +++ b/usr/src/uts/common/vm/seg_kp.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -147,6 +146,7 @@ uint32_t red_closest = UINT_MAX; uint32_t red_ndoubles; pgcnt_t anon_segkp_pages_locked; /* See vm/anon.h */ +pgcnt_t anon_segkp_pages_resv; /* anon reserved by seg_kp */ static struct seg_ops segkp_ops = { SEGKP_BADOP(int), /* dup */ @@ -448,8 +448,10 @@ segkp_get_internal( * Note that we don't need swap space for the red zone page. */ if (amp != NULL) { - ASSERT((flags & KPD_NO_ANON) == 0); - /* The reserve has been done and the anon_hdr is separate. */ + /* + * The swap reservation has been done, if required, and the + * anon_hdr is separate. 
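As an aside, a minimal sketch (not part of this patch) of how an in-kernel consumer other than tmpfs might use the zone-aware reservation macros added to vm/anon.h above, charging swap to an explicit zone and releasing exactly what it reserved; the xx_ names are hypothetical:

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/zone.h>
#include <vm/anon.h>

/* Reserve 'bytes' of anonymous backing, charged to 'zone'. */
static int
xx_reserve_backing(size_t bytes, zone_t *zone)
{
        /* Non-charging probe: is there any space at all? */
        if (!anon_checkspace(bytes, zone))
                return (ENOMEM);

        /* Reserve for real; this also charges zone.max-swap for 'zone'. */
        if (anon_resv_zone(bytes, zone) == 0)
                return (ENOMEM);

        return (0);
}

/* Undo a successful xx_reserve_backing() with the same byte count. */
static void
xx_release_backing(size_t bytes, zone_t *zone)
{
        anon_unresv_zone(bytes, zone);
}

The plain anon_resv()/anon_unresv() forms keep their old call sites and simply charge the reservation to curproc's zone.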
+ */ anon_idx = 0; kpd->kp_anon_idx = anon_idx; kpd->kp_anon = amp->ahp; @@ -458,7 +460,7 @@ segkp_get_internal( kpd, vbase, len, flags, 1); } else if ((flags & KPD_NO_ANON) == 0) { - if (anon_resv(SEGKP_MAPLEN(len, flags)) == 0) { + if (anon_resv_zone(SEGKP_MAPLEN(len, flags), NULL) == 0) { if (flags & KPD_LOCKED) { atomic_add_long(&anon_segkp_pages_locked, -pages); @@ -468,6 +470,8 @@ segkp_get_internal( kmem_free(kpd, sizeof (struct segkp_data)); return (NULL); } + atomic_add_long(&anon_segkp_pages_resv, + btop(SEGKP_MAPLEN(len, flags))); anon_idx = ((uintptr_t)(vbase - s_base)) >> PAGESHIFT; kpd->kp_anon_idx = anon_idx; kpd->kp_anon = kpsd->kpsd_anon; @@ -704,7 +708,9 @@ segkp_release_internal(struct seg *seg, struct segkp_data *kpd, size_t len) if ((kpd->kp_flags & KPD_HASAMP) == 0) { anon_free(kpd->kp_anon, kpd->kp_anon_idx + i, PAGESIZE); - anon_unresv(PAGESIZE); + anon_unresv_zone(PAGESIZE, NULL); + atomic_add_long(&anon_segkp_pages_resv, + -1); } TRACE_5(TR_FAC_VM, TR_ANON_SEGKP, "anon segkp:%p %p %lu %u %u", diff --git a/usr/src/uts/common/vm/seg_vn.c b/usr/src/uts/common/vm/seg_vn.c index f48db44acc..e2069b27c6 100644 --- a/usr/src/uts/common/vm/seg_vn.c +++ b/usr/src/uts/common/vm/seg_vn.c @@ -2323,8 +2323,9 @@ segvn_faultpage( * zeroes. If no advance reservations, reserve now. */ if (svd->flags & MAP_NORESERVE) { - if (anon_resv(ptob(1))) { - svd->swresv += ptob(1); + if (anon_resv_zone(ptob(1), + seg->s_as->a_proc->p_zone)) { + atomic_add_long(&svd->swresv, ptob(1)); } else { err = ENOMEM; goto out; diff --git a/usr/src/uts/common/vm/vm_anon.c b/usr/src/uts/common/vm/vm_anon.c index 0cad34257c..3f225a345a 100644 --- a/usr/src/uts/common/vm/vm_anon.c +++ b/usr/src/uts/common/vm/vm_anon.c @@ -113,6 +113,7 @@ #include <sys/policy.h> #include <sys/condvar_impl.h> #include <sys/mutex_impl.h> +#include <sys/rctl.h> #include <vm/as.h> #include <vm/hat.h> @@ -729,12 +730,22 @@ set_anoninfo(void) * Return non-zero on success. */ int -anon_resvmem(size_t size, uint_t takemem) +anon_resvmem(size_t size, boolean_t takemem, zone_t *zone) { pgcnt_t npages = btopr(size); pgcnt_t mswap_pages = 0; pgcnt_t pswap_pages = 0; + proc_t *p = curproc; + if (zone != NULL && takemem) { + /* test zone.max-swap resource control */ + mutex_enter(&p->p_lock); + if (rctl_incr_swap(p, zone, ptob(npages)) != 0) { + mutex_exit(&p->p_lock); + return (0); + } + mutex_exit(&p->p_lock); + } mutex_enter(&anoninfo_lock); /* @@ -834,16 +845,17 @@ anon_resvmem(size_t size, uint_t takemem) mutex_exit(&anoninfo_lock); ANON_PRINT(A_RESV, ("anon_resvmem: not enough space from swapfs\n")); + if (zone != NULL && takemem) + rctl_decr_swap(zone, ptob(npages)); return (0); } } - /* * Give back an anon reservation. 
*/ void -anon_unresv(size_t size) +anon_unresvmem(size_t size, zone_t *zone) { pgcnt_t npages = btopr(size); spgcnt_t mem_free_pages = 0; @@ -851,6 +863,8 @@ anon_unresv(size_t size) #ifdef ANON_DEBUG pgcnt_t mem_resv; #endif + if (zone != NULL) + rctl_decr_swap(zone, size); mutex_enter(&anoninfo_lock); diff --git a/usr/src/uts/common/vm/vm_page.c b/usr/src/uts/common/vm/vm_page.c index 05bfe662be..adac07b766 100644 --- a/usr/src/uts/common/vm/vm_page.c +++ b/usr/src/uts/common/vm/vm_page.c @@ -77,7 +77,7 @@ #include <vm/pvn.h> #include <vm/seg_kmem.h> #include <vm/vm_dep.h> - +#include <sys/vm_usage.h> #include <fs/fs_subr.h> static int nopageage = 0; @@ -343,6 +343,7 @@ vm_init(void) (void) callb_add(callb_vm_cpr, 0, CB_CL_CPR_VM, "vm"); page_init_mem_config(); page_retire_init(); + vm_usage_init(); } /* diff --git a/usr/src/uts/common/vm/vm_seg.c b/usr/src/uts/common/vm/vm_seg.c index 50cc21cdf7..aed892969d 100644 --- a/usr/src/uts/common/vm/vm_seg.c +++ b/usr/src/uts/common/vm/vm_seg.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -54,12 +53,14 @@ #include <sys/cmn_err.h> #include <sys/callb.h> #include <sys/mem_config.h> +#include <sys/mman.h> #include <vm/hat.h> #include <vm/as.h> #include <vm/seg.h> #include <vm/seg_kmem.h> - +#include <vm/seg_spt.h> +#include <vm/seg_vn.h> /* * kstats for segment advise */ @@ -950,3 +951,48 @@ seg_pinit_mem_config(void) */ ASSERT(ret == 0); } + +extern struct seg_ops segvn_ops; +extern struct seg_ops segspt_shmops; + +/* + * Verify that segment is not a shared anonymous segment which reserves + * swap. zone.max-swap accounting (zone->zone_max_swap) cannot be transfered + * from one zone to another if any segments are shared. This is because the + * last process to exit will credit the swap reservation. This could lead + * to the swap being reserved by one zone, and credited to another. + */ +boolean_t +seg_can_change_zones(struct seg *seg) +{ + struct segvn_data *svd; + + if (seg->s_ops == &segspt_shmops) + return (B_FALSE); + + if (seg->s_ops == &segvn_ops) { + svd = (struct segvn_data *)seg->s_data; + if (svd->type == MAP_SHARED && + svd->amp != NULL && + svd->amp->swresv > 0) + return (B_FALSE); + } + return (B_TRUE); +} + +/* + * Return swap reserved by a segment backing a private mapping. 
+ */ +size_t +seg_swresv(struct seg *seg) +{ + struct segvn_data *svd; + size_t swap = 0; + + if (seg->s_ops == &segvn_ops) { + svd = (struct segvn_data *)seg->s_data; + if (svd->type == MAP_PRIVATE && svd->swresv > 0) + swap = svd->swresv; + } + return (swap); +} diff --git a/usr/src/uts/common/vm/vm_usage.c b/usr/src/uts/common/vm/vm_usage.c new file mode 100644 index 0000000000..32a8811e10 --- /dev/null +++ b/usr/src/uts/common/vm/vm_usage.c @@ -0,0 +1,1978 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * vm_usage + * + * This file implements the getvmusage() private system call. + * getvmusage() counts the amount of resident memory pages and swap + * reserved by the specified process collective. A "process collective" is + * the set of processes owned by a particular, zone, project, task, or user. + * + * rss and swap are counted so that for a given process collective, a page is + * only counted once. For example, this means that if multiple processes in + * the same project map the same page, then the project will only be charged + * once for that page. On the other hand, if two processes in different + * projects map the same page, then both projects will be charged + * for the page. + * + * The vm_getusage() calculation is implemented so that the first thread + * performs the rss/swap counting. Other callers will wait for that thread to + * finish, copying the results. This enables multiple rcapds and prstats to + * consume data from the same calculation. The results are also cached so that + * a caller interested in recent results can just copy them instead of starting + * a new calculation. The caller passes the maximium age (in seconds) of the + * data. If the cached data is young enough, the cache is copied, otherwise, + * a new calculation is executed and the cache is replaced with the new + * data. + * + * The rss calculation for each process collective is as follows: + * + * - Inspect flags, determine if counting rss for zones, projects, tasks, + * and/or users. + * - For each proc: + * - Figure out proc's collectives (zone, project, task, and/or user). + * - For each seg in proc's address space: + * - If seg is private: + * - Lookup anons in the amp. + * - For incore pages not previously visited each of the + * proc's collectives, add incore pagesize to each. + * collective. + * Anon's with a refcnt of 1 can be assummed to be not + * previously visited. + * - For address ranges without anons in the amp: + * - Lookup pages in underlying vnode. 
+ * - For incore pages not previously visiting for + * each of the proc's collectives, add incore + * pagesize to each collective. + * - If seg is shared: + * - Lookup pages in the shared amp or vnode. + * - For incore pages not previously visited for each of + * the proc's collectives, add incore pagesize to each + * collective. + * + * Swap is reserved by private segments, and shared anonymous segments. + * The only shared anon segments which do not reserve swap are ISM segments + * and schedctl segments, both of which can be identified by having + * amp->swresv == 0. + * + * The swap calculation for each collective is as follows: + * + * - Inspect flags, determine if counting rss for zones, projects, tasks, + * and/or users. + * - For each proc: + * - Figure out proc's collectives (zone, project, task, and/or user). + * - For each seg in proc's address space: + * - If seg is private: + * - Add svd->swresv pages to swap count for each of the + * proc's collectives. + * - If seg is anon, shared, and amp->swresv != 0 + * - For address ranges in amp not previously visited for + * each of the proc's collectives, add size of address + * range to the swap count for each collective. + * + * These two calculations are done simultaneously, with most of the work + * being done in vmu_calculate_seg(). The results of the calculation are + * copied into "vmu_data.vmu_cache_results". + * + * To perform the calculation, various things are tracked and cached: + * + * - incore/not-incore page ranges for all vnodes. + * (vmu_data.vmu_all_vnodes_hash) + * This eliminates looking up the same page more than once. + * + * - incore/not-incore page ranges for all shared amps. + * (vmu_data.vmu_all_amps_hash) + * This eliminates looking up the same page more than once. + * + * - visited page ranges for each collective. + * - per vnode (entity->vme_vnode_hash) + * - per shared amp (entity->vme_amp_hash) + * For accurate counting of map-shared and cow-shared pages. + * + * - visited private anons (refcnt > 1) for each collective. + * (entity->vme_anon_hash) + * For accurate counting of cow-shared pages. + * + * The common accounting structure is the vmu_entity_t, which represents + * collectives: + * + * - A zone. + * - A project, task, or user within a zone. + * - The entire system (vmu_data.vmu_system). + * - Each collapsed (col) project and user. This means a given projid or + * uid, regardless of which zone the process is in. For instance, + * project 0 in the global zone and project 0 in a non global zone are + * the same collapsed project. + * + * Each entity structure tracks which pages have been already visited for + * that entity (via previously inspected processes) so that these pages are + * not double counted. 
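A toy model (not from the source) of the charge-once-per-collective rule described above: each collective keeps its own visited set, so a page shared by two processes in the same project is charged to that project once, while the same page mapped by processes in two different projects is charged to both.

#include <stddef.h>

#define XX_NPAGES       64              /* toy page-index space */

typedef struct xx_collective {
        size_t          xc_rss_bytes;           /* accumulated rss */
        unsigned char   xc_seen[XX_NPAGES];     /* visited-page set */
} xx_collective_t;

/*
 * Charge one resident page (by index) to a collective; repeat visits
 * from other processes in the same collective are ignored.
 */
static void
xx_charge_page(xx_collective_t *c, unsigned int page, size_t pagesize)
{
        if (page >= XX_NPAGES || c->xc_seen[page])
                return;
        c->xc_seen[page] = 1;
        c->xc_rss_bytes += pagesize;
}

The real code keys its visited sets by anon pointer or by (object, page-range) bound rather than by a flat page index; that is what the vme_anon_hash, vme_vnode_hash, and vme_amp_hash fields defined below provide.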
+ */ + +#include <sys/errno.h> +#include <sys/types.h> +#include <sys/zone.h> +#include <sys/proc.h> +#include <sys/project.h> +#include <sys/task.h> +#include <sys/thread.h> +#include <sys/time.h> +#include <sys/mman.h> +#include <sys/modhash.h> +#include <sys/modhash_impl.h> +#include <sys/shm.h> +#include <sys/swap.h> +#include <sys/synch.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/vm_usage.h> +#include <sys/zone.h> +#include <vm/anon.h> +#include <vm/as.h> +#include <vm/seg_vn.h> +#include <vm/seg_spt.h> + +#define VMUSAGE_HASH_SIZE 512 + +#define VMUSAGE_TYPE_VNODE 1 +#define VMUSAGE_TYPE_AMP 2 +#define VMUSAGE_TYPE_ANON 3 + +#define VMUSAGE_BOUND_UNKNOWN 0 +#define VMUSAGE_BOUND_INCORE 1 +#define VMUSAGE_BOUND_NOT_INCORE 2 + +/* + * bounds for vnodes and shared amps + * Each bound is either entirely incore, entirely not in core, or + * entirely unknown. bounds are stored in order by offset. + */ +typedef struct vmu_bound { + struct vmu_bound *vmb_next; + pgcnt_t vmb_start; /* page offset in vnode/amp on which bound starts */ + pgcnt_t vmb_end; /* page offset in vnode/amp on which bound ends */ + char vmb_type; /* One of VMUSAGE_BOUND_* */ +} vmu_bound_t; + +/* + * hash of visited objects (vnodes or shared amps) + * key is address of vnode or amp. Bounds lists known incore/non-incore + * bounds for vnode/amp. + */ +typedef struct vmu_object { + struct vmu_object *vmo_next; /* free list */ + caddr_t vmo_key; + short vmo_type; + vmu_bound_t *vmo_bounds; +} vmu_object_t; + +/* + * Entity by which to count results. + * + * The entity structure keeps the current rss/swap counts for each entity + * (zone, project, etc), and hashes of vm structures that have already + * been visited for the entity. + * + * vme_next: links the list of all entities currently being counted by + * vmu_calculate(). + * + * vme_next_calc: links the list of entities related to the current process + * being counted by vmu_calculate_proc(). + * + * vmu_calculate_proc() walks all processes. For each process, it makes a + * list of the entities related to that process using vme_next_calc. This + * list changes each time vmu_calculate_proc() is called. + * + */ +typedef struct vmu_entity { + struct vmu_entity *vme_next; + struct vmu_entity *vme_next_calc; + mod_hash_t *vme_vnode_hash; /* vnodes visited for entity */ + mod_hash_t *vme_amp_hash; /* shared amps visited for entity */ + mod_hash_t *vme_anon_hash; /* cow anons visited for entity */ + vmusage_t vme_result; /* identifies entity and results */ +} vmu_entity_t; + +/* + * Hash of entities visited within a zone, and an entity for the zone + * itself. + */ +typedef struct vmu_zone { + struct vmu_zone *vmz_next; /* free list */ + id_t vmz_id; + vmu_entity_t *vmz_zone; + mod_hash_t *vmz_projects_hash; + mod_hash_t *vmz_tasks_hash; + mod_hash_t *vmz_rusers_hash; + mod_hash_t *vmz_eusers_hash; +} vmu_zone_t; + +/* + * Cache of results from last calculation + */ +typedef struct vmu_cache { + vmusage_t *vmc_results; /* Results from last call to */ + /* vm_getusage(). 
*/ + uint64_t vmc_nresults; /* Count of cached results */ + uint64_t vmc_refcnt; /* refcnt for free */ + uint_t vmc_flags; /* Flags for vm_getusage() */ + hrtime_t vmc_timestamp; /* when cache was created */ +} vmu_cache_t; + +/* + * top level rss info for the system + */ +typedef struct vmu_data { + kmutex_t vmu_lock; /* Protects vmu_data */ + kcondvar_t vmu_cv; /* Used to signal threads */ + /* Waiting for */ + /* Rss_calc_thread to finish */ + vmu_entity_t *vmu_system; /* Entity for tracking */ + /* rss/swap for all processes */ + /* in all zones */ + mod_hash_t *vmu_zones_hash; /* Zones visited */ + mod_hash_t *vmu_projects_col_hash; /* These *_col_hash hashes */ + mod_hash_t *vmu_rusers_col_hash; /* keep track of entities, */ + mod_hash_t *vmu_eusers_col_hash; /* ignoring zoneid, in order */ + /* to implement VMUSAGE_COL_* */ + /* flags, which aggregate by */ + /* project or user regardless */ + /* of zoneid. */ + mod_hash_t *vmu_all_vnodes_hash; /* System wide visited vnodes */ + /* to track incore/not-incore */ + mod_hash_t *vmu_all_amps_hash; /* System wide visited shared */ + /* amps to track incore/not- */ + /* incore */ + vmu_entity_t *vmu_entities; /* Linked list of entities */ + size_t vmu_nentities; /* Count of entities in list */ + vmu_cache_t *vmu_cache; /* Cached results */ + kthread_t *vmu_calc_thread; /* NULL, or thread running */ + /* vmu_calculate() */ + uint_t vmu_calc_flags; /* Flags being using by */ + /* currently running calc */ + /* thread */ + uint_t vmu_pending_flags; /* Flags of vm_getusage() */ + /* threads waiting for */ + /* calc thread to finish */ + uint_t vmu_pending_waiters; /* Number of threads waiting */ + /* for calc thread */ + vmu_bound_t *vmu_free_bounds; + vmu_object_t *vmu_free_objects; + vmu_entity_t *vmu_free_entities; + vmu_zone_t *vmu_free_zones; +} vmu_data_t; + +extern struct as kas; +extern proc_t *practive; +extern zone_t *global_zone; +extern struct seg_ops segvn_ops; +extern struct seg_ops segspt_shmops; + +static vmu_data_t vmu_data; +static kmem_cache_t *vmu_bound_cache; +static kmem_cache_t *vmu_object_cache; + +/* + * Save a bound on the free list + */ +static void +vmu_free_bound(vmu_bound_t *bound) +{ + bound->vmb_next = vmu_data.vmu_free_bounds; + vmu_data.vmu_free_bounds = bound; +} + +/* + * Free an object, and all visited bound info. + */ +static void +vmu_free_object(mod_hash_val_t val) +{ + vmu_object_t *obj = (vmu_object_t *)val; + vmu_bound_t *bound = obj->vmo_bounds; + vmu_bound_t *tmp; + + while (bound != NULL) { + tmp = bound; + bound = bound->vmb_next; + vmu_free_bound(tmp); + } + obj->vmo_next = vmu_data.vmu_free_objects; + vmu_data.vmu_free_objects = obj; +} + +/* + * Free an entity, and hashes of visited objects for that entity. + */ +static void +vmu_free_entity(mod_hash_val_t val) +{ + vmu_entity_t *entity = (vmu_entity_t *)val; + + if (entity->vme_vnode_hash != NULL) + i_mod_hash_clear_nosync(entity->vme_vnode_hash); + if (entity->vme_amp_hash != NULL) + i_mod_hash_clear_nosync(entity->vme_amp_hash); + if (entity->vme_anon_hash != NULL) + i_mod_hash_clear_nosync(entity->vme_anon_hash); + + entity->vme_next = vmu_data.vmu_free_entities; + vmu_data.vmu_free_entities = entity; +} + +/* + * Free zone entity, and all hashes of entities inside that zone, + * which are projects, tasks, and users. 
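The vmu_data_t fields above imply a single-calculator protocol: one caller runs vmu_calculate() while later callers wait on vmu_cv and then copy the refreshed cache. A minimal sketch of that protocol follows; it is not the actual vm_getusage() logic, which additionally honors the cache age and checks that the in-flight calculation covers the requested flags, and it assumes vmu_lock is dropped for the long-running calculation.

static void
xx_get_results(uint_t flags)
{
        mutex_enter(&vmu_data.vmu_lock);
        if (vmu_data.vmu_calc_thread == NULL) {
                /* No calculation in flight; this caller does the work. */
                vmu_data.vmu_calc_thread = curthread;
                vmu_data.vmu_calc_flags = flags;
                mutex_exit(&vmu_data.vmu_lock);

                vmu_calculate();        /* long-running; vmu_lock dropped */

                mutex_enter(&vmu_data.vmu_lock);
                /* ...refresh vmu_data.vmu_cache from the entity list... */
                vmu_data.vmu_calc_thread = NULL;
                cv_broadcast(&vmu_data.vmu_cv);
        } else {
                /* Piggy-back on the calculation already in flight. */
                vmu_data.vmu_pending_waiters++;
                while (vmu_data.vmu_calc_thread != NULL)
                        cv_wait(&vmu_data.vmu_cv, &vmu_data.vmu_lock);
                vmu_data.vmu_pending_waiters--;
        }
        /* Either way, results are now in vmu_data.vmu_cache. */
        mutex_exit(&vmu_data.vmu_lock);
}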
+ */ +static void +vmu_free_zone(mod_hash_val_t val) +{ + vmu_zone_t *zone = (vmu_zone_t *)val; + + if (zone->vmz_zone != NULL) { + vmu_free_entity((mod_hash_val_t)zone->vmz_zone); + zone->vmz_zone = NULL; + } + if (zone->vmz_projects_hash != NULL) + i_mod_hash_clear_nosync(zone->vmz_projects_hash); + if (zone->vmz_tasks_hash != NULL) + i_mod_hash_clear_nosync(zone->vmz_tasks_hash); + if (zone->vmz_rusers_hash != NULL) + i_mod_hash_clear_nosync(zone->vmz_rusers_hash); + if (zone->vmz_eusers_hash != NULL) + i_mod_hash_clear_nosync(zone->vmz_eusers_hash); + zone->vmz_next = vmu_data.vmu_free_zones; + vmu_data.vmu_free_zones = zone; +} + +/* + * Initialize synchronization primitives and hashes for system-wide tracking + * of visited vnodes and shared amps. Initialize results cache. + */ +void +vm_usage_init() +{ + mutex_init(&vmu_data.vmu_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&vmu_data.vmu_cv, NULL, CV_DEFAULT, NULL); + + vmu_data.vmu_system = NULL; + vmu_data.vmu_zones_hash = NULL; + vmu_data.vmu_projects_col_hash = NULL; + vmu_data.vmu_rusers_col_hash = NULL; + vmu_data.vmu_eusers_col_hash = NULL; + + vmu_data.vmu_free_bounds = NULL; + vmu_data.vmu_free_objects = NULL; + vmu_data.vmu_free_entities = NULL; + vmu_data.vmu_free_zones = NULL; + + vmu_data.vmu_all_vnodes_hash = mod_hash_create_ptrhash( + "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object, + sizeof (vnode_t)); + vmu_data.vmu_all_amps_hash = mod_hash_create_ptrhash( + "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object, + sizeof (struct anon_map)); + vmu_data.vmu_projects_col_hash = mod_hash_create_idhash( + "vmusage collapsed project hash", VMUSAGE_HASH_SIZE, + vmu_free_entity); + vmu_data.vmu_rusers_col_hash = mod_hash_create_idhash( + "vmusage collapsed ruser hash", VMUSAGE_HASH_SIZE, + vmu_free_entity); + vmu_data.vmu_eusers_col_hash = mod_hash_create_idhash( + "vmusage collpased euser hash", VMUSAGE_HASH_SIZE, + vmu_free_entity); + vmu_data.vmu_zones_hash = mod_hash_create_idhash( + "vmusage zone hash", VMUSAGE_HASH_SIZE, vmu_free_zone); + + vmu_bound_cache = kmem_cache_create("vmu_bound_cache", + sizeof (vmu_bound_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + vmu_object_cache = kmem_cache_create("vmu_object_cache", + sizeof (vmu_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + + vmu_data.vmu_entities = NULL; + vmu_data.vmu_nentities = 0; + + vmu_data.vmu_cache = NULL; + vmu_data.vmu_calc_thread = NULL; + vmu_data.vmu_calc_flags = 0; + vmu_data.vmu_pending_flags = 0; + vmu_data.vmu_pending_waiters = 0; +} + +/* + * Allocate hashes for tracking vm objects visited for an entity. + * Update list of entities. 
+ */ +static vmu_entity_t * +vmu_alloc_entity(id_t id, int type, id_t zoneid) +{ + vmu_entity_t *entity; + + if (vmu_data.vmu_free_entities != NULL) { + entity = vmu_data.vmu_free_entities; + vmu_data.vmu_free_entities = + vmu_data.vmu_free_entities->vme_next; + bzero(&entity->vme_result, sizeof (vmusage_t)); + } else { + entity = kmem_zalloc(sizeof (vmu_entity_t), KM_SLEEP); + } + entity->vme_result.vmu_id = id; + entity->vme_result.vmu_zoneid = zoneid; + entity->vme_result.vmu_type = type; + + if (entity->vme_vnode_hash == NULL) + entity->vme_vnode_hash = mod_hash_create_ptrhash( + "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object, + sizeof (vnode_t)); + + if (entity->vme_amp_hash == NULL) + entity->vme_amp_hash = mod_hash_create_ptrhash( + "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object, + sizeof (struct anon_map)); + + if (entity->vme_anon_hash == NULL) + entity->vme_anon_hash = mod_hash_create_ptrhash( + "vmusage anon hash", VMUSAGE_HASH_SIZE, + mod_hash_null_valdtor, sizeof (struct anon)); + + entity->vme_next = vmu_data.vmu_entities; + vmu_data.vmu_entities = entity; + vmu_data.vmu_nentities++; + + return (entity); +} + +/* + * Allocate a zone entity, and hashes for tracking visited vm objects + * for projects, tasks, and users within that zone. + */ +static vmu_zone_t * +vmu_alloc_zone(id_t id) +{ + vmu_zone_t *zone; + + if (vmu_data.vmu_free_zones != NULL) { + zone = vmu_data.vmu_free_zones; + vmu_data.vmu_free_zones = + vmu_data.vmu_free_zones->vmz_next; + zone->vmz_next = NULL; + zone->vmz_zone = NULL; + } else { + zone = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP); + } + + zone->vmz_id = id; + + if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0) + zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id); + + if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS | + VMUSAGE_ALL_PROJECTS)) != 0 && zone->vmz_projects_hash == NULL) + zone->vmz_projects_hash = mod_hash_create_idhash( + "vmusage project hash", VMUSAGE_HASH_SIZE, vmu_free_entity); + + if ((vmu_data.vmu_calc_flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) + != 0 && zone->vmz_tasks_hash == NULL) + zone->vmz_tasks_hash = mod_hash_create_idhash( + "vmusage task hash", VMUSAGE_HASH_SIZE, vmu_free_entity); + + if ((vmu_data.vmu_calc_flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) + != 0 && zone->vmz_rusers_hash == NULL) + zone->vmz_rusers_hash = mod_hash_create_idhash( + "vmusage ruser hash", VMUSAGE_HASH_SIZE, vmu_free_entity); + + if ((vmu_data.vmu_calc_flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) + != 0 && zone->vmz_eusers_hash == NULL) + zone->vmz_eusers_hash = mod_hash_create_idhash( + "vmusage euser hash", VMUSAGE_HASH_SIZE, vmu_free_entity); + + return (zone); +} + +/* + * Allocate a structure for tracking visited bounds for a vm object. + */ +static vmu_object_t * +vmu_alloc_object(caddr_t key, int type) +{ + vmu_object_t *object; + + if (vmu_data.vmu_free_objects != NULL) { + object = vmu_data.vmu_free_objects; + vmu_data.vmu_free_objects = + vmu_data.vmu_free_objects->vmo_next; + } else { + object = kmem_cache_alloc(vmu_object_cache, KM_SLEEP); + } + + object->vmo_key = key; + object->vmo_type = type; + object->vmo_bounds = NULL; + + return (object); +} + +/* + * Allocate and return a bound structure. 
+ */ +static vmu_bound_t * +vmu_alloc_bound() +{ + vmu_bound_t *bound; + + if (vmu_data.vmu_free_bounds != NULL) { + bound = vmu_data.vmu_free_bounds; + vmu_data.vmu_free_bounds = + vmu_data.vmu_free_bounds->vmb_next; + bzero(bound, sizeof (vmu_bound_t)); + } else { + bound = kmem_cache_alloc(vmu_bound_cache, KM_SLEEP); + bzero(bound, sizeof (vmu_bound_t)); + } + return (bound); +} + +/* + * vmu_find_insert_* functions implement hash lookup or allocate and + * insert operations. + */ +static vmu_object_t * +vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type) +{ + int ret; + vmu_object_t *object; + + ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key, + (mod_hash_val_t *)&object); + if (ret != 0) { + object = vmu_alloc_object(key, type); + ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key, + (mod_hash_val_t)object, (mod_hash_hndl_t)0); + ASSERT(ret == 0); + } + return (object); +} + +static int +vmu_find_insert_anon(mod_hash_t *hash, caddr_t key) +{ + int ret; + caddr_t val; + + ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key, + (mod_hash_val_t *)&val); + + if (ret == 0) + return (0); + + ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key, + (mod_hash_val_t)key, (mod_hash_hndl_t)0); + + ASSERT(ret == 0); + + return (1); +} + +static vmu_entity_t * +vmu_find_insert_entity(mod_hash_t *hash, id_t id, uint_t type, id_t zoneid) +{ + int ret; + vmu_entity_t *entity; + + ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)(uintptr_t)id, + (mod_hash_val_t *)&entity); + if (ret != 0) { + entity = vmu_alloc_entity(id, type, zoneid); + ret = i_mod_hash_insert_nosync(hash, + (mod_hash_key_t)(uintptr_t)id, (mod_hash_val_t)entity, + (mod_hash_hndl_t)0); + ASSERT(ret == 0); + } + return (entity); +} + + + + +/* + * Returns list of object bounds between start and end. New bounds inserted + * by this call are given type. + * + * Returns the number of pages covered if new bounds are created. Returns 0 + * if region between start/end consists of all existing bounds. + */ +static pgcnt_t +vmu_insert_lookup_object_bounds(vmu_object_t *ro, pgcnt_t start, pgcnt_t + end, char type, vmu_bound_t **first, vmu_bound_t **last) +{ + vmu_bound_t *next; + vmu_bound_t *prev = NULL; + vmu_bound_t *tmp = NULL; + pgcnt_t ret = 0; + + *first = *last = NULL; + + for (next = ro->vmo_bounds; next != NULL; next = next->vmb_next) { + /* + * Find bounds overlapping or overlapped by range [start,end]. 
+ */ + if (start > next->vmb_end) { + /* bound is before new bound */ + prev = next; + continue; + } + if (next->vmb_start > end) { + /* bound is after new bound */ + break; + } + if (*first == NULL) + *first = next; + *last = next; + } + + if (*first == NULL) { + ASSERT(*last == NULL); + /* + * No bounds overlapping range [start,end], so create new + * bound + */ + tmp = vmu_alloc_bound(); + tmp->vmb_start = start; + tmp->vmb_end = end; + tmp->vmb_type = type; + if (prev == NULL) { + tmp->vmb_next = ro->vmo_bounds; + ro->vmo_bounds = tmp; + } else { + tmp->vmb_next = prev->vmb_next; + prev->vmb_next = tmp; + } + *first = tmp; + *last = tmp; + ASSERT(tmp->vmb_end >= tmp->vmb_start); + ret = tmp->vmb_end - tmp->vmb_start + 1; + return (ret); + } + + /* Check to see if start is before first known bound */ + ASSERT(first != NULL && last != NULL); + next = (*first); + if (start < (*first)->vmb_start) { + /* Create new bound before first bound */ + tmp = vmu_alloc_bound(); + tmp->vmb_start = start; + tmp->vmb_end = (*first)->vmb_start - 1; + tmp->vmb_type = type; + tmp->vmb_next = *first; + if (*first == ro->vmo_bounds) + ro->vmo_bounds = tmp; + if (prev != NULL) + prev->vmb_next = tmp; + ASSERT(tmp->vmb_end >= tmp->vmb_start); + ret += tmp->vmb_end - tmp->vmb_start + 1; + *first = tmp; + } + /* + * Between start and end, search for gaps between and after existing + * bounds. Create new bounds to fill gaps if they exist. + */ + while (end > next->vmb_end) { + /* + * Check for gap between bound and next bound. if no gap, + * continue. + */ + if ((next != *last) && + ((next->vmb_end + 1) == next->vmb_next->vmb_start)) { + next = next->vmb_next; + continue; + } + /* + * Insert new bound in gap after bound, and before next + * bound if next bound exists. + */ + tmp = vmu_alloc_bound(); + tmp->vmb_type = type; + tmp->vmb_next = next->vmb_next; + tmp->vmb_start = next->vmb_end + 1; + + if (next != *last) { + tmp->vmb_end = next->vmb_next->vmb_start - 1; + ASSERT(tmp->vmb_end >= tmp->vmb_start); + ret += tmp->vmb_end - tmp->vmb_start + 1; + next->vmb_next = tmp; + next = tmp->vmb_next; + } else { + tmp->vmb_end = end; + ASSERT(tmp->vmb_end >= tmp->vmb_start); + ret += tmp->vmb_end - tmp->vmb_start + 1; + next->vmb_next = tmp; + *last = tmp; + break; + } + } + return (ret); +} + +/* + * vmu_update_bounds() + * + * first, last: list of continuous bounds, of which zero or more are of + * type VMUSAGE_BOUND_UNKNOWN. + * + * new_first, new_last: list of continuous bounds, of which none are of + * type VMUSAGE_BOUND_UNKNOWN. These bounds are used to + * update the types of bounds in (first,last) with + * type VMUSAGE_BOUND_UNKNOWN. + * + * For the list of bounds (first,last), this function updates any bounds + * with type VMUSAGE_BOUND_UNKNOWN using the type of the corresponding bound in + * the list (new_first, new_last). + * + * If a bound of type VMUSAGE_BOUND_UNKNOWN spans multiple bounds in the list + * (new_first, new_last), it will be split into multiple bounds. + * + * Return value: + * The number of pages in the list of bounds (first,last) that were of + * type VMUSAGE_BOUND_UNKNOWN, which have been updated to be of type + * VMUSAGE_BOUND_INCORE. 
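A concrete illustration of this contract (numbers invented for the example): if (first,last) is a single UNKNOWN bound spanning pages 5-20, and (new_first,new_last) is an INCORE bound [0,10] followed by a NOT_INCORE bound [11,30], then the unknown bound is split into [5,10] INCORE and [11,20] NOT_INCORE, and the function returns 6, the number of pages (5 through 10) that became INCORE.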
+ * + */ +static pgcnt_t +vmu_update_bounds(vmu_bound_t **first, vmu_bound_t **last, + vmu_bound_t *new_first, vmu_bound_t *new_last) +{ + vmu_bound_t *next, *new_next, *tmp; + pgcnt_t rss = 0; + + next = *first; + new_next = new_first; + + /* verify bounds span same pages */ + ASSERT((*first)->vmb_start >= new_next->vmb_start); + ASSERT((*last)->vmb_end <= new_last->vmb_end); + for (;;) { + /* If bound already has type, proceed to next bound */ + if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) { + if (next == *last) + break; + next = next->vmb_next; + continue; + } + while (new_next->vmb_end < next->vmb_start) + new_next = new_next->vmb_next; + ASSERT(new_next->vmb_type != VMUSAGE_BOUND_UNKNOWN); + next->vmb_type = new_next->vmb_type; + if (new_next->vmb_end < next->vmb_end) { + /* need to split bound */ + tmp = vmu_alloc_bound(); + tmp->vmb_type = VMUSAGE_BOUND_UNKNOWN; + tmp->vmb_start = new_next->vmb_end + 1; + tmp->vmb_end = next->vmb_end; + tmp->vmb_next = next->vmb_next; + next->vmb_end = new_next->vmb_end; + next->vmb_next = tmp; + if (*last == next) + *last = tmp; + if (next->vmb_type == VMUSAGE_BOUND_INCORE) + rss += next->vmb_end - next->vmb_start + 1; + next = tmp; + } else { + if (next->vmb_type == VMUSAGE_BOUND_INCORE) + rss += next->vmb_end - next->vmb_start + 1; + if (next == *last) + break; + next = next->vmb_next; + } + } + return (rss); +} + +/* + * merges adjacent bounds with same type between first and last bound. + * After merge, last pointer is no longer valid, as last bound may be + * merged away. + */ +static void +vmu_merge_bounds(vmu_bound_t **first, vmu_bound_t **last) +{ + vmu_bound_t *next; + vmu_bound_t *tmp; + + ASSERT(*first != NULL); + ASSERT(*last != NULL); + + next = *first; + while (next != *last) { + + /* If bounds are adjacent and have same type, merge them */ + if (((next->vmb_end + 1) == next->vmb_next->vmb_start) && + (next->vmb_type == next->vmb_next->vmb_type)) { + tmp = next->vmb_next; + next->vmb_end = tmp->vmb_end; + next->vmb_next = tmp->vmb_next; + vmu_free_bound(tmp); + if (tmp == *last) + *last = next; + } else { + next = next->vmb_next; + } + } +} + +/* + * Given an amp and a list of bounds, updates each bound's type with + * VMUSAGE_BOUND_INCORE or VMUSAGE_BOUND_NOT_INCORE. + * + * If a bound is partially incore, it will be split into two bounds. + * first and last may be modified, as bounds may be split into multiple + * bounds if the are partially incore/not-incore. + * + * Set incore to non-zero if bounds are already known to be incore + * + */ +static void +vmu_amp_update_incore_bounds(struct anon_map *amp, vmu_bound_t **first, + vmu_bound_t **last, boolean_t incore) +{ + vmu_bound_t *next; + vmu_bound_t *tmp; + pgcnt_t index; + short bound_type; + short page_type; + vnode_t *vn; + anoff_t off; + struct anon *ap; + + next = *first; + /* Shared anon slots don't change once set */ + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + for (;;) { + if (incore == B_TRUE) + next->vmb_type = VMUSAGE_BOUND_INCORE; + + if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) { + if (next == *last) + break; + next = next->vmb_next; + continue; + } + bound_type = next->vmb_type; + index = next->vmb_start; + while (index <= next->vmb_end) { + + /* + * These are used to determine how much to increment + * index when a large page is found. 
+ */ + page_t *page; + pgcnt_t pgcnt = 1; + uint_t pgshft; + pgcnt_t pgmsk; + + ap = anon_get_ptr(amp->ahp, index); + if (ap != NULL) + swap_xlate(ap, &vn, &off); + + if (ap != NULL && vn != NULL && vn->v_pages != NULL && + (page = page_exists(vn, off)) != NULL) { + page_type = VMUSAGE_BOUND_INCORE; + if (page->p_szc > 0) { + pgcnt = page_get_pagecnt(page->p_szc); + pgshft = page_get_shift(page->p_szc); + pgmsk = (0x1 << (pgshft - PAGESHIFT)) + - 1; + } + } else { + page_type = VMUSAGE_BOUND_NOT_INCORE; + } + if (bound_type == VMUSAGE_BOUND_UNKNOWN) { + next->vmb_type = page_type; + } else if (next->vmb_type != page_type) { + /* + * if current bound type does not match page + * type, need to split off new bound. + */ + tmp = vmu_alloc_bound(); + tmp->vmb_type = page_type; + tmp->vmb_start = index; + tmp->vmb_end = next->vmb_end; + tmp->vmb_next = next->vmb_next; + next->vmb_end = index - 1; + next->vmb_next = tmp; + if (*last == next) + *last = tmp; + next = tmp; + } + if (pgcnt > 1) { + /* + * If inside large page, jump to next large + * page + */ + index = (index & ~pgmsk) + pgcnt; + } else { + index++; + } + } + if (next == *last) { + ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN); + break; + } else + next = next->vmb_next; + } + ANON_LOCK_EXIT(&->a_rwlock); +} + +/* + * Same as vmu_amp_update_incore_bounds(), except for tracking + * incore-/not-incore for vnodes. + */ +static void +vmu_vnode_update_incore_bounds(vnode_t *vnode, vmu_bound_t **first, + vmu_bound_t **last) +{ + vmu_bound_t *next; + vmu_bound_t *tmp; + pgcnt_t index; + short bound_type; + short page_type; + + next = *first; + for (;;) { + if (vnode->v_pages == NULL) + next->vmb_type = VMUSAGE_BOUND_NOT_INCORE; + + if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) { + if (next == *last) + break; + next = next->vmb_next; + continue; + } + + bound_type = next->vmb_type; + index = next->vmb_start; + while (index <= next->vmb_end) { + + /* + * These are used to determine how much to increment + * index when a large page is found. + */ + page_t *page; + pgcnt_t pgcnt = 1; + uint_t pgshft; + pgcnt_t pgmsk; + + if (vnode->v_pages != NULL && + (page = page_exists(vnode, ptob(index))) != NULL) { + page_type = VMUSAGE_BOUND_INCORE; + if (page->p_szc > 0) { + pgcnt = page_get_pagecnt(page->p_szc); + pgshft = page_get_shift(page->p_szc); + pgmsk = (0x1 << (pgshft - PAGESHIFT)) + - 1; + } + } else { + page_type = VMUSAGE_BOUND_NOT_INCORE; + } + if (bound_type == VMUSAGE_BOUND_UNKNOWN) { + next->vmb_type = page_type; + } else if (next->vmb_type != page_type) { + /* + * if current bound type does not match page + * type, need to split off new bound. + */ + tmp = vmu_alloc_bound(); + tmp->vmb_type = page_type; + tmp->vmb_start = index; + tmp->vmb_end = next->vmb_end; + tmp->vmb_next = next->vmb_next; + next->vmb_end = index - 1; + next->vmb_next = tmp; + if (*last == next) + *last = tmp; + next = tmp; + } + if (pgcnt > 1) { + /* + * If inside large page, jump to next large + * page + */ + index = (index & ~pgmsk) + pgcnt; + } else { + index++; + } + } + if (next == *last) { + ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN); + break; + } else + next = next->vmb_next; + } +} + +/* + * Calculate the rss and swap consumed by a segment. vmu_entities is the + * list of entities to visit. For shared segments, the vnode or amp + * is looked up in each entity to see if has been already counted. Private + * anon pages are checked per entity to ensure that cow pages are not + * double counted. 
+ * + * For private mapped files, first the amp is checked for private pages. + * Bounds not backed by the amp are looked up in the vnode for each entity + * to avoid double counting of private COW vnode pages. + */ +static void +vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg) +{ + struct segvn_data *svd; + struct shm_data *shmd; + struct spt_data *sptd; + vmu_object_t *shared_object = NULL; + vmu_object_t *entity_object = NULL; + vmu_entity_t *entity; + vmusage_t *result; + vmu_bound_t *first = NULL; + vmu_bound_t *last = NULL; + vmu_bound_t *cur = NULL; + vmu_bound_t *e_first = NULL; + vmu_bound_t *e_last = NULL; + vmu_bound_t *tmp; + pgcnt_t p_index, s_index, p_start, p_end, s_start, s_end, rss, virt; + struct anon_map *private_amp = NULL; + boolean_t incore = B_FALSE; + boolean_t shared = B_FALSE; + int file = 0; + pgcnt_t swresv = 0; + pgcnt_t panon = 0; + + /* Can zero-length segments exist? Not sure, so parenoia */ + if (seg->s_size <= 0) + return; + + /* + * Figure out if there is a shared object (such as a named vnode or + * a shared amp, then figure out if there is a private amp, which + * identifies private pages. + */ + if (seg->s_ops == &segvn_ops) { + svd = (struct segvn_data *)seg->s_data; + if (svd->type == MAP_SHARED) + shared = B_TRUE; + else + swresv = svd->swresv; + + if (svd->vp != NULL) { + file = 1; + shared_object = vmu_find_insert_object( + vmu_data.vmu_all_vnodes_hash, (caddr_t)svd->vp, + VMUSAGE_TYPE_VNODE); + s_start = btop(svd->offset); + s_end = btop(svd->offset + seg->s_size) - 1; + } + if (svd->amp != NULL && svd->type == MAP_SHARED) { + ASSERT(shared_object == NULL); + shared_object = vmu_find_insert_object( + vmu_data.vmu_all_amps_hash, (caddr_t)svd->amp, + VMUSAGE_TYPE_AMP); + s_start = svd->anon_index; + s_end = svd->anon_index + btop(seg->s_size) - 1; + /* schedctl mappings are always in core */ + if (svd->amp->swresv == 0) + incore = B_TRUE; + } + if (svd->amp != NULL && svd->type == MAP_PRIVATE) { + private_amp = svd->amp; + p_start = svd->anon_index; + p_end = svd->anon_index + btop(seg->s_size) - 1; + } + } else if (seg->s_ops == &segspt_shmops) { + shared = B_TRUE; + shmd = (struct shm_data *)seg->s_data; + shared_object = vmu_find_insert_object( + vmu_data.vmu_all_amps_hash, (caddr_t)shmd->shm_amp, + VMUSAGE_TYPE_AMP); + s_start = 0; + s_end = btop(seg->s_size) - 1; + sptd = shmd->shm_sptseg->s_data; + + /* ism segments are always incore and do not reserve swap */ + if (sptd->spt_flags & SHM_SHARE_MMU) + incore = B_TRUE; + + } else { + return; + } + + /* + * If there is a private amp, count anon pages that exist. If an + * anon has a refcnt > 1 (cow sharing), then save the anon in a + * hash so that it is not double counted. + * + * If there is also a shared object, they figure out the bounds + * which are not mapped by the private amp. + */ + if (private_amp != NULL) { + + /* Enter as writer to prevent cow anons from being freed */ + ANON_LOCK_ENTER(&private_amp->a_rwlock, RW_WRITER); + + p_index = p_start; + s_index = s_start; + + while (p_index <= p_end) { + + pgcnt_t p_index_next; + pgcnt_t p_bound_size; + int cnt; + anoff_t off; + struct vnode *vn; + struct anon *ap; + page_t *page; /* For handling of large */ + pgcnt_t pgcnt = 1; /* pages */ + pgcnt_t pgstart; + pgcnt_t pgend; + uint_t pgshft; + pgcnt_t pgmsk; + + p_index_next = p_index; + ap = anon_get_next_ptr(private_amp->ahp, + &p_index_next); + + /* + * If next anon is past end of mapping, simulate + * end of anon so loop terminates. 
+ */ + if (p_index_next > p_end) { + p_index_next = p_end + 1; + ap = NULL; + } + /* + * For cow segments, keep track of bounds not + * backed by private amp so they can be looked + * up in the backing vnode + */ + if (p_index_next != p_index) { + + /* + * Compute index difference between anon and + * previous anon. + */ + p_bound_size = p_index_next - p_index - 1; + + if (shared_object != NULL) { + cur = vmu_alloc_bound(); + cur->vmb_next = NULL; + cur->vmb_start = s_index; + cur->vmb_end = s_index + p_bound_size; + cur->vmb_type = VMUSAGE_BOUND_UNKNOWN; + if (first == NULL) { + first = cur; + last = cur; + } else { + last->vmb_next = cur; + last = cur; + } + } + p_index = p_index + p_bound_size + 1; + s_index = s_index + p_bound_size + 1; + } + + /* Detect end of anons in amp */ + if (ap == NULL) + break; + + cnt = ap->an_refcnt; + swap_xlate(ap, &vn, &off); + + if (vn == NULL || vn->v_pages == NULL || + (page = page_exists(vn, off)) == NULL) { + p_index++; + s_index++; + continue; + } + + /* + * If large page is found, compute portion of large + * page in mapping, and increment indicies to the next + * large page. + */ + if (page->p_szc > 0) { + + pgcnt = page_get_pagecnt(page->p_szc); + pgshft = page_get_shift(page->p_szc); + pgmsk = (0x1 << (pgshft - PAGESHIFT)) - 1; + + /* First page in large page */ + pgstart = p_index & ~pgmsk; + /* Last page in large page */ + pgend = pgstart + pgcnt - 1; + /* + * Artifically end page if page extends past + * end of mapping. + */ + if (pgend > p_end) + pgend = p_end; + + /* + * Compute number of pages from large page + * which are mapped. + */ + pgcnt = pgend - p_index + 1; + + /* + * Point indicies at page after large page, + * or at page after end of mapping. + */ + p_index += pgcnt; + s_index += pgcnt; + } else { + p_index++; + s_index++; + } + + /* + * Assume anon structs with a refcnt + * of 1 are not cow shared, so there + * is no reason to track them per entity. + */ + if (cnt == 1) { + panon += pgcnt; + continue; + } + for (entity = vmu_entities; entity != NULL; + entity = entity->vme_next_calc) { + + result = &entity->vme_result; + /* + * Track cow anons per entity so + * they are not double counted. + */ + if (vmu_find_insert_anon(entity->vme_anon_hash, + (caddr_t)ap) == 0) + continue; + + result->vmu_rss_all += (pgcnt << PAGESHIFT); + result->vmu_rss_private += + (pgcnt << PAGESHIFT); + } + } + ANON_LOCK_EXIT(&private_amp->a_rwlock); + } + + /* Add up resident anon and swap reserved for private mappings */ + if (swresv > 0 || panon > 0) { + for (entity = vmu_entities; entity != NULL; + entity = entity->vme_next_calc) { + result = &entity->vme_result; + result->vmu_swap_all += swresv; + result->vmu_swap_private += swresv; + result->vmu_rss_all += (panon << PAGESHIFT); + result->vmu_rss_private += (panon << PAGESHIFT); + } + } + + /* Compute resident pages backing shared amp or named vnode */ + if (shared_object != NULL) { + if (first == NULL) { + /* + * No private amp, or private amp has no anon + * structs. This means entire segment is backed by + * the shared object. + */ + first = vmu_alloc_bound(); + first->vmb_next = NULL; + first->vmb_start = s_start; + first->vmb_end = s_end; + first->vmb_type = VMUSAGE_BOUND_UNKNOWN; + } + /* + * Iterate bounds not backed by private amp, and compute + * resident pages. 
+ */ + cur = first; + while (cur != NULL) { + + if (vmu_insert_lookup_object_bounds(shared_object, + cur->vmb_start, cur->vmb_end, VMUSAGE_BOUND_UNKNOWN, + &first, &last) > 0) { + /* new bounds, find incore/not-incore */ + if (shared_object->vmo_type == + VMUSAGE_TYPE_VNODE) + vmu_vnode_update_incore_bounds( + (vnode_t *) + shared_object->vmo_key, &first, + &last); + else + vmu_amp_update_incore_bounds( + (struct anon_map *) + shared_object->vmo_key, &first, + &last, incore); + vmu_merge_bounds(&first, &last); + } + for (entity = vmu_entities; entity != NULL; + entity = entity->vme_next_calc) { + + result = &entity->vme_result; + + entity_object = vmu_find_insert_object( + shared_object->vmo_type == + VMUSAGE_TYPE_VNODE ? entity->vme_vnode_hash: + entity->vme_amp_hash, + shared_object->vmo_key, + shared_object->vmo_type); + + virt = vmu_insert_lookup_object_bounds( + entity_object, cur->vmb_start, cur->vmb_end, + VMUSAGE_BOUND_UNKNOWN, &e_first, &e_last); + + if (virt == 0) + continue; + /* + * Range visited for this entity + */ + rss = vmu_update_bounds(&e_first, + &e_last, first, last); + result->vmu_rss_all += (rss << PAGESHIFT); + if (shared == B_TRUE && file == B_FALSE) { + /* shared anon mapping */ + result->vmu_swap_all += + (virt << PAGESHIFT); + result->vmu_swap_shared += + (virt << PAGESHIFT); + result->vmu_rss_shared += + (rss << PAGESHIFT); + } else if (shared == B_TRUE && file == B_TRUE) { + /* shared file mapping */ + result->vmu_rss_shared += + (rss << PAGESHIFT); + } else if (shared == B_FALSE && + file == B_TRUE) { + /* private file mapping */ + result->vmu_rss_private += + (rss << PAGESHIFT); + } + vmu_merge_bounds(&e_first, &e_last); + } + tmp = cur; + cur = cur->vmb_next; + vmu_free_bound(tmp); + } + } +} + +/* + * Based on the current calculation flags, find the relevant entities + * which are relative to the process. Then calculate each segment + * in the process'es address space for each relevant entity. 
+ */ +static void +vmu_calculate_proc(proc_t *p) +{ + vmu_entity_t *entities = NULL; + vmu_zone_t *zone; + vmu_entity_t *tmp; + struct as *as; + struct seg *seg; + int ret; + + /* Figure out which entities are being computed */ + if ((vmu_data.vmu_system) != NULL) { + tmp = vmu_data.vmu_system; + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & + (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS | + VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS | + VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS | + VMUSAGE_ALL_EUSERS)) { + ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash, + (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id, + (mod_hash_val_t *)&zone); + if (ret != 0) { + zone = vmu_alloc_zone(p->p_zone->zone_id); + ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash, + (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id, + (mod_hash_val_t)zone, (mod_hash_hndl_t)0); + ASSERT(ret == 0); + } + if (zone->vmz_zone != NULL) { + tmp = zone->vmz_zone; + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & + (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS)) { + tmp = vmu_find_insert_entity(zone->vmz_projects_hash, + p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS, + zone->vmz_id); + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & + (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) { + tmp = vmu_find_insert_entity(zone->vmz_tasks_hash, + p->p_task->tk_tkid, VMUSAGE_TASKS, zone->vmz_id); + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & + (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) { + tmp = vmu_find_insert_entity(zone->vmz_rusers_hash, + crgetruid(p->p_cred), VMUSAGE_RUSERS, zone->vmz_id); + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & + (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) { + tmp = vmu_find_insert_entity(zone->vmz_eusers_hash, + crgetuid(p->p_cred), VMUSAGE_EUSERS, zone->vmz_id); + tmp->vme_next_calc = entities; + entities = tmp; + } + } + /* Entities which collapse projects and users for all zones */ + if (vmu_data.vmu_calc_flags & VMUSAGE_COL_PROJECTS) { + tmp = vmu_find_insert_entity(vmu_data.vmu_projects_col_hash, + p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS, ALL_ZONES); + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & VMUSAGE_COL_RUSERS) { + tmp = vmu_find_insert_entity(vmu_data.vmu_rusers_col_hash, + crgetruid(p->p_cred), VMUSAGE_RUSERS, ALL_ZONES); + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & VMUSAGE_COL_EUSERS) { + tmp = vmu_find_insert_entity(vmu_data.vmu_eusers_col_hash, + crgetuid(p->p_cred), VMUSAGE_EUSERS, ALL_ZONES); + tmp->vme_next_calc = entities; + entities = tmp; + } + + ASSERT(entities != NULL); + /* process all segs in process's address space */ + as = p->p_as; + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + for (seg = AS_SEGFIRST(as); seg != NULL; + seg = AS_SEGNEXT(as, seg)) { + vmu_calculate_seg(entities, seg); + } + AS_LOCK_EXIT(as, &as->a_lock); +} + +/* + * Free data created by previous call to vmu_calculate(). 
+ */ +static void +vmu_clear_calc() +{ + if (vmu_data.vmu_system != NULL) + vmu_free_entity(vmu_data.vmu_system); + vmu_data.vmu_system = NULL; + if (vmu_data.vmu_zones_hash != NULL) + i_mod_hash_clear_nosync(vmu_data.vmu_zones_hash); + if (vmu_data.vmu_projects_col_hash != NULL) + i_mod_hash_clear_nosync(vmu_data.vmu_projects_col_hash); + if (vmu_data.vmu_rusers_col_hash != NULL) + i_mod_hash_clear_nosync(vmu_data.vmu_rusers_col_hash); + if (vmu_data.vmu_eusers_col_hash != NULL) + i_mod_hash_clear_nosync(vmu_data.vmu_eusers_col_hash); + + i_mod_hash_clear_nosync(vmu_data.vmu_all_vnodes_hash); + i_mod_hash_clear_nosync(vmu_data.vmu_all_amps_hash); +} + +/* + * Free unused data structures. These can result if the system workload + * decreases between calculations. + */ +static void +vmu_free_extra() +{ + vmu_bound_t *tb; + vmu_object_t *to; + vmu_entity_t *te; + vmu_zone_t *tz; + + while (vmu_data.vmu_free_bounds != NULL) { + tb = vmu_data.vmu_free_bounds; + vmu_data.vmu_free_bounds = vmu_data.vmu_free_bounds->vmb_next; + kmem_cache_free(vmu_bound_cache, tb); + } + while (vmu_data.vmu_free_objects != NULL) { + to = vmu_data.vmu_free_objects; + vmu_data.vmu_free_objects = + vmu_data.vmu_free_objects->vmo_next; + kmem_cache_free(vmu_object_cache, to); + } + while (vmu_data.vmu_free_entities != NULL) { + te = vmu_data.vmu_free_entities; + vmu_data.vmu_free_entities = + vmu_data.vmu_free_entities->vme_next; + if (te->vme_vnode_hash != NULL) + mod_hash_destroy_hash(te->vme_vnode_hash); + if (te->vme_amp_hash != NULL) + mod_hash_destroy_hash(te->vme_amp_hash); + if (te->vme_anon_hash != NULL) + mod_hash_destroy_hash(te->vme_anon_hash); + kmem_free(te, sizeof (vmu_entity_t)); + } + while (vmu_data.vmu_free_zones != NULL) { + tz = vmu_data.vmu_free_zones; + vmu_data.vmu_free_zones = + vmu_data.vmu_free_zones->vmz_next; + if (tz->vmz_projects_hash != NULL) + mod_hash_destroy_hash(tz->vmz_projects_hash); + if (tz->vmz_tasks_hash != NULL) + mod_hash_destroy_hash(tz->vmz_tasks_hash); + if (tz->vmz_rusers_hash != NULL) + mod_hash_destroy_hash(tz->vmz_rusers_hash); + if (tz->vmz_eusers_hash != NULL) + mod_hash_destroy_hash(tz->vmz_eusers_hash); + kmem_free(tz, sizeof (vmu_zone_t)); + } +} + +extern kcondvar_t *pr_pid_cv; + +/* + * Determine which entity types are relevant and allocate the hashes to + * track them. Then walk the process table and count rss and swap + * for each process'es address space. Address space object such as + * vnodes, amps and anons are tracked per entity, so that they are + * not double counted in the results. + * + */ +static void +vmu_calculate() +{ + int i = 0; + int ret; + proc_t *p; + + vmu_clear_calc(); + + if (vmu_data.vmu_calc_flags & VMUSAGE_SYSTEM) + vmu_data.vmu_system = vmu_alloc_entity(0, VMUSAGE_SYSTEM, + ALL_ZONES); + + /* + * Walk process table and calculate rss of each proc. + * + * Pidlock and p_lock cannot be held while doing the rss calculation. + * This is because: + * 1. The calculation allocates using KM_SLEEP. + * 2. The calculation grabs a_lock, which cannot be grabbed + * after p_lock. + * + * Since pidlock must be dropped, we cannot simply just walk the + * practive list. Instead, we walk the process table, and sprlock + * each process to ensure that it does not exit during the + * calculation. 
+ */ + + mutex_enter(&pidlock); + for (i = 0; i < v.v_proc; i++) { +again: + p = pid_entry(i); + if (p == NULL) + continue; + + mutex_enter(&p->p_lock); + mutex_exit(&pidlock); + + if (panicstr) { + mutex_exit(&p->p_lock); + return; + } + + /* Try to set P_PR_LOCK */ + ret = sprtrylock_proc(p); + if (ret == -1) { + /* Process in invalid state */ + mutex_exit(&p->p_lock); + mutex_enter(&pidlock); + continue; + } else if (ret == 1) { + /* + * P_PR_LOCK is already set. Wait and try again. + * This also drops p_lock. + */ + sprwaitlock_proc(p); + mutex_enter(&pidlock); + goto again; + } + mutex_exit(&p->p_lock); + + vmu_calculate_proc(p); + + mutex_enter(&p->p_lock); + sprunlock(p); + mutex_enter(&pidlock); + } + mutex_exit(&pidlock); + + vmu_free_extra(); +} + +/* + * allocate a new cache for N results satisfying flags + */ +vmu_cache_t * +vmu_cache_alloc(size_t nres, uint_t flags) +{ + vmu_cache_t *cache; + + cache = kmem_zalloc(sizeof (vmu_cache_t), KM_SLEEP); + cache->vmc_results = kmem_zalloc(sizeof (vmusage_t) * nres, KM_SLEEP); + cache->vmc_nresults = nres; + cache->vmc_flags = flags; + cache->vmc_refcnt = 1; + return (cache); +} + +/* + * Make sure cached results are not freed + */ +static void +vmu_cache_hold(vmu_cache_t *cache) +{ + ASSERT(MUTEX_HELD(&vmu_data.vmu_lock)); + cache->vmc_refcnt++; +} + +/* + * free cache data + */ +static void +vmu_cache_rele(vmu_cache_t *cache) +{ + ASSERT(MUTEX_HELD(&vmu_data.vmu_lock)); + ASSERT(cache->vmc_refcnt > 0); + cache->vmc_refcnt--; + if (cache->vmc_refcnt == 0) { + kmem_free(cache->vmc_results, sizeof (vmusage_t) * + cache->vmc_nresults); + kmem_free(cache, sizeof (vmu_cache_t)); + } +} + +/* + * Copy out the cached results to a caller. Inspect the callers flags + * and zone to determine which cached results should be copied. + */ +static int +vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres, + uint_t flags) +{ + vmusage_t *result, *out_result; + vmusage_t dummy; + size_t i, count = 0; + size_t bufsize; + int ret = 0; + uint_t types = 0; + + if (nres != NULL) { + if (copyin((caddr_t)nres, &bufsize, sizeof (size_t))) + return (set_errno(EFAULT)); + } else { + bufsize = 0; + } + + /* figure out what results the caller is interested in. */ + if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone) + types |= VMUSAGE_SYSTEM; + if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) + types |= VMUSAGE_ZONE; + if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS | + VMUSAGE_COL_PROJECTS)) + types |= VMUSAGE_PROJECTS; + if (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) + types |= VMUSAGE_TASKS; + if (flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) + types |= VMUSAGE_RUSERS; + if (flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) + types |= VMUSAGE_EUSERS; + + /* count results for current zone */ + out_result = buf; + for (result = cache->vmc_results, i = 0; + i < cache->vmc_nresults; result++, i++) { + + /* Do not return "other-zone" results to non-global zones */ + if (curproc->p_zone != global_zone && + curproc->p_zone->zone_id != result->vmu_zoneid) + continue; + + /* + * If non-global zone requests VMUSAGE_SYSTEM, fake + * up VMUSAGE_ZONE result as VMUSAGE_SYSTEM result. 
+ */
+		if (curproc->p_zone != global_zone &&
+		    (flags & VMUSAGE_SYSTEM) != 0 &&
+		    result->vmu_type == VMUSAGE_ZONE) {
+			count++;
+			if (out_result != NULL) {
+				if (bufsize < count) {
+					ret = set_errno(EOVERFLOW);
+				} else {
+					dummy = *result;
+					dummy.vmu_zoneid = ALL_ZONES;
+					dummy.vmu_id = 0;
+					dummy.vmu_type = VMUSAGE_SYSTEM;
+					if (copyout(&dummy, out_result,
+					    sizeof (vmusage_t)))
+						return (set_errno(
+						    EFAULT));
+					out_result++;
+				}
+			}
+		}
+
+		/* Skip results that do not match requested type */
+		if ((result->vmu_type & types) == 0)
+			continue;
+
+		/* Skip collated results if not requested */
+		if (result->vmu_zoneid == ALL_ZONES) {
+			if (result->vmu_type == VMUSAGE_PROJECTS &&
+			    (flags & VMUSAGE_COL_PROJECTS) == 0)
+				continue;
+			if (result->vmu_type == VMUSAGE_EUSERS &&
+			    (flags & VMUSAGE_COL_EUSERS) == 0)
+				continue;
+			if (result->vmu_type == VMUSAGE_RUSERS &&
+			    (flags & VMUSAGE_COL_RUSERS) == 0)
+				continue;
+		}
+
+		/* Skip "other zone" results if not requested */
+		if (result->vmu_zoneid != curproc->p_zone->zone_id) {
+			if (result->vmu_type == VMUSAGE_ZONE &&
+			    (flags & VMUSAGE_ALL_ZONES) == 0)
+				continue;
+			if (result->vmu_type == VMUSAGE_PROJECTS &&
+			    (flags & (VMUSAGE_ALL_PROJECTS |
+			    VMUSAGE_COL_PROJECTS)) == 0)
+				continue;
+			if (result->vmu_type == VMUSAGE_TASKS &&
+			    (flags & VMUSAGE_ALL_TASKS) == 0)
+				continue;
+			if (result->vmu_type == VMUSAGE_RUSERS &&
+			    (flags & (VMUSAGE_ALL_RUSERS |
+			    VMUSAGE_COL_RUSERS)) == 0)
+				continue;
+			if (result->vmu_type == VMUSAGE_EUSERS &&
+			    (flags & (VMUSAGE_ALL_EUSERS |
+			    VMUSAGE_COL_EUSERS)) == 0)
+				continue;
+		}
+		count++;
+		if (out_result != NULL) {
+			if (bufsize < count) {
+				ret = set_errno(EOVERFLOW);
+			} else {
+				if (copyout(result, out_result,
+				    sizeof (vmusage_t)))
+					return (set_errno(EFAULT));
+				out_result++;
+			}
+		}
+	}
+	if (nres != NULL)
+		if (copyout(&count, (void *)nres, sizeof (size_t)))
+			return (set_errno(EFAULT));
+
+	return (ret);
+}
+
+/*
+ * vm_getusage()
+ *
+ * Counts rss and swap by zone, project, task, and/or user. The flags argument
+ * determines the type of results structures returned. Flags requesting
+ * results from more than one zone are "flattened" to the local zone if the
+ * caller is not the global zone.
+ *
+ * args:
+ *	flags:	bitmap consisting of one or more of VMUSAGE_*.
+ *	age:	maximum allowable age (time since counting was done) in
+ *		seconds of the results. Results from previous callers are
+ *		cached in the kernel.
+ *	buf:	pointer to buffer array of vmusage_t. If NULL, then only nres
+ *		is set on success.
+ *	nres:	Set to the number of vmusage_t structures pointed to by buf
+ *		before calling vm_getusage().
+ *		On return of 0 (success) or EOVERFLOW, it is set to the number
+ *		of result structures returned or attempted to be returned.
+ *
+ * returns 0 on success, -1 on failure:
+ *	EINTR (interrupted)
+ *	EOVERFLOW (nres too small for the results; nres set to the value
+ *	    needed for success)
+ *	EINVAL (flags invalid)
+ *	EFAULT (bad address for buf or nres)
+ */
+int
+vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres)
+{
+	vmu_entity_t *entity;
+	vmusage_t *result;
+	int ret = 0;
+	int cacherecent = 0;
+	hrtime_t now;
+	uint_t flags_orig;
+
+	/*
+	 * Non-global zones cannot request system-wide and/or collated
+	 * results, or the system result, so munge the flags accordingly.
+ */ + flags_orig = flags; + if (curproc->p_zone != global_zone) { + if (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) { + flags &= ~(VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS); + flags |= VMUSAGE_PROJECTS; + } + if (flags & (VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) { + flags &= ~(VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS); + flags |= VMUSAGE_RUSERS; + } + if (flags & (VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) { + flags &= ~(VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS); + flags |= VMUSAGE_EUSERS; + } + if (flags & VMUSAGE_SYSTEM) { + flags &= ~VMUSAGE_SYSTEM; + flags |= VMUSAGE_ZONE; + } + } + + /* Check for unknown flags */ + if ((flags & (~VMUSAGE_MASK)) != 0) + return (set_errno(EINVAL)); + + /* Check for no flags */ + if ((flags & VMUSAGE_MASK) == 0) + return (set_errno(EINVAL)); + + mutex_enter(&vmu_data.vmu_lock); + now = gethrtime(); + +start: + if (vmu_data.vmu_cache != NULL) { + + vmu_cache_t *cache; + + if ((vmu_data.vmu_cache->vmc_timestamp + + ((hrtime_t)age * NANOSEC)) > now) + cacherecent = 1; + + if ((vmu_data.vmu_cache->vmc_flags & flags) == flags && + cacherecent == 1) { + cache = vmu_data.vmu_cache; + vmu_cache_hold(cache); + mutex_exit(&vmu_data.vmu_lock); + + ret = vmu_copyout_results(cache, buf, nres, flags_orig); + mutex_enter(&vmu_data.vmu_lock); + vmu_cache_rele(cache); + if (vmu_data.vmu_pending_waiters > 0) + cv_broadcast(&vmu_data.vmu_cv); + mutex_exit(&vmu_data.vmu_lock); + return (ret); + } + /* + * If the cache is recent, it is likely that there are other + * consumers of vm_getusage running, so add their flags to the + * desired flags for the calculation. + */ + if (cacherecent == 1) + flags = vmu_data.vmu_cache->vmc_flags | flags; + } + if (vmu_data.vmu_calc_thread == NULL) { + + vmu_cache_t *cache; + + vmu_data.vmu_calc_thread = curthread; + vmu_data.vmu_calc_flags = flags; + vmu_data.vmu_entities = NULL; + vmu_data.vmu_nentities = 0; + if (vmu_data.vmu_pending_waiters > 0) + vmu_data.vmu_calc_flags |= + vmu_data.vmu_pending_flags; + + vmu_data.vmu_pending_flags = 0; + mutex_exit(&vmu_data.vmu_lock); + vmu_calculate(); + mutex_enter(&vmu_data.vmu_lock); + /* copy results to cache */ + if (vmu_data.vmu_cache != NULL) + vmu_cache_rele(vmu_data.vmu_cache); + cache = vmu_data.vmu_cache = + vmu_cache_alloc(vmu_data.vmu_nentities, + vmu_data.vmu_calc_flags); + + result = cache->vmc_results; + for (entity = vmu_data.vmu_entities; entity != NULL; + entity = entity->vme_next) { + *result = entity->vme_result; + result++; + } + cache->vmc_timestamp = gethrtime(); + vmu_cache_hold(cache); + + vmu_data.vmu_calc_flags = 0; + vmu_data.vmu_calc_thread = NULL; + + if (vmu_data.vmu_pending_waiters > 0) + cv_broadcast(&vmu_data.vmu_cv); + + mutex_exit(&vmu_data.vmu_lock); + + /* copy cache */ + ret = vmu_copyout_results(cache, buf, nres, flags_orig); + mutex_enter(&vmu_data.vmu_lock); + vmu_cache_rele(cache); + mutex_exit(&vmu_data.vmu_lock); + + return (ret); + } + vmu_data.vmu_pending_flags |= flags; + vmu_data.vmu_pending_waiters++; + while (vmu_data.vmu_calc_thread != NULL) { + if (cv_wait_sig(&vmu_data.vmu_cv, + &vmu_data.vmu_lock) == 0) { + vmu_data.vmu_pending_waiters--; + mutex_exit(&vmu_data.vmu_lock); + return (set_errno(EINTR)); + } + } + vmu_data.vmu_pending_waiters--; + goto start; +} |
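
To make the contract in the vm_getusage() block comment concrete, here is a minimal consumer-side sketch of the intended two-pass pattern: query the result count with buf == NULL, size a buffer, then fetch. It is illustrative only; it assumes the vmusage_t and VMUSAGE_* definitions from the new <sys/vm_usage.h>, and a userland entry point with the same signature as vm_getusage(), called getvmusage() here, which is an assumption rather than something this change introduces. Only the vmu_type, vmu_id and vmu_zoneid fields referenced in the kernel code above are used.

/*
 * Hypothetical userland consumer of the interface documented above.
 * getvmusage() stands in for whatever wrapper exposes vm_getusage();
 * its name and availability are assumptions, and its signature simply
 * mirrors the kernel entry point. Results up to 5 seconds old are
 * acceptable to this caller.
 */
#include <sys/types.h>
#include <sys/vm_usage.h>
#include <stdio.h>
#include <stdlib.h>

extern int getvmusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres);

int
main(void)
{
	uint_t flags = VMUSAGE_ZONE | VMUSAGE_RUSERS;
	size_t nres = 0;
	vmusage_t *buf;
	size_t i;

	/* First pass: buf == NULL, so only the required count is returned. */
	if (getvmusage(flags, 5, NULL, &nres) != 0)
		return (1);

	buf = calloc(nres, sizeof (vmusage_t));
	if (buf == NULL)
		return (1);

	/*
	 * Second pass: fetch the results. If usage grew in between, the
	 * call fails with EOVERFLOW and nres is updated to the new count,
	 * so a robust consumer would resize and retry; that loop is
	 * omitted here for brevity.
	 */
	if (getvmusage(flags, 5, buf, &nres) != 0) {
		perror("getvmusage");
		free(buf);
		return (1);
	}

	for (i = 0; i < nres; i++)
		(void) printf("type=0x%x id=%d zone=%d\n",
		    buf[i].vmu_type, (int)buf[i].vmu_id,
		    (int)buf[i].vmu_zoneid);

	free(buf);
	return (0);
}

Because results are cached in the kernel, concurrent callers whose flags fall within another caller's recent calculation (inside the age window) are satisfied from the same scan rather than triggering a new walk of the process table.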

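A second hedged sketch, complementing the program above: a global-zone caller can request every zone's total plus real-user usage collated across zones in one call, and then distinguish the rows by vmu_type and vmu_zoneid. The constant below mirrors the kernel's ALL_ZONES sentinel that the code above assigns to collated rows; whether and how that value is exposed to userland is an assumption, as is the getvmusage() wrapper.

/*
 * Hypothetical companion to the program above: classify the rows a
 * global-zone caller gets back from a request for per-zone totals
 * (VMUSAGE_ALL_ZONES) plus real-user usage collated across zones
 * (VMUSAGE_COL_RUSERS). VMU_COLLATED mirrors the kernel's ALL_ZONES
 * sentinel for collated rows; its value here is an assumption.
 */
#include <sys/types.h>
#include <sys/vm_usage.h>
#include <stdio.h>

#define	VMU_COLLATED	((zoneid_t)-1)

static void
print_rows(const vmusage_t *res, size_t nres)
{
	size_t i;

	for (i = 0; i < nres; i++) {
		if (res[i].vmu_type == VMUSAGE_ZONE) {
			/* One row per zone, keyed by zone id. */
			(void) printf("zone %d\n", (int)res[i].vmu_zoneid);
		} else if (res[i].vmu_type == VMUSAGE_RUSERS &&
		    res[i].vmu_zoneid == VMU_COLLATED) {
			/* One row per real uid, summed over all zones. */
			(void) printf("ruser %d (all zones)\n",
			    (int)res[i].vmu_id);
		}
	}
}

The request itself would pass VMUSAGE_ALL_ZONES | VMUSAGE_COL_RUSERS as the flags in the two-pass pattern shown earlier; issued from a non-global zone, the same request is flattened to the caller's own zone, as the vm_getusage() block comment and the flag munging at the top of the function describe.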