Diffstat (limited to 'usr/src/uts/common/vm/vm_usage.c')
-rw-r--r--   usr/src/uts/common/vm/vm_usage.c   1978
1 file changed, 1978 insertions, 0 deletions
diff --git a/usr/src/uts/common/vm/vm_usage.c b/usr/src/uts/common/vm/vm_usage.c new file mode 100644 index 0000000000..32a8811e10 --- /dev/null +++ b/usr/src/uts/common/vm/vm_usage.c @@ -0,0 +1,1978 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * vm_usage + * + * This file implements the getvmusage() private system call. + * getvmusage() counts the amount of resident memory pages and swap + * reserved by the specified process collective. A "process collective" is + * the set of processes owned by a particular, zone, project, task, or user. + * + * rss and swap are counted so that for a given process collective, a page is + * only counted once. For example, this means that if multiple processes in + * the same project map the same page, then the project will only be charged + * once for that page. On the other hand, if two processes in different + * projects map the same page, then both projects will be charged + * for the page. + * + * The vm_getusage() calculation is implemented so that the first thread + * performs the rss/swap counting. Other callers will wait for that thread to + * finish, copying the results. This enables multiple rcapds and prstats to + * consume data from the same calculation. The results are also cached so that + * a caller interested in recent results can just copy them instead of starting + * a new calculation. The caller passes the maximium age (in seconds) of the + * data. If the cached data is young enough, the cache is copied, otherwise, + * a new calculation is executed and the cache is replaced with the new + * data. + * + * The rss calculation for each process collective is as follows: + * + * - Inspect flags, determine if counting rss for zones, projects, tasks, + * and/or users. + * - For each proc: + * - Figure out proc's collectives (zone, project, task, and/or user). + * - For each seg in proc's address space: + * - If seg is private: + * - Lookup anons in the amp. + * - For incore pages not previously visited each of the + * proc's collectives, add incore pagesize to each. + * collective. + * Anon's with a refcnt of 1 can be assummed to be not + * previously visited. + * - For address ranges without anons in the amp: + * - Lookup pages in underlying vnode. + * - For incore pages not previously visiting for + * each of the proc's collectives, add incore + * pagesize to each collective. + * - If seg is shared: + * - Lookup pages in the shared amp or vnode. + * - For incore pages not previously visited for each of + * the proc's collectives, add incore pagesize to each + * collective. 
+ * + * Swap is reserved by private segments, and shared anonymous segments. + * The only shared anon segments which do not reserve swap are ISM segments + * and schedctl segments, both of which can be identified by having + * amp->swresv == 0. + * + * The swap calculation for each collective is as follows: + * + * - Inspect flags, determine if counting rss for zones, projects, tasks, + * and/or users. + * - For each proc: + * - Figure out proc's collectives (zone, project, task, and/or user). + * - For each seg in proc's address space: + * - If seg is private: + * - Add svd->swresv pages to swap count for each of the + * proc's collectives. + * - If seg is anon, shared, and amp->swresv != 0 + * - For address ranges in amp not previously visited for + * each of the proc's collectives, add size of address + * range to the swap count for each collective. + * + * These two calculations are done simultaneously, with most of the work + * being done in vmu_calculate_seg(). The results of the calculation are + * copied into "vmu_data.vmu_cache_results". + * + * To perform the calculation, various things are tracked and cached: + * + * - incore/not-incore page ranges for all vnodes. + * (vmu_data.vmu_all_vnodes_hash) + * This eliminates looking up the same page more than once. + * + * - incore/not-incore page ranges for all shared amps. + * (vmu_data.vmu_all_amps_hash) + * This eliminates looking up the same page more than once. + * + * - visited page ranges for each collective. + * - per vnode (entity->vme_vnode_hash) + * - per shared amp (entity->vme_amp_hash) + * For accurate counting of map-shared and cow-shared pages. + * + * - visited private anons (refcnt > 1) for each collective. + * (entity->vme_anon_hash) + * For accurate counting of cow-shared pages. + * + * The common accounting structure is the vmu_entity_t, which represents + * collectives: + * + * - A zone. + * - A project, task, or user within a zone. + * - The entire system (vmu_data.vmu_system). + * - Each collapsed (col) project and user. This means a given projid or + * uid, regardless of which zone the process is in. For instance, + * project 0 in the global zone and project 0 in a non global zone are + * the same collapsed project. + * + * Each entity structure tracks which pages have been already visited for + * that entity (via previously inspected processes) so that these pages are + * not double counted. + */ + +#include <sys/errno.h> +#include <sys/types.h> +#include <sys/zone.h> +#include <sys/proc.h> +#include <sys/project.h> +#include <sys/task.h> +#include <sys/thread.h> +#include <sys/time.h> +#include <sys/mman.h> +#include <sys/modhash.h> +#include <sys/modhash_impl.h> +#include <sys/shm.h> +#include <sys/swap.h> +#include <sys/synch.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/vm_usage.h> +#include <sys/zone.h> +#include <vm/anon.h> +#include <vm/as.h> +#include <vm/seg_vn.h> +#include <vm/seg_spt.h> + +#define VMUSAGE_HASH_SIZE 512 + +#define VMUSAGE_TYPE_VNODE 1 +#define VMUSAGE_TYPE_AMP 2 +#define VMUSAGE_TYPE_ANON 3 + +#define VMUSAGE_BOUND_UNKNOWN 0 +#define VMUSAGE_BOUND_INCORE 1 +#define VMUSAGE_BOUND_NOT_INCORE 2 + +/* + * bounds for vnodes and shared amps + * Each bound is either entirely incore, entirely not in core, or + * entirely unknown. bounds are stored in order by offset. 
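 + * For example, a vnode might be described by three bounds: pages + * [0,49] incore, [50,89] not incore, and [90,99] not yet inspected + * (unknown).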
+ */ +typedef struct vmu_bound { + struct vmu_bound *vmb_next; + pgcnt_t vmb_start; /* page offset in vnode/amp on which bound starts */ + pgcnt_t vmb_end; /* page offset in vnode/amp on which bound ends */ + char vmb_type; /* One of VMUSAGE_BOUND_* */ +} vmu_bound_t; + +/* + * hash of visited objects (vnodes or shared amps) + * key is address of vnode or amp. Bounds lists known incore/non-incore + * bounds for vnode/amp. + */ +typedef struct vmu_object { + struct vmu_object *vmo_next; /* free list */ + caddr_t vmo_key; + short vmo_type; + vmu_bound_t *vmo_bounds; +} vmu_object_t; + +/* + * Entity by which to count results. + * + * The entity structure keeps the current rss/swap counts for each entity + * (zone, project, etc), and hashes of vm structures that have already + * been visited for the entity. + * + * vme_next: links the list of all entities currently being counted by + * vmu_calculate(). + * + * vme_next_calc: links the list of entities related to the current process + * being counted by vmu_calculate_proc(). + * + * vmu_calculate_proc() walks all processes. For each process, it makes a + * list of the entities related to that process using vme_next_calc. This + * list changes each time vmu_calculate_proc() is called. + * + */ +typedef struct vmu_entity { + struct vmu_entity *vme_next; + struct vmu_entity *vme_next_calc; + mod_hash_t *vme_vnode_hash; /* vnodes visited for entity */ + mod_hash_t *vme_amp_hash; /* shared amps visited for entity */ + mod_hash_t *vme_anon_hash; /* cow anons visited for entity */ + vmusage_t vme_result; /* identifies entity and results */ +} vmu_entity_t; + +/* + * Hash of entities visited within a zone, and an entity for the zone + * itself. + */ +typedef struct vmu_zone { + struct vmu_zone *vmz_next; /* free list */ + id_t vmz_id; + vmu_entity_t *vmz_zone; + mod_hash_t *vmz_projects_hash; + mod_hash_t *vmz_tasks_hash; + mod_hash_t *vmz_rusers_hash; + mod_hash_t *vmz_eusers_hash; +} vmu_zone_t; + +/* + * Cache of results from last calculation + */ +typedef struct vmu_cache { + vmusage_t *vmc_results; /* Results from last call to */ + /* vm_getusage(). */ + uint64_t vmc_nresults; /* Count of cached results */ + uint64_t vmc_refcnt; /* refcnt for free */ + uint_t vmc_flags; /* Flags for vm_getusage() */ + hrtime_t vmc_timestamp; /* when cache was created */ +} vmu_cache_t; + +/* + * top level rss info for the system + */ +typedef struct vmu_data { + kmutex_t vmu_lock; /* Protects vmu_data */ + kcondvar_t vmu_cv; /* Used to signal threads */ + /* Waiting for */ + /* Rss_calc_thread to finish */ + vmu_entity_t *vmu_system; /* Entity for tracking */ + /* rss/swap for all processes */ + /* in all zones */ + mod_hash_t *vmu_zones_hash; /* Zones visited */ + mod_hash_t *vmu_projects_col_hash; /* These *_col_hash hashes */ + mod_hash_t *vmu_rusers_col_hash; /* keep track of entities, */ + mod_hash_t *vmu_eusers_col_hash; /* ignoring zoneid, in order */ + /* to implement VMUSAGE_COL_* */ + /* flags, which aggregate by */ + /* project or user regardless */ + /* of zoneid. 
*/ + mod_hash_t *vmu_all_vnodes_hash; /* System wide visited vnodes */ + /* to track incore/not-incore */ + mod_hash_t *vmu_all_amps_hash; /* System wide visited shared */ + /* amps to track incore/not- */ + /* incore */ + vmu_entity_t *vmu_entities; /* Linked list of entities */ + size_t vmu_nentities; /* Count of entities in list */ + vmu_cache_t *vmu_cache; /* Cached results */ + kthread_t *vmu_calc_thread; /* NULL, or thread running */ + /* vmu_calculate() */ + uint_t vmu_calc_flags; /* Flags being using by */ + /* currently running calc */ + /* thread */ + uint_t vmu_pending_flags; /* Flags of vm_getusage() */ + /* threads waiting for */ + /* calc thread to finish */ + uint_t vmu_pending_waiters; /* Number of threads waiting */ + /* for calc thread */ + vmu_bound_t *vmu_free_bounds; + vmu_object_t *vmu_free_objects; + vmu_entity_t *vmu_free_entities; + vmu_zone_t *vmu_free_zones; +} vmu_data_t; + +extern struct as kas; +extern proc_t *practive; +extern zone_t *global_zone; +extern struct seg_ops segvn_ops; +extern struct seg_ops segspt_shmops; + +static vmu_data_t vmu_data; +static kmem_cache_t *vmu_bound_cache; +static kmem_cache_t *vmu_object_cache; + +/* + * Save a bound on the free list + */ +static void +vmu_free_bound(vmu_bound_t *bound) +{ + bound->vmb_next = vmu_data.vmu_free_bounds; + vmu_data.vmu_free_bounds = bound; +} + +/* + * Free an object, and all visited bound info. + */ +static void +vmu_free_object(mod_hash_val_t val) +{ + vmu_object_t *obj = (vmu_object_t *)val; + vmu_bound_t *bound = obj->vmo_bounds; + vmu_bound_t *tmp; + + while (bound != NULL) { + tmp = bound; + bound = bound->vmb_next; + vmu_free_bound(tmp); + } + obj->vmo_next = vmu_data.vmu_free_objects; + vmu_data.vmu_free_objects = obj; +} + +/* + * Free an entity, and hashes of visited objects for that entity. + */ +static void +vmu_free_entity(mod_hash_val_t val) +{ + vmu_entity_t *entity = (vmu_entity_t *)val; + + if (entity->vme_vnode_hash != NULL) + i_mod_hash_clear_nosync(entity->vme_vnode_hash); + if (entity->vme_amp_hash != NULL) + i_mod_hash_clear_nosync(entity->vme_amp_hash); + if (entity->vme_anon_hash != NULL) + i_mod_hash_clear_nosync(entity->vme_anon_hash); + + entity->vme_next = vmu_data.vmu_free_entities; + vmu_data.vmu_free_entities = entity; +} + +/* + * Free zone entity, and all hashes of entities inside that zone, + * which are projects, tasks, and users. + */ +static void +vmu_free_zone(mod_hash_val_t val) +{ + vmu_zone_t *zone = (vmu_zone_t *)val; + + if (zone->vmz_zone != NULL) { + vmu_free_entity((mod_hash_val_t)zone->vmz_zone); + zone->vmz_zone = NULL; + } + if (zone->vmz_projects_hash != NULL) + i_mod_hash_clear_nosync(zone->vmz_projects_hash); + if (zone->vmz_tasks_hash != NULL) + i_mod_hash_clear_nosync(zone->vmz_tasks_hash); + if (zone->vmz_rusers_hash != NULL) + i_mod_hash_clear_nosync(zone->vmz_rusers_hash); + if (zone->vmz_eusers_hash != NULL) + i_mod_hash_clear_nosync(zone->vmz_eusers_hash); + zone->vmz_next = vmu_data.vmu_free_zones; + vmu_data.vmu_free_zones = zone; +} + +/* + * Initialize synchronization primitives and hashes for system-wide tracking + * of visited vnodes and shared amps. Initialize results cache. 
+ */ +void +vm_usage_init() +{ + mutex_init(&vmu_data.vmu_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&vmu_data.vmu_cv, NULL, CV_DEFAULT, NULL); + + vmu_data.vmu_system = NULL; + vmu_data.vmu_zones_hash = NULL; + vmu_data.vmu_projects_col_hash = NULL; + vmu_data.vmu_rusers_col_hash = NULL; + vmu_data.vmu_eusers_col_hash = NULL; + + vmu_data.vmu_free_bounds = NULL; + vmu_data.vmu_free_objects = NULL; + vmu_data.vmu_free_entities = NULL; + vmu_data.vmu_free_zones = NULL; + + vmu_data.vmu_all_vnodes_hash = mod_hash_create_ptrhash( + "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object, + sizeof (vnode_t)); + vmu_data.vmu_all_amps_hash = mod_hash_create_ptrhash( + "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object, + sizeof (struct anon_map)); + vmu_data.vmu_projects_col_hash = mod_hash_create_idhash( + "vmusage collapsed project hash", VMUSAGE_HASH_SIZE, + vmu_free_entity); + vmu_data.vmu_rusers_col_hash = mod_hash_create_idhash( + "vmusage collapsed ruser hash", VMUSAGE_HASH_SIZE, + vmu_free_entity); + vmu_data.vmu_eusers_col_hash = mod_hash_create_idhash( + "vmusage collpased euser hash", VMUSAGE_HASH_SIZE, + vmu_free_entity); + vmu_data.vmu_zones_hash = mod_hash_create_idhash( + "vmusage zone hash", VMUSAGE_HASH_SIZE, vmu_free_zone); + + vmu_bound_cache = kmem_cache_create("vmu_bound_cache", + sizeof (vmu_bound_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + vmu_object_cache = kmem_cache_create("vmu_object_cache", + sizeof (vmu_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + + vmu_data.vmu_entities = NULL; + vmu_data.vmu_nentities = 0; + + vmu_data.vmu_cache = NULL; + vmu_data.vmu_calc_thread = NULL; + vmu_data.vmu_calc_flags = 0; + vmu_data.vmu_pending_flags = 0; + vmu_data.vmu_pending_waiters = 0; +} + +/* + * Allocate hashes for tracking vm objects visited for an entity. + * Update list of entities. + */ +static vmu_entity_t * +vmu_alloc_entity(id_t id, int type, id_t zoneid) +{ + vmu_entity_t *entity; + + if (vmu_data.vmu_free_entities != NULL) { + entity = vmu_data.vmu_free_entities; + vmu_data.vmu_free_entities = + vmu_data.vmu_free_entities->vme_next; + bzero(&entity->vme_result, sizeof (vmusage_t)); + } else { + entity = kmem_zalloc(sizeof (vmu_entity_t), KM_SLEEP); + } + entity->vme_result.vmu_id = id; + entity->vme_result.vmu_zoneid = zoneid; + entity->vme_result.vmu_type = type; + + if (entity->vme_vnode_hash == NULL) + entity->vme_vnode_hash = mod_hash_create_ptrhash( + "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object, + sizeof (vnode_t)); + + if (entity->vme_amp_hash == NULL) + entity->vme_amp_hash = mod_hash_create_ptrhash( + "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object, + sizeof (struct anon_map)); + + if (entity->vme_anon_hash == NULL) + entity->vme_anon_hash = mod_hash_create_ptrhash( + "vmusage anon hash", VMUSAGE_HASH_SIZE, + mod_hash_null_valdtor, sizeof (struct anon)); + + entity->vme_next = vmu_data.vmu_entities; + vmu_data.vmu_entities = entity; + vmu_data.vmu_nentities++; + + return (entity); +} + +/* + * Allocate a zone entity, and hashes for tracking visited vm objects + * for projects, tasks, and users within that zone. 
+ */ +static vmu_zone_t * +vmu_alloc_zone(id_t id) +{ + vmu_zone_t *zone; + + if (vmu_data.vmu_free_zones != NULL) { + zone = vmu_data.vmu_free_zones; + vmu_data.vmu_free_zones = + vmu_data.vmu_free_zones->vmz_next; + zone->vmz_next = NULL; + zone->vmz_zone = NULL; + } else { + zone = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP); + } + + zone->vmz_id = id; + + if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0) + zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id); + + if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS | + VMUSAGE_ALL_PROJECTS)) != 0 && zone->vmz_projects_hash == NULL) + zone->vmz_projects_hash = mod_hash_create_idhash( + "vmusage project hash", VMUSAGE_HASH_SIZE, vmu_free_entity); + + if ((vmu_data.vmu_calc_flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) + != 0 && zone->vmz_tasks_hash == NULL) + zone->vmz_tasks_hash = mod_hash_create_idhash( + "vmusage task hash", VMUSAGE_HASH_SIZE, vmu_free_entity); + + if ((vmu_data.vmu_calc_flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) + != 0 && zone->vmz_rusers_hash == NULL) + zone->vmz_rusers_hash = mod_hash_create_idhash( + "vmusage ruser hash", VMUSAGE_HASH_SIZE, vmu_free_entity); + + if ((vmu_data.vmu_calc_flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) + != 0 && zone->vmz_eusers_hash == NULL) + zone->vmz_eusers_hash = mod_hash_create_idhash( + "vmusage euser hash", VMUSAGE_HASH_SIZE, vmu_free_entity); + + return (zone); +} + +/* + * Allocate a structure for tracking visited bounds for a vm object. + */ +static vmu_object_t * +vmu_alloc_object(caddr_t key, int type) +{ + vmu_object_t *object; + + if (vmu_data.vmu_free_objects != NULL) { + object = vmu_data.vmu_free_objects; + vmu_data.vmu_free_objects = + vmu_data.vmu_free_objects->vmo_next; + } else { + object = kmem_cache_alloc(vmu_object_cache, KM_SLEEP); + } + + object->vmo_key = key; + object->vmo_type = type; + object->vmo_bounds = NULL; + + return (object); +} + +/* + * Allocate and return a bound structure. + */ +static vmu_bound_t * +vmu_alloc_bound() +{ + vmu_bound_t *bound; + + if (vmu_data.vmu_free_bounds != NULL) { + bound = vmu_data.vmu_free_bounds; + vmu_data.vmu_free_bounds = + vmu_data.vmu_free_bounds->vmb_next; + bzero(bound, sizeof (vmu_bound_t)); + } else { + bound = kmem_cache_alloc(vmu_bound_cache, KM_SLEEP); + bzero(bound, sizeof (vmu_bound_t)); + } + return (bound); +} + +/* + * vmu_find_insert_* functions implement hash lookup or allocate and + * insert operations. 
+ */ +static vmu_object_t * +vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type) +{ + int ret; + vmu_object_t *object; + + ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key, + (mod_hash_val_t *)&object); + if (ret != 0) { + object = vmu_alloc_object(key, type); + ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key, + (mod_hash_val_t)object, (mod_hash_hndl_t)0); + ASSERT(ret == 0); + } + return (object); +} + +static int +vmu_find_insert_anon(mod_hash_t *hash, caddr_t key) +{ + int ret; + caddr_t val; + + ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key, + (mod_hash_val_t *)&val); + + if (ret == 0) + return (0); + + ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key, + (mod_hash_val_t)key, (mod_hash_hndl_t)0); + + ASSERT(ret == 0); + + return (1); +} + +static vmu_entity_t * +vmu_find_insert_entity(mod_hash_t *hash, id_t id, uint_t type, id_t zoneid) +{ + int ret; + vmu_entity_t *entity; + + ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)(uintptr_t)id, + (mod_hash_val_t *)&entity); + if (ret != 0) { + entity = vmu_alloc_entity(id, type, zoneid); + ret = i_mod_hash_insert_nosync(hash, + (mod_hash_key_t)(uintptr_t)id, (mod_hash_val_t)entity, + (mod_hash_hndl_t)0); + ASSERT(ret == 0); + } + return (entity); +} + + + + +/* + * Returns list of object bounds between start and end. New bounds inserted + * by this call are given type. + * + * Returns the number of pages covered if new bounds are created. Returns 0 + * if region between start/end consists of all existing bounds. + */ +static pgcnt_t +vmu_insert_lookup_object_bounds(vmu_object_t *ro, pgcnt_t start, pgcnt_t + end, char type, vmu_bound_t **first, vmu_bound_t **last) +{ + vmu_bound_t *next; + vmu_bound_t *prev = NULL; + vmu_bound_t *tmp = NULL; + pgcnt_t ret = 0; + + *first = *last = NULL; + + for (next = ro->vmo_bounds; next != NULL; next = next->vmb_next) { + /* + * Find bounds overlapping or overlapped by range [start,end]. + */ + if (start > next->vmb_end) { + /* bound is before new bound */ + prev = next; + continue; + } + if (next->vmb_start > end) { + /* bound is after new bound */ + break; + } + if (*first == NULL) + *first = next; + *last = next; + } + + if (*first == NULL) { + ASSERT(*last == NULL); + /* + * No bounds overlapping range [start,end], so create new + * bound + */ + tmp = vmu_alloc_bound(); + tmp->vmb_start = start; + tmp->vmb_end = end; + tmp->vmb_type = type; + if (prev == NULL) { + tmp->vmb_next = ro->vmo_bounds; + ro->vmo_bounds = tmp; + } else { + tmp->vmb_next = prev->vmb_next; + prev->vmb_next = tmp; + } + *first = tmp; + *last = tmp; + ASSERT(tmp->vmb_end >= tmp->vmb_start); + ret = tmp->vmb_end - tmp->vmb_start + 1; + return (ret); + } + + /* Check to see if start is before first known bound */ + ASSERT(first != NULL && last != NULL); + next = (*first); + if (start < (*first)->vmb_start) { + /* Create new bound before first bound */ + tmp = vmu_alloc_bound(); + tmp->vmb_start = start; + tmp->vmb_end = (*first)->vmb_start - 1; + tmp->vmb_type = type; + tmp->vmb_next = *first; + if (*first == ro->vmo_bounds) + ro->vmo_bounds = tmp; + if (prev != NULL) + prev->vmb_next = tmp; + ASSERT(tmp->vmb_end >= tmp->vmb_start); + ret += tmp->vmb_end - tmp->vmb_start + 1; + *first = tmp; + } + /* + * Between start and end, search for gaps between and after existing + * bounds. Create new bounds to fill gaps if they exist. + */ + while (end > next->vmb_end) { + /* + * Check for gap between bound and next bound. if no gap, + * continue. 
+ */ + if ((next != *last) && + ((next->vmb_end + 1) == next->vmb_next->vmb_start)) { + next = next->vmb_next; + continue; + } + /* + * Insert new bound in gap after bound, and before next + * bound if next bound exists. + */ + tmp = vmu_alloc_bound(); + tmp->vmb_type = type; + tmp->vmb_next = next->vmb_next; + tmp->vmb_start = next->vmb_end + 1; + + if (next != *last) { + tmp->vmb_end = next->vmb_next->vmb_start - 1; + ASSERT(tmp->vmb_end >= tmp->vmb_start); + ret += tmp->vmb_end - tmp->vmb_start + 1; + next->vmb_next = tmp; + next = tmp->vmb_next; + } else { + tmp->vmb_end = end; + ASSERT(tmp->vmb_end >= tmp->vmb_start); + ret += tmp->vmb_end - tmp->vmb_start + 1; + next->vmb_next = tmp; + *last = tmp; + break; + } + } + return (ret); +} + +/* + * vmu_update_bounds() + * + * first, last: list of continuous bounds, of which zero or more are of + * type VMUSAGE_BOUND_UNKNOWN. + * + * new_first, new_last: list of continuous bounds, of which none are of + * type VMUSAGE_BOUND_UNKNOWN. These bounds are used to + * update the types of bounds in (first,last) with + * type VMUSAGE_BOUND_UNKNOWN. + * + * For the list of bounds (first,last), this function updates any bounds + * with type VMUSAGE_BOUND_UNKNOWN using the type of the corresponding bound in + * the list (new_first, new_last). + * + * If a bound of type VMUSAGE_BOUND_UNKNOWN spans multiple bounds in the list + * (new_first, new_last), it will be split into multiple bounds. + * + * Return value: + * The number of pages in the list of bounds (first,last) that were of + * type VMUSAGE_BOUND_UNKNOWN, which have been updated to be of type + * VMUSAGE_BOUND_INCORE. + * + */ +static pgcnt_t +vmu_update_bounds(vmu_bound_t **first, vmu_bound_t **last, + vmu_bound_t *new_first, vmu_bound_t *new_last) +{ + vmu_bound_t *next, *new_next, *tmp; + pgcnt_t rss = 0; + + next = *first; + new_next = new_first; + + /* verify bounds span same pages */ + ASSERT((*first)->vmb_start >= new_next->vmb_start); + ASSERT((*last)->vmb_end <= new_last->vmb_end); + for (;;) { + /* If bound already has type, proceed to next bound */ + if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) { + if (next == *last) + break; + next = next->vmb_next; + continue; + } + while (new_next->vmb_end < next->vmb_start) + new_next = new_next->vmb_next; + ASSERT(new_next->vmb_type != VMUSAGE_BOUND_UNKNOWN); + next->vmb_type = new_next->vmb_type; + if (new_next->vmb_end < next->vmb_end) { + /* need to split bound */ + tmp = vmu_alloc_bound(); + tmp->vmb_type = VMUSAGE_BOUND_UNKNOWN; + tmp->vmb_start = new_next->vmb_end + 1; + tmp->vmb_end = next->vmb_end; + tmp->vmb_next = next->vmb_next; + next->vmb_end = new_next->vmb_end; + next->vmb_next = tmp; + if (*last == next) + *last = tmp; + if (next->vmb_type == VMUSAGE_BOUND_INCORE) + rss += next->vmb_end - next->vmb_start + 1; + next = tmp; + } else { + if (next->vmb_type == VMUSAGE_BOUND_INCORE) + rss += next->vmb_end - next->vmb_start + 1; + if (next == *last) + break; + next = next->vmb_next; + } + } + return (rss); +} + +/* + * merges adjacent bounds with same type between first and last bound. + * After merge, last pointer is no longer valid, as last bound may be + * merged away. 
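 + * For example, two adjacent bounds [0,9] and [10,19] that are both + * VMUSAGE_BOUND_INCORE are collapsed into a single bound [0,19].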
+ */ +static void +vmu_merge_bounds(vmu_bound_t **first, vmu_bound_t **last) +{ + vmu_bound_t *next; + vmu_bound_t *tmp; + + ASSERT(*first != NULL); + ASSERT(*last != NULL); + + next = *first; + while (next != *last) { + + /* If bounds are adjacent and have same type, merge them */ + if (((next->vmb_end + 1) == next->vmb_next->vmb_start) && + (next->vmb_type == next->vmb_next->vmb_type)) { + tmp = next->vmb_next; + next->vmb_end = tmp->vmb_end; + next->vmb_next = tmp->vmb_next; + vmu_free_bound(tmp); + if (tmp == *last) + *last = next; + } else { + next = next->vmb_next; + } + } +} + +/* + * Given an amp and a list of bounds, updates each bound's type with + * VMUSAGE_BOUND_INCORE or VMUSAGE_BOUND_NOT_INCORE. + * + * If a bound is partially incore, it will be split into two bounds. + * first and last may be modified, as bounds may be split into multiple + * bounds if the are partially incore/not-incore. + * + * Set incore to non-zero if bounds are already known to be incore + * + */ +static void +vmu_amp_update_incore_bounds(struct anon_map *amp, vmu_bound_t **first, + vmu_bound_t **last, boolean_t incore) +{ + vmu_bound_t *next; + vmu_bound_t *tmp; + pgcnt_t index; + short bound_type; + short page_type; + vnode_t *vn; + anoff_t off; + struct anon *ap; + + next = *first; + /* Shared anon slots don't change once set */ + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + for (;;) { + if (incore == B_TRUE) + next->vmb_type = VMUSAGE_BOUND_INCORE; + + if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) { + if (next == *last) + break; + next = next->vmb_next; + continue; + } + bound_type = next->vmb_type; + index = next->vmb_start; + while (index <= next->vmb_end) { + + /* + * These are used to determine how much to increment + * index when a large page is found. + */ + page_t *page; + pgcnt_t pgcnt = 1; + uint_t pgshft; + pgcnt_t pgmsk; + + ap = anon_get_ptr(amp->ahp, index); + if (ap != NULL) + swap_xlate(ap, &vn, &off); + + if (ap != NULL && vn != NULL && vn->v_pages != NULL && + (page = page_exists(vn, off)) != NULL) { + page_type = VMUSAGE_BOUND_INCORE; + if (page->p_szc > 0) { + pgcnt = page_get_pagecnt(page->p_szc); + pgshft = page_get_shift(page->p_szc); + pgmsk = (0x1 << (pgshft - PAGESHIFT)) + - 1; + } + } else { + page_type = VMUSAGE_BOUND_NOT_INCORE; + } + if (bound_type == VMUSAGE_BOUND_UNKNOWN) { + next->vmb_type = page_type; + } else if (next->vmb_type != page_type) { + /* + * if current bound type does not match page + * type, need to split off new bound. + */ + tmp = vmu_alloc_bound(); + tmp->vmb_type = page_type; + tmp->vmb_start = index; + tmp->vmb_end = next->vmb_end; + tmp->vmb_next = next->vmb_next; + next->vmb_end = index - 1; + next->vmb_next = tmp; + if (*last == next) + *last = tmp; + next = tmp; + } + if (pgcnt > 1) { + /* + * If inside large page, jump to next large + * page + */ + index = (index & ~pgmsk) + pgcnt; + } else { + index++; + } + } + if (next == *last) { + ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN); + break; + } else + next = next->vmb_next; + } + ANON_LOCK_EXIT(&->a_rwlock); +} + +/* + * Same as vmu_amp_update_incore_bounds(), except for tracking + * incore-/not-incore for vnodes. 
+ */ +static void +vmu_vnode_update_incore_bounds(vnode_t *vnode, vmu_bound_t **first, + vmu_bound_t **last) +{ + vmu_bound_t *next; + vmu_bound_t *tmp; + pgcnt_t index; + short bound_type; + short page_type; + + next = *first; + for (;;) { + if (vnode->v_pages == NULL) + next->vmb_type = VMUSAGE_BOUND_NOT_INCORE; + + if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) { + if (next == *last) + break; + next = next->vmb_next; + continue; + } + + bound_type = next->vmb_type; + index = next->vmb_start; + while (index <= next->vmb_end) { + + /* + * These are used to determine how much to increment + * index when a large page is found. + */ + page_t *page; + pgcnt_t pgcnt = 1; + uint_t pgshft; + pgcnt_t pgmsk; + + if (vnode->v_pages != NULL && + (page = page_exists(vnode, ptob(index))) != NULL) { + page_type = VMUSAGE_BOUND_INCORE; + if (page->p_szc > 0) { + pgcnt = page_get_pagecnt(page->p_szc); + pgshft = page_get_shift(page->p_szc); + pgmsk = (0x1 << (pgshft - PAGESHIFT)) + - 1; + } + } else { + page_type = VMUSAGE_BOUND_NOT_INCORE; + } + if (bound_type == VMUSAGE_BOUND_UNKNOWN) { + next->vmb_type = page_type; + } else if (next->vmb_type != page_type) { + /* + * if current bound type does not match page + * type, need to split off new bound. + */ + tmp = vmu_alloc_bound(); + tmp->vmb_type = page_type; + tmp->vmb_start = index; + tmp->vmb_end = next->vmb_end; + tmp->vmb_next = next->vmb_next; + next->vmb_end = index - 1; + next->vmb_next = tmp; + if (*last == next) + *last = tmp; + next = tmp; + } + if (pgcnt > 1) { + /* + * If inside large page, jump to next large + * page + */ + index = (index & ~pgmsk) + pgcnt; + } else { + index++; + } + } + if (next == *last) { + ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN); + break; + } else + next = next->vmb_next; + } +} + +/* + * Calculate the rss and swap consumed by a segment. vmu_entities is the + * list of entities to visit. For shared segments, the vnode or amp + * is looked up in each entity to see if has been already counted. Private + * anon pages are checked per entity to ensure that cow pages are not + * double counted. + * + * For private mapped files, first the amp is checked for private pages. + * Bounds not backed by the amp are looked up in the vnode for each entity + * to avoid double counting of private COW vnode pages. + */ +static void +vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg) +{ + struct segvn_data *svd; + struct shm_data *shmd; + struct spt_data *sptd; + vmu_object_t *shared_object = NULL; + vmu_object_t *entity_object = NULL; + vmu_entity_t *entity; + vmusage_t *result; + vmu_bound_t *first = NULL; + vmu_bound_t *last = NULL; + vmu_bound_t *cur = NULL; + vmu_bound_t *e_first = NULL; + vmu_bound_t *e_last = NULL; + vmu_bound_t *tmp; + pgcnt_t p_index, s_index, p_start, p_end, s_start, s_end, rss, virt; + struct anon_map *private_amp = NULL; + boolean_t incore = B_FALSE; + boolean_t shared = B_FALSE; + int file = 0; + pgcnt_t swresv = 0; + pgcnt_t panon = 0; + + /* Can zero-length segments exist? Not sure, so parenoia */ + if (seg->s_size <= 0) + return; + + /* + * Figure out if there is a shared object (such as a named vnode or + * a shared amp, then figure out if there is a private amp, which + * identifies private pages. 
+ */ + if (seg->s_ops == &segvn_ops) { + svd = (struct segvn_data *)seg->s_data; + if (svd->type == MAP_SHARED) + shared = B_TRUE; + else + swresv = svd->swresv; + + if (svd->vp != NULL) { + file = 1; + shared_object = vmu_find_insert_object( + vmu_data.vmu_all_vnodes_hash, (caddr_t)svd->vp, + VMUSAGE_TYPE_VNODE); + s_start = btop(svd->offset); + s_end = btop(svd->offset + seg->s_size) - 1; + } + if (svd->amp != NULL && svd->type == MAP_SHARED) { + ASSERT(shared_object == NULL); + shared_object = vmu_find_insert_object( + vmu_data.vmu_all_amps_hash, (caddr_t)svd->amp, + VMUSAGE_TYPE_AMP); + s_start = svd->anon_index; + s_end = svd->anon_index + btop(seg->s_size) - 1; + /* schedctl mappings are always in core */ + if (svd->amp->swresv == 0) + incore = B_TRUE; + } + if (svd->amp != NULL && svd->type == MAP_PRIVATE) { + private_amp = svd->amp; + p_start = svd->anon_index; + p_end = svd->anon_index + btop(seg->s_size) - 1; + } + } else if (seg->s_ops == &segspt_shmops) { + shared = B_TRUE; + shmd = (struct shm_data *)seg->s_data; + shared_object = vmu_find_insert_object( + vmu_data.vmu_all_amps_hash, (caddr_t)shmd->shm_amp, + VMUSAGE_TYPE_AMP); + s_start = 0; + s_end = btop(seg->s_size) - 1; + sptd = shmd->shm_sptseg->s_data; + + /* ism segments are always incore and do not reserve swap */ + if (sptd->spt_flags & SHM_SHARE_MMU) + incore = B_TRUE; + + } else { + return; + } + + /* + * If there is a private amp, count anon pages that exist. If an + * anon has a refcnt > 1 (cow sharing), then save the anon in a + * hash so that it is not double counted. + * + * If there is also a shared object, they figure out the bounds + * which are not mapped by the private amp. + */ + if (private_amp != NULL) { + + /* Enter as writer to prevent cow anons from being freed */ + ANON_LOCK_ENTER(&private_amp->a_rwlock, RW_WRITER); + + p_index = p_start; + s_index = s_start; + + while (p_index <= p_end) { + + pgcnt_t p_index_next; + pgcnt_t p_bound_size; + int cnt; + anoff_t off; + struct vnode *vn; + struct anon *ap; + page_t *page; /* For handling of large */ + pgcnt_t pgcnt = 1; /* pages */ + pgcnt_t pgstart; + pgcnt_t pgend; + uint_t pgshft; + pgcnt_t pgmsk; + + p_index_next = p_index; + ap = anon_get_next_ptr(private_amp->ahp, + &p_index_next); + + /* + * If next anon is past end of mapping, simulate + * end of anon so loop terminates. + */ + if (p_index_next > p_end) { + p_index_next = p_end + 1; + ap = NULL; + } + /* + * For cow segments, keep track of bounds not + * backed by private amp so they can be looked + * up in the backing vnode + */ + if (p_index_next != p_index) { + + /* + * Compute index difference between anon and + * previous anon. + */ + p_bound_size = p_index_next - p_index - 1; + + if (shared_object != NULL) { + cur = vmu_alloc_bound(); + cur->vmb_next = NULL; + cur->vmb_start = s_index; + cur->vmb_end = s_index + p_bound_size; + cur->vmb_type = VMUSAGE_BOUND_UNKNOWN; + if (first == NULL) { + first = cur; + last = cur; + } else { + last->vmb_next = cur; + last = cur; + } + } + p_index = p_index + p_bound_size + 1; + s_index = s_index + p_bound_size + 1; + } + + /* Detect end of anons in amp */ + if (ap == NULL) + break; + + cnt = ap->an_refcnt; + swap_xlate(ap, &vn, &off); + + if (vn == NULL || vn->v_pages == NULL || + (page = page_exists(vn, off)) == NULL) { + p_index++; + s_index++; + continue; + } + + /* + * If large page is found, compute portion of large + * page in mapping, and increment indicies to the next + * large page. 
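 + * For example, with 4K base pages and a 2M large page, pgcnt is 512 + * and pgmsk is 0x1ff; if p_index is 1000, then pgstart is 512, pgend is + * 1023 (clipped to p_end if the mapping ends sooner), 24 pages + * (1000-1023) are counted, and p_index advances to 1024.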
+ */ + if (page->p_szc > 0) { + + pgcnt = page_get_pagecnt(page->p_szc); + pgshft = page_get_shift(page->p_szc); + pgmsk = (0x1 << (pgshft - PAGESHIFT)) - 1; + + /* First page in large page */ + pgstart = p_index & ~pgmsk; + /* Last page in large page */ + pgend = pgstart + pgcnt - 1; + /* + * Artifically end page if page extends past + * end of mapping. + */ + if (pgend > p_end) + pgend = p_end; + + /* + * Compute number of pages from large page + * which are mapped. + */ + pgcnt = pgend - p_index + 1; + + /* + * Point indicies at page after large page, + * or at page after end of mapping. + */ + p_index += pgcnt; + s_index += pgcnt; + } else { + p_index++; + s_index++; + } + + /* + * Assume anon structs with a refcnt + * of 1 are not cow shared, so there + * is no reason to track them per entity. + */ + if (cnt == 1) { + panon += pgcnt; + continue; + } + for (entity = vmu_entities; entity != NULL; + entity = entity->vme_next_calc) { + + result = &entity->vme_result; + /* + * Track cow anons per entity so + * they are not double counted. + */ + if (vmu_find_insert_anon(entity->vme_anon_hash, + (caddr_t)ap) == 0) + continue; + + result->vmu_rss_all += (pgcnt << PAGESHIFT); + result->vmu_rss_private += + (pgcnt << PAGESHIFT); + } + } + ANON_LOCK_EXIT(&private_amp->a_rwlock); + } + + /* Add up resident anon and swap reserved for private mappings */ + if (swresv > 0 || panon > 0) { + for (entity = vmu_entities; entity != NULL; + entity = entity->vme_next_calc) { + result = &entity->vme_result; + result->vmu_swap_all += swresv; + result->vmu_swap_private += swresv; + result->vmu_rss_all += (panon << PAGESHIFT); + result->vmu_rss_private += (panon << PAGESHIFT); + } + } + + /* Compute resident pages backing shared amp or named vnode */ + if (shared_object != NULL) { + if (first == NULL) { + /* + * No private amp, or private amp has no anon + * structs. This means entire segment is backed by + * the shared object. + */ + first = vmu_alloc_bound(); + first->vmb_next = NULL; + first->vmb_start = s_start; + first->vmb_end = s_end; + first->vmb_type = VMUSAGE_BOUND_UNKNOWN; + } + /* + * Iterate bounds not backed by private amp, and compute + * resident pages. + */ + cur = first; + while (cur != NULL) { + + if (vmu_insert_lookup_object_bounds(shared_object, + cur->vmb_start, cur->vmb_end, VMUSAGE_BOUND_UNKNOWN, + &first, &last) > 0) { + /* new bounds, find incore/not-incore */ + if (shared_object->vmo_type == + VMUSAGE_TYPE_VNODE) + vmu_vnode_update_incore_bounds( + (vnode_t *) + shared_object->vmo_key, &first, + &last); + else + vmu_amp_update_incore_bounds( + (struct anon_map *) + shared_object->vmo_key, &first, + &last, incore); + vmu_merge_bounds(&first, &last); + } + for (entity = vmu_entities; entity != NULL; + entity = entity->vme_next_calc) { + + result = &entity->vme_result; + + entity_object = vmu_find_insert_object( + shared_object->vmo_type == + VMUSAGE_TYPE_VNODE ? 
entity->vme_vnode_hash: + entity->vme_amp_hash, + shared_object->vmo_key, + shared_object->vmo_type); + + virt = vmu_insert_lookup_object_bounds( + entity_object, cur->vmb_start, cur->vmb_end, + VMUSAGE_BOUND_UNKNOWN, &e_first, &e_last); + + if (virt == 0) + continue; + /* + * Range visited for this entity + */ + rss = vmu_update_bounds(&e_first, + &e_last, first, last); + result->vmu_rss_all += (rss << PAGESHIFT); + if (shared == B_TRUE && file == B_FALSE) { + /* shared anon mapping */ + result->vmu_swap_all += + (virt << PAGESHIFT); + result->vmu_swap_shared += + (virt << PAGESHIFT); + result->vmu_rss_shared += + (rss << PAGESHIFT); + } else if (shared == B_TRUE && file == B_TRUE) { + /* shared file mapping */ + result->vmu_rss_shared += + (rss << PAGESHIFT); + } else if (shared == B_FALSE && + file == B_TRUE) { + /* private file mapping */ + result->vmu_rss_private += + (rss << PAGESHIFT); + } + vmu_merge_bounds(&e_first, &e_last); + } + tmp = cur; + cur = cur->vmb_next; + vmu_free_bound(tmp); + } + } +} + +/* + * Based on the current calculation flags, find the relevant entities + * which are relative to the process. Then calculate each segment + * in the process'es address space for each relevant entity. + */ +static void +vmu_calculate_proc(proc_t *p) +{ + vmu_entity_t *entities = NULL; + vmu_zone_t *zone; + vmu_entity_t *tmp; + struct as *as; + struct seg *seg; + int ret; + + /* Figure out which entities are being computed */ + if ((vmu_data.vmu_system) != NULL) { + tmp = vmu_data.vmu_system; + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & + (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS | + VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS | + VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS | + VMUSAGE_ALL_EUSERS)) { + ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash, + (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id, + (mod_hash_val_t *)&zone); + if (ret != 0) { + zone = vmu_alloc_zone(p->p_zone->zone_id); + ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash, + (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id, + (mod_hash_val_t)zone, (mod_hash_hndl_t)0); + ASSERT(ret == 0); + } + if (zone->vmz_zone != NULL) { + tmp = zone->vmz_zone; + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & + (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS)) { + tmp = vmu_find_insert_entity(zone->vmz_projects_hash, + p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS, + zone->vmz_id); + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & + (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) { + tmp = vmu_find_insert_entity(zone->vmz_tasks_hash, + p->p_task->tk_tkid, VMUSAGE_TASKS, zone->vmz_id); + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & + (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) { + tmp = vmu_find_insert_entity(zone->vmz_rusers_hash, + crgetruid(p->p_cred), VMUSAGE_RUSERS, zone->vmz_id); + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & + (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) { + tmp = vmu_find_insert_entity(zone->vmz_eusers_hash, + crgetuid(p->p_cred), VMUSAGE_EUSERS, zone->vmz_id); + tmp->vme_next_calc = entities; + entities = tmp; + } + } + /* Entities which collapse projects and users for all zones */ + if (vmu_data.vmu_calc_flags & VMUSAGE_COL_PROJECTS) { + tmp = vmu_find_insert_entity(vmu_data.vmu_projects_col_hash, + p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS, ALL_ZONES); + tmp->vme_next_calc = entities; + entities = 
tmp; + } + if (vmu_data.vmu_calc_flags & VMUSAGE_COL_RUSERS) { + tmp = vmu_find_insert_entity(vmu_data.vmu_rusers_col_hash, + crgetruid(p->p_cred), VMUSAGE_RUSERS, ALL_ZONES); + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & VMUSAGE_COL_EUSERS) { + tmp = vmu_find_insert_entity(vmu_data.vmu_eusers_col_hash, + crgetuid(p->p_cred), VMUSAGE_EUSERS, ALL_ZONES); + tmp->vme_next_calc = entities; + entities = tmp; + } + + ASSERT(entities != NULL); + /* process all segs in process's address space */ + as = p->p_as; + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + for (seg = AS_SEGFIRST(as); seg != NULL; + seg = AS_SEGNEXT(as, seg)) { + vmu_calculate_seg(entities, seg); + } + AS_LOCK_EXIT(as, &as->a_lock); +} + +/* + * Free data created by previous call to vmu_calculate(). + */ +static void +vmu_clear_calc() +{ + if (vmu_data.vmu_system != NULL) + vmu_free_entity(vmu_data.vmu_system); + vmu_data.vmu_system = NULL; + if (vmu_data.vmu_zones_hash != NULL) + i_mod_hash_clear_nosync(vmu_data.vmu_zones_hash); + if (vmu_data.vmu_projects_col_hash != NULL) + i_mod_hash_clear_nosync(vmu_data.vmu_projects_col_hash); + if (vmu_data.vmu_rusers_col_hash != NULL) + i_mod_hash_clear_nosync(vmu_data.vmu_rusers_col_hash); + if (vmu_data.vmu_eusers_col_hash != NULL) + i_mod_hash_clear_nosync(vmu_data.vmu_eusers_col_hash); + + i_mod_hash_clear_nosync(vmu_data.vmu_all_vnodes_hash); + i_mod_hash_clear_nosync(vmu_data.vmu_all_amps_hash); +} + +/* + * Free unused data structures. These can result if the system workload + * decreases between calculations. + */ +static void +vmu_free_extra() +{ + vmu_bound_t *tb; + vmu_object_t *to; + vmu_entity_t *te; + vmu_zone_t *tz; + + while (vmu_data.vmu_free_bounds != NULL) { + tb = vmu_data.vmu_free_bounds; + vmu_data.vmu_free_bounds = vmu_data.vmu_free_bounds->vmb_next; + kmem_cache_free(vmu_bound_cache, tb); + } + while (vmu_data.vmu_free_objects != NULL) { + to = vmu_data.vmu_free_objects; + vmu_data.vmu_free_objects = + vmu_data.vmu_free_objects->vmo_next; + kmem_cache_free(vmu_object_cache, to); + } + while (vmu_data.vmu_free_entities != NULL) { + te = vmu_data.vmu_free_entities; + vmu_data.vmu_free_entities = + vmu_data.vmu_free_entities->vme_next; + if (te->vme_vnode_hash != NULL) + mod_hash_destroy_hash(te->vme_vnode_hash); + if (te->vme_amp_hash != NULL) + mod_hash_destroy_hash(te->vme_amp_hash); + if (te->vme_anon_hash != NULL) + mod_hash_destroy_hash(te->vme_anon_hash); + kmem_free(te, sizeof (vmu_entity_t)); + } + while (vmu_data.vmu_free_zones != NULL) { + tz = vmu_data.vmu_free_zones; + vmu_data.vmu_free_zones = + vmu_data.vmu_free_zones->vmz_next; + if (tz->vmz_projects_hash != NULL) + mod_hash_destroy_hash(tz->vmz_projects_hash); + if (tz->vmz_tasks_hash != NULL) + mod_hash_destroy_hash(tz->vmz_tasks_hash); + if (tz->vmz_rusers_hash != NULL) + mod_hash_destroy_hash(tz->vmz_rusers_hash); + if (tz->vmz_eusers_hash != NULL) + mod_hash_destroy_hash(tz->vmz_eusers_hash); + kmem_free(tz, sizeof (vmu_zone_t)); + } +} + +extern kcondvar_t *pr_pid_cv; + +/* + * Determine which entity types are relevant and allocate the hashes to + * track them. Then walk the process table and count rss and swap + * for each process'es address space. Address space object such as + * vnodes, amps and anons are tracked per entity, so that they are + * not double counted in the results. 
+ * + */ +static void +vmu_calculate() +{ + int i = 0; + int ret; + proc_t *p; + + vmu_clear_calc(); + + if (vmu_data.vmu_calc_flags & VMUSAGE_SYSTEM) + vmu_data.vmu_system = vmu_alloc_entity(0, VMUSAGE_SYSTEM, + ALL_ZONES); + + /* + * Walk process table and calculate rss of each proc. + * + * Pidlock and p_lock cannot be held while doing the rss calculation. + * This is because: + * 1. The calculation allocates using KM_SLEEP. + * 2. The calculation grabs a_lock, which cannot be grabbed + * after p_lock. + * + * Since pidlock must be dropped, we cannot simply just walk the + * practive list. Instead, we walk the process table, and sprlock + * each process to ensure that it does not exit during the + * calculation. + */ + + mutex_enter(&pidlock); + for (i = 0; i < v.v_proc; i++) { +again: + p = pid_entry(i); + if (p == NULL) + continue; + + mutex_enter(&p->p_lock); + mutex_exit(&pidlock); + + if (panicstr) { + mutex_exit(&p->p_lock); + return; + } + + /* Try to set P_PR_LOCK */ + ret = sprtrylock_proc(p); + if (ret == -1) { + /* Process in invalid state */ + mutex_exit(&p->p_lock); + mutex_enter(&pidlock); + continue; + } else if (ret == 1) { + /* + * P_PR_LOCK is already set. Wait and try again. + * This also drops p_lock. + */ + sprwaitlock_proc(p); + mutex_enter(&pidlock); + goto again; + } + mutex_exit(&p->p_lock); + + vmu_calculate_proc(p); + + mutex_enter(&p->p_lock); + sprunlock(p); + mutex_enter(&pidlock); + } + mutex_exit(&pidlock); + + vmu_free_extra(); +} + +/* + * allocate a new cache for N results satisfying flags + */ +vmu_cache_t * +vmu_cache_alloc(size_t nres, uint_t flags) +{ + vmu_cache_t *cache; + + cache = kmem_zalloc(sizeof (vmu_cache_t), KM_SLEEP); + cache->vmc_results = kmem_zalloc(sizeof (vmusage_t) * nres, KM_SLEEP); + cache->vmc_nresults = nres; + cache->vmc_flags = flags; + cache->vmc_refcnt = 1; + return (cache); +} + +/* + * Make sure cached results are not freed + */ +static void +vmu_cache_hold(vmu_cache_t *cache) +{ + ASSERT(MUTEX_HELD(&vmu_data.vmu_lock)); + cache->vmc_refcnt++; +} + +/* + * free cache data + */ +static void +vmu_cache_rele(vmu_cache_t *cache) +{ + ASSERT(MUTEX_HELD(&vmu_data.vmu_lock)); + ASSERT(cache->vmc_refcnt > 0); + cache->vmc_refcnt--; + if (cache->vmc_refcnt == 0) { + kmem_free(cache->vmc_results, sizeof (vmusage_t) * + cache->vmc_nresults); + kmem_free(cache, sizeof (vmu_cache_t)); + } +} + +/* + * Copy out the cached results to a caller. Inspect the callers flags + * and zone to determine which cached results should be copied. + */ +static int +vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres, + uint_t flags) +{ + vmusage_t *result, *out_result; + vmusage_t dummy; + size_t i, count = 0; + size_t bufsize; + int ret = 0; + uint_t types = 0; + + if (nres != NULL) { + if (copyin((caddr_t)nres, &bufsize, sizeof (size_t))) + return (set_errno(EFAULT)); + } else { + bufsize = 0; + } + + /* figure out what results the caller is interested in. 
*/ + if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone) + types |= VMUSAGE_SYSTEM; + if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) + types |= VMUSAGE_ZONE; + if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS | + VMUSAGE_COL_PROJECTS)) + types |= VMUSAGE_PROJECTS; + if (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) + types |= VMUSAGE_TASKS; + if (flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) + types |= VMUSAGE_RUSERS; + if (flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) + types |= VMUSAGE_EUSERS; + + /* count results for current zone */ + out_result = buf; + for (result = cache->vmc_results, i = 0; + i < cache->vmc_nresults; result++, i++) { + + /* Do not return "other-zone" results to non-global zones */ + if (curproc->p_zone != global_zone && + curproc->p_zone->zone_id != result->vmu_zoneid) + continue; + + /* + * If non-global zone requests VMUSAGE_SYSTEM, fake + * up VMUSAGE_ZONE result as VMUSAGE_SYSTEM result. + */ + if (curproc->p_zone != global_zone && + (flags & VMUSAGE_SYSTEM) != 0 && + result->vmu_type == VMUSAGE_ZONE) { + count++; + if (out_result != NULL) { + if (bufsize < count) { + ret = set_errno(EOVERFLOW); + } else { + dummy = *result; + dummy.vmu_zoneid = ALL_ZONES; + dummy.vmu_id = 0; + dummy.vmu_type = VMUSAGE_SYSTEM; + if (copyout(&dummy, out_result, + sizeof (vmusage_t))) + return (set_errno( + EFAULT)); + out_result++; + } + } + } + + /* Skip results that do not match requested type */ + if ((result->vmu_type & types) == 0) + continue; + + /* Skip collated results if not requested */ + if (result->vmu_zoneid == ALL_ZONES) { + if (result->vmu_type == VMUSAGE_PROJECTS && + (flags & VMUSAGE_COL_PROJECTS) == 0) + continue; + if (result->vmu_type == VMUSAGE_EUSERS && + (flags & VMUSAGE_COL_EUSERS) == 0) + continue; + if (result->vmu_type == VMUSAGE_RUSERS && + (flags & VMUSAGE_COL_RUSERS) == 0) + continue; + } + + /* Skip "other zone" results if not requested */ + if (result->vmu_zoneid != curproc->p_zone->zone_id) { + if (result->vmu_type == VMUSAGE_ZONE && + (flags & VMUSAGE_ALL_ZONES) == 0) + continue; + if (result->vmu_type == VMUSAGE_PROJECTS && + (flags & (VMUSAGE_ALL_PROJECTS | + VMUSAGE_COL_PROJECTS)) == 0) + continue; + if (result->vmu_type == VMUSAGE_TASKS && + (flags & VMUSAGE_ALL_TASKS) == 0) + continue; + if (result->vmu_type == VMUSAGE_RUSERS && + (flags & (VMUSAGE_ALL_RUSERS | + VMUSAGE_COL_RUSERS)) == 0) + continue; + if (result->vmu_type == VMUSAGE_EUSERS && + (flags & (VMUSAGE_ALL_EUSERS | + VMUSAGE_COL_EUSERS)) == 0) + continue; + } + count++; + if (out_result != NULL) { + if (bufsize < count) { + ret = set_errno(EOVERFLOW); + } else { + if (copyout(result, out_result, + sizeof (vmusage_t))) + return (set_errno(EFAULT)); + out_result++; + } + } + } + if (nres != NULL) + if (copyout(&count, (void *)nres, sizeof (size_t))) + return (set_errno(EFAULT)); + + return (ret); +} + +/* + * vm_getusage() + * + * Counts rss and swap by zone, project, task, and/or user. The flags argument + * determines the type of results structures returned. Flags requesting + * results from more than one zone are "flattened" to the local zone if the + * caller is not the global zone. + * + * args: + * flags: bitmap consisting of one or more of VMUSAGE_*. + * age: maximum allowable age (time since counting was done) in + * seconds of the results. Results from previous callers are + * cached in kernel. + * buf: pointer to buffer array of vmusage_t. If NULL, then only nres + * set on success. 
+ * nres: Set to number of vmusage_t structures pointed to by buf + * before calling vm_getusage(). + * On return 0 (success) or ENOSPC, is set to the number of result + * structures returned or attempted to return. + * + * returns 0 on success, -1 on failure: + * EINTR (interrupted) + * ENOSPC (nres to small for results, nres set to needed value for success) + * EINVAL (flags invalid) + * EFAULT (bad address for buf or nres) + */ +int +vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres) +{ + vmu_entity_t *entity; + vmusage_t *result; + int ret = 0; + int cacherecent = 0; + hrtime_t now; + uint_t flags_orig; + + /* + * Non-global zones cannot request system wide and/or collated + * results, or the system result, so munge the flags accordingly. + */ + flags_orig = flags; + if (curproc->p_zone != global_zone) { + if (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) { + flags &= ~(VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS); + flags |= VMUSAGE_PROJECTS; + } + if (flags & (VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) { + flags &= ~(VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS); + flags |= VMUSAGE_RUSERS; + } + if (flags & (VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) { + flags &= ~(VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS); + flags |= VMUSAGE_EUSERS; + } + if (flags & VMUSAGE_SYSTEM) { + flags &= ~VMUSAGE_SYSTEM; + flags |= VMUSAGE_ZONE; + } + } + + /* Check for unknown flags */ + if ((flags & (~VMUSAGE_MASK)) != 0) + return (set_errno(EINVAL)); + + /* Check for no flags */ + if ((flags & VMUSAGE_MASK) == 0) + return (set_errno(EINVAL)); + + mutex_enter(&vmu_data.vmu_lock); + now = gethrtime(); + +start: + if (vmu_data.vmu_cache != NULL) { + + vmu_cache_t *cache; + + if ((vmu_data.vmu_cache->vmc_timestamp + + ((hrtime_t)age * NANOSEC)) > now) + cacherecent = 1; + + if ((vmu_data.vmu_cache->vmc_flags & flags) == flags && + cacherecent == 1) { + cache = vmu_data.vmu_cache; + vmu_cache_hold(cache); + mutex_exit(&vmu_data.vmu_lock); + + ret = vmu_copyout_results(cache, buf, nres, flags_orig); + mutex_enter(&vmu_data.vmu_lock); + vmu_cache_rele(cache); + if (vmu_data.vmu_pending_waiters > 0) + cv_broadcast(&vmu_data.vmu_cv); + mutex_exit(&vmu_data.vmu_lock); + return (ret); + } + /* + * If the cache is recent, it is likely that there are other + * consumers of vm_getusage running, so add their flags to the + * desired flags for the calculation. 
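 + * For example, if the recent cache was built for VMUSAGE_ZONE and this + * caller wants VMUSAGE_PROJECTS, the new calculation is run with both + * flags so that all concurrent consumers can keep sharing one cache.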
+ */ + if (cacherecent == 1) + flags = vmu_data.vmu_cache->vmc_flags | flags; + } + if (vmu_data.vmu_calc_thread == NULL) { + + vmu_cache_t *cache; + + vmu_data.vmu_calc_thread = curthread; + vmu_data.vmu_calc_flags = flags; + vmu_data.vmu_entities = NULL; + vmu_data.vmu_nentities = 0; + if (vmu_data.vmu_pending_waiters > 0) + vmu_data.vmu_calc_flags |= + vmu_data.vmu_pending_flags; + + vmu_data.vmu_pending_flags = 0; + mutex_exit(&vmu_data.vmu_lock); + vmu_calculate(); + mutex_enter(&vmu_data.vmu_lock); + /* copy results to cache */ + if (vmu_data.vmu_cache != NULL) + vmu_cache_rele(vmu_data.vmu_cache); + cache = vmu_data.vmu_cache = + vmu_cache_alloc(vmu_data.vmu_nentities, + vmu_data.vmu_calc_flags); + + result = cache->vmc_results; + for (entity = vmu_data.vmu_entities; entity != NULL; + entity = entity->vme_next) { + *result = entity->vme_result; + result++; + } + cache->vmc_timestamp = gethrtime(); + vmu_cache_hold(cache); + + vmu_data.vmu_calc_flags = 0; + vmu_data.vmu_calc_thread = NULL; + + if (vmu_data.vmu_pending_waiters > 0) + cv_broadcast(&vmu_data.vmu_cv); + + mutex_exit(&vmu_data.vmu_lock); + + /* copy cache */ + ret = vmu_copyout_results(cache, buf, nres, flags_orig); + mutex_enter(&vmu_data.vmu_lock); + vmu_cache_rele(cache); + mutex_exit(&vmu_data.vmu_lock); + + return (ret); + } + vmu_data.vmu_pending_flags |= flags; + vmu_data.vmu_pending_waiters++; + while (vmu_data.vmu_calc_thread != NULL) { + if (cv_wait_sig(&vmu_data.vmu_cv, + &vmu_data.vmu_lock) == 0) { + vmu_data.vmu_pending_waiters--; + mutex_exit(&vmu_data.vmu_lock); + return (set_errno(EINTR)); + } + } + vmu_data.vmu_pending_waiters--; + goto start; +} |
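For reference, a minimal userland sketch of how a consumer such as prstat or rcapd might drive this interface, assuming the getvmusage() wrapper declared in <sys/vm_usage.h> (the flag names and vmusage_t fields are the ones defined above). It follows the two-pass pattern described in the vm_getusage() block comment: a first call with buf == NULL only sizes the result set, and a second call fetches it.

#include <sys/vm_usage.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	uint_t flags = VMUSAGE_SYSTEM | VMUSAGE_ALL_ZONES;
	size_t nres = 0;
	vmusage_t *buf;
	size_t i;

	/* First pass: buf == NULL, so only nres is set on success. */
	if (getvmusage(flags, 5, NULL, &nres) != 0) {
		perror("getvmusage (sizing)");
		return (1);
	}

	if ((buf = calloc(nres, sizeof (vmusage_t))) == NULL)
		return (1);

	/*
	 * Second pass: accept cached results up to 5 seconds old so a
	 * calculation already done for another consumer is reused.  A
	 * robust caller would retry if the result set grew in between.
	 */
	if (getvmusage(flags, 5, buf, &nres) != 0) {
		perror("getvmusage");
		free(buf);
		return (1);
	}

	for (i = 0; i < nres; i++) {
		printf("type %u id %d zone %d rss %llu swap %llu\n",
		    buf[i].vmu_type, (int)buf[i].vmu_id,
		    (int)buf[i].vmu_zoneid,
		    (unsigned long long)buf[i].vmu_rss_all,
		    (unsigned long long)buf[i].vmu_swap_all);
	}
	free(buf);
	return (0);
}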

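Separately, a self-contained userland sketch of the bound-list bookkeeping that vmu_insert_lookup_object_bounds(), vmu_update_bounds() and vmu_merge_bounds() perform on vmu_bound_t lists: page ranges carry a type (unknown, incore, or not incore) and adjacent ranges of equal type are coalesced. The struct and helpers here are toy stand-ins, not the kernel's own.

#include <stdio.h>
#include <stdlib.h>

#define	B_UNKNOWN	0
#define	B_INCORE	1
#define	B_NOT_INCORE	2

typedef struct bound {
	struct bound	*b_next;
	unsigned long	b_start;	/* first page offset in range */
	unsigned long	b_end;		/* last page offset in range */
	int		b_type;		/* B_UNKNOWN, B_INCORE, ... */
} bound_t;

/* Append a range to a sorted, non-overlapping list (toy version). */
static bound_t *
bound_append(bound_t *tail, unsigned long start, unsigned long end, int type)
{
	bound_t *b = calloc(1, sizeof (bound_t));

	if (b == NULL)
		exit(1);
	b->b_start = start;
	b->b_end = end;
	b->b_type = type;
	if (tail != NULL)
		tail->b_next = b;
	return (b);
}

/* Coalesce adjacent ranges of equal type, as vmu_merge_bounds() does. */
static void
bound_merge(bound_t *list)
{
	bound_t *b = list;

	while (b != NULL && b->b_next != NULL) {
		if (b->b_end + 1 == b->b_next->b_start &&
		    b->b_type == b->b_next->b_type) {
			bound_t *dead = b->b_next;

			b->b_end = dead->b_end;
			b->b_next = dead->b_next;
			free(dead);
		} else {
			b = b->b_next;
		}
	}
}

int
main(void)
{
	bound_t *head, *t;

	/* Pages 0-9 and 10-19 found resident, 20-29 paged out. */
	head = t = bound_append(NULL, 0, 9, B_INCORE);
	t = bound_append(t, 10, 19, B_INCORE);
	(void) bound_append(t, 20, 29, B_NOT_INCORE);

	bound_merge(head);	/* leaves [0,19] incore, [20,29] not incore */

	for (t = head; t != NULL; t = t->b_next)
		printf("[%lu-%lu] type %d\n", t->b_start, t->b_end, t->b_type);
	return (0);
}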