diff options
Diffstat (limited to 'usr/src/uts/common/vm')
-rw-r--r-- | usr/src/uts/common/vm/hat.h | 10 | ||||
-rw-r--r-- | usr/src/uts/common/vm/page.h | 7 | ||||
-rw-r--r-- | usr/src/uts/common/vm/seg_vn.c | 11 | ||||
-rw-r--r-- | usr/src/uts/common/vm/vm_as.c | 19 | ||||
-rw-r--r-- | usr/src/uts/common/vm/vm_page.c | 11 | ||||
-rw-r--r-- | usr/src/uts/common/vm/vm_pvn.c | 28 | ||||
-rw-r--r-- | usr/src/uts/common/vm/vm_usage.c | 252 |
7 files changed, 262 insertions, 76 deletions
diff --git a/usr/src/uts/common/vm/hat.h b/usr/src/uts/common/vm/hat.h index a2509e7bb6..3735139068 100644 --- a/usr/src/uts/common/vm/hat.h +++ b/usr/src/uts/common/vm/hat.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2014 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -269,7 +270,12 @@ void hat_kpm_walk(void (*)(void *, void *, size_t), void *); * call. * * int hat_pageunload(pp, forceflag) - * unload all translations attached to pp. + * Unload all translations attached to pp. On x86 the bulk of the work is + * done by hat_page_inval. + * + * void hat_page_inval(pp, pgsz, curhat) + * Unload translations attached to pp. If curhat is provided, only the + * translation for that process is unloaded, otherwise all are unloaded. * * uint_t hat_pagesync(pp, flags) * get hw stats from hardware into page struct and reset hw stats @@ -291,6 +297,7 @@ void hat_page_setattr(struct page *, uint_t); void hat_page_clrattr(struct page *, uint_t); uint_t hat_page_getattr(struct page *, uint_t); int hat_pageunload(struct page *, uint_t); +void hat_page_inval(struct page *, uint_t, struct hat *); uint_t hat_pagesync(struct page *, uint_t); ulong_t hat_page_getshare(struct page *); int hat_page_checkshare(struct page *, ulong_t); @@ -460,6 +467,7 @@ void hat_setstat(struct as *, caddr_t, size_t, uint_t); */ #define HAT_ADV_PGUNLOAD 0x00 #define HAT_FORCE_PGUNLOAD 0x01 +#define HAT_CURPROC_PGUNLOAD 0x02 /* * Attributes for hat_page_*attr, hat_setstats and diff --git a/usr/src/uts/common/vm/page.h b/usr/src/uts/common/vm/page.h index b2f61429e9..bd17487be3 100644 --- a/usr/src/uts/common/vm/page.h +++ b/usr/src/uts/common/vm/page.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017, Joyent, Inc. * Copyright 2021 Oxide Computer Company */ @@ -230,6 +231,7 @@ struct as; * p_nrm * p_mapping * p_share + * p_zoneid * * The following field is file system dependent. How it is used and * the locking strategies applied are up to the individual file system @@ -528,9 +530,8 @@ typedef struct page { pfn_t p_pagenum; /* physical page number */ uint_t p_share; /* number of translations */ -#if defined(_LP64) - uint_t p_sharepad; /* pad for growing p_share */ -#endif + short p_zoneid; /* zone page use tracking */ + short p_pad1; /* TBD */ uint_t p_slckcnt; /* number of softlocks */ #if defined(__sparc) uint_t p_kpmref; /* number of kpm mapping sharers */ diff --git a/usr/src/uts/common/vm/seg_vn.c b/usr/src/uts/common/vm/seg_vn.c index e1b5b79b8d..75e3bf5acb 100644 --- a/usr/src/uts/common/vm/seg_vn.c +++ b/usr/src/uts/common/vm/seg_vn.c @@ -7319,7 +7319,8 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) vpp = svd->vpage; offset = svd->offset + (uintptr_t)(addr - seg->s_base); bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) | - ((flags & MS_INVALIDATE) ? B_INVAL : 0); + ((flags & MS_INVALIDATE) ? B_INVAL : 0) | + ((flags & MS_INVALCURPROC) ? (B_INVALCURONLY | B_INVAL) : 0); if (attr) { pageprot = attr & ~(SHARED|PRIVATE); @@ -7344,11 +7345,11 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) vpp = &svd->vpage[seg_page(seg, addr)]; } else if (svd->vp && svd->amp == NULL && - (flags & MS_INVALIDATE) == 0) { + (flags & (MS_INVALIDATE | MS_INVALCURPROC)) == 0) { /* - * No attributes, no anonymous pages and MS_INVALIDATE flag - * is not on, just use one big request. + * No attributes, no anonymous pages and MS_INVAL* flags + * are not on, just use one big request. */ err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len, bflags, svd->cred, NULL); @@ -7400,7 +7401,7 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) * might race in and lock the page after we unlock and before * we do the PUTPAGE, then PUTPAGE simply does nothing. */ - if (flags & MS_INVALIDATE) { + if (flags & (MS_INVALIDATE | MS_INVALCURPROC)) { if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) { if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { page_unlock(pp); diff --git a/usr/src/uts/common/vm/vm_as.c b/usr/src/uts/common/vm/vm_as.c index c217270943..24e03795e6 100644 --- a/usr/src/uts/common/vm/vm_as.c +++ b/usr/src/uts/common/vm/vm_as.c @@ -57,6 +57,7 @@ #include <sys/cmn_err.h> #include <sys/debug.h> #include <sys/vtrace.h> +#include <sys/ddi.h> #include <vm/hat.h> #include <vm/as.h> @@ -71,6 +72,8 @@ clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */ +ulong_t as_user_seg_limit = 0xffff; /* max segments in an (non-kas) AS */ + static struct kmem_cache *as_cache; static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t); @@ -838,8 +841,6 @@ as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size, int as_lock_held; klwp_t *lwp = ttolwp(curthread); - - retry: /* * Indicate that the lwp is not to be stopped while waiting for a @@ -1703,6 +1704,20 @@ as_map_locked(struct as *as, caddr_t addr, size_t size, segcreate_func_t crfp, p->p_rctls, p, RCA_UNSAFE_ALL); return (ENOMEM); } + + /* + * Keep the number of segments in a userspace AS constrained to + * a reasonable limit. Linux enforces a value slightly less + * than 64k in order to avoid ELF limits if/when a process + * dumps core. While SunOS avoids that specific problem with + * other tricks, the limit is still valuable to keep kernel + * memory consumption in check. + */ + if (avl_numnodes(&as->a_segtree) >= as_user_seg_limit) { + AS_LOCK_EXIT(as); + atomic_inc_32(&p->p_zone->zone_mfseglim); + return (ENOMEM); + } } if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) { diff --git a/usr/src/uts/common/vm/vm_page.c b/usr/src/uts/common/vm/vm_page.c index 134b3bcb33..cb99a6b92b 100644 --- a/usr/src/uts/common/vm/vm_page.c +++ b/usr/src/uts/common/vm/vm_page.c @@ -84,6 +84,7 @@ static pgcnt_t max_page_get; /* max page_get request size in pages */ pgcnt_t total_pages = 0; /* total number of pages (used by /proc) */ +uint64_t n_throttle = 0; /* num times page create throttled */ /* * freemem_lock protects all freemem variables: @@ -1477,6 +1478,8 @@ page_create_throttle(pgcnt_t npages, int flags) uint_t i; pgcnt_t tf; /* effective value of throttlefree */ + atomic_inc_64(&n_throttle); + /* * Normal priority allocations. */ @@ -1509,7 +1512,7 @@ page_create_throttle(pgcnt_t npages, int flags) tf = throttlefree - ((flags & PG_PUSHPAGE) ? pageout_reserve : 0); - cv_signal(&proc_pageout->p_cv); + WAKE_PAGEOUT_SCANNER(); for (;;) { fm = 0; @@ -1596,7 +1599,7 @@ checkagain: } ASSERT(proc_pageout != NULL); - cv_signal(&proc_pageout->p_cv); + WAKE_PAGEOUT_SCANNER(); TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_START, "page_create_sleep_start: freemem %ld needfree %ld", @@ -2243,7 +2246,7 @@ page_create_va_large(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags, if (nscan < desscan && freemem < minfree) { TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, "pageout_cv_signal:freemem %ld", freemem); - cv_signal(&proc_pageout->p_cv); + WAKE_PAGEOUT_SCANNER(); } pp = rootpp; @@ -2372,7 +2375,7 @@ page_create_va(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags, if (nscan < desscan && freemem < minfree) { TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, "pageout_cv_signal:freemem %ld", freemem); - cv_signal(&proc_pageout->p_cv); + WAKE_PAGEOUT_SCANNER(); } /* diff --git a/usr/src/uts/common/vm/vm_pvn.c b/usr/src/uts/common/vm/vm_pvn.c index aaad06792b..db8b86d6e6 100644 --- a/usr/src/uts/common/vm/vm_pvn.c +++ b/usr/src/uts/common/vm/vm_pvn.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ @@ -431,7 +432,14 @@ pvn_write_done(page_t *plist, int flags) page_io_unlock(pp); page_unlock(pp); } - } else if (flags & B_INVAL) { + } else if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) { + /* + * If B_INVALCURONLY is set, then we handle that case + * in the next conditional if hat_page_is_mapped() + * indicates that there are no additional mappings + * to the page. + */ + /* * XXX - Failed writes with B_INVAL set are * not handled appropriately. @@ -565,8 +573,9 @@ pvn_write_done(page_t *plist, int flags) } /* - * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI, - * B_TRUNC, B_FORCE}. B_DELWRI indicates that this page is part of a kluster + * Flags are composed of {B_ASYNC, B_INVAL, B_INVALCURONLY, B_FREE, + * B_DONTNEED, B_DELWRI, B_TRUNC, B_FORCE}. + * B_DELWRI indicates that this page is part of a kluster * operation and is only to be considered if it doesn't involve any * waiting here. B_TRUNC indicates that the file is being truncated * and so no i/o needs to be done. B_FORCE indicates that the page @@ -620,13 +629,17 @@ pvn_getdirty(page_t *pp, int flags) * If we want to free or invalidate the page then * we need to unload it so that anyone who wants * it will have to take a minor fault to get it. + * If we are only invalidating the page for the + * current process, then pass in a different flag. * Otherwise, we're just writing the page back so we * need to sync up the hardwre and software mod bit to * detect any future modifications. We clear the * software mod bit when we put the page on the dirty * list. */ - if (flags & (B_INVAL | B_FREE)) { + if (flags & B_INVALCURONLY) { + (void) hat_pageunload(pp, HAT_CURPROC_PGUNLOAD); + } else if (flags & (B_INVAL | B_FREE)) { (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); } else { (void) hat_pagesync(pp, HAT_SYNC_ZERORM); @@ -638,7 +651,7 @@ pvn_getdirty(page_t *pp, int flags) * list after all. */ page_io_unlock(pp); - if (flags & B_INVAL) { + if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) { /*LINTED: constant in conditional context*/ VN_DISPOSE(pp, B_INVAL, 0, kcred); } else if (flags & B_FREE) { @@ -650,6 +663,9 @@ pvn_getdirty(page_t *pp, int flags) * of VOP_PUTPAGE() who prefer freeing the * page _only_ if no one else is accessing it. * E.g. segmap_release() + * We also take this path for B_INVALCURONLY and + * let page_release call VN_DISPOSE if no one else is + * using the page. * * The above hat_ismod() check is useless because: * (1) we may not be holding SE_EXCL lock; @@ -674,7 +690,7 @@ pvn_getdirty(page_t *pp, int flags) * We'll detect the fact that they used it when the * i/o is done and avoid freeing the page. */ - if (flags & B_FREE) + if (flags & (B_FREE | B_INVALCURONLY)) page_downgrade(pp); diff --git a/usr/src/uts/common/vm/vm_usage.c b/usr/src/uts/common/vm/vm_usage.c index 98ed21d059..88c694336d 100644 --- a/usr/src/uts/common/vm/vm_usage.c +++ b/usr/src/uts/common/vm/vm_usage.c @@ -25,6 +25,10 @@ */ /* + * Copyright 2018, Joyent, Inc. + */ + +/* * vm_usage * * This file implements the getvmusage() private system call. @@ -114,7 +118,7 @@ * For accurate counting of map-shared and COW-shared pages. * * - visited private anons (refcnt > 1) for each collective. - * (entity->vme_anon_hash) + * (entity->vme_anon) * For accurate counting of COW-shared pages. * * The common accounting structure is the vmu_entity_t, which represents @@ -152,6 +156,7 @@ #include <sys/vm_usage.h> #include <sys/zone.h> #include <sys/sunddi.h> +#include <sys/sysmacros.h> #include <sys/avl.h> #include <vm/anon.h> #include <vm/as.h> @@ -199,6 +204,14 @@ typedef struct vmu_object { } vmu_object_t; /* + * Node for tree of visited COW anons. + */ +typedef struct vmu_anon { + avl_node_t vma_node; + uintptr_t vma_addr; +} vmu_anon_t; + +/* * Entity by which to count results. * * The entity structure keeps the current rss/swap counts for each entity @@ -221,7 +234,7 @@ typedef struct vmu_entity { struct vmu_entity *vme_next_calc; mod_hash_t *vme_vnode_hash; /* vnodes visited for entity */ mod_hash_t *vme_amp_hash; /* shared amps visited for entity */ - mod_hash_t *vme_anon_hash; /* COW anons visited for entity */ + avl_tree_t vme_anon; /* COW anons visited for entity */ vmusage_t vme_result; /* identifies entity and results */ } vmu_entity_t; @@ -324,6 +337,23 @@ bounds_cmp(const void *bnd1, const void *bnd2) } /* + * Comparison routine for our AVL tree of anon structures. + */ +static int +vmu_anon_cmp(const void *lhs, const void *rhs) +{ + const vmu_anon_t *l = lhs, *r = rhs; + + if (l->vma_addr == r->vma_addr) + return (0); + + if (l->vma_addr < r->vma_addr) + return (-1); + + return (1); +} + +/* * Save a bound on the free list. */ static void @@ -363,13 +393,18 @@ static void vmu_free_entity(mod_hash_val_t val) { vmu_entity_t *entity = (vmu_entity_t *)val; + vmu_anon_t *anon; + void *cookie = NULL; if (entity->vme_vnode_hash != NULL) i_mod_hash_clear_nosync(entity->vme_vnode_hash); if (entity->vme_amp_hash != NULL) i_mod_hash_clear_nosync(entity->vme_amp_hash); - if (entity->vme_anon_hash != NULL) - i_mod_hash_clear_nosync(entity->vme_anon_hash); + + while ((anon = avl_destroy_nodes(&entity->vme_anon, &cookie)) != NULL) + kmem_free(anon, sizeof (vmu_anon_t)); + + avl_destroy(&entity->vme_anon); entity->vme_next = vmu_data.vmu_free_entities; vmu_data.vmu_free_entities = entity; @@ -485,10 +520,10 @@ vmu_alloc_entity(id_t id, int type, id_t zoneid) "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object, sizeof (struct anon_map)); - if (entity->vme_anon_hash == NULL) - entity->vme_anon_hash = mod_hash_create_ptrhash( - "vmusage anon hash", VMUSAGE_HASH_SIZE, - mod_hash_null_valdtor, sizeof (struct anon)); + VERIFY(avl_first(&entity->vme_anon) == NULL); + + avl_create(&entity->vme_anon, vmu_anon_cmp, sizeof (struct vmu_anon), + offsetof(struct vmu_anon, vma_node)); entity->vme_next = vmu_data.vmu_entities; vmu_data.vmu_entities = entity; @@ -518,7 +553,8 @@ vmu_alloc_zone(id_t id) zone->vmz_id = id; - if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0) + if ((vmu_data.vmu_calc_flags & + (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE)) != 0) zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id); if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS | @@ -613,21 +649,19 @@ vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type) } static int -vmu_find_insert_anon(mod_hash_t *hash, caddr_t key) +vmu_find_insert_anon(vmu_entity_t *entity, void *key) { - int ret; - caddr_t val; + vmu_anon_t anon, *ap; - ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key, - (mod_hash_val_t *)&val); + anon.vma_addr = (uintptr_t)key; - if (ret == 0) + if (avl_find(&entity->vme_anon, &anon, NULL) != NULL) return (0); - ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key, - (mod_hash_val_t)key, (mod_hash_hndl_t)0); + ap = kmem_alloc(sizeof (vmu_anon_t), KM_SLEEP); + ap->vma_addr = (uintptr_t)key; - ASSERT(ret == 0); + avl_add(&entity->vme_anon, ap); return (1); } @@ -918,6 +952,8 @@ vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp, next = AVL_NEXT(tree, next); continue; } + + ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN); bound_type = next->vmb_type; index = next->vmb_start; while (index <= next->vmb_end) { @@ -937,7 +973,10 @@ vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp, if (ap != NULL && vn != NULL && vn->v_pages != NULL && (page = page_exists(vn, off)) != NULL) { - page_type = VMUSAGE_BOUND_INCORE; + if (PP_ISFREE(page)) + page_type = VMUSAGE_BOUND_NOT_INCORE; + else + page_type = VMUSAGE_BOUND_INCORE; if (page->p_szc > 0) { pgcnt = page_get_pagecnt(page->p_szc); pgshft = page_get_shift(page->p_szc); @@ -947,8 +986,10 @@ vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp, } else { page_type = VMUSAGE_BOUND_NOT_INCORE; } + if (bound_type == VMUSAGE_BOUND_UNKNOWN) { next->vmb_type = page_type; + bound_type = page_type; } else if (next->vmb_type != page_type) { /* * If current bound type does not match page @@ -1009,6 +1050,7 @@ vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode, continue; } + ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN); bound_type = next->vmb_type; index = next->vmb_start; while (index <= next->vmb_end) { @@ -1024,7 +1066,10 @@ vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode, if (vnode->v_pages != NULL && (page = page_exists(vnode, ptob(index))) != NULL) { - page_type = VMUSAGE_BOUND_INCORE; + if (PP_ISFREE(page)) + page_type = VMUSAGE_BOUND_NOT_INCORE; + else + page_type = VMUSAGE_BOUND_INCORE; if (page->p_szc > 0) { pgcnt = page_get_pagecnt(page->p_szc); pgshft = page_get_shift(page->p_szc); @@ -1034,8 +1079,10 @@ vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode, } else { page_type = VMUSAGE_BOUND_NOT_INCORE; } + if (bound_type == VMUSAGE_BOUND_UNKNOWN) { next->vmb_type = page_type; + bound_type = page_type; } else if (next->vmb_type != page_type) { /* * If current bound type does not match page @@ -1306,6 +1353,12 @@ vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg) } /* + * Pages on the free list aren't counted for the rss. + */ + if (PP_ISFREE(page)) + continue; + + /* * Assume anon structs with a refcnt * of 1 are not COW shared, so there * is no reason to track them per entity. @@ -1322,8 +1375,7 @@ vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg) * Track COW anons per entity so * they are not double counted. */ - if (vmu_find_insert_anon(entity->vme_anon_hash, - (caddr_t)ap) == 0) + if (vmu_find_insert_anon(entity, ap) == 0) continue; result->vmu_rss_all += (pgcnt << PAGESHIFT); @@ -1463,8 +1515,9 @@ vmu_calculate_proc(proc_t *p) entities = tmp; } if (vmu_data.vmu_calc_flags & - (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS | - VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS | + (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE | + VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS | + VMUSAGE_TASKS | VMUSAGE_ALL_TASKS | VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) { ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash, @@ -1597,8 +1650,7 @@ vmu_free_extra() mod_hash_destroy_hash(te->vme_vnode_hash); if (te->vme_amp_hash != NULL) mod_hash_destroy_hash(te->vme_amp_hash); - if (te->vme_anon_hash != NULL) - mod_hash_destroy_hash(te->vme_anon_hash); + VERIFY(avl_first(&te->vme_anon) == NULL); kmem_free(te, sizeof (vmu_entity_t)); } while (vmu_data.vmu_free_zones != NULL) { @@ -1619,13 +1671,42 @@ vmu_free_extra() extern kcondvar_t *pr_pid_cv; +static void +vmu_get_zone_rss(zoneid_t zid) +{ + vmu_zone_t *zone; + zone_t *zp; + int ret; + uint_t pgcnt; + + if ((zp = zone_find_by_id(zid)) == NULL) + return; + + ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash, + (mod_hash_key_t)(uintptr_t)zid, (mod_hash_val_t *)&zone); + if (ret != 0) { + zone = vmu_alloc_zone(zid); + ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash, + (mod_hash_key_t)(uintptr_t)zid, + (mod_hash_val_t)zone, (mod_hash_hndl_t)0); + ASSERT(ret == 0); + } + + ASSERT(zid >= 0 && zid <= MAX_ZONEID); + pgcnt = zone_pdata[zid].zpers_pg_cnt; + zone->vmz_zone->vme_result.vmu_rss_all = (size_t)ptob(pgcnt); + zone->vmz_zone->vme_result.vmu_swap_all = zp->zone_max_swap; + + zone_rele(zp); +} + /* * Determine which entity types are relevant and allocate the hashes to - * track them. Then walk the process table and count rss and swap - * for each process'es address space. Address space object such as - * vnodes, amps and anons are tracked per entity, so that they are - * not double counted in the results. - * + * track them. First get the zone rss using the data we already have. Then, + * if necessary, walk the process table and count rss and swap for each + * process'es address space. Address space object such as vnodes, amps and + * anons are tracked per entity, so that they are not double counted in the + * results. */ static void vmu_calculate() @@ -1633,6 +1714,7 @@ vmu_calculate() int i = 0; int ret; proc_t *p; + uint_t zone_flags = 0; vmu_clear_calc(); @@ -1640,9 +1722,34 @@ vmu_calculate() vmu_data.vmu_system = vmu_alloc_entity(0, VMUSAGE_SYSTEM, ALL_ZONES); + zone_flags = vmu_data.vmu_calc_flags & VMUSAGE_ZONE_FLAGS; + if (zone_flags != 0) { + /* + * Use the accurate zone RSS data we already keep track of. + */ + int i; + + for (i = 0; i <= MAX_ZONEID; i++) { + if (zone_pdata[i].zpers_pg_cnt > 0) { + vmu_get_zone_rss(i); + } + } + } + + /* If only neeeded zone data, we're done. */ + if ((vmu_data.vmu_calc_flags & ~VMUSAGE_ZONE_FLAGS) == 0) { + return; + } + + DTRACE_PROBE(vmu__calculate__all); + vmu_data.vmu_calc_flags &= ~VMUSAGE_ZONE_FLAGS; + /* * Walk process table and calculate rss of each proc. * + * Since we already obtained all zone rss above, the following loop + * executes with the VMUSAGE_ZONE_FLAGS cleared. + * * Pidlock and p_lock cannot be held while doing the rss calculation. * This is because: * 1. The calculation allocates using KM_SLEEP. @@ -1697,6 +1804,12 @@ again: mutex_exit(&pidlock); vmu_free_extra(); + + /* + * Restore any caller-supplied zone flags we blocked during + * the process-table walk. + */ + vmu_data.vmu_calc_flags |= zone_flags; } /* @@ -1747,7 +1860,7 @@ vmu_cache_rele(vmu_cache_t *cache) */ static int vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres, - uint_t flags, int cpflg) + uint_t flags, id_t req_zone_id, int cpflg) { vmusage_t *result, *out_result; vmusage_t dummy; @@ -1766,7 +1879,7 @@ vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres, /* figure out what results the caller is interested in. */ if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone) types |= VMUSAGE_SYSTEM; - if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) + if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE)) types |= VMUSAGE_ZONE; if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) @@ -1829,26 +1942,33 @@ vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres, continue; } - /* Skip "other zone" results if not requested */ - if (result->vmu_zoneid != curproc->p_zone->zone_id) { - if (result->vmu_type == VMUSAGE_ZONE && - (flags & VMUSAGE_ALL_ZONES) == 0) - continue; - if (result->vmu_type == VMUSAGE_PROJECTS && - (flags & (VMUSAGE_ALL_PROJECTS | - VMUSAGE_COL_PROJECTS)) == 0) - continue; - if (result->vmu_type == VMUSAGE_TASKS && - (flags & VMUSAGE_ALL_TASKS) == 0) - continue; - if (result->vmu_type == VMUSAGE_RUSERS && - (flags & (VMUSAGE_ALL_RUSERS | - VMUSAGE_COL_RUSERS)) == 0) - continue; - if (result->vmu_type == VMUSAGE_EUSERS && - (flags & (VMUSAGE_ALL_EUSERS | - VMUSAGE_COL_EUSERS)) == 0) + if (result->vmu_type == VMUSAGE_ZONE && + flags & VMUSAGE_A_ZONE) { + /* Skip non-requested zone results */ + if (result->vmu_zoneid != req_zone_id) continue; + } else { + /* Skip "other zone" results if not requested */ + if (result->vmu_zoneid != curproc->p_zone->zone_id) { + if (result->vmu_type == VMUSAGE_ZONE && + (flags & VMUSAGE_ALL_ZONES) == 0) + continue; + if (result->vmu_type == VMUSAGE_PROJECTS && + (flags & (VMUSAGE_ALL_PROJECTS | + VMUSAGE_COL_PROJECTS)) == 0) + continue; + if (result->vmu_type == VMUSAGE_TASKS && + (flags & VMUSAGE_ALL_TASKS) == 0) + continue; + if (result->vmu_type == VMUSAGE_RUSERS && + (flags & (VMUSAGE_ALL_RUSERS | + VMUSAGE_COL_RUSERS)) == 0) + continue; + if (result->vmu_type == VMUSAGE_EUSERS && + (flags & (VMUSAGE_ALL_EUSERS | + VMUSAGE_COL_EUSERS)) == 0) + continue; + } } count++; if (out_result != NULL) { @@ -1904,10 +2024,12 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg) int cacherecent = 0; hrtime_t now; uint_t flags_orig; + id_t req_zone_id; /* * Non-global zones cannot request system wide and/or collated - * results, or the system result, so munge the flags accordingly. + * results, or the system result, or usage of another zone, so munge + * the flags accordingly. */ flags_orig = flags; if (curproc->p_zone != global_zone) { @@ -1927,6 +2049,10 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg) flags &= ~VMUSAGE_SYSTEM; flags |= VMUSAGE_ZONE; } + if (flags & VMUSAGE_A_ZONE) { + flags &= ~VMUSAGE_A_ZONE; + flags |= VMUSAGE_ZONE; + } } /* Check for unknown flags */ @@ -1937,6 +2063,21 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg) if ((flags & VMUSAGE_MASK) == 0) return (set_errno(EINVAL)); + /* If requesting results for a specific zone, get the zone ID */ + if (flags & VMUSAGE_A_ZONE) { + size_t bufsize; + vmusage_t zreq; + + if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg)) + return (set_errno(EFAULT)); + /* Requested zone ID is passed in buf, so 0 len not allowed */ + if (bufsize == 0) + return (set_errno(EINVAL)); + if (ddi_copyin((caddr_t)buf, &zreq, sizeof (vmusage_t), cpflg)) + return (set_errno(EFAULT)); + req_zone_id = zreq.vmu_id; + } + mutex_enter(&vmu_data.vmu_lock); now = gethrtime(); @@ -1956,7 +2097,7 @@ start: mutex_exit(&vmu_data.vmu_lock); ret = vmu_copyout_results(cache, buf, nres, flags_orig, - cpflg); + req_zone_id, cpflg); mutex_enter(&vmu_data.vmu_lock); vmu_cache_rele(cache); if (vmu_data.vmu_pending_waiters > 0) @@ -2013,7 +2154,8 @@ start: mutex_exit(&vmu_data.vmu_lock); /* copy cache */ - ret = vmu_copyout_results(cache, buf, nres, flags_orig, cpflg); + ret = vmu_copyout_results(cache, buf, nres, flags_orig, + req_zone_id, cpflg); mutex_enter(&vmu_data.vmu_lock); vmu_cache_rele(cache); mutex_exit(&vmu_data.vmu_lock); |