Diffstat (limited to 'usr/src/uts/common/vm')
-rw-r--r--   usr/src/uts/common/vm/hat.h       |  10
-rw-r--r--   usr/src/uts/common/vm/seg_kmem.c  |   3
-rw-r--r--   usr/src/uts/common/vm/seg_kmem.h  |   3
-rw-r--r--   usr/src/uts/common/vm/seg_umap.c  | 466
-rw-r--r--   usr/src/uts/common/vm/seg_umap.h  |  43
-rw-r--r--   usr/src/uts/common/vm/seg_vn.c    |  11
-rw-r--r--   usr/src/uts/common/vm/vm_anon.c   |  11
-rw-r--r--   usr/src/uts/common/vm/vm_as.c     |  20
-rw-r--r--   usr/src/uts/common/vm/vm_pvn.c    |  28
-rw-r--r--   usr/src/uts/common/vm/vm_swap.c   |  27
-rw-r--r--   usr/src/uts/common/vm/vm_usage.c  | 387
11 files changed, 942 insertions, 67 deletions
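
For orientation before the per-file diffs: the new seg_umap driver added below exposes already-existing kernel pages (notably the comm page) to userspace through a dedicated segment type. The sketch below shows how a kernel caller might consume it; the helper name, the map_addr()-based address selection, and the read-only protection are illustrative assumptions, while segumap_create() and segumap_crargs_t are taken from the diff itself.

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/mman.h>
#include <vm/as.h>
#include <vm/seg_umap.h>

/*
 * Illustrative helper (not part of this change): map the page-aligned
 * kernel address 'kaddr' into user address space 'as', read-only.
 */
static int
commpage_map_sketch(struct as *as, caddr_t kaddr, size_t len, caddr_t *uaddrp)
{
        segumap_crargs_t args;
        int err;

        /* segumap_create() requires a user-accessible protection. */
        args.kaddr = kaddr;
        args.prot = PROT_READ | PROT_USER;
        args.maxprot = PROT_READ | PROT_USER;

        as_rangelock(as);
        /* Let the platform pick a suitable user address (illustrative). */
        map_addr(uaddrp, len, (offset_t)0, 1, 0);
        if (*uaddrp == NULL) {
                as_rangeunlock(as);
                return (ENOMEM);
        }
        err = as_map(as, *uaddrp, len, segumap_create, &args);
        as_rangeunlock(as);
        return (err);
}

As the seg_umap.c hunks below show, segumap_create() rejects any request whose kernel address is not page-aligned, not above _userlimit, not backed by segkmem, or whose protection lacks PROT_USER.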
diff --git a/usr/src/uts/common/vm/hat.h b/usr/src/uts/common/vm/hat.h index 1d91475e38..c908a9e16c 100644 --- a/usr/src/uts/common/vm/hat.h +++ b/usr/src/uts/common/vm/hat.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2014 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -269,7 +270,12 @@ void hat_kpm_walk(void (*)(void *, void *, size_t), void *); * call. * * int hat_pageunload(pp, forceflag) - * unload all translations attached to pp. + * Unload all translations attached to pp. On x86 the bulk of the work is + * done by hat_page_inval. + * + * void hat_page_inval(pp, pgsz, curhat) + * Unload translations attached to pp. If curhat is provided, only the + * translation for that process is unloaded, otherwise all are unloaded. * * uint_t hat_pagesync(pp, flags) * get hw stats from hardware into page struct and reset hw stats @@ -291,6 +297,7 @@ void hat_page_setattr(struct page *, uint_t); void hat_page_clrattr(struct page *, uint_t); uint_t hat_page_getattr(struct page *, uint_t); int hat_pageunload(struct page *, uint_t); +void hat_page_inval(struct page *, uint_t, struct hat *); uint_t hat_pagesync(struct page *, uint_t); ulong_t hat_page_getshare(struct page *); int hat_page_checkshare(struct page *, ulong_t); @@ -460,6 +467,7 @@ void hat_setstat(struct as *, caddr_t, size_t, uint_t); */ #define HAT_ADV_PGUNLOAD 0x00 #define HAT_FORCE_PGUNLOAD 0x01 +#define HAT_CURPROC_PGUNLOAD 0x02 /* * Attributes for hat_page_*attr, hat_setstats and diff --git a/usr/src/uts/common/vm/seg_kmem.c b/usr/src/uts/common/vm/seg_kmem.c index 90e1b73b70..439c859d96 100644 --- a/usr/src/uts/common/vm/seg_kmem.c +++ b/usr/src/uts/common/vm/seg_kmem.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. */ #include <sys/types.h> @@ -773,7 +774,7 @@ segkmem_capable(struct seg *seg, segcapability_t capability) return (0); } -static struct seg_ops segkmem_ops = { +struct seg_ops segkmem_ops = { SEGKMEM_BADOP(int), /* dup */ SEGKMEM_BADOP(int), /* unmap */ SEGKMEM_BADOP(void), /* free */ diff --git a/usr/src/uts/common/vm/seg_kmem.h b/usr/src/uts/common/vm/seg_kmem.h index 2a4ed3b2aa..3ad4202e91 100644 --- a/usr/src/uts/common/vm/seg_kmem.h +++ b/usr/src/uts/common/vm/seg_kmem.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ #ifndef _VM_SEG_KMEM_H @@ -136,6 +137,8 @@ extern size_t segkmem_kmemlp_max; #define IS_KMEM_VA_LARGEPAGE(vaddr) \ (((vaddr) >= heap_lp_base) && ((vaddr) < heap_lp_end)) +extern struct seg_ops segkmem_ops; + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/vm/seg_umap.c b/usr/src/uts/common/vm/seg_umap.c new file mode 100644 index 0000000000..ccad71c5d6 --- /dev/null +++ b/usr/src/uts/common/vm/seg_umap.c @@ -0,0 +1,466 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. 
+ */ + +/* + * VM - Kernel-to-user mapping segment + * + * The umap segment driver was primarily designed to facilitate the comm page: + * a portion of kernel memory shared with userspace so that certain (namely + * clock-related) actions could operate without making an expensive trip into + * the kernel. + * + * Since the initial requirements for the comm page are slim, advanced features + * of the segment driver such as per-page protection have been left + * unimplemented at this time. + */ + + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/cred.h> +#include <sys/kmem.h> +#include <sys/lgrp.h> +#include <sys/mman.h> + +#include <vm/hat.h> +#include <vm/as.h> +#include <vm/seg.h> +#include <vm/seg_kmem.h> +#include <vm/seg_umap.h> + + +static boolean_t segumap_verify_safe(caddr_t, size_t); +static int segumap_dup(struct seg *, struct seg *); +static int segumap_unmap(struct seg *, caddr_t, size_t); +static void segumap_free(struct seg *); +static faultcode_t segumap_fault(struct hat *, struct seg *, caddr_t, size_t, + enum fault_type, enum seg_rw); +static faultcode_t segumap_faulta(struct seg *, caddr_t); +static int segumap_setprot(struct seg *, caddr_t, size_t, uint_t); +static int segumap_checkprot(struct seg *, caddr_t, size_t, uint_t); +static int segumap_sync(struct seg *, caddr_t, size_t, int, uint_t); +static size_t segumap_incore(struct seg *, caddr_t, size_t, char *); +static int segumap_lockop(struct seg *, caddr_t, size_t, int, int, ulong_t *, + size_t); +static int segumap_getprot(struct seg *, caddr_t, size_t, uint_t *); +static u_offset_t segumap_getoffset(struct seg *, caddr_t); +static int segumap_gettype(struct seg *, caddr_t); +static int segumap_getvp(struct seg *, caddr_t, struct vnode **); +static int segumap_advise(struct seg *, caddr_t, size_t, uint_t); +static void segumap_dump(struct seg *); +static int segumap_pagelock(struct seg *, caddr_t, size_t, struct page ***, + enum lock_type, enum seg_rw); +static int segumap_setpagesize(struct seg *, caddr_t, size_t, uint_t); +static int segumap_getmemid(struct seg *, caddr_t, memid_t *); +static int segumap_capable(struct seg *, segcapability_t); + +static struct seg_ops segumap_ops = { + segumap_dup, + segumap_unmap, + segumap_free, + segumap_fault, + segumap_faulta, + segumap_setprot, + segumap_checkprot, + NULL, /* kluster: disabled */ + NULL, /* swapout: disabled */ + segumap_sync, + segumap_incore, + segumap_lockop, + segumap_getprot, + segumap_getoffset, + segumap_gettype, + segumap_getvp, + segumap_advise, + segumap_dump, + segumap_pagelock, + segumap_setpagesize, + segumap_getmemid, + NULL, /* getpolicy: disabled */ + segumap_capable, + seg_inherit_notsup +}; + + +/* + * Create a kernel/user-mapped segment. 
+ */ +int +segumap_create(struct seg *seg, void *argsp) +{ + segumap_crargs_t *a = (struct segumap_crargs *)argsp; + segumap_data_t *data; + + ASSERT((uintptr_t)a->kaddr > _userlimit); + + /* + * Check several aspects of the mapping request to ensure validity: + * - kernel pages must reside entirely in kernel space + * - target protection must be user-accessible + * - kernel address must be page-aligned + * - kernel address must reside inside a "safe" segment + */ + if ((uintptr_t)a->kaddr <= _userlimit || + ((uintptr_t)a->kaddr + seg->s_size) < (uintptr_t)a->kaddr || + (a->prot & PROT_USER) == 0 || + ((uintptr_t)a->kaddr & PAGEOFFSET) != 0 || + !segumap_verify_safe(a->kaddr, seg->s_size)) { + return (EINVAL); + } + + data = kmem_zalloc(sizeof (*data), KM_SLEEP); + rw_init(&data->sud_lock, NULL, RW_DEFAULT, NULL); + data->sud_kaddr = a->kaddr; + data->sud_prot = a->prot; + data->sud_loaded = B_FALSE; + + seg->s_ops = &segumap_ops; + seg->s_data = data; + return (0); +} + +static boolean_t +segumap_verify_safe(caddr_t kaddr, size_t len) +{ + struct seg *seg; + + /* + * Presently, only pages which are backed by segkmem are allowed to be + * shared with userspace. This prevents nasty paging behavior with + * other drivers such as seg_kp. Furthermore, the backing kernel + * segment must completely contain the region to be mapped. + * + * Failing these checks is fatal for now since such mappings are done + * in a very limited context from the kernel. + */ + AS_LOCK_ENTER(&kas, RW_READER); + seg = as_segat(&kas, kaddr); + VERIFY(seg != NULL); + VERIFY(seg->s_base + seg->s_size >= kaddr + len); + VERIFY(seg->s_ops == &segkmem_ops); + AS_LOCK_EXIT(&kas); + + return (B_TRUE); +} + +static int +segumap_dup(struct seg *seg, struct seg *newseg) +{ + segumap_data_t *sud = (segumap_data_t *)seg->s_data; + segumap_data_t *newsud; + + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as)); + + newsud = kmem_zalloc(sizeof (segumap_data_t), KM_SLEEP); + rw_init(&newsud->sud_lock, NULL, RW_DEFAULT, NULL); + newsud->sud_kaddr = sud->sud_kaddr; + newsud->sud_prot = sud->sud_prot; + newsud->sud_loaded = B_FALSE; + + newseg->s_ops = seg->s_ops; + newseg->s_data = newsud; + return (0); +} + +static int +segumap_unmap(struct seg *seg, caddr_t addr, size_t len) +{ + segumap_data_t *sud = (segumap_data_t *)seg->s_data; + + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as)); + + /* Only allow unmap of entire segment */ + if (addr != seg->s_base || len != seg->s_size) { + return (EINVAL); + } + if (sud->sud_softlockcnt != 0) { + return (EAGAIN); + } + + hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD_UNMAP); + /* + * While setting this field before immediately freeing the segment is + * not necessary, it is done for the sake of completeness. Doing so + * outside sud_lock is safe with the AS write-locked. 
+ */ + sud->sud_loaded = B_FALSE; + + seg_free(seg); + return (0); +} + +static void +segumap_free(struct seg *seg) +{ + segumap_data_t *data = (segumap_data_t *)seg->s_data; + + ASSERT(data != NULL); + + rw_destroy(&data->sud_lock); + VERIFY(data->sud_loaded == B_FALSE); + VERIFY(data->sud_softlockcnt == 0); + kmem_free(data, sizeof (*data)); + seg->s_data = NULL; +} + +/* ARGSUSED */ +static faultcode_t +segumap_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len, + enum fault_type type, enum seg_rw tw) +{ + segumap_data_t *sud = (segumap_data_t *)seg->s_data; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); + + if (type == F_PROT) { + /* + * Since protection on the segment is fixed, there is nothing + * to do but report an error for protection faults. + */ + return (FC_PROT); + } else if (type == F_SOFTUNLOCK) { + size_t plen = btop(len); + + rw_enter(&sud->sud_lock, RW_WRITER); + VERIFY(sud->sud_softlockcnt >= plen); + sud->sud_softlockcnt -= plen; + rw_exit(&sud->sud_lock); + return (0); + } + + ASSERT(type == F_INVAL || type == F_SOFTLOCK); + rw_enter(&sud->sud_lock, RW_WRITER); + + if (type == F_INVAL && sud->sud_loaded) { + rw_exit(&sud->sud_lock); + return (FC_NOMAP); + } + + /* + * Load the (entire) segment into the HAT if it has not been done so. + */ + if (!sud->sud_loaded) { + for (uintptr_t i = 0; i < seg->s_size; i += PAGESIZE) { + pfn_t pfn; + + pfn = hat_getpfnum(kas.a_hat, sud->sud_kaddr + i); + VERIFY(pfn != PFN_INVALID); + hat_devload(seg->s_as->a_hat, seg->s_base + i, + PAGESIZE, pfn, sud->sud_prot, HAT_LOAD); + } + sud->sud_loaded = B_TRUE; + } else { + /* + * If there the segment has already been loaded, there is no + * reason to take an F_INVALID fault. + */ + VERIFY(type != F_INVAL); + } + + if (type == F_SOFTLOCK) { + size_t nval = sud->sud_softlockcnt + btop(len); + + if (sud->sud_softlockcnt >= nval) { + rw_exit(&sud->sud_lock); + return (FC_MAKE_ERR(EOVERFLOW)); + } + sud->sud_softlockcnt = nval; + } + rw_exit(&sud->sud_lock); + return (0); +} + +/* ARGSUSED */ +static faultcode_t +segumap_faulta(struct seg *seg, caddr_t addr) +{ + /* Do nothing since asynch pagefault should not load translation. */ + return (0); +} + +/* ARGSUSED */ +static int +segumap_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) +{ + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); + + /* + * The seg_umap driver does not yet allow protection to be changed. + */ + return (EACCES); +} + +/* ARGSUSED */ +static int +segumap_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) +{ + segumap_data_t *sud = (segumap_data_t *)seg->s_data; + int error = 0; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); + + rw_enter(&sud->sud_lock, RW_READER); + if ((sud->sud_prot & prot) != prot) { + error = EACCES; + } + rw_exit(&sud->sud_lock); + return (error); +} + +/* ARGSUSED */ +static int +segumap_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) +{ + /* Always succeed since there are no backing store to sync */ + return (0); +} + +/* ARGSUSED */ +static size_t +segumap_incore(struct seg *seg, caddr_t addr, size_t len, char *vec) +{ + size_t sz = 0; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); + + len = (len + PAGEOFFSET) & PAGEMASK; + while (len > 0) { + *vec = 1; + sz += PAGESIZE; + vec++; + len -= PAGESIZE; + } + return (sz); +} + +/* ARGSUSED */ +static int +segumap_lockop(struct seg *seg, caddr_t addr, size_t len, int attr, int op, + ulong_t *lockmap, size_t pos) +{ + /* Report success since kernel pages are always in memory. 
*/ + return (0); +} + +static int +segumap_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv) +{ + segumap_data_t *sud = (segumap_data_t *)seg->s_data; + size_t pgno; + uint_t prot; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); + + rw_enter(&sud->sud_lock, RW_READER); + prot = sud->sud_prot; + rw_exit(&sud->sud_lock); + + /* + * Reporting protection is simple since it is not tracked per-page. + */ + pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1; + while (pgno > 0) { + protv[--pgno] = prot; + } + return (0); +} + +/* ARGSUSED */ +static u_offset_t +segumap_getoffset(struct seg *seg, caddr_t addr) +{ + /* + * To avoid leaking information about the layout of the kernel address + * space, always report '0' as the offset. + */ + return (0); +} + +/* ARGSUSED */ +static int +segumap_gettype(struct seg *seg, caddr_t addr) +{ + /* + * Since already-existing kernel pages are being mapped into userspace, + * always report the segment type as shared. + */ + return (MAP_SHARED); +} + +/* ARGSUSED */ +static int +segumap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp) +{ + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); + + *vpp = NULL; + return (0); +} + +/* ARGSUSED */ +static int +segumap_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) +{ + if (behav == MADV_PURGE) { + /* Purge does not make sense for this mapping */ + return (EINVAL); + } + /* Indicate success for everything else. */ + return (0); +} + +/* ARGSUSED */ +static void +segumap_dump(struct seg *seg) +{ + /* + * Since this is a mapping to share kernel data with userspace, nothing + * additional should be dumped. + */ +} + +/* ARGSUSED */ +static int +segumap_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp, + enum lock_type type, enum seg_rw rw) +{ + return (ENOTSUP); +} + +/* ARGSUSED */ +static int +segumap_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc) +{ + return (ENOTSUP); +} + +static int +segumap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp) +{ + segumap_data_t *sud = (segumap_data_t *)seg->s_data; + + memidp->val[0] = (uintptr_t)sud->sud_kaddr; + memidp->val[1] = (uintptr_t)(addr - seg->s_base); + return (0); +} + +/* ARGSUSED */ +static int +segumap_capable(struct seg *seg, segcapability_t capability) +{ + /* no special capablities */ + return (0); +} diff --git a/usr/src/uts/common/vm/seg_umap.h b/usr/src/uts/common/vm/seg_umap.h new file mode 100644 index 0000000000..bcf7447509 --- /dev/null +++ b/usr/src/uts/common/vm/seg_umap.h @@ -0,0 +1,43 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. 
+ */ + +#ifndef _VM_SEG_UMAP_H +#define _VM_SEG_UMAP_H + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct segumap_crargs { + caddr_t kaddr; + uchar_t prot; /* protection */ + uchar_t maxprot; /* maximum protection */ +} segumap_crargs_t; + +typedef struct segumap_data { + krwlock_t sud_lock; + caddr_t sud_kaddr; + uchar_t sud_prot; + size_t sud_softlockcnt; + boolean_t sud_loaded; +} segumap_data_t; + +extern int segumap_create(struct seg *, void *); + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_SEG_UMAP_H */ diff --git a/usr/src/uts/common/vm/seg_vn.c b/usr/src/uts/common/vm/seg_vn.c index 875dec7fe9..f143c1e464 100644 --- a/usr/src/uts/common/vm/seg_vn.c +++ b/usr/src/uts/common/vm/seg_vn.c @@ -7308,7 +7308,8 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) vpp = svd->vpage; offset = svd->offset + (uintptr_t)(addr - seg->s_base); bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) | - ((flags & MS_INVALIDATE) ? B_INVAL : 0); + ((flags & MS_INVALIDATE) ? B_INVAL : 0) | + ((flags & MS_INVALCURPROC) ? (B_INVALCURONLY | B_INVAL) : 0); if (attr) { pageprot = attr & ~(SHARED|PRIVATE); @@ -7333,11 +7334,11 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) vpp = &svd->vpage[seg_page(seg, addr)]; } else if (svd->vp && svd->amp == NULL && - (flags & MS_INVALIDATE) == 0) { + (flags & (MS_INVALIDATE | MS_INVALCURPROC)) == 0) { /* - * No attributes, no anonymous pages and MS_INVALIDATE flag - * is not on, just use one big request. + * No attributes, no anonymous pages and MS_INVAL* flags + * are not on, just use one big request. */ err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len, bflags, svd->cred, NULL); @@ -7389,7 +7390,7 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) * might race in and lock the page after we unlock and before * we do the PUTPAGE, then PUTPAGE simply does nothing. 
*/ - if (flags & MS_INVALIDATE) { + if (flags & (MS_INVALIDATE | MS_INVALCURPROC)) { if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) { if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { page_unlock(pp); diff --git a/usr/src/uts/common/vm/vm_anon.c b/usr/src/uts/common/vm/vm_anon.c index 4fd32a3f4a..01db9b23d7 100644 --- a/usr/src/uts/common/vm/vm_anon.c +++ b/usr/src/uts/common/vm/vm_anon.c @@ -788,14 +788,21 @@ anon_resvmem(size_t size, boolean_t takemem, zone_t *zone, int tryhard) pgcnt_t pswap_pages = 0; proc_t *p = curproc; - if (zone != NULL && takemem) { + if (zone != NULL) { /* test zone.max-swap resource control */ mutex_enter(&p->p_lock); if (rctl_incr_swap(p, zone, ptob(npages)) != 0) { mutex_exit(&p->p_lock); - atomic_add_64(&zone->zone_anon_alloc_fail, 1); + + if (takemem) + atomic_add_64(&zone->zone_anon_alloc_fail, 1); + return (0); } + + if (!takemem) + rctl_decr_swap(zone, ptob(npages)); + mutex_exit(&p->p_lock); } mutex_enter(&anoninfo_lock); diff --git a/usr/src/uts/common/vm/vm_as.c b/usr/src/uts/common/vm/vm_as.c index bb5a96eb0f..b0a5e7fb33 100644 --- a/usr/src/uts/common/vm/vm_as.c +++ b/usr/src/uts/common/vm/vm_as.c @@ -57,6 +57,7 @@ #include <sys/debug.h> #include <sys/tnf_probe.h> #include <sys/vtrace.h> +#include <sys/ddi.h> #include <vm/hat.h> #include <vm/as.h> @@ -848,8 +849,7 @@ as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size, struct seg *segsav; int as_lock_held; klwp_t *lwp = ttolwp(curthread); - - + zone_t *zonep = curzone; retry: /* @@ -885,6 +885,22 @@ retry: if (as == &kas) CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1); CPU_STATS_EXIT_K(); + if (zonep->zone_pg_flt_delay != 0) { + /* + * The zone in which this process is running is + * currently over it's physical memory cap. Throttle + * page faults to help the user-land memory capper + * catch up. Note that drv_usectohz() rounds up. + */ + atomic_add_64(&zonep->zone_pf_throttle, 1); + atomic_add_64(&zonep->zone_pf_throttle_usec, + zonep->zone_pg_flt_delay); + if (zonep->zone_pg_flt_delay < TICK_TO_USEC(1)) { + drv_usecwait(zonep->zone_pg_flt_delay); + } else { + delay(drv_usectohz(zonep->zone_pg_flt_delay)); + } + } break; } diff --git a/usr/src/uts/common/vm/vm_pvn.c b/usr/src/uts/common/vm/vm_pvn.c index 1b8d12eb8d..a206320a30 100644 --- a/usr/src/uts/common/vm/vm_pvn.c +++ b/usr/src/uts/common/vm/vm_pvn.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ @@ -432,7 +433,14 @@ pvn_write_done(page_t *plist, int flags) page_io_unlock(pp); page_unlock(pp); } - } else if (flags & B_INVAL) { + } else if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) { + /* + * If B_INVALCURONLY is set, then we handle that case + * in the next conditional if hat_page_is_mapped() + * indicates that there are no additional mappings + * to the page. + */ + /* * XXX - Failed writes with B_INVAL set are * not handled appropriately. @@ -573,8 +581,9 @@ pvn_write_done(page_t *plist, int flags) } /* - * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI, - * B_TRUNC, B_FORCE}. B_DELWRI indicates that this page is part of a kluster + * Flags are composed of {B_ASYNC, B_INVAL, B_INVALCURONLY, B_FREE, + * B_DONTNEED, B_DELWRI, B_TRUNC, B_FORCE}. + * B_DELWRI indicates that this page is part of a kluster * operation and is only to be considered if it doesn't involve any * waiting here. 
B_TRUNC indicates that the file is being truncated * and so no i/o needs to be done. B_FORCE indicates that the page @@ -628,13 +637,17 @@ pvn_getdirty(page_t *pp, int flags) * If we want to free or invalidate the page then * we need to unload it so that anyone who wants * it will have to take a minor fault to get it. + * If we are only invalidating the page for the + * current process, then pass in a different flag. * Otherwise, we're just writing the page back so we * need to sync up the hardwre and software mod bit to * detect any future modifications. We clear the * software mod bit when we put the page on the dirty * list. */ - if (flags & (B_INVAL | B_FREE)) { + if (flags & B_INVALCURONLY) { + (void) hat_pageunload(pp, HAT_CURPROC_PGUNLOAD); + } else if (flags & (B_INVAL | B_FREE)) { (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); } else { (void) hat_pagesync(pp, HAT_SYNC_ZERORM); @@ -646,7 +659,7 @@ pvn_getdirty(page_t *pp, int flags) * list after all. */ page_io_unlock(pp); - if (flags & B_INVAL) { + if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) { /*LINTED: constant in conditional context*/ VN_DISPOSE(pp, B_INVAL, 0, kcred); } else if (flags & B_FREE) { @@ -658,6 +671,9 @@ pvn_getdirty(page_t *pp, int flags) * of VOP_PUTPAGE() who prefer freeing the * page _only_ if no one else is accessing it. * E.g. segmap_release() + * We also take this path for B_INVALCURONLY and + * let page_release call VN_DISPOSE if no one else is + * using the page. * * The above hat_ismod() check is useless because: * (1) we may not be holding SE_EXCL lock; @@ -682,7 +698,7 @@ pvn_getdirty(page_t *pp, int flags) * We'll detect the fact that they used it when the * i/o is done and avoid freeing the page. */ - if (flags & B_FREE) + if (flags & (B_FREE | B_INVALCURONLY)) page_downgrade(pp); diff --git a/usr/src/uts/common/vm/vm_swap.c b/usr/src/uts/common/vm/vm_swap.c index 1a28c04357..2a008e114b 100644 --- a/usr/src/uts/common/vm/vm_swap.c +++ b/usr/src/uts/common/vm/vm_swap.c @@ -18,6 +18,11 @@ * * CDDL HEADER END */ + +/* + * Copyright 2015 Joyent, Inc. + */ + /* * Copyright (c) 1987, 2010, Oracle and/or its affiliates. All rights reserved. */ @@ -625,7 +630,18 @@ swapctl(int sc_cmd, void *sc_arg, int *rv) return (0); } beginning: + mutex_enter(&swapinfo_lock); tmp_nswapfiles = nswapfiles; + mutex_exit(&swapinfo_lock); + + /* + * Return early if there are no swap entries to report: + */ + if (tmp_nswapfiles < 1) { + *rv = 0; + return (0); + } + /* Return an error if not enough space for the whole table. */ if (length < tmp_nswapfiles) return (ENOMEM); @@ -920,7 +936,18 @@ swapctl32(int sc_cmd, void *sc_arg, int *rv) return (0); } beginning: + mutex_enter(&swapinfo_lock); tmp_nswapfiles = nswapfiles; + mutex_exit(&swapinfo_lock); + + /* + * Return early if there are no swap entries to report: + */ + if (tmp_nswapfiles < 1) { + *rv = 0; + return (0); + } + /* Return an error if not enough space for the whole table. */ if (length < tmp_nswapfiles) return (ENOMEM); diff --git a/usr/src/uts/common/vm/vm_usage.c b/usr/src/uts/common/vm/vm_usage.c index 57166b4e63..8b9fd0d7a3 100644 --- a/usr/src/uts/common/vm/vm_usage.c +++ b/usr/src/uts/common/vm/vm_usage.c @@ -25,6 +25,10 @@ */ /* + * Copyright 2016, Joyent, Inc. + */ + +/* * vm_usage * * This file implements the getvmusage() private system call. @@ -114,7 +118,7 @@ * For accurate counting of map-shared and COW-shared pages. * * - visited private anons (refcnt > 1) for each collective. 
- * (entity->vme_anon_hash) + * (entity->vme_anon) * For accurate counting of COW-shared pages. * * The common accounting structure is the vmu_entity_t, which represents @@ -152,6 +156,7 @@ #include <sys/vm_usage.h> #include <sys/zone.h> #include <sys/sunddi.h> +#include <sys/sysmacros.h> #include <sys/avl.h> #include <vm/anon.h> #include <vm/as.h> @@ -199,6 +204,14 @@ typedef struct vmu_object { } vmu_object_t; /* + * Node for tree of visited COW anons. + */ +typedef struct vmu_anon { + avl_node_t vma_node; + uintptr_t vma_addr; +} vmu_anon_t; + +/* * Entity by which to count results. * * The entity structure keeps the current rss/swap counts for each entity @@ -221,7 +234,7 @@ typedef struct vmu_entity { struct vmu_entity *vme_next_calc; mod_hash_t *vme_vnode_hash; /* vnodes visited for entity */ mod_hash_t *vme_amp_hash; /* shared amps visited for entity */ - mod_hash_t *vme_anon_hash; /* COW anons visited for entity */ + avl_tree_t vme_anon; /* COW anons visited for entity */ vmusage_t vme_result; /* identifies entity and results */ } vmu_entity_t; @@ -324,6 +337,23 @@ bounds_cmp(const void *bnd1, const void *bnd2) } /* + * Comparison routine for our AVL tree of anon structures. + */ +static int +vmu_anon_cmp(const void *lhs, const void *rhs) +{ + const vmu_anon_t *l = lhs, *r = rhs; + + if (l->vma_addr == r->vma_addr) + return (0); + + if (l->vma_addr < r->vma_addr) + return (-1); + + return (1); +} + +/* * Save a bound on the free list. */ static void @@ -363,13 +393,18 @@ static void vmu_free_entity(mod_hash_val_t val) { vmu_entity_t *entity = (vmu_entity_t *)val; + vmu_anon_t *anon; + void *cookie = NULL; if (entity->vme_vnode_hash != NULL) i_mod_hash_clear_nosync(entity->vme_vnode_hash); if (entity->vme_amp_hash != NULL) i_mod_hash_clear_nosync(entity->vme_amp_hash); - if (entity->vme_anon_hash != NULL) - i_mod_hash_clear_nosync(entity->vme_anon_hash); + + while ((anon = avl_destroy_nodes(&entity->vme_anon, &cookie)) != NULL) + kmem_free(anon, sizeof (vmu_anon_t)); + + avl_destroy(&entity->vme_anon); entity->vme_next = vmu_data.vmu_free_entities; vmu_data.vmu_free_entities = entity; @@ -485,10 +520,10 @@ vmu_alloc_entity(id_t id, int type, id_t zoneid) "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object, sizeof (struct anon_map)); - if (entity->vme_anon_hash == NULL) - entity->vme_anon_hash = mod_hash_create_ptrhash( - "vmusage anon hash", VMUSAGE_HASH_SIZE, - mod_hash_null_valdtor, sizeof (struct anon)); + VERIFY(avl_first(&entity->vme_anon) == NULL); + + avl_create(&entity->vme_anon, vmu_anon_cmp, sizeof (struct vmu_anon), + offsetof(struct vmu_anon, vma_node)); entity->vme_next = vmu_data.vmu_entities; vmu_data.vmu_entities = entity; @@ -518,7 +553,8 @@ vmu_alloc_zone(id_t id) zone->vmz_id = id; - if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0) + if ((vmu_data.vmu_calc_flags & + (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE)) != 0) zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id); if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS | @@ -613,21 +649,19 @@ vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type) } static int -vmu_find_insert_anon(mod_hash_t *hash, caddr_t key) +vmu_find_insert_anon(vmu_entity_t *entity, void *key) { - int ret; - caddr_t val; + vmu_anon_t anon, *ap; - ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key, - (mod_hash_val_t *)&val); + anon.vma_addr = (uintptr_t)key; - if (ret == 0) + if (avl_find(&entity->vme_anon, &anon, NULL) != NULL) return (0); - ret = i_mod_hash_insert_nosync(hash, 
(mod_hash_key_t)key, - (mod_hash_val_t)key, (mod_hash_hndl_t)0); + ap = kmem_alloc(sizeof (vmu_anon_t), KM_SLEEP); + ap->vma_addr = (uintptr_t)key; - ASSERT(ret == 0); + avl_add(&entity->vme_anon, ap); return (1); } @@ -918,6 +952,8 @@ vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp, next = AVL_NEXT(tree, next); continue; } + + ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN); bound_type = next->vmb_type; index = next->vmb_start; while (index <= next->vmb_end) { @@ -937,7 +973,10 @@ vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp, if (ap != NULL && vn != NULL && vn->v_pages != NULL && (page = page_exists(vn, off)) != NULL) { - page_type = VMUSAGE_BOUND_INCORE; + if (PP_ISFREE(page)) + page_type = VMUSAGE_BOUND_NOT_INCORE; + else + page_type = VMUSAGE_BOUND_INCORE; if (page->p_szc > 0) { pgcnt = page_get_pagecnt(page->p_szc); pgshft = page_get_shift(page->p_szc); @@ -947,8 +986,10 @@ vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp, } else { page_type = VMUSAGE_BOUND_NOT_INCORE; } + if (bound_type == VMUSAGE_BOUND_UNKNOWN) { next->vmb_type = page_type; + bound_type = page_type; } else if (next->vmb_type != page_type) { /* * If current bound type does not match page @@ -1009,6 +1050,7 @@ vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode, continue; } + ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN); bound_type = next->vmb_type; index = next->vmb_start; while (index <= next->vmb_end) { @@ -1024,7 +1066,10 @@ vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode, if (vnode->v_pages != NULL && (page = page_exists(vnode, ptob(index))) != NULL) { - page_type = VMUSAGE_BOUND_INCORE; + if (PP_ISFREE(page)) + page_type = VMUSAGE_BOUND_NOT_INCORE; + else + page_type = VMUSAGE_BOUND_INCORE; if (page->p_szc > 0) { pgcnt = page_get_pagecnt(page->p_szc); pgshft = page_get_shift(page->p_szc); @@ -1034,8 +1079,10 @@ vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode, } else { page_type = VMUSAGE_BOUND_NOT_INCORE; } + if (bound_type == VMUSAGE_BOUND_UNKNOWN) { next->vmb_type = page_type; + bound_type = page_type; } else if (next->vmb_type != page_type) { /* * If current bound type does not match page @@ -1304,6 +1351,12 @@ vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg) } /* + * Pages on the free list aren't counted for the rss. + */ + if (PP_ISFREE(page)) + continue; + + /* * Assume anon structs with a refcnt * of 1 are not COW shared, so there * is no reason to track them per entity. @@ -1320,8 +1373,7 @@ vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg) * Track COW anons per entity so * they are not double counted. 
*/ - if (vmu_find_insert_anon(entity->vme_anon_hash, - (caddr_t)ap) == 0) + if (vmu_find_insert_anon(entity, ap) == 0) continue; result->vmu_rss_all += (pgcnt << PAGESHIFT); @@ -1461,8 +1513,9 @@ vmu_calculate_proc(proc_t *p) entities = tmp; } if (vmu_data.vmu_calc_flags & - (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS | - VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS | + (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE | + VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS | + VMUSAGE_TASKS | VMUSAGE_ALL_TASKS | VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) { ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash, @@ -1594,8 +1647,7 @@ vmu_free_extra() mod_hash_destroy_hash(te->vme_vnode_hash); if (te->vme_amp_hash != NULL) mod_hash_destroy_hash(te->vme_amp_hash); - if (te->vme_anon_hash != NULL) - mod_hash_destroy_hash(te->vme_anon_hash); + VERIFY(avl_first(&te->vme_anon) == NULL); kmem_free(te, sizeof (vmu_entity_t)); } while (vmu_data.vmu_free_zones != NULL) { @@ -1739,12 +1791,34 @@ vmu_cache_rele(vmu_cache_t *cache) } /* + * When new data is calculated, update the phys_mem rctl usage value in the + * zones. + */ +static void +vmu_update_zone_rctls(vmu_cache_t *cache) +{ + vmusage_t *rp; + size_t i = 0; + zone_t *zp; + + for (rp = cache->vmc_results; i < cache->vmc_nresults; rp++, i++) { + if (rp->vmu_type == VMUSAGE_ZONE && + rp->vmu_zoneid != ALL_ZONES) { + if ((zp = zone_find_by_id(rp->vmu_zoneid)) != NULL) { + zp->zone_phys_mem = rp->vmu_rss_all; + zone_rele(zp); + } + } + } +} + +/* * Copy out the cached results to a caller. Inspect the callers flags * and zone to determine which cached results should be copied. */ static int vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres, - uint_t flags, int cpflg) + uint_t flags, id_t req_zone_id, int cpflg) { vmusage_t *result, *out_result; vmusage_t dummy; @@ -1763,7 +1837,7 @@ vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres, /* figure out what results the caller is interested in. 
*/ if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone) types |= VMUSAGE_SYSTEM; - if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) + if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE)) types |= VMUSAGE_ZONE; if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) @@ -1826,26 +1900,33 @@ vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres, continue; } - /* Skip "other zone" results if not requested */ - if (result->vmu_zoneid != curproc->p_zone->zone_id) { - if (result->vmu_type == VMUSAGE_ZONE && - (flags & VMUSAGE_ALL_ZONES) == 0) - continue; - if (result->vmu_type == VMUSAGE_PROJECTS && - (flags & (VMUSAGE_ALL_PROJECTS | - VMUSAGE_COL_PROJECTS)) == 0) - continue; - if (result->vmu_type == VMUSAGE_TASKS && - (flags & VMUSAGE_ALL_TASKS) == 0) - continue; - if (result->vmu_type == VMUSAGE_RUSERS && - (flags & (VMUSAGE_ALL_RUSERS | - VMUSAGE_COL_RUSERS)) == 0) - continue; - if (result->vmu_type == VMUSAGE_EUSERS && - (flags & (VMUSAGE_ALL_EUSERS | - VMUSAGE_COL_EUSERS)) == 0) + if (result->vmu_type == VMUSAGE_ZONE && + flags & VMUSAGE_A_ZONE) { + /* Skip non-requested zone results */ + if (result->vmu_zoneid != req_zone_id) continue; + } else { + /* Skip "other zone" results if not requested */ + if (result->vmu_zoneid != curproc->p_zone->zone_id) { + if (result->vmu_type == VMUSAGE_ZONE && + (flags & VMUSAGE_ALL_ZONES) == 0) + continue; + if (result->vmu_type == VMUSAGE_PROJECTS && + (flags & (VMUSAGE_ALL_PROJECTS | + VMUSAGE_COL_PROJECTS)) == 0) + continue; + if (result->vmu_type == VMUSAGE_TASKS && + (flags & VMUSAGE_ALL_TASKS) == 0) + continue; + if (result->vmu_type == VMUSAGE_RUSERS && + (flags & (VMUSAGE_ALL_RUSERS | + VMUSAGE_COL_RUSERS)) == 0) + continue; + if (result->vmu_type == VMUSAGE_EUSERS && + (flags & (VMUSAGE_ALL_EUSERS | + VMUSAGE_COL_EUSERS)) == 0) + continue; + } } count++; if (out_result != NULL) { @@ -1901,10 +1982,12 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg) int cacherecent = 0; hrtime_t now; uint_t flags_orig; + id_t req_zone_id; /* * Non-global zones cannot request system wide and/or collated - * results, or the system result, so munge the flags accordingly. + * results, or the system result, or usage of another zone, so munge + * the flags accordingly. 
*/ flags_orig = flags; if (curproc->p_zone != global_zone) { @@ -1924,6 +2007,10 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg) flags &= ~VMUSAGE_SYSTEM; flags |= VMUSAGE_ZONE; } + if (flags & VMUSAGE_A_ZONE) { + flags &= ~VMUSAGE_A_ZONE; + flags |= VMUSAGE_ZONE; + } } /* Check for unknown flags */ @@ -1934,6 +2021,21 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg) if ((flags & VMUSAGE_MASK) == 0) return (set_errno(EINVAL)); + /* If requesting results for a specific zone, get the zone ID */ + if (flags & VMUSAGE_A_ZONE) { + size_t bufsize; + vmusage_t zreq; + + if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg)) + return (set_errno(EFAULT)); + /* Requested zone ID is passed in buf, so 0 len not allowed */ + if (bufsize == 0) + return (set_errno(EINVAL)); + if (ddi_copyin((caddr_t)buf, &zreq, sizeof (vmusage_t), cpflg)) + return (set_errno(EFAULT)); + req_zone_id = zreq.vmu_id; + } + mutex_enter(&vmu_data.vmu_lock); now = gethrtime(); @@ -1953,7 +2055,7 @@ start: mutex_exit(&vmu_data.vmu_lock); ret = vmu_copyout_results(cache, buf, nres, flags_orig, - cpflg); + req_zone_id, cpflg); mutex_enter(&vmu_data.vmu_lock); vmu_cache_rele(cache); if (vmu_data.vmu_pending_waiters > 0) @@ -2009,8 +2111,11 @@ start: mutex_exit(&vmu_data.vmu_lock); + /* update zone's phys. mem. rctl usage */ + vmu_update_zone_rctls(cache); /* copy cache */ - ret = vmu_copyout_results(cache, buf, nres, flags_orig, cpflg); + ret = vmu_copyout_results(cache, buf, nres, flags_orig, + req_zone_id, cpflg); mutex_enter(&vmu_data.vmu_lock); vmu_cache_rele(cache); mutex_exit(&vmu_data.vmu_lock); @@ -2030,3 +2135,185 @@ start: vmu_data.vmu_pending_waiters--; goto start; } + +#if defined(__x86) +/* + * Attempt to invalidate all of the pages in the mapping for the given process. + */ +static void +map_inval(proc_t *p, struct seg *seg, caddr_t addr, size_t size) +{ + page_t *pp; + size_t psize; + u_offset_t off; + caddr_t eaddr; + struct vnode *vp; + struct segvn_data *svd; + struct hat *victim_hat; + + ASSERT((addr + size) <= (seg->s_base + seg->s_size)); + + victim_hat = p->p_as->a_hat; + svd = (struct segvn_data *)seg->s_data; + vp = svd->vp; + psize = page_get_pagesize(seg->s_szc); + + off = svd->offset + (uintptr_t)(addr - seg->s_base); + + for (eaddr = addr + size; addr < eaddr; addr += psize, off += psize) { + pp = page_lookup_nowait(vp, off, SE_SHARED); + + if (pp != NULL) { + /* following logic based on pvn_getdirty() */ + + if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { + page_unlock(pp); + continue; + } + + page_io_lock(pp); + hat_page_inval(pp, 0, victim_hat); + page_io_unlock(pp); + + /* + * For B_INVALCURONLY-style handling we let + * page_release call VN_DISPOSE if no one else is using + * the page. + * + * A hat_ismod() check would be useless because: + * (1) we are not be holding SE_EXCL lock + * (2) we've not unloaded _all_ translations + * + * Let page_release() do the heavy-lifting. + */ + (void) page_release(pp, 1); + } + } +} + +/* + * vm_map_inval() + * + * Invalidate as many pages as possible within the given mapping for the given + * process. addr is expected to be the base address of the mapping and size is + * the length of the mapping. In some cases a mapping will encompass an + * entire segment, but at least for anon or stack mappings, these will be + * regions within a single large segment. Thus, the invalidation is oriented + * around a single mapping and not an entire segment. 
+ * + * SPARC sfmmu hat does not support HAT_CURPROC_PGUNLOAD-style handling so + * this code is only applicable to x86. + */ +int +vm_map_inval(pid_t pid, caddr_t addr, size_t size) +{ + int ret; + int error = 0; + proc_t *p; /* target proc */ + struct as *as; /* target proc's address space */ + struct seg *seg; /* working segment */ + + if (curproc->p_zone != global_zone || crgetruid(curproc->p_cred) != 0) + return (set_errno(EPERM)); + + /* If not a valid mapping address, return an error */ + if ((caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK) != addr) + return (set_errno(EINVAL)); + +again: + mutex_enter(&pidlock); + p = prfind(pid); + if (p == NULL) { + mutex_exit(&pidlock); + return (set_errno(ESRCH)); + } + + mutex_enter(&p->p_lock); + mutex_exit(&pidlock); + + if (panicstr != NULL) { + mutex_exit(&p->p_lock); + return (0); + } + + as = p->p_as; + + /* + * Try to set P_PR_LOCK - prevents process "changing shape" + * - blocks fork + * - blocks sigkill + * - cannot be a system proc + * - must be fully created proc + */ + ret = sprtrylock_proc(p); + if (ret == -1) { + /* Process in invalid state */ + mutex_exit(&p->p_lock); + return (set_errno(ESRCH)); + } + + if (ret == 1) { + /* + * P_PR_LOCK is already set. Wait and try again. This also + * drops p_lock so p may no longer be valid since the proc may + * have exited. + */ + sprwaitlock_proc(p); + goto again; + } + + /* P_PR_LOCK is now set */ + mutex_exit(&p->p_lock); + + AS_LOCK_ENTER(as, RW_READER); + if ((seg = as_segat(as, addr)) == NULL) { + AS_LOCK_EXIT(as); + mutex_enter(&p->p_lock); + sprunlock(p); + return (set_errno(ENOMEM)); + } + + /* + * The invalidation behavior only makes sense for vnode-backed segments. + */ + if (seg->s_ops != &segvn_ops) { + AS_LOCK_EXIT(as); + mutex_enter(&p->p_lock); + sprunlock(p); + return (0); + } + + /* + * If the mapping is out of bounds of the segement return an error. + */ + if ((addr + size) > (seg->s_base + seg->s_size)) { + AS_LOCK_EXIT(as); + mutex_enter(&p->p_lock); + sprunlock(p); + return (set_errno(EINVAL)); + } + + /* + * Don't use MS_INVALCURPROC flag here since that would eventually + * initiate hat invalidation based on curthread. Since we're doing this + * on behalf of a different process, that would erroneously invalidate + * our own process mappings. + */ + error = SEGOP_SYNC(seg, addr, size, 0, (uint_t)MS_ASYNC); + if (error == 0) { + /* + * Since we didn't invalidate during the sync above, we now + * try to invalidate all of the pages in the mapping. + */ + map_inval(p, seg, addr, size); + } + AS_LOCK_EXIT(as); + + mutex_enter(&p->p_lock); + sprunlock(p); + + if (error) + (void) set_errno(error); + return (error); +} +#endif |
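
The bulk of the vm_usage.c changes above add the VMUSAGE_A_ZONE flag, which lets a caller ask for the usage of one specific zone by passing the requested zone ID in the first vmusage_t of the result buffer. A minimal user-level sketch follows; it assumes the getvmusage(3C) wrapper and a VMUSAGE_A_ZONE definition exported to userland in sys/vm_usage.h (that header change is outside the usr/src/uts/common/vm subtree shown here), and the error handling is illustrative only.

#include <sys/types.h>
#include <sys/vm_usage.h>
#include <stdio.h>

/*
 * Illustrative only: report RSS and swap for one zone via the new
 * VMUSAGE_A_ZONE flag.  The requested zone ID rides in buf[0].vmu_id.
 */
static int
print_zone_usage(id_t zid)
{
        vmusage_t buf[1];
        size_t nres = 1;        /* must be non-zero; also bounds the copyout */

        buf[0].vmu_id = zid;
        if (getvmusage(VMUSAGE_A_ZONE, 60, buf, &nres) != 0)
                return (-1);

        if (nres >= 1) {
                (void) printf("zone %d: rss=%llu bytes, swap=%llu bytes\n",
                    (int)zid, (u_longlong_t)buf[0].vmu_rss_all,
                    (u_longlong_t)buf[0].vmu_swap_all);
        }
        return (0);
}

Per the flag-munging logic in vm_getusage() above, a non-global-zone caller has VMUSAGE_A_ZONE folded into VMUSAGE_ZONE, so only the global zone can usefully target a zone other than its own.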