diff options
author | Patrick Mooney <pmooney@pfmooney.com> | 2017-08-03 18:19:56 +0000 |
---|---|---|
committer | Patrick Mooney <pmooney@pfmooney.com> | 2017-09-28 15:02:34 +0000 |
commit | dd35f9f41aba6f51178986d36ec29206d6025757 (patch) | |
tree | c4fb07b811c268cd15e87093cf51597a64f8a460 /usr/src | |
parent | 5ae84a5233b723c890288b775cb5317db2e54d61 (diff) | |
download | illumos-joyent-dd35f9f41aba6f51178986d36ec29206d6025757.tar.gz |
OS-6323 want stack-clash mitigation
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Alex Wilson <alex.wilson@joyent.com>
Approved by: Alex Wilson <alex.wilson@joyent.com>
Diffstat (limited to 'usr/src')
-rw-r--r-- | usr/src/cmd/mdb/common/modules/genunix/memory.c | 43 | ||||
-rw-r--r-- | usr/src/uts/common/Makefile.files | 1 | ||||
-rw-r--r-- | usr/src/uts/common/brand/lx/procfs/lx_prvnops.c | 4 | ||||
-rw-r--r-- | usr/src/uts/common/exec/elf/elf.c | 6 | ||||
-rw-r--r-- | usr/src/uts/common/fs/lxproc/lxpr_vnops.c | 4 | ||||
-rw-r--r-- | usr/src/uts/common/fs/proc/prioctl.c | 26 | ||||
-rw-r--r-- | usr/src/uts/common/fs/proc/prsubr.c | 35 | ||||
-rw-r--r-- | usr/src/uts/common/os/exec.c | 62 | ||||
-rw-r--r-- | usr/src/uts/common/os/grow.c | 33 | ||||
-rw-r--r-- | usr/src/uts/common/sys/proc.h | 11 | ||||
-rw-r--r-- | usr/src/uts/common/vm/seg.h | 3 | ||||
-rw-r--r-- | usr/src/uts/common/vm/seg_hole.c | 305 | ||||
-rw-r--r-- | usr/src/uts/common/vm/seg_hole.h | 40 | ||||
-rw-r--r-- | usr/src/uts/common/vm/vm_as.c | 48 | ||||
-rw-r--r-- | usr/src/uts/i86pc/vm/vm_machdep.c | 23 |
15 files changed, 602 insertions, 42 deletions
diff --git a/usr/src/cmd/mdb/common/modules/genunix/memory.c b/usr/src/cmd/mdb/common/modules/genunix/memory.c index 34e746f36c..fa4918b9b8 100644 --- a/usr/src/cmd/mdb/common/modules/genunix/memory.c +++ b/usr/src/cmd/mdb/common/modules/genunix/memory.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015 Joyent, Inc. + * Copyright 2017 Joyent, Inc. */ #include <mdb/mdb_param.h> @@ -40,6 +40,7 @@ #include <sys/vnode.h> #include <vm/seg_map.h> #include <vm/seg_vn.h> +#include <vm/seg_hole.h> #if defined(__i386) || defined(__amd64) #include <sys/balloon_impl.h> #endif @@ -975,6 +976,11 @@ seg(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) return (DCMD_OK); } +typedef struct pmap_walk_types { + uintptr_t pwt_segvn; + uintptr_t pwt_seghole; +} pmap_walk_types_t; + /*ARGSUSED*/ static int pmap_walk_count_pages(uintptr_t addr, const void *data, void *out) @@ -987,12 +993,14 @@ pmap_walk_count_pages(uintptr_t addr, const void *data, void *out) } static int -pmap_walk_seg(uintptr_t addr, const struct seg *seg, uintptr_t segvn) +pmap_walk_seg(uintptr_t addr, const struct seg *seg, + const pmap_walk_types_t *types) { + const uintptr_t ops = (uintptr_t)seg->s_ops; mdb_printf("%0?p %0?p %7dk", addr, seg->s_base, seg->s_size / 1024); - if (segvn == (uintptr_t)seg->s_ops && seg->s_data != NULL) { + if (ops == types->pwt_segvn && seg->s_data != NULL) { struct segvn_data svn; pgcnt_t nres = 0; @@ -1018,6 +1026,18 @@ pmap_walk_seg(uintptr_t addr, const struct seg *seg, uintptr_t segvn) } else { mdb_printf(" [ anon ]"); } + } else if (ops == types->pwt_seghole && seg->s_data != NULL) { + seghole_data_t shd; + char name[16]; + + (void) mdb_vread(&shd, sizeof (shd), (uintptr_t)seg->s_data); + if (shd.shd_name == NULL || mdb_readstr(name, sizeof (name), + (uintptr_t)shd.shd_name) == 0) { + name[0] = '\0'; + } + + mdb_printf(" %8s [ hole%s%s ]", "-", + name[0] == '\0' ? 
"" : ":", name); } else { mdb_printf(" %8s [ &%a ]", "?", seg->s_ops); } @@ -1027,11 +1047,14 @@ pmap_walk_seg(uintptr_t addr, const struct seg *seg, uintptr_t segvn) } static int -pmap_walk_seg_quick(uintptr_t addr, const struct seg *seg, uintptr_t segvn) +pmap_walk_seg_quick(uintptr_t addr, const struct seg *seg, + const pmap_walk_types_t *types) { + const uintptr_t ops = (uintptr_t)seg->s_ops; + mdb_printf("%0?p %0?p %7dk", addr, seg->s_base, seg->s_size / 1024); - if (segvn == (uintptr_t)seg->s_ops && seg->s_data != NULL) { + if (ops == types->pwt_segvn && seg->s_data != NULL) { struct segvn_data svn; svn.vp = NULL; @@ -1054,10 +1077,10 @@ pmap_walk_seg_quick(uintptr_t addr, const struct seg *seg, uintptr_t segvn) int pmap(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) { - uintptr_t segvn; proc_t proc; uint_t quick = FALSE; mdb_walk_cb_t cb = (mdb_walk_cb_t)pmap_walk_seg; + pmap_walk_types_t wtypes = { 0 }; GElf_Sym sym; @@ -1074,9 +1097,9 @@ pmap(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) } if (mdb_lookup_by_name("segvn_ops", &sym) == 0) - segvn = (uintptr_t)sym.st_value; - else - segvn = NULL; + wtypes.pwt_segvn = (uintptr_t)sym.st_value; + if (mdb_lookup_by_name("seghole_ops", &sym) == 0) + wtypes.pwt_seghole = (uintptr_t)sym.st_value; mdb_printf("%?s %?s %8s ", "SEG", "BASE", "SIZE"); @@ -1087,7 +1110,7 @@ pmap(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) mdb_printf("%8s %s\n", "RES", "PATH"); } - if (mdb_pwalk("seg", cb, (void *)segvn, (uintptr_t)proc.p_as) == -1) { + if (mdb_pwalk("seg", cb, (void *)&wtypes, (uintptr_t)proc.p_as) == -1) { mdb_warn("failed to walk segments of as %p", proc.p_as); return (DCMD_ERR); } diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index 7b30741fb0..8220f91f03 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -304,6 +304,7 @@ GENUNIX_OBJS += \ sctp_crc32.o \ secflags.o \ seg_dev.o \ + seg_hole.o \ 
seg_kp.o \ seg_kpm.o \ seg_map.o \ diff --git a/usr/src/uts/common/brand/lx/procfs/lx_prvnops.c b/usr/src/uts/common/brand/lx/procfs/lx_prvnops.c index 08a817396b..5dfb8ce093 100644 --- a/usr/src/uts/common/brand/lx/procfs/lx_prvnops.c +++ b/usr/src/uts/common/brand/lx/procfs/lx_prvnops.c @@ -1759,6 +1759,10 @@ lxpr_read_pid_maps(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) vnode_t *vp; uint_t protbits; + if ((seg->s_flags & S_HOLE) != 0) { + continue; + } + pbuf = kmem_alloc(sizeof (*pbuf), KM_SLEEP); pbuf->saddr = (uintptr_t)seg->s_base; diff --git a/usr/src/uts/common/exec/elf/elf.c b/usr/src/uts/common/exec/elf/elf.c index 23c198897d..023d027789 100644 --- a/usr/src/uts/common/exec/elf/elf.c +++ b/usr/src/uts/common/exec/elf/elf.c @@ -26,7 +26,7 @@ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ /* - * Copyright 2016 Joyent, Inc. + * Copyright 2017 Joyent, Inc. */ #include <sys/types.h> @@ -2355,6 +2355,10 @@ top: void *tmp = NULL; extern struct seg_ops segspt_shmops; + if ((seg->s_flags & S_HOLE) != 0) { + continue; + } + for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) { uint_t prot; size_t size; diff --git a/usr/src/uts/common/fs/lxproc/lxpr_vnops.c b/usr/src/uts/common/fs/lxproc/lxpr_vnops.c index 85ef7b4b9b..1f7f3074d6 100644 --- a/usr/src/uts/common/fs/lxproc/lxpr_vnops.c +++ b/usr/src/uts/common/fs/lxproc/lxpr_vnops.c @@ -766,6 +766,10 @@ lxpr_read_pid_maps(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) vnode_t *vp; uint_t protbits; + if ((seg->s_flags & S_HOLE) != 0) { + continue; + } + pbuf = kmem_alloc(sizeof (*pbuf), KM_SLEEP); pbuf->saddr = seg->s_base; diff --git a/usr/src/uts/common/fs/proc/prioctl.c b/usr/src/uts/common/fs/proc/prioctl.c index 7b7fae7557..470c66362b 100644 --- a/usr/src/uts/common/fs/proc/prioctl.c +++ b/usr/src/uts/common/fs/proc/prioctl.c @@ -22,7 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2016 Joyent, Inc. 
+ * Copyright 2017 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -3521,6 +3521,10 @@ oprgetmap(proc_t *p, list_t *iolhead) caddr_t saddr, naddr; void *tmp = NULL; + if ((seg->s_flags & S_HOLE) != 0) { + continue; + } + for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) { prot = pr_getprot(seg, 0, &tmp, &saddr, &naddr, eaddr); if (saddr == naddr) @@ -3581,6 +3585,10 @@ oprgetmap32(proc_t *p, list_t *iolhead) caddr_t saddr, naddr; void *tmp = NULL; + if ((seg->s_flags & S_HOLE) != 0) { + continue; + } + for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) { prot = pr_getprot(seg, 0, &tmp, &saddr, &naddr, eaddr); if (saddr == naddr) @@ -3634,6 +3642,10 @@ oprpdsize(struct as *as) void *tmp = NULL; size_t npage; + if ((seg->s_flags & S_HOLE) != 0) { + continue; + } + for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) { (void) pr_getprot(seg, 0, &tmp, &saddr, &naddr, eaddr); if ((npage = (naddr - saddr) / PAGESIZE) != 0) @@ -3664,6 +3676,10 @@ oprpdsize32(struct as *as) void *tmp = NULL; size_t npage; + if ((seg->s_flags & S_HOLE) != 0) { + continue; + } + for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) { (void) pr_getprot(seg, 0, &tmp, &saddr, &naddr, eaddr); if ((npage = (naddr - saddr) / PAGESIZE) != 0) @@ -3714,6 +3730,10 @@ again: caddr_t saddr, naddr; void *tmp = NULL; + if ((seg->s_flags & S_HOLE) != 0) { + continue; + } + for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) { size_t len; size_t npage; @@ -3821,6 +3841,10 @@ again: caddr_t saddr, naddr; void *tmp = NULL; + if ((seg->s_flags & S_HOLE) != 0) { + continue; + } + for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) { size_t len; size_t npage; diff --git a/usr/src/uts/common/fs/proc/prsubr.c b/usr/src/uts/common/fs/proc/prsubr.c index 0645a91de1..2062970885 100644 --- a/usr/src/uts/common/fs/proc/prsubr.c +++ b/usr/src/uts/common/fs/proc/prsubr.c @@ -1416,6 +1416,10 @@ prnsegs(struct as *as, int reserved) caddr_t saddr, naddr; void *tmp = NULL; + 
if ((seg->s_flags & S_HOLE) != 0) { + continue; + } + for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) { (void) pr_getprot(seg, reserved, &tmp, &saddr, &naddr, eaddr); @@ -1671,6 +1675,10 @@ prgetmap(proc_t *p, int reserved, list_t *iolhead) caddr_t saddr, naddr; void *tmp = NULL; + if ((seg->s_flags & S_HOLE) != 0) { + continue; + } + for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) { prot = pr_getprot(seg, reserved, &tmp, &saddr, &naddr, eaddr); @@ -1782,6 +1790,10 @@ prgetmap32(proc_t *p, int reserved, list_t *iolhead) caddr_t saddr, naddr; void *tmp = NULL; + if ((seg->s_flags & S_HOLE) != 0) { + continue; + } + for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) { prot = pr_getprot(seg, reserved, &tmp, &saddr, &naddr, eaddr); @@ -1885,6 +1897,10 @@ prpdsize(struct as *as) void *tmp = NULL; size_t npage; + if ((seg->s_flags & S_HOLE) != 0) { + continue; + } + for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) { (void) pr_getprot(seg, 0, &tmp, &saddr, &naddr, eaddr); if ((npage = (naddr - saddr) / PAGESIZE) != 0) @@ -1915,6 +1931,10 @@ prpdsize32(struct as *as) void *tmp = NULL; size_t npage; + if ((seg->s_flags & S_HOLE) != 0) { + continue; + } + for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) { (void) pr_getprot(seg, 0, &tmp, &saddr, &naddr, eaddr); if ((npage = (naddr - saddr) / PAGESIZE) != 0) @@ -1966,6 +1986,10 @@ again: caddr_t saddr, naddr; void *tmp = NULL; + if ((seg->s_flags & S_HOLE) != 0) { + continue; + } + for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) { struct vnode *vp; struct vattr vattr; @@ -2113,6 +2137,10 @@ again: caddr_t saddr, naddr; void *tmp = NULL; + if ((seg->s_flags & S_HOLE) != 0) { + continue; + } + for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) { struct vnode *vp; struct vattr vattr; @@ -4064,6 +4092,9 @@ prgetxmap(proc_t *p, list_t *iolhead) uint64_t npages; uint64_t pagenum; + if ((seg->s_flags & S_HOLE) != 0) { + continue; + } /* * Segment loop part one: iterate from the base of 
the segment * to its end, pausing at each address boundary (baddr) between @@ -4260,6 +4291,10 @@ prgetxmap32(proc_t *p, list_t *iolhead) uint64_t npages; uint64_t pagenum; + if ((seg->s_flags & S_HOLE) != 0) { + continue; + } + /* * Segment loop part one: iterate from the base of the segment * to its end, pausing at each address boundary (baddr) between diff --git a/usr/src/uts/common/os/exec.c b/usr/src/uts/common/os/exec.c index 2ab4d1f023..96b6081489 100644 --- a/usr/src/uts/common/os/exec.c +++ b/usr/src/uts/common/os/exec.c @@ -26,7 +26,7 @@ /* Copyright (c) 1988 AT&T */ /* All Rights Reserved */ /* - * Copyright 2016 Joyent, Inc. + * Copyright 2017 Joyent, Inc. */ #include <sys/types.h> @@ -78,6 +78,7 @@ #include <vm/as.h> #include <vm/seg.h> #include <vm/seg_vn.h> +#include <vm/seg_hole.h> #define PRIV_RESET 0x01 /* needs to reset privs */ #define PRIV_SETID 0x02 /* needs to change uids */ @@ -116,6 +117,14 @@ size_t aslr_max_brk_skew = 16 * 1024 * 1024; /* 16MB */ size_t aslr_max_stack_skew = 64 * 1024; /* 64KB */ /* + * Size of guard segment for 64-bit processes and minimum size it can be shrunk + * to in the case of grow() operations. These are kept as variables in case + * they need to be tuned in an emergency. + */ +size_t stack_guard_seg_sz = 256 * 1024 * 1024; +size_t stack_guard_min_sz = 64 * 1024 * 1024; + +/* * exece() - system call wrapper around exec_common() */ int @@ -1948,6 +1957,15 @@ exec_get_spslew(void) * The initial user stack layout is as follows: * * User Stack + * +---------------+ + * | | + * | stack guard | + * | (64-bit only) | + * | | + * +...............+ <--- stack limit (base - curproc->p_stk_ctl) + * . . + * . . + * . . 
* +---------------+ <--- curproc->p_usrstack * | | * | slew | @@ -1989,6 +2007,11 @@ exec_get_spslew(void) * +---------------+ <--- argv[] * | argc | * +---------------+ <--- stack base + * + * In 64-bit processes, a stack guard segment is allocated at the address + * immediately below where the stack limit ends. This protects new library + * mappings (such as the linker) from being placed in relatively dangerous + * proximity to the stack. */ int exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) @@ -2002,6 +2025,9 @@ exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) struct as *as; extern int use_stk_lpg; size_t sp_slew; +#if defined(_LP64) + const size_t sg_sz = (stack_guard_seg_sz & PAGEMASK); +#endif /* defined(_LP64) */ args->from_model = p->p_model; if (p->p_model == DATAMODEL_NATIVE) { @@ -2153,6 +2179,8 @@ exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) p->p_brkpageszc = 0; p->p_stksize = 0; p->p_stkpageszc = 0; + p->p_stkg_start = 0; + p->p_stkg_end = 0; p->p_model = args->to_model; p->p_usrstack = usrstack; p->p_stkprot = args->stk_prot; @@ -2190,10 +2218,36 @@ exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) (void) hat_setup(as->a_hat, HAT_ALLOC); hat_join_srd(as->a_hat, args->ex_vp); - /* - * Finally, write out the contents of the new stack. - */ + /* Write out the contents of the new stack. 
*/ error = stk_copyout(args, usrstack - sp_slew, auxvpp, up); kmem_free(args->stk_base, args->stk_size); + +#if defined(_LP64) + /* Add stack guard segment (if needed) after successful copyout */ + if (error == 0 && p->p_model == DATAMODEL_LP64 && sg_sz != 0) { + seghole_crargs_t sca; + caddr_t addr_end = (caddr_t)(((uintptr_t)usrstack - + p->p_stk_ctl) & PAGEMASK); + caddr_t addr_start = addr_end - sg_sz; + + DTRACE_PROBE4(stack__guard__chk, proc_t *, p, + caddr_t, addr_start, caddr_t, addr_end, size_t, sg_sz); + + if (addr_end >= usrstack || addr_start >= addr_end || + valid_usr_range(addr_start, sg_sz, PROT_NONE, as, + as->a_userlimit) != RANGE_OKAY) { + return (E2BIG); + } + + /* Create un-mappable area in AS with seg_hole */ + sca.name = "stack_guard"; + error = as_map(as, addr_start, sg_sz, seghole_create, &sca); + if (error == 0) { + p->p_stkg_start = (uintptr_t)addr_start; + p->p_stkg_end = (uintptr_t)addr_start + sg_sz; + } + } +#endif /* defined(_LP64) */ + return (error); } diff --git a/usr/src/uts/common/os/grow.c b/usr/src/uts/common/os/grow.c index a3de80259f..9d40f93da1 100644 --- a/usr/src/uts/common/os/grow.c +++ b/usr/src/uts/common/os/grow.c @@ -21,7 +21,7 @@ /* * Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2017 Joyent, Inc. 
*/ /* @@ -333,9 +333,10 @@ grow(caddr_t sp) } else { err = grow_internal(sp, p->p_stkpageszc); } + newsize = p->p_stksize; as_rangeunlock(as); - if (err == 0 && (newsize = p->p_stksize) > oldsize) { + if (err == 0 && newsize > oldsize) { ASSERT(IS_P2ALIGNED(oldsize, PAGESIZE)); ASSERT(IS_P2ALIGNED(newsize, PAGESIZE)); /* @@ -428,6 +429,7 @@ grow_internal(caddr_t sp, uint_t growszc) struct proc *p = curproc; size_t newsize; size_t oldsize; + uintptr_t new_start; int error; size_t pgsz; uint_t szc; @@ -498,7 +500,32 @@ grow_internal(caddr_t sp, uint_t growszc) } crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_DOWN; - if ((error = as_map(p->p_as, p->p_usrstack - newsize, newsize - oldsize, + /* + * The stack is about to grow into its guard. This can be acceptable + * if the size restriction on the stack has been expanded since its + * initialization during exec(). In such cases, the guard segment will + * be shrunk, provided the new size is reasonable. + */ + new_start = (uintptr_t)p->p_usrstack - newsize; + if (p->p_stkg_start != 0 && new_start > p->p_stkg_start && + new_start < p->p_stkg_end) { + const size_t unmap_sz = p->p_stkg_end - new_start; + const size_t remain_sz = new_start - p->p_stkg_start; + extern size_t stack_guard_min_sz; + + /* Do not allow the guard to shrink below minimum size */ + if (remain_sz < stack_guard_min_sz) { + return (ENOMEM); + } + + error = as_unmap(p->p_as, (caddr_t)new_start, unmap_sz); + if (error != 0) { + return (error); + } + p->p_stkg_end -= unmap_sz; + } + + if ((error = as_map(p->p_as, (caddr_t)new_start, newsize - oldsize, segvn_create, &crargs)) != 0) { if (error == EAGAIN) { cmn_err(CE_WARN, "Sorry, no swap space to grow stack " diff --git a/usr/src/uts/common/sys/proc.h b/usr/src/uts/common/sys/proc.h index d5be7943ef..a7fff4e5ab 100644 --- a/usr/src/uts/common/sys/proc.h +++ b/usr/src/uts/common/sys/proc.h @@ -251,8 +251,15 @@ typedef struct proc { kmutex_t p_maplock; /* lock for pr_mappage() */ struct proc *p_rlink; 
/* linked list for server */ kcondvar_t p_srwchan_cv; - size_t p_stksize; /* process stack size in bytes */ - uint_t p_stkpageszc; /* preferred stack max page size code */ + + /* + * Stack sizing and guard information. + * Generally protected by as_rangelock() + */ + size_t p_stksize; /* process stack size in bytes */ + uint_t p_stkpageszc; /* preferred stack max page size code */ + uintptr_t p_stkg_start; /* start of stack guard */ + uintptr_t p_stkg_end; /* end of stack guard */ /* * Microstate accounting, resource usage, and real-time profiling diff --git a/usr/src/uts/common/vm/seg.h b/usr/src/uts/common/vm/seg.h index be1c9514e9..9dde7028c4 100644 --- a/usr/src/uts/common/vm/seg.h +++ b/usr/src/uts/common/vm/seg.h @@ -21,7 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright (c) 2015, Joyent, Inc. + * Copyright 2017 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -113,6 +113,7 @@ typedef struct seg { } seg_t; #define S_PURGE (0x01) /* seg should be purged in as_gap() */ +#define S_HOLE (0x02) /* seg represents hole in AS */ struct seg_ops { int (*dup)(struct seg *, struct seg *); diff --git a/usr/src/uts/common/vm/seg_hole.c b/usr/src/uts/common/vm/seg_hole.c new file mode 100644 index 0000000000..a716c270cf --- /dev/null +++ b/usr/src/uts/common/vm/seg_hole.c @@ -0,0 +1,305 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. 
+ */ + + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/cred.h> +#include <sys/kmem.h> +#include <sys/lgrp.h> +#include <sys/mman.h> + +#include <vm/hat.h> +#include <vm/as.h> +#include <vm/seg.h> +#include <vm/seg_hole.h> + + +static int seghole_dup(struct seg *, struct seg *); +static int seghole_unmap(struct seg *, caddr_t, size_t); +static void seghole_free(struct seg *); +static faultcode_t seghole_fault(struct hat *, struct seg *, caddr_t, size_t, + enum fault_type, enum seg_rw); +static faultcode_t seghole_faulta(struct seg *, caddr_t); +static int seghole_setprot(struct seg *, caddr_t, size_t, uint_t); +static int seghole_checkprot(struct seg *, caddr_t, size_t, uint_t); +static int seghole_sync(struct seg *, caddr_t, size_t, int, uint_t); +static size_t seghole_incore(struct seg *, caddr_t, size_t, char *); +static int seghole_lockop(struct seg *, caddr_t, size_t, int, int, ulong_t *, + size_t); +static int seghole_getprot(struct seg *, caddr_t, size_t, uint_t *); +static u_offset_t seghole_getoffset(struct seg *, caddr_t); +static int seghole_gettype(struct seg *, caddr_t); +static int seghole_getvp(struct seg *, caddr_t, struct vnode **); +static int seghole_advise(struct seg *, caddr_t, size_t, uint_t); +static void seghole_dump(struct seg *); +static int seghole_pagelock(struct seg *, caddr_t, size_t, struct page ***, + enum lock_type, enum seg_rw); +static int seghole_setpagesize(struct seg *, caddr_t, size_t, uint_t); +static int seghole_capable(struct seg *, segcapability_t); + +static struct seg_ops seghole_ops = { + seghole_dup, + seghole_unmap, + seghole_free, + seghole_fault, + seghole_faulta, + seghole_setprot, + seghole_checkprot, + NULL, /* kluster: disabled */ + NULL, /* swapout: disabled */ + seghole_sync, + seghole_incore, + seghole_lockop, + seghole_getprot, + seghole_getoffset, + seghole_gettype, + seghole_getvp, + seghole_advise, + seghole_dump, + seghole_pagelock, + seghole_setpagesize, + NULL, 
/* getmemid: disabled */ + NULL, /* getpolicy: disabled */ + seghole_capable, + seg_inherit_notsup +}; + +/* + * Create a hole in the AS. + */ +int +seghole_create(struct seg *seg, void *argsp) +{ + seghole_crargs_t *crargs = argsp; + seghole_data_t *data; + + data = kmem_alloc(sizeof (seghole_data_t), KM_SLEEP); + data->shd_name = crargs->name; + + seg->s_ops = &seghole_ops; + seg->s_data = data; + seg->s_flags = S_HOLE; + + return (0); +} + +static int +seghole_dup(struct seg *seg, struct seg *newseg) +{ + seghole_data_t *shd = (seghole_data_t *)seg->s_data; + seghole_data_t *newshd; + + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as)); + + newshd = kmem_zalloc(sizeof (seghole_data_t), KM_SLEEP); + newshd->shd_name = shd->shd_name; + + newseg->s_ops = seg->s_ops; + newseg->s_data = newshd; + newseg->s_flags = S_HOLE; + + return (0); +} + +static int +seghole_unmap(struct seg *seg, caddr_t addr, size_t len) +{ + seghole_data_t *sud = (seghole_data_t *)seg->s_data; + + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as)); + + /* Entire segment is being unmapped */ + if (addr == seg->s_base && len == seg->s_size) { + seg_free(seg); + return (0); + } + + /* Shrinking from low address side */ + if (addr == seg->s_base) { + seg->s_base += len; + seg->s_size -= len; + return (0); + } + + /* Shrinking from high address side */ + if ((addr + len) == (seg->s_base + seg->s_size)) { + seg->s_size -= len; + return (0); + } + + /* Do not tolerate splitting the segment */ + return (EINVAL); +} + +static void +seghole_free(struct seg *seg) +{ + seghole_data_t *data = (seghole_data_t *)seg->s_data; + + ASSERT(data != NULL); + + kmem_free(data, sizeof (*data)); + seg->s_data = NULL; +} + +/* ARGSUSED */ +static faultcode_t +seghole_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len, + enum fault_type type, enum seg_rw tw) +{ + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); + + return (FC_NOMAP); +} + +/* ARGSUSED */ +static faultcode_t +seghole_faulta(struct seg *seg, caddr_t 
addr) +{ + return (FC_NOMAP); +} + +/* ARGSUSED */ +static int +seghole_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) +{ + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); + + return (ENOMEM); +} + +/* ARGSUSED */ +static int +seghole_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) +{ + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); + + return (ENOMEM); +} + +/* ARGSUSED */ +static int +seghole_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) +{ + /* Always succeed since there are no backing store to sync */ + return (0); +} + +/* ARGSUSED */ +static size_t +seghole_incore(struct seg *seg, caddr_t addr, size_t len, char *vec) +{ + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); + + return (0); +} + +/* ARGSUSED */ +static int +seghole_lockop(struct seg *seg, caddr_t addr, size_t len, int attr, int op, + ulong_t *lockmap, size_t pos) +{ + /* + * Emit an error consistent with there being no segment in this hole in + * the AS. The MC_LOCKAS and MC_UNLOCKAS commands will explicitly skip + * hole segments, allowing such operations to proceed as expected. + */ + return (ENOMEM); +} + +static int +seghole_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv) +{ + size_t pgno; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); + + /* + * Few SEGOP_GETPROT callers actually check for an error, so it's + * necessary to report zeroed protection for the length of the request. + */ + pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1; + while (pgno > 0) { + protv[--pgno] = 0; + } + + return (ENOMEM); +} + +/* ARGSUSED */ +static u_offset_t +seghole_getoffset(struct seg *seg, caddr_t addr) +{ + /* + * To avoid leaking information about the layout of the kernel address + * space, always report '0' as the offset. 
+ */ + return (0); +} + +/* ARGSUSED */ +static int +seghole_gettype(struct seg *seg, caddr_t addr) +{ + return (MAP_PRIVATE); +} + +/* ARGSUSED */ +static int +seghole_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp) +{ + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); + + return (ENOMEM); +} + +/* ARGSUSED */ +static int +seghole_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) +{ + return (ENOMEM); +} + +/* ARGSUSED */ +static void +seghole_dump(struct seg *seg) +{ + /* There's nothing to dump from a hole in the AS */ +} + +/* ARGSUSED */ +static int +seghole_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp, + enum lock_type type, enum seg_rw rw) +{ + return (EFAULT); +} + +/* ARGSUSED */ +static int +seghole_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc) +{ + return (ENOMEM); +} + +/* ARGSUSED */ +static int +seghole_capable(struct seg *seg, segcapability_t capability) +{ + /* no special capablities */ + return (0); +} diff --git a/usr/src/uts/common/vm/seg_hole.h b/usr/src/uts/common/vm/seg_hole.h new file mode 100644 index 0000000000..fb48a057e0 --- /dev/null +++ b/usr/src/uts/common/vm/seg_hole.h @@ -0,0 +1,40 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. 
+ */ + +#ifndef _VM_SEG_HOLE_H +#define _VM_SEG_HOLE_H + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct seghole_crargs { + const char *name; +} seghole_crargs_t; + +typedef struct seghole_data { + const char *shd_name; +} seghole_data_t; + +extern int seghole_create(struct seg *, void *); + +#define AS_MAP_CHECK_SEGHOLE(crfp) \ + ((crfp) == (int (*)())seghole_create) + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_SEG_HOLE_H */ diff --git a/usr/src/uts/common/vm/vm_as.c b/usr/src/uts/common/vm/vm_as.c index 0becd0f81c..502fb8f386 100644 --- a/usr/src/uts/common/vm/vm_as.c +++ b/usr/src/uts/common/vm/vm_as.c @@ -21,7 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2016 Joyent, Inc. + * Copyright 2017 Joyent, Inc. * Copyright (c) 2016 by Delphix. All rights reserved. */ @@ -68,6 +68,7 @@ #include <vm/seg_kmem.h> #include <vm/seg_map.h> #include <vm/seg_spt.h> +#include <vm/seg_hole.h> #include <vm/page.h> clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */ @@ -819,7 +820,9 @@ as_dup(struct as *as, struct proc *forkedproc) as_free(newas); return (error); } - newas->a_size += seg->s_size; + if ((newseg->s_flags & S_HOLE) == 0) { + newas->a_size += seg->s_size; + } } newas->a_resvsize = as->a_resvsize - purgesize; @@ -1330,6 +1333,8 @@ top: as_clearwatchprot(as, raddr, eaddr - raddr); for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) { + const boolean_t is_hole = ((seg->s_flags & S_HOLE) != 0); + if (eaddr <= seg->s_base) break; /* eaddr was in a gap; all done */ @@ -1434,9 +1439,11 @@ retry: return (-1); } - as->a_size -= ssize; - if (rsize) - as->a_resvsize -= rsize; + if (!is_hole) { + as->a_size -= ssize; + if (rsize) + as->a_resvsize -= rsize; + } raddr += ssize; } AS_LOCK_EXIT(as); @@ -1686,6 +1693,7 @@ as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(), size_t rsize; /* rounded up size */ int error; int unmap = 0; + 
boolean_t is_hole = B_FALSE; /* * The use of a_proc is preferred to handle the case where curproc is * a door_call server and is allocating memory in the client's (a_proc) @@ -1712,7 +1720,14 @@ as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(), gethrestime(&as->a_updatetime); if (as != &kas) { - if (as->a_size + rsize > (size_t)p->p_vmem_ctl) { + /* + * Ensure that the virtual size of the process will not exceed + * the configured limit. Since seg_hole segments will later + * set the S_HOLE flag indicating their status as a hole in the + * AS, they are excluded from this check. + */ + if (as->a_size + rsize > (size_t)p->p_vmem_ctl && + !AS_MAP_CHECK_SEGHOLE(crfp)) { AS_LOCK_EXIT(as); (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], @@ -1770,19 +1785,24 @@ as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(), } /* * Add size now so as_unmap will work if as_ctl fails. + * Not applicable to explicit hole segments. */ - as->a_size += rsize; - as->a_resvsize += rsize; + if ((seg->s_flags & S_HOLE) == 0) { + as->a_size += rsize; + as->a_resvsize += rsize; + } else { + is_hole = B_TRUE; + } } as_setwatch(as); /* - * If the address space is locked, - * establish memory locks for the new segment. + * Establish memory locks for the segment if the address space is + * locked, provided it's not an explicit hole in the AS. 
*/ mutex_enter(&as->a_contents); - if (AS_ISPGLCK(as)) { + if (AS_ISPGLCK(as) && !is_hole) { mutex_exit(&as->a_contents); AS_LOCK_EXIT(as); error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0); @@ -2310,6 +2330,9 @@ retry: } for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) { + if ((seg->s_flags & S_HOLE) != 0) { + continue; + } error = SEGOP_LOCKOP(seg, seg->s_base, seg->s_size, attr, MC_LOCK, mlock_map, pos); if (error != 0) @@ -2339,6 +2362,9 @@ retry: mutex_exit(&as->a_contents); for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) { + if ((seg->s_flags & S_HOLE) != 0) { + continue; + } error = SEGOP_LOCKOP(seg, seg->s_base, seg->s_size, attr, MC_UNLOCK, NULL, 0); if (error != 0) diff --git a/usr/src/uts/i86pc/vm/vm_machdep.c b/usr/src/uts/i86pc/vm/vm_machdep.c index 0625e37bbf..152a717ad0 100644 --- a/usr/src/uts/i86pc/vm/vm_machdep.c +++ b/usr/src/uts/i86pc/vm/vm_machdep.c @@ -24,7 +24,7 @@ /* * Copyright (c) 2010, Intel Corporation. * All rights reserved. - * Copyright 2016 Joyent, Inc. + * Copyright 2017 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -689,9 +689,6 @@ map_addr_proc( base = p->p_brkbase; #if defined(__amd64) - /* - * XX64 Yes, this needs more work. - */ if (p->p_model == DATAMODEL_NATIVE) { if (userlimit < as->a_userlimit) { /* @@ -711,16 +708,24 @@ map_addr_proc( } } else { /* - * XX64 This layout is probably wrong .. but in - * the event we make the amd64 address space look - * like sparcv9 i.e. with the stack -above- the - * heap, this bit of code might even be correct. + * With the stack positioned at a higher address than + * the heap for 64-bit processes, it is necessary to be + * mindful of its location and potential size. + * + * Unallocated space above the top of the stack (that + * is, at a lower address) but still within the bounds + * of the stack limit should be considered unavailable. 
+ * + * As the 64-bit stack guard is mapped in immediately + * adjacent to the stack limit boundary, this prevents + * new mappings from having accidentally dangerous + * proximity to the stack. */ slen = p->p_usrstack - base - ((p->p_stk_ctl + PAGEOFFSET) & PAGEMASK); } } else -#endif +#endif /* defined(__amd64) */ slen = userlimit - base; /* Make len be a multiple of PAGESIZE */ |