Diffstat (limited to 'usr/src/uts/common/os')
45 files changed, 3484 insertions, 1046 deletions
| diff --git a/usr/src/uts/common/os/acct.c b/usr/src/uts/common/os/acct.c index e598e0d08d..891c4e0836 100644 --- a/usr/src/uts/common/os/acct.c +++ b/usr/src/uts/common/os/acct.c @@ -20,6 +20,7 @@   */  /*   * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017, Joyent, Inc.   */  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/ @@ -47,6 +48,7 @@  #include <sys/time.h>  #include <sys/msacct.h>  #include <sys/zone.h> +#include <sys/brand.h>  /*   * Each zone has its own accounting settings (on or off) and associated @@ -373,7 +375,7 @@ acct_compress(ulong_t t)   * On exit, write a record on the accounting file.   */  void -acct(char st) +acct(int st)  {  	struct vnode *vp;  	struct cred *cr; @@ -402,6 +404,21 @@ acct(char st)  	 * This only gets called from exit after all lwp's have exited so no  	 * cred locking is needed.  	 */ + +	/* If there is a brand-specific hook, use it instead */ +	if (ZONE_IS_BRANDED(curzone) && ZBROP(curzone)->b_acct_out != NULL) { +		ZBROP(curzone)->b_acct_out(vp, st); +		mutex_exit(&ag->aclock); +		return; +	} + +	/* +	 * The 'st' status value was traditionally masked this way by our +	 * caller, but we now accept the unmasked value for brand handling. +	 * Zones not using the brand hook mask the status here. +	 */ +	st &= 0xff; +  	p = curproc;  	ua = PTOU(p);  	bcopy(ua->u_comm, ag->acctbuf.ac_comm, sizeof (ag->acctbuf.ac_comm)); diff --git a/usr/src/uts/common/os/brand.c b/usr/src/uts/common/os/brand.c index 0af67f5d98..60e8150a0d 100644 --- a/usr/src/uts/common/os/brand.c +++ b/usr/src/uts/common/os/brand.c @@ -20,6 +20,7 @@   */  /*   * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2019 Joyent, Inc.   */  #include <sys/kmem.h> @@ -45,7 +46,7 @@ struct brand_mach_ops native_mach_ops  = {  };  #else /* !__sparcv9 */  struct brand_mach_ops native_mach_ops  = { -		NULL, NULL, NULL, NULL +		NULL, NULL, NULL, NULL, NULL, NULL, NULL  };  #endif /* !__sparcv9 */ @@ -53,7 +54,8 @@ brand_t native_brand = {  		BRAND_VER_1,  		"native",  		NULL, -		&native_mach_ops +		&native_mach_ops, +		0  };  /* @@ -310,46 +312,115 @@ brand_unregister_zone(struct brand *bp)  	mutex_exit(&brand_list_lock);  } -void -brand_setbrand(proc_t *p) +int +brand_setbrand(proc_t *p, boolean_t lwps_ok)  {  	brand_t *bp = p->p_zone->zone_brand; +	void *brand_data = NULL; -	ASSERT(bp != NULL); -	ASSERT(p->p_brand == &native_brand); +	VERIFY(MUTEX_NOT_HELD(&p->p_lock)); +	VERIFY(bp != NULL);  	/* -	 * We should only be called from exec(), when we know the process -	 * is single-threaded. +	 * Process branding occurs during fork() and exec().  When it happens +	 * during fork(), the LWP count will always be 0 since branding is +	 * performed as part of getproc(), before LWPs have been associated. +	 * The same is not true during exec(), where a multi-LWP process may +	 * undergo branding just prior to gexec(). This is to ensure +	 * exec-related brand hooks are available.  While it may seem +	 * complicated to brand a multi-LWP process, the two possible outcomes +	 * simplify things: +	 * +	 * 1. The exec() succeeds:  LWPs besides the caller will be killed and +	 *    any further branding will occur in a single-LWP context. +	 * 2. The exec() fails: The process will be promptly unbranded since +	 *    the hooks are no longer needed. 
+	 * +	 * To prevent inconsistent brand state from being encountered during +	 * the exec(), LWPs beyond the caller which are associated with this +	 * process must be held temporarily.  They will be released either when +	 * they are killed in the exec() success, or when the brand is cleared +	 * after exec() failure.  	 */ -	ASSERT(p->p_tlist == p->p_tlist->t_forw); +	if (lwps_ok) { +		/* +		 * We've been called from a exec() context tolerating the +		 * existence of multiple LWPs during branding is necessary. +		 */ +		VERIFY(p == curproc); +		VERIFY(p->p_tlist != NULL); +		if (p->p_tlist != p->p_tlist->t_forw) { +			/* +			 * Multiple LWPs are present.  Hold all but the caller. +			 */ +			if (!holdlwps(SHOLDFORK1)) { +				return (-1); +			} +		} +	} else { +		/* +		 * Processes branded during fork() should not have LWPs at all. +		 */ +		VERIFY(p->p_tlist == NULL); +	} + +	if (bp->b_data_size > 0) { +		brand_data = kmem_zalloc(bp->b_data_size, KM_SLEEP); +	} + +	mutex_enter(&p->p_lock); +	ASSERT(!PROC_IS_BRANDED(p));  	p->p_brand = bp; +	p->p_brand_data = brand_data;  	ASSERT(PROC_IS_BRANDED(p));  	BROP(p)->b_setbrand(p); +	mutex_exit(&p->p_lock); +	return (0);  }  void -brand_clearbrand(proc_t *p, boolean_t no_lwps) +brand_clearbrand(proc_t *p, boolean_t lwps_ok)  {  	brand_t *bp = p->p_zone->zone_brand; -	klwp_t *lwp = NULL; -	ASSERT(bp != NULL); -	ASSERT(!no_lwps || (p->p_tlist == NULL)); +	void *brand_data; -	/* -	 * If called from exec_common() or proc_exit(), -	 * we know the process is single-threaded. -	 * If called from fork_fail, p_tlist is NULL. -	 */ -	if (!no_lwps) { -		ASSERT(p->p_tlist == p->p_tlist->t_forw); -		lwp = p->p_tlist->t_lwp; -	} +	VERIFY(MUTEX_NOT_HELD(&p->p_lock)); +	VERIFY(bp != NULL); +	VERIFY(PROC_IS_BRANDED(p)); -	ASSERT(PROC_IS_BRANDED(p)); -	BROP(p)->b_proc_exit(p, lwp); +	if (BROP(p)->b_clearbrand != NULL) +		BROP(p)->b_clearbrand(p, lwps_ok); + +	mutex_enter(&p->p_lock);  	p->p_brand = &native_brand; +	brand_data = p->p_brand_data; +	p->p_brand_data = NULL; + +	if (lwps_ok) { +		VERIFY(p == curproc); +		/* +		 * A process with multiple LWPs is being de-branded after +		 * failing an exec.  The other LWPs were held as part of the +		 * procedure, so they must be resumed now. +		 */ +		if (p->p_tlist != NULL && p->p_tlist != p->p_tlist->t_forw) { +			continuelwps(p); +		} +	} else { +		/* +		 * While clearing the brand, it's ok for one LWP to be present. +		 * This happens when a native binary is executed inside a +		 * branded zone, since the brand will be removed during the +		 * course of a successful exec. +		 */ +		VERIFY(p->p_tlist == NULL || p->p_tlist == p->p_tlist->t_forw); +	} +	mutex_exit(&p->p_lock); + +	if (brand_data != NULL) { +		kmem_free(brand_data, bp->b_data_size); +	}  }  #if defined(__sparcv9) @@ -483,7 +554,7 @@ brand_solaris_cmd(int cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3,  		return (ENOSYS);  	/* For all other operations this must be a branded process. 
*/ -	if (p->p_brand == &native_brand) +	if (!PROC_IS_BRANDED(p))  		return (ENOSYS);  	ASSERT(p->p_brand == pbrand); @@ -600,16 +671,16 @@ restoreexecenv(struct execenv *ep, stack_t *sp)  /*ARGSUSED*/  int  brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, -    intpdata_t *idatap, int level, long *execsz, int setid, caddr_t exec_file, -    cred_t *cred, int brand_action, struct brand *pbrand, char *bname, -    char *brandlib, char *brandlib32, char *brandlinker, char *brandlinker32) +    intpdata_t *idatap, int level, size_t *execsz, int setid, +    caddr_t exec_file, cred_t *cred, int *brand_action, struct brand *pbrand, +    char *bname, char *brandlib, char *brandlib32)  {  	vnode_t		*nvp;  	Ehdr		ehdr;  	Addr		uphdr_vaddr;  	intptr_t	voffset; -	int		interp; +	char		*interp;  	int		i, err;  	struct execenv	env;  	struct execenv	origenv; @@ -619,7 +690,6 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,  	klwp_t		*lwp = ttolwp(curthread);  	brand_proc_data_t	*spd;  	brand_elf_data_t sed, *sedp; -	char		*linker;  	uintptr_t	lddata; /* lddata of executable's linker */  	ASSERT(curproc->p_brand == pbrand); @@ -636,12 +706,10 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,  	 */  	if (args->to_model == DATAMODEL_NATIVE) {  		args->emulator = brandlib; -		linker = brandlinker;  	}  #if defined(_LP64)  	else {  		args->emulator = brandlib32; -		linker = brandlinker32;  	}  #endif  /* _LP64 */ @@ -725,7 +793,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,  	if (args->to_model == DATAMODEL_NATIVE) {  		err = mapexec_brand(vp, args, &ehdr, &uphdr_vaddr,  		    &voffset, exec_file, &interp, &env.ex_bssbase, -		    &env.ex_brkbase, &env.ex_brksize, NULL); +		    &env.ex_brkbase, &env.ex_brksize, NULL, NULL);  	}  #if defined(_LP64)  	else { @@ -733,7 +801,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,  		Elf32_Addr uphdr_vaddr32;  		err = mapexec32_brand(vp, args, &ehdr32, &uphdr_vaddr32,  		    &voffset, exec_file, &interp, &env.ex_bssbase, -		    &env.ex_brkbase, &env.ex_brksize, NULL); +		    &env.ex_brkbase, &env.ex_brksize, NULL, NULL);  		Ehdr32to64(&ehdr32, &ehdr);  		if (uphdr_vaddr32 == (Elf32_Addr)-1) @@ -744,6 +812,10 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,  #endif  /* _LP64 */  	if (err != 0) {  		restoreexecenv(&origenv, &orig_sigaltstack); + +		if (interp != NULL) +			kmem_free(interp, MAXPATHLEN); +  		return (err);  	} @@ -761,7 +833,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,  	sedp->sed_phent = ehdr.e_phentsize;  	sedp->sed_phnum = ehdr.e_phnum; -	if (interp) { +	if (interp != NULL) {  		if (ehdr.e_type == ET_DYN) {  			/*  			 * This is a shared object executable, so we @@ -777,16 +849,20 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,  		 * it in and store relevant information about it in the  		 * aux vector, where the brand library can find it.  		 
*/ -		if ((err = lookupname(linker, UIO_SYSSPACE, +		if ((err = lookupname(interp, UIO_SYSSPACE,  		    FOLLOW, NULLVPP, &nvp)) != 0) { -			uprintf("%s: not found.", brandlinker); +			uprintf("%s: not found.", interp);  			restoreexecenv(&origenv, &orig_sigaltstack); +			kmem_free(interp, MAXPATHLEN);  			return (err);  		} + +		kmem_free(interp, MAXPATHLEN); +  		if (args->to_model == DATAMODEL_NATIVE) {  			err = mapexec_brand(nvp, args, &ehdr,  			    &uphdr_vaddr, &voffset, exec_file, &interp, -			    NULL, NULL, NULL, &lddata); +			    NULL, NULL, NULL, &lddata, NULL);  		}  #if defined(_LP64)  		else { @@ -794,7 +870,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,  			Elf32_Addr uphdr_vaddr32;  			err = mapexec32_brand(nvp, args, &ehdr32,  			    &uphdr_vaddr32, &voffset, exec_file, &interp, -			    NULL, NULL, NULL, &lddata); +			    NULL, NULL, NULL, &lddata, NULL);  			Ehdr32to64(&ehdr32, &ehdr);  			if (uphdr_vaddr32 == (Elf32_Addr)-1) @@ -934,9 +1010,9 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,  	/*  	 * Third, the /proc aux vectors set up by elfexec() point to -	 * brand emulation library and it's linker.  Copy these to the +	 * brand emulation library and its linker.  Copy these to the  	 * /proc brand specific aux vector, and update the regular -	 * /proc aux vectors to point to the executable (and it's +	 * /proc aux vectors to point to the executable (and its  	 * linker).  This will enable debuggers to access the  	 * executable via the usual /proc or elf notes aux vectors.  	 * @@ -1078,55 +1154,31 @@ brand_solaris_freelwp(klwp_t *l, struct brand *pbrand)  }  /*ARGSUSED*/ -int +void  brand_solaris_initlwp(klwp_t *l, struct brand *pbrand)  {  	ASSERT(l->lwp_procp->p_brand == pbrand);  	ASSERT(l->lwp_procp->p_brand_data != NULL);  	ASSERT(l->lwp_brand == NULL);  	l->lwp_brand = (void *)-1; -	return (0);  }  /*ARGSUSED*/  void  brand_solaris_lwpexit(klwp_t *l, struct brand *pbrand)  { -	proc_t  *p = l->lwp_procp; -  	ASSERT(l->lwp_procp->p_brand == pbrand);  	ASSERT(l->lwp_procp->p_brand_data != NULL);  	ASSERT(l->lwp_brand != NULL); - -	/* -	 * We should never be called for the last thread in a process. -	 * (That case is handled by brand_solaris_proc_exit().) -	 * Therefore this lwp must be exiting from a multi-threaded -	 * process. -	 */ -	ASSERT(p->p_tlist != p->p_tlist->t_forw); - -	l->lwp_brand = NULL;  }  /*ARGSUSED*/  void -brand_solaris_proc_exit(struct proc *p, klwp_t *l, struct brand *pbrand) +brand_solaris_proc_exit(struct proc *p, struct brand *pbrand)  {  	ASSERT(p->p_brand == pbrand);  	ASSERT(p->p_brand_data != NULL); -	/* -	 * When called from proc_exit(), we know that process is -	 * single-threaded and free our lwp brand data. -	 * otherwise just free p_brand_data and return. 
-	 */ -	if (l != NULL) { -		ASSERT(p->p_tlist == p->p_tlist->t_forw); -		ASSERT(p->p_tlist->t_lwp == l); -		(void) brand_solaris_freelwp(l, pbrand); -	} -  	/* upon exit, free our proc brand data */  	kmem_free(p->p_brand_data, sizeof (brand_proc_data_t));  	p->p_brand_data = NULL; @@ -1145,5 +1197,4 @@ brand_solaris_setbrand(proc_t *p, struct brand *pbrand)  	ASSERT(p->p_tlist == p->p_tlist->t_forw);  	p->p_brand_data = kmem_zalloc(sizeof (brand_proc_data_t), KM_SLEEP); -	(void) brand_solaris_initlwp(p->p_tlist->t_lwp, pbrand);  } diff --git a/usr/src/uts/common/os/clock_highres.c b/usr/src/uts/common/os/clock_highres.c index 805813037d..1280c8a1b6 100644 --- a/usr/src/uts/common/os/clock_highres.c +++ b/usr/src/uts/common/os/clock_highres.c @@ -25,7 +25,7 @@   */  /* - * Copyright (c) 2015, Joyent Inc. All rights reserved. + * Copyright 2016, Joyent Inc.   */  #include <sys/timer.h> @@ -41,6 +41,9 @@  static clock_backend_t clock_highres; +/* minimum non-privileged interval (200us) */ +long clock_highres_interval_min = 200000; +  /*ARGSUSED*/  static int  clock_highres_settime(timespec_t *ts) @@ -68,17 +71,6 @@ clock_highres_getres(timespec_t *ts)  static int  clock_highres_timer_create(itimer_t *it, void (*fire)(itimer_t *))  { -	/* -	 * CLOCK_HIGHRES timers of sufficiently high resolution can deny -	 * service; only allow privileged users to create such timers. -	 * Sites that do not wish to have this restriction should -	 * give users the "proc_clock_highres" privilege. -	 */ -	if (secpolicy_clock_highres(CRED()) != 0) { -		it->it_arg = NULL; -		return (EPERM); -	} -  	it->it_arg = kmem_zalloc(sizeof (cyclic_id_t), KM_SLEEP);  	it->it_fire = fire; @@ -111,6 +103,49 @@ clock_highres_timer_settime(itimer_t *it, int flags,  	cpu_t *cpu;  	cpupart_t *cpupart;  	int pset; +	boolean_t value_need_clamp = B_FALSE; +	boolean_t intval_need_clamp = B_FALSE; +	cred_t *cr = CRED(); +	struct itimerspec clamped; + +	/* +	 * CLOCK_HIGHRES timers of sufficiently high resolution can deny +	 * service; only allow privileged users to create such timers. +	 * Non-privileged users (those without the "proc_clock_highres" +	 * privilege) can create timers with lower resolution but if they +	 * attempt to use a very low time value (< 200us) then their +	 * timer will be clamped at 200us. 
+	 */ +	if (when->it_value.tv_sec == 0 && +	    when->it_value.tv_nsec > 0 && +	    when->it_value.tv_nsec < clock_highres_interval_min) +		value_need_clamp = B_TRUE; + +	if (when->it_interval.tv_sec == 0 && +	    when->it_interval.tv_nsec > 0 && +	    when->it_interval.tv_nsec < clock_highres_interval_min) +		intval_need_clamp = B_TRUE; + +	if ((value_need_clamp || intval_need_clamp) && +	    secpolicy_clock_highres(cr) != 0) { +		clamped.it_value.tv_sec = when->it_value.tv_sec; +		clamped.it_interval.tv_sec = when->it_interval.tv_sec; + +		if (value_need_clamp) { +			clamped.it_value.tv_nsec = clock_highres_interval_min; +		} else { +			clamped.it_value.tv_nsec = when->it_value.tv_nsec; +		} + +		if (intval_need_clamp) { +			clamped.it_interval.tv_nsec = +			    clock_highres_interval_min; +		} else { +			clamped.it_interval.tv_nsec = when->it_interval.tv_nsec; +		} + +		when = &clamped; +	}  	cyctime.cyt_when = ts2hrt(&when->it_value);  	cyctime.cyt_interval = ts2hrt(&when->it_interval); diff --git a/usr/src/uts/common/os/contract.c b/usr/src/uts/common/os/contract.c index 909a6c2860..1a3502a710 100644 --- a/usr/src/uts/common/os/contract.c +++ b/usr/src/uts/common/os/contract.c @@ -21,6 +21,7 @@  /*   * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2016 Joyent, Inc.   */  /*   * Copyright (c) 2017 by Delphix. All rights reserved. @@ -290,7 +291,10 @@ contract_ctor(contract_t *ct, ct_type_t *type, ct_template_t *tmpl, void *data,  	avl_index_t where;  	klwp_t *curlwp = ttolwp(curthread); -	ASSERT(author == curproc); +	/* +	 * It's possible that author is not curproc if the zone is creating +	 * a new process as a child of zsched. +	 */  	mutex_init(&ct->ct_lock, NULL, MUTEX_DEFAULT, NULL);  	mutex_init(&ct->ct_reflock, NULL, MUTEX_DEFAULT, NULL); diff --git a/usr/src/uts/common/os/core.c b/usr/src/uts/common/os/core.c index d5e272c16a..a147b1cf0f 100644 --- a/usr/src/uts/common/os/core.c +++ b/usr/src/uts/common/os/core.c @@ -21,7 +21,7 @@  /*   * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent Inc. All rights reserved. + * Copyright 2019 Joyent Inc.   * Copyright (c) 2016 by Delphix. All rights reserved.   */ @@ -125,6 +125,7 @@ remove_core_file(char *fp, enum core_types core_type)  	/*  	 * Determine what rootvp to use.  	 */ +	mutex_enter(&curproc->p_lock);  	if (core_type == CORE_PROC) {  		rootvp = (PTOU(curproc)->u_rdir == NULL ?  		    curproc->p_zone->zone_rootvp : PTOU(curproc)->u_rdir); @@ -140,6 +141,7 @@ remove_core_file(char *fp, enum core_types core_type)  	VN_HOLD(startvp);  	if (rootvp != rootdir)  		VN_HOLD(rootvp); +	mutex_exit(&curproc->p_lock);  	if ((error = lookuppnvp(&pn, NULL, NO_FOLLOW, &dvp, &vp, rootvp,  	    startvp, CRED())) != 0) {  		pn_free(&pn); @@ -793,7 +795,7 @@ clock_t	core_delay_usec = 10000;   * using core_write() below, and so it has the same failure semantics.   
*/  int -core_seg(proc_t *p, vnode_t *vp, offset_t offset, caddr_t addr, size_t size, +core_seg(proc_t *p, vnode_t *vp, u_offset_t offset, caddr_t addr, size_t size,      rlim64_t rlimit, cred_t *credp)  {  	caddr_t eaddr; @@ -801,6 +803,11 @@ core_seg(proc_t *p, vnode_t *vp, offset_t offset, caddr_t addr, size_t size,  	size_t len;  	int err = 0; +	if (offset > OFF_MAX || offset + size > OFF_MAX || +	    offset + size < offset) { +		return (EOVERFLOW); +	} +  	eaddr = addr + size;  	for (base = addr; base < eaddr; base += len) {  		len = eaddr - base; @@ -841,15 +848,20 @@ core_seg(proc_t *p, vnode_t *vp, offset_t offset, caddr_t addr, size_t size,   * unexpectedly returns zero but no progress has been made, we return ENOSPC.   */  int -core_write(vnode_t *vp, enum uio_seg segflg, offset_t offset, +core_write(vnode_t *vp, enum uio_seg segflg, u_offset_t offset,      const void *buf, size_t len, rlim64_t rlimit, cred_t *credp)  {  	ssize_t resid = len;  	int error = 0; +	if (offset > OFF_MAX || offset + len > OFF_MAX || +	    offset + len < offset) { +		return (EOVERFLOW); +	} +  	while (len != 0) { -		error = vn_rdwr(UIO_WRITE, vp, (caddr_t)buf, len, offset, -		    segflg, 0, rlimit, credp, &resid); +		error = vn_rdwr(UIO_WRITE, vp, (caddr_t)buf, len, +		    (offset_t)offset, segflg, 0, rlimit, credp, &resid);  		if (error != 0)  			break; diff --git a/usr/src/uts/common/os/cpu.c b/usr/src/uts/common/os/cpu.c index 87c0896814..4648dae9dd 100644 --- a/usr/src/uts/common/os/cpu.c +++ b/usr/src/uts/common/os/cpu.c @@ -21,6 +21,7 @@  /*   * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.   * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright 2018 Joyent, Inc.   */  /* @@ -108,7 +109,8 @@ kmutex_t	cpu_lock;  cpu_t		*cpu_list;		/* list of all CPUs */  cpu_t		*clock_cpu_list;	/* used by clock to walk CPUs */  cpu_t		*cpu_active;		/* list of active CPUs */ -static cpuset_t	cpu_available;		/* set of available CPUs */ +cpuset_t	cpu_active_set;		/* cached set of active CPUs */ +cpuset_t	cpu_available;		/* set of available CPUs */  cpuset_t	cpu_seqid_inuse;	/* which cpu_seqids are in use */  cpu_t		**cpu_seq;		/* ptrs to CPUs, indexed by seq_id */ @@ -386,36 +388,56 @@ force_thread_migrate(kthread_id_t tp)  /*   * Set affinity for a specified CPU. - * A reference count is incremented and the affinity is held until the - * reference count is decremented to zero by thread_affinity_clear(). - * This is so regions of code requiring affinity can be nested. - * Caller needs to ensure that cpu_id remains valid, which can be - * done by holding cpu_lock across this call, unless the caller - * specifies CPU_CURRENT in which case the cpu_lock will be acquired - * by thread_affinity_set and CPU->cpu_id will be the target CPU. + * + * Specifying a cpu_id of CPU_CURRENT, allowed _only_ when setting affinity for + * curthread, will set affinity to the CPU on which the thread is currently + * running.  For other cpu_id values, the caller must ensure that the + * referenced CPU remains valid, which can be done by holding cpu_lock across + * this call. + * + * CPU affinity is guaranteed after return of thread_affinity_set().  If a + * caller setting affinity to CPU_CURRENT requires that its thread not migrate + * CPUs prior to a successful return, it should take extra precautions (such as + * their own call to kpreempt_disable) to ensure that safety. + * + * CPU_BEST can be used to pick a "best" CPU to migrate to, including + * potentially the current CPU. 
+ * + * A CPU affinity reference count is maintained by thread_affinity_set and + * thread_affinity_clear (incrementing and decrementing it, respectively), + * maintaining CPU affinity while the count is non-zero, and allowing regions + * of code which require affinity to be nested.   */  void  thread_affinity_set(kthread_id_t t, int cpu_id)  { -	cpu_t		*cp; -	int		c; +	cpu_t *cp;  	ASSERT(!(t == curthread && t->t_weakbound_cpu != NULL)); -	if ((c = cpu_id) == CPU_CURRENT) { -		mutex_enter(&cpu_lock); -		cpu_id = CPU->cpu_id; +	if (cpu_id == CPU_CURRENT) { +		VERIFY3P(t, ==, curthread); +		kpreempt_disable(); +		cp = CPU; +	} else if (cpu_id == CPU_BEST) { +		VERIFY3P(t, ==, curthread); +		kpreempt_disable(); +		cp = disp_choose_best_cpu(); +	} else { +		/* +		 * We should be asserting that cpu_lock is held here, but +		 * the NCA code doesn't acquire it.  The following assert +		 * should be uncommented when the NCA code is fixed. +		 * +		 * ASSERT(MUTEX_HELD(&cpu_lock)); +		 */ +		VERIFY((cpu_id >= 0) && (cpu_id < NCPU)); +		cp = cpu[cpu_id]; + +		/* user must provide a good cpu_id */ +		VERIFY(cp != NULL);  	} -	/* -	 * We should be asserting that cpu_lock is held here, but -	 * the NCA code doesn't acquire it.  The following assert -	 * should be uncommented when the NCA code is fixed. -	 * -	 * ASSERT(MUTEX_HELD(&cpu_lock)); -	 */ -	ASSERT((cpu_id >= 0) && (cpu_id < NCPU)); -	cp = cpu[cpu_id]; -	ASSERT(cp != NULL);		/* user must provide a good cpu_id */ +  	/*  	 * If there is already a hard affinity requested, and this affinity  	 * conflicts with that, panic. @@ -432,13 +454,14 @@ thread_affinity_set(kthread_id_t t, int cpu_id)  	 * Make sure we're running on the right CPU.  	 */  	if (cp != t->t_cpu || t != curthread) { +		ASSERT(cpu_id != CPU_CURRENT);  		force_thread_migrate(t);	/* drops thread lock */  	} else {  		thread_unlock(t);  	} -	if (c == CPU_CURRENT) -		mutex_exit(&cpu_lock); +	if (cpu_id == CPU_CURRENT || cpu_id == CPU_BEST) +		kpreempt_enable();  }  /* @@ -1473,8 +1496,8 @@ again:	for (loop_count = 0; (*bound_func)(cp, 0); loop_count++) {  				 * Update CPU last ran on if it was this CPU  				 */  				if (t->t_cpu == cp && t->t_bound_cpu != cp) -					t->t_cpu = disp_lowpri_cpu(ncp, -					    t->t_lpl, t->t_pri, NULL); +					t->t_cpu = disp_lowpri_cpu(ncp, t, +					    t->t_pri);  				ASSERT(t->t_cpu != cp || t->t_bound_cpu == cp ||  				    t->t_weakbound_cpu == cp); @@ -1516,10 +1539,9 @@ again:	for (loop_count = 0; (*bound_func)(cp, 0); loop_count++) {  			 * Update CPU last ran on if it was this CPU  			 */ -			if (t->t_cpu == cp && t->t_bound_cpu != cp) { -				t->t_cpu = disp_lowpri_cpu(ncp, -				    t->t_lpl, t->t_pri, NULL); -			} +			if (t->t_cpu == cp && t->t_bound_cpu != cp) +				t->t_cpu = disp_lowpri_cpu(ncp, t, t->t_pri); +  			ASSERT(t->t_cpu != cp || t->t_bound_cpu == cp ||  			    t->t_weakbound_cpu == cp);  			t = t->t_next; @@ -1724,6 +1746,7 @@ cpu_list_init(cpu_t *cp)  	cp->cpu_part = &cp_default;  	CPUSET_ADD(cpu_available, cp->cpu_id); +	CPUSET_ADD(cpu_active_set, cp->cpu_id);  }  /* @@ -1895,6 +1918,7 @@ cpu_add_active_internal(cpu_t *cp)  	cp->cpu_prev_onln = cpu_active->cpu_prev_onln;  	cpu_active->cpu_prev_onln->cpu_next_onln = cp;  	cpu_active->cpu_prev_onln = cp; +	CPUSET_ADD(cpu_active_set, cp->cpu_id);  	if (pp->cp_cpulist) {  		cp->cpu_next_part = pp->cp_cpulist; @@ -1965,6 +1989,7 @@ cpu_remove_active(cpu_t *cp)  	}  	cp->cpu_next_onln = cp;  	cp->cpu_prev_onln = cp; +	CPUSET_DEL(cpu_active_set, cp->cpu_id);  	
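/*
 * Illustrative sketch of the CPU_CURRENT usage pattern described in the new
 * thread_affinity_set() comment above; this is not part of the change itself.
 * Affinity is only guaranteed once the call returns, so a caller that must not
 * migrate beforehand brackets the call with its own kpreempt_disable() and
 * kpreempt_enable() pair, then drops the hold with thread_affinity_clear().
 */
	kpreempt_disable();
	thread_affinity_set(curthread, CPU_CURRENT);	/* take an affinity hold */
	kpreempt_enable();
	/* ... CPU-local work runs here under the affinity reference ... */
	thread_affinity_clear(curthread);		/* release the hold */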
cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part;  	cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part; @@ -2704,13 +2729,18 @@ cpu_bind_thread(kthread_id_t tp, processorid_t bind, processorid_t *obind,  	return (0);  } -#if CPUSET_WORDS > 1 -/* - * Functions for implementing cpuset operations when a cpuset is more - * than one word.  On platforms where a cpuset is a single word these - * are implemented as macros in cpuvar.h. - */ +cpuset_t * +cpuset_alloc(int kmflags) +{ +	return (kmem_alloc(sizeof (cpuset_t), kmflags)); +} + +void +cpuset_free(cpuset_t *s) +{ +	kmem_free(s, sizeof (cpuset_t)); +}  void  cpuset_all(cpuset_t *s) @@ -2722,38 +2752,61 @@ cpuset_all(cpuset_t *s)  }  void -cpuset_all_but(cpuset_t *s, uint_t cpu) +cpuset_all_but(cpuset_t *s, const uint_t cpu)  {  	cpuset_all(s);  	CPUSET_DEL(*s, cpu);  }  void -cpuset_only(cpuset_t *s, uint_t cpu) +cpuset_only(cpuset_t *s, const uint_t cpu)  {  	CPUSET_ZERO(*s);  	CPUSET_ADD(*s, cpu);  } +long +cpu_in_set(cpuset_t *s, const uint_t cpu) +{ +	VERIFY(cpu < NCPU); +	return (BT_TEST(s->cpub, cpu)); +} + +void +cpuset_add(cpuset_t *s, const uint_t cpu) +{ +	VERIFY(cpu < NCPU); +	BT_SET(s->cpub, cpu); +} + +void +cpuset_del(cpuset_t *s, const uint_t cpu) +{ +	VERIFY(cpu < NCPU); +	BT_CLEAR(s->cpub, cpu); +} +  int  cpuset_isnull(cpuset_t *s)  {  	int i; -	for (i = 0; i < CPUSET_WORDS; i++) +	for (i = 0; i < CPUSET_WORDS; i++) {  		if (s->cpub[i] != 0)  			return (0); +	}  	return (1);  }  int -cpuset_cmp(cpuset_t *s1, cpuset_t *s2) +cpuset_isequal(cpuset_t *s1, cpuset_t *s2)  {  	int i; -	for (i = 0; i < CPUSET_WORDS; i++) +	for (i = 0; i < CPUSET_WORDS; i++) {  		if (s1->cpub[i] != s2->cpub[i])  			return (0); +	}  	return (1);  } @@ -2822,7 +2875,72 @@ cpuset_bounds(cpuset_t *s, uint_t *smallestid, uint_t *largestid)  	*smallestid = *largestid = CPUSET_NOTINSET;  } -#endif	/* CPUSET_WORDS */ +void +cpuset_atomic_del(cpuset_t *s, const uint_t cpu) +{ +	VERIFY(cpu < NCPU); +	BT_ATOMIC_CLEAR(s->cpub, (cpu)) +} + +void +cpuset_atomic_add(cpuset_t *s, const uint_t cpu) +{ +	VERIFY(cpu < NCPU); +	BT_ATOMIC_SET(s->cpub, (cpu)) +} + +long +cpuset_atomic_xadd(cpuset_t *s, const uint_t cpu) +{ +	long res; + +	VERIFY(cpu < NCPU); +	BT_ATOMIC_SET_EXCL(s->cpub, cpu, res); +	return (res); +} + +long +cpuset_atomic_xdel(cpuset_t *s, const uint_t cpu) +{ +	long res; + +	VERIFY(cpu < NCPU); +	BT_ATOMIC_CLEAR_EXCL(s->cpub, cpu, res); +	return (res); +} + +void +cpuset_or(cpuset_t *dst, cpuset_t *src) +{ +	for (int i = 0; i < CPUSET_WORDS; i++) { +		dst->cpub[i] |= src->cpub[i]; +	} +} + +void +cpuset_xor(cpuset_t *dst, cpuset_t *src) +{ +	for (int i = 0; i < CPUSET_WORDS; i++) { +		dst->cpub[i] ^= src->cpub[i]; +	} +} + +void +cpuset_and(cpuset_t *dst, cpuset_t *src) +{ +	for (int i = 0; i < CPUSET_WORDS; i++) { +		dst->cpub[i] &= src->cpub[i]; +	} +} + +void +cpuset_zero(cpuset_t *dst) +{ +	for (int i = 0; i < CPUSET_WORDS; i++) { +		dst->cpub[i] = 0; +	} +} +  /*   * Unbind threads bound to specified CPU. diff --git a/usr/src/uts/common/os/cred.c b/usr/src/uts/common/os/cred.c index 25727d54c5..0bd6cfd44f 100644 --- a/usr/src/uts/common/os/cred.c +++ b/usr/src/uts/common/os/cred.c @@ -729,6 +729,14 @@ crgetzoneid(const cred_t *cr)  	    cr->cr_zone->zone_id);  } +zoneid_t +crgetzonedid(const cred_t *cr) +{ +	return (cr->cr_zone == NULL ? +	    (cr->cr_uid == -1 ? 
(zoneid_t)-1 : GLOBAL_ZONEID) : +	    cr->cr_zone->zone_did); +} +  projid_t  crgetprojid(const cred_t *cr)  { diff --git a/usr/src/uts/common/os/cyclic.c b/usr/src/uts/common/os/cyclic.c index 0aa54eeaee..316dffc326 100644 --- a/usr/src/uts/common/os/cyclic.c +++ b/usr/src/uts/common/os/cyclic.c @@ -24,7 +24,7 @@   */  /* - * Copyright (c) 2012, Joyent Inc. All rights reserved. + * Copyright 2018 Joyent Inc.   */  /* @@ -112,6 +112,7 @@   *      cyclic_remove()      <-- Removes a cyclic   *      cyclic_bind()        <-- Change a cyclic's CPU or partition binding   *      cyclic_reprogram()   <-- Reprogram a cyclic's expiration + *      cyclic_move_here()   <-- Shuffle cyclic to current CPU   *   *  Inter-subsystem Interfaces   * @@ -3111,6 +3112,61 @@ cyclic_reprogram(cyclic_id_t id, hrtime_t expiration)  	return (1);  } +/* + *  void cyclic_move_here(cyclic_id_t) + * + *  Overview + * + *    cyclic_move_here() attempts to shuffle a cyclic onto the current CPU. + * + *  Arguments and notes + * + *    The first argument is a cyclic_id returned from cyclic_add(). + *    cyclic_move_here() may _not_ be called on a cyclic_id returned from + *    cyclic_add_omni() or one bound to a CPU or partition via cyclic_bind(). + * + *    This cyclic shuffling is performed on a best-effort basis.  If for some + *    reason the current CPU is unsuitable or the thread migrates between CPUs + *    during the call, the function may return with the cyclic residing on some + *    other CPU. + * + *  Return value + * + *    None; cyclic_move_here() always reports success. + * + *  Caller's context + * + *    cpu_lock must be held by the caller, and the caller must not be in + *    interrupt context.  The caller may not hold any locks which are also + *    grabbed by any cyclic handler. + */ +void +cyclic_move_here(cyclic_id_t id) +{ +	cyc_id_t *idp = (cyc_id_t *)id; +	cyc_cpu_t *cc = idp->cyi_cpu; +	cpu_t *dest = CPU; + +	ASSERT(MUTEX_HELD(&cpu_lock)); +	CYC_PTRACE("move_here", idp, dest); +	VERIFY3P(cc, !=, NULL); +	VERIFY3U(cc->cyp_cyclics[idp->cyi_ndx].cy_flags & +	    (CYF_CPU_BOUND|CYF_PART_BOUND), ==, 0); + +	if (cc->cyp_cpu == dest) { +		return; +	} + +	/* Is the destination CPU suitable for a migration target? */ +	if (dest->cpu_cyclic == NULL || +	    dest->cpu_cyclic->cyp_state == CYS_OFFLINE || +	    (dest->cpu_flags & CPU_ENABLE) == 0) { +		return; +	} + +	cyclic_juggle_one_to(idp, dest->cpu_cyclic); +} +  hrtime_t  cyclic_getres()  { diff --git a/usr/src/uts/common/os/ddi_intr_irm.c b/usr/src/uts/common/os/ddi_intr_irm.c index c3c0481e7f..a4b35dcb5b 100644 --- a/usr/src/uts/common/os/ddi_intr_irm.c +++ b/usr/src/uts/common/os/ddi_intr_irm.c @@ -1320,7 +1320,7 @@ i_ddi_irm_notify(ddi_irm_pool_t *pool_p, ddi_irm_req_t *req_p)  	/* Log callback errors */  	if (ret != DDI_SUCCESS) { -		cmn_err(CE_WARN, "%s%d: failed callback (action=%d, ret=%d)\n", +		cmn_err(CE_WARN, "!%s%d: failed callback (action=%d, ret=%d)\n",  		    ddi_driver_name(req_p->ireq_dip),  		    ddi_get_instance(req_p->ireq_dip), (int)action, ret);  	} diff --git a/usr/src/uts/common/os/exec.c b/usr/src/uts/common/os/exec.c index f51e2c5ca1..24b6f0e2eb 100644 --- a/usr/src/uts/common/os/exec.c +++ b/usr/src/uts/common/os/exec.c @@ -26,7 +26,7 @@  /*	Copyright (c) 1988 AT&T	*/  /*	  All Rights Reserved  	*/  /* - * Copyright 2017 Joyent, Inc. + * Copyright 2019 Joyent, Inc.   
*/  #include <sys/types.h> @@ -99,6 +99,7 @@ uint_t auxv_hwcap32_2 = 0;	/* 32-bit version of auxv_hwcap2 */  #endif  #define	PSUIDFLAGS		(SNOCD|SUGID) +#define	RANDOM_LEN	16	/* 16 bytes for AT_RANDOM aux entry */  /*   * These are consumed within the specific exec modules, but are defined here @@ -143,7 +144,7 @@ exec_common(const char *fname, const char **argp, const char **envp,  	proc_t *p = ttoproc(curthread);  	klwp_t *lwp = ttolwp(curthread);  	struct user *up = PTOU(p); -	long execsz;		/* temporary count of exec size */ +	size_t execsz;		/* temporary count of exec size */  	int i;  	int error;  	char exec_file[MAXCOMLEN+1]; @@ -265,8 +266,10 @@ exec_common(const char *fname, const char **argp, const char **envp,  	 * only if the pathname does not contain a "/" the resolved path  	 * points to a file in the current working (attribute) directory.  	 */ -	if ((p->p_user.u_cdir->v_flag & V_XATTRDIR) != 0 && +	mutex_enter(&p->p_lock); +	if ((PTOU(p)->u_cdir->v_flag & V_XATTRDIR) != 0 &&  	    strchr(resolvepn.pn_path, '/') == NULL) { +		mutex_exit(&p->p_lock);  		if (dir != NULL)  			VN_RELE(dir);  		error = EACCES; @@ -275,6 +278,7 @@ exec_common(const char *fname, const char **argp, const char **envp,  		VN_RELE(vp);  		goto out;  	} +	mutex_exit(&p->p_lock);  	bzero(exec_file, MAXCOMLEN+1);  	(void) strncpy(exec_file, pn.pn_path, MAXCOMLEN); @@ -322,14 +326,43 @@ exec_common(const char *fname, const char **argp, const char **envp,  	ua.argp = argp;  	ua.envp = envp; -	/* If necessary, brand this process before we start the exec. */ -	if (brandme) -		brand_setbrand(p); +	/* If necessary, brand this process/lwp before we start the exec. */ +	if (brandme) { +		void *brand_data = NULL; + +		/* +		 * Process branding may fail if multiple LWPs are present and +		 * holdlwps() cannot complete successfully. +		 */ +		error = brand_setbrand(p, B_TRUE); + +		if (error == 0 && BROP(p)->b_lwpdata_alloc != NULL) { +			brand_data = BROP(p)->b_lwpdata_alloc(p); +			if (brand_data == NULL) { +				error = 1; +			} +		} + +		if (error == 0) { +			mutex_enter(&p->p_lock); +			BROP(p)->b_initlwp(lwp, brand_data); +			mutex_exit(&p->p_lock); +		} else { +			VN_RELE(vp); +			if (dir != NULL) { +				VN_RELE(dir); +			} +			pn_free(&resolvepn); +			goto fail; +		} +	}  	if ((error = gexec(&vp, &ua, &args, NULL, 0, &execsz, -	    exec_file, p->p_cred, brand_action)) != 0) { -		if (brandme) -			brand_clearbrand(p, B_FALSE); +	    exec_file, p->p_cred, &brand_action)) != 0) { +		if (brandme) { +			BROP(p)->b_freelwp(lwp); +			brand_clearbrand(p, B_TRUE); +		}  		VN_RELE(vp);  		if (dir != NULL)  			VN_RELE(dir); @@ -361,7 +394,7 @@ exec_common(const char *fname, const char **argp, const char **envp,  	/*  	 * Clear contract template state  	 */ -	lwp_ctmpl_clear(lwp); +	lwp_ctmpl_clear(lwp, B_TRUE);  	/*  	 * Save the directory in which we found the executable for expanding @@ -385,6 +418,8 @@ exec_common(const char *fname, const char **argp, const char **envp,  	 * pending held signals remain held, so don't clear t_hold.  	 */  	mutex_enter(&p->p_lock); +	DTRACE_PROBE3(oldcontext__set, klwp_t *, lwp, +	    uintptr_t, lwp->lwp_oldcontext, uintptr_t, 0);  	lwp->lwp_oldcontext = 0;  	lwp->lwp_ustack = 0;  	lwp->lwp_old_stk_ctl = 0; @@ -444,8 +479,10 @@ exec_common(const char *fname, const char **argp, const char **envp,  	TRACE_2(TR_FAC_PROC, TR_PROC_EXEC, "proc_exec:p %p up %p", p, up);  	/* Unbrand ourself if necessary. 
*/ -	if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE)) +	if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE)) { +		BROP(p)->b_freelwp(lwp);  		brand_clearbrand(p, B_FALSE); +	}  	setregs(&args); @@ -566,10 +603,10 @@ gexec(  	struct uarg *args,  	struct intpdata *idatap,  	int level, -	long *execsz, +	size_t *execsz,  	caddr_t exec_file,  	struct cred *cred, -	int brand_action) +	int *brand_action)  {  	struct vnode *vp, *execvp = NULL;  	proc_t *pp = ttoproc(curthread); @@ -890,8 +927,14 @@ gexec(  			if (pp->p_plist || (pp->p_proc_flag & P_PR_TRACE))  				args->traceinval = 1;  		} -		if (pp->p_proc_flag & P_PR_PTRACE) + +		/* +		 * If legacy ptrace is enabled, generate the SIGTRAP. +		 */ +		if (pp->p_proc_flag & P_PR_PTRACE) {  			psignal(pp, SIGTRAP); +		} +  		if (args->traceinval)  			prinvalidate(&pp->p_user);  	} @@ -1448,7 +1491,7 @@ noexec(      struct uarg *args,      struct intpdata *idatap,      int level, -    long *execsz, +    size_t *execsz,      int setid,      caddr_t exec_file,      struct cred *cred) @@ -1555,6 +1598,27 @@ stk_add(uarg_t *args, const char *sp, enum uio_seg segflg)  	return (0);  } +/* + * Add a fixed size byte array to the stack (only from kernel space). + */ +static int +stk_byte_add(uarg_t *args, const uint8_t *sp, size_t len) +{ +	int error; + +	if (STK_AVAIL(args) < sizeof (int)) +		return (E2BIG); +	*--args->stk_offp = args->stk_strp - args->stk_base; + +	if (len > STK_AVAIL(args)) +		return (E2BIG); +	bcopy(sp, args->stk_strp, len); + +	args->stk_strp += len; + +	return (0); +} +  static int  stk_getptr(uarg_t *args, char *src, char **dst)  { @@ -1591,6 +1655,7 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)  	size_t size, pad;  	char *argv = (char *)uap->argp;  	char *envp = (char *)uap->envp; +	uint8_t rdata[RANDOM_LEN];  	/*  	 * Copy interpreter's name and argument to argv[0] and argv[1]. @@ -1673,8 +1738,9 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)  	args->ne = args->na - argc;  	/* -	 * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, and -	 * AT_SUN_EMULATOR strings to the stack. +	 * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, +	 * AT_SUN_BRAND_NROOT, and AT_SUN_EMULATOR strings, as well as AT_RANDOM +	 * array, to the stack.  	 */  	if (auxvpp != NULL && *auxvpp != NULL) {  		if ((error = stk_add(args, platform, UIO_SYSSPACE)) != 0) @@ -1687,6 +1753,20 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)  		if (args->emulator != NULL &&  		    (error = stk_add(args, args->emulator, UIO_SYSSPACE)) != 0)  			return (error); + +		/* +		 * For the AT_RANDOM aux vector we provide 16 bytes of random +		 * data. +		 */ +		(void) random_get_pseudo_bytes(rdata, sizeof (rdata)); + +		if ((error = stk_byte_add(args, rdata, sizeof (rdata))) != 0) +			return (error); + +		if (args->brand_nroot != NULL && +		    (error = stk_add(args, args->brand_nroot, +		    UIO_SYSSPACE)) != 0) +			return (error);  	}  	/* @@ -1793,7 +1873,7 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up)  	/*  	 * Fill in the aux vector now that we know the user stack addresses  	 * for the AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME and -	 * AT_SUN_EMULATOR strings. +	 * AT_SUN_EMULATOR strings, as well as the AT_RANDOM array.  	 
*/  	if (auxvpp != NULL && *auxvpp != NULL) {  		if (args->to_model == DATAMODEL_NATIVE) { @@ -1806,6 +1886,11 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up)  			if (args->emulator != NULL)  				ADDAUX(*a,  				    AT_SUN_EMULATOR, (long)&ustrp[*--offp]) +			ADDAUX(*a, AT_RANDOM, (long)&ustrp[*--offp]) +			if (args->brand_nroot != NULL) { +				ADDAUX(*a, +				    AT_SUN_BRAND_NROOT, (long)&ustrp[*--offp]) +			}  		} else {  			auxv32_t **a = (auxv32_t **)auxvpp;  			ADDAUX(*a, @@ -1818,6 +1903,11 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up)  			if (args->emulator != NULL)  				ADDAUX(*a, AT_SUN_EMULATOR,  				    (int)(uintptr_t)&ustrp[*--offp]) +			ADDAUX(*a, AT_RANDOM, (int)(uintptr_t)&ustrp[*--offp]) +			if (args->brand_nroot != NULL) { +				ADDAUX(*a, AT_SUN_BRAND_NROOT, +				    (int)(uintptr_t)&ustrp[*--offp]) +			}  		}  	} @@ -1961,6 +2051,9 @@ exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)  		usrstack = (char *)USRSTACK32;  	} +	if (args->maxstack != 0 && (uintptr_t)usrstack > args->maxstack) +		usrstack = (char *)args->maxstack; +  	ASSERT(P2PHASE((uintptr_t)usrstack, args->stk_align) == 0);  #if defined(__sparc) diff --git a/usr/src/uts/common/os/exit.c b/usr/src/uts/common/os/exit.c index 1b9359da47..06e0117cd6 100644 --- a/usr/src/uts/common/os/exit.c +++ b/usr/src/uts/common/os/exit.c @@ -21,7 +21,7 @@  /*   * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent, Inc. All rights reserved. + * Copyright 2018 Joyent, Inc.   */  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/ @@ -138,6 +138,27 @@ rexit(int rval)  }  /* + * Bump the init_restarts kstat and let interested parties know about the + * restart. + */ +static void +restart_init_notify(zone_t *zone) +{ +	nvlist_t *nvl = NULL; + +	zone->zone_proc_init_restarts++; + +	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0 && +	    nvlist_add_uint32(nvl, ZONE_CB_RESTARTS, +	    zone->zone_proc_init_restarts) == 0) { +		zone_sysevent_publish(zone, ZONE_EVENT_INIT_CLASS, +		    ZONE_EVENT_INIT_RESTART_SC, nvl); +	} + +	nvlist_free(nvl); +} + +/*   * Called by proc_exit() when a zone's init exits, presumably because   * it failed.  As long as the given zone is still in the "running"   * state, we will re-exec() init, but first we need to reset things @@ -230,7 +251,7 @@ restart_init(int what, int why)  		siginfofree(lwp->lwp_curinfo);  		lwp->lwp_curinfo = NULL;  	} -	lwp_ctmpl_clear(lwp); +	lwp_ctmpl_clear(lwp, B_FALSE);  	/*  	 * Reset both the process root directory and the current working @@ -260,6 +281,8 @@ restart_init(int what, int why)  	ASSERT(p == curproc);  	(void) freectty(B_TRUE); +	restart_init_notify(p->p_zone); +  	/*  	 * Now exec() the new init(1M) on top of the current process.  If we  	 * succeed, the caller will treat this like a successful system call. @@ -320,6 +343,119 @@ proc_is_exiting(proc_t *p)  }  /* + * Return true if zone's init is restarted, false if exit processing should + * proceeed. + */ +static boolean_t +zone_init_exit(zone_t *z, int why, int what) +{ +	/* +	 * Typically we don't let the zone's init exit unless zone_start_init() +	 * failed its exec, or we are shutting down the zone or the machine, +	 * although the various flags handled within this function will control +	 * the behavior. +	 * +	 * Since we are single threaded, we don't need to lock the following +	 * accesses to zone_proc_initpid. 
+	 */ +	if (z->zone_boot_err != 0 || +	    zone_status_get(z) >= ZONE_IS_SHUTTING_DOWN || +	    zone_status_get(global_zone) >= ZONE_IS_SHUTTING_DOWN) { +		/* +		 * Clear the zone's init pid and proceed with exit processing. +		 */ +		z->zone_proc_initpid = -1; +		return (B_FALSE); +	} + +	/* +	 * There are a variety of configuration flags on the zone to control +	 * init exit behavior. +	 * +	 * If the init process should be restarted, the "zone_restart_init" +	 * member will be set. +	 */ +	if (!z->zone_restart_init) { +		/* +		 * The zone has been setup to halt when init exits. +		 */ +		z->zone_init_status = wstat(why, what); +		(void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, zone_kcred()); +		z->zone_proc_initpid = -1; +		return (B_FALSE); +	} + +	/* +	 * At this point we know we're configured to restart init, but there +	 * are various modifiers to that behavior. +	 */ + +	if (z->zone_reboot_on_init_exit) { +		/* +		 * Some init programs in branded zones do not tolerate a +		 * restart in the traditional manner; setting +		 * "zone_reboot_on_init_exit" will cause the entire zone to be +		 * rebooted instead. +		 */ + +		if (z->zone_restart_init_0) { +			/* +			 * Some init programs in branded zones only want to +			 * restart if they exit 0, otherwise the zone should +			 * shutdown. Setting the "zone_restart_init_0" member +			 * controls this behavior. +			 */ +			if (why == CLD_EXITED && what == 0) { +				/* Trigger a zone reboot */ +				(void) zone_kadmin(A_REBOOT, 0, NULL, +				    zone_kcred()); +			} else { +				/* Shutdown instead of reboot */ +				(void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, +				    zone_kcred()); +			} +		} else { +			/* Trigger a zone reboot */ +			(void) zone_kadmin(A_REBOOT, 0, NULL, zone_kcred()); +		} + +		z->zone_init_status = wstat(why, what); +		z->zone_proc_initpid = -1; +		return (B_FALSE); +	} + +	if (z->zone_restart_init_0) { +		/* +		 * Some init programs in branded zones only want to restart if +		 * they exit 0, otherwise the zone should shutdown. Setting the +		 * "zone_restart_init_0" member controls this behavior. +		 * +		 * In this case we only restart init if it exited successfully. +		 */ +		if (why == CLD_EXITED && what == 0 && +		    restart_init(what, why) == 0) { +			return (B_TRUE); +		} +	} else { +		/* +		 * No restart modifiers on the zone, attempt to restart init. +		 */ +		if (restart_init(what, why) == 0) { +			return (B_TRUE); +		} +	} + + +	/* +	 * The restart failed, the zone will shut down. +	 */ +	z->zone_init_status = wstat(why, what); +	(void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, zone_kcred()); +	z->zone_proc_initpid = -1; +	return (B_FALSE); +} + +/*   * Return value:   *   1 - exitlwps() failed, call (or continue) lwp_exit()   *   0 - restarting init.  Return through system call path @@ -366,45 +502,36 @@ proc_exit(int why, int what)  	}  	mutex_exit(&p->p_lock); -	DTRACE_PROC(lwp__exit); -	DTRACE_PROC1(exit, int, why); +	if (p->p_pid == z->zone_proc_initpid) { +		/* If zone's init restarts, we're done here. */ +		if (zone_init_exit(z, why, what)) +			return (0); +	}  	/* -	 * Will perform any brand specific proc exit processing, since this -	 * is always the last lwp, will also perform lwp_exit and free brand -	 * data +	 * Delay firing probes (and performing brand cleanup) until after the +	 * zone_proc_initpid check. Cases which result in zone shutdown or +	 * restart via zone_kadmin eventually result in a call back to +	 * proc_exit.  	 
*/ -	if (PROC_IS_BRANDED(p)) { -		lwp_detach_brand_hdlrs(lwp); -		brand_clearbrand(p, B_FALSE); -	} +	DTRACE_PROC(lwp__exit); +	DTRACE_PROC1(exit, int, why);  	/* -	 * Don't let init exit unless zone_start_init() failed its exec, or -	 * we are shutting down the zone or the machine. -	 * -	 * Since we are single threaded, we don't need to lock the -	 * following accesses to zone_proc_initpid. +	 * Will perform any brand specific proc exit processing. Since this +	 * is always the last lwp, will also perform lwp exit/free and proc +	 * exit. Brand data will be freed when the process is reaped.  	 */ -	if (p->p_pid == z->zone_proc_initpid) { -		if (z->zone_boot_err == 0 && -		    zone_status_get(z) < ZONE_IS_SHUTTING_DOWN && -		    zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN) { -			if (z->zone_restart_init == B_TRUE) { -				if (restart_init(what, why) == 0) -					return (0); -			} else { -				(void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, -				    CRED()); -			} -		} - +	if (PROC_IS_BRANDED(p)) { +		BROP(p)->b_lwpexit(lwp); +		BROP(p)->b_proc_exit(p);  		/* -		 * Since we didn't or couldn't restart init, we clear -		 * the zone's init state and proceed with exit -		 * processing. +		 * To ensure that b_proc_exit has access to brand-specific data +		 * contained by the one remaining lwp, call the freelwp hook as +		 * the last part of this clean-up process.  		 */ -		z->zone_proc_initpid = -1; +		BROP(p)->b_freelwp(lwp); +		lwp_detach_brand_hdlrs(lwp);  	}  	lwp_pcb_exit(); @@ -565,7 +692,7 @@ proc_exit(int why, int what)  		semexit(p);  	rv = wstat(why, what); -	acct(rv & 0xff); +	acct(rv);  	exacct_commit_proc(p, rv);  	/* @@ -658,10 +785,22 @@ proc_exit(int why, int what)  	if ((q = p->p_child) != NULL && p != proc_init) {  		struct proc	*np;  		struct proc	*initp = proc_init; +		pid_t		zone_initpid = 1; +		struct proc	*zoneinitp = NULL;  		boolean_t	setzonetop = B_FALSE; -		if (!INGLOBALZONE(curproc)) -			setzonetop = B_TRUE; +		if (!INGLOBALZONE(curproc)) { +			zone_initpid = curproc->p_zone->zone_proc_initpid; + +			ASSERT(MUTEX_HELD(&pidlock)); +			zoneinitp = prfind(zone_initpid); +			if (zoneinitp != NULL) { +				initp = zoneinitp; +			} else { +				zone_initpid = 1; +				setzonetop = B_TRUE; +			} +		}  		pgdetach(p); @@ -673,7 +812,8 @@ proc_exit(int why, int what)  			 */  			delete_ns(q->p_parent, q); -			q->p_ppid = 1; +			q->p_ppid = zone_initpid; +  			q->p_pidflag &= ~(CLDNOSIGCHLD | CLDWAITPID);  			if (setzonetop) {  				mutex_enter(&q->p_lock); @@ -847,8 +987,50 @@ proc_exit(int why, int what)  	mutex_exit(&p->p_lock);  	if (!evaporate) { -		p->p_pidflag &= ~CLDPEND; -		sigcld(p, sqp); +		/* +		 * The brand specific code only happens when the brand has a +		 * function to call in place of sigcld and the parent of the +		 * exiting process is not the global zone init. If the parent +		 * is the global zone init, then the process was reparented, +		 * and we don't want brand code delivering possibly strange +		 * signals to init. Also, init is not branded, so any brand +		 * specific exit data will not be picked up by init anyway. +		 */ +		if (PROC_IS_BRANDED(p) && +		    BROP(p)->b_exit_with_sig != NULL && +		    p->p_ppid != 1) { +			/* +			 * The code for _fini that could unload the brand_t +			 * blocks until the count of zones using the module +			 * reaches zero. Zones decrement the refcount on their +			 * brands only after all user tasks in that zone have +			 * exited and been waited on. The decrement on the +			 * brand's refcount happen in zone_destroy(). 
That +			 * depends on zone_shutdown() having been completed. +			 * zone_shutdown() includes a call to zone_empty(), +			 * where the zone waits for itself to reach the state +			 * ZONE_IS_EMPTY. This state is only set in either +			 * zone_shutdown(), when there are no user processes as +			 * the zone enters this function, or in +			 * zone_task_rele(). zone_task_rele() is called from +			 * code triggered by waiting on processes, not by the +			 * processes exiting through proc_exit().  This means +			 * all the branded processes that could exist for a +			 * specific brand_t must exit and get reaped before the +			 * refcount on the brand_t can reach 0. _fini will +			 * never unload the corresponding brand module before +			 * proc_exit finishes execution for all processes +			 * branded with a particular brand_t, which makes the +			 * operation below safe to do. Brands that wish to use +			 * this mechanism must wait in _fini as described +			 * above. +			 */ +			BROP(p)->b_exit_with_sig(p, sqp); +		} else { +			p->p_pidflag &= ~CLDPEND; +			sigcld(p, sqp); +		} +  	} else {  		/*  		 * Do what sigcld() would do if the disposition @@ -927,10 +1109,9 @@ winfo(proc_t *pp, k_siginfo_t *ip, int waitflag)  int  waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)  { -	int found;  	proc_t *cp, *pp; -	int proc_gone;  	int waitflag = !(options & WNOWAIT); +	boolean_t have_brand_helper = B_FALSE;  	/*  	 * Obsolete flag, defined here only for binary compatibility @@ -958,7 +1139,8 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)  	pp = ttoproc(curthread);  	/* -	 * lock parent mutex so that sibling chain can be searched. +	 * Anytime you are looking for a process, you take pidlock to prevent +	 * things from changing as you look.  	 */  	mutex_enter(&pidlock); @@ -978,10 +1160,37 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)  		return (ECHILD);  	} -	while (pp->p_child != NULL) { +	if (PROC_IS_BRANDED(pp) && BROP(pp)->b_waitid_helper != NULL) { +		have_brand_helper = B_TRUE; +	} + +	while (pp->p_child != NULL || have_brand_helper) { +		boolean_t brand_wants_wait = B_FALSE; +		int proc_gone = 0; +		int found = 0; + +		/* +		 * Give the brand a chance to return synthetic results from +		 * this waitid() call before we do the real thing. +		 */ +		if (have_brand_helper) { +			int ret; + +			if (BROP(pp)->b_waitid_helper(idtype, id, ip, options, +			    &brand_wants_wait, &ret) == 0) { +				mutex_exit(&pidlock); +				return (ret); +			} -		proc_gone = 0; +			if (pp->p_child == NULL) { +				goto no_real_children; +			} +		} +		/* +		 * Look for interesting children in the newstate list. +		 */ +		VERIFY(pp->p_child != NULL);  		for (cp = pp->p_child_ns; cp != NULL; cp = cp->p_sibling_ns) {  			if (idtype != P_PID && (cp->p_pidflag & CLDWAITPID))  				continue; @@ -989,6 +1198,11 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)  				continue;  			if (idtype == P_PGID && id != cp->p_pgrp)  				continue; +			if (PROC_IS_BRANDED(pp)) { +				if (BROP(pp)->b_wait_filter != NULL && +				    BROP(pp)->b_wait_filter(pp, cp) == B_FALSE) +					continue; +			}  			switch (cp->p_wcode) { @@ -1033,12 +1247,16 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)  		 * Wow! None of the threads on the p_sibling_ns list were  		 * interesting threads. Check all the kids!  		 
*/ -		found = 0;  		for (cp = pp->p_child; cp != NULL; cp = cp->p_sibling) {  			if (idtype == P_PID && id != cp->p_pid)  				continue;  			if (idtype == P_PGID && id != cp->p_pgrp)  				continue; +			if (PROC_IS_BRANDED(pp)) { +				if (BROP(pp)->b_wait_filter != NULL && +				    BROP(pp)->b_wait_filter(pp, cp) == B_FALSE) +					continue; +			}  			switch (cp->p_wcode) {  			case CLD_TRAPPED: @@ -1107,11 +1325,12 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)  				break;  		} +no_real_children:  		/*  		 * If we found no interesting processes at all,  		 * break out and return ECHILD.  		 */ -		if (found + proc_gone == 0) +		if (!brand_wants_wait && (found + proc_gone == 0))  			break;  		if (options & WNOHANG) { @@ -1130,7 +1349,7 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)  		 * change state while we wait, we don't wait at all.  		 * Get out with ECHILD according to SVID.  		 */ -		if (found == proc_gone) +		if (!brand_wants_wait && (found == proc_gone))  			break;  		if (!cv_wait_sig_swap(&pp->p_cv, &pidlock)) { @@ -1226,6 +1445,12 @@ freeproc(proc_t *p)  		p->p_killsqp = NULL;  	} +	/* Clear any remaining brand data */ +	if (PROC_IS_BRANDED(p)) { +		brand_clearbrand(p, B_FALSE); +	} + +  	prfree(p);	/* inform /proc */  	/* diff --git a/usr/src/uts/common/os/fio.c b/usr/src/uts/common/os/fio.c index 76eddd4e50..41e7e63d2b 100644 --- a/usr/src/uts/common/os/fio.c +++ b/usr/src/uts/common/os/fio.c @@ -21,7 +21,7 @@  /*   * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015, Joyent Inc. + * Copyright 2017, Joyent Inc.   */  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/ @@ -386,6 +386,7 @@ flist_grow(int maxfd)  		dst->uf_flag = src->uf_flag;  		dst->uf_busy = src->uf_busy;  		dst->uf_portfd = src->uf_portfd; +		dst->uf_gen = src->uf_gen;  	}  	/* @@ -487,7 +488,7 @@ free_afd(afd_t *afd)		/* called below and from thread_free() */  		afd->a_fd[i] = -1;  } -static void +void  set_active_fd(int fd)  {  	afd_t *afd = &curthread->t_activefd; @@ -575,13 +576,12 @@ is_active_fd(kthread_t *t, int fd)  }  /* - * Convert a user supplied file descriptor into a pointer to a file - * structure.  Only task is to check range of the descriptor (soft - * resource limit was enforced at open time and shouldn't be checked - * here). + * Convert a user supplied file descriptor into a pointer to a file structure. + * Only task is to check range of the descriptor (soft resource limit was + * enforced at open time and shouldn't be checked here).   */  file_t * -getf(int fd) +getf_gen(int fd, uf_entry_gen_t *genp)  {  	uf_info_t *fip = P_FINFO(curproc);  	uf_entry_t *ufp; @@ -607,6 +607,9 @@ getf(int fd)  		return (NULL);  	}  	ufp->uf_refcnt++; +	if (genp != NULL) { +		*genp = ufp->uf_gen; +	}  	set_active_fd(fd);	/* record the active file descriptor */ @@ -615,6 +618,12 @@ getf(int fd)  	return (fp);  } +file_t * +getf(int fd) +{ +	return (getf_gen(fd, NULL)); +} +  /*   * Close whatever file currently occupies the file descriptor slot   * and install the new file, usually NULL, in the file descriptor slot. 
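The uf_gen counter introduced above is bumped whenever a new file_t is installed in a descriptor slot, and getf_gen() hands it back alongside the hold. A minimal sketch of how a caller might use it to notice that a descriptor was closed and reused between two lookups; the revalidation pattern and error handling below are illustrative assumptions, not code from this change:

	uf_entry_gen_t gen, now;
	file_t *fp;

	if ((fp = getf_gen(fd, &gen)) == NULL)
		return (EBADF);
	/* ... use fp, then drop the hold ... */
	releasef(fd);

	/* Later: confirm that fd still refers to the same open file. */
	if ((fp = getf_gen(fd, &now)) == NULL)
		return (EBADF);
	if (now != gen) {
		/* The slot was recycled for a different file; treat as stale. */
		releasef(fd);
		return (EBADF);
	}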
@@ -667,6 +676,7 @@ closeandsetf(int fd, file_t *newfp)  			ASSERT(ufp->uf_flag == 0);  			fd_reserve(fip, fd, 1);  			ufp->uf_file = newfp; +			ufp->uf_gen++;  			UF_EXIT(ufp);  			mutex_exit(&fip->fi_lock);  			return (0); @@ -852,7 +862,8 @@ flist_fork(uf_info_t *pfip, uf_info_t *cfip)  	 */  	cfip->fi_nfiles = nfiles = flist_minsize(pfip); -	cfip->fi_list = kmem_zalloc(nfiles * sizeof (uf_entry_t), KM_SLEEP); +	cfip->fi_list = nfiles == 0 ? NULL : +	    kmem_zalloc(nfiles * sizeof (uf_entry_t), KM_SLEEP);  	for (fd = 0, pufp = pfip->fi_list, cufp = cfip->fi_list; fd < nfiles;  	    fd++, pufp++, cufp++) { @@ -860,6 +871,7 @@ flist_fork(uf_info_t *pfip, uf_info_t *cfip)  		cufp->uf_alloc = pufp->uf_alloc;  		cufp->uf_flag = pufp->uf_flag;  		cufp->uf_busy = pufp->uf_busy; +		cufp->uf_gen = pufp->uf_gen;  		if (pufp->uf_file == NULL) {  			ASSERT(pufp->uf_flag == 0);  			if (pufp->uf_busy) { @@ -1028,6 +1040,9 @@ ufalloc_file(int start, file_t *fp)  	fd_reserve(fip, fd, 1);  	ASSERT(ufp->uf_file == NULL);  	ufp->uf_file = fp; +	if (fp != NULL) { +		ufp->uf_gen++; +	}  	UF_EXIT(ufp);  	mutex_exit(&fip->fi_lock);  	return (fd); @@ -1183,6 +1198,7 @@ setf(int fd, file_t *fp)  	} else {  		UF_ENTER(ufp, fip, fd);  		ASSERT(ufp->uf_busy); +		ufp->uf_gen++;  	}  	ASSERT(ufp->uf_fpollinfo == NULL);  	ASSERT(ufp->uf_flag == 0); @@ -1212,8 +1228,7 @@ f_getfl(int fd, int *flagp)  			error = EBADF;  		else {  			vnode_t *vp = fp->f_vnode; -			int flag = fp->f_flag | -			    ((fp->f_flag2 & ~FEPOLLED) << 16); +			int flag = fp->f_flag | (fp->f_flag2 << 16);  			/*  			 * BSD fcntl() FASYNC compatibility. diff --git a/usr/src/uts/common/os/fork.c b/usr/src/uts/common/os/fork.c index a63931459f..7e198910b4 100644 --- a/usr/src/uts/common/os/fork.c +++ b/usr/src/uts/common/os/fork.c @@ -21,7 +21,7 @@  /*   * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2013, Joyent, Inc. All rights reserved. + * Copyright 2016, Joyent, Inc.   */  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/ @@ -84,6 +84,7 @@ static int64_t cfork(int, int, int);  static int getproc(proc_t **, pid_t, uint_t);  #define	GETPROC_USER	0x0  #define	GETPROC_KERNEL	0x1 +#define	GETPROC_ZSCHED	0x2  static void fork_fail(proc_t *);  static void forklwp_fail(proc_t *); @@ -705,7 +706,7 @@ fork_fail(proc_t *cp)  	if (PTOU(curproc)->u_cwd)  		refstr_rele(PTOU(curproc)->u_cwd);  	if (PROC_IS_BRANDED(cp)) { -		brand_clearbrand(cp, B_TRUE); +		brand_clearbrand(cp, B_FALSE);  	}  } @@ -754,7 +755,7 @@ forklwp_fail(proc_t *p)  			kmem_free(t->t_door, sizeof (door_data_t));  			t->t_door = NULL;  		} -		lwp_ctmpl_clear(ttolwp(t)); +		lwp_ctmpl_clear(ttolwp(t), B_FALSE);  		/*  		 * Remove the thread from the all threads list. @@ -791,6 +792,9 @@ extern struct as kas;  /*   * fork a kernel process. + * + * Passing a pid argument of -1 indicates that the new process should be + * launched as a child of 'zsched' within the zone.   
*/  int  newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct, @@ -809,6 +813,7 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct,  		rctl_set_t *init_set;  		ASSERT(pid != 1); +		ASSERT(pid >= 0);  		if (getproc(&p, pid, GETPROC_KERNEL) < 0)  			return (EAGAIN); @@ -852,8 +857,18 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct,  		rctl_set_t *init_set;  		task_t *tk, *tk_old;  		klwp_t *lwp; +		boolean_t pzsched = B_FALSE; +		int flag = GETPROC_USER; + +		/* Handle a new user-level thread as child of zsched. */ +		if (pid < 0) { +			VERIFY(curzone != global_zone); +			flag = GETPROC_ZSCHED; +			pzsched = B_TRUE; +			pid = 0; +		} -		if (getproc(&p, pid, GETPROC_USER) < 0) +		if (getproc(&p, pid, flag) < 0)  			return (EAGAIN);  		/*  		 * init creates a new task, distinct from the task @@ -914,7 +929,8 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct,  		}  		t = lwptot(lwp); -		ctp = contract_process_fork(sys_process_tmpl, p, curproc, +		ctp = contract_process_fork(sys_process_tmpl, p, +		    (pzsched ? curproc->p_zone->zone_zsched : curproc),  		    B_FALSE);  		ASSERT(ctp != NULL);  		if (ct != NULL) @@ -955,7 +971,11 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)  	if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN)  		return (-1);	/* no point in starting new processes */ -	pp = (flags & GETPROC_KERNEL) ? &p0 : curproc; +	if (flags & GETPROC_ZSCHED) { +		pp = curproc->p_zone->zone_zsched; +	} else { +		pp = (flags & GETPROC_KERNEL) ? &p0 : curproc; +	}  	task = pp->p_task;  	proj = task->tk_proj;  	zone = pp->p_zone; @@ -1016,6 +1036,9 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)  	cp->p_t1_lgrpid = LGRP_NONE;  	cp->p_tr_lgrpid = LGRP_NONE; +	/* Default to native brand initially */ +	cp->p_brand = &native_brand; +  	if ((newpid = pid_allocate(cp, pid, PID_ALLOC_PROC)) == -1) {  		if (nproc == v.v_proc) {  			CPU_STATS_ADDQ(CPU, sys, procovf, 1); @@ -1083,9 +1106,6 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)  	cp->p_flag = pp->p_flag & (SJCTL|SNOWAIT|SNOCD);  	cp->p_sessp = pp->p_sessp;  	sess_hold(pp); -	cp->p_brand = pp->p_brand; -	if (PROC_IS_BRANDED(pp)) -		BROP(pp)->b_copy_procdata(cp, pp);  	cp->p_bssbase = pp->p_bssbase;  	cp->p_brkbase = pp->p_brkbase;  	cp->p_brksize = pp->p_brksize; @@ -1170,6 +1190,18 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)  	mutex_exit(&cp->p_lock);  	mutex_exit(&pidlock); +	if (PROC_IS_BRANDED(pp)) { +		/* +		 * The only reason why process branding should fail is when +		 * the procedure is complicated by multiple LWPs on the scene. +		 * With an LWP count of 0, this newly allocated process has no +		 * reason to fail branding. +		 */ +		VERIFY0(brand_setbrand(cp, B_FALSE)); + +		BROP(pp)->b_copy_procdata(cp, pp); +	} +  	avl_create(&cp->p_ct_held, contract_compar, sizeof (contract_t),  	    offsetof(contract_t, ct_ctlist)); @@ -1187,6 +1219,7 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)  	 */  	fcnt_add(P_FINFO(pp), 1); +	mutex_enter(&pp->p_lock);  	if (PTOU(pp)->u_cdir) {  		VN_HOLD(PTOU(pp)->u_cdir);  	} else { @@ -1200,6 +1233,7 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)  		VN_HOLD(PTOU(pp)->u_rdir);  	if (PTOU(pp)->u_cwd)  		refstr_hold(PTOU(pp)->u_cwd); +	mutex_exit(&pp->p_lock);  	/*  	 * copy the parent's uarea. 
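The fork.c changes above extend newproc() and getproc() so that a pid argument of -1 creates a user-level process that is parented to the zone's zsched process (the GETPROC_ZSCHED path) rather than to the caller. A hedged sketch of a kernel consumer follows; the function names are hypothetical and the scheduling class and priority are placeholder choices, not taken from this diff.

#include <sys/proc.h>
#include <sys/zone.h>
#include <sys/debug.h>

static void
zsched_child_main(void)
{
	/* Set up and exec the desired user program for this zone here. */
}

static int
spawn_under_zsched(void)
{
	/* Only meaningful from inside a non-global zone. */
	VERIFY(curzone != global_zone);

	/* A pid argument of -1 selects the GETPROC_ZSCHED path in getproc(). */
	return (newproc(zsched_child_main, NULL, defaultcid, minclsyspri,
	    NULL, -1));
}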
diff --git a/usr/src/uts/common/os/grow.c b/usr/src/uts/common/os/grow.c index de2a4f26c4..07fd623a95 100644 --- a/usr/src/uts/common/os/grow.c +++ b/usr/src/uts/common/os/grow.c @@ -21,7 +21,7 @@  /*   * Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright 2017 Joyent, Inc. + * Copyright 2018 Joyent, Inc.   */  /* @@ -55,6 +55,7 @@  #include <sys/fcntl.h>  #include <sys/lwpchan_impl.h>  #include <sys/nbmlock.h> +#include <sys/brand.h>  #include <vm/hat.h>  #include <vm/as.h> @@ -570,6 +571,20 @@ choose_addr(struct as *as, caddr_t *addrp, size_t len, offset_t off,  	return (0);  } +caddr_t +map_userlimit(proc_t *pp, struct as *as, int flags) +{ +	if (flags & _MAP_LOW32) { +		if (PROC_IS_BRANDED(pp) && BROP(pp)->b_map32limit != NULL) { +			return ((caddr_t)(uintptr_t)BROP(pp)->b_map32limit(pp)); +		} else { +			return ((caddr_t)_userlimit32); +		} +	} + +	return (as->a_userlimit); +} +  /*   * Used for MAP_ANON - fast way to get anonymous pages @@ -585,8 +600,6 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,  		return (EACCES);  	if ((flags & MAP_FIXED) != 0) { -		caddr_t userlimit; -  		/*  		 * Use the user address.  First verify that  		 * the address to be used is page aligned. @@ -595,9 +608,8 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,  		if (((uintptr_t)*addrp & PAGEOFFSET) != 0)  			return (EINVAL); -		userlimit = flags & _MAP_LOW32 ? -		    (caddr_t)USERLIMIT32 : as->a_userlimit; -		switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) { +		switch (valid_usr_range(*addrp, len, uprot, as, +		    map_userlimit(as->a_proc, as, flags))) {  		case RANGE_OKAY:  			break;  		case RANGE_BADPROT: @@ -638,7 +650,7 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,  #define	RANDOMIZABLE_MAPPING(addr, flags) (((flags & MAP_FIXED) == 0) && \  	!(((flags & MAP_ALIGN) == 0) && (addr != 0) && aslr_respect_mmap_hint)) -static int +int  smmap_common(caddr_t *addrp, size_t len,      int prot, int flags, struct file *fp, offset_t pos)  { @@ -780,8 +792,6 @@ smmap_common(caddr_t *addrp, size_t len,  	 * If the user specified an address, do some simple checks here  	 */  	if ((flags & MAP_FIXED) != 0) { -		caddr_t userlimit; -  		/*  		 * Use the user address.  First verify that  		 * the address to be used is page aligned. @@ -789,10 +799,8 @@ smmap_common(caddr_t *addrp, size_t len,  		 */  		if (((uintptr_t)*addrp & PAGEOFFSET) != 0)  			return (EINVAL); - -		userlimit = flags & _MAP_LOW32 ? -		    (caddr_t)USERLIMIT32 : as->a_userlimit; -		switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) { +		switch (valid_usr_range(*addrp, len, uprot, as, +		    map_userlimit(curproc, as, flags))) {  		case RANGE_OKAY:  			break;  		case RANGE_BADPROT: diff --git a/usr/src/uts/common/os/id_space.c b/usr/src/uts/common/os/id_space.c deleted file mode 100644 index 2dad0cb940..0000000000 --- a/usr/src/uts/common/os/id_space.c +++ /dev/null @@ -1,159 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. - */ - -#include <sys/types.h> -#include <sys/id_space.h> -#include <sys/debug.h> - -/* - * ID Spaces - * - *   The id_space_t provides a simple implementation of a managed range of - *   integer identifiers using a vmem arena.  An ID space guarantees that the - *   next identifer returned by an allocation is larger than the previous one, - *   unless there are no larger slots remaining in the range.  In this case, - *   the ID space will return the first available slot in the lower part of the - *   range (viewing the previous identifier as a partitioning element).  If no - *   slots are available, id_alloc()/id_allocff() will sleep until an - *   identifier becomes available.  Accordingly, id_space allocations must be - *   initiated from contexts where sleeping is acceptable.  id_alloc_nosleep()/ - *   id_allocff_nosleep() will return -1 if no slots are available or if the - *   system is low on memory.  If id_alloc_nosleep() fails, callers should - *   not try to extend the ID space.  This is to avoid making a possible - *   low-memory situation worse. - * - *   As an ID space is designed for representing a range of id_t's, there - *   is a preexisting maximal range: [0, MAXUID].  ID space requests outside - *   that range will fail on a DEBUG kernel.  The id_allocff*() functions - *   return the first available id, and should be used when there is benefit - *   to having a compact allocated range. - * - *   (Presently, the id_space_t abstraction supports only direct allocations; ID - *   reservation, in which an ID is allocated but placed in a internal - *   dictionary for later use, should be added when a consuming subsystem - *   arrives.) - */ - -#define	ID_TO_ADDR(id) ((void *)(uintptr_t)(id + 1)) -#define	ADDR_TO_ID(addr) ((id_t)((uintptr_t)addr - 1)) - -/* - * Create an arena to represent the range [low, high). - * Caller must be in a context in which VM_SLEEP is legal. - */ -id_space_t * -id_space_create(const char *name, id_t low, id_t high) -{ -	ASSERT(low >= 0); -	ASSERT(low < high); - -	return (vmem_create(name, ID_TO_ADDR(low), high - low, 1, -	    NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER)); -} - -/* - * Destroy a previously created ID space. - * No restrictions on caller's context. - */ -void -id_space_destroy(id_space_t *isp) -{ -	vmem_destroy(isp); -} - -void -id_space_extend(id_space_t *isp, id_t low, id_t high) -{ -	(void) vmem_add(isp, ID_TO_ADDR(low), high - low, VM_SLEEP); -} - -/* - * Allocate an id_t from specified ID space. - * Caller must be in a context in which VM_SLEEP is legal. - */ -id_t -id_alloc(id_space_t *isp) -{ -	return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_SLEEP | VM_NEXTFIT))); -} - -/* - * Allocate an id_t from specified ID space. - * Returns -1 on failure (see module block comments for more information on - * failure modes). - */ -id_t -id_alloc_nosleep(id_space_t *isp) -{ -	return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_NOSLEEP | VM_NEXTFIT))); -} - -/* - * Allocate an id_t from specified ID space using FIRSTFIT. - * Caller must be in a context in which VM_SLEEP is legal. 
- */ -id_t -id_allocff(id_space_t *isp) -{ -	return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_SLEEP | VM_FIRSTFIT))); -} - -/* - * Allocate an id_t from specified ID space using FIRSTFIT - * Returns -1 on failure (see module block comments for more information on - * failure modes). - */ -id_t -id_allocff_nosleep(id_space_t *isp) -{ -	return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_NOSLEEP | VM_FIRSTFIT))); -} - -/* - * Allocate a specific identifier if possible, returning the id if - * successful, or -1 on failure. - */ -id_t -id_alloc_specific_nosleep(id_space_t *isp, id_t id) -{ -	void *minaddr = ID_TO_ADDR(id); -	void *maxaddr = ID_TO_ADDR(id + 1); - -	/* -	 * Note that even though we're vmem_free()ing this later, it -	 * should be OK, since there's no quantum cache. -	 */ -	return (ADDR_TO_ID(vmem_xalloc(isp, 1, 1, 0, 0, -	    minaddr, maxaddr, VM_NOSLEEP))); -} - -/* - * Free a previously allocated ID. - * No restrictions on caller's context. - */ -void -id_free(id_space_t *isp, id_t id) -{ -	vmem_free(isp, ID_TO_ADDR(id), 1); -} diff --git a/usr/src/uts/common/os/ipc.c b/usr/src/uts/common/os/ipc.c index 9381019cd1..6a6f5d84ef 100644 --- a/usr/src/uts/common/os/ipc.c +++ b/usr/src/uts/common/os/ipc.c @@ -20,6 +20,7 @@   */  /*   * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc.   */  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T		*/ @@ -1217,6 +1218,23 @@ ipc_remove(ipc_service_t *service, kipc_perm_t *perm)  	    (IPC_ZONE_USAGE(perm, service) == 0)));  } +/* + * Perform actual IPC_RMID, either via ipc_rmid or due to a delayed *_RMID. + */ +void +ipc_rmsvc(ipc_service_t *service, kipc_perm_t *perm) +{ +	ASSERT(service->ipcs_count > 0); +	ASSERT(MUTEX_HELD(&service->ipcs_lock)); + +	ipc_remove(service, perm); +	mutex_exit(&service->ipcs_lock); + +	/* perform any per-service removal actions */ +	service->ipcs_rmid(perm); + +	ipc_rele(service, perm); +}  /*   * Common code to perform an IPC_RMID.  Returns an errno value on @@ -1247,13 +1265,7 @@ ipc_rmid(ipc_service_t *service, int id, cred_t *cr)  	/*  	 * Nothing can fail from this point on.  	 */ -	ipc_remove(service, perm); -	mutex_exit(&service->ipcs_lock); - -	/* perform any per-service removal actions */ -	service->ipcs_rmid(perm); - -	ipc_rele(service, perm); +	ipc_rmsvc(service, perm);  	return (0);  } diff --git a/usr/src/uts/common/os/kmem.c b/usr/src/uts/common/os/kmem.c index bc0cda418b..ed2c7fc346 100644 --- a/usr/src/uts/common/os/kmem.c +++ b/usr/src/uts/common/os/kmem.c @@ -20,6 +20,7 @@   */  /*   * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2017, Joyent, Inc.   * Copyright (c) 2012, 2017 by Delphix. All rights reserved.   * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.   * Copyright 2018, Joyent, Inc. 
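The ipc.c hunk above factors the tail of IPC_RMID processing into ipc_rmsvc(), so the same removal path can also be driven from a deferred context (the shm.c changes later in this diff use it to destroy a staged segment on last detach). A hedged sketch of the calling contract follows; the wrapper name and its arguments are illustrative.

#include <sys/ipc_impl.h>
#include <sys/debug.h>

static void
remove_staged_object(ipc_service_t *svc, kipc_perm_t *perm)
{
	/*
	 * The caller arrives holding the object's lock and a reference.
	 * ipc_rmsvc() unlinks the object, exits ipcs_lock, runs the
	 * per-service rmid callback, and ipc_rele() then drops one
	 * reference together with the object's lock.
	 */
	mutex_enter(&svc->ipcs_lock);
	ipc_rmsvc(svc, perm);
	ASSERT(!MUTEX_HELD(&svc->ipcs_lock));
}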
@@ -1011,6 +1012,7 @@ size_t kmem_transaction_log_size; /* transaction log size [2% of memory] */  size_t kmem_content_log_size;	/* content log size [2% of memory] */  size_t kmem_failure_log_size;	/* failure log [4 pages per CPU] */  size_t kmem_slab_log_size;	/* slab create log [4 pages per CPU] */ +size_t kmem_zerosized_log_size;	/* zero-sized log [4 pages per CPU] */  size_t kmem_content_maxsave = 256; /* KMF_CONTENTS max bytes to log */  size_t kmem_lite_minsize = 0;	/* minimum buffer size for KMF_LITE */  size_t kmem_lite_maxalign = 1024; /* maximum buffer alignment for KMF_LITE */ @@ -1018,6 +1020,14 @@ int kmem_lite_pcs = 4;		/* number of PCs to store in KMF_LITE mode */  size_t kmem_maxverify;		/* maximum bytes to inspect in debug routines */  size_t kmem_minfirewall;	/* hardware-enforced redzone threshold */ +#ifdef DEBUG +int kmem_warn_zerosized = 1;	/* whether to warn on zero-sized KM_SLEEP */ +#else +int kmem_warn_zerosized = 0;	/* whether to warn on zero-sized KM_SLEEP */ +#endif + +int kmem_panic_zerosized = 0;	/* whether to panic on zero-sized KM_SLEEP */ +  #ifdef _LP64  size_t	kmem_max_cached = KMEM_BIG_MAXBUF;	/* maximum kmem_alloc cache */  #else @@ -1098,6 +1108,7 @@ kmem_log_header_t	*kmem_transaction_log;  kmem_log_header_t	*kmem_content_log;  kmem_log_header_t	*kmem_failure_log;  kmem_log_header_t	*kmem_slab_log; +kmem_log_header_t	*kmem_zerosized_log;  static int		kmem_lite_count; /* # of PCs in kmem_buftag_lite_t */ @@ -2853,8 +2864,33 @@ kmem_alloc(size_t size, int kmflag)  		/* fall through to kmem_cache_alloc() */  	} else { -		if (size == 0) +		if (size == 0) { +			if (kmflag != KM_SLEEP && !(kmflag & KM_PANIC)) +				return (NULL); + +			/* +			 * If this is a sleeping allocation or one that has +			 * been specified to panic on allocation failure, we +			 * consider it to be deprecated behavior to allocate +			 * 0 bytes.  If we have been configured to panic under +			 * this condition, we panic; if to warn, we warn -- and +			 * regardless, we log to the kmem_zerosized_log that +			 * that this condition has occurred (which gives us +			 * enough information to be able to debug it). +			 */ +			if (kmem_panic && kmem_panic_zerosized) +				panic("attempted to kmem_alloc() size of 0"); + +			if (kmem_warn_zerosized) { +				cmn_err(CE_WARN, "kmem_alloc(): sleeping " +				    "allocation with size of 0; " +				    "see kmem_zerosized_log for details"); +			} + +			kmem_log_event(kmem_zerosized_log, NULL, NULL, NULL); +  			return (NULL); +		}  		buf = vmem_alloc(kmem_oversize_arena, size,  		    kmflag & KM_VMFLAGS); @@ -4397,8 +4433,8 @@ kmem_init(void)  	}  	kmem_failure_log = kmem_log_init(kmem_failure_log_size); -  	kmem_slab_log = kmem_log_init(kmem_slab_log_size); +	kmem_zerosized_log = kmem_log_init(kmem_zerosized_log_size);  	/*  	 * Initialize STREAMS message caches so allocb() is available. diff --git a/usr/src/uts/common/os/kstat_fr.c b/usr/src/uts/common/os/kstat_fr.c index 93c04cff8d..b09b2d3558 100644 --- a/usr/src/uts/common/os/kstat_fr.c +++ b/usr/src/uts/common/os/kstat_fr.c @@ -20,7 +20,7 @@   */  /*   * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2014, Joyent, Inc. All rights reserved. + * Copyright 2017, Joyent, Inc. All rights reserved.   * Copyright 2015 Nexenta Systems, Inc. All rights reserved.   
*/ @@ -198,6 +198,9 @@ struct {  	kstat_named_t pagesfree;  	kstat_named_t pageslocked;  	kstat_named_t pagestotal; +	kstat_named_t lowmemscan; +	kstat_named_t zonecapscan; +	kstat_named_t nthrottle;  } system_pages_kstat = {  	{ "physmem",		KSTAT_DATA_ULONG },  	{ "nalloc",		KSTAT_DATA_ULONG }, @@ -219,6 +222,9 @@ struct {  	{ "pagesfree", 		KSTAT_DATA_ULONG },  	{ "pageslocked", 	KSTAT_DATA_ULONG },  	{ "pagestotal",		KSTAT_DATA_ULONG }, +	{ "low_mem_scan",	KSTAT_DATA_ULONG }, +	{ "zone_cap_scan",	KSTAT_DATA_ULONG }, +	{ "n_throttle",		KSTAT_DATA_ULONG },  };  static int header_kstat_update(kstat_t *, int); @@ -912,6 +918,9 @@ system_pages_kstat_update(kstat_t *ksp, int rw)  	system_pages_kstat.pageslocked.value.ul	= (ulong_t)(availrmem_initial -  	    availrmem);  	system_pages_kstat.pagestotal.value.ul	= (ulong_t)total_pages; +	system_pages_kstat.lowmemscan.value.ul	= (ulong_t)low_mem_scan; +	system_pages_kstat.zonecapscan.value.ul	= (ulong_t)zone_cap_scan; +	system_pages_kstat.nthrottle.value.ul	= (ulong_t)n_throttle;  	/*  	 * pp_kernel represents total pages used by the kernel since the  	 * startup. This formula takes into account the boottime kernel diff --git a/usr/src/uts/common/os/lgrp.c b/usr/src/uts/common/os/lgrp.c index 6288f47bed..6f6aced619 100644 --- a/usr/src/uts/common/os/lgrp.c +++ b/usr/src/uts/common/os/lgrp.c @@ -21,6 +21,7 @@  /*   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2018 Joyent, Inc.   */  /* @@ -90,6 +91,7 @@  #include <sys/pg.h>  #include <sys/promif.h>  #include <sys/sdt.h> +#include <sys/ht.h>  lgrp_gen_t	lgrp_gen = 0;		/* generation of lgroup hierarchy */  lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */ @@ -520,6 +522,8 @@ lgrp_main_mp_init(void)  {  	klgrpset_t changed; +	ht_init(); +  	/*  	 * Update lgroup topology (if necessary)  	 */ diff --git a/usr/src/uts/common/os/logsubr.c b/usr/src/uts/common/os/logsubr.c index 149f5f8a88..06c03dd38e 100644 --- a/usr/src/uts/common/os/logsubr.c +++ b/usr/src/uts/common/os/logsubr.c @@ -22,6 +22,7 @@  /*   * Copyright (c) 2013 Gary Mills   * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2019 Joyent, Inc.   */  #include <sys/types.h> @@ -249,8 +250,7 @@ log_init(void)  	 */  	printf("\rSunOS Release %s Version %s %u-bit\n",  	    utsname.release, utsname.version, NBBY * (uint_t)sizeof (void *)); -	printf("Copyright (c) 1983, 2010, Oracle and/or its affiliates. " -	    "All rights reserved.\n"); +	printf("Copyright (c) 2010-2019, Joyent Inc. All rights reserved.\n");  #ifdef DEBUG  	printf("DEBUG enabled\n");  #endif @@ -491,7 +491,7 @@ log_console(log_t *lp, log_ctl_t *lc)  mblk_t *  log_makemsg(int mid, int sid, int level, int sl, int pri, void *msg, -	size_t size, int on_intr) +    size_t size, int on_intr)  {  	mblk_t *mp = NULL;  	mblk_t *mp2; diff --git a/usr/src/uts/common/os/lwp.c b/usr/src/uts/common/os/lwp.c index b2adae570f..341e4ae356 100644 --- a/usr/src/uts/common/os/lwp.c +++ b/usr/src/uts/common/os/lwp.c @@ -25,7 +25,7 @@   */  /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2017, Joyent, Inc.   
*/  #include <sys/param.h> @@ -57,6 +57,8 @@  #include <sys/lgrp.h>  #include <sys/rctl.h>  #include <sys/contract_impl.h> +#include <sys/contract/process.h> +#include <sys/contract/process_impl.h>  #include <sys/cpc_impl.h>  #include <sys/sdt.h>  #include <sys/cmn_err.h> @@ -115,7 +117,7 @@ lwp_create(void (*proc)(), caddr_t arg, size_t len, proc_t *p,  	ret_tidhash_t *ret_tidhash = NULL;  	int i;  	int rctlfail = 0; -	boolean_t branded = 0; +	void *brand_data = NULL;  	struct ctxop *ctx = NULL;  	ASSERT(cid != sysdccid);	/* system threads must start in SYS */ @@ -283,6 +285,19 @@ lwp_create(void (*proc)(), caddr_t arg, size_t len, proc_t *p,  	 */  	lep = kmem_zalloc(sizeof (*lep), KM_SLEEP); +	/* +	 * If necessary, speculatively allocate lwp brand data.  This is done +	 * ahead of time so p_lock need not be dropped during lwp branding. +	 */ +	if (PROC_IS_BRANDED(p) && BROP(p)->b_lwpdata_alloc != NULL) { +		if ((brand_data = BROP(p)->b_lwpdata_alloc(p)) == NULL) { +			mutex_enter(&p->p_lock); +			err = 1; +			atomic_inc_32(&p->p_zone->zone_ffmisc); +			goto error; +		} +	} +  	mutex_enter(&p->p_lock);  grow:  	/* @@ -630,18 +645,6 @@ grow:  		} while (lwp_hash_lookup(p, t->t_tid) != NULL);  	} -	/* -	 * If this is a branded process, let the brand do any necessary lwp -	 * initialization. -	 */ -	if (PROC_IS_BRANDED(p)) { -		if (BROP(p)->b_initlwp(lwp)) { -			err = 1; -			atomic_inc_32(&p->p_zone->zone_ffmisc); -			goto error; -		} -		branded = 1; -	}  	if (t->t_tid == 1) {  		kpreempt_disable(); @@ -654,7 +657,6 @@ grow:  		}  	} -	p->p_lwpcnt++;  	t->t_waitfor = -1;  	/* @@ -696,8 +698,27 @@ grow:  	t->t_post_sys = 1;  	/* +	 * Perform lwp branding +	 * +	 * The b_initlwp hook is _not_ allowed to drop p->p_lock as it must be +	 * continuously held between when the tidhash is sized and when the lwp +	 * is inserted into it.  Operations requiring p->p_lock to be +	 * temporarily dropped can be performed in b_initlwp_post. +	 */ +	if (PROC_IS_BRANDED(p)) { +		BROP(p)->b_initlwp(lwp, brand_data); +		/* +		 * The b_initlwp hook is expected to consume any preallocated +		 * brand_data in a way that prepares it for deallocation by the +		 * b_freelwp hook. +		 */ +		brand_data = NULL; +	} + +	/*  	 * Insert the new thread into the list of all threads.  	 */ +	p->p_lwpcnt++;  	if ((tx = p->p_tlist) == NULL) {  		t->t_back = t;  		t->t_forw = t; @@ -718,6 +739,13 @@ grow:  	lep->le_start = t->t_start;  	lwp_hash_in(p, lep, p->p_tidhash, p->p_tidhash_sz, 1); +	/* +	 * Complete lwp branding +	 */ +	if (PROC_IS_BRANDED(p) && BROP(p)->b_initlwp_post != NULL) { +		BROP(p)->b_initlwp_post(lwp); +	} +  	lwp_fp_init(lwp);  	if (state == TS_RUN) { @@ -755,8 +783,9 @@ error:  		if (cid != NOCLASS && bufp != NULL)  			CL_FREE(cid, bufp); -		if (branded) -			BROP(p)->b_freelwp(lwp); +		if (brand_data != NULL) { +			BROP(p)->b_lwpdata_free(brand_data); +		}  		mutex_exit(&p->p_lock);  		t->t_state = TS_FREE; @@ -829,8 +858,27 @@ lwp_ctmpl_copy(klwp_t *dst, klwp_t *src)  	int i;  	for (i = 0; i < ct_ntypes; i++) { -		dst->lwp_ct_active[i] = ctmpl_dup(src->lwp_ct_active[i]); +		ct_template_t *tmpl = src->lwp_ct_active[i]; + +		/* +		 * If the process contract template is setup to be preserved +		 * across exec, then if we're forking, perform an implicit +		 * template_clear now. This ensures that future children of +		 * this child will remain in the same contract unless they're +		 * explicitly setup differently. We know we're forking if the +		 * two LWPs belong to different processes. 
+		 */ +		if (i == CTT_PROCESS && tmpl != NULL) { +			ctmpl_process_t *ctp = tmpl->ctmpl_data; + +			if (dst->lwp_procp != src->lwp_procp && +			    (ctp->ctp_params & CT_PR_KEEP_EXEC) != 0) +				tmpl = NULL; +		} + +		dst->lwp_ct_active[i] = ctmpl_dup(tmpl);  		dst->lwp_ct_latest[i] = NULL; +  	}  } @@ -838,21 +886,33 @@ lwp_ctmpl_copy(klwp_t *dst, klwp_t *src)   * Clear an LWP's contract template state.   */  void -lwp_ctmpl_clear(klwp_t *lwp) +lwp_ctmpl_clear(klwp_t *lwp, boolean_t is_exec)  {  	ct_template_t *tmpl;  	int i;  	for (i = 0; i < ct_ntypes; i++) { -		if ((tmpl = lwp->lwp_ct_active[i]) != NULL) { -			ctmpl_free(tmpl); -			lwp->lwp_ct_active[i] = NULL; -		} -  		if (lwp->lwp_ct_latest[i] != NULL) {  			contract_rele(lwp->lwp_ct_latest[i]);  			lwp->lwp_ct_latest[i] = NULL;  		} + +		if ((tmpl = lwp->lwp_ct_active[i]) != NULL) { +			/* +			 * If we're exec-ing a new program and the process +			 * contract template is setup to be preserved across +			 * exec, then don't clear it. +			 */ +			if (is_exec && i == CTT_PROCESS) { +				ctmpl_process_t *ctp = tmpl->ctmpl_data; + +				if ((ctp->ctp_params & CT_PR_KEEP_EXEC) != 0) +					continue; +			} + +			ctmpl_free(tmpl); +			lwp->lwp_ct_active[i] = NULL; +		}  	}  } @@ -893,13 +953,6 @@ lwp_exit(void)  	if (t->t_upimutex != NULL)  		upimutex_cleanup(); -	/* -	 * Perform any brand specific exit processing, then release any -	 * brand data associated with the lwp -	 */ -	if (PROC_IS_BRANDED(p)) -		BROP(p)->b_lwpexit(lwp); -  	lwp_pcb_exit();  	mutex_enter(&p->p_lock); @@ -943,6 +996,18 @@ lwp_exit(void)  	DTRACE_PROC(lwp__exit);  	/* +	 * Perform any brand specific exit processing, then release any +	 * brand data associated with the lwp +	 */ +	if (PROC_IS_BRANDED(p)) { +		mutex_exit(&p->p_lock); +		BROP(p)->b_lwpexit(lwp); +		BROP(p)->b_freelwp(lwp); +		mutex_enter(&p->p_lock); +		prbarrier(p); +	} + +	/*  	 * If the lwp is a detached lwp or if the process is exiting,  	 * remove (lwp_hash_out()) the lwp from the lwp directory.  	 * Otherwise null out the lwp's le_thread pointer in the lwp @@ -1103,7 +1168,7 @@ lwp_cleanup(void)  	}  	kpreempt_enable(); -	lwp_ctmpl_clear(ttolwp(t)); +	lwp_ctmpl_clear(ttolwp(t), B_FALSE);  }  int diff --git a/usr/src/uts/common/os/main.c b/usr/src/uts/common/os/main.c index 7bc41b6954..3364d1e523 100644 --- a/usr/src/uts/common/os/main.c +++ b/usr/src/uts/common/os/main.c @@ -158,7 +158,7 @@ exec_init(const char *initpath, const char *args)  	int error = 0, count = 0;  	proc_t *p = ttoproc(curthread);  	klwp_t *lwp = ttolwp(curthread); -	int brand_action; +	int brand_action = EBA_NONE;  	if (args == NULL)  		args = ""; @@ -288,7 +288,15 @@ exec_init(const char *initpath, const char *args)  	 */  	sigemptyset(&curthread->t_hold); -	brand_action = ZONE_IS_BRANDED(p->p_zone) ? EBA_BRAND : EBA_NONE; +	/* +	 * Only instruct exec_common to brand the process if necessary.  It is +	 * possible that the init process is already properly branded due to the +	 * proc_exit -> restart_init -> exec_init call chain. +	 */ +	if (ZONE_IS_BRANDED(p->p_zone) && +	    p->p_brand != p->p_zone->zone_brand) { +		brand_action = EBA_BRAND; +	}  again:  	error = exec_common((const char *)exec_fnamep,  	    (const char **)uap, NULL, brand_action); diff --git a/usr/src/uts/common/os/mem_config.c b/usr/src/uts/common/os/mem_config.c index 3571747e9c..6be46fa422 100644 --- a/usr/src/uts/common/os/mem_config.c +++ b/usr/src/uts/common/os/mem_config.c @@ -21,6 +21,7 @@  /*   * Copyright 2010 Sun Microsystems, Inc.  
All rights reserved.   * Use is subject to license terms. + * Copyright 2017 Joyent, Inc.   */  #include <sys/types.h> @@ -1638,7 +1639,7 @@ delthr_get_freemem(struct mem_handle *mhp)  		 * Put pressure on pageout.  		 */  		page_needfree(free_get); -		cv_signal(&proc_pageout->p_cv); +		WAKE_PAGEOUT_SCANNER();  		mutex_enter(&mhp->mh_mutex);  		(void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex, diff --git a/usr/src/uts/common/os/mmapobj.c b/usr/src/uts/common/os/mmapobj.c index 142c10754e..0410e6f47b 100644 --- a/usr/src/uts/common/os/mmapobj.c +++ b/usr/src/uts/common/os/mmapobj.c @@ -1381,10 +1381,15 @@ calc_loadable(Ehdr *ehdrp, caddr_t phdrbase, int nphdrs, size_t *len,  			}  			if (num_segs++ == 0) {  				/* -				 * The p_vaddr of the first PT_LOAD segment -				 * must either be NULL or within the first -				 * page in order to be interpreted. -				 * Otherwise, its an invalid file. +				 * While ELF doesn't specify the meaning of +				 * p_vaddr for PT_LOAD segments in ET_DYN +				 * objects, we mandate that is either NULL or +				 * (to accommodate some historical binaries) +				 * within the first page.  (Note that there +				 * exist non-native ET_DYN objects that violate +				 * this constraint that we nonetheless must be +				 * able to execute; see the ET_DYN handling in +				 * mapelfexec() for details.)  				 */  				if (e_type == ET_DYN &&  				    ((caddr_t)((uintptr_t)vaddr & diff --git a/usr/src/uts/common/os/modctl.c b/usr/src/uts/common/os/modctl.c index e2a3335eb4..f1003f7834 100644 --- a/usr/src/uts/common/os/modctl.c +++ b/usr/src/uts/common/os/modctl.c @@ -21,6 +21,7 @@  /*   * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2017 Joyent, Inc.   */  /* @@ -3470,6 +3471,11 @@ mod_load(struct modctl *mp, int usepath)  		retval = install_stubs_by_name(mp, mp->mod_modname);  		/* +		 * Perform hotinlines before module is started. +		 */ +		do_hotinlines(mp->mod_mp); + +		/*  		 * Now that the module is loaded, we need to give DTrace  		 * a chance to notify its providers.  This is done via  		 * the dtrace_modload function pointer. diff --git a/usr/src/uts/common/os/modsysfile.c b/usr/src/uts/common/os/modsysfile.c index 3605104ae7..a04294eed5 100644 --- a/usr/src/uts/common/os/modsysfile.c +++ b/usr/src/uts/common/os/modsysfile.c @@ -22,6 +22,7 @@  /*   * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2018 Joyent, Inc.   * Copyright 2017 Nexenta Systems, Inc.   
*/ @@ -57,10 +58,12 @@ struct hwc_class *hcl_head;	/* head of list of classes */  static kmutex_t hcl_lock;	/* for accessing list of classes */  #define	DAFILE		"/etc/driver_aliases" +#define	PPTFILE		"/etc/ppt_aliases"  #define	CLASSFILE	"/etc/driver_classes"  #define	DACFFILE	"/etc/dacf.conf"  static char class_file[] = CLASSFILE; +static char pptfile[] = PPTFILE;  static char dafile[] = DAFILE;  static char dacffile[] = DACFFILE; @@ -2150,14 +2153,13 @@ hwc_parse_now(char *fname, struct par_list **pl, ddi_prop_t **props)  	return (0);	/* always return success */  } -void -make_aliases(struct bind **bhash) +static void +parse_aliases(struct bind **bhash, struct _buf *file)  {  	enum {  		AL_NEW, AL_DRVNAME, AL_DRVNAME_COMMA, AL_ALIAS, AL_ALIAS_COMMA  	} state; -	struct _buf *file;  	char tokbuf[MAXPATHLEN];  	char drvbuf[MAXPATHLEN];  	token_t token; @@ -2166,9 +2168,6 @@ make_aliases(struct bind **bhash)  	static char dupwarn[] = "!Driver alias \"%s\" conflicts with "  	    "an existing driver name or alias."; -	if ((file = kobj_open_file(dafile)) == (struct _buf *)-1) -		return; -  	state = AL_NEW;  	major = DDI_MAJOR_T_NONE;  	while (!done) { @@ -2253,8 +2252,22 @@ make_aliases(struct bind **bhash)  			kobj_file_err(CE_WARN, file, tok_err, tokbuf);  		}  	} +} -	kobj_close_file(file); +void +make_aliases(struct bind **bhash) +{ +	struct _buf *file; + +	if ((file = kobj_open_file(pptfile)) != (struct _buf *)-1) { +		parse_aliases(bhash, file); +		kobj_close_file(file); +	} + +	if ((file = kobj_open_file(dafile)) != (struct _buf *)-1) { +		parse_aliases(bhash, file); +		kobj_close_file(file); +	}  } diff --git a/usr/src/uts/common/os/pid.c b/usr/src/uts/common/os/pid.c index b555bb82b7..eba6147fab 100644 --- a/usr/src/uts/common/os/pid.c +++ b/usr/src/uts/common/os/pid.c @@ -21,6 +21,7 @@  /*   * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc.   */  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/ @@ -112,6 +113,18 @@ pid_lookup(pid_t pid)  	return (pidp);  } +struct pid * +pid_find(pid_t pid) +{ +	struct pid *pidp; + +	mutex_enter(&pidlinklock); +	pidp = pid_lookup(pid); +	mutex_exit(&pidlinklock); + +	return (pidp); +} +  void  pid_setmin(void)  { @@ -522,6 +535,20 @@ sprunlock(proc_t *p)  	THREAD_KPRI_RELEASE();  } +/* + * Undo effects of sprlock but without dropping p->p_lock + */ +void +sprunprlock(proc_t *p) +{ +	ASSERT(p->p_proc_flag & P_PR_LOCK); +	ASSERT(MUTEX_HELD(&p->p_lock)); + +	cv_signal(&pr_pid_cv[p->p_slot]); +	p->p_proc_flag &= ~P_PR_LOCK; +	THREAD_KPRI_RELEASE(); +} +  void  pid_init(void)  { diff --git a/usr/src/uts/common/os/policy.c b/usr/src/uts/common/os/policy.c index d3d362a8a7..861c748cff 100644 --- a/usr/src/uts/common/os/policy.c +++ b/usr/src/uts/common/os/policy.c @@ -56,6 +56,7 @@  #include <sys/mntent.h>  #include <sys/contract_impl.h>  #include <sys/dld_ioc.h> +#include <sys/brand.h>  /*   * There are two possible layers of privilege routines and two possible @@ -1244,6 +1245,22 @@ secpolicy_vnode_owner(const cred_t *cr, uid_t owner)  void  secpolicy_setid_clear(vattr_t *vap, cred_t *cr)  { +	proc_t *p = curproc; + +	/* +	 * Allow the brand to override this behaviour. +	 */ +	if (PROC_IS_BRANDED(p) && BROP(p)->b_setid_clear != NULL) { +		/* +		 * This brand hook will return 0 if handling is complete, or +		 * some other value if the brand would like us to fall back to +		 * the usual behaviour. 
+		 */ +		if (BROP(p)->b_setid_clear(vap, cr) == 0) { +			return; +		} +	} +  	if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0 &&  	    secpolicy_vnode_setid_retain(cr,  	    (vap->va_mode & S_ISUID) != 0 && @@ -2092,6 +2109,13 @@ secpolicy_meminfo(const cred_t *cr)  }  int +secpolicy_fs_import(const cred_t *cr) +{ +	return (PRIV_POLICY(cr, PRIV_SYS_FS_IMPORT, B_FALSE, EPERM, NULL)); +} + + +int  secpolicy_pfexec_register(const cred_t *cr)  {  	return (PRIV_POLICY(cr, PRIV_SYS_ADMIN, B_TRUE, EPERM, NULL)); @@ -2607,3 +2631,11 @@ secpolicy_ppp_config(const cred_t *cr)  		return (secpolicy_net_config(cr, B_FALSE));  	return (PRIV_POLICY(cr, PRIV_SYS_PPP_CONFIG, B_FALSE, EPERM, NULL));  } + +int +secpolicy_hyprlofs_control(const cred_t *cr) +{ +	if (PRIV_POLICY(cr, PRIV_HYPRLOFS_CONTROL, B_FALSE, EPERM, NULL)) +		return (EPERM); +	return (0); +} diff --git a/usr/src/uts/common/os/priv_defs b/usr/src/uts/common/os/priv_defs index bc1787c9ca..854fb602da 100644 --- a/usr/src/uts/common/os/priv_defs +++ b/usr/src/uts/common/os/priv_defs @@ -177,6 +177,10 @@ privilege PRIV_GRAPHICS_MAP  	Allows a process to perform privileged mappings through a  	graphics device. +privilege PRIV_HYPRLOFS_CONTROL + +	Allows a process to manage hyprlofs entries. +  privilege PRIV_IPC_DAC_READ  	Allows a process to read a System V IPC @@ -377,6 +381,10 @@ privilege PRIV_SYS_DEVICES  	Allows a process to open the real console device directly.  	Allows a process to open devices that have been exclusively opened. +privilege PRIV_SYS_FS_IMPORT + +	Allows a process to import a potentially untrusted file system. +  privilege PRIV_SYS_IPC_CONFIG  	Allows a process to increase the size of a System V IPC Message diff --git a/usr/src/uts/common/os/rctl.c b/usr/src/uts/common/os/rctl.c index 09b80323d5..e0a1126567 100644 --- a/usr/src/uts/common/os/rctl.c +++ b/usr/src/uts/common/os/rctl.c @@ -20,6 +20,7 @@   */  /*   * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017, Joyent, Inc.   */  #include <sys/atomic.h> @@ -194,6 +195,8 @@ id_space_t *rctl_ids;  kmem_cache_t *rctl_cache;	/* kmem cache for rctl structures */  kmem_cache_t *rctl_val_cache;	/* kmem cache for rctl values */ +extern rctl_hndl_t rc_process_maxlockedmem; +  kmutex_t rctl_lists_lock;  rctl_dict_entry_t *rctl_lists[RC_MAX_ENTITY + 1]; @@ -2872,12 +2875,12 @@ rctl_init(void)   * rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,   *     int chargeproc)   * - * Increments the amount of locked memory on a project, and - * zone. If proj is non-NULL the project must be held by the - * caller; if it is NULL the proj and zone of proc_t p are used. - * If chargeproc is non-zero, then the charged amount is cached - * on p->p_locked_mem so that the charge can be migrated when a - * process changes projects. + * Increments the amount of locked memory on a process, project, and + * zone. If 'proj' is non-NULL, the project must be held by the + * caller; if it is NULL, the project and zone of process 'p' are used. + * If 'chargeproc' is non-zero, then the charged amount is added + * to p->p_locked_mem. This is also used so that the charge can be + * migrated when a process changes projects.   
*   * Return values   *    0 - success @@ -2895,6 +2898,7 @@ rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,  	ASSERT(p != NULL);  	ASSERT(MUTEX_HELD(&p->p_lock)); +  	if (proj != NULL) {  		projp = proj;  		zonep = proj->kpj_zone; @@ -2938,11 +2942,23 @@ rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,  		}  	} -	zonep->zone_locked_mem += inc; -	projp->kpj_data.kpd_locked_mem += inc;  	if (chargeproc != 0) { +		/* Check for overflow */ +		if ((p->p_locked_mem + inc) < p->p_locked_mem) { +			ret = EAGAIN; +			goto out; +		} +		if (rctl_test_entity(rc_process_maxlockedmem, p->p_rctls, p, +		    &e, inc, 0) & RCT_DENY) { +			ret = EAGAIN; +			goto out; +		} +  		p->p_locked_mem += inc;  	} + +	zonep->zone_locked_mem += inc; +	projp->kpj_data.kpd_locked_mem += inc;  out:  	mutex_exit(&zonep->zone_mem_lock);  	return (ret); diff --git a/usr/src/uts/common/os/rctl_proc.c b/usr/src/uts/common/os/rctl_proc.c index 9b7324fe7b..c62540d2b4 100644 --- a/usr/src/uts/common/os/rctl_proc.c +++ b/usr/src/uts/common/os/rctl_proc.c @@ -21,6 +21,7 @@  /*   * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2017 Joyent, Inc.   */  #include <sys/types.h> @@ -32,6 +33,7 @@  #include <sys/port_kernel.h>  #include <sys/signal.h>  #include <sys/var.h> +#include <sys/policy.h>  #include <sys/vmparam.h>  #include <sys/machparam.h> @@ -66,6 +68,7 @@ rctl_hndl_t rc_process_semmsl;  rctl_hndl_t rc_process_semopm;  rctl_hndl_t rc_process_portev;  rctl_hndl_t rc_process_sigqueue; +rctl_hndl_t rc_process_maxlockedmem;  /*   * process.max-cpu-time / RLIMIT_CPU @@ -212,6 +215,26 @@ static rctl_ops_t proc_vmem_ops = {  };  /* + * process.max-locked-memory + */ +/*ARGSUSED*/ +static int +proc_maxlockedmem_test(struct rctl *r, struct proc *p, rctl_entity_p_t *e, +    struct rctl_val *rv, rctl_qty_t i, uint_t f) +{ +	if (secpolicy_lock_memory(CRED()) == 0) +		return (0); +	return ((p->p_locked_mem + i) > rv->rcv_value); +} + +static rctl_ops_t proc_maxlockedmem_ops = { +	rcop_no_action, +	rcop_no_usage, +	rcop_no_set, +	proc_maxlockedmem_test +}; + +/*   * void rctlproc_default_init()   *   * Overview @@ -383,6 +406,11 @@ rctlproc_init(void)  	rctl_add_default_limit("process.max-sigqueue-size",  	    _SIGQUEUE_SIZE_PRIVILEGED, RCPRIV_PRIVILEGED, RCTL_LOCAL_DENY); +	rc_process_maxlockedmem = rctl_register("process.max-locked-memory", +	    RCENTITY_PROCESS, RCTL_GLOBAL_LOWERABLE | RCTL_GLOBAL_DENY_ALWAYS | +	    RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_BYTES, +	    ULONG_MAX, UINT32_MAX, &proc_maxlockedmem_ops); +  	/*  	 * Place minimal set of controls on "sched" process for inheritance by  	 * processes created via newproc(). diff --git a/usr/src/uts/common/os/sched.c b/usr/src/uts/common/os/sched.c index c1d6569f11..15e77d39f7 100644 --- a/usr/src/uts/common/os/sched.c +++ b/usr/src/uts/common/os/sched.c @@ -27,6 +27,10 @@  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/  /*	  All Rights Reserved	*/ +/* + * Copyright (c) 2015, Joyent, Inc.  All rights reserved. + */ +  #include <sys/param.h>  #include <sys/types.h>  #include <sys/sysmacros.h> @@ -646,16 +650,17 @@ top:  		klwp_t *lwp = ttolwp(tp);  		/* -		 * Swapout eligible lwps (specified by the scheduling -		 * class) which don't have TS_DONT_SWAP set.  Set the -		 * "intent to swap" flag (TS_SWAPENQ) on threads -		 * which have TS_DONT_SWAP set so that they can be +		 * Swapout eligible lwps (specified by the scheduling class) +		 * which don't have TS_DONT_SWAP set.  
Set the "intent to swap" +		 * flag (TS_SWAPENQ) on threads which have either TS_DONT_SWAP +		 * set or are currently on a split stack so that they can be  		 * swapped if and when they reach a safe point.  		 */  		thread_lock(tp);  		thread_pri = CL_SWAPOUT(tp, swapflags);  		if (thread_pri != -1) { -			if (tp->t_schedflag & TS_DONT_SWAP) { +			if ((tp->t_schedflag & TS_DONT_SWAP) || +			    (tp->t_flag & T_SPLITSTK)) {  				tp->t_schedflag |= TS_SWAPENQ;  				tp->t_trapret = 1;  				aston(tp); diff --git a/usr/src/uts/common/os/schedctl.c b/usr/src/uts/common/os/schedctl.c index 5721083751..18b396a765 100644 --- a/usr/src/uts/common/os/schedctl.c +++ b/usr/src/uts/common/os/schedctl.c @@ -22,6 +22,7 @@  /*   * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2016 Joyent, Inc.   */  #include <sys/types.h> @@ -327,12 +328,17 @@ schedctl_sigblock(kthread_t *t)  /* - * If the sc_sigblock field is set for the specified thread, set - * its signal mask to block all maskable signals, then clear the - * sc_sigblock field.  This finishes what user-level code requested - * to be done when it set tdp->sc_shared->sc_sigblock non-zero. - * Called from signal-related code either by the current thread for - * itself or by a thread that holds the process's p_lock (/proc code). + * If the sc_sigblock field is set for the specified thread, set its signal + * mask to block all maskable signals, then clear the sc_sigblock field.  This + * accomplishes what user-level code requested to be done when it set + * tdp->sc_shared->sc_sigblock non-zero. + * + * This is generally called by signal-related code in the current thread.  In + * order to call against a thread other than curthread, p_lock for the + * containing process must be held.  Even then, the caller is not protected + * from races with the thread in question updating its own fields.  It is the + * responsibility of the caller to perform additional synchronization. + *   */  void  schedctl_finish_sigblock(kthread_t *t) diff --git a/usr/src/uts/common/os/shm.c b/usr/src/uts/common/os/shm.c index bacc595f78..5deae96d73 100644 --- a/usr/src/uts/common/os/shm.c +++ b/usr/src/uts/common/os/shm.c @@ -21,6 +21,7 @@  /*   * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc.   */  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -319,6 +320,7 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp)  		size_t	share_size;  		struct	shm_data ssd;  		uintptr_t align_hint; +		long	curprot;  		/*  		 * Pick a share pagesize to use, if (!isspt(sp)). 
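Stepping back to the process.max-locked-memory resource control registered in rctl_proc.c above: it is a per-process, byte-valued control enforced by rctl_incr_locked_mem(), and it is manageable through the standard resource-control interfaces. A hedged userland sketch follows; the 64MB value is arbitrary, and inserting a privileged value like this requires the appropriate resource-management privilege.

#include <rctl.h>
#include <stdio.h>
#include <stdlib.h>

int
limit_locked_memory(void)
{
	rctlblk_t *blk;

	if ((blk = calloc(1, rctlblk_size())) == NULL)
		return (-1);

	/* Deny locking more than 64MB of memory in this process. */
	rctlblk_set_value(blk, 64ULL * 1024 * 1024);
	rctlblk_set_privilege(blk, RCPRIV_PRIVILEGED);
	rctlblk_set_local_action(blk, RCTL_LOCAL_DENY, 0);

	if (setrctl("process.max-locked-memory", NULL, blk,
	    RCTL_INSERT) != 0) {
		perror("setrctl");
		free(blk);
		return (-1);
	}

	free(blk);
	return (0);
}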
@@ -453,6 +455,7 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp)  			}  		} +		curprot = sp->shm_opts & SHM_PROT_MASK;  		if (!isspt(sp)) {  			error = sptcreate(size, &segspt, sp->shm_amp, prot,  			    flags, share_szc); @@ -462,8 +465,8 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp)  			}  			sp->shm_sptinfo->sptas = segspt->s_as;  			sp->shm_sptseg = segspt; -			sp->shm_sptprot = prot; -		} else if ((prot & sp->shm_sptprot) != sp->shm_sptprot) { +			sp->shm_opts = (sp->shm_opts & ~SHM_PROT_MASK) | prot; +		} else if ((prot & curprot) != curprot) {  			/*  			 * Ensure we're attaching to an ISM segment with  			 * fewer or equal permissions than what we're @@ -748,6 +751,23 @@ shmctl(int shmid, int cmd, void *arg)  		}  		break; +	/* Stage segment for removal, but don't remove until last detach */ +	case SHM_RMID: +		if ((error = secpolicy_ipc_owner(cr, (kipc_perm_t *)sp)) != 0) +			break; + +		/* +		 * If attached, just mark it as a pending remove, otherwise +		 * we must perform the normal ipc_rmid now. +		 */ +		if ((sp->shm_perm.ipc_ref - 1) > 0) { +			sp->shm_opts |= SHM_RM_PENDING; +		} else { +			mutex_exit(lock); +			return (ipc_rmid(shm_svc, shmid, cr)); +		} +		break; +  	default:  		error = EINVAL;  		break; @@ -778,6 +798,23 @@ shm_detach(proc_t *pp, segacct_t *sap)  		sp->shm_ismattch--;  	sp->shm_dtime = gethrestime_sec();  	sp->shm_lpid = pp->p_pid; +	if ((sp->shm_opts & SHM_RM_PENDING) != 0 && +	    sp->shm_perm.ipc_ref == 2) { +		/* +		 * If this is the last detach of the segment across the whole +		 * system then now we can perform the delayed IPC_RMID. +		 * The ipc_ref count has 1 for the original 'get' and one for +		 * each 'attach' (see 'stat' handling in shmctl). +		 */ +		sp->shm_opts &= ~SHM_RM_PENDING; +		mutex_enter(&shm_svc->ipcs_lock); +		ipc_rmsvc(shm_svc, (kipc_perm_t *)sp);	/* Drops lock */ +		ASSERT(!MUTEX_HELD(&shm_svc->ipcs_lock)); +		ASSERT(((kipc_perm_t *)sp)->ipc_ref > 0); + +		/* Lock was dropped, need to retake it for following rele. */ +		(void) ipc_lock(shm_svc, sp->shm_perm.ipc_id); +	}  	ipc_rele(shm_svc, (kipc_perm_t *)sp);	/* Drops lock */  	kmem_free(sap, sizeof (segacct_t)); diff --git a/usr/src/uts/common/os/sig.c b/usr/src/uts/common/os/sig.c index 453b1f22d4..67a93581dd 100644 --- a/usr/src/uts/common/os/sig.c +++ b/usr/src/uts/common/os/sig.c @@ -22,7 +22,7 @@  /*   * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. - * Copyright (c) 2014, Joyent, Inc.  All rights reserved. + * Copyright 2017, Joyent, Inc.   */  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/ @@ -60,6 +60,7 @@  #include <sys/cyclic.h>  #include <sys/dtrace.h>  #include <sys/sdt.h> +#include <sys/brand.h>  #include <sys/signalfd.h>  const k_sigset_t nullsmask = {0, 0, 0}; @@ -148,6 +149,21 @@ signal_is_blocked(kthread_t *t, int sig)  }  /* + * Return true if the signal can safely be ignored. + * That is, if the signal is included in the p_ignore mask and doing so is not + * forbidden by any process branding. + */ +static int +sig_ignorable(proc_t *p, klwp_t *lwp, int sig) +{ +	return (sigismember(&p->p_ignore, sig) &&	/* sig in ignore mask */ +	    !(PROC_IS_BRANDED(p) &&			/* allowed by brand */ +	    BROP(p)->b_sig_ignorable != NULL && +	    BROP(p)->b_sig_ignorable(p, lwp, sig) == B_FALSE)); + +} + +/*   * Return true if the signal can safely be discarded on generation.   * That is, if there is no need for the signal on the receiving end.   
* The answer is true if the process is a zombie or @@ -159,12 +175,13 @@ signal_is_blocked(kthread_t *t, int sig)   *	the signal is not being accepted via sigwait()   */  static int -sig_discardable(proc_t *p, int sig) +sig_discardable(proc_t *p, kthread_t *tp, int sig)  {  	kthread_t *t = p->p_tlist; +	klwp_t *lwp = (tp == NULL) ? NULL : tp->t_lwp;  	return (t == NULL ||		/* if zombie or ... */ -	    (sigismember(&p->p_ignore, sig) &&	/* signal is ignored */ +	    (sig_ignorable(p, lwp, sig) &&		/* signal is ignored */  	    t->t_forw == t &&			/* and single-threaded */  	    !tracing(p, sig) &&			/* and no /proc tracing */  	    !signal_is_blocked(t, sig) &&	/* and signal not blocked */ @@ -200,7 +217,7 @@ eat_signal(kthread_t *t, int sig)  		    !(ttoproc(t)->p_proc_flag & P_PR_LOCK)) {  			ttoproc(t)->p_stopsig = 0;  			t->t_dtrace_stop = 0; -			t->t_schedflag |= TS_XSTART | TS_PSTART; +			t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART;  			setrun_locked(t);  		} else if (t != curthread && t->t_state == TS_ONPROC) {  			aston(t);	/* make it do issig promptly */ @@ -297,7 +314,7 @@ sigtoproc(proc_t *p, kthread_t *t, int sig)  		}  	} -	if (sig_discardable(p, sig)) { +	if (sig_discardable(p, t, sig)) {  		DTRACE_PROC3(signal__discard, kthread_t *, p->p_tlist,  		    proc_t *, p, int, sig);  		return; @@ -497,7 +514,7 @@ issig_justlooking(void)  			if (sigismember(&set, sig) &&  			    (tracing(p, sig) ||  			    sigismember(&t->t_sigwait, sig) || -			    !sigismember(&p->p_ignore, sig))) { +			    !sig_ignorable(p, lwp, sig))) {  				/*  				 * Don't promote a signal that will stop  				 * the process when lwp_nostop is set. @@ -623,6 +640,28 @@ issig_forreal(void)  		}  		/* +		 * The brand hook name 'b_issig_stop' is a misnomer. +		 * Allow the brand the chance to alter (or suppress) delivery +		 * of this signal. +		 */ +		if (PROC_IS_BRANDED(p) && BROP(p)->b_issig_stop != NULL) { +			int r; + +			/* +			 * The brand hook will return 0 if it would like +			 * us to drive on, -1 if we should restart +			 * the loop to check other conditions, or 1 if we +			 * should terminate the loop. +			 */ +			r = BROP(p)->b_issig_stop(p, lwp); +			if (r < 0) { +				continue; +			} else if (r > 0) { +				break; +			} +		} + +		/*  		 * Honor requested stop before dealing with the  		 * current signal; a debugger may change it.  		 * Do not want to go back to loop here since this is a special @@ -656,7 +695,7 @@ issig_forreal(void)  			lwp->lwp_cursig = 0;  			lwp->lwp_extsig = 0;  			if (sigismember(&t->t_sigwait, sig) || -			    (!sigismember(&p->p_ignore, sig) && +			    (!sig_ignorable(p, lwp, sig) &&  			    !isjobstop(sig))) {  				if (p->p_flag & (SEXITLWPS|SKILLED)) {  					sig = SIGKILL; @@ -708,7 +747,7 @@ issig_forreal(void)  				toproc = 0;  				if (tracing(p, sig) ||  				    sigismember(&t->t_sigwait, sig) || -				    !sigismember(&p->p_ignore, sig)) { +				    !sig_ignorable(p, lwp, sig)) {  					if (sigismember(&t->t_extsig, sig))  						ext = 1;  					break; @@ -722,7 +761,7 @@ issig_forreal(void)  				toproc = 1;  				if (tracing(p, sig) ||  				    sigismember(&t->t_sigwait, sig) || -				    !sigismember(&p->p_ignore, sig)) { +				    !sig_ignorable(p, lwp, sig)) {  					if (sigismember(&p->p_extsig, sig))  						ext = 1;  					break; @@ -954,6 +993,16 @@ stop(int why, int what)  		}  		break; +	case PR_BRAND: +		/* +		 * We have been stopped by the brand code for a brand-private +		 * reason.  This is an asynchronous stop affecting only this +		 * LWP. 
+		 */ +		VERIFY(PROC_IS_BRANDED(p)); +		flags &= ~TS_BSTART; +		break; +  	default:	/* /proc stop */  		flags &= ~TS_PSTART;  		/* @@ -1065,7 +1114,7 @@ stop(int why, int what)  		}  	} -	if (why != PR_JOBCONTROL && why != PR_CHECKPOINT) { +	if (why != PR_JOBCONTROL && why != PR_CHECKPOINT && why != PR_BRAND) {  		/*  		 * Do process-level notification when all lwps are  		 * either stopped on events of interest to /proc @@ -1171,6 +1220,13 @@ stop(int why, int what)  	if (why == PR_CHECKPOINT)  		del_one_utstop(); +	/* +	 * Allow the brand to post notification of this stop condition. +	 */ +	if (PROC_IS_BRANDED(p) && BROP(p)->b_stop_notify != NULL) { +		BROP(p)->b_stop_notify(p, lwp, why, what); +	} +  	thread_lock(t);  	ASSERT((t->t_schedflag & TS_ALLSTART) == 0);  	t->t_schedflag |= flags; @@ -1192,7 +1248,7 @@ stop(int why, int what)  		    (p->p_flag & (SEXITLWPS|SKILLED))) {  			p->p_stopsig = 0;  			thread_lock(t); -			t->t_schedflag |= TS_XSTART | TS_PSTART; +			t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART;  			setrun_locked(t);  			thread_unlock_nopreempt(t);  		} else if (why == PR_JOBCONTROL) { @@ -1327,7 +1383,7 @@ psig(void)  	 * this signal from pending to current (we dropped p->p_lock).  	 * This can happen only in a multi-threaded process.  	 */ -	if (sigismember(&p->p_ignore, sig) || +	if (sig_ignorable(p, lwp, sig) ||  	    (func == SIG_DFL && sigismember(&stopdefault, sig))) {  		lwp->lwp_cursig = 0;  		lwp->lwp_extsig = 0; @@ -1771,9 +1827,12 @@ post_sigcld(proc_t *cp, sigqueue_t *sqp)  			/*  			 * This can only happen when the parent is init.  			 * (See call to sigcld(q, NULL) in exit().) -			 * Use KM_NOSLEEP to avoid deadlock. +			 * Use KM_NOSLEEP to avoid deadlock. The child procs +			 * initpid can be 1 for zlogin.  			 */ -			ASSERT(pp == proc_init); +			ASSERT(pp->p_pidp->pid_id == +			    cp->p_zone->zone_proc_initpid || +			    pp->p_pidp->pid_id == 1);  			winfo(cp, &info, 0);  			sigaddq(pp, NULL, &info, KM_NOSLEEP);  		} else { @@ -1804,6 +1863,15 @@ sigcld_repost()  	sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);  	mutex_enter(&pidlock); +	if (PROC_IS_BRANDED(pp) && BROP(pp)->b_sigcld_repost != NULL) { +		/* +		 * Allow the brand to inject synthetic SIGCLD signals. +		 */ +		if (BROP(pp)->b_sigcld_repost(pp, sqp) == 0) { +			mutex_exit(&pidlock); +			return; +		} +	}  	for (cp = pp->p_child; cp; cp = cp->p_sibling) {  		if (cp->p_pidflag & CLDPEND) {  			post_sigcld(cp, sqp); @@ -2115,7 +2183,7 @@ sigaddqa(proc_t *p, kthread_t *t, sigqueue_t *sigqp)  	ASSERT(MUTEX_HELD(&p->p_lock));  	ASSERT(sig >= 1 && sig < NSIG); -	if (sig_discardable(p, sig)) +	if (sig_discardable(p, t, sig))  		siginfofree(sigqp);  	else  		sigaddqins(p, t, sigqp); @@ -2141,7 +2209,7 @@ sigaddq(proc_t *p, kthread_t *t, k_siginfo_t *infop, int km_flags)  	 * blocking the signal (it *could* change it's mind while  	 * the signal is pending) then don't bother creating one.  	 */ -	if (!sig_discardable(p, sig) && +	if (!sig_discardable(p, t, sig) &&  	    (sigismember(&p->p_siginfo, sig) ||  	    (curproc->p_ct_process != p->p_ct_process) ||  	    (sig == SIGCLD && SI_FROMKERNEL(infop))) && diff --git a/usr/src/uts/common/os/smb_subr.c b/usr/src/uts/common/os/smb_subr.c index 6084676b17..6dc7230bed 100644 --- a/usr/src/uts/common/os/smb_subr.c +++ b/usr/src/uts/common/os/smb_subr.c @@ -25,7 +25,9 @@   * Use is subject to license terms.   */ -#pragma ident	"%Z%%M%	%I%	%E% SMI" +/* + * Copyright (c) 2015 Joyent, Inc.  All rights reserved. 
+ */  #include <sys/smbios_impl.h>  #include <sys/cmn_err.h> @@ -43,13 +45,13 @@ smb_strerror(int err)  void *  smb_alloc(size_t len)  { -	return (kmem_alloc(len, KM_SLEEP)); +	return (len > 0 ? kmem_alloc(len, KM_SLEEP) : NULL);  }  void *  smb_zalloc(size_t len)  { -	return (kmem_zalloc(len, KM_SLEEP)); +	return (len > 0 ? kmem_zalloc(len, KM_SLEEP) : NULL);  }  void diff --git a/usr/src/uts/common/os/streamio.c b/usr/src/uts/common/os/streamio.c index d4c2f7023d..68afeef013 100644 --- a/usr/src/uts/common/os/streamio.c +++ b/usr/src/uts/common/os/streamio.c @@ -78,6 +78,7 @@  #include <sys/policy.h>  #include <sys/dld.h>  #include <sys/zone.h> +#include <sys/limits.h>  #include <c2/audit.h>  /* @@ -986,12 +987,20 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first,  		 * (registered in sd_wakeq).  		 */  		struiod_t uiod; +		struct iovec buf[IOV_MAX_STACK]; +		int iovlen = 0;  		if (first)  			stp->sd_wakeq &= ~RSLEEP; -		(void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, -		    sizeof (uiod.d_iov) / sizeof (*uiod.d_iov)); +		if (uiop->uio_iovcnt > IOV_MAX_STACK) { +			iovlen = uiop->uio_iovcnt * sizeof (iovec_t); +			uiod.d_iov = kmem_alloc(iovlen, KM_SLEEP); +		} else { +			uiod.d_iov = buf; +		} + +		(void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, uiop->uio_iovcnt);  		uiod.d_mp = 0;  		/*  		 * Mark that a thread is in rwnext on the read side @@ -1030,6 +1039,8 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first,  			if ((bp = uiod.d_mp) != NULL) {  				*errorp = 0;  				ASSERT(MUTEX_HELD(&stp->sd_lock)); +				if (iovlen != 0) +					kmem_free(uiod.d_iov, iovlen);  				return (bp);  			}  			error = 0; @@ -1049,8 +1060,14 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first,  		} else {  			*errorp = error;  			ASSERT(MUTEX_HELD(&stp->sd_lock)); +			if (iovlen != 0) +				kmem_free(uiod.d_iov, iovlen);  			return (NULL);  		} + +		if (iovlen != 0) +			kmem_free(uiod.d_iov, iovlen); +  		/*  		 * Try a getq in case a rwnext() generated mblk  		 * has bubbled up via strrput(). @@ -2545,6 +2562,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize,      int b_flag, int pri, int flags)  {  	struiod_t uiod; +	struct iovec buf[IOV_MAX_STACK]; +	int iovlen = 0;  	mblk_t *mp;  	queue_t *wqp = stp->sd_wrq;  	int error = 0; @@ -2636,13 +2655,21 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize,  	mp->b_flag |= b_flag;  	mp->b_band = (uchar_t)pri; -	(void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, -	    sizeof (uiod.d_iov) / sizeof (*uiod.d_iov)); +	if (uiop->uio_iovcnt > IOV_MAX_STACK) { +		iovlen = uiop->uio_iovcnt * sizeof (iovec_t); +		uiod.d_iov = (struct iovec *)kmem_alloc(iovlen, KM_SLEEP); +	} else { +		uiod.d_iov = buf; +	} + +	(void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, uiop->uio_iovcnt);  	uiod.d_uio.uio_offset = 0;  	uiod.d_mp = mp;  	error = rwnext(wqp, &uiod);  	if (! 
uiod.d_mp) {  		uioskip(uiop, *iosize); +		if (iovlen != 0) +			kmem_free(uiod.d_iov, iovlen);  		return (error);  	}  	ASSERT(mp == uiod.d_mp); @@ -2660,17 +2687,23 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize,  		error = 0;  	} else {  		freemsg(mp); +		if (iovlen != 0) +			kmem_free(uiod.d_iov, iovlen);  		return (error);  	}  	/* Have to check canput before consuming data from the uio */  	if (pri == 0) {  		if (!canputnext(wqp) && !(flags & MSG_IGNFLOW)) {  			freemsg(mp); +			if (iovlen != 0) +				kmem_free(uiod.d_iov, iovlen);  			return (EWOULDBLOCK);  		}  	} else {  		if (!bcanputnext(wqp, pri) && !(flags & MSG_IGNFLOW)) {  			freemsg(mp); +			if (iovlen != 0) +				kmem_free(uiod.d_iov, iovlen);  			return (EWOULDBLOCK);  		}  	} @@ -2678,6 +2711,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize,  	/* Copyin data from the uio */  	if ((error = struioget(wqp, mp, &uiod, 0)) != 0) {  		freemsg(mp); +		if (iovlen != 0) +			kmem_free(uiod.d_iov, iovlen);  		return (error);  	}  	uioskip(uiop, *iosize); @@ -2694,6 +2729,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize,  		putnext(wqp, mp);  		stream_runservice(stp);  	} +	if (iovlen != 0) +		kmem_free(uiod.d_iov, iovlen);  	return (0);  } @@ -3179,6 +3216,7 @@ job_control_type(int cmd)  	case JAGENT:	/* Obsolete */  	case JTRUN:	/* Obsolete */  	case JXTPROTO:	/* Obsolete */ +	case TIOCSETLD:  		return (JCSETP);  	} diff --git a/usr/src/uts/common/os/strsubr.c b/usr/src/uts/common/os/strsubr.c index 1ffb561428..ac1ee2d1ce 100644 --- a/usr/src/uts/common/os/strsubr.c +++ b/usr/src/uts/common/os/strsubr.c @@ -26,6 +26,7 @@   * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms.   * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright 2018 Joyent, Inc.   * Copyright 2018 OmniOS Community Edition (OmniOSce) Association.   */ @@ -8470,6 +8471,12 @@ mblk_copycred(mblk_t *mp, const mblk_t *src)  		dbp->db_cpid = cpid;  } + +/* + * Now that NIC drivers are expected to deal only with M_DATA mblks, the + * hcksum_assoc and hcksum_retrieve functions are deprecated in favor of their + * respective mac_hcksum_set and mac_hcksum_get counterparts. + */  int  hcksum_assoc(mblk_t *mp,  multidata_t *mmd, pdesc_t *pd,      uint32_t start, uint32_t stuff, uint32_t end, uint32_t value, diff --git a/usr/src/uts/common/os/sunddi.c b/usr/src/uts/common/os/sunddi.c index c39819156d..e0cc20fa45 100644 --- a/usr/src/uts/common/os/sunddi.c +++ b/usr/src/uts/common/os/sunddi.c @@ -5903,6 +5903,12 @@ ddi_ffs(long mask)  	return (ffs(mask));  } +int +ddi_ffsll(long long mask) +{ +	return (ffs(mask)); +} +  /*   * Find last bit set. Take mask and clear   * all but the most significant bit, and @@ -5914,8 +5920,14 @@ ddi_ffs(long mask)  int  ddi_fls(long mask)  { +	return (ddi_flsll(mask)); +} + +int +ddi_flsll(long long mask) +{  	while (mask) { -		long nx; +		long long nx;  		if ((nx = (mask & (mask - 1))) == 0)  			break; diff --git a/usr/src/uts/common/os/sysent.c b/usr/src/uts/common/os/sysent.c index fb8bf07077..fb64000e4d 100644 --- a/usr/src/uts/common/os/sysent.c +++ b/usr/src/uts/common/os/sysent.c @@ -23,6 +23,7 @@   * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.   * Copyright 2012 Milan Jurik. All rights reserved.   * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc.   * Copyright (c) 2018, Joyent, Inc.   
*/ @@ -61,8 +62,7 @@ struct mmaplf32a;  int	access(char *, int);  int	alarm(int);  int	auditsys(struct auditcalls *, rval_t *); -int64_t	brandsys(int, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, -    uintptr_t); +int64_t	brandsys(int, uintptr_t, uintptr_t, uintptr_t, uintptr_t);  intptr_t	brk(caddr_t);  int	chdir(char *);  int	chmod(char *, int); @@ -647,7 +647,7 @@ struct sysent sysent[NSYSCALL] =  			SYSENT_NOSYS(),  			SYSENT_C("llseek",	llseek32,	4)),  	/* 176 */ SYSENT_LOADABLE(),		/* inst_sync */ -	/* 177 */ SYSENT_CI("brandsys",		brandsys,	6), +	/* 177 */ SYSENT_CI("brandsys",		brandsys,	5),  	/* 178 */ SYSENT_LOADABLE(),		/* kaio */  	/* 179 */ SYSENT_LOADABLE(),		/* cpc */  	/* 180 */ SYSENT_CI("lgrpsys",		lgrpsys,	3), @@ -1002,7 +1002,7 @@ struct sysent sysent32[NSYSCALL] =  	/* 174 */ SYSENT_CI("pwrite",		pwrite32,		4),  	/* 175 */ SYSENT_C("llseek",		llseek32,	4),  	/* 176 */ SYSENT_LOADABLE32(),		/* inst_sync */ -	/* 177 */ SYSENT_CI("brandsys",		brandsys,	6), +	/* 177 */ SYSENT_CI("brandsys",		brandsys,	5),  	/* 178 */ SYSENT_LOADABLE32(),		/* kaio */  	/* 179 */ SYSENT_LOADABLE32(),		/* cpc */  	/* 180 */ SYSENT_CI("lgrpsys",		lgrpsys,	3), @@ -1094,18 +1094,20 @@ char **syscallnames;  systrace_sysent_t *systrace_sysent;  void (*systrace_probe)(dtrace_id_t, uintptr_t, uintptr_t, -    uintptr_t, uintptr_t, uintptr_t, uintptr_t); +    uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t);  /*ARGSUSED*/  void  systrace_stub(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, -    uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) +    uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, +    uintptr_t arg6, uintptr_t arg7)  {}  /*ARGSUSED*/  int64_t  dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, -    uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) +    uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6, +    uintptr_t arg7)  {  	systrace_sysent_t *sy = &systrace_sysent[curthread->t_sysnum];  	dtrace_id_t id; @@ -1113,7 +1115,8 @@ dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,  	proc_t *p;  	if ((id = sy->stsy_entry) != DTRACE_IDNONE) -		(*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5); +		(*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5, +		    arg6, arg7);  	/*  	 * We want to explicitly allow DTrace consumers to stop a process @@ -1127,14 +1130,15 @@ dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,  	}  	mutex_exit(&p->p_lock); -	rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5); +	rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5, +	    arg6, arg7);  	if (ttolwp(curthread)->lwp_errno != 0)  		rval = -1;  	if ((id = sy->stsy_return) != DTRACE_IDNONE)  		(*systrace_probe)(id, (uintptr_t)rval, (uintptr_t)rval, -		    (uintptr_t)((int64_t)rval >> 32), 0, 0, 0); +		    (uintptr_t)((int64_t)rval >> 32), 0, 0, 0, 0, 0);  	return (rval);  } @@ -1146,7 +1150,8 @@ systrace_sysent_t *systrace_sysent32;  /*ARGSUSED*/  int64_t  dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, -    uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) +    uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6, +    uintptr_t arg7)  {  	systrace_sysent_t *sy = &systrace_sysent32[curthread->t_sysnum];  	dtrace_id_t id; @@ -1154,7 +1159,8 @@ dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,  	proc_t *p;  	if ((id = sy->stsy_entry) != DTRACE_IDNONE) -		(*systrace_probe)(id, arg0, arg1, 
arg2, arg3, arg4, arg5); +		(*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5, arg6, +		    arg7);  	/*  	 * We want to explicitly allow DTrace consumers to stop a process @@ -1168,14 +1174,15 @@ dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,  	}  	mutex_exit(&p->p_lock); -	rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5); +	rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5, arg6, +	    arg7);  	if (ttolwp(curthread)->lwp_errno != 0)  		rval = -1;  	if ((id = sy->stsy_return) != DTRACE_IDNONE)  		(*systrace_probe)(id, (uintptr_t)rval, (uintptr_t)rval, -		    (uintptr_t)((uint64_t)rval >> 32), 0, 0, 0); +		    (uintptr_t)((uint64_t)rval >> 32), 0, 0, 0, 0, 0);  	return (rval);  } @@ -1203,5 +1210,5 @@ dtrace_systrace_rtt(void)  	}  	if ((id = sy->stsy_return) != DTRACE_IDNONE) -		(*systrace_probe)(id, 0, 0, 0, 0, 0, 0); +		(*systrace_probe)(id, 0, 0, 0, 0, 0, 0, 0, 0);  } diff --git a/usr/src/uts/common/os/timer.c b/usr/src/uts/common/os/timer.c index b25a6cbcf1..5453ebf380 100644 --- a/usr/src/uts/common/os/timer.c +++ b/usr/src/uts/common/os/timer.c @@ -25,11 +25,12 @@   */  /* - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright 2017 Joyent, Inc.   */  #include <sys/timer.h>  #include <sys/systm.h> +#include <sys/sysmacros.h>  #include <sys/param.h>  #include <sys/kmem.h>  #include <sys/debug.h> @@ -81,6 +82,7 @@ timer_lock(proc_t *p, itimer_t *it)   * waiters.  p_lock must be held on entry; it will not be dropped by   * timer_unlock().   */ +/* ARGSUSED */  static void  timer_unlock(proc_t *p, itimer_t *it)  { @@ -123,6 +125,7 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it)  		timer_lock(p, it);  	} +	ASSERT(p->p_itimer_sz > tid);  	ASSERT(p->p_itimer[tid] == it);  	p->p_itimer[tid] = NULL; @@ -137,7 +140,7 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it)  	it->it_backend->clk_timer_delete(it); -	if (it->it_portev) { +	if (it->it_flags & IT_PORT) {  		mutex_enter(&it->it_mutex);  		if (it->it_portev) {  			port_kevent_t	*pev; @@ -199,18 +202,20 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it)  static itimer_t *  timer_grab(proc_t *p, timer_t tid)  { -	itimer_t **itp, *it; +	itimer_t *it; -	if (tid >= timer_max || tid < 0) +	if (tid < 0) {  		return (NULL); +	}  	mutex_enter(&p->p_lock); - -	if ((itp = p->p_itimer) == NULL || (it = itp[tid]) == NULL) { +	if (p->p_itimer == NULL || tid >= p->p_itimer_sz || +	    (it = p->p_itimer[tid]) == NULL) {  		mutex_exit(&p->p_lock);  		return (NULL);  	} +	/* This may drop p_lock temporarily. */  	timer_lock(p, it);  	if (it->it_lock & ITLK_REMOVE) { @@ -232,7 +237,7 @@ timer_grab(proc_t *p, timer_t tid)   * should not be held on entry; timer_release() will acquire p_lock but   * will drop it before returning.   */ -static void +void  timer_release(proc_t *p, itimer_t *it)  {  	mutex_enter(&p->p_lock); @@ -245,7 +250,7 @@ timer_release(proc_t *p, itimer_t *it)   * p_lock should not be held on entry; timer_delete_grabbed() will acquire   * p_lock, but will drop it before returning.   */ -static void +void  timer_delete_grabbed(proc_t *p, timer_t tid, itimer_t *it)  {  	mutex_enter(&p->p_lock); @@ -258,6 +263,13 @@ clock_timer_init()  {  	clock_timer_cache = kmem_cache_create("timer_cache",  	    sizeof (itimer_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + +	/* +	 * Push the timer_max limit up to at least 4 * NCPU.  Due to the way +	 * NCPU is defined, proper initialization of the timer limit is +	 * performed at runtime. 
+	 */ +	timer_max = MAX(NCPU * 4, timer_max);  }  void @@ -453,6 +465,9 @@ timer_fire(itimer_t *it)  			it->it_pending = 1;  			port_send_event((port_kevent_t *)it->it_portev);  			mutex_exit(&it->it_mutex); +		} else if (it->it_flags & IT_CALLBACK) { +			it->it_cb_func(it); +			ASSERT(MUTEX_NOT_HELD(&it->it_mutex));  		} else if (it->it_flags & IT_SIGNAL) {  			it->it_pending = 1;  			mutex_exit(&it->it_mutex); @@ -466,159 +481,175 @@ timer_fire(itimer_t *it)  		mutex_exit(&p->p_lock);  } -int -timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid) +/* + * Allocate an itimer_t and find and appropriate slot for it in p_itimer. + * Acquires p_lock and holds it on return, regardless of success. + */ +static itimer_t * +timer_alloc(proc_t *p, timer_t *id)  { -	struct sigevent ev; -	proc_t *p = curproc; -	clock_backend_t *backend; -	itimer_t *it, **itp; -	sigqueue_t *sigq; -	cred_t *cr = CRED(); -	int error = 0; -	timer_t i; -	port_notify_t tim_pnevp; -	port_kevent_t *pkevp = NULL; +	itimer_t *it, **itp = NULL; +	uint_t i; -	if ((backend = CLOCK_BACKEND(clock)) == NULL) -		return (set_errno(EINVAL)); +	ASSERT(MUTEX_NOT_HELD(&p->p_lock)); -	if (evp != NULL) { -		/* -		 * short copyin() for binary compatibility -		 * fetch oldsigevent to determine how much to copy in. -		 */ -		if (get_udatamodel() == DATAMODEL_NATIVE) { -			if (copyin(evp, &ev, sizeof (struct oldsigevent))) -				return (set_errno(EFAULT)); +	it = kmem_cache_alloc(clock_timer_cache, KM_SLEEP); +	bzero(it, sizeof (itimer_t)); +	mutex_init(&it->it_mutex, NULL, MUTEX_DEFAULT, NULL); -			if (ev.sigev_notify == SIGEV_PORT || -			    ev.sigev_notify == SIGEV_THREAD) { -				if (copyin(ev.sigev_value.sival_ptr, &tim_pnevp, -				    sizeof (port_notify_t))) -					return (set_errno(EFAULT)); +	mutex_enter(&p->p_lock); +retry: +	if (p->p_itimer != NULL) { +		for (i = 0; i < p->p_itimer_sz; i++) { +			if (p->p_itimer[i] == NULL) { +				itp = &(p->p_itimer[i]); +				break;  			} -#ifdef	_SYSCALL32_IMPL -		} else { -			struct sigevent32 ev32; -			port_notify32_t tim_pnevp32; +		} +	} -			if (copyin(evp, &ev32, sizeof (struct oldsigevent32))) -				return (set_errno(EFAULT)); -			ev.sigev_notify = ev32.sigev_notify; -			ev.sigev_signo = ev32.sigev_signo; +	/* +	 * A suitable slot was not found.  If possible, allocate (or resize) +	 * the p_itimer array and try again. +	 */ +	if (itp == NULL) { +		uint_t target_sz = _TIMER_ALLOC_INIT; +		itimer_t **itp_new; + +		if (p->p_itimer != NULL) { +			ASSERT(p->p_itimer_sz != 0); + +			target_sz = p->p_itimer_sz * 2; +		} +		/* +		 * Protect against exceeding the max or overflow +		 */ +		if (target_sz > timer_max || target_sz > INT_MAX || +		    target_sz < p->p_itimer_sz) { +			kmem_cache_free(clock_timer_cache, it); +			return (NULL); +		} +		mutex_exit(&p->p_lock); +		itp_new = kmem_zalloc(target_sz * sizeof (itimer_t *), +		    KM_SLEEP); +		mutex_enter(&p->p_lock); +		if (target_sz <= p->p_itimer_sz) {  			/* -			 * See comment in sigqueue32() on handling of 32-bit -			 * sigvals in a 64-bit kernel. +			 * A racing thread performed the resize while we were +			 * waiting outside p_lock.  Discard our now-useless +			 * allocation and retry.  			 
*/ -			ev.sigev_value.sival_int = ev32.sigev_value.sival_int; -			if (ev.sigev_notify == SIGEV_PORT || -			    ev.sigev_notify == SIGEV_THREAD) { -				if (copyin((void *)(uintptr_t) -				    ev32.sigev_value.sival_ptr, -				    (void *)&tim_pnevp32, -				    sizeof (port_notify32_t))) -					return (set_errno(EFAULT)); -				tim_pnevp.portnfy_port = -				    tim_pnevp32.portnfy_port; -				tim_pnevp.portnfy_user = -				    (void *)(uintptr_t)tim_pnevp32.portnfy_user; +			kmem_free(itp_new, target_sz * sizeof (itimer_t *)); +			goto retry; +		} else { +			/* +			 * Instantiate the larger allocation and select the +			 * first fresh entry for use. +			 */ +			if (p->p_itimer != NULL) { +				uint_t old_sz; + +				old_sz = p->p_itimer_sz; +				bcopy(p->p_itimer, itp_new, +				    old_sz * sizeof (itimer_t *)); +				kmem_free(p->p_itimer, +				    old_sz * sizeof (itimer_t *)); + +				/* +				 * Short circuit to use the first free entry in +				 * the new allocation.  It's possible that +				 * other lower-indexed timers were freed while +				 * p_lock was dropped, but skipping over them +				 * is not harmful at all.  In the common case, +				 * we skip the need to walk over an array +				 * filled with timers before arriving at the +				 * slot we know is fresh from the allocation. +				 */ +				i = old_sz; +			} else { +				/* +				 * For processes lacking any existing timers, +				 * we can simply select the first entry. +				 */ +				i = 0;  			} -#endif +			p->p_itimer = itp_new; +			p->p_itimer_sz = target_sz;  		} -		switch (ev.sigev_notify) { -		case SIGEV_NONE: -			break; -		case SIGEV_SIGNAL: -			if (ev.sigev_signo < 1 || ev.sigev_signo >= NSIG) -				return (set_errno(EINVAL)); -			break; -		case SIGEV_THREAD: -		case SIGEV_PORT: -			break; -		default: -			return (set_errno(EINVAL)); -		} -	} else { -		/* -		 * Use the clock's default sigevent (this is a structure copy). -		 */ -		ev = backend->clk_default;  	} +	ASSERT(i <= INT_MAX); +	*id = (timer_t)i; +	return (it); +} + +/* + * Setup a timer + * + * This allocates an itimer_t (including a timer_t ID and slot in the process), + * wires it up according to the provided sigevent, and associates it with the + * desired clock backend.  Upon successful completion, the timer will be + * locked, preventing it from being armed via timer_settime() or deleted via + * timer_delete().  This gives the caller a chance to perform any last minute + * manipulations (such as configuring the IT_CALLBACK functionality and/or + * copying the timer_t out to userspace) before using timer_release() to unlock + * it or timer_delete_grabbed() to delete it. + */ +int +timer_setup(clock_backend_t *backend, struct sigevent *evp, port_notify_t *pnp, +    itimer_t **itp, timer_t *tidp) +{ +	proc_t *p = curproc; +	int error = 0; +	itimer_t *it; +	sigqueue_t *sigq; +	timer_t tid; +  	/* -	 * We'll allocate our timer and sigqueue now, before we grab p_lock. -	 * If we can't find an empty slot, we'll free them before returning. +	 * We'll allocate our sigqueue now, before we grab p_lock. +	 * If we can't find an empty slot, we'll free it before returning.  	 */ -	it = kmem_cache_alloc(clock_timer_cache, KM_SLEEP); -	bzero(it, sizeof (itimer_t)); -	mutex_init(&it->it_mutex, NULL, MUTEX_DEFAULT, NULL);  	sigq = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); -	mutex_enter(&p->p_lock); -  	/* -	 * If this is this process' first timer, we need to attempt to allocate -	 * an array of timerstr_t pointers.  
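timer_alloc() above grows the per-process p_itimer array on demand: it drops p_lock around the kmem_zalloc() of the larger array and, after reacquiring the lock, discards its own allocation and retries if another thread resized first. A minimal userland analogue of that grow-and-retry pattern, with a pthread mutex in place of p_lock and an assumed initial size standing in for _TIMER_ALLOC_INIT:

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

#define	TABLE_INIT_SZ	4	/* assumed analogue of _TIMER_ALLOC_INIT */

static pthread_mutex_t tbl_lock = PTHREAD_MUTEX_INITIALIZER;
static void **tbl;
static unsigned tbl_sz;

/* Find a free slot for item, growing the table if needed; -1 on failure. */
static int
slot_alloc(void *item, unsigned max_sz)
{
	unsigned i, new_sz;
	void **new_tbl;

	pthread_mutex_lock(&tbl_lock);
retry:
	for (i = 0; i < tbl_sz; i++) {
		if (tbl[i] == NULL) {
			tbl[i] = item;
			pthread_mutex_unlock(&tbl_lock);
			return ((int)i);
		}
	}

	new_sz = (tbl_sz == 0) ? TABLE_INIT_SZ : tbl_sz * 2;
	if (new_sz > max_sz) {
		pthread_mutex_unlock(&tbl_lock);
		return (-1);
	}

	/* Drop the lock around the allocation, as timer_alloc() drops p_lock. */
	pthread_mutex_unlock(&tbl_lock);
	new_tbl = calloc(new_sz, sizeof (void *));
	pthread_mutex_lock(&tbl_lock);

	if (new_tbl == NULL) {
		pthread_mutex_unlock(&tbl_lock);
		return (-1);
	}
	if (new_sz <= tbl_sz) {
		/* Lost the race: another thread already grew the table. */
		free(new_tbl);
		goto retry;
	}
	if (tbl_sz > 0)
		memcpy(new_tbl, tbl, tbl_sz * sizeof (void *));
	free(tbl);
	tbl = new_tbl;
	tbl_sz = new_sz;
	goto retry;
}

int
main(void)
{
	int a, b;

	return (slot_alloc(&a, 64) == 0 && slot_alloc(&b, 64) == 1 ? 0 : 1);
}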
We drop p_lock to perform the -	 * allocation; if we return to discover that p_itimer is non-NULL, -	 * we will free our allocation and drive on. +	 * Allocate a timer and choose a slot for it. This acquires p_lock.  	 */ -	if ((itp = p->p_itimer) == NULL) { -		mutex_exit(&p->p_lock); -		itp = kmem_zalloc(timer_max * sizeof (itimer_t *), KM_SLEEP); -		mutex_enter(&p->p_lock); - -		if (p->p_itimer == NULL) -			p->p_itimer = itp; -		else { -			kmem_free(itp, timer_max * sizeof (itimer_t *)); -			itp = p->p_itimer; -		} -	} - -	for (i = 0; i < timer_max && itp[i] != NULL; i++) -		continue; +	it = timer_alloc(p, &tid); +	ASSERT(MUTEX_HELD(&p->p_lock)); -	if (i == timer_max) { -		/* -		 * We couldn't find a slot.  Drop p_lock, free the preallocated -		 * timer and sigqueue, and return an error. -		 */ +	if (it == NULL) {  		mutex_exit(&p->p_lock); -		kmem_cache_free(clock_timer_cache, it);  		kmem_free(sigq, sizeof (sigqueue_t)); - -		return (set_errno(EAGAIN)); +		return (EAGAIN);  	} -	ASSERT(i < timer_max && itp[i] == NULL); +	ASSERT(tid < p->p_itimer_sz && p->p_itimer[tid] == NULL); +	ASSERT(evp != NULL);  	/*  	 * If we develop other notification mechanisms, this will need  	 * to call into (yet another) backend.  	 */ -	sigq->sq_info.si_signo = ev.sigev_signo; -	if (evp == NULL) -		sigq->sq_info.si_value.sival_int = i; -	else -		sigq->sq_info.si_value = ev.sigev_value; +	sigq->sq_info.si_signo = evp->sigev_signo; +	sigq->sq_info.si_value = evp->sigev_value;  	sigq->sq_info.si_code = SI_TIMER;  	sigq->sq_info.si_pid = p->p_pid;  	sigq->sq_info.si_ctid = PRCTID(p);  	sigq->sq_info.si_zoneid = getzoneid(); -	sigq->sq_info.si_uid = crgetruid(cr); +	sigq->sq_info.si_uid = crgetruid(CRED());  	sigq->sq_func = timer_signal;  	sigq->sq_next = NULL;  	sigq->sq_backptr = it;  	it->it_sigq = sigq;  	it->it_backend = backend;  	it->it_lock = ITLK_LOCKED; -	itp[i] = it; - -	if (ev.sigev_notify == SIGEV_THREAD || -	    ev.sigev_notify == SIGEV_PORT) { +	if (evp->sigev_notify == SIGEV_THREAD || +	    evp->sigev_notify == SIGEV_PORT) {  		int port; +		port_kevent_t *pkevp = NULL; + +		ASSERT(pnp != NULL);  		/*  		 * This timer is programmed to use event port notification when @@ -638,18 +669,17 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid)  		 */  		it->it_flags |= IT_PORT; -		port = tim_pnevp.portnfy_port; +		port = pnp->portnfy_port;  		/* associate timer as event source with the port */  		error = port_associate_ksource(port, PORT_SOURCE_TIMER,  		    (port_source_t **)&it->it_portsrc, timer_close_port,  		    (void *)it, NULL);  		if (error) { -			itp[i] = NULL;		/* clear slot */  			mutex_exit(&p->p_lock);  			kmem_cache_free(clock_timer_cache, it);  			kmem_free(sigq, sizeof (sigqueue_t)); -			return (set_errno(error)); +			return (error);  		}  		/* allocate an event structure/slot */ @@ -658,23 +688,24 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid)  		if (error) {  			(void) port_dissociate_ksource(port, PORT_SOURCE_TIMER,  			    (port_source_t *)it->it_portsrc); -			itp[i] = NULL;		/* clear slot */  			mutex_exit(&p->p_lock);  			kmem_cache_free(clock_timer_cache, it);  			kmem_free(sigq, sizeof (sigqueue_t)); -			return (set_errno(error)); +			return (error);  		}  		/* initialize event data */ -		port_init_event(pkevp, i, tim_pnevp.portnfy_user, +		port_init_event(pkevp, tid, pnp->portnfy_user,  		    timer_port_callback, it);  		it->it_portev = pkevp;  		it->it_portfd = port;  	} else { -		if (ev.sigev_notify == SIGEV_SIGNAL) +		if 
(evp->sigev_notify == SIGEV_SIGNAL)  			it->it_flags |= IT_SIGNAL;  	} +	/* Populate the slot now that the timer is prepped. */ +	p->p_itimer[tid] = it;  	mutex_exit(&p->p_lock);  	/* @@ -687,17 +718,8 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid)  	it->it_lwp = ttolwp(curthread);  	it->it_proc = p; -	if (copyout(&i, tid, sizeof (timer_t)) != 0) { -		error = EFAULT; -		goto err; -	} - -	/* -	 * If we're here, then we have successfully created the timer; we -	 * just need to release the timer and return. -	 */ -	timer_release(p, it); - +	*itp = it; +	*tidp = tid;  	return (0);  err: @@ -708,11 +730,115 @@ err:  	 * impossible for a removal to be pending.  	 */  	ASSERT(!(it->it_lock & ITLK_REMOVE)); -	timer_delete_grabbed(p, i, it); +	timer_delete_grabbed(p, tid, it); -	return (set_errno(error)); +	return (error);  } + +int +timer_create(clockid_t clock, struct sigevent *evp, timer_t *tidp) +{ +	int error = 0; +	proc_t *p = curproc; +	clock_backend_t *backend; +	struct sigevent ev; +	itimer_t *it; +	timer_t tid; +	port_notify_t tim_pnevp; + +	if ((backend = CLOCK_BACKEND(clock)) == NULL) +		return (set_errno(EINVAL)); + +	if (evp != NULL) { +		/* +		 * short copyin() for binary compatibility +		 * fetch oldsigevent to determine how much to copy in. +		 */ +		if (get_udatamodel() == DATAMODEL_NATIVE) { +			if (copyin(evp, &ev, sizeof (struct oldsigevent))) +				return (set_errno(EFAULT)); + +			if (ev.sigev_notify == SIGEV_PORT || +			    ev.sigev_notify == SIGEV_THREAD) { +				if (copyin(ev.sigev_value.sival_ptr, &tim_pnevp, +				    sizeof (port_notify_t))) +					return (set_errno(EFAULT)); +			} +#ifdef	_SYSCALL32_IMPL +		} else { +			struct sigevent32 ev32; +			port_notify32_t tim_pnevp32; + +			if (copyin(evp, &ev32, sizeof (struct oldsigevent32))) +				return (set_errno(EFAULT)); +			ev.sigev_notify = ev32.sigev_notify; +			ev.sigev_signo = ev32.sigev_signo; +			/* +			 * See comment in sigqueue32() on handling of 32-bit +			 * sigvals in a 64-bit kernel. +			 */ +			ev.sigev_value.sival_int = ev32.sigev_value.sival_int; +			if (ev.sigev_notify == SIGEV_PORT || +			    ev.sigev_notify == SIGEV_THREAD) { +				if (copyin((void *)(uintptr_t) +				    ev32.sigev_value.sival_ptr, +				    (void *)&tim_pnevp32, +				    sizeof (port_notify32_t))) +					return (set_errno(EFAULT)); +				tim_pnevp.portnfy_port = +				    tim_pnevp32.portnfy_port; +				tim_pnevp.portnfy_user = +				    (void *)(uintptr_t)tim_pnevp32.portnfy_user; +			} +#endif +		} +		switch (ev.sigev_notify) { +		case SIGEV_NONE: +			break; +		case SIGEV_SIGNAL: +			if (ev.sigev_signo < 1 || ev.sigev_signo >= NSIG) +				return (set_errno(EINVAL)); +			break; +		case SIGEV_THREAD: +		case SIGEV_PORT: +			break; +		default: +			return (set_errno(EINVAL)); +		} +	} else { +		/* +		 * Use the clock's default sigevent (this is a structure copy). +		 */ +		ev = backend->clk_default; +	} + +	if ((error = timer_setup(backend, &ev, &tim_pnevp, &it, &tid)) != 0) { +		return (set_errno(error)); +	} + +	/* +	 * Populate si_value with the timer ID if no sigevent was passed in. +	 */ +	if (evp == NULL) { +		it->it_sigq->sq_info.si_value.sival_int = tid; +	} + +	if (copyout(&tid, tidp, sizeof (timer_t)) != 0) { +		timer_delete_grabbed(p, tid, it); +		return (set_errno(EFAULT)); +	} + +	/* +	 * If we're here, then we have successfully created the timer; we +	 * just need to release the timer and return. 
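timer_setup() hands the new timer back still locked (ITLK_LOCKED), so timer_create() can finish its last-minute work, here the copyout() of the timer ID, before either releasing the timer or deleting it on failure. A hedged sketch of that construct-locked, then publish-or-destroy pattern in plain C; the object and function names are hypothetical, with a pthread mutex playing the role of the timer lock:

#include <pthread.h>
#include <stdlib.h>

/* Hypothetical object that stays "locked" until fully published. */
typedef struct obj {
	pthread_mutex_t o_lock;
	int o_id;
	int o_ready;
} obj_t;

/* Analogue of timer_setup(): allocate and return the object still locked. */
static obj_t *
obj_setup(int id)
{
	obj_t *o = calloc(1, sizeof (*o));

	if (o == NULL)
		return (NULL);
	pthread_mutex_init(&o->o_lock, NULL);
	pthread_mutex_lock(&o->o_lock);	/* caller finishes setup, then unlocks */
	o->o_id = id;
	return (o);
}

/* Analogue of timer_create(): finish setup, then release or tear down. */
static int
obj_create(int id, int fail_publish)
{
	obj_t *o = obj_setup(id);

	if (o == NULL)
		return (-1);
	if (fail_publish) {
		/* e.g. the copyout() to userspace failed: destroy instead */
		pthread_mutex_unlock(&o->o_lock);
		pthread_mutex_destroy(&o->o_lock);
		free(o);
		return (-1);
	}
	o->o_ready = 1;			/* last-minute work while still locked */
	pthread_mutex_unlock(&o->o_lock);	/* analogue of timer_release() */
	return (0);			/* object stays published (leaked here) */
}

int
main(void)
{
	return (obj_create(1, 0) == 0 && obj_create(2, 1) == -1 ? 0 : 1);
}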
+	 */ +	timer_release(p, it); + +	return (0); +} + +  int  timer_gettime(timer_t tid, itimerspec_t *val)  { @@ -832,20 +958,23 @@ timer_getoverrun(timer_t tid)  void  timer_lwpexit(void)  { -	timer_t i; +	uint_t i;  	proc_t *p = curproc;  	klwp_t *lwp = ttolwp(curthread); -	itimer_t *it, **itp; +	itimer_t *it;  	ASSERT(MUTEX_HELD(&p->p_lock)); -	if ((itp = p->p_itimer) == NULL) +	if (p->p_itimer == NULL) {  		return; +	} -	for (i = 0; i < timer_max; i++) { -		if ((it = itp[i]) == NULL) +	for (i = 0; i < p->p_itimer_sz; i++) { +		if ((it = p->p_itimer[i]) == NULL) {  			continue; +		} +		/* This may drop p_lock temporarily. */  		timer_lock(p, it);  		if ((it->it_lock & ITLK_REMOVE) || it->it_lwp != lwp) { @@ -876,20 +1005,22 @@ timer_lwpexit(void)  void  timer_lwpbind()  { -	timer_t i; +	uint_t i;  	proc_t *p = curproc;  	klwp_t *lwp = ttolwp(curthread); -	itimer_t *it, **itp; +	itimer_t *it;  	ASSERT(MUTEX_HELD(&p->p_lock)); -	if ((itp = p->p_itimer) == NULL) +	if (p->p_itimer == NULL) {  		return; +	} -	for (i = 0; i < timer_max; i++) { -		if ((it = itp[i]) == NULL) +	for (i = 0; i < p->p_itimer_sz; i++) { +		if ((it = p->p_itimer[i]) == NULL)  			continue; +		/* This may drop p_lock temporarily. */  		timer_lock(p, it);  		if (!(it->it_lock & ITLK_REMOVE) && it->it_lwp == lwp) { @@ -911,16 +1042,19 @@ timer_lwpbind()  void  timer_exit(void)  { -	timer_t i; +	uint_t i;  	proc_t *p = curproc;  	ASSERT(p->p_itimer != NULL); +	ASSERT(p->p_itimer_sz != 0); -	for (i = 0; i < timer_max; i++) -		(void) timer_delete(i); +	for (i = 0; i < p->p_itimer_sz; i++) { +		(void) timer_delete((timer_t)i); +	} -	kmem_free(p->p_itimer, timer_max * sizeof (itimer_t *)); +	kmem_free(p->p_itimer, p->p_itimer_sz * sizeof (itimer_t *));  	p->p_itimer = NULL; +	p->p_itimer_sz = 0;  }  /* @@ -977,7 +1111,7 @@ timer_close_port(void *arg, int port, pid_t pid, int lastclose)  	for (tid = 0; tid < timer_max; tid++) {  		if ((it = timer_grab(p, tid)) == NULL)  			continue; -		if (it->it_portev) { +		if (it->it_flags & IT_PORT) {  			mutex_enter(&it->it_mutex);  			if (it->it_portfd == port) {  				port_kevent_t *pev; diff --git a/usr/src/uts/common/os/timers.c b/usr/src/uts/common/os/timers.c index 61acc6cf97..53be806026 100644 --- a/usr/src/uts/common/os/timers.c +++ b/usr/src/uts/common/os/timers.c @@ -22,6 +22,7 @@  /*   * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2016 Joyent, Inc.   */  /* @@ -1172,6 +1173,14 @@ timespectohz64(timespec_t *tv)  void  hrt2ts(hrtime_t hrt, timestruc_t *tsp)  { +#if defined(__amd64) +	/* +	 * The cleverness explained above is unecessary on x86_64 CPUs where +	 * modern compilers are able to optimize down to faster operations. +	 */ +	tsp->tv_sec = hrt / NANOSEC; +	tsp->tv_nsec = hrt % NANOSEC; +#else  	uint32_t sec, nsec, tmp;  	tmp = (uint32_t)(hrt >> 30); @@ -1193,20 +1202,28 @@ hrt2ts(hrtime_t hrt, timestruc_t *tsp)  	}  	tsp->tv_sec = (time_t)sec;  	tsp->tv_nsec = nsec; +#endif /* defined(__amd64) */  }  /*   * Convert from timestruc_t to hrtime_t. - * - * The code below is equivalent to: - * - *	hrt = tsp->tv_sec * NANOSEC + tsp->tv_nsec; - * - * but requires no integer multiply.   */  hrtime_t  ts2hrt(const timestruc_t *tsp)  { +#if defined(__amd64) || defined(__i386) +	/* +	 * On modern x86 CPUs, the simple version is faster. 
+	 */ +	return ((tsp->tv_sec * NANOSEC) + tsp->tv_nsec); +#else +	/* +	 * The code below is equivalent to: +	 * +	 *	hrt = tsp->tv_sec * NANOSEC + tsp->tv_nsec; +	 * +	 * but requires no integer multiply. +	 */  	hrtime_t hrt;  	hrt = tsp->tv_sec; @@ -1215,6 +1232,7 @@ ts2hrt(const timestruc_t *tsp)  	hrt = (hrt << 7) - hrt - hrt - hrt;  	hrt = (hrt << 9) + tsp->tv_nsec;  	return (hrt); +#endif /* defined(__amd64) || defined(__i386) */  }  /* @@ -1246,6 +1264,13 @@ tv2hrt(struct timeval *tvp)  void  hrt2tv(hrtime_t hrt, struct timeval *tvp)  { +#if defined(__amd64) +	/* +	 * Like hrt2ts, the simple version is faster on x86_64. +	 */ +	tvp->tv_sec = hrt / NANOSEC; +	tvp->tv_usec = (hrt % NANOSEC) / (NANOSEC / MICROSEC); +#else  	uint32_t sec, nsec, tmp;  	uint32_t q, r, t; @@ -1267,17 +1292,17 @@ hrt2tv(hrtime_t hrt, struct timeval *tvp)  		sec++;  	}  	tvp->tv_sec = (time_t)sec; -/* - * this routine is very similar to hr2ts, but requires microseconds - * instead of nanoseconds, so an interger divide by 1000 routine - * completes the conversion - */ +	/* +	 * this routine is very similar to hr2ts, but requires microseconds +	 * instead of nanoseconds, so an interger divide by 1000 routine +	 * completes the conversion +	 */  	t = (nsec >> 7) + (nsec >> 8) + (nsec >> 12);  	q = (nsec >> 1) + t + (nsec >> 15) + (t >> 11) + (t >> 14);  	q = q >> 9;  	r = nsec - q*1000;  	tvp->tv_usec = q + ((r + 24) >> 10); - +#endif /* defined(__amd64) */  }  int diff --git a/usr/src/uts/common/os/vm_pageout.c b/usr/src/uts/common/os/vm_pageout.c index 608208bbca..f5ee76a2cb 100644 --- a/usr/src/uts/common/os/vm_pageout.c +++ b/usr/src/uts/common/os/vm_pageout.c @@ -21,6 +21,7 @@  /*   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2018 Joyent, Inc.   */  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/ @@ -58,6 +59,7 @@  #include <sys/tnf_probe.h>  #include <sys/mem_cage.h>  #include <sys/time.h> +#include <sys/zone.h>  #include <vm/hat.h>  #include <vm/as.h> @@ -73,7 +75,7 @@ static int checkpage(page_t *, int);   * algorithm.  They are initialized to 0, and then computed at boot time   * based on the size of the system.  If they are patched non-zero in   * a loaded vmunix they are left alone and may thus be changed per system - * using adb on the loaded system. + * using mdb on the loaded system.   */  pgcnt_t		slowscan = 0;  pgcnt_t		fastscan = 0; @@ -81,6 +83,7 @@ pgcnt_t		fastscan = 0;  static pgcnt_t	handspreadpages = 0;  static int	loopfraction = 2;  static pgcnt_t	looppages; +/* See comment below describing 4% and 80% */  static int	min_percent_cpu = 4;  static int	max_percent_cpu = 80;  static pgcnt_t	maxfastscan = 0; @@ -98,14 +101,34 @@ pgcnt_t	deficit;  pgcnt_t	nscan;  pgcnt_t	desscan; +/* kstats */ +uint64_t low_mem_scan; +uint64_t zone_cap_scan; +uint64_t n_throttle; + +clock_t	zone_pageout_ticks;	/* tunable to change zone pagescan ticks */ +  /*   * Values for min_pageout_ticks, max_pageout_ticks and pageout_ticks   * are the number of ticks in each wakeup cycle that gives the   * equivalent of some underlying %CPU duty cycle. - * When RATETOSCHEDPAGING is 4,  and hz is 100, pageout_scanner is - * awakened every 25 clock ticks.  So, converting from %CPU to ticks - * per wakeup cycle would be x% of 25, that is (x * 100) / 25. - * So, for example, 4% == 1 tick and 80% == 20 ticks. 
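On amd64 the patch replaces the shift-and-add conversions in hrt2ts(), ts2hrt() and hrt2tv() with plain division and modulo by NANOSEC. A standalone sketch of those simple forms, using userland struct timespec/struct timeval and locally defined constants, with a round-trip check:

#include <stdio.h>
#include <time.h>
#include <sys/time.h>

#define	NANOSEC		1000000000LL
#define	MICROSEC	1000000LL

/* hrt2ts() analogue: split nanoseconds into seconds and nanoseconds. */
static void
my_hrt2ts(long long hrt, struct timespec *tsp)
{
	tsp->tv_sec = (time_t)(hrt / NANOSEC);
	tsp->tv_nsec = (long)(hrt % NANOSEC);
}

/* ts2hrt() analogue: the straightforward multiply-and-add form. */
static long long
my_ts2hrt(const struct timespec *tsp)
{
	return ((long long)tsp->tv_sec * NANOSEC + tsp->tv_nsec);
}

/* hrt2tv() analogue: as above, but the fraction is in microseconds. */
static void
my_hrt2tv(long long hrt, struct timeval *tvp)
{
	tvp->tv_sec = (time_t)(hrt / NANOSEC);
	tvp->tv_usec = (suseconds_t)((hrt % NANOSEC) / (NANOSEC / MICROSEC));
}

int
main(void)
{
	long long hrt = 3LL * NANOSEC + 250000000LL;	/* 3.25 seconds */
	struct timespec ts;
	struct timeval tv;

	my_hrt2ts(hrt, &ts);
	my_hrt2tv(hrt, &tv);
	printf("%lld ns -> %lld s + %ld ns; round trip %lld ns; %ld us\n",
	    hrt, (long long)ts.tv_sec, ts.tv_nsec, my_ts2hrt(&ts),
	    (long)tv.tv_usec);
	return (0);
}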
+ * + * For example, when RATETOSCHEDPAGING is 4 (the default), then schedpaging() + * will run 4 times/sec to update pageout scanning parameters and kickoff + * the pageout_scanner() thread if necessary. + * + * Given hz is 100, min_pageout_ticks will be set to 1 (1% of a CPU). When + * pageout_ticks is set to min_pageout_ticks, then the total CPU time consumed + * by the scanner in a 1 second interval is 4% of a CPU (RATETOSCHEDPAGING * 1). + * + * Given hz is 100, max_pageout_ticks will be set to 20 (20% of a CPU). When + * pageout_ticks is set to max_pageout_ticks, then the total CPU time consumed + * by the scanner in a 1 second interval is 80% of a CPU + * (RATETOSCHEDPAGING * 20). There is no point making max_pageout_ticks >25 + * since schedpaging() runs RATETOSCHEDPAGING (4) times/sec. + * + * If hz is 1000, then min_pageout_ticks will be 10 and max_pageout_ticks + * will be 200, so the CPU percentages are the same as when hz is 100.   *   * min_pageout_ticks:   *     ticks/wakeup equivalent of min_percent_cpu. @@ -117,19 +140,29 @@ pgcnt_t	desscan;   *     Number of clock ticks budgeted for each wakeup cycle.   *     Computed each time around by schedpaging().   *     Varies between min_pageout_ticks .. max_pageout_ticks, - *     depending on memory pressure. - * - * pageout_lbolt: - *     Timestamp of the last time pageout_scanner woke up and started - *     (or resumed) scanning for not recently referenced pages. + *     depending on memory pressure or zones over their cap.   */  static clock_t	min_pageout_ticks;  static clock_t	max_pageout_ticks;  static clock_t	pageout_ticks; -static clock_t	pageout_lbolt; -static uint_t	reset_hands; +#define	MAX_PSCAN_THREADS	16 +static boolean_t reset_hands[MAX_PSCAN_THREADS]; + +/* + * These can be tuned in /etc/system or set with mdb. + * 'des_page_scanners' is the desired number of page scanner threads. The + * system will bring the actual number of threads into line with the desired + * number. If des_page_scanners is set to an invalid value, the system will + * correct the setting. + */ +uint_t des_page_scanners; +uint_t pageout_reset_cnt = 64;	/* num. cycles for pageout_scanner hand reset */ + +uint_t n_page_scanners; +static pgcnt_t	pscan_region_sz; /* informational only */ +  #define	PAGES_POLL_MASK	1023 @@ -145,33 +178,37 @@ static uint_t	reset_hands;   * pageout_sample_pages:   *     The accumulated number of pages scanned during sampling.   * - * pageout_sample_ticks: - *     The accumulated clock ticks for the sample. + * pageout_sample_etime: + *     The accumulated number of nanoseconds for the sample.   *   * pageout_rate: - *     Rate in pages/nanosecond, computed at the end of sampling. + *     Rate in pages/second, computed at the end of sampling.   *   * pageout_new_spread: - *     The new value to use for fastscan and handspreadpages. - *     Calculated after enough samples have been taken. + *     The new value to use for maxfastscan and (perhaps) handspreadpages. + *     Intended to be the number pages that can be scanned per sec using ~10% + *     of a CPU. Calculated after enough samples have been taken. 
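The comment above derives min_pageout_ticks and max_pageout_ticks from hz, the CPU-percentage tunables and RATETOSCHEDPAGING. A small arithmetic sketch of that derivation (constants copied from the text, helper name illustrative) reproduces the 1 and 20 tick figures for hz=100 and 10 and 200 for hz=1000:

#include <stdio.h>

#define	RATETOSCHEDPAGING	4	/* schedpaging() wakeups per second */
#define	MAX(a, b)		((a) > (b) ? (a) : (b))

/* Ticks-per-wakeup budget equivalent to pct% of one CPU. */
static long
pct_to_ticks(int hz, int pct)
{
	return (MAX(1, ((hz * pct) / 100) / RATETOSCHEDPAGING));
}

int
main(void)
{
	int hzvals[] = { 100, 1000 };

	for (int i = 0; i < 2; i++) {
		int hz = hzvals[i];
		printf("hz=%d: min(4%%)=%ld ticks/wakeup, max(80%%)=%ld\n",
		    hz, pct_to_ticks(hz, 4), pct_to_ticks(hz, 80));
	}
	return (0);
}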
+ *     pageout_rate / 10   */  typedef hrtime_t hrrate_t; -static uint64_t	pageout_sample_lim = 4; -static uint64_t	pageout_sample_cnt = 0; +static uint_t	pageout_sample_lim = 4; +static uint_t	pageout_sample_cnt = 0;  static pgcnt_t	pageout_sample_pages = 0;  static hrrate_t	pageout_rate = 0;  static pgcnt_t	pageout_new_spread = 0; -static clock_t	pageout_cycle_ticks; -static hrtime_t	sample_start, sample_end;  static hrtime_t	pageout_sample_etime = 0; +/* True if page scanner is first starting up */ +#define	PAGE_SCAN_STARTUP	(pageout_sample_cnt < pageout_sample_lim) +  /*   * Record number of times a pageout_scanner wakeup cycle finished because it   * timed out (exceeded its CPU budget), rather than because it visited - * its budgeted number of pages. + * its budgeted number of pages. This is only done when scanning under low + * free memory conditions, not when scanning for zones over their cap.   */  uint64_t pageout_timeouts = 0; @@ -194,25 +231,35 @@ kcondvar_t	memavail_cv;  #define	LOOPPAGES	total_pages  /* - * Set up the paging constants for the clock algorithm. - * Called after the system is initialized and the amount of memory - * and number of paging devices is known. + * Local boolean to control scanning when zones are over their cap. Avoids + * accessing the zone_num_over_cap variable except within schedpaging(), which + * only runs periodically. This is here only to reduce our access to + * zone_num_over_cap, since it is already accessed a lot during paging, and + * the page scanner accesses the zones_over variable on each page during a + * scan. There is no lock needed for zone_num_over_cap since schedpaging() + * doesn't modify the variable, it only cares if the variable is 0 or non-0. + */ +static boolean_t zones_over = B_FALSE; + +/* + * Set up the paging constants for the page scanner clock-hand algorithm. + * Called at startup after the system is initialized and the amount of memory + * and number of paging devices is known (recalc will be 0). Called again once + * PAGE_SCAN_STARTUP is true after the scanner has collected enough samples + * (recalc will be 1). + * + * Will also be called after a memory dynamic reconfiguration operation and + * recalc will be 1 in those cases too.   * - * lotsfree is 1/64 of memory, but at least 512K. + * lotsfree is 1/64 of memory, but at least 512K (ha!).   * desfree is 1/2 of lotsfree.   * minfree is 1/2 of desfree. - * - * Note: to revert to the paging algorithm of Solaris 2.4/2.5, set: - * - *	lotsfree = btop(512K) - *	desfree = btop(200K) - *	minfree = btop(100K) - *	throttlefree = INT_MIN - *	max_percent_cpu = 4   */  void  setupclock(int recalc)  { +	uint_t i; +	pgcnt_t sz, tmp;  	static spgcnt_t init_lfree, init_dfree, init_mfree;  	static spgcnt_t init_tfree, init_preserve, init_mpgio; @@ -221,8 +268,8 @@ setupclock(int recalc)  	looppages = LOOPPAGES;  	/* -	 * setupclock can now be called to recalculate the paging -	 * parameters in the case of dynamic addition of memory. +	 * setupclock can be called to recalculate the paging +	 * parameters in the case of dynamic reconfiguration of memory.  	 * So to make sure we make the proper calculations, if such a  	 * situation should arise, we save away the initial values  	 * of each parameter so we can recall them when needed. This @@ -311,105 +358,98 @@ setupclock(int recalc)  		maxpgio = init_mpgio;  	/* -	 * The clock scan rate varies between fastscan and slowscan -	 * based on the amount of free memory available.  
Fastscan -	 * rate should be set based on the number pages that can be -	 * scanned per sec using ~10% of processor time.  Since this -	 * value depends on the processor, MMU, Mhz etc., it is -	 * difficult to determine it in a generic manner for all -	 * architectures. +	 * When the system is in a low memory state, the page scan rate varies +	 * between fastscan and slowscan based on the amount of free memory +	 * available. When only zones are over their memory cap, the scan rate +	 * is always fastscan.  	 * -	 * Instead of trying to determine the number of pages scanned -	 * per sec for every processor, fastscan is set to be the smaller -	 * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling -	 * time is limited to ~4% of processor time. +	 * The fastscan rate should be set based on the number pages that can +	 * be scanned per sec using ~10% of a CPU. Since this value depends on +	 * the processor, MMU, Ghz etc., it must be determined dynamically.  	 * -	 * Setting fastscan to be 1/2 of memory allows pageout to scan -	 * all of memory in ~2 secs.  This implies that user pages not -	 * accessed within 1 sec (assuming, handspreadpages == fastscan) -	 * can be reclaimed when free memory is very low.  Stealing pages -	 * not accessed within 1 sec seems reasonable and ensures that -	 * active user processes don't thrash. +	 * When the scanner first starts up, fastscan will be set to 0 and +	 * maxfastscan will be set to MAXHANDSPREADPAGES (64MB, in pages). +	 * However, once the scanner has collected enough samples, then fastscan +	 * is set to be the smaller of 1/2 of memory (looppages / loopfraction) +	 * or maxfastscan (which is set from pageout_new_spread). Thus, +	 * MAXHANDSPREADPAGES is irrelevant after the scanner is fully +	 * initialized.  	 * -	 * Smaller values of fastscan result in scanning fewer pages -	 * every second and consequently pageout may not be able to free -	 * sufficient memory to maintain the minimum threshold.  Larger -	 * values of fastscan result in scanning a lot more pages which -	 * could lead to thrashing and higher CPU usage. +	 * pageout_new_spread is calculated when the scanner first starts +	 * running. During this initial sampling period the nscan_limit +	 * is set to the total_pages of system memory. Thus, the scanner could +	 * theoretically scan all of memory in one pass. However, each sample +	 * is also limited by the %CPU budget. This is controlled by +	 * pageout_ticks which is set in schedpaging(). During the sampling +	 * period, pageout_ticks is set to max_pageout_ticks. This tick value +	 * is derived from the max_percent_cpu (80%) described above. On a +	 * system with more than a small amount of memory (~8GB), the scanner's +	 * %CPU will be the limiting factor in calculating pageout_new_spread.  	 * -	 * Fastscan needs to be limited to a maximum value and should not -	 * scale with memory to prevent pageout from consuming too much -	 * time for scanning on slow CPU's and avoid thrashing, as a -	 * result of scanning too many pages, on faster CPU's. -	 * The value of 64 Meg was chosen for MAXHANDSPREADPAGES -	 * (the upper bound for fastscan) based on the average number -	 * of pages that can potentially be scanned in ~1 sec (using ~4% -	 * of the CPU) on some of the following machines that currently -	 * run Solaris 2.x: +	 * At the end of the sampling period, the pageout_rate indicates how +	 * many pages could be scanned per second. The pageout_new_spread is +	 * then set to be 1/10th of that (i.e. approximating 10% of a CPU). 
+	 * Of course, this value could still be more than the physical memory +	 * on the system. If so, fastscan is set to 1/2 of memory, as +	 * mentioned above.  	 * -	 *			average memory scanned in ~1 sec +	 * All of this leads up to the setting of handspreadpages, which is +	 * set to fastscan. This is the distance, in pages, between the front +	 * and back hands during scanning. It will dictate which pages will +	 * be considered "hot" on the backhand and which pages will be "cold" +	 * and reclaimed  	 * -	 *	25 Mhz SS1+:		23 Meg -	 *	LX:			37 Meg -	 *	50 Mhz SC2000:		68 Meg +	 * If the scanner is limited by desscan, then at the highest rate it +	 * will scan up to fastscan/RATETOSCHEDPAGING pages per cycle. If the +	 * scanner is limited by the %CPU, then at the highest rate (20% of a +	 * CPU per cycle) the number of pages scanned could be much less.  	 * -	 *	40 Mhz 486:		26 Meg -	 *	66 Mhz 486:		42 Meg +	 * Thus, if the scanner is limited by desscan, then the handspreadpages +	 * setting means 1sec between the front and back hands, but if the +	 * scanner is limited by %CPU, it could be several seconds between the +	 * two hands.  	 * -	 * When free memory falls just below lotsfree, the scan rate -	 * goes from 0 to slowscan (i.e., pageout starts running).  This +	 * The basic assumption is that at the worst case, stealing pages +	 * not accessed within 1 sec seems reasonable and ensures that active +	 * user processes don't thrash. This is especially true when the system +	 * is in a low memory state. +	 * +	 * There are some additional factors to consider for the case of +	 * scanning when zones are over their cap. In this situation it is +	 * also likely that the machine will have a large physical memory which +	 * will take many seconds to fully scan (due to the %CPU and desscan +	 * limits per cycle). It is probable that there will be few (or 0) +	 * pages attributed to these zones in any single scanning cycle. The +	 * result is that reclaiming enough pages for these zones might take +	 * several additional seconds (this is generally not a problem since +	 * the zone physical cap is just a soft cap). +	 * +	 * This is similar to the typical multi-processor situation in which +	 * pageout is often unable to maintain the minimum paging thresholds +	 * under heavy load due to the fact that user processes running on +	 * other CPU's can be dirtying memory at a much faster pace than +	 * pageout can find pages to free. +	 * +	 * One potential approach to address both of these cases is to enable +	 * more than one CPU to run the page scanner, in such a manner that the +	 * various clock hands don't overlap. However, this also makes it more +	 * difficult to determine the values for fastscan, slowscan and +	 * handspreadpages. This is left as a future enhancement, if necessary. +	 * +	 * When free memory falls just below lotsfree, the scan rate goes from +	 * 0 to slowscan (i.e., the page scanner starts running).  This  	 * transition needs to be smooth and is achieved by ensuring that  	 * pageout scans a small number of pages to satisfy the transient  	 * memory demand.  This is set to not exceed 100 pages/sec (25 per  	 * wakeup) since scanning that many pages has no noticible impact  	 * on system performance.  	 * -	 * In addition to setting fastscan and slowscan, pageout is -	 * limited to using ~4% of the CPU.  
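Per the rewritten comment, pageout_rate is the sampled scan rate in pages per second, pageout_new_spread is roughly a tenth of it (about 10% of one CPU), and fastscan becomes the smaller of half of memory and that spread. A hedged numeric sketch with invented sample figures:

#include <stdio.h>
#include <stdint.h>

#define	NANOSEC	1000000000ULL

int
main(void)
{
	/* Invented sample results: pages visited and elapsed time. */
	uint64_t sample_pages = 2000000;		/* pages scanned */
	uint64_t sample_etime = 500000000;		/* 0.5 s, in ns */
	uint64_t total_pages = 4ULL * 1024 * 1024;	/* 16 GB of 4K pages */
	int loopfraction = 2;

	uint64_t rate = sample_pages * NANOSEC / sample_etime;	/* pages/sec */
	uint64_t spread = rate / 10;		/* ~10% of one CPU */
	uint64_t maxfastscan = spread;
	uint64_t fastscan = total_pages / loopfraction;

	if (fastscan > maxfastscan)
		fastscan = maxfastscan;

	printf("rate=%llu pages/s, spread=%llu, fastscan=%llu pages/s\n",
	    (unsigned long long)rate, (unsigned long long)spread,
	    (unsigned long long)fastscan);
	return (0);
}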
This results in increasing -	 * the time taken to scan all of memory, which in turn means that -	 * user processes have a better opportunity of preventing their -	 * pages from being stolen.  This has a positive effect on -	 * interactive and overall system performance when memory demand -	 * is high. -	 * -	 * Thus, the rate at which pages are scanned for replacement will -	 * vary linearly between slowscan and the number of pages that -	 * can be scanned using ~4% of processor time instead of varying -	 * linearly between slowscan and fastscan. -	 * -	 * Also, the processor time used by pageout will vary from ~1% -	 * at slowscan to ~4% at fastscan instead of varying between -	 * ~1% at slowscan and ~10% at fastscan. -	 * -	 * The values chosen for the various VM parameters (fastscan, -	 * handspreadpages, etc) are not universally true for all machines, -	 * but appear to be a good rule of thumb for the machines we've -	 * tested.  They have the following ranges: -	 * -	 *	cpu speed:	20 to 70 Mhz -	 *	page size:	4K to 8K -	 *	memory size:	16M to 5G -	 *	page scan rate:	4000 - 17400 4K pages per sec -	 * -	 * The values need to be re-examined for machines which don't -	 * fall into the various ranges (e.g., slower or faster CPUs, -	 * smaller or larger pagesizes etc) shown above. -	 * -	 * On an MP machine, pageout is often unable to maintain the -	 * minimum paging thresholds under heavy load.  This is due to -	 * the fact that user processes running on other CPU's can be -	 * dirtying memory at a much faster pace than pageout can find -	 * pages to free.  The memory demands could be met by enabling -	 * more than one CPU to run the clock algorithm in such a manner -	 * that the various clock hands don't overlap.  This also makes -	 * it more difficult to determine the values for fastscan, slowscan -	 * and handspreadpages. -	 * -	 * The swapper is currently used to free up memory when pageout -	 * is unable to meet memory demands by swapping out processes. -	 * In addition to freeing up memory, swapping also reduces the -	 * demand for memory by preventing user processes from running -	 * and thereby consuming memory. +	 * The swapper is currently used to free up memory when pageout is +	 * unable to meet memory demands. It does this by swapping out entire +	 * processes. In addition to freeing up memory, swapping also reduces +	 * the demand for memory because the swapped out processes cannot +	 * run, and thereby consume memory. However, this is a pathological +	 * state and performance will generally be considered unacceptable.  	 */  	if (init_mfscan == 0) {  		if (pageout_new_spread != 0) @@ -419,12 +459,13 @@ setupclock(int recalc)  	} else {  		maxfastscan = init_mfscan;  	} -	if (init_fscan == 0) +	if (init_fscan == 0) {  		fastscan = MIN(looppages / loopfraction, maxfastscan); -	else +	} else {  		fastscan = init_fscan; -	if (fastscan > looppages / loopfraction) -		fastscan = looppages / loopfraction; +		if (fastscan > looppages / loopfraction) +			fastscan = looppages / loopfraction; +	}  	/*  	 * Set slow scan time to 1/10 the fast scan time, but @@ -444,12 +485,10 @@ setupclock(int recalc)  	 * decreases as the scan rate rises. It must be < the amount  	 * of pageable memory.  	 * -	 * Since pageout is limited to ~4% of the CPU, setting handspreadpages -	 * to be "fastscan" results in the front hand being a few secs -	 * (varies based on the processor speed) ahead of the back hand -	 * at fastscan rates.  
This distance can be further reduced, if -	 * necessary, by increasing the processor time used by pageout -	 * to be more than ~4% and preferrably not more than ~10%. +	 * Since pageout is limited to the %CPU per cycle, setting +	 * handspreadpages to be "fastscan" results in the front hand being +	 * a few secs (varies based on the processor speed) ahead of the back +	 * hand at fastscan rates.  	 *  	 * As a result, user processes have a much better chance of  	 * referencing their pages before the back hand examines them. @@ -471,29 +510,78 @@ setupclock(int recalc)  	if (handspreadpages >= looppages)  		handspreadpages = looppages - 1; +	if (recalc == 0) { +		/* +		 * Setup basic values at initialization. +		 */ +		pscan_region_sz = total_pages; +		des_page_scanners = n_page_scanners = 1; +		reset_hands[0] = B_TRUE; +		return; +	} +  	/* -	 * If we have been called to recalculate the parameters, -	 * set a flag to re-evaluate the clock hand pointers. +	 * Recalculating +	 * +	 * We originally set the number of page scanners to 1. Now that we +	 * know what the handspreadpages is for a scanner, figure out how many +	 * scanners we should run. We want to ensure that the regions don't +	 * overlap and that they are not touching. +	 * +	 * A default 64GB region size is used as the initial value to calculate +	 * how many scanner threads we should create on lower memory systems. +	 * The idea is to limit the number of threads to a practical value +	 * (e.g. a 64GB machine really only needs one scanner thread). For very +	 * large memory systems, we limit ourselves to MAX_PSCAN_THREADS +	 * threads. +	 * +	 * The scanner threads themselves are evenly spread out around the +	 * memory "clock" in pageout_scanner when we reset the hands, and each +	 * thread will scan all of memory.  	 */ -	if (recalc) -		reset_hands = 1; +	sz = (btop(64ULL * 0x40000000ULL)); +	if (sz < handspreadpages) { +		/* +		 * 64GB is smaller than the separation between the front +		 * and back hands; use double handspreadpages. +		 */ +		sz = handspreadpages << 1; +	} +	if (sz > total_pages) { +		sz = total_pages; +	} +	/* Record region size for inspection with mdb, otherwise unused */ +	pscan_region_sz = sz; + +	tmp = sz; +	for (i = 1; tmp < total_pages; i++) { +		tmp += sz; +	} + +	if (i > MAX_PSCAN_THREADS) +		i = MAX_PSCAN_THREADS; + +	des_page_scanners = i;  }  /*   * Pageout scheduling.   *   * Schedpaging controls the rate at which the page out daemon runs by - * setting the global variables nscan and desscan RATETOSCHEDPAGING - * times a second.  Nscan records the number of pages pageout has examined - * in its current pass; schedpaging resets this value to zero each time - * it runs.  Desscan records the number of pages pageout should examine - * in its next pass; schedpaging sets this value based on the amount of - * currently available memory. + * setting the global variables pageout_ticks and desscan RATETOSCHEDPAGING + * times a second. The pageout_ticks variable controls the percent of one + * CPU that each page scanner thread should consume (see min_percent_cpu + * and max_percent_cpu descriptions). The desscan variable records the number + * of pages pageout should examine in its next pass; schedpaging sets this + * value based on the amount of currently available memory. In addtition, the + * nscan variable records the number of pages pageout has examined in its + * current pass; schedpaging resets this value to zero each time it runs.   
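The recalculation path above sizes scan regions at 64GB (or twice handspreadpages if that is larger), counts how many such regions cover total_pages, and caps des_page_scanners at MAX_PSCAN_THREADS. A small sketch of that sizing, assuming 4K pages and illustrative names:

#include <stdio.h>
#include <stdint.h>

#define	MAX_PSCAN_THREADS	16
#define	PAGESIZE		4096ULL
#define	btop(bytes)		((bytes) / PAGESIZE)

static unsigned
desired_scanners(uint64_t total_pages, uint64_t handspreadpages)
{
	uint64_t sz = btop(64ULL * 0x40000000ULL);	/* 64GB, in pages */
	uint64_t tmp;
	unsigned i;

	if (sz < handspreadpages)
		sz = handspreadpages << 1;	/* keep regions non-overlapping */
	if (sz > total_pages)
		sz = total_pages;

	for (i = 1, tmp = sz; tmp < total_pages; i++)
		tmp += sz;

	return (i > MAX_PSCAN_THREADS ? MAX_PSCAN_THREADS : i);
}

int
main(void)
{
	/* e.g. 256 GB of 4K pages with a 2 GB hand spread -> 4 threads */
	uint64_t total = btop(256ULL * 0x40000000ULL);
	uint64_t spread = btop(2ULL * 0x40000000ULL);

	printf("%u scanner threads\n", desired_scanners(total, spread));
	return (0);
}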
*/ -#define	RATETOSCHEDPAGING	4		/* hz that is */ +#define	RATETOSCHEDPAGING	4		/* times/second */ -static kmutex_t	pageout_mutex;	/* held while pageout or schedpaging running */ +/* held while pageout_scanner or schedpaging are modifying shared data */ +static kmutex_t	pageout_mutex;  /*   * Pool of available async pageout putpage requests. @@ -506,7 +594,7 @@ static kcondvar_t push_cv;  static int async_list_size = 256;	/* number of async request structs */ -static void pageout_scanner(void); +static void pageout_scanner(void *);  /*   * If a page is being shared more than "po_share" times @@ -535,67 +623,153 @@ schedpaging(void *arg)  	if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))  		kcage_cageout_wakeup(); -	if (mutex_tryenter(&pageout_mutex)) { -		/* pageout() not running */ -		nscan = 0; -		vavail = freemem - deficit; -		if (pageout_new_spread != 0) -			vavail -= needfree; -		if (vavail < 0) -			vavail = 0; -		if (vavail > lotsfree) -			vavail = lotsfree; +	(void) atomic_swap_ulong(&nscan, 0); +	vavail = freemem - deficit; +	if (pageout_new_spread != 0) +		vavail -= needfree; +	if (vavail < 0) +		vavail = 0; +	if (vavail > lotsfree) +		vavail = lotsfree; +	/* +	 * Fix for 1161438 (CRS SPR# 73922).  All variables +	 * in the original calculation for desscan were 32 bit signed +	 * ints.  As freemem approaches 0x0 on a system with 1 Gig or +	 * more of memory, the calculation can overflow.  When this +	 * happens, desscan becomes negative and pageout_scanner() +	 * stops paging out. +	 */ +	if ((needfree) && (pageout_new_spread == 0)) {  		/* -		 * Fix for 1161438 (CRS SPR# 73922).  All variables -		 * in the original calculation for desscan were 32 bit signed -		 * ints.  As freemem approaches 0x0 on a system with 1 Gig or -		 * more of memory, the calculation can overflow.  When this -		 * happens, desscan becomes negative and pageout_scanner() -		 * stops paging out. +		 * If we've not yet collected enough samples to +		 * calculate a spread, kick into high gear anytime +		 * needfree is non-zero. Note that desscan will not be +		 * the limiting factor for systems with larger memory; +		 * the %CPU will limit the scan. That will also be +		 * maxed out below.  		 */ -		if ((needfree) && (pageout_new_spread == 0)) { -			/* -			 * If we've not yet collected enough samples to -			 * calculate a spread, use the old logic of kicking -			 * into high gear anytime needfree is non-zero. -			 */ -			desscan = fastscan / RATETOSCHEDPAGING; -		} else { -			/* -			 * Once we've calculated a spread based on system -			 * memory and usage, just treat needfree as another -			 * form of deficit. -			 */ -			spgcnt_t faststmp, slowstmp, result; +		desscan = fastscan / RATETOSCHEDPAGING; +	} else { +		/* +		 * Once we've calculated a spread based on system +		 * memory and usage, just treat needfree as another +		 * form of deficit. +		 */ +		spgcnt_t faststmp, slowstmp, result; + +		slowstmp = slowscan * vavail; +		faststmp = fastscan * (lotsfree - vavail); +		result = (slowstmp + faststmp) / +		    nz(lotsfree) / RATETOSCHEDPAGING; +		desscan = (pgcnt_t)result; +	} + +	/* +	 * If we've not yet collected enough samples to calculate a +	 * spread, also kick %CPU to the max. 
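schedpaging() interpolates desscan between slowscan and fastscan according to vavail (free memory relative to lotsfree) and divides by RATETOSCHEDPAGING to get a per-wakeup page budget; at vavail == lotsfree this lands on the 25-pages-per-wakeup figure mentioned in the comment. A minimal arithmetic sketch with illustrative values:

#include <stdio.h>

#define	RATETOSCHEDPAGING	4
#define	nz(x)	((x) != 0 ? (x) : 1)	/* guard against divide-by-zero */

/* Interpolate the per-wakeup page budget between slowscan and fastscan. */
static long long
calc_desscan(long long slowscan, long long fastscan, long long lotsfree,
    long long vavail)
{
	long long slowstmp, faststmp;

	if (vavail < 0)
		vavail = 0;
	if (vavail > lotsfree)
		vavail = lotsfree;

	slowstmp = slowscan * vavail;
	faststmp = fastscan * (lotsfree - vavail);
	return ((slowstmp + faststmp) / nz(lotsfree) / RATETOSCHEDPAGING);
}

int
main(void)
{
	long long slowscan = 100, fastscan = 400000, lotsfree = 65536;

	printf("plenty free:  %lld pages/wakeup\n",
	    calc_desscan(slowscan, fastscan, lotsfree, lotsfree));
	printf("nothing free: %lld pages/wakeup\n",
	    calc_desscan(slowscan, fastscan, lotsfree, 0));
	return (0);
}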
+	 */ +	if (pageout_new_spread == 0) { +		pageout_ticks = max_pageout_ticks; +	} else { +		pageout_ticks = min_pageout_ticks + +		    (lotsfree - vavail) * +		    (max_pageout_ticks - min_pageout_ticks) / +		    nz(lotsfree); +	} -			slowstmp = slowscan * vavail; -			faststmp = fastscan * (lotsfree - vavail); -			result = (slowstmp + faststmp) / -			    nz(lotsfree) / RATETOSCHEDPAGING; -			desscan = (pgcnt_t)result; +	if (pageout_new_spread != 0 && des_page_scanners != n_page_scanners) { +		/* +		 * We have finished the pagescan initialization and the desired +		 * number of page scanners has changed, either because +		 * initialization just finished, because of a memory DR, or +		 * because des_page_scanners has been modified on the fly (i.e. +		 * by mdb). If we need more scanners, start them now, otherwise +		 * the excess scanners will terminate on their own when they +		 * reset their hands. +		 */ +		uint_t i; +		uint_t curr_nscan = n_page_scanners; +		pgcnt_t max = total_pages / handspreadpages; + +		if (des_page_scanners > max) +			des_page_scanners = max; + +		if (des_page_scanners > MAX_PSCAN_THREADS) { +			des_page_scanners = MAX_PSCAN_THREADS; +		} else if (des_page_scanners == 0) { +			des_page_scanners = 1;  		} -		pageout_ticks = min_pageout_ticks + (lotsfree - vavail) * -		    (max_pageout_ticks - min_pageout_ticks) / nz(lotsfree); +		/* +		 * Each thread has its own entry in the reset_hands array, so +		 * we don't need any locking in pageout_scanner to check the +		 * thread's reset_hands entry. Thus, we use a pre-allocated +		 * fixed size reset_hands array and upper limit on the number +		 * of pagescan threads. +		 * +		 * The reset_hands entries need to be true before we start new +		 * scanners, but if we're reducing, we don't want a race on the +		 * recalculation for the existing threads, so we set +		 * n_page_scanners first. +		 */ +		n_page_scanners = des_page_scanners; +		for (i = 0; i < MAX_PSCAN_THREADS; i++) { +			reset_hands[i] = B_TRUE; +		} -		if (freemem < lotsfree + needfree || -		    pageout_sample_cnt < pageout_sample_lim) { -			TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, -			    "pageout_cv_signal:freemem %ld", freemem); -			cv_signal(&proc_pageout->p_cv); -		} else { -			/* -			 * There are enough free pages, no need to -			 * kick the scanner thread.  And next time -			 * around, keep more of the `highly shared' -			 * pages. -			 */ -			cv_signal_pageout(); -			if (po_share > MIN_PO_SHARE) { -				po_share >>= 1; +		if (des_page_scanners > curr_nscan) { +			/* Create additional pageout scanner threads. */ +			for (i = curr_nscan; i < des_page_scanners; i++) { +				(void) lwp_kernel_create(proc_pageout, +				    pageout_scanner, (void *)(uintptr_t)i, +				    TS_RUN, curthread->t_pri);  			}  		} +	} + +	zones_over = B_FALSE; + +	if (freemem < lotsfree + needfree || PAGE_SCAN_STARTUP) { +		if (!PAGE_SCAN_STARTUP) +			low_mem_scan++; +		DTRACE_PROBE(schedpage__wake__low); +		WAKE_PAGEOUT_SCANNER(); + +	} else if (zone_num_over_cap > 0) { +		/* One or more zones are over their cap. */ + +		/* No page limit */ +		desscan = total_pages; + +		/* +		 * Increase the scanning CPU% to the max. This implies +		 * 80% of one CPU/sec if the scanner can run each +		 * opportunity. Can also be tuned via setting +		 * zone_pageout_ticks in /etc/system or with mdb. +		 */ +		pageout_ticks = (zone_pageout_ticks != 0) ? 
+		    zone_pageout_ticks : max_pageout_ticks; + +		zones_over = B_TRUE; +		zone_cap_scan++; + +		DTRACE_PROBE(schedpage__wake__zone); +		WAKE_PAGEOUT_SCANNER(); + +	} else { +		/* +		 * There are enough free pages, no need to +		 * kick the scanner thread.  And next time +		 * around, keep more of the `highly shared' +		 * pages. +		 */ +		cv_signal_pageout(); + +		mutex_enter(&pageout_mutex); +		if (po_share > MIN_PO_SHARE) { +			po_share >>= 1; +		}  		mutex_exit(&pageout_mutex);  	} @@ -617,36 +791,46 @@ ulong_t		push_list_size;		/* # of requests on pageout queue */  #define	FRONT	1  #define	BACK	2 -int dopageout = 1;	/* must be non-zero to turn page stealing on */ +int dopageout = 1;	/* /etc/system tunable to disable page reclamation */  /*   * The page out daemon, which runs as process 2.   * - * As long as there are at least lotsfree pages, - * this process is not run.  When the number of free - * pages stays in the range desfree to lotsfree, - * this daemon runs through the pages in the loop - * at a rate determined in schedpaging().  Pageout manages - * two hands on the clock.  The front hand moves through - * memory, clearing the reference bit, - * and stealing pages from procs that are over maxrss. - * The back hand travels a distance behind the front hand, - * freeing the pages that have not been referenced in the time - * since the front hand passed.  If modified, they are pushed to - * swap before being freed. + * Page out occurs when either: + * a) there is less than lotsfree pages, + * b) there are one or more zones over their physical memory cap. + * + * The daemon treats physical memory as a circular array of pages and scans the + * pages using a 'two-handed clock' algorithm. The front hand moves through + * the pages, clearing the reference bit. The back hand travels a distance + * (handspreadpages) behind the front hand, freeing the pages that have not + * been referenced in the time since the front hand passed. If modified, they + * are first written to their backing store before being freed. + * + * In order to make page invalidation more responsive on machines with larger + * memory, multiple pageout_scanner threads may be created. In this case, the + * threads are evenly distributed around the the memory "clock face" so that + * memory can be reclaimed more quickly (that is, there can be large regions in + * which no pages can be reclaimed by a single thread, leading to lag which + * causes undesirable behavior such as htable stealing). + * + * As long as there are at least lotsfree pages, or no zones over their cap, + * then pageout_scanner threads are not run. When pageout_scanner threads are + * running for case (a), all pages are considered for pageout. For case (b), + * only pages belonging to a zone over its cap will be considered for pageout.   * - * There are 2 threads that act on behalf of the pageout process. - * One thread scans pages (pageout_scanner) and frees them up if + * There are multiple threads that act on behalf of the pageout process. + * A set of threads scan pages (pageout_scanner) and frees them up if   * they don't require any VOP_PUTPAGE operation. If a page must be   * written back to its backing store, the request is put on a list   * and the other (pageout) thread is signaled. The pageout thread   * grabs VOP_PUTPAGE requests from the list, and processes them.   * Some filesystems may require resources for the VOP_PUTPAGE   * operations (like memory) and hence can block the pageout - * thread, but the scanner thread can still operate. 
There is still + * thread, but the pageout_scanner threads can still operate. There is still   * no guarantee that memory deadlocks cannot occur.   * - * For now, this thing is in very rough form. + * The pageout_scanner parameters are determined in schedpaging().   */  void  pageout() @@ -684,9 +868,9 @@ pageout()  	pageout_pri = curthread->t_pri; -	/* Create the pageout scanner thread. */ -	(void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN, -	    pageout_pri - 1); +	/* Create the (first) pageout scanner thread. */ +	(void) lwp_kernel_create(proc_pageout, pageout_scanner, (void *) 0, +	    TS_RUN, pageout_pri - 1);  	/*  	 * kick off pageout scheduler. @@ -720,6 +904,7 @@ pageout()  		arg->a_next = NULL;  		mutex_exit(&push_lock); +		DTRACE_PROBE(pageout__push);  		if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,  		    arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {  			pushes++; @@ -740,32 +925,24 @@ pageout()   * Kernel thread that scans pages looking for ones to free   */  static void -pageout_scanner(void) +pageout_scanner(void *a)  {  	struct page *fronthand, *backhand; -	uint_t count; +	uint_t count, iter = 0;  	callb_cpr_t cprinfo; -	pgcnt_t	nscan_limit; +	pgcnt_t	nscan_cnt, nscan_limit;  	pgcnt_t	pcount; +	uint_t inst = (uint_t)(uintptr_t)a; +	hrtime_t sample_start, sample_end; +	clock_t pageout_lbolt; +	kmutex_t pscan_mutex; -	CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan"); -	mutex_enter(&pageout_mutex); +	VERIFY3U(inst, <, MAX_PSCAN_THREADS); -	/* -	 * The restart case does not attempt to point the hands at roughly -	 * the right point on the assumption that after one circuit things -	 * will have settled down - and restarts shouldn't be that often. -	 */ +	mutex_init(&pscan_mutex, NULL, MUTEX_DEFAULT, NULL); -	/* -	 * Set the two clock hands to be separated by a reasonable amount, -	 * but no more than 360 degrees apart. -	 */ -	backhand = page_first(); -	if (handspreadpages >= total_pages) -		fronthand = page_nextn(backhand, total_pages - 1); -	else -		fronthand = page_nextn(backhand, handspreadpages); +	CALLB_CPR_INIT(&cprinfo, &pscan_mutex, callb_generic_cpr, "poscan"); +	mutex_enter(&pscan_mutex);  	min_pageout_ticks = MAX(1,  	    ((hz * min_percent_cpu) / 100) / RATETOSCHEDPAGING); @@ -776,71 +953,116 @@ loop:  	cv_signal_pageout();  	CALLB_CPR_SAFE_BEGIN(&cprinfo); -	cv_wait(&proc_pageout->p_cv, &pageout_mutex); -	CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex); +	cv_wait(&proc_pageout->p_cv, &pscan_mutex); +	CALLB_CPR_SAFE_END(&cprinfo, &pscan_mutex);  	if (!dopageout)  		goto loop; -	if (reset_hands) { -		reset_hands = 0; +	if (reset_hands[inst]) { +		struct page *first; +		pgcnt_t offset = total_pages / n_page_scanners; -		backhand = page_first(); -		if (handspreadpages >= total_pages) +		reset_hands[inst] = B_FALSE; +		if (inst >= n_page_scanners) { +			/* +			 * The desired number of page scanners has been +			 * reduced and this instance is no longer wanted. +			 * Exit the lwp. +			 */ +			VERIFY3U(inst, !=, 0); +			mutex_exit(&pscan_mutex); +			mutex_enter(&curproc->p_lock); +			lwp_exit(); +		} + +		/* +		 * The reset case repositions the hands at the proper place +		 * on the memory clock face to prevent creep into another +		 * thread's active region or when the number of threads has +		 * changed. +		 * +		 * Set the two clock hands to be separated by a reasonable +		 * amount, but no more than 360 degrees apart. 
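A minimal user-space sketch of the hand placement this comment describes and the hunk just below implements. Pages are modeled as plain indices and page_first()/page_nextn() as modular arithmetic, so the sizes and the output are illustrative assumptions, not kernel values.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* Example values only; the kernel derives these at boot. */
	uint64_t total_pages = 1048576;		/* 4 GB of 4K pages */
	uint64_t handspreadpages = 131072;
	uint32_t n_page_scanners = 4;
	uint64_t offset = total_pages / n_page_scanners;

	for (uint32_t inst = 0; inst < n_page_scanners; inst++) {
		/* Spread each instance's backhand evenly around the face. */
		uint64_t backhand = (offset * inst) % total_pages;
		/* Front hand leads by handspreadpages, capped at one lap. */
		uint64_t spread = (handspreadpages >= total_pages) ?
		    total_pages - 1 : handspreadpages;
		uint64_t fronthand = (backhand + spread) % total_pages;

		(void) printf("scanner %u: backhand=%llu fronthand=%llu\n",
		    inst, (unsigned long long)backhand,
		    (unsigned long long)fronthand);
	}
	return (0);
}

With four scanners over 1,048,576 pages the backhands land at 0, 262144, 524288 and 786432, which is the even spacing around the "clock face" that the comment calls for.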
+		 * +		 * If inst == 0, backhand starts at first page, otherwise +		 * it is (inst * offset) around the memory "clock face" so that +		 * we spread out each scanner instance evenly. +		 */ +		first = page_first(); +		backhand = page_nextn(first, offset * inst); +		if (handspreadpages >= total_pages) {  			fronthand = page_nextn(backhand, total_pages - 1); -		else +		} else {  			fronthand = page_nextn(backhand, handspreadpages); +		}  	} +	/* +	 * This CPU kstat is only incremented here and we're obviously on this +	 * CPU, so no lock. +	 */  	CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);  	count = 0; -	TRACE_4(TR_FAC_VM, TR_PAGEOUT_START, -	    "pageout_start:freemem %ld lotsfree %ld nscan %ld desscan %ld", -	    freemem, lotsfree, nscan, desscan); -  	/* Kernel probe */  	TNF_PROBE_2(pageout_scan_start, "vm pagedaemon", /* CSTYLED */,  	    tnf_ulong, pages_free, freemem, tnf_ulong, pages_needed, needfree);  	pcount = 0; -	if (pageout_sample_cnt < pageout_sample_lim) { +	nscan_cnt = 0; +	if (PAGE_SCAN_STARTUP) {  		nscan_limit = total_pages;  	} else {  		nscan_limit = desscan;  	} + +	DTRACE_PROBE4(pageout__start, pgcnt_t, nscan_limit, uint_t, inst, +	    page_t *, backhand, page_t *, fronthand); +  	pageout_lbolt = ddi_get_lbolt();  	sample_start = gethrtime();  	/*  	 * Scan the appropriate number of pages for a single duty cycle. -	 * However, stop scanning as soon as there is enough free memory. -	 * For a short while, we will be sampling the performance of the -	 * scanner and need to keep running just to get sample data, in -	 * which case we keep going and don't pay attention to whether -	 * or not there is enough free memory. +	 * Only scan while at least one of these is true: +	 * 1) one or more zones is over its cap +	 * 2) there is not enough free memory +	 * 3) during page scan startup when determining sample data  	 */ - -	while (nscan < nscan_limit && (freemem < lotsfree + needfree || -	    pageout_sample_cnt < pageout_sample_lim)) { +	while (nscan_cnt < nscan_limit && +	    (zones_over || +	    freemem < lotsfree + needfree || +	    PAGE_SCAN_STARTUP)) {  		int rvfront, rvback; +		DTRACE_PROBE2(pageout__loop, pgcnt_t, pcount, uint_t, inst); +  		/*  		 * Check to see if we have exceeded our %CPU budget  		 * for this wakeup, but not on every single page visited,  		 * just every once in a while.  		 */  		if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) { +			clock_t pageout_cycle_ticks; +  			pageout_cycle_ticks = ddi_get_lbolt() - pageout_lbolt;  			if (pageout_cycle_ticks >= pageout_ticks) { -				++pageout_timeouts; +				/* +				 * This is where we normally break out of the +				 * loop when scanning zones or sampling. +				 */ +				if (!zones_over) { +					atomic_inc_64(&pageout_timeouts); +				} +				DTRACE_PROBE1(pageout__timeout, uint_t, inst);  				break;  			}  		}  		/*  		 * If checkpage manages to add a page to the free list, -		 * we give ourselves another couple of trips around the loop. +		 * we give ourselves another couple of trips around memory.  		 */  		if ((rvfront = checkpage(fronthand, FRONT)) == 1)  			count = 0; @@ -850,7 +1072,8 @@ loop:  		++pcount;  		/* -		 * protected by pageout_mutex instead of cpu_stat_lock +		 * This CPU kstat is only incremented here and we're obviously +		 * on this CPU, so no lock.  		 */  		CPU_STATS_ADDQ(CPU, vm, scan, 1); @@ -858,7 +1081,7 @@ loop:  		 * Don't include ineligible pages in the number scanned.  		 
*/  		if (rvfront != -1 || rvback != -1) -			nscan++; +			nscan_cnt++;  		backhand = page_next(backhand); @@ -868,56 +1091,89 @@ loop:  		 */  		if ((fronthand = page_next(fronthand)) == page_first())	{ -			TRACE_2(TR_FAC_VM, TR_PAGEOUT_HAND_WRAP, -			    "pageout_hand_wrap:freemem %ld whichhand %d", -			    freemem, FRONT); +			DTRACE_PROBE1(pageout__wrap__front, uint_t, inst);  			/* -			 * protected by pageout_mutex instead of cpu_stat_lock +			 * Every 64 wraps we reposition our hands within our +			 * region to prevent creep into another thread. +			 */ +			if ((++iter % pageout_reset_cnt) == 0) +				reset_hands[inst] = B_TRUE; + +			/* +			 * This CPU kstat is only incremented here and we're +			 * obviously on this CPU, so no lock.  			 */  			CPU_STATS_ADDQ(CPU, vm, rev, 1); -			if (++count > 1) { + +			/* +			 * If scanning because the system is low on memory, +			 * then when we wraparound memory we want to try to +			 * reclaim more pages. +			 * If scanning only because zones are over their cap, +			 * then wrapping is common and we simply keep going. +			 */ +			if (freemem < lotsfree + needfree && ++count > 1) {  				/* +				 * The system is low on memory.  				 * Extremely unlikely, but it happens. -				 * We went around the loop at least once -				 * and didn't get far enough. +				 * We went around memory at least once +				 * and didn't reclaim enough.  				 * If we are still skipping `highly shared'  				 * pages, skip fewer of them.  Otherwise,  				 * give up till the next clock tick.  				 */ +				mutex_enter(&pageout_mutex);  				if (po_share < MAX_PO_SHARE) {  					po_share <<= 1; +					mutex_exit(&pageout_mutex);  				} else {  					/* -					 * Really a "goto loop", but -					 * if someone is TRACing or -					 * TNF_PROBE_ing, at least -					 * make records to show -					 * where we are. +					 * Really a "goto loop", but if someone +					 * is tracing or TNF_PROBE_ing, hit +					 * those probes first.  					 */ +					mutex_exit(&pageout_mutex);  					break;  				}  			}  		}  	} +	atomic_add_long(&nscan, nscan_cnt); +  	sample_end = gethrtime(); -	TRACE_5(TR_FAC_VM, TR_PAGEOUT_END, -	    "pageout_end:freemem %ld lots %ld nscan %ld des %ld count %u", -	    freemem, lotsfree, nscan, desscan, count); +	DTRACE_PROBE3(pageout__loop__end, pgcnt_t, nscan_cnt, pgcnt_t, pcount, +	    uint_t, inst);  	/* Kernel probe */  	TNF_PROBE_2(pageout_scan_end, "vm pagedaemon", /* CSTYLED */, -	    tnf_ulong, pages_scanned, nscan, tnf_ulong, pages_free, freemem); +	    tnf_ulong, pages_scanned, nscan_cnt, tnf_ulong, pages_free, +	    freemem); -	if (pageout_sample_cnt < pageout_sample_lim) { +	/* +	 * The following two blocks are only relevant when the scanner is +	 * first started up. After the scanner runs for a while, neither of +	 * the conditions will ever be true again. +	 * +	 * The global variables used below are only modified by this thread and +	 * only during initial scanning when there is a single page scanner +	 * thread running. Thus, we don't use any locking. +	 */ +	if (PAGE_SCAN_STARTUP) { +		VERIFY3U(inst, ==, 0);  		pageout_sample_pages += pcount;  		pageout_sample_etime += sample_end - sample_start;  		++pageout_sample_cnt; -	} -	if (pageout_sample_cnt >= pageout_sample_lim && -	    pageout_new_spread == 0) { + +	} else if (pageout_new_spread == 0) { +		uint_t i; + +		/* +		 * We have run enough samples, set the spread. 
+		 */ +		VERIFY3U(inst, ==, 0);  		pageout_rate = (hrrate_t)pageout_sample_pages *  		    (hrrate_t)(NANOSEC) / pageout_sample_etime;  		pageout_new_spread = pageout_rate / 10; @@ -931,9 +1187,8 @@ loop:   * Look at the page at hand.  If it is locked (e.g., for physical i/o),   * system (u., page table) or free, then leave it alone.  Otherwise,   * if we are running the front hand, turn off the page's reference bit. - * If the proc is over maxrss, we take it.  If running the back hand, - * check whether the page has been reclaimed.  If not, free the page, - * pushing it to disk first if necessary. + * If running the back hand, check whether the page has been reclaimed. + * If not, free the page, pushing it to disk first if necessary.   *   * Return values:   *	-1 if the page is not a candidate at all, @@ -947,6 +1202,7 @@ checkpage(struct page *pp, int whichhand)  	int isfs = 0;  	int isexec = 0;  	int pagesync_flag; +	zoneid_t zid = ALL_ZONES;  	/*  	 * Skip pages: @@ -989,6 +1245,21 @@ checkpage(struct page *pp, int whichhand)  		return (-1);  	} +	if (zones_over) { +		ASSERT(pp->p_zoneid == ALL_ZONES || +		    pp->p_zoneid >= 0 && pp->p_zoneid <= MAX_ZONEID); +		if (pp->p_zoneid == ALL_ZONES || +		    zone_pdata[pp->p_zoneid].zpers_over == 0) { +			/* +			 * Cross-zone shared page, or zone not over it's cap. +			 * Leave the page alone. +			 */ +			page_unlock(pp); +			return (-1); +		} +		zid = pp->p_zoneid; +	} +  	/*  	 * Maintain statistics for what we are freeing  	 */ @@ -1016,31 +1287,24 @@ checkpage(struct page *pp, int whichhand)  recheck:  	/* -	 * If page is referenced; make unreferenced but reclaimable. -	 * If this page is not referenced, then it must be reclaimable -	 * and we can add it to the free list. +	 * If page is referenced; fronthand makes unreferenced and reclaimable. +	 * For the backhand, a process referenced the page since the front hand +	 * went by, so it's not a candidate for freeing up.  	 */  	if (ppattr & P_REF) { -		TRACE_2(TR_FAC_VM, TR_PAGEOUT_ISREF, -		    "pageout_isref:pp %p whichhand %d", pp, whichhand); +		DTRACE_PROBE2(pageout__isref, page_t *, pp, int, whichhand);  		if (whichhand == FRONT) { -			/* -			 * Checking of rss or madvise flags needed here... -			 * -			 * If not "well-behaved", fall through into the code -			 * for not referenced. -			 */  			hat_clrref(pp);  		} -		/* -		 * Somebody referenced the page since the front -		 * hand went by, so it's not a candidate for -		 * freeing up. -		 */  		page_unlock(pp);  		return (0);  	} +	/* +	 * This page is not referenced, so it must be reclaimable and we can +	 * add it to the free list. This can be done by either hand. +	 */ +  	VM_STAT_ADD(pageoutvmstats.checkpage[0]);  	/* @@ -1073,8 +1337,9 @@ recheck:  		u_offset_t offset = pp->p_offset;  		/* -		 * XXX - Test for process being swapped out or about to exit? -		 * [Can't get back to process(es) using the page.] +		 * Note: There is no possibility to test for process being +		 * swapped out or about to exit since we can't get back to +		 * process(es) from the page.  		 */  		/* @@ -1092,6 +1357,11 @@ recheck:  			VN_RELE(vp);  			return (0);  		} +		if (isfs) { +			zone_pageout_stat(zid, ZPO_DIRTY); +		} else { +			zone_pageout_stat(zid, ZPO_ANONDIRTY); +		}  		return (1);  	} @@ -1102,8 +1372,7 @@ recheck:  	 * the pagesync but before it was unloaded we catch it  	 * and handle the page properly.  	 
*/ -	TRACE_2(TR_FAC_VM, TR_PAGEOUT_FREE, -	    "pageout_free:pp %p whichhand %d", pp, whichhand); +	DTRACE_PROBE2(pageout__free, page_t *, pp, int, whichhand);  	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);  	ppattr = hat_page_getattr(pp, P_MOD | P_REF);  	if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode)) @@ -1120,8 +1389,10 @@ recheck:  		} else {  			CPU_STATS_ADD_K(vm, fsfree, 1);  		} +		zone_pageout_stat(zid, ZPO_FS);  	} else {  		CPU_STATS_ADD_K(vm, anonfree, 1); +		zone_pageout_stat(zid, ZPO_ANON);  	}  	return (1);		/* freed a page! */ diff --git a/usr/src/uts/common/os/vmem.c b/usr/src/uts/common/os/vmem.c index c177ecfd75..ad35fd7187 100644 --- a/usr/src/uts/common/os/vmem.c +++ b/usr/src/uts/common/os/vmem.c @@ -1627,7 +1627,7 @@ vmem_destroy(vmem_t *vmp)  	leaked = vmem_size(vmp, VMEM_ALLOC);  	if (leaked != 0) -		cmn_err(CE_WARN, "vmem_destroy('%s'): leaked %lu %s", +		cmn_err(CE_WARN, "!vmem_destroy('%s'): leaked %lu %s",  		    vmp->vm_name, leaked, (vmp->vm_cflags & VMC_IDENTIFIER) ?  		    "identifiers" : "bytes"); diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c index c759f7e010..1db130797c 100644 --- a/usr/src/uts/common/os/zone.c +++ b/usr/src/uts/common/os/zone.c @@ -21,7 +21,7 @@  /*   * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015, Joyent Inc. All rights reserved. + * Copyright (c) 2019, Joyent, Inc.   * Copyright (c) 2016 by Delphix. All rights reserved.   * Copyright 2018 OmniOS Community Edition (OmniOSce) Association.   */ @@ -106,14 +106,16 @@   *   removed from the list of active zones.  zone_destroy() returns, and   *   the zone can be recreated.   * - *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor - *   callbacks are executed, and all memory associated with the zone is - *   freed. + *   ZONE_IS_FREE (internal state): All references have been dropped and + *   the zone_t is no longer in the zone_active nor zone_deathrow lists. + *   The zone_t is in the process of being freed.  This state exists + *   only for publishing a sysevent to indicate that the zone by this + *   name can be booted again.   * - *   Threads can wait for the zone to enter a requested state by using - *   zone_status_wait() or zone_status_timedwait() with the desired - *   state passed in as an argument.  Zone state transitions are - *   uni-directional; it is not possible to move back to an earlier state. + *   Threads can wait for the zone to enter a requested state (other than + *   ZONE_IS_FREE) by using zone_status_wait() or zone_status_timedwait() + *   with the desired state passed in as an argument.  Zone state transitions + *   are uni-directional; it is not possible to move back to an earlier state.   *   *   *   Zone-Specific Data: @@ -252,6 +254,8 @@  #include <sys/cpucaps.h>  #include <vm/seg.h>  #include <sys/mac.h> +#include <sys/rt.h> +#include <sys/fx.h>  /*   * This constant specifies the number of seconds that threads waiting for @@ -312,6 +316,7 @@ static id_space_t *zoneid_space;   * 'global_zone'.   */  zone_t zone0; +zone_zfs_io_t zone0_zp_zfs;  zone_t *global_zone = NULL;	/* Set when the global zone is initialized */  /* @@ -327,8 +332,8 @@ static list_t zone_active;  static list_t zone_deathrow;  static kmutex_t zone_deathrow_lock; -/* number of zones is limited by virtual interface limit in IP */ -uint_t maxzones = 8192; +/* This can be dynamically reduced if various subsystems hit internal limits. 
*/ +uint_t maxzones = MAX_ZONES;  /* Event channel to sent zone state change notifications */  evchan_t *zone_event_chan; @@ -350,6 +355,7 @@ const char  *zone_status_table[] = {  	ZONE_EVENT_SHUTTING_DOWN,	/* down */  	ZONE_EVENT_SHUTTING_DOWN,	/* dying */  	ZONE_EVENT_UNINITIALIZED,	/* dead */ +	ZONE_EVENT_FREE,		/* free */  };  /* @@ -372,8 +378,12 @@ static char *zone_ref_subsys_names[] = {  rctl_hndl_t rc_zone_cpu_shares;  rctl_hndl_t rc_zone_locked_mem;  rctl_hndl_t rc_zone_max_swap; +rctl_hndl_t rc_zone_phys_mem;  rctl_hndl_t rc_zone_max_lofi;  rctl_hndl_t rc_zone_cpu_cap; +rctl_hndl_t rc_zone_cpu_baseline; +rctl_hndl_t rc_zone_cpu_burst_time; +rctl_hndl_t rc_zone_zfs_io_pri;  rctl_hndl_t rc_zone_nlwps;  rctl_hndl_t rc_zone_nprocs;  rctl_hndl_t rc_zone_shmmax; @@ -389,6 +399,7 @@ static int zone_remove_datalink(zoneid_t, datalink_id_t);  static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);  static int zone_set_network(zoneid_t, zone_net_data_t *);  static int zone_get_network(zoneid_t, zone_net_data_t *); +static void zone_status_set(zone_t *, zone_status_t);  typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t); @@ -419,8 +430,72 @@ static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,   * Version 5 alters the zone_boot system call, and converts its old   *     bootargs parameter to be set by the zone_setattr API instead.   * Version 6 adds the flag argument to zone_create. + * Version 7 adds the requested zoneid to zone_create.   */ -static const int ZONE_SYSCALL_API_VERSION = 6; +static const int ZONE_SYSCALL_API_VERSION = 7; + +/* + * "zone_pdata" is an array indexed by zoneid. It is used to store "persistent" + * data which can be referenced independently of the zone_t structure. This + * data falls into two categories; + *   1) pages and RSS data associated with processes inside a zone + *   2) in-flight ZFS I/O data + * + * Each member of zone_persist_t stores the zone's current page usage, its page + * limit, a flag indicating if the zone is over its physical memory cap and + * various page-related statistics. The zpers_over flag is the interface for + * the page scanner to use when reclaiming pages for zones that are over their + * cap. The zone_persist_t structure also includes a mutex and a reference to a + * zone_zfs_io_t structure used for tracking the zone's ZFS I/O data. + * + * All zone physical memory cap data is stored in this array instead of within + * the zone structure itself. This is because zone structures come and go, but + * paging-related work can be asynchronous to any particular zone. In, + * particular: + * 1) Page scanning to reclaim pages occurs from a kernel thread that is not + *    associated with any zone. + * 2) Freeing segkp pages can occur long after the zone which first + *    instantiated those pages has gone away. + * We want to be able to account for pages/zone without constantly having to + * take extra locks and finding the relevant zone structure, particularly during + * page scanning. + * + * The page scanner can run when "zone_num_over_cap" is non-zero. It can + * do a direct lookup of a zoneid into the "zone_pdata" array to determine + * if that zone is over its cap. + * + * There is no locking for the page scanner to perform these two checks. + * We cannot have the page scanner blocking normal paging activity for + * running processes. 
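A simplified user-space model of the accounting just described: cap crossings are detected with a recheck under a mutex, while the page scanner's query is a plain unlocked read of the global counter and the per-zone flag. The structure, names and sizes below are sketch assumptions that loosely mirror the patch; they are not the kernel implementation.

#include <pthread.h>
#include <stdint.h>
#include <stdbool.h>

#define	N_ZONES	64			/* sketch-sized, not MAX_ZONES */

typedef struct {
	uint32_t pg_cnt;		/* resident pages */
	uint32_t pg_limit;		/* UINT32_MAX means unlimited */
	uint8_t  over;			/* read without locks by the scanner */
} zone_mem_sketch_t;

static zone_mem_sketch_t zmem[N_ZONES];
static uint32_t num_over_cap;
static pthread_mutex_t physcap_lock = PTHREAD_MUTEX_INITIALIZER;

/* Called after pages are added to a zone; cheap unless a cap is crossed. */
static void
mark_over_if_capped(int zid)
{
	zone_mem_sketch_t *z = &zmem[zid];

	if (z->pg_cnt <= z->pg_limit || z->over == 1)
		return;

	pthread_mutex_lock(&physcap_lock);
	/* Recheck under the lock so the transition is counted exactly once. */
	if (z->pg_cnt > z->pg_limit && z->over == 0) {
		z->over = 1;
		num_over_cap++;
	}
	pthread_mutex_unlock(&physcap_lock);
}

/* The scanner's soft-cap test: no locks, stale reads are acceptable. */
static bool
scanner_should_reclaim(int zid)
{
	return (num_over_cap > 0 && zmem[zid].over == 1);
}

int
main(void)
{
	zmem[3].pg_limit = 100;
	zmem[3].pg_cnt = 150;		/* pretend 150 pages are resident */
	mark_over_if_capped(3);
	return (scanner_should_reclaim(3) ? 0 : 1);
}

Because the cap is soft, a momentarily stale read in scanner_should_reclaim() only delays or extends scanning by one pass, which is exactly the tradeoff the comment above is making.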
Because the physical memory cap is a soft cap, it is + * fine for the scanner to simply read the current state of the counter and + * the zone's zpers_over entry in the array. The scanner should never modify + * either of these items. Internally the entries and the counter are managed + * with the "zone_physcap_lock" mutex as we add/remove mappings to pages. We + * take care to ensure that we only take the zone_physcap_lock mutex when a + * zone is transitioning over/under its physical memory cap. + * + * The "zone_incr_capped" and "zone_decr_capped" functions are used to manage + * the "zone_pdata" array and associated counter. + * + * The zone_persist_t structure tracks the zone's physical cap and phyiscal + * usage in terms of pages. These values are currently defined as uint32. Thus, + * the maximum number of pages we can track is a UINT_MAX-1 (4,294,967,295) + * since UINT_MAX means the zone's RSS is unlimited. Assuming a 4k page size, a + * zone's maximum RSS is limited to 17.5 TB and twice that with an 8k page size. + * In the future we may need to expand these counters to 64-bit, but for now + * we're using 32-bit to conserve memory, since this array is statically + * allocated within the kernel based on the maximum number of zones supported. + * + * With respect to the zone_zfs_io_t referenced by the zone_persist_t, under + * a heavy I/O workload, the "zonehash_lock" would become extremely hot if we + * had to continuously find the zone structure associated with an I/O that has + * just completed. To avoid that overhead, we track the I/O data within the + * zone_zfs_io_t instead. We can directly access that data without having to + * lookup the full zone_t structure. + */ +uint_t zone_num_over_cap; +zone_persist_t zone_pdata[MAX_ZONES]; +static kmutex_t zone_physcap_lock;  /*   * Certain filesystems (such as NFS and autofs) need to know which zone @@ -1379,6 +1454,127 @@ static rctl_ops_t zone_cpu_cap_ops = {  /*ARGSUSED*/  static rctl_qty_t +zone_cpu_base_get(rctl_t *rctl, struct proc *p) +{ +	ASSERT(MUTEX_HELD(&p->p_lock)); +	return (cpucaps_zone_get_base(p->p_zone)); +} + +/* + * The zone cpu base is used to set the baseline CPU for the zone + * so we can track when the zone is bursting. + */ +/*ARGSUSED*/ +static int +zone_cpu_base_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, +    rctl_qty_t nv) +{ +	zone_t *zone = e->rcep_p.zone; + +	ASSERT(MUTEX_HELD(&p->p_lock)); +	ASSERT(e->rcep_t == RCENTITY_ZONE); + +	if (zone == NULL) +		return (0); + +	return (cpucaps_zone_set_base(zone, nv)); +} + +static rctl_ops_t zone_cpu_base_ops = { +	rcop_no_action, +	zone_cpu_base_get, +	zone_cpu_base_set, +	rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t +zone_cpu_burst_time_get(rctl_t *rctl, struct proc *p) +{ +	ASSERT(MUTEX_HELD(&p->p_lock)); +	return (cpucaps_zone_get_burst_time(p->p_zone)); +} + +/* + * The zone cpu burst time is used to set the amount of time CPU(s) can be + * bursting for the zone. + */ +/*ARGSUSED*/ +static int +zone_cpu_burst_time_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, +    rctl_qty_t nv) +{ +	zone_t *zone = e->rcep_p.zone; + +	ASSERT(MUTEX_HELD(&p->p_lock)); +	ASSERT(e->rcep_t == RCENTITY_ZONE); + +	if (zone == NULL) +		return (0); + +	return (cpucaps_zone_set_burst_time(zone, nv)); +} + +static rctl_ops_t zone_cpu_burst_time_ops = { +	rcop_no_action, +	zone_cpu_burst_time_get, +	zone_cpu_burst_time_set, +	rcop_no_test +}; + +/* + * zone.zfs-io-pri resource control support (IO priority). 
+ */ +/*ARGSUSED*/ +static rctl_qty_t +zone_zfs_io_pri_get(rctl_t *rctl, struct proc *p) +{ +	zone_persist_t *zp = &zone_pdata[p->p_zone->zone_id]; +	rctl_qty_t r = 0; + +	ASSERT(MUTEX_HELD(&p->p_lock)); +	mutex_enter(&zp->zpers_zfs_lock); +	if (zp->zpers_zfsp != NULL) +		r = (rctl_qty_t)zp->zpers_zfsp->zpers_zfs_io_pri; +	mutex_exit(&zp->zpers_zfs_lock); + +	return (r); +} + +/*ARGSUSED*/ +static int +zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, +    rctl_qty_t nv) +{ +	zone_t *zone = e->rcep_p.zone; +	zone_persist_t *zp; + +	ASSERT(MUTEX_HELD(&p->p_lock)); +	ASSERT(e->rcep_t == RCENTITY_ZONE); + +	if (zone == NULL) +		return (0); + +	/* +	 * set priority to the new value. +	 */ +	zp = &zone_pdata[zone->zone_id]; +	mutex_enter(&zp->zpers_zfs_lock); +	if (zp->zpers_zfsp != NULL) +		zp->zpers_zfsp->zpers_zfs_io_pri = (uint16_t)nv; +	mutex_exit(&zp->zpers_zfs_lock); +	return (0); +} + +static rctl_ops_t zone_zfs_io_pri_ops = { +	rcop_no_action, +	zone_zfs_io_pri_get, +	zone_zfs_io_pri_set, +	rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t  zone_lwps_usage(rctl_t *r, proc_t *p)  {  	rctl_qty_t nlwps; @@ -1705,6 +1901,57 @@ static rctl_ops_t zone_max_swap_ops = {  /*ARGSUSED*/  static rctl_qty_t +zone_phys_mem_usage(rctl_t *rctl, struct proc *p) +{ +	rctl_qty_t q; +	zone_persist_t *zp = &zone_pdata[p->p_zone->zone_id]; + +	ASSERT(MUTEX_HELD(&p->p_lock)); +	q = ptob(zp->zpers_pg_cnt); +	return (q); +} + +/*ARGSUSED*/ +static int +zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, +    rctl_qty_t nv) +{ +	zoneid_t zid; +	uint_t pg_val; + +	ASSERT(MUTEX_HELD(&p->p_lock)); +	ASSERT(e->rcep_t == RCENTITY_ZONE); +	if (e->rcep_p.zone == NULL) +		return (0); +	zid = e->rcep_p.zone->zone_id; +	if (nv == UINT64_MAX) { +		pg_val = UINT32_MAX; +	} else { +		uint64_t pages = btop(nv); + +		/* +		 * Return from RCTLOP_SET is always ignored so just clamp an +		 * out-of-range value to our largest "limited" value. 
+		 */ +		if (pages >= UINT32_MAX) { +			pg_val = UINT32_MAX - 1; +		} else { +			pg_val = (uint_t)pages; +		} +	} +	zone_pdata[zid].zpers_pg_limit = pg_val; +	return (0); +} + +static rctl_ops_t zone_phys_mem_ops = { +	rcop_no_action, +	zone_phys_mem_usage, +	zone_phys_mem_set, +	rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t  zone_max_lofi_usage(rctl_t *rctl, struct proc *p)  {  	rctl_qty_t q; @@ -1798,6 +2045,21 @@ zone_lockedmem_kstat_update(kstat_t *ksp, int rw)  }  static int +zone_physmem_kstat_update(kstat_t *ksp, int rw) +{ +	zone_t *zone = ksp->ks_private; +	zone_kstat_t *zk = ksp->ks_data; +	zone_persist_t *zp = &zone_pdata[zone->zone_id]; + +	if (rw == KSTAT_WRITE) +		return (EACCES); + +	zk->zk_usage.value.ui64 = ptob(zp->zpers_pg_cnt); +	zk->zk_value.value.ui64 = ptob(zp->zpers_pg_limit); +	return (0); +} + +static int  zone_nprocs_kstat_update(kstat_t *ksp, int rw)  {  	zone_t *zone = ksp->ks_private; @@ -1826,7 +2088,7 @@ zone_swapresv_kstat_update(kstat_t *ksp, int rw)  }  static kstat_t * -zone_kstat_create_common(zone_t *zone, char *name, +zone_rctl_kstat_create_common(zone_t *zone, char *name,      int (*updatefunc) (kstat_t *, int))  {  	kstat_t *ksp; @@ -1851,16 +2113,200 @@ zone_kstat_create_common(zone_t *zone, char *name,  	return (ksp);  } +static int +zone_vfs_kstat_update(kstat_t *ksp, int rw) +{ +	zone_t *zone = ksp->ks_private; +	zone_vfs_kstat_t *zvp = ksp->ks_data; +	kstat_io_t *kiop = &zone->zone_vfs_rwstats; + +	if (rw == KSTAT_WRITE) +		return (EACCES); + +	/* +	 * Extract the VFS statistics from the kstat_io_t structure used by +	 * kstat_runq_enter() and related functions.  Since the slow ops +	 * counters are updated directly by the VFS layer, there's no need to +	 * copy those statistics here. +	 * +	 * Note that kstat_runq_enter() and the related functions use +	 * gethrtime_unscaled(), so scale the time here. 
+	 */ +	zvp->zv_nread.value.ui64 = kiop->nread; +	zvp->zv_reads.value.ui64 = kiop->reads; +	zvp->zv_rtime.value.ui64 = kiop->rtime; +	zvp->zv_rcnt.value.ui64 = kiop->rcnt; +	zvp->zv_rlentime.value.ui64 = kiop->rlentime; +	zvp->zv_nwritten.value.ui64 = kiop->nwritten; +	zvp->zv_writes.value.ui64 = kiop->writes; +	zvp->zv_wtime.value.ui64 = kiop->wtime; +	zvp->zv_wcnt.value.ui64 = kiop->wcnt; +	zvp->zv_wlentime.value.ui64 = kiop->wlentime; + +	scalehrtime((hrtime_t *)&zvp->zv_rtime.value.ui64); +	scalehrtime((hrtime_t *)&zvp->zv_rlentime.value.ui64); +	scalehrtime((hrtime_t *)&zvp->zv_wtime.value.ui64); +	scalehrtime((hrtime_t *)&zvp->zv_wlentime.value.ui64); + +	return (0); +} + +static kstat_t * +zone_vfs_kstat_create(zone_t *zone) +{ +	kstat_t *ksp; +	zone_vfs_kstat_t *zvp; + +	if ((ksp = kstat_create_zone("zone_vfs", zone->zone_id, +	    zone->zone_name, "zone_vfs", KSTAT_TYPE_NAMED, +	    sizeof (zone_vfs_kstat_t) / sizeof (kstat_named_t), +	    KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) +		return (NULL); + +	if (zone->zone_id != GLOBAL_ZONEID) +		kstat_zone_add(ksp, GLOBAL_ZONEID); + +	zvp = ksp->ks_data = kmem_zalloc(sizeof (zone_vfs_kstat_t), KM_SLEEP); +	ksp->ks_data_size += strlen(zone->zone_name) + 1; +	ksp->ks_lock = &zone->zone_vfs_lock; +	zone->zone_vfs_stats = zvp; + +	/* The kstat "name" field is not large enough for a full zonename */ +	kstat_named_init(&zvp->zv_zonename, "zonename", KSTAT_DATA_STRING); +	kstat_named_setstr(&zvp->zv_zonename, zone->zone_name); +	kstat_named_init(&zvp->zv_nread, "nread", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_reads, "reads", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_rtime, "rtime", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_rcnt, "rcnt", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_rlentime, "rlentime", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_nwritten, "nwritten", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_writes, "writes", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_wtime, "wtime", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_wcnt, "wcnt", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_wlentime, "wlentime", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_10ms_ops, "10ms_ops", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_100ms_ops, "100ms_ops", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_1s_ops, "1s_ops", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_10s_ops, "10s_ops", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_delay_cnt, "delay_cnt", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_delay_time, "delay_time", KSTAT_DATA_UINT64); + +	ksp->ks_update = zone_vfs_kstat_update; +	ksp->ks_private = zone; + +	kstat_install(ksp); +	return (ksp); +} + +static int +zone_zfs_kstat_update(kstat_t *ksp, int rw) +{ +	zone_t *zone = ksp->ks_private; +	zone_zfs_kstat_t *zzp = ksp->ks_data; +	zone_persist_t *zp = &zone_pdata[zone->zone_id]; + +	if (rw == KSTAT_WRITE) +		return (EACCES); + +	mutex_enter(&zp->zpers_zfs_lock); +	if (zp->zpers_zfsp == NULL) { +		zzp->zz_nread.value.ui64 = 0; +		zzp->zz_reads.value.ui64 = 0; +		zzp->zz_rtime.value.ui64 = 0; +		zzp->zz_rlentime.value.ui64 = 0; +		zzp->zz_nwritten.value.ui64 = 0; +		zzp->zz_writes.value.ui64 = 0; +		zzp->zz_waittime.value.ui64 = 0; +	} else { +		kstat_io_t *kiop = &zp->zpers_zfsp->zpers_zfs_rwstats; + +		/* +		 * Extract the ZFS statistics from the kstat_io_t structure +		 * used by kstat_runq_enter() and related functions. 
Since the +		 * I/O throttle counters are updated directly by the ZFS layer, +		 * there's no need to copy those statistics here. +		 * +		 * Note that kstat_runq_enter() and the related functions use +		 * gethrtime_unscaled(), so scale the time here. +		 */ +		zzp->zz_nread.value.ui64 = kiop->nread; +		zzp->zz_reads.value.ui64 = kiop->reads; +		zzp->zz_rtime.value.ui64 = kiop->rtime; +		zzp->zz_rlentime.value.ui64 = kiop->rlentime; +		zzp->zz_nwritten.value.ui64 = kiop->nwritten; +		zzp->zz_writes.value.ui64 = kiop->writes; +		zzp->zz_waittime.value.ui64 = +		    zp->zpers_zfsp->zpers_zfs_rd_waittime; +	} +	mutex_exit(&zp->zpers_zfs_lock); + +	scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64); +	scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64); + +	return (0); +} + +static kstat_t * +zone_zfs_kstat_create(zone_t *zone) +{ +	kstat_t *ksp; +	zone_zfs_kstat_t *zzp; + +	if ((ksp = kstat_create_zone("zone_zfs", zone->zone_id, +	    zone->zone_name, "zone_zfs", KSTAT_TYPE_NAMED, +	    sizeof (zone_zfs_kstat_t) / sizeof (kstat_named_t), +	    KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) +		return (NULL); + +	if (zone->zone_id != GLOBAL_ZONEID) +		kstat_zone_add(ksp, GLOBAL_ZONEID); + +	zzp = ksp->ks_data = kmem_zalloc(sizeof (zone_zfs_kstat_t), KM_SLEEP); +	ksp->ks_data_size += strlen(zone->zone_name) + 1; +	ksp->ks_lock = &zone->zone_zfs_lock; +	zone->zone_zfs_stats = zzp; + +	/* The kstat "name" field is not large enough for a full zonename */ +	kstat_named_init(&zzp->zz_zonename, "zonename", KSTAT_DATA_STRING); +	kstat_named_setstr(&zzp->zz_zonename, zone->zone_name); +	kstat_named_init(&zzp->zz_nread, "nread", KSTAT_DATA_UINT64); +	kstat_named_init(&zzp->zz_reads, "reads", KSTAT_DATA_UINT64); +	kstat_named_init(&zzp->zz_rtime, "rtime", KSTAT_DATA_UINT64); +	kstat_named_init(&zzp->zz_rlentime, "rlentime", KSTAT_DATA_UINT64); +	kstat_named_init(&zzp->zz_nwritten, "nwritten", KSTAT_DATA_UINT64); +	kstat_named_init(&zzp->zz_writes, "writes", KSTAT_DATA_UINT64); +	kstat_named_init(&zzp->zz_waittime, "waittime", KSTAT_DATA_UINT64); + +	ksp->ks_update = zone_zfs_kstat_update; +	ksp->ks_private = zone; + +	kstat_install(ksp); +	return (ksp); +}  static int  zone_mcap_kstat_update(kstat_t *ksp, int rw)  {  	zone_t *zone = ksp->ks_private;  	zone_mcap_kstat_t *zmp = ksp->ks_data; +	zone_persist_t *zp;  	if (rw == KSTAT_WRITE)  		return (EACCES); +	zp = &zone_pdata[zone->zone_id]; + +	zmp->zm_rss.value.ui64 = ptob(zp->zpers_pg_cnt); +	zmp->zm_phys_cap.value.ui64 = ptob(zp->zpers_pg_limit); +	zmp->zm_swap.value.ui64 = zone->zone_max_swap; +	zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl; +	zmp->zm_nover.value.ui64 = zp->zpers_nover; +#ifndef DEBUG +	zmp->zm_pagedout.value.ui64 = ptob(zp->zpers_pg_out); +#else +	zmp->zm_pagedout.value.ui64 = ptob(zp->zpers_pg_fsdirty + +	    zp->zpers_pg_fs + zp->zpers_pg_anon + zp->zpers_pg_anondirty); +#endif  	zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;  	zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;  	zmp->zm_execpgin.value.ui64 = zone->zone_execpgin; @@ -1893,6 +2339,12 @@ zone_mcap_kstat_create(zone_t *zone)  	/* The kstat "name" field is not large enough for a full zonename */  	kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);  	kstat_named_setstr(&zmp->zm_zonename, zone->zone_name); +	kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64); +	kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64); +	kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64); +	kstat_named_init(&zmp->zm_swap_cap, "swapcap", 
KSTAT_DATA_UINT64); +	kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64); +	kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64);  	kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);  	kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);  	kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64); @@ -1942,9 +2394,12 @@ zone_misc_kstat_update(kstat_t *ksp, int rw)  	zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem;  	zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc; +	zmp->zm_mfseglim.value.ui32 = zone->zone_mfseglim; +  	zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp;  	zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid; +	zmp->zm_init_restarts.value.ui32 = zone->zone_proc_init_restarts;  	zmp->zm_boot_time.value.ui64 = (uint64_t)zone->zone_boot_time;  	return (0); @@ -1985,9 +2440,13 @@ zone_misc_kstat_create(zone_t *zone)  	    KSTAT_DATA_UINT32);  	kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32);  	kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32); +	kstat_named_init(&zmp->zm_mfseglim, "mapfail_seglim", +	    KSTAT_DATA_UINT32);  	kstat_named_init(&zmp->zm_nested_intp, "nested_interp",  	    KSTAT_DATA_UINT32);  	kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32); +	kstat_named_init(&zmp->zm_init_restarts, "init_restarts", +	    KSTAT_DATA_UINT32);  	kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64);  	ksp->ks_update = zone_misc_kstat_update; @@ -2000,13 +2459,25 @@ zone_misc_kstat_create(zone_t *zone)  static void  zone_kstat_create(zone_t *zone)  { -	zone->zone_lockedmem_kstat = zone_kstat_create_common(zone, +	zone->zone_lockedmem_kstat = zone_rctl_kstat_create_common(zone,  	    "lockedmem", zone_lockedmem_kstat_update); -	zone->zone_swapresv_kstat = zone_kstat_create_common(zone, +	zone->zone_swapresv_kstat = zone_rctl_kstat_create_common(zone,  	    "swapresv", zone_swapresv_kstat_update); -	zone->zone_nprocs_kstat = zone_kstat_create_common(zone, +	zone->zone_physmem_kstat = zone_rctl_kstat_create_common(zone, +	    "physicalmem", zone_physmem_kstat_update); +	zone->zone_nprocs_kstat = zone_rctl_kstat_create_common(zone,  	    "nprocs", zone_nprocs_kstat_update); +	if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) { +		zone->zone_vfs_stats = kmem_zalloc( +		    sizeof (zone_vfs_kstat_t), KM_SLEEP); +	} + +	if ((zone->zone_zfs_ksp = zone_zfs_kstat_create(zone)) == NULL) { +		zone->zone_zfs_stats = kmem_zalloc( +		    sizeof (zone_zfs_kstat_t), KM_SLEEP); +	} +  	if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {  		zone->zone_mcap_stats = kmem_zalloc(  		    sizeof (zone_mcap_kstat_t), KM_SLEEP); @@ -2038,8 +2509,15 @@ zone_kstat_delete(zone_t *zone)  	    sizeof (zone_kstat_t));  	zone_kstat_delete_common(&zone->zone_swapresv_kstat,  	    sizeof (zone_kstat_t)); +	zone_kstat_delete_common(&zone->zone_physmem_kstat, +	    sizeof (zone_kstat_t));  	zone_kstat_delete_common(&zone->zone_nprocs_kstat,  	    sizeof (zone_kstat_t)); + +	zone_kstat_delete_common(&zone->zone_vfs_ksp, +	    sizeof (zone_vfs_kstat_t)); +	zone_kstat_delete_common(&zone->zone_zfs_ksp, +	    sizeof (zone_zfs_kstat_t));  	zone_kstat_delete_common(&zone->zone_mcap_ksp,  	    sizeof (zone_mcap_kstat_t));  	zone_kstat_delete_common(&zone->zone_misc_ksp, @@ -2101,8 +2579,12 @@ zone_zsd_init(void)  	zone0.zone_initname = initname;  	zone0.zone_lockedmem_kstat = NULL;  	zone0.zone_swapresv_kstat = NULL; +	zone0.zone_physmem_kstat = NULL;  	
zone0.zone_nprocs_kstat = NULL; +	zone_pdata[0].zpers_zfsp = &zone0_zp_zfs; +	zone_pdata[0].zpers_zfsp->zpers_zfs_io_pri = 1; +  	list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),  	    offsetof(zone_ref_t, zref_linkage));  	list_create(&zone0.zone_zsd, sizeof (struct zsd_entry), @@ -2209,6 +2691,21 @@ zone_init(void)  	    RCTL_GLOBAL_INFINITE,  	    MAXCAP, MAXCAP, &zone_cpu_cap_ops); +	rc_zone_cpu_baseline = rctl_register("zone.cpu-baseline", +	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | +	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, +	    MAXCAP, MAXCAP, &zone_cpu_base_ops); + +	rc_zone_cpu_burst_time = rctl_register("zone.cpu-burst-time", +	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | +	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, +	    INT_MAX, INT_MAX, &zone_cpu_burst_time_ops); + +	rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority", +	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | +	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, +	    16384, 16384, &zone_zfs_io_pri_ops); +  	rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,  	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,  	    INT_MAX, INT_MAX, &zone_lwps_ops); @@ -2250,6 +2747,20 @@ zone_init(void)  	rde = rctl_dict_lookup("zone.cpu-shares");  	(void) rctl_val_list_insert(&rde->rcd_default_value, dval); +	/* +	 * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach +	 * this at the head of the rctl_dict_entry for ``zone.zfs-io-priority'. +	 */ +	dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP); +	bzero(dval, sizeof (rctl_val_t)); +	dval->rcv_value = 1; +	dval->rcv_privilege = RCPRIV_PRIVILEGED; +	dval->rcv_flagaction = RCTL_LOCAL_NOACTION; +	dval->rcv_action_recip_pid = -1; + +	rde = rctl_dict_lookup("zone.zfs-io-priority"); +	(void) rctl_val_list_insert(&rde->rcd_default_value, dval); +  	rc_zone_locked_mem = rctl_register("zone.max-locked-memory",  	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |  	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, @@ -2260,6 +2771,11 @@ zone_init(void)  	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,  	    &zone_max_swap_ops); +	rc_zone_phys_mem = rctl_register("zone.max-physical-memory", +	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | +	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, +	    &zone_phys_mem_ops); +  	rc_zone_max_lofi = rctl_register("zone.max-lofi",  	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |  	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, @@ -2281,6 +2797,9 @@ zone_init(void)  	zone0.zone_ntasks = 1;  	mutex_exit(&p0.p_lock);  	zone0.zone_restart_init = B_TRUE; +	zone0.zone_reboot_on_init_exit = B_FALSE; +	zone0.zone_restart_init_0 = B_FALSE; +	zone0.zone_init_status = -1;  	zone0.zone_brand = &native_brand;  	rctl_prealloc_destroy(gp);  	/* @@ -2362,6 +2881,8 @@ zone_init(void)  static void  zone_free(zone_t *zone)  { +	zone_dl_t *zdl; +  	ASSERT(zone != global_zone);  	ASSERT(zone->zone_ntasks == 0);  	ASSERT(zone->zone_nlwps == 0); @@ -2377,6 +2898,9 @@ zone_free(zone_t *zone)  	 */  	cpucaps_zone_remove(zone); +	/* Clear physical memory capping data. 
*/ +	bzero(&zone_pdata[zone->zone_id], sizeof (zone_persist_t)); +  	ASSERT(zone->zone_cpucap == NULL);  	/* remove from deathrow list */ @@ -2390,8 +2914,30 @@ zone_free(zone_t *zone)  	list_destroy(&zone->zone_ref_list);  	zone_free_zsd(zone);  	zone_free_datasets(zone); + +	/* +	 * While dlmgmtd should have removed all of these, it could have left +	 * something behind or crashed. In which case it's not safe for us to +	 * assume that the list is empty which list_destroy() will ASSERT. We +	 * clean up for our userland comrades which may have crashed, or worse, +	 * been disabled by SMF. +	 */ +	while ((zdl = list_remove_head(&zone->zone_dl_list)) != NULL) { +		if (zdl->zdl_net != NULL) +			nvlist_free(zdl->zdl_net); +		kmem_free(zdl, sizeof (zone_dl_t)); +	}  	list_destroy(&zone->zone_dl_list); +	/* +	 * This zone_t can no longer inhibit creation of another zone_t +	 * with the same name or debug ID.  Generate a sysevent so that +	 * userspace tools know it is safe to carry on. +	 */ +	mutex_enter(&zone_status_lock); +	zone_status_set(zone, ZONE_IS_FREE); +	mutex_exit(&zone_status_lock); +  	cpu_uarray_free(zone->zone_ustate);  	if (zone->zone_rootvp != NULL) @@ -2436,11 +2982,17 @@ zone_free(zone_t *zone)  static void  zone_status_set(zone_t *zone, zone_status_t status)  { +	timestruc_t now; +	uint64_t t;  	nvlist_t *nvl = NULL;  	ASSERT(MUTEX_HELD(&zone_status_lock)); -	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE && -	    status >= zone_status_get(zone)); +	ASSERT((status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE || +	    status == ZONE_IS_FREE) && status >= zone_status_get(zone)); + +	/* Current time since Jan 1 1970 but consumers expect NS */ +	gethrestime(&now); +	t = (now.tv_sec * NANOSEC) + now.tv_nsec;  	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||  	    nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) || @@ -2449,12 +3001,14 @@ zone_status_set(zone_t *zone, zone_status_t status)  	    nvlist_add_string(nvl, ZONE_CB_OLDSTATE,  	    zone_status_table[zone->zone_status]) ||  	    nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) || -	    nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) || +	    nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, t) ||  	    sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,  	    ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {  #ifdef DEBUG  		(void) printf(  		    "Failed to allocate and send zone state change event.\n"); +#else +		/* EMPTY */  #endif  	}  	nvlist_free(nvl); @@ -2474,6 +3028,38 @@ zone_status_get(zone_t *zone)  	return (zone->zone_status);  } +/* + * Publish a zones-related sysevent for purposes other than zone state changes. + * While it is unfortunate that zone_event_chan is associated with + * "com.sun:zones:status" (rather than "com.sun:zones") state changes should be + * the only ones with class "status" and subclass "change". 
+ */ +void +zone_sysevent_publish(zone_t *zone, const char *class, const char *subclass, +    nvlist_t *ev_nvl) +{ +	nvlist_t *nvl = NULL; +	timestruc_t now; +	uint64_t t; + +	gethrestime(&now); +	t = (now.tv_sec * NANOSEC) + now.tv_nsec; + +	if (nvlist_dup(ev_nvl, &nvl, KM_SLEEP) != 0 || +	    nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) != 0 || +	    nvlist_add_uint64(nvl, ZONE_CB_ZONEID, zone->zone_id) != 0 || +	    nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, t) != 0 || +	    sysevent_evc_publish(zone_event_chan, class, subclass, "sun.com", +	    "kernel", nvl, EVCH_SLEEP) != 0) { +#ifdef DEBUG +		(void) printf("Failed to allocate and send zone misc event.\n"); +#else +		/* EMPTY */ +#endif +	} +	nvlist_free(nvl); +} +  static int  zone_set_bootargs(zone_t *zone, const char *zone_bootargs)  { @@ -2527,9 +3113,14 @@ zone_set_brand(zone_t *zone, const char *brand)  		return (EINVAL);  	} -	/* set up the brand specific data */ +	/* +	 * Set up the brand specific data. +	 * Note that it's possible that the hook has to drop the +	 * zone_status_lock and reaquire it before returning so we can't +	 * assume the lock has been held the entire time. +	 */  	zone->zone_brand = bp; -	ZBROP(zone)->b_init_brand_data(zone); +	ZBROP(zone)->b_init_brand_data(zone, &zone_status_lock);  	mutex_exit(&zone_status_lock);  	return (0); @@ -2602,18 +3193,6 @@ zone_set_initname(zone_t *zone, const char *zone_initname)  }  static int -zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap) -{ -	uint64_t mcap; -	int err = 0; - -	if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0) -		zone->zone_phys_mcap = mcap; - -	return (err); -} - -static int  zone_set_sched_class(zone_t *zone, const char *new_class)  {  	char sched_class[PC_CLNMSZ]; @@ -3020,6 +3599,12 @@ getzoneid(void)  	return (curproc->p_zone->zone_id);  } +zoneid_t +getzonedid(void) +{ +	return (curproc->p_zone->zone_did); +} +  /*   * Internal versions of zone_find_by_*().  These don't zone_hold() or   * check the validity of a zone's state. @@ -3766,6 +4351,17 @@ zone_start_init(void)  	 */  	z->zone_proc_initpid = p->p_pid; +	if (z->zone_setup_app_contract == B_TRUE) { +		/* +		 * Normally a process cannot modify its own contract, but we're +		 * just starting the zone's init process and its contract is +		 * always initialized from the sys_process_tmpl template, so +		 * this is the simplest way to setup init's contract to kill +		 * the process if any other process in the contract exits. +		 */ +		p->p_ct_process->conp_ev_fatal |= CT_PR_EV_EXIT; +	} +  	/*  	 * We maintain zone_boot_err so that we can return the cause of the  	 * failure back to the caller of the zone_boot syscall. @@ -3794,9 +4390,54 @@ zone_start_init(void)  			lwp_exit();  		}  	} else { +		id_t cid = curthread->t_cid; +  		if (zone_status_get(z) == ZONE_IS_BOOTING)  			zone_status_set(z, ZONE_IS_RUNNING);  		mutex_exit(&zone_status_lock); + +		mutex_enter(&class_lock); +		ASSERT(cid < loaded_classes); +		if (strcmp(sclass[cid].cl_name, "FX") == 0 && +		    z->zone_fixed_hipri) { +			/* +			 * If the zone is using FX then by default all +			 * processes start at the lowest priority and stay +			 * there. We provide a mechanism for the zone to +			 * indicate that it should run at "high priority". In +			 * this case we setup init to run at the highest FX +			 * priority (which is one level higher than the +			 * non-fixed scheduling classes can use). 
+			 */ +			pcparms_t pcparms; + +			pcparms.pc_cid = cid; +			((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI; +			((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim = +			    FXMAXUPRI; +			((fxkparms_t *)pcparms.pc_clparms)->fx_cflags = +			    FX_DOUPRILIM | FX_DOUPRI; + +			mutex_enter(&pidlock); +			mutex_enter(&curproc->p_lock); + +			(void) parmsset(&pcparms, curthread); + +			mutex_exit(&curproc->p_lock); +			mutex_exit(&pidlock); +		} else if (strcmp(sclass[cid].cl_name, "RT") == 0) { +			/* +			 * zsched always starts the init lwp at priority +			 * minclsyspri - 1. This priority gets set in t_pri and +			 * is invalid for RT, but RT never uses t_pri. However +			 * t_pri is used by procfs, so we always see processes +			 * within an RT zone with an invalid priority value. +			 * We fix that up now. +			 */ +			curthread->t_pri = RTGPPRIO0; +		} +		mutex_exit(&class_lock); +  		/* cause the process to return to userland. */  		lwp_rtt();  	} @@ -4282,8 +4923,9 @@ parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)  		error = EINVAL;  		name = nvpair_name(nvp); -		if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1) -		    != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) { +		if ((strncmp(name, "zone.", sizeof ("zone.") - 1) != 0 && +		    strncmp(name, "project.", sizeof ("project.") - 1) != 0) || +		    nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {  			goto out;  		}  		if ((hndl = rctl_hndl_lookup(name)) == -1) { @@ -4402,7 +5044,7 @@ zone_create(const char *zone_name, const char *zone_root,      caddr_t rctlbuf, size_t rctlbufsz,      caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,      int match, uint32_t doi, const bslabel_t *label, -    int flags) +    int flags, zoneid_t zone_did)  {  	struct zsched_arg zarg;  	nvlist_t *rctls = NULL; @@ -4474,6 +5116,7 @@ zone_create(const char *zone_name, const char *zone_root,  	zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);  	zone->zone_id = zoneid; +	zone->zone_did = zone_did;  	zone->zone_status = ZONE_IS_UNINITIALIZED;  	zone->zone_pool = pool_default;  	zone->zone_pool_mod = gethrtime(); @@ -4481,6 +5124,9 @@ zone_create(const char *zone_name, const char *zone_root,  	zone->zone_ncpus = 0;  	zone->zone_ncpus_online = 0;  	zone->zone_restart_init = B_TRUE; +	zone->zone_reboot_on_init_exit = B_FALSE; +	zone->zone_restart_init_0 = B_FALSE; +	zone->zone_init_status = -1;  	zone->zone_brand = &native_brand;  	zone->zone_initname = NULL;  	mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL); @@ -4547,8 +5193,13 @@ zone_create(const char *zone_name, const char *zone_root,  	zone->zone_max_swap_ctl = UINT64_MAX;  	zone->zone_max_lofi = 0;  	zone->zone_max_lofi_ctl = UINT64_MAX; -	zone0.zone_lockedmem_kstat = NULL; -	zone0.zone_swapresv_kstat = NULL; +	zone->zone_lockedmem_kstat = NULL; +	zone->zone_swapresv_kstat = NULL; +	zone->zone_physmem_kstat = NULL; + +	zone_pdata[zoneid].zpers_zfsp = +	    kmem_zalloc(sizeof (zone_zfs_io_t), KM_SLEEP); +	zone_pdata[zoneid].zpers_zfsp->zpers_zfs_io_pri = 1;  	zone->zone_ustate = cpu_uarray_zalloc(ZONE_USTATE_MAX, KM_SLEEP); @@ -4557,6 +5208,13 @@ zone_create(const char *zone_name, const char *zone_root,  	 */  	zone->zone_rctls = NULL; +	/* +	 * Ensure page count is 0 (in case zoneid has wrapped). +	 * Initialize physical memory cap as unlimited. 
+	 */ +	zone_pdata[zoneid].zpers_pg_cnt = 0; +	zone_pdata[zoneid].zpers_pg_limit = UINT32_MAX; +  	if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {  		zone_free(zone);  		return (zone_create_error(error, 0, extended_error)); @@ -4705,8 +5363,8 @@ zone_create(const char *zone_name, const char *zone_root,  	/*  	 * The process, task, and project rctls are probably wrong;  	 * we need an interface to get the default values of all rctls, -	 * and initialize zsched appropriately.  I'm not sure that that -	 * makes much of a difference, though. +	 * and initialize zsched appropriately. However, we allow zoneadmd +	 * to pass down both zone and project rctls for the zone's init.  	 */  	error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);  	if (error != 0) { @@ -4845,6 +5503,7 @@ zone_boot(zoneid_t zoneid)  static int  zone_empty(zone_t *zone)  { +	int cnt = 0;  	int waitstatus;  	/* @@ -4855,7 +5514,16 @@ zone_empty(zone_t *zone)  	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));  	while ((waitstatus = zone_status_timedwait_sig(zone,  	    ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) { -		killall(zone->zone_id); +		boolean_t force = B_FALSE; + +		/* Every 30 seconds, try harder */ +		if (cnt++ >= 30) { +			cmn_err(CE_WARN, "attempt to force kill zone %d\n", +			    zone->zone_id); +			force = B_TRUE; +			cnt = 0; +		} +		killall(zone->zone_id, force);  	}  	/*  	 * return EINTR if we were signaled @@ -5184,6 +5852,7 @@ zone_destroy(zoneid_t zoneid)  	zone_status_t status;  	clock_t wait_time;  	boolean_t log_refcounts; +	zone_persist_t *zp;  	if (secpolicy_zone_config(CRED()) != 0)  		return (set_errno(EPERM)); @@ -5217,6 +5886,12 @@ zone_destroy(zoneid_t zoneid)  	zone_hold(zone);  	mutex_exit(&zonehash_lock); +	zp = &zone_pdata[zoneid]; +	mutex_enter(&zp->zpers_zfs_lock); +	kmem_free(zp->zpers_zfsp, sizeof (zone_zfs_io_t)); +	zp->zpers_zfsp = NULL; +	mutex_exit(&zp->zpers_zfs_lock); +  	/*  	 * wait for zsched to exit  	 */ @@ -5606,14 +6281,6 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)  				error = EFAULT;  		}  		break; -	case ZONE_ATTR_PHYS_MCAP: -		size = sizeof (zone->zone_phys_mcap); -		if (bufsize > size) -			bufsize = size; -		if (buf != NULL && -		    copyout(&zone->zone_phys_mcap, buf, bufsize) != 0) -			error = EFAULT; -		break;  	case ZONE_ATTR_SCHED_CLASS:  		mutex_enter(&class_lock); @@ -5677,6 +6344,23 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)  		}  		kmem_free(zbuf, bufsize);  		break; +	case ZONE_ATTR_DID: +		size = sizeof (zoneid_t); +		if (bufsize > size) +			bufsize = size; + +		if (buf != NULL && copyout(&zone->zone_did, buf, bufsize) != 0) +			error = EFAULT; +		break; +	case ZONE_ATTR_SCHED_FIXEDHI: +		size = sizeof (boolean_t); +		if (bufsize > size) +			bufsize = size; + +		if (buf != NULL && copyout(&zone->zone_fixed_hipri, buf, +		    bufsize) != 0) +			error = EFAULT; +		break;  	default:  		if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {  			size = bufsize; @@ -5708,10 +6392,9 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)  		return (set_errno(EPERM));  	/* -	 * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the -	 * global zone. +	 * No attributes can be set on the global zone.  	 
*/ -	if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) { +	if (zoneid == GLOBAL_ZONEID) {  		return (set_errno(EINVAL));  	} @@ -5724,11 +6407,11 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)  	mutex_exit(&zonehash_lock);  	/* -	 * At present most attributes can only be set on non-running, +	 * At present attributes can only be set on non-running,  	 * non-global zones.  	 */  	zone_status = zone_status_get(zone); -	if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) { +	if (zone_status > ZONE_IS_READY) {  		err = EINVAL;  		goto done;  	} @@ -5741,6 +6424,14 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)  		zone->zone_restart_init = B_FALSE;  		err = 0;  		break; +	case ZONE_ATTR_INITRESTART0: +		zone->zone_restart_init_0 = B_TRUE; +		err = 0; +		break; +	case ZONE_ATTR_INITREBOOT: +		zone->zone_reboot_on_init_exit = B_TRUE; +		err = 0; +		break;  	case ZONE_ATTR_BOOTARGS:  		err = zone_set_bootargs(zone, (const char *)buf);  		break; @@ -5753,9 +6444,6 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)  	case ZONE_ATTR_SECFLAGS:  		err = zone_set_secflags(zone, (psecflags_t *)buf);  		break; -	case ZONE_ATTR_PHYS_MCAP: -		err = zone_set_phys_mcap(zone, (const uint64_t *)buf); -		break;  	case ZONE_ATTR_SCHED_CLASS:  		err = zone_set_sched_class(zone, (const char *)buf);  		break; @@ -5783,6 +6471,22 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)  		err = zone_set_network(zoneid, zbuf);  		kmem_free(zbuf, bufsize);  		break; +	case ZONE_ATTR_APP_SVC_CT: +		if (bufsize != sizeof (boolean_t)) { +			err = EINVAL; +		} else { +			zone->zone_setup_app_contract = (boolean_t)buf; +			err = 0; +		} +		break; +	case ZONE_ATTR_SCHED_FIXEDHI: +		if (bufsize != sizeof (boolean_t)) { +			err = EINVAL; +		} else { +			zone->zone_fixed_hipri = (boolean_t)buf; +			err = 0; +		} +		break;  	default:  		if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))  			err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize); @@ -6486,6 +7190,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)  			zs.doi = zs32.doi;  			zs.label = (const bslabel_t *)(uintptr_t)zs32.label;  			zs.flags = zs32.flags; +			zs.zoneid = zs32.zoneid;  #else  			panic("get_udatamodel() returned bogus result\n");  #endif @@ -6496,7 +7201,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)  		    (caddr_t)zs.rctlbuf, zs.rctlbufsz,  		    (caddr_t)zs.zfsbuf, zs.zfsbufsz,  		    zs.extended_error, zs.match, zs.doi, -		    zs.label, zs.flags)); +		    zs.label, zs.flags, zs.zoneid));  	case ZONE_BOOT:  		return (zone_boot((zoneid_t)(uintptr_t)arg1));  	case ZONE_DESTROY: @@ -6597,6 +7302,7 @@ zone_ki_call_zoneadmd(struct zarg *zargp)  	bcopy(zone->zone_name, zone_name, zone_namelen);  	zoneid = zone->zone_id;  	uniqid = zone->zone_uniqid; +	arg.status = zone->zone_init_status;  	/*  	 * zoneadmd may be down, but at least we can empty out the zone.  	 * We can ignore the return value of zone_empty() since we're called @@ -6774,7 +7480,7 @@ zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)  	 * zone_ki_call_zoneadmd() will do a more thorough job of this  	 * later.  	 */ -	killall(zone->zone_id); +	killall(zone->zone_id, B_FALSE);  	/*  	 * Now, create the thread to contact zoneadmd and do the rest of the  	 * work.  
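The new boolean attributes handled in the zone_setattr() hunk above (ZONE_ATTR_APP_SVC_CT and ZONE_ATTR_SCHED_FIXEDHI) carry their value in the buffer pointer itself, with the size argument used only as a sanity check. A plausible caller, inferred from that kernel-side handling; the helper name is hypothetical and the attribute constant is assumed to come from the headers accompanying this change.

#include <sys/types.h>
#include <stdint.h>
#include <zone.h>

/*
 * Hypothetical helper: request the fixed high FX priority behavior for a
 * zone. The value rides in the pointer argument, since the kernel case
 * above casts buf straight back to a boolean_t.
 */
static int
set_fixed_hipri(zoneid_t zid, boolean_t on)
{
	return (zone_setattr(zid, ZONE_ATTR_SCHED_FIXEDHI,
	    (void *)(uintptr_t)on, sizeof (boolean_t)));
}

ZONE_ATTR_APP_SVC_CT would be set the same way with its own constant. Per the checks above, such a call can only succeed against a non-global zone that has not progressed past ZONE_IS_READY.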
This thread can't be created in our zone otherwise @@ -6837,16 +7543,15 @@ zone_shutdown_global(void)  }  /* - * Returns true if the named dataset is visible in the current zone. + * Returns true if the named dataset is visible in the specified zone.   * The 'write' parameter is set to 1 if the dataset is also writable.   */  int -zone_dataset_visible(const char *dataset, int *write) +zone_dataset_visible_inzone(zone_t *zone, const char *dataset, int *write)  {  	static int zfstype = -1;  	zone_dataset_t *zd;  	size_t len; -	zone_t *zone = curproc->p_zone;  	const char *name = NULL;  	vfs_t *vfsp = NULL; @@ -6914,7 +7619,8 @@ zone_dataset_visible(const char *dataset, int *write)  	vfs_list_read_lock();  	vfsp = zone->zone_vfslist;  	do { -		ASSERT(vfsp); +		if (vfsp == NULL) +			break;  		if (vfsp->vfs_fstype == zfstype) {  			name = refstr_value(vfsp->vfs_resource); @@ -6951,6 +7657,18 @@ zone_dataset_visible(const char *dataset, int *write)  }  /* + * Returns true if the named dataset is visible in the current zone. + * The 'write' parameter is set to 1 if the dataset is also writable. + */ +int +zone_dataset_visible(const char *dataset, int *write) +{ +	zone_t *zone = curproc->p_zone; + +	return (zone_dataset_visible_inzone(zone, dataset, write)); +} + +/*   * zone_find_by_any_path() -   *   * kernel-private routine similar to zone_find_by_path(), but which @@ -7052,6 +7770,27 @@ zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid)  	zone_t *zone;  	zone_t *thiszone; +	/* +	 * Only the GZ may add a datalink to a zone's list. +	 */ +	if (getzoneid() != GLOBAL_ZONEID) +		return (set_errno(EPERM)); + +	/* +	 * Only a process with the datalink config priv may add a +	 * datalink to a zone's list. +	 */ +	if (secpolicy_dl_config(CRED()) != 0) +		return (set_errno(EPERM)); + +	/* +	 * When links exist in the GZ, they aren't added to the GZ's +	 * zone_dl_list. We must enforce this because link_activate() +	 * depends on zone_check_datalink() returning only NGZs. +	 */ +	if (zoneid == GLOBAL_ZONEID) +		return (set_errno(EINVAL)); +  	if ((thiszone = zone_find_by_id(zoneid)) == NULL)  		return (set_errno(ENXIO)); @@ -7084,6 +7823,26 @@ zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)  	zone_t *zone;  	int err = 0; +	/* +	 * Only the GZ may remove a datalink from a zone's list. +	 */ +	if (getzoneid() != GLOBAL_ZONEID) +		return (set_errno(EPERM)); + +	/* +	 * Only a process with the datalink config priv may remove a +	 * datalink from a zone's list. +	 */ +	if (secpolicy_dl_config(CRED()) != 0) +		return (set_errno(EPERM)); + +	/* +	 * If we can't add a datalink to the GZ's zone_dl_list then we +	 * certainly can't remove them either. +	 */ +	if (zoneid == GLOBAL_ZONEID) +		return (set_errno(EINVAL)); +  	if ((zone = zone_find_by_id(zoneid)) == NULL)  		return (set_errno(EINVAL)); @@ -7101,25 +7860,63 @@ zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)  }  /* - * Using the zoneidp as ALL_ZONES, we can lookup which zone has been assigned - * the linkid.  Otherwise we just check if the specified zoneidp has been - * assigned the supplied linkid. + * + * This function may be used in two ways: + * + * 1. to get the zoneid of the zone this link is under, or + * + * 2. to verify that the link is under a specific zone. + * + * The first use is achieved by passing a zoneid of ALL_ZONES. The + * function then iterates the datalink list of every zone on the + * system until it finds the linkid. If the linkid is found then the + * function returns 0 and zoneidp is updated. 
Otherwise, ENXIO is
+ * returned and zoneidp is not modified. The use of ALL_ZONES is
+ * limited to callers in the GZ to prevent leaking information to
+ * NGZs. If an NGZ passes ALL_ZONES its query is implicitly changed
+ * to the second type in the list above.
+ *
+ * The second use is achieved by passing a specific zoneid. The GZ can
+ * use this to verify a link is under a particular zone. An NGZ can
+ * use this to verify a link is under itself. But an NGZ cannot use
+ * this to determine if a link is under some other zone as that would
+ * result in information leakage. If the link exists under the zone
+ * then 0 is returned. Otherwise, ENXIO is returned.
  */
 int
 zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
 {
 	zone_t *zone;
+	zoneid_t zoneid = *zoneidp;
+	zoneid_t caller = getzoneid();
 	int err = ENXIO;
-	if (*zoneidp != ALL_ZONES) {
-		if ((zone = zone_find_by_id(*zoneidp)) != NULL) {
-			if (zone_dl_exists(zone, linkid))
+	/*
+	 * Only the GZ may enquire about all zones; an NGZ may only
+	 * enquire about itself.
+	 */
+	if (zoneid == ALL_ZONES && caller != GLOBAL_ZONEID)
+		zoneid = caller;
+
+	if (zoneid != caller && caller != GLOBAL_ZONEID)
+		return (err);
+
+	if (zoneid != ALL_ZONES) {
+		if ((zone = zone_find_by_id(zoneid)) != NULL) {
+			if (zone_dl_exists(zone, linkid)) {
+				/*
+				 * We need to set this in case an NGZ
+				 * passes ALL_ZONES.
+				 */
+				*zoneidp = zoneid;
 				err = 0;
+			}
 			zone_rele(zone);
 		}
 		return (err);
 	}
+	ASSERT(caller == GLOBAL_ZONEID);
 	mutex_enter(&zonehash_lock);
 	for (zone = list_head(&zone_active); zone != NULL;
 	    zone = list_next(&zone_active, zone)) {
@@ -7130,6 +7927,7 @@ zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
 		}
 	}
 	mutex_exit(&zonehash_lock);
+
 	return (err);
 }
@@ -7150,6 +7948,12 @@ zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
 	zone_dl_t *zdl;
 	datalink_id_t *idptr = idarray;
+	/*
+	 * Only the GZ or the owning zone may look at the datalink list.
+	 */
+	if ((getzoneid() != GLOBAL_ZONEID) && (getzoneid() != zoneid))
+		return (set_errno(EPERM));
+
 	if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
 		return (set_errno(EFAULT));
 	if ((zone = zone_find_by_id(zoneid)) == NULL)
@@ -7175,6 +7979,13 @@ zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
 	mutex_exit(&zone->zone_lock);
 	zone_rele(zone);
+	/*
+	 * Prevent returning negative nump values -- we should never
+	 * have this many links anyway.
+	 */
+	if (num > INT_MAX)
+		return (set_errno(EOVERFLOW));
+
 	/* Increased or decreased, caller should be notified. */
 	if (num != dlcount) {
 		if (copyout(&num, nump, sizeof (num)) != 0)
@@ -7388,3 +8199,231 @@ done:
 	else
 		return (0);
 }
+
+static void
+zone_incr_capped(zoneid_t zid)
+{
+	zone_persist_t *zp = &zone_pdata[zid];
+
+	/* See if over (unlimited is UINT32_MAX), or already marked that way. */
+	if (zp->zpers_pg_cnt <= zp->zpers_pg_limit || zp->zpers_over == 1) {
+		return;
+	}
+
+	mutex_enter(&zone_physcap_lock);
+	/* Recheck setting under mutex */
+	if (zp->zpers_pg_cnt > zp->zpers_pg_limit && zp->zpers_over == 0) {
+		zp->zpers_over = 1;
+		zp->zpers_nover++;
+		zone_num_over_cap++;
+		DTRACE_PROBE1(zone__over__pcap, zoneid_t, zid);
+	}
+	mutex_exit(&zone_physcap_lock);
+}
+
+/*
+ * We want some hysteresis when the zone is going under its cap so that we're
+ * not continuously toggling page scanning back and forth by a single page
+ * around the cap.
Using ~1% of the zone's page limit seems to be a good + * quantity. This table shows some various zone memory caps and the number of + * pages (assuming a 4k page size). Given this, we choose to shift the page + * limit by 7 places to get a hysteresis that is slightly less than 1%. + * + *   cap    pages     pages     1% shift7  shift7 + *  128M    32768 0x0008000    327    256 0x00100 + *  512M   131072 0x0020000   1310   1024 0x00400 + *    1G   262144 0x0040000   2621   2048 0x00800 + *    4G  1048576 0x0100000  10485   8192 0x02000 + *    8G  2097152 0x0200000  20971  16384 0x04000 + *   16G  4194304 0x0400000  41943  32768 0x08000 + *   32G  8388608 0x0800000  83886  65536 0x10000 + *   64G 16777216 0x1000000 167772 131072 0x20000 + */ +static void +zone_decr_capped(zoneid_t zid) +{ +	zone_persist_t *zp = &zone_pdata[zid]; +	uint32_t adjusted_limit; + +	/* +	 * See if under, or already marked that way. There is no need to +	 * check for an unlimited cap (zpers_pg_limit == UINT32_MAX) +	 * since we'll never set zpers_over in zone_incr_capped(). +	 */ +	if (zp->zpers_over == 0 || zp->zpers_pg_cnt >= zp->zpers_pg_limit) { +		return; +	} + +	adjusted_limit = zp->zpers_pg_limit - (zp->zpers_pg_limit >> 7); + +	/* Recheck, accounting for our hysteresis. */ +	if (zp->zpers_pg_cnt >= adjusted_limit) { +		return; +	} + +	mutex_enter(&zone_physcap_lock); +	/* Recheck under mutex. */ +	if (zp->zpers_pg_cnt < adjusted_limit && zp->zpers_over == 1) { +		zp->zpers_over = 0; +		ASSERT(zone_num_over_cap > 0); +		zone_num_over_cap--; +		DTRACE_PROBE1(zone__under__pcap, zoneid_t, zid); +	} +	mutex_exit(&zone_physcap_lock); +} + +/* + * For zone_add_page() and zone_rm_page(), access to the page we're touching is + * controlled by our caller's locking. + * On x86 our callers already did: ASSERT(x86_hm_held(pp)) + * On SPARC our callers already did: ASSERT(sfmmu_mlist_held(pp)) + */ +void +zone_add_page(page_t *pp) +{ +	uint_t pcnt; +	zone_persist_t *zp; +	zoneid_t zid; + +	/* Skip pages in segkmem, etc. (KV_KVP, ...) */ +	if (PP_ISKAS(pp)) +		return; + +	ASSERT(!PP_ISFREE(pp)); + +	zid = curzone->zone_id; +	if (pp->p_zoneid == zid) { +		/* Another mapping to this page for this zone, do nothing */ +		return; +	} + +	if (pp->p_szc == 0) { +		pcnt = 1; +	} else { +		/* large page */ +		pcnt = page_get_pagecnt(pp->p_szc); +	} + +	if (pp->p_share == 0) { +		/* First mapping to this page. */ +		pp->p_zoneid = zid; +		zp = &zone_pdata[zid]; +		ASSERT(zp->zpers_pg_cnt + pcnt < UINT32_MAX); +		atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, pcnt); +		zone_incr_capped(zid); +		return; +	} + +	if (pp->p_zoneid != ALL_ZONES) { +		/* +		 * The page is now being shared across a different zone. +		 * Decrement the original zone's usage. +		 */ +		zid = pp->p_zoneid; +		pp->p_zoneid = ALL_ZONES; +		ASSERT(zid >= 0 && zid <= MAX_ZONEID); +		zp = &zone_pdata[zid]; + +		if (zp->zpers_pg_cnt > 0) { +			atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, -pcnt); +		} +		zone_decr_capped(zid); +	} +} + +void +zone_rm_page(page_t *pp) +{ +	uint_t pcnt; +	zone_persist_t *zp; +	zoneid_t zid; + +	/* Skip pages in segkmem, etc. (KV_KVP, ...) */ +	if (PP_ISKAS(pp)) +		return; + +	zid = pp->p_zoneid; +	if (zid == ALL_ZONES || pp->p_share != 0) +		return; + +	/* This is the last mapping to the page for a zone. 
*/ +	if (pp->p_szc == 0) { +		pcnt = 1; +	} else { +		/* large page */ +		pcnt = (int64_t)page_get_pagecnt(pp->p_szc); +	} + +	ASSERT(zid >= 0 && zid <= MAX_ZONEID); +	zp = &zone_pdata[zid]; +	if (zp->zpers_pg_cnt > 0) { +		atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, -pcnt); +	} +	zone_decr_capped(zid); +	pp->p_zoneid = ALL_ZONES; +} + +void +zone_pageout_stat(int zid, zone_pageout_op_t op) +{ +	zone_persist_t *zp; + +	if (zid == ALL_ZONES) +		return; + +	ASSERT(zid >= 0 && zid <= MAX_ZONEID); +	zp = &zone_pdata[zid]; + +#ifndef DEBUG +	atomic_add_64(&zp->zpers_pg_out, 1); +#else +	switch (op) { +	case ZPO_DIRTY: +		atomic_add_64(&zp->zpers_pg_fsdirty, 1); +		break; +	case ZPO_FS: +		atomic_add_64(&zp->zpers_pg_fs, 1); +		break; +	case ZPO_ANON: +		atomic_add_64(&zp->zpers_pg_anon, 1); +		break; +	case ZPO_ANONDIRTY: +		atomic_add_64(&zp->zpers_pg_anondirty, 1); +		break; +	default: +		cmn_err(CE_PANIC, "Invalid pageout operator %d", op); +		break; +	} +#endif +} + +/* + * Return the zone's physical memory cap and current free memory (in pages). + */ +void +zone_get_physmem_data(int zid, pgcnt_t *memcap, pgcnt_t *free) +{ +	zone_persist_t *zp; + +	ASSERT(zid >= 0 && zid <= MAX_ZONEID); +	zp = &zone_pdata[zid]; + +	/* +	 * If memory or swap limits are set on the zone, use those, otherwise +	 * use the system values. physmem and freemem are also in pages. +	 */ +	if (zp->zpers_pg_limit == UINT32_MAX) { +		*memcap = physmem; +		*free = freemem; +	} else { +		int64_t freemem; + +		*memcap = (pgcnt_t)zp->zpers_pg_limit; +		freemem = zp->zpers_pg_limit - zp->zpers_pg_cnt; +		if (freemem > 0) { +			*free = (pgcnt_t)freemem; +		} else { +			*free = (pgcnt_t)0; +		} +	} +} | 
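The access-control rules described in the zone_check_datalink() comment above boil down to a choice of which zoneid actually gets searched for a given caller. A minimal user-space sketch of just that decision; GZ_ID, the sample zone numbers, and the -2 rejection value are illustrative stand-ins (for GLOBAL_ZONEID and the ENXIO failure path in the real code), not part of the patch:

#include <stdio.h>

#define	ALL_ZONES	(-1)	/* stand-in value for this sketch */
#define	GZ_ID		0	/* stand-in for GLOBAL_ZONEID */

/*
 * Hypothetical model of the zoneid selection in zone_check_datalink():
 * returns the zoneid whose datalink list would be searched (ALL_ZONES
 * meaning "every zone"), or -2 if the request is rejected outright.
 */
static int
effective_zoneid(int requested, int caller)
{
	if (requested == ALL_ZONES && caller != GZ_ID)
		requested = caller;	/* an NGZ may only ask about itself */
	if (requested != caller && caller != GZ_ID)
		return (-2);		/* reject cross-zone queries from an NGZ */
	return (requested);
}

int
main(void)
{
	printf("GZ asks ALL_ZONES  -> %d\n", effective_zoneid(ALL_ZONES, GZ_ID));
	printf("GZ asks zone 5     -> %d\n", effective_zoneid(5, GZ_ID));
	printf("zone 5 asks ALL    -> %d\n", effective_zoneid(ALL_ZONES, 5));
	printf("zone 5 asks zone 7 -> %d\n", effective_zoneid(7, 5));
	return (0);
}

Only the first case ends up iterating every zone's datalink list; the NGZ cases either collapse to the caller's own zone or are rejected, which is what keeps cross-zone link information from leaking.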
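The shift-by-7 hysteresis in zone_decr_capped() subtracts 1/128, roughly 0.78%, of the page limit before zpers_over is cleared. A small stand-alone sketch of that arithmetic; the helper name and the sample caps are illustrative only:

#include <stdio.h>
#include <stdint.h>

/* Mirrors adjusted_limit in zone_decr_capped(): limit minus ~0.78%. */
static uint32_t
hysteresis_limit(uint32_t pg_limit)
{
	return (pg_limit - (pg_limit >> 7));
}

int
main(void)
{
	/* Page limits for 128M, 1G and 32G caps, assuming 4k pages. */
	uint32_t caps[] = { 0x8000, 0x40000, 0x800000 };
	unsigned int i;

	for (i = 0; i < sizeof (caps) / sizeof (caps[0]); i++) {
		printf("limit %8u pages -> clear zpers_over below %8u\n",
		    (unsigned int)caps[i],
		    (unsigned int)hysteresis_limit(caps[i]));
	}
	return (0);
}

The printed values correspond to the shift7 column of the table above: 32768 - 256, 262144 - 2048 and 8388608 - 65536 pages respectively.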
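Finally, zone_get_physmem_data() falls back to the system-wide totals when a zone has no cap. A hypothetical user-space model of that behaviour, with made-up page counts standing in for the kernel's physmem and freemem globals:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define	PG_UNLIMITED	UINT32_MAX

/* Hypothetical model of zone_get_physmem_data(); all values in pages. */
static void
model_physmem(uint32_t pg_limit, uint32_t pg_cnt, uint64_t sys_physmem,
    uint64_t sys_freemem, uint64_t *memcap, uint64_t *freep)
{
	if (pg_limit == PG_UNLIMITED) {
		/* No cap set: report the system-wide totals. */
		*memcap = sys_physmem;
		*freep = sys_freemem;
	} else {
		int64_t f = (int64_t)pg_limit - pg_cnt;

		/* Cap set: free pages are clamped at zero when over the cap. */
		*memcap = pg_limit;
		*freep = (f > 0) ? (uint64_t)f : 0;
	}
}

int
main(void)
{
	uint64_t memcap, freep;

	/* 512M cap (131072 pages) with 131500 pages charged: over the cap. */
	model_physmem(0x20000, 131500, 4194304, 1048576, &memcap, &freep);
	printf("capped:   memcap=%" PRIu64 " free=%" PRIu64 "\n", memcap, freep);

	/* Uncapped zone: falls back to physmem/freemem. */
	model_physmem(PG_UNLIMITED, 0, 4194304, 1048576, &memcap, &freep);
	printf("uncapped: memcap=%" PRIu64 " free=%" PRIu64 "\n", memcap, freep);
	return (0);
}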
