Diffstat (limited to 'usr/src/uts/common/os')
33 files changed, 2113 insertions, 889 deletions
| diff --git a/usr/src/uts/common/os/brand.c b/usr/src/uts/common/os/brand.c index 0af67f5d98..62c3bbe2d6 100644 --- a/usr/src/uts/common/os/brand.c +++ b/usr/src/uts/common/os/brand.c @@ -20,6 +20,7 @@   */  /*   * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016, Joyent, Inc.   */  #include <sys/kmem.h> @@ -45,7 +46,7 @@ struct brand_mach_ops native_mach_ops  = {  };  #else /* !__sparcv9 */  struct brand_mach_ops native_mach_ops  = { -		NULL, NULL, NULL, NULL +		NULL, NULL, NULL, NULL, NULL, NULL, NULL  };  #endif /* !__sparcv9 */ @@ -53,7 +54,8 @@ brand_t native_brand = {  		BRAND_VER_1,  		"native",  		NULL, -		&native_mach_ops +		&native_mach_ops, +		0  };  /* @@ -310,46 +312,115 @@ brand_unregister_zone(struct brand *bp)  	mutex_exit(&brand_list_lock);  } -void -brand_setbrand(proc_t *p) +int +brand_setbrand(proc_t *p, boolean_t lwps_ok)  {  	brand_t *bp = p->p_zone->zone_brand; +	void *brand_data = NULL; -	ASSERT(bp != NULL); -	ASSERT(p->p_brand == &native_brand); +	VERIFY(MUTEX_NOT_HELD(&p->p_lock)); +	VERIFY(bp != NULL);  	/* -	 * We should only be called from exec(), when we know the process -	 * is single-threaded. +	 * Process branding occurs during fork() and exec().  When it happens +	 * during fork(), the LWP count will always be 0 since branding is +	 * performed as part of getproc(), before LWPs have been associated. +	 * The same is not true during exec(), where a multi-LWP process may +	 * undergo branding just prior to gexec(). This is to ensure +	 * exec-related brand hooks are available.  While it may seem +	 * complicated to brand a multi-LWP process, the two possible outcomes +	 * simplify things: +	 * +	 * 1. The exec() succeeds:  LWPs besides the caller will be killed and +	 *    any further branding will occur in a single-LWP context. +	 * 2. The exec() fails: The process will be promptly unbranded since +	 *    the hooks are no longer needed. +	 * +	 * To prevent inconsistent brand state from being encountered during +	 * the exec(), LWPs beyond the caller which are associated with this +	 * process must be held temporarily.  They will be released either when +	 * they are killed in the exec() success, or when the brand is cleared +	 * after exec() failure.  	 */ -	ASSERT(p->p_tlist == p->p_tlist->t_forw); +	if (lwps_ok) { +		/* +		 * We've been called from a exec() context tolerating the +		 * existence of multiple LWPs during branding is necessary. +		 */ +		VERIFY(p == curproc); +		VERIFY(p->p_tlist != NULL); +		if (p->p_tlist != p->p_tlist->t_forw) { +			/* +			 * Multiple LWPs are present.  Hold all but the caller. +			 */ +			if (!holdlwps(SHOLDFORK1)) { +				return (-1); +			} +		} +	} else { +		/* +		 * Processes branded during fork() should not have LWPs at all. +		 */ +		VERIFY(p->p_tlist == NULL); +	} + +	if (bp->b_data_size > 0) { +		brand_data = kmem_zalloc(bp->b_data_size, KM_SLEEP); +	} + +	mutex_enter(&p->p_lock); +	ASSERT(!PROC_IS_BRANDED(p));  	p->p_brand = bp; +	p->p_brand_data = brand_data;  	ASSERT(PROC_IS_BRANDED(p));  	BROP(p)->b_setbrand(p); +	mutex_exit(&p->p_lock); +	return (0);  }  void -brand_clearbrand(proc_t *p, boolean_t no_lwps) +brand_clearbrand(proc_t *p, boolean_t lwps_ok)  {  	brand_t *bp = p->p_zone->zone_brand; -	klwp_t *lwp = NULL; -	ASSERT(bp != NULL); -	ASSERT(!no_lwps || (p->p_tlist == NULL)); +	void *brand_data; -	/* -	 * If called from exec_common() or proc_exit(), -	 * we know the process is single-threaded. -	 * If called from fork_fail, p_tlist is NULL. 
-	 */ -	if (!no_lwps) { -		ASSERT(p->p_tlist == p->p_tlist->t_forw); -		lwp = p->p_tlist->t_lwp; -	} +	VERIFY(MUTEX_NOT_HELD(&p->p_lock)); +	VERIFY(bp != NULL); +	VERIFY(PROC_IS_BRANDED(p)); -	ASSERT(PROC_IS_BRANDED(p)); -	BROP(p)->b_proc_exit(p, lwp); +	if (BROP(p)->b_clearbrand != NULL) +		BROP(p)->b_clearbrand(p, lwps_ok); + +	mutex_enter(&p->p_lock);  	p->p_brand = &native_brand; +	brand_data = p->p_brand_data; +	p->p_brand_data = NULL; + +	if (lwps_ok) { +		VERIFY(p == curproc); +		/* +		 * A process with multiple LWPs is being de-branded after +		 * failing an exec.  The other LWPs were held as part of the +		 * procedure, so they must be resumed now. +		 */ +		if (p->p_tlist != NULL && p->p_tlist != p->p_tlist->t_forw) { +			continuelwps(p); +		} +	} else { +		/* +		 * While clearing the brand, it's ok for one LWP to be present. +		 * This happens when a native binary is executed inside a +		 * branded zone, since the brand will be removed during the +		 * course of a successful exec. +		 */ +		VERIFY(p->p_tlist == NULL || p->p_tlist == p->p_tlist->t_forw); +	} +	mutex_exit(&p->p_lock); + +	if (brand_data != NULL) { +		kmem_free(brand_data, bp->b_data_size); +	}  }  #if defined(__sparcv9) @@ -483,7 +554,7 @@ brand_solaris_cmd(int cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3,  		return (ENOSYS);  	/* For all other operations this must be a branded process. */ -	if (p->p_brand == &native_brand) +	if (!PROC_IS_BRANDED(p))  		return (ENOSYS);  	ASSERT(p->p_brand == pbrand); @@ -601,15 +672,15 @@ restoreexecenv(struct execenv *ep, stack_t *sp)  int  brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,      intpdata_t *idatap, int level, long *execsz, int setid, caddr_t exec_file, -    cred_t *cred, int brand_action, struct brand *pbrand, char *bname, -    char *brandlib, char *brandlib32, char *brandlinker, char *brandlinker32) +    cred_t *cred, int *brand_action, struct brand *pbrand, char *bname, +    char *brandlib, char *brandlib32)  {  	vnode_t		*nvp;  	Ehdr		ehdr;  	Addr		uphdr_vaddr;  	intptr_t	voffset; -	int		interp; +	char		*interp;  	int		i, err;  	struct execenv	env;  	struct execenv	origenv; @@ -619,7 +690,6 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,  	klwp_t		*lwp = ttolwp(curthread);  	brand_proc_data_t	*spd;  	brand_elf_data_t sed, *sedp; -	char		*linker;  	uintptr_t	lddata; /* lddata of executable's linker */  	ASSERT(curproc->p_brand == pbrand); @@ -636,12 +706,10 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,  	 */  	if (args->to_model == DATAMODEL_NATIVE) {  		args->emulator = brandlib; -		linker = brandlinker;  	}  #if defined(_LP64)  	else {  		args->emulator = brandlib32; -		linker = brandlinker32;  	}  #endif  /* _LP64 */ @@ -725,7 +793,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,  	if (args->to_model == DATAMODEL_NATIVE) {  		err = mapexec_brand(vp, args, &ehdr, &uphdr_vaddr,  		    &voffset, exec_file, &interp, &env.ex_bssbase, -		    &env.ex_brkbase, &env.ex_brksize, NULL); +		    &env.ex_brkbase, &env.ex_brksize, NULL, NULL);  	}  #if defined(_LP64)  	else { @@ -733,7 +801,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,  		Elf32_Addr uphdr_vaddr32;  		err = mapexec32_brand(vp, args, &ehdr32, &uphdr_vaddr32,  		    &voffset, exec_file, &interp, &env.ex_bssbase, -		    &env.ex_brkbase, &env.ex_brksize, NULL); +		    &env.ex_brkbase, &env.ex_brksize, NULL, NULL);  		Ehdr32to64(&ehdr32, &ehdr);  		if (uphdr_vaddr32 == (Elf32_Addr)-1) @@ -744,6 +812,10 @@ 
brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,  #endif  /* _LP64 */  	if (err != 0) {  		restoreexecenv(&origenv, &orig_sigaltstack); + +		if (interp != NULL) +			kmem_free(interp, MAXPATHLEN); +  		return (err);  	} @@ -761,7 +833,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,  	sedp->sed_phent = ehdr.e_phentsize;  	sedp->sed_phnum = ehdr.e_phnum; -	if (interp) { +	if (interp != NULL) {  		if (ehdr.e_type == ET_DYN) {  			/*  			 * This is a shared object executable, so we @@ -777,16 +849,20 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,  		 * it in and store relevant information about it in the  		 * aux vector, where the brand library can find it.  		 */ -		if ((err = lookupname(linker, UIO_SYSSPACE, +		if ((err = lookupname(interp, UIO_SYSSPACE,  		    FOLLOW, NULLVPP, &nvp)) != 0) { -			uprintf("%s: not found.", brandlinker); +			uprintf("%s: not found.", interp);  			restoreexecenv(&origenv, &orig_sigaltstack); +			kmem_free(interp, MAXPATHLEN);  			return (err);  		} + +		kmem_free(interp, MAXPATHLEN); +  		if (args->to_model == DATAMODEL_NATIVE) {  			err = mapexec_brand(nvp, args, &ehdr,  			    &uphdr_vaddr, &voffset, exec_file, &interp, -			    NULL, NULL, NULL, &lddata); +			    NULL, NULL, NULL, &lddata, NULL);  		}  #if defined(_LP64)  		else { @@ -794,7 +870,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,  			Elf32_Addr uphdr_vaddr32;  			err = mapexec32_brand(nvp, args, &ehdr32,  			    &uphdr_vaddr32, &voffset, exec_file, &interp, -			    NULL, NULL, NULL, &lddata); +			    NULL, NULL, NULL, &lddata, NULL);  			Ehdr32to64(&ehdr32, &ehdr);  			if (uphdr_vaddr32 == (Elf32_Addr)-1) @@ -934,9 +1010,9 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,  	/*  	 * Third, the /proc aux vectors set up by elfexec() point to -	 * brand emulation library and it's linker.  Copy these to the +	 * brand emulation library and its linker.  Copy these to the  	 * /proc brand specific aux vector, and update the regular -	 * /proc aux vectors to point to the executable (and it's +	 * /proc aux vectors to point to the executable (and its  	 * linker).  This will enable debuggers to access the  	 * executable via the usual /proc or elf notes aux vectors.  	 * @@ -1078,55 +1154,31 @@ brand_solaris_freelwp(klwp_t *l, struct brand *pbrand)  }  /*ARGSUSED*/ -int +void  brand_solaris_initlwp(klwp_t *l, struct brand *pbrand)  {  	ASSERT(l->lwp_procp->p_brand == pbrand);  	ASSERT(l->lwp_procp->p_brand_data != NULL);  	ASSERT(l->lwp_brand == NULL);  	l->lwp_brand = (void *)-1; -	return (0);  }  /*ARGSUSED*/  void  brand_solaris_lwpexit(klwp_t *l, struct brand *pbrand)  { -	proc_t  *p = l->lwp_procp; -  	ASSERT(l->lwp_procp->p_brand == pbrand);  	ASSERT(l->lwp_procp->p_brand_data != NULL);  	ASSERT(l->lwp_brand != NULL); - -	/* -	 * We should never be called for the last thread in a process. -	 * (That case is handled by brand_solaris_proc_exit().) -	 * Therefore this lwp must be exiting from a multi-threaded -	 * process. -	 */ -	ASSERT(p->p_tlist != p->p_tlist->t_forw); - -	l->lwp_brand = NULL;  }  /*ARGSUSED*/  void -brand_solaris_proc_exit(struct proc *p, klwp_t *l, struct brand *pbrand) +brand_solaris_proc_exit(struct proc *p, struct brand *pbrand)  {  	ASSERT(p->p_brand == pbrand);  	ASSERT(p->p_brand_data != NULL); -	/* -	 * When called from proc_exit(), we know that process is -	 * single-threaded and free our lwp brand data. -	 * otherwise just free p_brand_data and return. 
-	 */ -	if (l != NULL) { -		ASSERT(p->p_tlist == p->p_tlist->t_forw); -		ASSERT(p->p_tlist->t_lwp == l); -		(void) brand_solaris_freelwp(l, pbrand); -	} -  	/* upon exit, free our proc brand data */  	kmem_free(p->p_brand_data, sizeof (brand_proc_data_t));  	p->p_brand_data = NULL; @@ -1145,5 +1197,4 @@ brand_solaris_setbrand(proc_t *p, struct brand *pbrand)  	ASSERT(p->p_tlist == p->p_tlist->t_forw);  	p->p_brand_data = kmem_zalloc(sizeof (brand_proc_data_t), KM_SLEEP); -	(void) brand_solaris_initlwp(p->p_tlist->t_lwp, pbrand);  } diff --git a/usr/src/uts/common/os/clock_highres.c b/usr/src/uts/common/os/clock_highres.c index 805813037d..1280c8a1b6 100644 --- a/usr/src/uts/common/os/clock_highres.c +++ b/usr/src/uts/common/os/clock_highres.c @@ -25,7 +25,7 @@   */  /* - * Copyright (c) 2015, Joyent Inc. All rights reserved. + * Copyright 2016, Joyent Inc.   */  #include <sys/timer.h> @@ -41,6 +41,9 @@  static clock_backend_t clock_highres; +/* minimum non-privileged interval (200us) */ +long clock_highres_interval_min = 200000; +  /*ARGSUSED*/  static int  clock_highres_settime(timespec_t *ts) @@ -68,17 +71,6 @@ clock_highres_getres(timespec_t *ts)  static int  clock_highres_timer_create(itimer_t *it, void (*fire)(itimer_t *))  { -	/* -	 * CLOCK_HIGHRES timers of sufficiently high resolution can deny -	 * service; only allow privileged users to create such timers. -	 * Sites that do not wish to have this restriction should -	 * give users the "proc_clock_highres" privilege. -	 */ -	if (secpolicy_clock_highres(CRED()) != 0) { -		it->it_arg = NULL; -		return (EPERM); -	} -  	it->it_arg = kmem_zalloc(sizeof (cyclic_id_t), KM_SLEEP);  	it->it_fire = fire; @@ -111,6 +103,49 @@ clock_highres_timer_settime(itimer_t *it, int flags,  	cpu_t *cpu;  	cpupart_t *cpupart;  	int pset; +	boolean_t value_need_clamp = B_FALSE; +	boolean_t intval_need_clamp = B_FALSE; +	cred_t *cr = CRED(); +	struct itimerspec clamped; + +	/* +	 * CLOCK_HIGHRES timers of sufficiently high resolution can deny +	 * service; only allow privileged users to create such timers. +	 * Non-privileged users (those without the "proc_clock_highres" +	 * privilege) can create timers with lower resolution but if they +	 * attempt to use a very low time value (< 200us) then their +	 * timer will be clamped at 200us. 
+	 */ +	if (when->it_value.tv_sec == 0 && +	    when->it_value.tv_nsec > 0 && +	    when->it_value.tv_nsec < clock_highres_interval_min) +		value_need_clamp = B_TRUE; + +	if (when->it_interval.tv_sec == 0 && +	    when->it_interval.tv_nsec > 0 && +	    when->it_interval.tv_nsec < clock_highres_interval_min) +		intval_need_clamp = B_TRUE; + +	if ((value_need_clamp || intval_need_clamp) && +	    secpolicy_clock_highres(cr) != 0) { +		clamped.it_value.tv_sec = when->it_value.tv_sec; +		clamped.it_interval.tv_sec = when->it_interval.tv_sec; + +		if (value_need_clamp) { +			clamped.it_value.tv_nsec = clock_highres_interval_min; +		} else { +			clamped.it_value.tv_nsec = when->it_value.tv_nsec; +		} + +		if (intval_need_clamp) { +			clamped.it_interval.tv_nsec = +			    clock_highres_interval_min; +		} else { +			clamped.it_interval.tv_nsec = when->it_interval.tv_nsec; +		} + +		when = &clamped; +	}  	cyctime.cyt_when = ts2hrt(&when->it_value);  	cyctime.cyt_interval = ts2hrt(&when->it_interval); diff --git a/usr/src/uts/common/os/contract.c b/usr/src/uts/common/os/contract.c index 249066674e..9ea08f5535 100644 --- a/usr/src/uts/common/os/contract.c +++ b/usr/src/uts/common/os/contract.c @@ -21,6 +21,7 @@  /*   * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2016 Joyent, Inc.   */  /* @@ -287,7 +288,10 @@ contract_ctor(contract_t *ct, ct_type_t *type, ct_template_t *tmpl, void *data,  	avl_index_t where;  	klwp_t *curlwp = ttolwp(curthread); -	ASSERT(author == curproc); +	/* +	 * It's possible that author is not curproc if the zone is creating +	 * a new process as a child of zsched. +	 */  	mutex_init(&ct->ct_lock, NULL, MUTEX_DEFAULT, NULL);  	mutex_init(&ct->ct_reflock, NULL, MUTEX_DEFAULT, NULL); diff --git a/usr/src/uts/common/os/core.c b/usr/src/uts/common/os/core.c index d5e272c16a..437f26e6e0 100644 --- a/usr/src/uts/common/os/core.c +++ b/usr/src/uts/common/os/core.c @@ -21,7 +21,7 @@  /*   * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent Inc. All rights reserved. + * Copyright 2016, Joyent Inc.   * Copyright (c) 2016 by Delphix. All rights reserved.   */ @@ -125,6 +125,7 @@ remove_core_file(char *fp, enum core_types core_type)  	/*  	 * Determine what rootvp to use.  	 */ +	mutex_enter(&curproc->p_lock);  	if (core_type == CORE_PROC) {  		rootvp = (PTOU(curproc)->u_rdir == NULL ?  		    curproc->p_zone->zone_rootvp : PTOU(curproc)->u_rdir); @@ -140,6 +141,7 @@ remove_core_file(char *fp, enum core_types core_type)  	VN_HOLD(startvp);  	if (rootvp != rootdir)  		VN_HOLD(rootvp); +	mutex_exit(&curproc->p_lock);  	if ((error = lookuppnvp(&pn, NULL, NO_FOLLOW, &dvp, &vp, rootvp,  	    startvp, CRED())) != 0) {  		pn_free(&pn); diff --git a/usr/src/uts/common/os/cpu.c b/usr/src/uts/common/os/cpu.c index 87c0896814..3a42ac8fb1 100644 --- a/usr/src/uts/common/os/cpu.c +++ b/usr/src/uts/common/os/cpu.c @@ -21,6 +21,7 @@  /*   * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.   * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright 2016 Joyent, Inc.   
*/  /* @@ -108,6 +109,7 @@ kmutex_t	cpu_lock;  cpu_t		*cpu_list;		/* list of all CPUs */  cpu_t		*clock_cpu_list;	/* used by clock to walk CPUs */  cpu_t		*cpu_active;		/* list of active CPUs */ +cpuset_t	cpu_active_set;		/* cached set of active CPUs */  static cpuset_t	cpu_available;		/* set of available CPUs */  cpuset_t	cpu_seqid_inuse;	/* which cpu_seqids are in use */ @@ -1724,6 +1726,7 @@ cpu_list_init(cpu_t *cp)  	cp->cpu_part = &cp_default;  	CPUSET_ADD(cpu_available, cp->cpu_id); +	CPUSET_ADD(cpu_active_set, cp->cpu_id);  }  /* @@ -1895,6 +1898,7 @@ cpu_add_active_internal(cpu_t *cp)  	cp->cpu_prev_onln = cpu_active->cpu_prev_onln;  	cpu_active->cpu_prev_onln->cpu_next_onln = cp;  	cpu_active->cpu_prev_onln = cp; +	CPUSET_ADD(cpu_active_set, cp->cpu_id);  	if (pp->cp_cpulist) {  		cp->cpu_next_part = pp->cp_cpulist; @@ -1965,6 +1969,7 @@ cpu_remove_active(cpu_t *cp)  	}  	cp->cpu_next_onln = cp;  	cp->cpu_prev_onln = cp; +	CPUSET_DEL(cpu_active_set, cp->cpu_id);  	cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part;  	cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part; @@ -2704,13 +2709,18 @@ cpu_bind_thread(kthread_id_t tp, processorid_t bind, processorid_t *obind,  	return (0);  } -#if CPUSET_WORDS > 1 -/* - * Functions for implementing cpuset operations when a cpuset is more - * than one word.  On platforms where a cpuset is a single word these - * are implemented as macros in cpuvar.h. - */ +cpuset_t * +cpuset_alloc(int kmflags) +{ +	return (kmem_alloc(sizeof (cpuset_t), kmflags)); +} + +void +cpuset_free(cpuset_t *s) +{ +	kmem_free(s, sizeof (cpuset_t)); +}  void  cpuset_all(cpuset_t *s) @@ -2735,25 +2745,45 @@ cpuset_only(cpuset_t *s, uint_t cpu)  	CPUSET_ADD(*s, cpu);  } +long +cpu_in_set(cpuset_t *s, uint_t cpu) +{ +	return (BT_TEST(s->cpub, cpu)); +} + +void +cpuset_add(cpuset_t *s, uint_t cpu) +{ +	BT_SET(s->cpub, cpu); +} + +void +cpuset_del(cpuset_t *s, uint_t cpu) +{ +	BT_CLEAR(s->cpub, cpu); +} +  int  cpuset_isnull(cpuset_t *s)  {  	int i; -	for (i = 0; i < CPUSET_WORDS; i++) +	for (i = 0; i < CPUSET_WORDS; i++) {  		if (s->cpub[i] != 0)  			return (0); +	}  	return (1);  }  int -cpuset_cmp(cpuset_t *s1, cpuset_t *s2) +cpuset_isequal(cpuset_t *s1, cpuset_t *s2)  {  	int i; -	for (i = 0; i < CPUSET_WORDS; i++) +	for (i = 0; i < CPUSET_WORDS; i++) {  		if (s1->cpub[i] != s2->cpub[i])  			return (0); +	}  	return (1);  } @@ -2822,7 +2852,68 @@ cpuset_bounds(cpuset_t *s, uint_t *smallestid, uint_t *largestid)  	*smallestid = *largestid = CPUSET_NOTINSET;  } -#endif	/* CPUSET_WORDS */ +void +cpuset_atomic_del(cpuset_t *s, uint_t cpu) +{ +	BT_ATOMIC_CLEAR(s->cpub, (cpu)) +} + +void +cpuset_atomic_add(cpuset_t *s, uint_t cpu) +{ +	BT_ATOMIC_SET(s->cpub, (cpu)) +} + +long +cpuset_atomic_xadd(cpuset_t *s, uint_t cpu) +{ +	long res; + +	BT_ATOMIC_SET_EXCL(s->cpub, cpu, res); +	return (res); +} + +long +cpuset_atomic_xdel(cpuset_t *s, uint_t cpu) +{ +	long res; + +	BT_ATOMIC_CLEAR_EXCL(s->cpub, cpu, res); +	return (res); +} + +void +cpuset_or(cpuset_t *dst, cpuset_t *src) +{ +	for (int i = 0; i < CPUSET_WORDS; i++) { +		dst->cpub[i] |= src->cpub[i]; +	} +} + +void +cpuset_xor(cpuset_t *dst, cpuset_t *src) +{ +	for (int i = 0; i < CPUSET_WORDS; i++) { +		dst->cpub[i] ^= src->cpub[i]; +	} +} + +void +cpuset_and(cpuset_t *dst, cpuset_t *src) +{ +	for (int i = 0; i < CPUSET_WORDS; i++) { +		dst->cpub[i] &= src->cpub[i]; +	} +} + +void +cpuset_zero(cpuset_t *dst) +{ +	for (int i = 0; i < CPUSET_WORDS; i++) { +		dst->cpub[i] = 0; +	} +} +  /*   * Unbind threads bound to specified 
CPU. diff --git a/usr/src/uts/common/os/cred.c b/usr/src/uts/common/os/cred.c index 25727d54c5..0bd6cfd44f 100644 --- a/usr/src/uts/common/os/cred.c +++ b/usr/src/uts/common/os/cred.c @@ -729,6 +729,14 @@ crgetzoneid(const cred_t *cr)  	    cr->cr_zone->zone_id);  } +zoneid_t +crgetzonedid(const cred_t *cr) +{ +	return (cr->cr_zone == NULL ? +	    (cr->cr_uid == -1 ? (zoneid_t)-1 : GLOBAL_ZONEID) : +	    cr->cr_zone->zone_did); +} +  projid_t  crgetprojid(const cred_t *cr)  { diff --git a/usr/src/uts/common/os/ddi_intr_irm.c b/usr/src/uts/common/os/ddi_intr_irm.c index c3c0481e7f..a4b35dcb5b 100644 --- a/usr/src/uts/common/os/ddi_intr_irm.c +++ b/usr/src/uts/common/os/ddi_intr_irm.c @@ -1320,7 +1320,7 @@ i_ddi_irm_notify(ddi_irm_pool_t *pool_p, ddi_irm_req_t *req_p)  	/* Log callback errors */  	if (ret != DDI_SUCCESS) { -		cmn_err(CE_WARN, "%s%d: failed callback (action=%d, ret=%d)\n", +		cmn_err(CE_WARN, "!%s%d: failed callback (action=%d, ret=%d)\n",  		    ddi_driver_name(req_p->ireq_dip),  		    ddi_get_instance(req_p->ireq_dip), (int)action, ret);  	} diff --git a/usr/src/uts/common/os/exec.c b/usr/src/uts/common/os/exec.c index 0065b4945b..2ab4d1f023 100644 --- a/usr/src/uts/common/os/exec.c +++ b/usr/src/uts/common/os/exec.c @@ -98,6 +98,7 @@ uint_t auxv_hwcap32_2 = 0;	/* 32-bit version of auxv_hwcap2 */  #endif  #define	PSUIDFLAGS		(SNOCD|SUGID) +#define	RANDOM_LEN	16	/* 16 bytes for AT_RANDOM aux entry */  /*   * These are consumed within the specific exec modules, but are defined here @@ -256,8 +257,10 @@ exec_common(const char *fname, const char **argp, const char **envp,  	 * only if the pathname does not contain a "/" the resolved path  	 * points to a file in the current working (attribute) directory.  	 */ -	if ((p->p_user.u_cdir->v_flag & V_XATTRDIR) != 0 && +	mutex_enter(&p->p_lock); +	if ((PTOU(p)->u_cdir->v_flag & V_XATTRDIR) != 0 &&  	    strchr(resolvepn.pn_path, '/') == NULL) { +		mutex_exit(&p->p_lock);  		if (dir != NULL)  			VN_RELE(dir);  		error = EACCES; @@ -266,6 +269,7 @@ exec_common(const char *fname, const char **argp, const char **envp,  		VN_RELE(vp);  		goto out;  	} +	mutex_exit(&p->p_lock);  	bzero(exec_file, MAXCOMLEN+1);  	(void) strncpy(exec_file, pn.pn_path, MAXCOMLEN); @@ -313,14 +317,43 @@ exec_common(const char *fname, const char **argp, const char **envp,  	ua.argp = argp;  	ua.envp = envp; -	/* If necessary, brand this process before we start the exec. */ -	if (brandme) -		brand_setbrand(p); +	/* If necessary, brand this process/lwp before we start the exec. */ +	if (brandme) { +		void *brand_data = NULL; + +		/* +		 * Process branding may fail if multiple LWPs are present and +		 * holdlwps() cannot complete successfully. 
+		 */ +		error = brand_setbrand(p, B_TRUE); + +		if (error == 0 && BROP(p)->b_lwpdata_alloc != NULL) { +			brand_data = BROP(p)->b_lwpdata_alloc(p); +			if (brand_data == NULL) { +				error = 1; +			} +		} + +		if (error == 0) { +			mutex_enter(&p->p_lock); +			BROP(p)->b_initlwp(lwp, brand_data); +			mutex_exit(&p->p_lock); +		} else { +			VN_RELE(vp); +			if (dir != NULL) { +				VN_RELE(dir); +			} +			pn_free(&resolvepn); +			goto fail; +		} +	}  	if ((error = gexec(&vp, &ua, &args, NULL, 0, &execsz, -	    exec_file, p->p_cred, brand_action)) != 0) { -		if (brandme) -			brand_clearbrand(p, B_FALSE); +	    exec_file, p->p_cred, &brand_action)) != 0) { +		if (brandme) { +			BROP(p)->b_freelwp(lwp); +			brand_clearbrand(p, B_TRUE); +		}  		VN_RELE(vp);  		if (dir != NULL)  			VN_RELE(dir); @@ -352,7 +385,7 @@ exec_common(const char *fname, const char **argp, const char **envp,  	/*  	 * Clear contract template state  	 */ -	lwp_ctmpl_clear(lwp); +	lwp_ctmpl_clear(lwp, B_TRUE);  	/*  	 * Save the directory in which we found the executable for expanding @@ -376,6 +409,8 @@ exec_common(const char *fname, const char **argp, const char **envp,  	 * pending held signals remain held, so don't clear t_hold.  	 */  	mutex_enter(&p->p_lock); +	DTRACE_PROBE3(oldcontext__set, klwp_t *, lwp, +	    uintptr_t, lwp->lwp_oldcontext, uintptr_t, 0);  	lwp->lwp_oldcontext = 0;  	lwp->lwp_ustack = 0;  	lwp->lwp_old_stk_ctl = 0; @@ -435,8 +470,10 @@ exec_common(const char *fname, const char **argp, const char **envp,  	TRACE_2(TR_FAC_PROC, TR_PROC_EXEC, "proc_exec:p %p up %p", p, up);  	/* Unbrand ourself if necessary. */ -	if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE)) +	if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE)) { +		BROP(p)->b_freelwp(lwp);  		brand_clearbrand(p, B_FALSE); +	}  	setregs(&args); @@ -560,7 +597,7 @@ gexec(  	long *execsz,  	caddr_t exec_file,  	struct cred *cred, -	int brand_action) +	int *brand_action)  {  	struct vnode *vp, *execvp = NULL;  	proc_t *pp = ttoproc(curthread); @@ -881,8 +918,14 @@ gexec(  			if (pp->p_plist || (pp->p_proc_flag & P_PR_TRACE))  				args->traceinval = 1;  		} -		if (pp->p_proc_flag & P_PR_PTRACE) + +		/* +		 * If legacy ptrace is enabled, generate the SIGTRAP. +		 */ +		if (pp->p_proc_flag & P_PR_PTRACE) {  			psignal(pp, SIGTRAP); +		} +  		if (args->traceinval)  			prinvalidate(&pp->p_user);  	} @@ -1546,6 +1589,27 @@ stk_add(uarg_t *args, const char *sp, enum uio_seg segflg)  	return (0);  } +/* + * Add a fixed size byte array to the stack (only from kernel space). + */ +static int +stk_byte_add(uarg_t *args, const uint8_t *sp, size_t len) +{ +	int error; + +	if (STK_AVAIL(args) < sizeof (int)) +		return (E2BIG); +	*--args->stk_offp = args->stk_strp - args->stk_base; + +	if (len > STK_AVAIL(args)) +		return (E2BIG); +	bcopy(sp, args->stk_strp, len); + +	args->stk_strp += len; + +	return (0); +} +  static int  stk_getptr(uarg_t *args, char *src, char **dst)  { @@ -1582,6 +1646,7 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)  	size_t size, pad;  	char *argv = (char *)uap->argp;  	char *envp = (char *)uap->envp; +	uint8_t rdata[RANDOM_LEN];  	/*  	 * Copy interpreter's name and argument to argv[0] and argv[1]. @@ -1664,8 +1729,9 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)  	args->ne = args->na - argc;  	/* -	 * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, and -	 * AT_SUN_EMULATOR strings to the stack. 
+	 * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, +	 * AT_SUN_BRAND_NROOT, and AT_SUN_EMULATOR strings, as well as AT_RANDOM +	 * array, to the stack.  	 */  	if (auxvpp != NULL && *auxvpp != NULL) {  		if ((error = stk_add(args, platform, UIO_SYSSPACE)) != 0) @@ -1678,6 +1744,20 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)  		if (args->emulator != NULL &&  		    (error = stk_add(args, args->emulator, UIO_SYSSPACE)) != 0)  			return (error); + +		/* +		 * For the AT_RANDOM aux vector we provide 16 bytes of random +		 * data. +		 */ +		(void) random_get_pseudo_bytes(rdata, sizeof (rdata)); + +		if ((error = stk_byte_add(args, rdata, sizeof (rdata))) != 0) +			return (error); + +		if (args->brand_nroot != NULL && +		    (error = stk_add(args, args->brand_nroot, +		    UIO_SYSSPACE)) != 0) +			return (error);  	}  	/* @@ -1784,7 +1864,7 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up)  	/*  	 * Fill in the aux vector now that we know the user stack addresses  	 * for the AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME and -	 * AT_SUN_EMULATOR strings. +	 * AT_SUN_EMULATOR strings, as well as the AT_RANDOM array.  	 */  	if (auxvpp != NULL && *auxvpp != NULL) {  		if (args->to_model == DATAMODEL_NATIVE) { @@ -1797,6 +1877,11 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up)  			if (args->emulator != NULL)  				ADDAUX(*a,  				    AT_SUN_EMULATOR, (long)&ustrp[*--offp]) +			ADDAUX(*a, AT_RANDOM, (long)&ustrp[*--offp]) +			if (args->brand_nroot != NULL) { +				ADDAUX(*a, +				    AT_SUN_BRAND_NROOT, (long)&ustrp[*--offp]) +			}  		} else {  			auxv32_t **a = (auxv32_t **)auxvpp;  			ADDAUX(*a, @@ -1809,6 +1894,11 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up)  			if (args->emulator != NULL)  				ADDAUX(*a, AT_SUN_EMULATOR,  				    (int)(uintptr_t)&ustrp[*--offp]) +			ADDAUX(*a, AT_RANDOM, (int)(uintptr_t)&ustrp[*--offp]) +			if (args->brand_nroot != NULL) { +				ADDAUX(*a, AT_SUN_BRAND_NROOT, +				    (int)(uintptr_t)&ustrp[*--offp]) +			}  		}  	} @@ -1935,6 +2025,9 @@ exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)  		usrstack = (char *)USRSTACK32;  	} +	if (args->maxstack != 0 && (uintptr_t)usrstack > args->maxstack) +		usrstack = (char *)args->maxstack; +  	ASSERT(P2PHASE((uintptr_t)usrstack, args->stk_align) == 0);  #if defined(__sparc) diff --git a/usr/src/uts/common/os/exit.c b/usr/src/uts/common/os/exit.c index 1b9359da47..41f5f29eee 100644 --- a/usr/src/uts/common/os/exit.c +++ b/usr/src/uts/common/os/exit.c @@ -21,7 +21,7 @@  /*   * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc.   
*/  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/ @@ -230,7 +230,7 @@ restart_init(int what, int why)  		siginfofree(lwp->lwp_curinfo);  		lwp->lwp_curinfo = NULL;  	} -	lwp_ctmpl_clear(lwp); +	lwp_ctmpl_clear(lwp, B_FALSE);  	/*  	 * Reset both the process root directory and the current working @@ -366,19 +366,6 @@ proc_exit(int why, int what)  	}  	mutex_exit(&p->p_lock); -	DTRACE_PROC(lwp__exit); -	DTRACE_PROC1(exit, int, why); - -	/* -	 * Will perform any brand specific proc exit processing, since this -	 * is always the last lwp, will also perform lwp_exit and free brand -	 * data -	 */ -	if (PROC_IS_BRANDED(p)) { -		lwp_detach_brand_hdlrs(lwp); -		brand_clearbrand(p, B_FALSE); -	} -  	/*  	 * Don't let init exit unless zone_start_init() failed its exec, or  	 * we are shutting down the zone or the machine. @@ -390,12 +377,35 @@ proc_exit(int why, int what)  		if (z->zone_boot_err == 0 &&  		    zone_status_get(z) < ZONE_IS_SHUTTING_DOWN &&  		    zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN) { -			if (z->zone_restart_init == B_TRUE) { -				if (restart_init(what, why) == 0) -					return (0); + +			/* +			 * If the init process should be restarted, the +			 * "zone_restart_init" member will be set.  Some init +			 * programs in branded zones do not tolerate a restart +			 * in the traditional manner; setting the +			 * "zone_reboot_on_init_exit" member will cause the +			 * entire zone to be rebooted instead.  If neither of +			 * these flags is set the zone will shut down. +			 */ +			if (z->zone_reboot_on_init_exit == B_TRUE && +			    z->zone_restart_init == B_TRUE) { +				/* +				 * Trigger a zone reboot and continue +				 * with exit processing. +				 */ +				z->zone_init_status = wstat(why, what); +				(void) zone_kadmin(A_REBOOT, 0, NULL, +				    zone_kcred()); +  			} else { +				if (z->zone_restart_init == B_TRUE) { +					if (restart_init(what, why) == 0) +						return (0); +				} + +				z->zone_init_status = wstat(why, what);  				(void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, -				    CRED()); +				    zone_kcred());  			}  		} @@ -407,6 +417,32 @@ proc_exit(int why, int what)  		z->zone_proc_initpid = -1;  	} +	/* +	 * Delay firing probes (and performing brand cleanup) until after the +	 * zone_proc_initpid check. Cases which result in zone shutdown or +	 * restart via zone_kadmin eventually result in a call back to +	 * proc_exit. +	 */ +	DTRACE_PROC(lwp__exit); +	DTRACE_PROC1(exit, int, why); + +	/* +	 * Will perform any brand specific proc exit processing. Since this +	 * is always the last lwp, will also perform lwp exit/free and proc +	 * exit. Brand data will be freed when the process is reaped. +	 */ +	if (PROC_IS_BRANDED(p)) { +		BROP(p)->b_lwpexit(lwp); +		BROP(p)->b_proc_exit(p); +		/* +		 * To ensure that b_proc_exit has access to brand-specific data +		 * contained by the one remaining lwp, call the freelwp hook as +		 * the last part of this clean-up process. 
+		 */ +		BROP(p)->b_freelwp(lwp); +		lwp_detach_brand_hdlrs(lwp); +	} +  	lwp_pcb_exit();  	/* @@ -658,10 +694,22 @@ proc_exit(int why, int what)  	if ((q = p->p_child) != NULL && p != proc_init) {  		struct proc	*np;  		struct proc	*initp = proc_init; +		pid_t		zone_initpid = 1; +		struct proc	*zoneinitp = NULL;  		boolean_t	setzonetop = B_FALSE; -		if (!INGLOBALZONE(curproc)) -			setzonetop = B_TRUE; +		if (!INGLOBALZONE(curproc)) { +			zone_initpid = curproc->p_zone->zone_proc_initpid; + +			ASSERT(MUTEX_HELD(&pidlock)); +			zoneinitp = prfind(zone_initpid); +			if (zoneinitp != NULL) { +				initp = zoneinitp; +			} else { +				zone_initpid = 1; +				setzonetop = B_TRUE; +			} +		}  		pgdetach(p); @@ -673,7 +721,8 @@ proc_exit(int why, int what)  			 */  			delete_ns(q->p_parent, q); -			q->p_ppid = 1; +			q->p_ppid = zone_initpid; +  			q->p_pidflag &= ~(CLDNOSIGCHLD | CLDWAITPID);  			if (setzonetop) {  				mutex_enter(&q->p_lock); @@ -847,8 +896,50 @@ proc_exit(int why, int what)  	mutex_exit(&p->p_lock);  	if (!evaporate) { -		p->p_pidflag &= ~CLDPEND; -		sigcld(p, sqp); +		/* +		 * The brand specific code only happens when the brand has a +		 * function to call in place of sigcld and the parent of the +		 * exiting process is not the global zone init. If the parent +		 * is the global zone init, then the process was reparented, +		 * and we don't want brand code delivering possibly strange +		 * signals to init. Also, init is not branded, so any brand +		 * specific exit data will not be picked up by init anyway. +		 */ +		if (PROC_IS_BRANDED(p) && +		    BROP(p)->b_exit_with_sig != NULL && +		    p->p_ppid != 1) { +			/* +			 * The code for _fini that could unload the brand_t +			 * blocks until the count of zones using the module +			 * reaches zero. Zones decrement the refcount on their +			 * brands only after all user tasks in that zone have +			 * exited and been waited on. The decrement on the +			 * brand's refcount happen in zone_destroy(). That +			 * depends on zone_shutdown() having been completed. +			 * zone_shutdown() includes a call to zone_empty(), +			 * where the zone waits for itself to reach the state +			 * ZONE_IS_EMPTY. This state is only set in either +			 * zone_shutdown(), when there are no user processes as +			 * the zone enters this function, or in +			 * zone_task_rele(). zone_task_rele() is called from +			 * code triggered by waiting on processes, not by the +			 * processes exiting through proc_exit().  This means +			 * all the branded processes that could exist for a +			 * specific brand_t must exit and get reaped before the +			 * refcount on the brand_t can reach 0. _fini will +			 * never unload the corresponding brand module before +			 * proc_exit finishes execution for all processes +			 * branded with a particular brand_t, which makes the +			 * operation below safe to do. Brands that wish to use +			 * this mechanism must wait in _fini as described +			 * above. 
+			 */ +			BROP(p)->b_exit_with_sig(p, sqp); +		} else { +			p->p_pidflag &= ~CLDPEND; +			sigcld(p, sqp); +		} +  	} else {  		/*  		 * Do what sigcld() would do if the disposition @@ -927,10 +1018,9 @@ winfo(proc_t *pp, k_siginfo_t *ip, int waitflag)  int  waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)  { -	int found;  	proc_t *cp, *pp; -	int proc_gone;  	int waitflag = !(options & WNOWAIT); +	boolean_t have_brand_helper = B_FALSE;  	/*  	 * Obsolete flag, defined here only for binary compatibility @@ -958,7 +1048,8 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)  	pp = ttoproc(curthread);  	/* -	 * lock parent mutex so that sibling chain can be searched. +	 * Anytime you are looking for a process, you take pidlock to prevent +	 * things from changing as you look.  	 */  	mutex_enter(&pidlock); @@ -978,10 +1069,37 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)  		return (ECHILD);  	} -	while (pp->p_child != NULL) { +	if (PROC_IS_BRANDED(pp) && BROP(pp)->b_waitid_helper != NULL) { +		have_brand_helper = B_TRUE; +	} + +	while (pp->p_child != NULL || have_brand_helper) { +		boolean_t brand_wants_wait = B_FALSE; +		int proc_gone = 0; +		int found = 0; -		proc_gone = 0; +		/* +		 * Give the brand a chance to return synthetic results from +		 * this waitid() call before we do the real thing. +		 */ +		if (have_brand_helper) { +			int ret; +			if (BROP(pp)->b_waitid_helper(idtype, id, ip, options, +			    &brand_wants_wait, &ret) == 0) { +				mutex_exit(&pidlock); +				return (ret); +			} + +			if (pp->p_child == NULL) { +				goto no_real_children; +			} +		} + +		/* +		 * Look for interesting children in the newstate list. +		 */ +		VERIFY(pp->p_child != NULL);  		for (cp = pp->p_child_ns; cp != NULL; cp = cp->p_sibling_ns) {  			if (idtype != P_PID && (cp->p_pidflag & CLDWAITPID))  				continue; @@ -989,6 +1107,11 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)  				continue;  			if (idtype == P_PGID && id != cp->p_pgrp)  				continue; +			if (PROC_IS_BRANDED(pp)) { +				if (BROP(pp)->b_wait_filter != NULL && +				    BROP(pp)->b_wait_filter(pp, cp) == B_FALSE) +					continue; +			}  			switch (cp->p_wcode) { @@ -1033,12 +1156,16 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)  		 * Wow! None of the threads on the p_sibling_ns list were  		 * interesting threads. Check all the kids!  		 */ -		found = 0;  		for (cp = pp->p_child; cp != NULL; cp = cp->p_sibling) {  			if (idtype == P_PID && id != cp->p_pid)  				continue;  			if (idtype == P_PGID && id != cp->p_pgrp)  				continue; +			if (PROC_IS_BRANDED(pp)) { +				if (BROP(pp)->b_wait_filter != NULL && +				    BROP(pp)->b_wait_filter(pp, cp) == B_FALSE) +					continue; +			}  			switch (cp->p_wcode) {  			case CLD_TRAPPED: @@ -1107,11 +1234,12 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)  				break;  		} +no_real_children:  		/*  		 * If we found no interesting processes at all,  		 * break out and return ECHILD.  		 */ -		if (found + proc_gone == 0) +		if (!brand_wants_wait && (found + proc_gone == 0))  			break;  		if (options & WNOHANG) { @@ -1130,7 +1258,7 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)  		 * change state while we wait, we don't wait at all.  		 * Get out with ECHILD according to SVID.  		 
*/ -		if (found == proc_gone) +		if (!brand_wants_wait && (found == proc_gone))  			break;  		if (!cv_wait_sig_swap(&pp->p_cv, &pidlock)) { @@ -1226,6 +1354,12 @@ freeproc(proc_t *p)  		p->p_killsqp = NULL;  	} +	/* Clear any remaining brand data */ +	if (PROC_IS_BRANDED(p)) { +		brand_clearbrand(p, B_FALSE); +	} + +  	prfree(p);	/* inform /proc */  	/* diff --git a/usr/src/uts/common/os/fio.c b/usr/src/uts/common/os/fio.c index 76eddd4e50..62f7a307f1 100644 --- a/usr/src/uts/common/os/fio.c +++ b/usr/src/uts/common/os/fio.c @@ -21,7 +21,7 @@  /*   * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015, Joyent Inc. + * Copyright 2017, Joyent Inc.   */  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/ @@ -487,7 +487,7 @@ free_afd(afd_t *afd)		/* called below and from thread_free() */  		afd->a_fd[i] = -1;  } -static void +void  set_active_fd(int fd)  {  	afd_t *afd = &curthread->t_activefd; @@ -852,7 +852,8 @@ flist_fork(uf_info_t *pfip, uf_info_t *cfip)  	 */  	cfip->fi_nfiles = nfiles = flist_minsize(pfip); -	cfip->fi_list = kmem_zalloc(nfiles * sizeof (uf_entry_t), KM_SLEEP); +	cfip->fi_list = nfiles == 0 ? NULL : +	    kmem_zalloc(nfiles * sizeof (uf_entry_t), KM_SLEEP);  	for (fd = 0, pufp = pfip->fi_list, cufp = cfip->fi_list; fd < nfiles;  	    fd++, pufp++, cufp++) { diff --git a/usr/src/uts/common/os/fork.c b/usr/src/uts/common/os/fork.c index a63931459f..7e198910b4 100644 --- a/usr/src/uts/common/os/fork.c +++ b/usr/src/uts/common/os/fork.c @@ -21,7 +21,7 @@  /*   * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2013, Joyent, Inc. All rights reserved. + * Copyright 2016, Joyent, Inc.   */  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/ @@ -84,6 +84,7 @@ static int64_t cfork(int, int, int);  static int getproc(proc_t **, pid_t, uint_t);  #define	GETPROC_USER	0x0  #define	GETPROC_KERNEL	0x1 +#define	GETPROC_ZSCHED	0x2  static void fork_fail(proc_t *);  static void forklwp_fail(proc_t *); @@ -705,7 +706,7 @@ fork_fail(proc_t *cp)  	if (PTOU(curproc)->u_cwd)  		refstr_rele(PTOU(curproc)->u_cwd);  	if (PROC_IS_BRANDED(cp)) { -		brand_clearbrand(cp, B_TRUE); +		brand_clearbrand(cp, B_FALSE);  	}  } @@ -754,7 +755,7 @@ forklwp_fail(proc_t *p)  			kmem_free(t->t_door, sizeof (door_data_t));  			t->t_door = NULL;  		} -		lwp_ctmpl_clear(ttolwp(t)); +		lwp_ctmpl_clear(ttolwp(t), B_FALSE);  		/*  		 * Remove the thread from the all threads list. @@ -791,6 +792,9 @@ extern struct as kas;  /*   * fork a kernel process. + * + * Passing a pid argument of -1 indicates that the new process should be + * launched as a child of 'zsched' within the zone.   */  int  newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct, @@ -809,6 +813,7 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct,  		rctl_set_t *init_set;  		ASSERT(pid != 1); +		ASSERT(pid >= 0);  		if (getproc(&p, pid, GETPROC_KERNEL) < 0)  			return (EAGAIN); @@ -852,8 +857,18 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct,  		rctl_set_t *init_set;  		task_t *tk, *tk_old;  		klwp_t *lwp; +		boolean_t pzsched = B_FALSE; +		int flag = GETPROC_USER; + +		/* Handle a new user-level thread as child of zsched. 
*/ +		if (pid < 0) { +			VERIFY(curzone != global_zone); +			flag = GETPROC_ZSCHED; +			pzsched = B_TRUE; +			pid = 0; +		} -		if (getproc(&p, pid, GETPROC_USER) < 0) +		if (getproc(&p, pid, flag) < 0)  			return (EAGAIN);  		/*  		 * init creates a new task, distinct from the task @@ -914,7 +929,8 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct,  		}  		t = lwptot(lwp); -		ctp = contract_process_fork(sys_process_tmpl, p, curproc, +		ctp = contract_process_fork(sys_process_tmpl, p, +		    (pzsched ? curproc->p_zone->zone_zsched : curproc),  		    B_FALSE);  		ASSERT(ctp != NULL);  		if (ct != NULL) @@ -955,7 +971,11 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)  	if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN)  		return (-1);	/* no point in starting new processes */ -	pp = (flags & GETPROC_KERNEL) ? &p0 : curproc; +	if (flags & GETPROC_ZSCHED) { +		pp = curproc->p_zone->zone_zsched; +	} else { +		pp = (flags & GETPROC_KERNEL) ? &p0 : curproc; +	}  	task = pp->p_task;  	proj = task->tk_proj;  	zone = pp->p_zone; @@ -1016,6 +1036,9 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)  	cp->p_t1_lgrpid = LGRP_NONE;  	cp->p_tr_lgrpid = LGRP_NONE; +	/* Default to native brand initially */ +	cp->p_brand = &native_brand; +  	if ((newpid = pid_allocate(cp, pid, PID_ALLOC_PROC)) == -1) {  		if (nproc == v.v_proc) {  			CPU_STATS_ADDQ(CPU, sys, procovf, 1); @@ -1083,9 +1106,6 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)  	cp->p_flag = pp->p_flag & (SJCTL|SNOWAIT|SNOCD);  	cp->p_sessp = pp->p_sessp;  	sess_hold(pp); -	cp->p_brand = pp->p_brand; -	if (PROC_IS_BRANDED(pp)) -		BROP(pp)->b_copy_procdata(cp, pp);  	cp->p_bssbase = pp->p_bssbase;  	cp->p_brkbase = pp->p_brkbase;  	cp->p_brksize = pp->p_brksize; @@ -1170,6 +1190,18 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)  	mutex_exit(&cp->p_lock);  	mutex_exit(&pidlock); +	if (PROC_IS_BRANDED(pp)) { +		/* +		 * The only reason why process branding should fail is when +		 * the procedure is complicated by multiple LWPs on the scene. +		 * With an LWP count of 0, this newly allocated process has no +		 * reason to fail branding. +		 */ +		VERIFY0(brand_setbrand(cp, B_FALSE)); + +		BROP(pp)->b_copy_procdata(cp, pp); +	} +  	avl_create(&cp->p_ct_held, contract_compar, sizeof (contract_t),  	    offsetof(contract_t, ct_ctlist)); @@ -1187,6 +1219,7 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)  	 */  	fcnt_add(P_FINFO(pp), 1); +	mutex_enter(&pp->p_lock);  	if (PTOU(pp)->u_cdir) {  		VN_HOLD(PTOU(pp)->u_cdir);  	} else { @@ -1200,6 +1233,7 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)  		VN_HOLD(PTOU(pp)->u_rdir);  	if (PTOU(pp)->u_cwd)  		refstr_hold(PTOU(pp)->u_cwd); +	mutex_exit(&pp->p_lock);  	/*  	 * copy the parent's uarea. diff --git a/usr/src/uts/common/os/grow.c b/usr/src/uts/common/os/grow.c index 647bca2542..a3de80259f 100644 --- a/usr/src/uts/common/os/grow.c +++ b/usr/src/uts/common/os/grow.c @@ -19,7 +19,10 @@   * CDDL HEADER END   */ -/* Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved. */ +/* + * Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */  /*   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved. 
@@ -52,6 +55,7 @@  #include <sys/fcntl.h>  #include <sys/lwpchan_impl.h>  #include <sys/nbmlock.h> +#include <sys/brand.h>  #include <vm/hat.h>  #include <vm/as.h> @@ -540,6 +544,20 @@ choose_addr(struct as *as, caddr_t *addrp, size_t len, offset_t off,  	return (0);  } +caddr_t +map_userlimit(proc_t *pp, struct as *as, int flags) +{ +	if (flags & _MAP_LOW32) { +		if (PROC_IS_BRANDED(pp) && BROP(pp)->b_map32limit != NULL) { +			return ((caddr_t)(uintptr_t)BROP(pp)->b_map32limit(pp)); +		} else { +			return ((caddr_t)_userlimit32); +		} +	} + +	return (as->a_userlimit); +} +  /*   * Used for MAP_ANON - fast way to get anonymous pages @@ -555,8 +573,6 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,  		return (EACCES);  	if ((flags & MAP_FIXED) != 0) { -		caddr_t userlimit; -  		/*  		 * Use the user address.  First verify that  		 * the address to be used is page aligned. @@ -565,9 +581,8 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,  		if (((uintptr_t)*addrp & PAGEOFFSET) != 0)  			return (EINVAL); -		userlimit = flags & _MAP_LOW32 ? -		    (caddr_t)USERLIMIT32 : as->a_userlimit; -		switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) { +		switch (valid_usr_range(*addrp, len, uprot, as, +		    map_userlimit(as->a_proc, as, flags))) {  		case RANGE_OKAY:  			break;  		case RANGE_BADPROT: @@ -750,8 +765,6 @@ smmap_common(caddr_t *addrp, size_t len,  	 * If the user specified an address, do some simple checks here  	 */  	if ((flags & MAP_FIXED) != 0) { -		caddr_t userlimit; -  		/*  		 * Use the user address.  First verify that  		 * the address to be used is page aligned. @@ -759,10 +772,8 @@ smmap_common(caddr_t *addrp, size_t len,  		 */  		if (((uintptr_t)*addrp & PAGEOFFSET) != 0)  			return (EINVAL); - -		userlimit = flags & _MAP_LOW32 ? -		    (caddr_t)USERLIMIT32 : as->a_userlimit; -		switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) { +		switch (valid_usr_range(*addrp, len, uprot, as, +		    map_userlimit(curproc, as, flags))) {  		case RANGE_OKAY:  			break;  		case RANGE_BADPROT: diff --git a/usr/src/uts/common/os/id_space.c b/usr/src/uts/common/os/id_space.c deleted file mode 100644 index 2dad0cb940..0000000000 --- a/usr/src/uts/common/os/id_space.c +++ /dev/null @@ -1,159 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. - */ - -#include <sys/types.h> -#include <sys/id_space.h> -#include <sys/debug.h> - -/* - * ID Spaces - * - *   The id_space_t provides a simple implementation of a managed range of - *   integer identifiers using a vmem arena.  
An ID space guarantees that the - *   next identifer returned by an allocation is larger than the previous one, - *   unless there are no larger slots remaining in the range.  In this case, - *   the ID space will return the first available slot in the lower part of the - *   range (viewing the previous identifier as a partitioning element).  If no - *   slots are available, id_alloc()/id_allocff() will sleep until an - *   identifier becomes available.  Accordingly, id_space allocations must be - *   initiated from contexts where sleeping is acceptable.  id_alloc_nosleep()/ - *   id_allocff_nosleep() will return -1 if no slots are available or if the - *   system is low on memory.  If id_alloc_nosleep() fails, callers should - *   not try to extend the ID space.  This is to avoid making a possible - *   low-memory situation worse. - * - *   As an ID space is designed for representing a range of id_t's, there - *   is a preexisting maximal range: [0, MAXUID].  ID space requests outside - *   that range will fail on a DEBUG kernel.  The id_allocff*() functions - *   return the first available id, and should be used when there is benefit - *   to having a compact allocated range. - * - *   (Presently, the id_space_t abstraction supports only direct allocations; ID - *   reservation, in which an ID is allocated but placed in a internal - *   dictionary for later use, should be added when a consuming subsystem - *   arrives.) - */ - -#define	ID_TO_ADDR(id) ((void *)(uintptr_t)(id + 1)) -#define	ADDR_TO_ID(addr) ((id_t)((uintptr_t)addr - 1)) - -/* - * Create an arena to represent the range [low, high). - * Caller must be in a context in which VM_SLEEP is legal. - */ -id_space_t * -id_space_create(const char *name, id_t low, id_t high) -{ -	ASSERT(low >= 0); -	ASSERT(low < high); - -	return (vmem_create(name, ID_TO_ADDR(low), high - low, 1, -	    NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER)); -} - -/* - * Destroy a previously created ID space. - * No restrictions on caller's context. - */ -void -id_space_destroy(id_space_t *isp) -{ -	vmem_destroy(isp); -} - -void -id_space_extend(id_space_t *isp, id_t low, id_t high) -{ -	(void) vmem_add(isp, ID_TO_ADDR(low), high - low, VM_SLEEP); -} - -/* - * Allocate an id_t from specified ID space. - * Caller must be in a context in which VM_SLEEP is legal. - */ -id_t -id_alloc(id_space_t *isp) -{ -	return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_SLEEP | VM_NEXTFIT))); -} - -/* - * Allocate an id_t from specified ID space. - * Returns -1 on failure (see module block comments for more information on - * failure modes). - */ -id_t -id_alloc_nosleep(id_space_t *isp) -{ -	return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_NOSLEEP | VM_NEXTFIT))); -} - -/* - * Allocate an id_t from specified ID space using FIRSTFIT. - * Caller must be in a context in which VM_SLEEP is legal. - */ -id_t -id_allocff(id_space_t *isp) -{ -	return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_SLEEP | VM_FIRSTFIT))); -} - -/* - * Allocate an id_t from specified ID space using FIRSTFIT - * Returns -1 on failure (see module block comments for more information on - * failure modes). - */ -id_t -id_allocff_nosleep(id_space_t *isp) -{ -	return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_NOSLEEP | VM_FIRSTFIT))); -} - -/* - * Allocate a specific identifier if possible, returning the id if - * successful, or -1 on failure. 
- */ -id_t -id_alloc_specific_nosleep(id_space_t *isp, id_t id) -{ -	void *minaddr = ID_TO_ADDR(id); -	void *maxaddr = ID_TO_ADDR(id + 1); - -	/* -	 * Note that even though we're vmem_free()ing this later, it -	 * should be OK, since there's no quantum cache. -	 */ -	return (ADDR_TO_ID(vmem_xalloc(isp, 1, 1, 0, 0, -	    minaddr, maxaddr, VM_NOSLEEP))); -} - -/* - * Free a previously allocated ID. - * No restrictions on caller's context. - */ -void -id_free(id_space_t *isp, id_t id) -{ -	vmem_free(isp, ID_TO_ADDR(id), 1); -} diff --git a/usr/src/uts/common/os/ipc.c b/usr/src/uts/common/os/ipc.c index 9381019cd1..6a6f5d84ef 100644 --- a/usr/src/uts/common/os/ipc.c +++ b/usr/src/uts/common/os/ipc.c @@ -20,6 +20,7 @@   */  /*   * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc.   */  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T		*/ @@ -1217,6 +1218,23 @@ ipc_remove(ipc_service_t *service, kipc_perm_t *perm)  	    (IPC_ZONE_USAGE(perm, service) == 0)));  } +/* + * Perform actual IPC_RMID, either via ipc_rmid or due to a delayed *_RMID. + */ +void +ipc_rmsvc(ipc_service_t *service, kipc_perm_t *perm) +{ +	ASSERT(service->ipcs_count > 0); +	ASSERT(MUTEX_HELD(&service->ipcs_lock)); + +	ipc_remove(service, perm); +	mutex_exit(&service->ipcs_lock); + +	/* perform any per-service removal actions */ +	service->ipcs_rmid(perm); + +	ipc_rele(service, perm); +}  /*   * Common code to perform an IPC_RMID.  Returns an errno value on @@ -1247,13 +1265,7 @@ ipc_rmid(ipc_service_t *service, int id, cred_t *cr)  	/*  	 * Nothing can fail from this point on.  	 */ -	ipc_remove(service, perm); -	mutex_exit(&service->ipcs_lock); - -	/* perform any per-service removal actions */ -	service->ipcs_rmid(perm); - -	ipc_rele(service, perm); +	ipc_rmsvc(service, perm);  	return (0);  } diff --git a/usr/src/uts/common/os/kmem.c b/usr/src/uts/common/os/kmem.c index ccdbc59f21..e2d7fe988d 100644 --- a/usr/src/uts/common/os/kmem.c +++ b/usr/src/uts/common/os/kmem.c @@ -20,6 +20,7 @@   */  /*   * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015 Joyent, Inc.  All rights reserved.   * Copyright (c) 2012, 2016 by Delphix. All rights reserved.   * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.   */ @@ -159,10 +160,22 @@   *               find known objects and is about to free it, or   *            c) the client has freed the object.   *            In all these cases (a, b, and c) kmem frees the new object (the - *            unused copy destination) and searches for the old object in the - *            magazine layer. If found, the object is removed from the magazine - *            layer and freed to the slab layer so it will no longer hold the - *            slab hostage. + *            unused copy destination).  In the first case, the object is in + *            use and the correct action is that for LATER; in the latter two + *            cases, we know that the object is either freed or about to be + *            freed, in which case it is either already in a magazine or about + *            to be in one.  In these cases, we know that the object will either + *            be reallocated and reused, or it will end up in a full magazine + *            that will be reaped (thereby liberating the slab).  
Because it + *            is prohibitively expensive to differentiate these cases, and + *            because the defrag code is executed when we're low on memory + *            (thereby biasing the system to reclaim full magazines) we treat + *            all DONT_KNOW cases as LATER and rely on cache reaping to + *            generally clean up full magazines.  While we take the same action + *            for these cases, we maintain their semantic distinction:  if + *            defragmentation is not occurring, it is useful to know if this + *            is due to objects in use (LATER) or objects in an unknown state + *            of transition (DONT_KNOW).   *   * 2.3 Object States   * @@ -285,10 +298,10 @@   * view of the slab layer, making it a candidate for the move callback. Most   * objects unrecognized by the client in the move callback fall into this   * category and are cheaply distinguished from known objects by the test - * described earlier. Since recognition is cheap for the client, and searching - * magazines is expensive for kmem, kmem defers searching until the client first - * returns KMEM_CBRC_DONT_KNOW. As long as the needed effort is reasonable, kmem - * elsewhere does what it can to avoid bothering the client unnecessarily. + * described earlier. Because searching magazines is prohibitively expensive + * for kmem, clients that do not mark freed objects (and therefore return + * KMEM_CBRC_DONT_KNOW for large numbers of objects) may find defragmentation + * efficacy reduced.   *   * Invalidating the designated pointer member before freeing the object marks   * the object to be avoided in the callback, and conversely, assigning a valid @@ -998,6 +1011,7 @@ size_t kmem_transaction_log_size; /* transaction log size [2% of memory] */  size_t kmem_content_log_size;	/* content log size [2% of memory] */  size_t kmem_failure_log_size;	/* failure log [4 pages per CPU] */  size_t kmem_slab_log_size;	/* slab create log [4 pages per CPU] */ +size_t kmem_zerosized_log_size;	/* zero-sized log [4 pages per CPU] */  size_t kmem_content_maxsave = 256; /* KMF_CONTENTS max bytes to log */  size_t kmem_lite_minsize = 0;	/* minimum buffer size for KMF_LITE */  size_t kmem_lite_maxalign = 1024; /* maximum buffer alignment for KMF_LITE */ @@ -1005,6 +1019,14 @@ int kmem_lite_pcs = 4;		/* number of PCs to store in KMF_LITE mode */  size_t kmem_maxverify;		/* maximum bytes to inspect in debug routines */  size_t kmem_minfirewall;	/* hardware-enforced redzone threshold */ +#ifdef DEBUG +int kmem_warn_zerosized = 1;	/* whether to warn on zero-sized KM_SLEEP */ +#else +int kmem_warn_zerosized = 0;	/* whether to warn on zero-sized KM_SLEEP */ +#endif + +int kmem_panic_zerosized = 0;	/* whether to panic on zero-sized KM_SLEEP */ +  #ifdef _LP64  size_t	kmem_max_cached = KMEM_BIG_MAXBUF;	/* maximum kmem_alloc cache */  #else @@ -1038,21 +1060,7 @@ static vmem_t		*kmem_default_arena;  static vmem_t		*kmem_firewall_va_arena;  static vmem_t		*kmem_firewall_arena; -/* - * Define KMEM_STATS to turn on statistic gathering. By default, it is only - * turned on when DEBUG is also defined. 
- */ -#ifdef	DEBUG -#define	KMEM_STATS -#endif	/* DEBUG */ - -#ifdef	KMEM_STATS -#define	KMEM_STAT_ADD(stat)			((stat)++) -#define	KMEM_STAT_COND_ADD(cond, stat)		((void) (!(cond) || (stat)++)) -#else -#define	KMEM_STAT_ADD(stat)			/* nothing */ -#define	KMEM_STAT_COND_ADD(cond, stat)		/* nothing */ -#endif	/* KMEM_STATS */ +static int		kmem_zerosized;		/* # of zero-sized allocs */  /*   * kmem slab consolidator thresholds (tunables) @@ -1071,47 +1079,6 @@ size_t kmem_reclaim_max_slabs = 1;   */  size_t kmem_reclaim_scan_range = 12; -#ifdef	KMEM_STATS -static struct { -	uint64_t kms_callbacks; -	uint64_t kms_yes; -	uint64_t kms_no; -	uint64_t kms_later; -	uint64_t kms_dont_need; -	uint64_t kms_dont_know; -	uint64_t kms_hunt_found_mag; -	uint64_t kms_hunt_found_slab; -	uint64_t kms_hunt_alloc_fail; -	uint64_t kms_hunt_lucky; -	uint64_t kms_notify; -	uint64_t kms_notify_callbacks; -	uint64_t kms_disbelief; -	uint64_t kms_already_pending; -	uint64_t kms_callback_alloc_fail; -	uint64_t kms_callback_taskq_fail; -	uint64_t kms_endscan_slab_dead; -	uint64_t kms_endscan_slab_destroyed; -	uint64_t kms_endscan_nomem; -	uint64_t kms_endscan_refcnt_changed; -	uint64_t kms_endscan_nomove_changed; -	uint64_t kms_endscan_freelist; -	uint64_t kms_avl_update; -	uint64_t kms_avl_noupdate; -	uint64_t kms_no_longer_reclaimable; -	uint64_t kms_notify_no_longer_reclaimable; -	uint64_t kms_notify_slab_dead; -	uint64_t kms_notify_slab_destroyed; -	uint64_t kms_alloc_fail; -	uint64_t kms_constructor_fail; -	uint64_t kms_dead_slabs_freed; -	uint64_t kms_defrags; -	uint64_t kms_scans; -	uint64_t kms_scan_depot_ws_reaps; -	uint64_t kms_debug_reaps; -	uint64_t kms_debug_scans; -} kmem_move_stats; -#endif	/* KMEM_STATS */ -  /* consolidator knobs */  static boolean_t kmem_move_noreap;  static boolean_t kmem_move_blocked; @@ -1142,6 +1109,7 @@ kmem_log_header_t	*kmem_transaction_log;  kmem_log_header_t	*kmem_content_log;  kmem_log_header_t	*kmem_failure_log;  kmem_log_header_t	*kmem_slab_log; +kmem_log_header_t	*kmem_zerosized_log;  static int		kmem_lite_count; /* # of PCs in kmem_buftag_lite_t */ @@ -1922,15 +1890,7 @@ kmem_slab_free(kmem_cache_t *cp, void *buf)  		cp->cache_complete_slab_count--;  		avl_add(&cp->cache_partial_slabs, sp);  	} else { -#ifdef	DEBUG -		if (avl_update_gt(&cp->cache_partial_slabs, sp)) { -			KMEM_STAT_ADD(kmem_move_stats.kms_avl_update); -		} else { -			KMEM_STAT_ADD(kmem_move_stats.kms_avl_noupdate); -		} -#else  		(void) avl_update_gt(&cp->cache_partial_slabs, sp); -#endif  	}  	ASSERT((cp->cache_slab_create - cp->cache_slab_destroy) == @@ -2964,8 +2924,33 @@ kmem_alloc(size_t size, int kmflag)  		/* fall through to kmem_cache_alloc() */  	} else { -		if (size == 0) +		if (size == 0) { +			if (kmflag != KM_SLEEP && !(kmflag & KM_PANIC)) +				return (NULL); + +			/* +			 * If this is a sleeping allocation or one that has +			 * been specified to panic on allocation failure, we +			 * consider it to be deprecated behavior to allocate +			 * 0 bytes.  If we have been configured to panic under +			 * this condition, we panic; if to warn, we warn -- and +			 * regardless, we log to the kmem_zerosized_log that +			 * this condition has occurred (which gives us +			 * enough information to be able to debug it).
+			 */ +			if (kmem_panic && kmem_panic_zerosized) +				panic("attempted to kmem_alloc() size of 0"); + +			if (kmem_warn_zerosized) { +				cmn_err(CE_WARN, "kmem_alloc(): sleeping " +				    "allocation with size of 0; " +				    "see kmem_zerosized_log for details"); +			} + +			kmem_log_event(kmem_zerosized_log, NULL, NULL, NULL); +  			return (NULL); +		}  		buf = vmem_alloc(kmem_oversize_arena, size,  		    kmflag & KM_VMFLAGS); @@ -3579,7 +3564,7 @@ kmem_cache_kstat_update(kstat_t *ksp, int rw)  		kmcp->kmc_move_later.value.ui64		= kd->kmd_later;  		kmcp->kmc_move_dont_need.value.ui64	= kd->kmd_dont_need;  		kmcp->kmc_move_dont_know.value.ui64	= kd->kmd_dont_know; -		kmcp->kmc_move_hunt_found.value.ui64	= kd->kmd_hunt_found; +		kmcp->kmc_move_hunt_found.value.ui64	= 0;  		kmcp->kmc_move_slabs_freed.value.ui64	= kd->kmd_slabs_freed;  		kmcp->kmc_defrag.value.ui64		= kd->kmd_defrags;  		kmcp->kmc_scan.value.ui64		= kd->kmd_scans; @@ -4150,7 +4135,8 @@ kmem_cache_destroy(kmem_cache_t *cp)  	if (kmem_taskq != NULL)  		taskq_wait(kmem_taskq); -	if (kmem_move_taskq != NULL) + +	if (kmem_move_taskq != NULL && cp->cache_defrag != NULL)  		taskq_wait(kmem_move_taskq);  	kmem_cache_magazine_purge(cp); @@ -4488,8 +4474,8 @@ kmem_init(void)  	}  	kmem_failure_log = kmem_log_init(kmem_failure_log_size); -  	kmem_slab_log = kmem_log_init(kmem_slab_log_size); +	kmem_zerosized_log = kmem_log_init(kmem_zerosized_log_size);  	/*  	 * Initialize STREAMS message caches so allocb() is available. @@ -4677,94 +4663,6 @@ kmem_slab_is_reclaimable(kmem_cache_t *cp, kmem_slab_t *sp, int flags)  	    (sp->slab_chunks * cp->cache_defrag->kmd_reclaim_numer));  } -static void * -kmem_hunt_mag(kmem_cache_t *cp, kmem_magazine_t *m, int n, void *buf, -    void *tbuf) -{ -	int i;		/* magazine round index */ - -	for (i = 0; i < n; i++) { -		if (buf == m->mag_round[i]) { -			if (cp->cache_flags & KMF_BUFTAG) { -				(void) kmem_cache_free_debug(cp, tbuf, -				    caller()); -			} -			m->mag_round[i] = tbuf; -			return (buf); -		} -	} - -	return (NULL); -} - -/* - * Hunt the magazine layer for the given buffer. If found, the buffer is - * removed from the magazine layer and returned, otherwise NULL is returned. - * The state of the returned buffer is freed and constructed. - */ -static void * -kmem_hunt_mags(kmem_cache_t *cp, void *buf) -{ -	kmem_cpu_cache_t *ccp; -	kmem_magazine_t	*m; -	int cpu_seqid; -	int n;		/* magazine rounds */ -	void *tbuf;	/* temporary swap buffer */ - -	ASSERT(MUTEX_NOT_HELD(&cp->cache_lock)); - -	/* -	 * Allocated a buffer to swap with the one we hope to pull out of a -	 * magazine when found. -	 */ -	tbuf = kmem_cache_alloc(cp, KM_NOSLEEP); -	if (tbuf == NULL) { -		KMEM_STAT_ADD(kmem_move_stats.kms_hunt_alloc_fail); -		return (NULL); -	} -	if (tbuf == buf) { -		KMEM_STAT_ADD(kmem_move_stats.kms_hunt_lucky); -		if (cp->cache_flags & KMF_BUFTAG) { -			(void) kmem_cache_free_debug(cp, buf, caller()); -		} -		return (buf); -	} - -	/* Hunt the depot. */ -	mutex_enter(&cp->cache_depot_lock); -	n = cp->cache_magtype->mt_magsize; -	for (m = cp->cache_full.ml_list; m != NULL; m = m->mag_next) { -		if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) { -			mutex_exit(&cp->cache_depot_lock); -			return (buf); -		} -	} -	mutex_exit(&cp->cache_depot_lock); - -	/* Hunt the per-CPU magazines. 
*/ -	for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) { -		ccp = &cp->cache_cpu[cpu_seqid]; - -		mutex_enter(&ccp->cc_lock); -		m = ccp->cc_loaded; -		n = ccp->cc_rounds; -		if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) { -			mutex_exit(&ccp->cc_lock); -			return (buf); -		} -		m = ccp->cc_ploaded; -		n = ccp->cc_prounds; -		if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) { -			mutex_exit(&ccp->cc_lock); -			return (buf); -		} -		mutex_exit(&ccp->cc_lock); -	} - -	kmem_cache_free(cp, tbuf); -	return (NULL); -} -  /*   * May be called from the kmem_move_taskq, from kmem_cache_move_notify_task(),   * or when the buffer is freed. @@ -4828,7 +4726,7 @@ static void kmem_move_end(kmem_cache_t *, kmem_move_t *);   * NO		kmem frees the new buffer, marks the slab of the old buffer   *              non-reclaimable to avoid bothering the client again   * LATER	kmem frees the new buffer, increments slab_later_count - * DONT_KNOW	kmem frees the new buffer, searches mags for the old buffer + * DONT_KNOW	kmem frees the new buffer   * DONT_NEED	kmem frees both the old buffer and the new buffer   *   * The pending callback argument now being processed contains both of the @@ -4862,19 +4760,14 @@ kmem_move_buffer(kmem_move_t *callback)  	 * another buffer on the same slab.  	 */  	if (!kmem_slab_is_reclaimable(cp, sp, callback->kmm_flags)) { -		KMEM_STAT_ADD(kmem_move_stats.kms_no_longer_reclaimable); -		KMEM_STAT_COND_ADD((callback->kmm_flags & KMM_NOTIFY), -		    kmem_move_stats.kms_notify_no_longer_reclaimable);  		kmem_slab_free(cp, callback->kmm_to_buf);  		kmem_move_end(cp, callback);  		return;  	}  	/* -	 * Hunting magazines is expensive, so we'll wait to do that until the -	 * client responds KMEM_CBRC_DONT_KNOW. However, checking the slab layer -	 * is cheap, so we might as well do that here in case we can avoid -	 * bothering the client. +	 * Checking the slab layer is easy, so we might as well do that here +	 * in case we can avoid bothering the client.  	 
*/  	mutex_enter(&cp->cache_lock);  	free_on_slab = (kmem_slab_allocated(cp, sp, @@ -4882,7 +4775,6 @@ kmem_move_buffer(kmem_move_t *callback)  	mutex_exit(&cp->cache_lock);  	if (free_on_slab) { -		KMEM_STAT_ADD(kmem_move_stats.kms_hunt_found_slab);  		kmem_slab_free(cp, callback->kmm_to_buf);  		kmem_move_end(cp, callback);  		return; @@ -4894,7 +4786,6 @@ kmem_move_buffer(kmem_move_t *callback)  		 */  		if (kmem_cache_alloc_debug(cp, callback->kmm_to_buf,  		    KM_NOSLEEP, 1, caller()) != 0) { -			KMEM_STAT_ADD(kmem_move_stats.kms_alloc_fail);  			kmem_move_end(cp, callback);  			return;  		} @@ -4902,15 +4793,11 @@ kmem_move_buffer(kmem_move_t *callback)  	    cp->cache_constructor(callback->kmm_to_buf, cp->cache_private,  	    KM_NOSLEEP) != 0) {  		atomic_inc_64(&cp->cache_alloc_fail); -		KMEM_STAT_ADD(kmem_move_stats.kms_constructor_fail);  		kmem_slab_free(cp, callback->kmm_to_buf);  		kmem_move_end(cp, callback);  		return;  	} -	KMEM_STAT_ADD(kmem_move_stats.kms_callbacks); -	KMEM_STAT_COND_ADD((callback->kmm_flags & KMM_NOTIFY), -	    kmem_move_stats.kms_notify_callbacks);  	cp->cache_defrag->kmd_callbacks++;  	cp->cache_defrag->kmd_thread = curthread;  	cp->cache_defrag->kmd_from_buf = callback->kmm_from_buf; @@ -4928,7 +4815,6 @@ kmem_move_buffer(kmem_move_t *callback)  	cp->cache_defrag->kmd_to_buf = NULL;  	if (response == KMEM_CBRC_YES) { -		KMEM_STAT_ADD(kmem_move_stats.kms_yes);  		cp->cache_defrag->kmd_yes++;  		kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE);  		/* slab safe to access until kmem_move_end() */ @@ -4943,14 +4829,12 @@ kmem_move_buffer(kmem_move_t *callback)  	switch (response) {  	case KMEM_CBRC_NO: -		KMEM_STAT_ADD(kmem_move_stats.kms_no);  		cp->cache_defrag->kmd_no++;  		mutex_enter(&cp->cache_lock);  		kmem_slab_move_no(cp, sp, callback->kmm_from_buf);  		mutex_exit(&cp->cache_lock);  		break;  	case KMEM_CBRC_LATER: -		KMEM_STAT_ADD(kmem_move_stats.kms_later);  		cp->cache_defrag->kmd_later++;  		mutex_enter(&cp->cache_lock);  		if (!KMEM_SLAB_IS_PARTIAL(sp)) { @@ -4959,7 +4843,6 @@ kmem_move_buffer(kmem_move_t *callback)  		}  		if (++sp->slab_later_count >= KMEM_DISBELIEF) { -			KMEM_STAT_ADD(kmem_move_stats.kms_disbelief);  			kmem_slab_move_no(cp, sp, callback->kmm_from_buf);  		} else if (!(sp->slab_flags & KMEM_SLAB_NOMOVE)) {  			sp->slab_stuck_offset = KMEM_SLAB_OFFSET(sp, @@ -4968,7 +4851,6 @@ kmem_move_buffer(kmem_move_t *callback)  		mutex_exit(&cp->cache_lock);  		break;  	case KMEM_CBRC_DONT_NEED: -		KMEM_STAT_ADD(kmem_move_stats.kms_dont_need);  		cp->cache_defrag->kmd_dont_need++;  		kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE);  		if (sp->slab_refcnt == 0) @@ -4978,19 +4860,21 @@ kmem_move_buffer(kmem_move_t *callback)  		mutex_exit(&cp->cache_lock);  		break;  	case KMEM_CBRC_DONT_KNOW: -		KMEM_STAT_ADD(kmem_move_stats.kms_dont_know); +		/* +		 * If we don't know if we can move this buffer or not, we'll +		 * just assume that we can't:  if the buffer is in fact free, +		 * then it is sitting in one of the per-CPU magazines or in +		 * a full magazine in the depot layer.  Either way, because +		 * defrag is induced in the same logic that reaps a cache, +		 * it's likely that full magazines will be returned to the +		 * system soon (thereby accomplishing what we're trying to +		 * accomplish here: return those magazines to their slabs). +		 * Given this, any work that we might do now to locate a buffer +		 * in a magazine is wasted (and expensive!) 
work; we bump +		 * a counter in this case and otherwise assume that we can't +		 * move it. +		 */  		cp->cache_defrag->kmd_dont_know++; -		if (kmem_hunt_mags(cp, callback->kmm_from_buf) != NULL) { -			KMEM_STAT_ADD(kmem_move_stats.kms_hunt_found_mag); -			cp->cache_defrag->kmd_hunt_found++; -			kmem_slab_free_constructed(cp, callback->kmm_from_buf, -			    B_TRUE); -			if (sp->slab_refcnt == 0) -				cp->cache_defrag->kmd_slabs_freed++; -			mutex_enter(&cp->cache_lock); -			kmem_slab_move_yes(cp, sp, callback->kmm_from_buf); -			mutex_exit(&cp->cache_lock); -		}  		break;  	default:  		panic("'%s' (%p) unexpected move callback response %d\n", @@ -5015,10 +4899,9 @@ kmem_move_begin(kmem_cache_t *cp, kmem_slab_t *sp, void *buf, int flags)  	ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING);  	callback = kmem_cache_alloc(kmem_move_cache, KM_NOSLEEP); -	if (callback == NULL) { -		KMEM_STAT_ADD(kmem_move_stats.kms_callback_alloc_fail); + +	if (callback == NULL)  		return (B_FALSE); -	}  	callback->kmm_from_slab = sp;  	callback->kmm_from_buf = buf; @@ -5043,7 +4926,6 @@ kmem_move_begin(kmem_cache_t *cp, kmem_slab_t *sp, void *buf, int flags)  			pending->kmm_flags |= KMM_DESPERATE;  		}  		mutex_exit(&cp->cache_lock); -		KMEM_STAT_ADD(kmem_move_stats.kms_already_pending);  		kmem_cache_free(kmem_move_cache, callback);  		return (B_TRUE);  	} @@ -5057,7 +4939,6 @@ kmem_move_begin(kmem_cache_t *cp, kmem_slab_t *sp, void *buf, int flags)  	if (!taskq_dispatch(kmem_move_taskq, (task_func_t *)kmem_move_buffer,  	    callback, TQ_NOSLEEP)) { -		KMEM_STAT_ADD(kmem_move_stats.kms_callback_taskq_fail);  		mutex_enter(&cp->cache_lock);  		avl_remove(&cp->cache_defrag->kmd_moves_pending, callback);  		mutex_exit(&cp->cache_lock); @@ -5103,7 +4984,6 @@ kmem_move_end(kmem_cache_t *cp, kmem_move_t *callback)  			cp->cache_slab_destroy++;  			mutex_exit(&cp->cache_lock);  			kmem_slab_destroy(cp, sp); -			KMEM_STAT_ADD(kmem_move_stats.kms_dead_slabs_freed);  			mutex_enter(&cp->cache_lock);  		}  	} @@ -5248,8 +5128,6 @@ kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs,  					 * pending move completes.  					 */  					list_insert_head(deadlist, sp); -					KMEM_STAT_ADD(kmem_move_stats. -					    kms_endscan_slab_dead);  					return (-1);  				} @@ -5264,10 +5142,6 @@ kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs,  				cp->cache_slab_destroy++;  				mutex_exit(&cp->cache_lock);  				kmem_slab_destroy(cp, sp); -				KMEM_STAT_ADD(kmem_move_stats. -				    kms_dead_slabs_freed); -				KMEM_STAT_ADD(kmem_move_stats. -				    kms_endscan_slab_destroyed);  				mutex_enter(&cp->cache_lock);  				/*  				 * Since we can't pick up the scan where we left @@ -5283,8 +5157,6 @@ kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs,  				 * for the request and say nothing about the  				 * number of reclaimable slabs.  				 */ -				KMEM_STAT_COND_ADD(s < max_slabs, -				    kmem_move_stats.kms_endscan_nomem);  				return (-1);  			} @@ -5300,16 +5172,10 @@ kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs,  				 * destination buffer on the same slab. In that  				 * case, we're not interested in counting it.  				 
*/ -				KMEM_STAT_COND_ADD(!(flags & KMM_DEBUG) && -				    (s < max_slabs), -				    kmem_move_stats.kms_endscan_refcnt_changed);  				return (-1);  			} -			if ((sp->slab_flags & KMEM_SLAB_NOMOVE) != nomove) { -				KMEM_STAT_COND_ADD(s < max_slabs, -				    kmem_move_stats.kms_endscan_nomove_changed); +			if ((sp->slab_flags & KMEM_SLAB_NOMOVE) != nomove)  				return (-1); -			}  			/*  			 * Generating a move request allocates a destination @@ -5336,11 +5202,6 @@ kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs,  	}  end_scan: -	KMEM_STAT_COND_ADD(!(flags & KMM_DEBUG) && -	    (s < max_slabs) && -	    (sp == avl_first(&cp->cache_partial_slabs)), -	    kmem_move_stats.kms_endscan_freelist); -  	return (s);  } @@ -5400,8 +5261,6 @@ kmem_cache_move_notify_task(void *arg)  			    &cp->cache_defrag->kmd_moves_pending)) {  				list_insert_head(deadlist, sp);  				mutex_exit(&cp->cache_lock); -				KMEM_STAT_ADD(kmem_move_stats. -				    kms_notify_slab_dead);  				return;  			} @@ -5409,9 +5268,6 @@ kmem_cache_move_notify_task(void *arg)  			cp->cache_slab_destroy++;  			mutex_exit(&cp->cache_lock);  			kmem_slab_destroy(cp, sp); -			KMEM_STAT_ADD(kmem_move_stats.kms_dead_slabs_freed); -			KMEM_STAT_ADD(kmem_move_stats. -			    kms_notify_slab_destroyed);  			return;  		}  	} else { @@ -5425,7 +5281,6 @@ kmem_cache_move_notify(kmem_cache_t *cp, void *buf)  {  	kmem_move_notify_args_t *args; -	KMEM_STAT_ADD(kmem_move_stats.kms_notify);  	args = kmem_alloc(sizeof (kmem_move_notify_args_t), KM_NOSLEEP);  	if (args != NULL) {  		args->kmna_cache = cp; @@ -5448,7 +5303,6 @@ kmem_cache_defrag(kmem_cache_t *cp)  	n = avl_numnodes(&cp->cache_partial_slabs);  	if (n > 1) {  		/* kmem_move_buffers() drops and reacquires cache_lock */ -		KMEM_STAT_ADD(kmem_move_stats.kms_defrags);  		cp->cache_defrag->kmd_defrags++;  		(void) kmem_move_buffers(cp, n, 0, KMM_DESPERATE);  	} @@ -5547,7 +5401,6 @@ kmem_cache_scan(kmem_cache_t *cp)  		 *  		 * kmem_move_buffers() drops and reacquires cache_lock.  		 */ -		KMEM_STAT_ADD(kmem_move_stats.kms_scans);  		kmd->kmd_scans++;  		slabs_found = kmem_move_buffers(cp, kmem_reclaim_scan_range,  		    kmem_reclaim_max_slabs, 0); @@ -5588,12 +5441,9 @@ kmem_cache_scan(kmem_cache_t *cp)  			if (!kmem_move_noreap &&  			    ((debug_rand % kmem_mtb_reap) == 0)) {  				mutex_exit(&cp->cache_lock); -				KMEM_STAT_ADD(kmem_move_stats.kms_debug_reaps);  				kmem_cache_reap(cp);  				return;  			} else if ((debug_rand % kmem_mtb_move) == 0) { -				KMEM_STAT_ADD(kmem_move_stats.kms_scans); -				KMEM_STAT_ADD(kmem_move_stats.kms_debug_scans);  				kmd->kmd_scans++;  				(void) kmem_move_buffers(cp,  				    kmem_reclaim_scan_range, 1, KMM_DEBUG); @@ -5604,8 +5454,6 @@ kmem_cache_scan(kmem_cache_t *cp)  	mutex_exit(&cp->cache_lock); -	if (reap) { -		KMEM_STAT_ADD(kmem_move_stats.kms_scan_depot_ws_reaps); +	if (reap)  		kmem_depot_ws_reap(cp); -	}  } diff --git a/usr/src/uts/common/os/logsubr.c b/usr/src/uts/common/os/logsubr.c index 149f5f8a88..cbc4fa0000 100644 --- a/usr/src/uts/common/os/logsubr.c +++ b/usr/src/uts/common/os/logsubr.c @@ -22,6 +22,7 @@  /*   * Copyright (c) 2013 Gary Mills   * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2016, Joyent, Inc.   */  #include <sys/types.h> @@ -249,8 +250,7 @@ log_init(void)  	 */  	printf("\rSunOS Release %s Version %s %u-bit\n",  	    utsname.release, utsname.version, NBBY * (uint_t)sizeof (void *)); -	printf("Copyright (c) 1983, 2010, Oracle and/or its affiliates. 
" -	    "All rights reserved.\n"); +	printf("Copyright (c) 2010-2016, Joyent Inc. All rights reserved.\n");  #ifdef DEBUG  	printf("DEBUG enabled\n");  #endif diff --git a/usr/src/uts/common/os/lwp.c b/usr/src/uts/common/os/lwp.c index feb8e76c42..cde81f511a 100644 --- a/usr/src/uts/common/os/lwp.c +++ b/usr/src/uts/common/os/lwp.c @@ -25,7 +25,7 @@   */  /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2017, Joyent, Inc.   */  #include <sys/param.h> @@ -57,6 +57,8 @@  #include <sys/lgrp.h>  #include <sys/rctl.h>  #include <sys/contract_impl.h> +#include <sys/contract/process.h> +#include <sys/contract/process_impl.h>  #include <sys/cpc_impl.h>  #include <sys/sdt.h>  #include <sys/cmn_err.h> @@ -115,7 +117,7 @@ lwp_create(void (*proc)(), caddr_t arg, size_t len, proc_t *p,  	ret_tidhash_t *ret_tidhash = NULL;  	int i;  	int rctlfail = 0; -	boolean_t branded = 0; +	void *brand_data = NULL;  	struct ctxop *ctx = NULL;  	ASSERT(cid != sysdccid);	/* system threads must start in SYS */ @@ -283,6 +285,19 @@ lwp_create(void (*proc)(), caddr_t arg, size_t len, proc_t *p,  	 */  	lep = kmem_zalloc(sizeof (*lep), KM_SLEEP); +	/* +	 * If necessary, speculatively allocate lwp brand data.  This is done +	 * ahead of time so p_lock need not be dropped during lwp branding. +	 */ +	if (PROC_IS_BRANDED(p) && BROP(p)->b_lwpdata_alloc != NULL) { +		if ((brand_data = BROP(p)->b_lwpdata_alloc(p)) == NULL) { +			mutex_enter(&p->p_lock); +			err = 1; +			atomic_inc_32(&p->p_zone->zone_ffmisc); +			goto error; +		} +	} +  	mutex_enter(&p->p_lock);  grow:  	/* @@ -630,18 +645,6 @@ grow:  		} while (lwp_hash_lookup(p, t->t_tid) != NULL);  	} -	/* -	 * If this is a branded process, let the brand do any necessary lwp -	 * initialization. -	 */ -	if (PROC_IS_BRANDED(p)) { -		if (BROP(p)->b_initlwp(lwp)) { -			err = 1; -			atomic_inc_32(&p->p_zone->zone_ffmisc); -			goto error; -		} -		branded = 1; -	}  	if (t->t_tid == 1) {  		kpreempt_disable(); @@ -654,7 +657,6 @@ grow:  		}  	} -	p->p_lwpcnt++;  	t->t_waitfor = -1;  	/* @@ -696,8 +698,27 @@ grow:  	t->t_post_sys = 1;  	/* +	 * Perform lwp branding +	 * +	 * The b_initlwp hook is _not_ allowed to drop p->p_lock as it must be +	 * continuously held between when the tidhash is sized and when the lwp +	 * is inserted into it.  Operations requiring p->p_lock to be +	 * temporarily dropped can be performed in b_initlwp_post. +	 */ +	if (PROC_IS_BRANDED(p)) { +		BROP(p)->b_initlwp(lwp, brand_data); +		/* +		 * The b_initlwp hook is expected to consume any preallocated +		 * brand_data in a way that prepares it for deallocation by the +		 * b_freelwp hook. +		 */ +		brand_data = NULL; +	} + +	/*  	 * Insert the new thread into the list of all threads.  	 */ +	p->p_lwpcnt++;  	if ((tx = p->p_tlist) == NULL) {  		t->t_back = t;  		t->t_forw = t; @@ -718,6 +739,13 @@ grow:  	lep->le_start = t->t_start;  	lwp_hash_in(p, lep, p->p_tidhash, p->p_tidhash_sz, 1); +	/* +	 * Complete lwp branding +	 */ +	if (PROC_IS_BRANDED(p) && BROP(p)->b_initlwp_post != NULL) { +		BROP(p)->b_initlwp_post(lwp); +	} +  	if (state == TS_RUN) {  		/*  		 * We set the new lwp running immediately. 
@@ -753,8 +781,9 @@ error:  		if (cid != NOCLASS && bufp != NULL)  			CL_FREE(cid, bufp); -		if (branded) -			BROP(p)->b_freelwp(lwp); +		if (brand_data != NULL) { +			BROP(p)->b_lwpdata_free(brand_data); +		}  		mutex_exit(&p->p_lock);  		t->t_state = TS_FREE; @@ -827,8 +856,27 @@ lwp_ctmpl_copy(klwp_t *dst, klwp_t *src)  	int i;  	for (i = 0; i < ct_ntypes; i++) { -		dst->lwp_ct_active[i] = ctmpl_dup(src->lwp_ct_active[i]); +		ct_template_t *tmpl = src->lwp_ct_active[i]; + +		/* +		 * If the process contract template is setup to be preserved +		 * across exec, then if we're forking, perform an implicit +		 * template_clear now. This ensures that future children of +		 * this child will remain in the same contract unless they're +		 * explicitly setup differently. We know we're forking if the +		 * two LWPs belong to different processes. +		 */ +		if (i == CTT_PROCESS && tmpl != NULL) { +			ctmpl_process_t *ctp = tmpl->ctmpl_data; + +			if (dst->lwp_procp != src->lwp_procp && +			    (ctp->ctp_params & CT_PR_KEEP_EXEC) != 0) +				tmpl = NULL; +		} + +		dst->lwp_ct_active[i] = ctmpl_dup(tmpl);  		dst->lwp_ct_latest[i] = NULL; +  	}  } @@ -836,21 +884,33 @@ lwp_ctmpl_copy(klwp_t *dst, klwp_t *src)   * Clear an LWP's contract template state.   */  void -lwp_ctmpl_clear(klwp_t *lwp) +lwp_ctmpl_clear(klwp_t *lwp, boolean_t is_exec)  {  	ct_template_t *tmpl;  	int i;  	for (i = 0; i < ct_ntypes; i++) { -		if ((tmpl = lwp->lwp_ct_active[i]) != NULL) { -			ctmpl_free(tmpl); -			lwp->lwp_ct_active[i] = NULL; -		} -  		if (lwp->lwp_ct_latest[i] != NULL) {  			contract_rele(lwp->lwp_ct_latest[i]);  			lwp->lwp_ct_latest[i] = NULL;  		} + +		if ((tmpl = lwp->lwp_ct_active[i]) != NULL) { +			/* +			 * If we're exec-ing a new program and the process +			 * contract template is setup to be preserved across +			 * exec, then don't clear it. +			 */ +			if (is_exec && i == CTT_PROCESS) { +				ctmpl_process_t *ctp = tmpl->ctmpl_data; + +				if ((ctp->ctp_params & CT_PR_KEEP_EXEC) != 0) +					continue; +			} + +			ctmpl_free(tmpl); +			lwp->lwp_ct_active[i] = NULL; +		}  	}  } @@ -891,13 +951,6 @@ lwp_exit(void)  	if (t->t_upimutex != NULL)  		upimutex_cleanup(); -	/* -	 * Perform any brand specific exit processing, then release any -	 * brand data associated with the lwp -	 */ -	if (PROC_IS_BRANDED(p)) -		BROP(p)->b_lwpexit(lwp); -  	lwp_pcb_exit();  	mutex_enter(&p->p_lock); @@ -941,6 +994,18 @@ lwp_exit(void)  	DTRACE_PROC(lwp__exit);  	/* +	 * Perform any brand specific exit processing, then release any +	 * brand data associated with the lwp +	 */ +	if (PROC_IS_BRANDED(p)) { +		mutex_exit(&p->p_lock); +		BROP(p)->b_lwpexit(lwp); +		BROP(p)->b_freelwp(lwp); +		mutex_enter(&p->p_lock); +		prbarrier(p); +	} + +	/*  	 * If the lwp is a detached lwp or if the process is exiting,  	 * remove (lwp_hash_out()) the lwp from the lwp directory.  	 * Otherwise null out the lwp's le_thread pointer in the lwp @@ -1101,7 +1166,7 @@ lwp_cleanup(void)  	}  	kpreempt_enable(); -	lwp_ctmpl_clear(ttolwp(t)); +	lwp_ctmpl_clear(ttolwp(t), B_FALSE);  }  int diff --git a/usr/src/uts/common/os/main.c b/usr/src/uts/common/os/main.c index 7afc1cfe00..dda0b3e4a6 100644 --- a/usr/src/uts/common/os/main.c +++ b/usr/src/uts/common/os/main.c @@ -27,7 +27,7 @@  /*	  All Rights Reserved  	*/  /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc.   
 */  #include <sys/types.h> @@ -157,7 +157,7 @@ exec_init(const char *initpath, const char *args)  	int error = 0, count = 0;  	proc_t *p = ttoproc(curthread);  	klwp_t *lwp = ttolwp(curthread); -	int brand_action; +	int brand_action = EBA_NONE;  	if (args == NULL)  		args = ""; @@ -268,7 +268,15 @@ exec_init(const char *initpath, const char *args)  	 */  	sigemptyset(&curthread->t_hold); -	brand_action = ZONE_IS_BRANDED(p->p_zone) ? EBA_BRAND : EBA_NONE; +	/* +	 * Only instruct exec_common to brand the process if necessary.  It is +	 * possible that the init process is already properly branded due to the +	 * proc_exit -> restart_init -> exec_init call chain. +	 */ +	if (ZONE_IS_BRANDED(p->p_zone) && +	    p->p_brand != p->p_zone->zone_brand) { +		brand_action = EBA_BRAND; +	}  again:  	error = exec_common((const char *)(uintptr_t)exec_fnamep,  	    (const char **)(uintptr_t)uap, NULL, brand_action); diff --git a/usr/src/uts/common/os/mmapobj.c b/usr/src/uts/common/os/mmapobj.c index 142c10754e..0410e6f47b 100644 --- a/usr/src/uts/common/os/mmapobj.c +++ b/usr/src/uts/common/os/mmapobj.c @@ -1381,10 +1381,15 @@ calc_loadable(Ehdr *ehdrp, caddr_t phdrbase, int nphdrs, size_t *len,  			}  			if (num_segs++ == 0) {  				/* -				 * The p_vaddr of the first PT_LOAD segment -				 * must either be NULL or within the first -				 * page in order to be interpreted. -				 * Otherwise, its an invalid file. +				 * While ELF doesn't specify the meaning of +				 * p_vaddr for PT_LOAD segments in ET_DYN +				 * objects, we mandate that it is either NULL or +				 * (to accommodate some historical binaries) +				 * within the first page.  (Note that there +				 * exist non-native ET_DYN objects that violate +				 * this constraint that we nonetheless must be +				 * able to execute; see the ET_DYN handling in +				 * mapelfexec() for details.)  				 */  				if (e_type == ET_DYN &&  				    ((caddr_t)((uintptr_t)vaddr & diff --git a/usr/src/uts/common/os/pid.c b/usr/src/uts/common/os/pid.c index b555bb82b7..eba6147fab 100644 --- a/usr/src/uts/common/os/pid.c +++ b/usr/src/uts/common/os/pid.c @@ -21,6 +21,7 @@  /*   * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc.   */  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/ @@ -112,6 +113,18 @@ pid_lookup(pid_t pid)  	return (pidp);  } +struct pid * +pid_find(pid_t pid) +{ +	struct pid *pidp; + +	mutex_enter(&pidlinklock); +	pidp = pid_lookup(pid); +	mutex_exit(&pidlinklock); + +	return (pidp); +} +  void  pid_setmin(void)  { @@ -522,6 +535,20 @@ sprunlock(proc_t *p)  	THREAD_KPRI_RELEASE();  } +/* + * Undo effects of sprlock but without dropping p->p_lock + */ +void +sprunprlock(proc_t *p) +{ +	ASSERT(p->p_proc_flag & P_PR_LOCK); +	ASSERT(MUTEX_HELD(&p->p_lock)); + +	cv_signal(&pr_pid_cv[p->p_slot]); +	p->p_proc_flag &= ~P_PR_LOCK; +	THREAD_KPRI_RELEASE(); +} +  void  pid_init(void)  { diff --git a/usr/src/uts/common/os/policy.c b/usr/src/uts/common/os/policy.c index d6821c83b0..8cc7f009a3 100644 --- a/usr/src/uts/common/os/policy.c +++ b/usr/src/uts/common/os/policy.c @@ -56,6 +56,7 @@  #include <sys/mntent.h>  #include <sys/contract_impl.h>  #include <sys/dld_ioc.h> +#include <sys/brand.h>  /*   * There are two possible layers of privilege routines and two possible @@ -1244,6 +1245,22 @@ secpolicy_vnode_owner(const cred_t *cr, uid_t owner)  void  secpolicy_setid_clear(vattr_t *vap, cred_t *cr)  { +	proc_t *p = curproc; + +	/* +	 * Allow the brand to override this behaviour.
+	 */ +	if (PROC_IS_BRANDED(p) && BROP(p)->b_setid_clear != NULL) { +		/* +		 * This brand hook will return 0 if handling is complete, or +		 * some other value if the brand would like us to fall back to +		 * the usual behaviour. +		 */ +		if (BROP(p)->b_setid_clear(vap, cr) == 0) { +			return; +		} +	} +  	if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0 &&  	    secpolicy_vnode_setid_retain(cr,  	    (vap->va_mode & S_ISUID) != 0 && @@ -2092,6 +2109,13 @@ secpolicy_meminfo(const cred_t *cr)  }  int +secpolicy_fs_import(const cred_t *cr) +{ +	return (PRIV_POLICY(cr, PRIV_SYS_FS_IMPORT, B_FALSE, EPERM, NULL)); +} + + +int  secpolicy_pfexec_register(const cred_t *cr)  {  	return (PRIV_POLICY(cr, PRIV_SYS_ADMIN, B_TRUE, EPERM, NULL)); @@ -2607,3 +2631,11 @@ secpolicy_ppp_config(const cred_t *cr)  		return (secpolicy_net_config(cr, B_FALSE));  	return (PRIV_POLICY(cr, PRIV_SYS_PPP_CONFIG, B_FALSE, EPERM, NULL));  } + +int +secpolicy_hyprlofs_control(const cred_t *cr) +{ +	if (PRIV_POLICY(cr, PRIV_HYPRLOFS_CONTROL, B_FALSE, EPERM, NULL)) +		return (EPERM); +	return (0); +} diff --git a/usr/src/uts/common/os/priv_defs b/usr/src/uts/common/os/priv_defs index bc1787c9ca..854fb602da 100644 --- a/usr/src/uts/common/os/priv_defs +++ b/usr/src/uts/common/os/priv_defs @@ -177,6 +177,10 @@ privilege PRIV_GRAPHICS_MAP  	Allows a process to perform privileged mappings through a  	graphics device. +privilege PRIV_HYPRLOFS_CONTROL + +	Allows a process to manage hyprlofs entries. +  privilege PRIV_IPC_DAC_READ  	Allows a process to read a System V IPC @@ -377,6 +381,10 @@ privilege PRIV_SYS_DEVICES  	Allows a process to open the real console device directly.  	Allows a process to open devices that have been exclusively opened. +privilege PRIV_SYS_FS_IMPORT + +	Allows a process to import a potentially untrusted file system. +  privilege PRIV_SYS_IPC_CONFIG  	Allows a process to increase the size of a System V IPC Message diff --git a/usr/src/uts/common/os/sched.c b/usr/src/uts/common/os/sched.c index c1d6569f11..15e77d39f7 100644 --- a/usr/src/uts/common/os/sched.c +++ b/usr/src/uts/common/os/sched.c @@ -27,6 +27,10 @@  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/  /*	  All Rights Reserved	*/ +/* + * Copyright (c) 2015, Joyent, Inc.  All rights reserved. + */ +  #include <sys/param.h>  #include <sys/types.h>  #include <sys/sysmacros.h> @@ -646,16 +650,17 @@ top:  		klwp_t *lwp = ttolwp(tp);  		/* -		 * Swapout eligible lwps (specified by the scheduling -		 * class) which don't have TS_DONT_SWAP set.  Set the -		 * "intent to swap" flag (TS_SWAPENQ) on threads -		 * which have TS_DONT_SWAP set so that they can be +		 * Swapout eligible lwps (specified by the scheduling class) +		 * which don't have TS_DONT_SWAP set.  Set the "intent to swap" +		 * flag (TS_SWAPENQ) on threads which have either TS_DONT_SWAP +		 * set or are currently on a split stack so that they can be  		 * swapped if and when they reach a safe point.  		 */  		thread_lock(tp);  		thread_pri = CL_SWAPOUT(tp, swapflags);  		if (thread_pri != -1) { -			if (tp->t_schedflag & TS_DONT_SWAP) { +			if ((tp->t_schedflag & TS_DONT_SWAP) || +			    (tp->t_flag & T_SPLITSTK)) {  				tp->t_schedflag |= TS_SWAPENQ;  				tp->t_trapret = 1;  				aston(tp); diff --git a/usr/src/uts/common/os/schedctl.c b/usr/src/uts/common/os/schedctl.c index 5721083751..18b396a765 100644 --- a/usr/src/uts/common/os/schedctl.c +++ b/usr/src/uts/common/os/schedctl.c @@ -22,6 +22,7 @@  /*   * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   
* Use is subject to license terms. + * Copyright 2016 Joyent, Inc.   */  #include <sys/types.h> @@ -327,12 +328,17 @@ schedctl_sigblock(kthread_t *t)  /* - * If the sc_sigblock field is set for the specified thread, set - * its signal mask to block all maskable signals, then clear the - * sc_sigblock field.  This finishes what user-level code requested - * to be done when it set tdp->sc_shared->sc_sigblock non-zero. - * Called from signal-related code either by the current thread for - * itself or by a thread that holds the process's p_lock (/proc code). + * If the sc_sigblock field is set for the specified thread, set its signal + * mask to block all maskable signals, then clear the sc_sigblock field.  This + * accomplishes what user-level code requested to be done when it set + * tdp->sc_shared->sc_sigblock non-zero. + * + * This is generally called by signal-related code in the current thread.  In + * order to call against a thread other than curthread, p_lock for the + * containing process must be held.  Even then, the caller is not protected + * from races with the thread in question updating its own fields.  It is the + * responsibility of the caller to perform additional synchronization. + *   */  void  schedctl_finish_sigblock(kthread_t *t) diff --git a/usr/src/uts/common/os/shm.c b/usr/src/uts/common/os/shm.c index bacc595f78..5deae96d73 100644 --- a/usr/src/uts/common/os/shm.c +++ b/usr/src/uts/common/os/shm.c @@ -21,6 +21,7 @@  /*   * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc.   */  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -319,6 +320,7 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp)  		size_t	share_size;  		struct	shm_data ssd;  		uintptr_t align_hint; +		long	curprot;  		/*  		 * Pick a share pagesize to use, if (!isspt(sp)). @@ -453,6 +455,7 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp)  			}  		} +		curprot = sp->shm_opts & SHM_PROT_MASK;  		if (!isspt(sp)) {  			error = sptcreate(size, &segspt, sp->shm_amp, prot,  			    flags, share_szc); @@ -462,8 +465,8 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp)  			}  			sp->shm_sptinfo->sptas = segspt->s_as;  			sp->shm_sptseg = segspt; -			sp->shm_sptprot = prot; -		} else if ((prot & sp->shm_sptprot) != sp->shm_sptprot) { +			sp->shm_opts = (sp->shm_opts & ~SHM_PROT_MASK) | prot; +		} else if ((prot & curprot) != curprot) {  			/*  			 * Ensure we're attaching to an ISM segment with  			 * fewer or equal permissions than what we're @@ -748,6 +751,23 @@ shmctl(int shmid, int cmd, void *arg)  		}  		break; +	/* Stage segment for removal, but don't remove until last detach */ +	case SHM_RMID: +		if ((error = secpolicy_ipc_owner(cr, (kipc_perm_t *)sp)) != 0) +			break; + +		/* +		 * If attached, just mark it as a pending remove, otherwise +		 * we must perform the normal ipc_rmid now. +		 */ +		if ((sp->shm_perm.ipc_ref - 1) > 0) { +			sp->shm_opts |= SHM_RM_PENDING; +		} else { +			mutex_exit(lock); +			return (ipc_rmid(shm_svc, shmid, cr)); +		} +		break; +  	default:  		error = EINVAL;  		break; @@ -778,6 +798,23 @@ shm_detach(proc_t *pp, segacct_t *sap)  		sp->shm_ismattch--;  	sp->shm_dtime = gethrestime_sec();  	sp->shm_lpid = pp->p_pid; +	if ((sp->shm_opts & SHM_RM_PENDING) != 0 && +	    sp->shm_perm.ipc_ref == 2) { +		/* +		 * If this is the last detach of the segment across the whole +		 * system then now we can perform the delayed IPC_RMID. 
+		 * The ipc_ref count has 1 for the original 'get' and one for +		 * each 'attach' (see 'stat' handling in shmctl). +		 */ +		sp->shm_opts &= ~SHM_RM_PENDING; +		mutex_enter(&shm_svc->ipcs_lock); +		ipc_rmsvc(shm_svc, (kipc_perm_t *)sp);	/* Drops lock */ +		ASSERT(!MUTEX_HELD(&shm_svc->ipcs_lock)); +		ASSERT(((kipc_perm_t *)sp)->ipc_ref > 0); + +		/* Lock was dropped, need to retake it for following rele. */ +		(void) ipc_lock(shm_svc, sp->shm_perm.ipc_id); +	}  	ipc_rele(shm_svc, (kipc_perm_t *)sp);	/* Drops lock */  	kmem_free(sap, sizeof (segacct_t)); diff --git a/usr/src/uts/common/os/sig.c b/usr/src/uts/common/os/sig.c index 453b1f22d4..5ef12f3ae4 100644 --- a/usr/src/uts/common/os/sig.c +++ b/usr/src/uts/common/os/sig.c @@ -22,7 +22,7 @@  /*   * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. - * Copyright (c) 2014, Joyent, Inc.  All rights reserved. + * Copyright 2015, Joyent, Inc.   */  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/ @@ -60,6 +60,7 @@  #include <sys/cyclic.h>  #include <sys/dtrace.h>  #include <sys/sdt.h> +#include <sys/brand.h>  #include <sys/signalfd.h>  const k_sigset_t nullsmask = {0, 0, 0}; @@ -148,6 +149,21 @@ signal_is_blocked(kthread_t *t, int sig)  }  /* + * Return true if the signal can safely be ignored. + * That is, if the signal is included in the p_ignore mask and doing so is not + * forbidden by any process branding. + */ +static int +sig_ignorable(proc_t *p, klwp_t *lwp, int sig) +{ +	return (sigismember(&p->p_ignore, sig) &&	/* sig in ignore mask */ +	    !(PROC_IS_BRANDED(p) &&			/* allowed by brand */ +	    BROP(p)->b_sig_ignorable != NULL && +	    BROP(p)->b_sig_ignorable(p, lwp, sig) == B_FALSE)); + +} + +/*   * Return true if the signal can safely be discarded on generation.   * That is, if there is no need for the signal on the receiving end.   * The answer is true if the process is a zombie or @@ -159,12 +175,13 @@ signal_is_blocked(kthread_t *t, int sig)   *	the signal is not being accepted via sigwait()   */  static int -sig_discardable(proc_t *p, int sig) +sig_discardable(proc_t *p, kthread_t *tp, int sig)  {  	kthread_t *t = p->p_tlist; +	klwp_t *lwp = (tp == NULL) ? NULL : tp->t_lwp;  	return (t == NULL ||		/* if zombie or ... */ -	    (sigismember(&p->p_ignore, sig) &&	/* signal is ignored */ +	    (sig_ignorable(p, lwp, sig) &&		/* signal is ignored */  	    t->t_forw == t &&			/* and single-threaded */  	    !tracing(p, sig) &&			/* and no /proc tracing */  	    !signal_is_blocked(t, sig) &&	/* and signal not blocked */ @@ -200,7 +217,7 @@ eat_signal(kthread_t *t, int sig)  		    !(ttoproc(t)->p_proc_flag & P_PR_LOCK)) {  			ttoproc(t)->p_stopsig = 0;  			t->t_dtrace_stop = 0; -			t->t_schedflag |= TS_XSTART | TS_PSTART; +			t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART;  			setrun_locked(t);  		} else if (t != curthread && t->t_state == TS_ONPROC) {  			aston(t);	/* make it do issig promptly */ @@ -297,7 +314,7 @@ sigtoproc(proc_t *p, kthread_t *t, int sig)  		}  	} -	if (sig_discardable(p, sig)) { +	if (sig_discardable(p, t, sig)) {  		DTRACE_PROC3(signal__discard, kthread_t *, p->p_tlist,  		    proc_t *, p, int, sig);  		return; @@ -497,7 +514,7 @@ issig_justlooking(void)  			if (sigismember(&set, sig) &&  			    (tracing(p, sig) ||  			    sigismember(&t->t_sigwait, sig) || -			    !sigismember(&p->p_ignore, sig))) { +			    !sig_ignorable(p, lwp, sig))) {  				/*  				 * Don't promote a signal that will stop  				 * the process when lwp_nostop is set. 
@@ -623,6 +640,21 @@ issig_forreal(void)  		}  		/* +		 * Allow the brand the chance to alter (or suppress) delivery +		 * of this signal. +		 */ +		if (PROC_IS_BRANDED(p) && BROP(p)->b_issig_stop != NULL) { +			/* +			 * The brand hook will return 0 if it would like +			 * us to drive on, or -1 if we should restart +			 * the loop to check other conditions. +			 */ +			if (BROP(p)->b_issig_stop(p, lwp) != 0) { +				continue; +			} +		} + +		/*  		 * Honor requested stop before dealing with the  		 * current signal; a debugger may change it.  		 * Do not want to go back to loop here since this is a special @@ -656,7 +688,7 @@ issig_forreal(void)  			lwp->lwp_cursig = 0;  			lwp->lwp_extsig = 0;  			if (sigismember(&t->t_sigwait, sig) || -			    (!sigismember(&p->p_ignore, sig) && +			    (!sig_ignorable(p, lwp, sig) &&  			    !isjobstop(sig))) {  				if (p->p_flag & (SEXITLWPS|SKILLED)) {  					sig = SIGKILL; @@ -708,7 +740,7 @@ issig_forreal(void)  				toproc = 0;  				if (tracing(p, sig) ||  				    sigismember(&t->t_sigwait, sig) || -				    !sigismember(&p->p_ignore, sig)) { +				    !sig_ignorable(p, lwp, sig)) {  					if (sigismember(&t->t_extsig, sig))  						ext = 1;  					break; @@ -722,7 +754,7 @@ issig_forreal(void)  				toproc = 1;  				if (tracing(p, sig) ||  				    sigismember(&t->t_sigwait, sig) || -				    !sigismember(&p->p_ignore, sig)) { +				    !sig_ignorable(p, lwp, sig)) {  					if (sigismember(&p->p_extsig, sig))  						ext = 1;  					break; @@ -954,6 +986,16 @@ stop(int why, int what)  		}  		break; +	case PR_BRAND: +		/* +		 * We have been stopped by the brand code for a brand-private +		 * reason.  This is an asynchronous stop affecting only this +		 * LWP. +		 */ +		VERIFY(PROC_IS_BRANDED(p)); +		flags &= ~TS_BSTART; +		break; +  	default:	/* /proc stop */  		flags &= ~TS_PSTART;  		/* @@ -1065,7 +1107,7 @@ stop(int why, int what)  		}  	} -	if (why != PR_JOBCONTROL && why != PR_CHECKPOINT) { +	if (why != PR_JOBCONTROL && why != PR_CHECKPOINT && why != PR_BRAND) {  		/*  		 * Do process-level notification when all lwps are  		 * either stopped on events of interest to /proc @@ -1171,6 +1213,13 @@ stop(int why, int what)  	if (why == PR_CHECKPOINT)  		del_one_utstop(); +	/* +	 * Allow the brand to post notification of this stop condition. +	 */ +	if (PROC_IS_BRANDED(p) && BROP(p)->b_stop_notify != NULL) { +		BROP(p)->b_stop_notify(p, lwp, why, what); +	} +  	thread_lock(t);  	ASSERT((t->t_schedflag & TS_ALLSTART) == 0);  	t->t_schedflag |= flags; @@ -1192,7 +1241,7 @@ stop(int why, int what)  		    (p->p_flag & (SEXITLWPS|SKILLED))) {  			p->p_stopsig = 0;  			thread_lock(t); -			t->t_schedflag |= TS_XSTART | TS_PSTART; +			t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART;  			setrun_locked(t);  			thread_unlock_nopreempt(t);  		} else if (why == PR_JOBCONTROL) { @@ -1327,7 +1376,7 @@ psig(void)  	 * this signal from pending to current (we dropped p->p_lock).  	 * This can happen only in a multi-threaded process.  	 */ -	if (sigismember(&p->p_ignore, sig) || +	if (sig_ignorable(p, lwp, sig) ||  	    (func == SIG_DFL && sigismember(&stopdefault, sig))) {  		lwp->lwp_cursig = 0;  		lwp->lwp_extsig = 0; @@ -1771,9 +1820,12 @@ post_sigcld(proc_t *cp, sigqueue_t *sqp)  			/*  			 * This can only happen when the parent is init.  			 * (See call to sigcld(q, NULL) in exit().) -			 * Use KM_NOSLEEP to avoid deadlock. +			 * Use KM_NOSLEEP to avoid deadlock. The child procs +			 * initpid can be 1 for zlogin.  			 
*/ -			ASSERT(pp == proc_init); +			ASSERT(pp->p_pidp->pid_id == +			    cp->p_zone->zone_proc_initpid || +			    pp->p_pidp->pid_id == 1);  			winfo(cp, &info, 0);  			sigaddq(pp, NULL, &info, KM_NOSLEEP);  		} else { @@ -1804,6 +1856,15 @@ sigcld_repost()  	sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);  	mutex_enter(&pidlock); +	if (PROC_IS_BRANDED(pp) && BROP(pp)->b_sigcld_repost != NULL) { +		/* +		 * Allow the brand to inject synthetic SIGCLD signals. +		 */ +		if (BROP(pp)->b_sigcld_repost(pp, sqp) == 0) { +			mutex_exit(&pidlock); +			return; +		} +	}  	for (cp = pp->p_child; cp; cp = cp->p_sibling) {  		if (cp->p_pidflag & CLDPEND) {  			post_sigcld(cp, sqp); @@ -2115,7 +2176,7 @@ sigaddqa(proc_t *p, kthread_t *t, sigqueue_t *sigqp)  	ASSERT(MUTEX_HELD(&p->p_lock));  	ASSERT(sig >= 1 && sig < NSIG); -	if (sig_discardable(p, sig)) +	if (sig_discardable(p, t, sig))  		siginfofree(sigqp);  	else  		sigaddqins(p, t, sigqp); @@ -2141,7 +2202,7 @@ sigaddq(proc_t *p, kthread_t *t, k_siginfo_t *infop, int km_flags)  	 * blocking the signal (it *could* change it's mind while  	 * the signal is pending) then don't bother creating one.  	 */ -	if (!sig_discardable(p, sig) && +	if (!sig_discardable(p, t, sig) &&  	    (sigismember(&p->p_siginfo, sig) ||  	    (curproc->p_ct_process != p->p_ct_process) ||  	    (sig == SIGCLD && SI_FROMKERNEL(infop))) && diff --git a/usr/src/uts/common/os/smb_subr.c b/usr/src/uts/common/os/smb_subr.c index 6084676b17..6dc7230bed 100644 --- a/usr/src/uts/common/os/smb_subr.c +++ b/usr/src/uts/common/os/smb_subr.c @@ -25,7 +25,9 @@   * Use is subject to license terms.   */ -#pragma ident	"%Z%%M%	%I%	%E% SMI" +/* + * Copyright (c) 2015 Joyent, Inc.  All rights reserved. + */  #include <sys/smbios_impl.h>  #include <sys/cmn_err.h> @@ -43,13 +45,13 @@ smb_strerror(int err)  void *  smb_alloc(size_t len)  { -	return (kmem_alloc(len, KM_SLEEP)); +	return (len > 0 ? kmem_alloc(len, KM_SLEEP) : NULL);  }  void *  smb_zalloc(size_t len)  { -	return (kmem_zalloc(len, KM_SLEEP)); +	return (len > 0 ? kmem_zalloc(len, KM_SLEEP) : NULL);  }  void diff --git a/usr/src/uts/common/os/streamio.c b/usr/src/uts/common/os/streamio.c index 62f94729cf..21ec25b5b3 100644 --- a/usr/src/uts/common/os/streamio.c +++ b/usr/src/uts/common/os/streamio.c @@ -24,7 +24,7 @@  /*   * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2017 Joyent, Inc.   */  #include <sys/types.h> @@ -77,6 +77,7 @@  #include <sys/policy.h>  #include <sys/dld.h>  #include <sys/zone.h> +#include <sys/limits.h>  #include <c2/audit.h>  /* @@ -985,12 +986,20 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first,  		 * (registered in sd_wakeq).  		 
*/  		struiod_t uiod; +		struct iovec buf[IOV_MAX_STACK]; +		int iovlen = 0;  		if (first)  			stp->sd_wakeq &= ~RSLEEP; -		(void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, -		    sizeof (uiod.d_iov) / sizeof (*uiod.d_iov)); +		if (uiop->uio_iovcnt > IOV_MAX_STACK) { +			iovlen = uiop->uio_iovcnt * sizeof (iovec_t); +			uiod.d_iov = kmem_alloc(iovlen, KM_SLEEP); +		} else { +			uiod.d_iov = buf; +		} + +		(void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, uiop->uio_iovcnt);  		uiod.d_mp = 0;  		/*  		 * Mark that a thread is in rwnext on the read side @@ -1029,6 +1038,8 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first,  			if ((bp = uiod.d_mp) != NULL) {  				*errorp = 0;  				ASSERT(MUTEX_HELD(&stp->sd_lock)); +				if (iovlen != 0) +					kmem_free(uiod.d_iov, iovlen);  				return (bp);  			}  			error = 0; @@ -1048,8 +1059,14 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first,  		} else {  			*errorp = error;  			ASSERT(MUTEX_HELD(&stp->sd_lock)); +			if (iovlen != 0) +				kmem_free(uiod.d_iov, iovlen);  			return (NULL);  		} + +		if (iovlen != 0) +			kmem_free(uiod.d_iov, iovlen); +  		/*  		 * Try a getq in case a rwnext() generated mblk  		 * has bubbled up via strrput(). @@ -2544,6 +2561,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize,      int b_flag, int pri, int flags)  {  	struiod_t uiod; +	struct iovec buf[IOV_MAX_STACK]; +	int iovlen = 0;  	mblk_t *mp;  	queue_t *wqp = stp->sd_wrq;  	int error = 0; @@ -2635,13 +2654,21 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize,  	mp->b_flag |= b_flag;  	mp->b_band = (uchar_t)pri; -	(void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, -	    sizeof (uiod.d_iov) / sizeof (*uiod.d_iov)); +	if (uiop->uio_iovcnt > IOV_MAX_STACK) { +		iovlen = uiop->uio_iovcnt * sizeof (iovec_t); +		uiod.d_iov = (struct iovec *)kmem_alloc(iovlen, KM_SLEEP); +	} else { +		uiod.d_iov = buf; +	} + +	(void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, uiop->uio_iovcnt);  	uiod.d_uio.uio_offset = 0;  	uiod.d_mp = mp;  	error = rwnext(wqp, &uiod);  	if (! 
uiod.d_mp) {  		uioskip(uiop, *iosize); +		if (iovlen != 0) +			kmem_free(uiod.d_iov, iovlen);  		return (error);  	}  	ASSERT(mp == uiod.d_mp); @@ -2659,17 +2686,23 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize,  		error = 0;  	} else {  		freemsg(mp); +		if (iovlen != 0) +			kmem_free(uiod.d_iov, iovlen);  		return (error);  	}  	/* Have to check canput before consuming data from the uio */  	if (pri == 0) {  		if (!canputnext(wqp) && !(flags & MSG_IGNFLOW)) {  			freemsg(mp); +			if (iovlen != 0) +				kmem_free(uiod.d_iov, iovlen);  			return (EWOULDBLOCK);  		}  	} else {  		if (!bcanputnext(wqp, pri) && !(flags & MSG_IGNFLOW)) {  			freemsg(mp); +			if (iovlen != 0) +				kmem_free(uiod.d_iov, iovlen);  			return (EWOULDBLOCK);  		}  	} @@ -2677,6 +2710,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize,  	/* Copyin data from the uio */  	if ((error = struioget(wqp, mp, &uiod, 0)) != 0) {  		freemsg(mp); +		if (iovlen != 0) +			kmem_free(uiod.d_iov, iovlen);  		return (error);  	}  	uioskip(uiop, *iosize); @@ -2693,6 +2728,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize,  		putnext(wqp, mp);  		stream_runservice(stp);  	} +	if (iovlen != 0) +		kmem_free(uiod.d_iov, iovlen);  	return (0);  } @@ -3178,6 +3215,7 @@ job_control_type(int cmd)  	case JAGENT:	/* Obsolete */  	case JTRUN:	/* Obsolete */  	case JXTPROTO:	/* Obsolete */ +	case TIOCSETLD:  		return (JCSETP);  	} @@ -8174,12 +8212,8 @@ out:   * an M_PROTO/M_PCPROTO part).   */  int -strpoll( -	struct stdata *stp, -	short events_arg, -	int anyyet, -	short *reventsp, -	struct pollhead **phpp) +strpoll(struct stdata *stp, short events_arg, int anyyet, short *reventsp, +    struct pollhead **phpp)  {  	int events = (ushort_t)events_arg;  	int retevents = 0; @@ -8316,8 +8350,7 @@ chkrd:  				retevents |= (events & (POLLIN | POLLRDBAND));  			break;  		} -		if (! (retevents & normevents) && -		    (stp->sd_wakeq & RSLEEP)) { +		if (!(retevents & normevents) && (stp->sd_wakeq & RSLEEP)) {  			/*  			 * Sync stream barrier read queue has data.  			 */ @@ -8328,19 +8361,11 @@ chkrd:  			retevents |= normevents;  	} -	*reventsp = (short)retevents; -	if (retevents && !(events & POLLET)) { -		if (headlocked) -			mutex_exit(&stp->sd_lock); -		return (0); -	} -  	/* -	 * If poll() has not found any events yet, set up event cell -	 * to wake up the poll if a requested event occurs on this -	 * stream.  Check for collisions with outstanding poll requests. +	 * Pass back a pollhead if no events are pending or if edge-triggering +	 * has been configured on this resource.  	 */ -	if (!anyyet) { +	if ((retevents == 0 && !anyyet) || (events & POLLET)) {  		*phpp = &stp->sd_pollist;  		if (headlocked == 0) {  			if (polllock(&stp->sd_pollist, &stp->sd_lock) != 0) { @@ -8351,6 +8376,8 @@ chkrd:  		}  		stp->sd_rput_opt |= SR_POLLIN;  	} + +	*reventsp = (short)retevents;  	if (headlocked)  		mutex_exit(&stp->sd_lock);  	return (0); diff --git a/usr/src/uts/common/os/sysent.c b/usr/src/uts/common/os/sysent.c index b3861dec03..4dd48c7e19 100644 --- a/usr/src/uts/common/os/sysent.c +++ b/usr/src/uts/common/os/sysent.c @@ -23,7 +23,7 @@   * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.   * Copyright 2012 Milan Jurik. All rights reserved.   * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright (c) 2015, Joyent, Inc. + * Copyright 2016 Joyent, Inc.   
*/  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/ @@ -61,8 +61,7 @@ struct mmaplf32a;  int	access(char *, int);  int	alarm(int);  int	auditsys(struct auditcalls *, rval_t *); -int64_t	brandsys(int, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, -    uintptr_t); +int64_t	brandsys(int, uintptr_t, uintptr_t, uintptr_t, uintptr_t);  intptr_t	brk(caddr_t);  int	chdir(char *);  int	chmod(char *, int); @@ -647,7 +646,7 @@ struct sysent sysent[NSYSCALL] =  			SYSENT_NOSYS(),  			SYSENT_C("llseek",	llseek32,	4)),  	/* 176 */ SYSENT_LOADABLE(),		/* inst_sync */ -	/* 177 */ SYSENT_CI("brandsys",		brandsys,	6), +	/* 177 */ SYSENT_CI("brandsys",		brandsys,	5),  	/* 178 */ SYSENT_LOADABLE(),		/* kaio */  	/* 179 */ SYSENT_LOADABLE(),		/* cpc */  	/* 180 */ SYSENT_CI("lgrpsys",		lgrpsys,	3), @@ -1002,7 +1001,7 @@ struct sysent sysent32[NSYSCALL] =  	/* 174 */ SYSENT_CI("pwrite",		pwrite32,		4),  	/* 175 */ SYSENT_C("llseek",		llseek32,	4),  	/* 176 */ SYSENT_LOADABLE32(),		/* inst_sync */ -	/* 177 */ SYSENT_CI("brandsys",		brandsys,	6), +	/* 177 */ SYSENT_CI("brandsys",		brandsys,	5),  	/* 178 */ SYSENT_LOADABLE32(),		/* kaio */  	/* 179 */ SYSENT_LOADABLE32(),		/* cpc */  	/* 180 */ SYSENT_CI("lgrpsys",		lgrpsys,	3), @@ -1094,18 +1093,20 @@ char **syscallnames;  systrace_sysent_t *systrace_sysent;  void (*systrace_probe)(dtrace_id_t, uintptr_t, uintptr_t, -    uintptr_t, uintptr_t, uintptr_t, uintptr_t); +    uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t);  /*ARGSUSED*/  void  systrace_stub(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, -    uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) +    uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, +    uintptr_t arg6, uintptr_t arg7)  {}  /*ARGSUSED*/  int64_t  dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, -    uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) +    uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6, +    uintptr_t arg7)  {  	systrace_sysent_t *sy = &systrace_sysent[curthread->t_sysnum];  	dtrace_id_t id; @@ -1113,7 +1114,8 @@ dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,  	proc_t *p;  	if ((id = sy->stsy_entry) != DTRACE_IDNONE) -		(*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5); +		(*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5, +		    arg6, arg7);  	/*  	 * We want to explicitly allow DTrace consumers to stop a process @@ -1127,14 +1129,15 @@ dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,  	}  	mutex_exit(&p->p_lock); -	rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5); +	rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5, +	    arg6, arg7);  	if (ttolwp(curthread)->lwp_errno != 0)  		rval = -1;  	if ((id = sy->stsy_return) != DTRACE_IDNONE)  		(*systrace_probe)(id, (uintptr_t)rval, (uintptr_t)rval, -		    (uintptr_t)((int64_t)rval >> 32), 0, 0, 0); +		    (uintptr_t)((int64_t)rval >> 32), 0, 0, 0, 0, 0);  	return (rval);  } @@ -1146,7 +1149,8 @@ systrace_sysent_t *systrace_sysent32;  /*ARGSUSED*/  int64_t  dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, -    uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) +    uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6, +    uintptr_t arg7)  {  	systrace_sysent_t *sy = &systrace_sysent32[curthread->t_sysnum];  	dtrace_id_t id; @@ -1154,7 +1158,8 @@ dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,  	proc_t *p;  	if ((id = sy->stsy_entry) 
!= DTRACE_IDNONE) -		(*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5); +		(*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5, arg6, +		    arg7);  	/*  	 * We want to explicitly allow DTrace consumers to stop a process @@ -1168,14 +1173,15 @@ dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,  	}  	mutex_exit(&p->p_lock); -	rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5); +	rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5, arg6, +	    arg7);  	if (ttolwp(curthread)->lwp_errno != 0)  		rval = -1;  	if ((id = sy->stsy_return) != DTRACE_IDNONE)  		(*systrace_probe)(id, (uintptr_t)rval, (uintptr_t)rval, -		    (uintptr_t)((uint64_t)rval >> 32), 0, 0, 0); +		    (uintptr_t)((uint64_t)rval >> 32), 0, 0, 0, 0, 0);  	return (rval);  } @@ -1203,5 +1209,5 @@ dtrace_systrace_rtt(void)  	}  	if ((id = sy->stsy_return) != DTRACE_IDNONE) -		(*systrace_probe)(id, 0, 0, 0, 0, 0, 0); +		(*systrace_probe)(id, 0, 0, 0, 0, 0, 0, 0, 0);  } diff --git a/usr/src/uts/common/os/timer.c b/usr/src/uts/common/os/timer.c index b25a6cbcf1..5453ebf380 100644 --- a/usr/src/uts/common/os/timer.c +++ b/usr/src/uts/common/os/timer.c @@ -25,11 +25,12 @@   */  /* - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright 2017 Joyent, Inc.   */  #include <sys/timer.h>  #include <sys/systm.h> +#include <sys/sysmacros.h>  #include <sys/param.h>  #include <sys/kmem.h>  #include <sys/debug.h> @@ -81,6 +82,7 @@ timer_lock(proc_t *p, itimer_t *it)   * waiters.  p_lock must be held on entry; it will not be dropped by   * timer_unlock().   */ +/* ARGSUSED */  static void  timer_unlock(proc_t *p, itimer_t *it)  { @@ -123,6 +125,7 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it)  		timer_lock(p, it);  	} +	ASSERT(p->p_itimer_sz > tid);  	ASSERT(p->p_itimer[tid] == it);  	p->p_itimer[tid] = NULL; @@ -137,7 +140,7 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it)  	it->it_backend->clk_timer_delete(it); -	if (it->it_portev) { +	if (it->it_flags & IT_PORT) {  		mutex_enter(&it->it_mutex);  		if (it->it_portev) {  			port_kevent_t	*pev; @@ -199,18 +202,20 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it)  static itimer_t *  timer_grab(proc_t *p, timer_t tid)  { -	itimer_t **itp, *it; +	itimer_t *it; -	if (tid >= timer_max || tid < 0) +	if (tid < 0) {  		return (NULL); +	}  	mutex_enter(&p->p_lock); - -	if ((itp = p->p_itimer) == NULL || (it = itp[tid]) == NULL) { +	if (p->p_itimer == NULL || tid >= p->p_itimer_sz || +	    (it = p->p_itimer[tid]) == NULL) {  		mutex_exit(&p->p_lock);  		return (NULL);  	} +	/* This may drop p_lock temporarily. */  	timer_lock(p, it);  	if (it->it_lock & ITLK_REMOVE) { @@ -232,7 +237,7 @@ timer_grab(proc_t *p, timer_t tid)   * should not be held on entry; timer_release() will acquire p_lock but   * will drop it before returning.   */ -static void +void  timer_release(proc_t *p, itimer_t *it)  {  	mutex_enter(&p->p_lock); @@ -245,7 +250,7 @@ timer_release(proc_t *p, itimer_t *it)   * p_lock should not be held on entry; timer_delete_grabbed() will acquire   * p_lock, but will drop it before returning.   */ -static void +void  timer_delete_grabbed(proc_t *p, timer_t tid, itimer_t *it)  {  	mutex_enter(&p->p_lock); @@ -258,6 +263,13 @@ clock_timer_init()  {  	clock_timer_cache = kmem_cache_create("timer_cache",  	    sizeof (itimer_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + +	/* +	 * Push the timer_max limit up to at least 4 * NCPU.  
Due to the way +	 * NCPU is defined, proper initialization of the timer limit is +	 * performed at runtime. +	 */ +	timer_max = MAX(NCPU * 4, timer_max);  }  void @@ -453,6 +465,9 @@ timer_fire(itimer_t *it)  			it->it_pending = 1;  			port_send_event((port_kevent_t *)it->it_portev);  			mutex_exit(&it->it_mutex); +		} else if (it->it_flags & IT_CALLBACK) { +			it->it_cb_func(it); +			ASSERT(MUTEX_NOT_HELD(&it->it_mutex));  		} else if (it->it_flags & IT_SIGNAL) {  			it->it_pending = 1;  			mutex_exit(&it->it_mutex); @@ -466,159 +481,175 @@ timer_fire(itimer_t *it)  		mutex_exit(&p->p_lock);  } -int -timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid) +/* + * Allocate an itimer_t and find and appropriate slot for it in p_itimer. + * Acquires p_lock and holds it on return, regardless of success. + */ +static itimer_t * +timer_alloc(proc_t *p, timer_t *id)  { -	struct sigevent ev; -	proc_t *p = curproc; -	clock_backend_t *backend; -	itimer_t *it, **itp; -	sigqueue_t *sigq; -	cred_t *cr = CRED(); -	int error = 0; -	timer_t i; -	port_notify_t tim_pnevp; -	port_kevent_t *pkevp = NULL; +	itimer_t *it, **itp = NULL; +	uint_t i; -	if ((backend = CLOCK_BACKEND(clock)) == NULL) -		return (set_errno(EINVAL)); +	ASSERT(MUTEX_NOT_HELD(&p->p_lock)); -	if (evp != NULL) { -		/* -		 * short copyin() for binary compatibility -		 * fetch oldsigevent to determine how much to copy in. -		 */ -		if (get_udatamodel() == DATAMODEL_NATIVE) { -			if (copyin(evp, &ev, sizeof (struct oldsigevent))) -				return (set_errno(EFAULT)); +	it = kmem_cache_alloc(clock_timer_cache, KM_SLEEP); +	bzero(it, sizeof (itimer_t)); +	mutex_init(&it->it_mutex, NULL, MUTEX_DEFAULT, NULL); -			if (ev.sigev_notify == SIGEV_PORT || -			    ev.sigev_notify == SIGEV_THREAD) { -				if (copyin(ev.sigev_value.sival_ptr, &tim_pnevp, -				    sizeof (port_notify_t))) -					return (set_errno(EFAULT)); +	mutex_enter(&p->p_lock); +retry: +	if (p->p_itimer != NULL) { +		for (i = 0; i < p->p_itimer_sz; i++) { +			if (p->p_itimer[i] == NULL) { +				itp = &(p->p_itimer[i]); +				break;  			} -#ifdef	_SYSCALL32_IMPL -		} else { -			struct sigevent32 ev32; -			port_notify32_t tim_pnevp32; +		} +	} -			if (copyin(evp, &ev32, sizeof (struct oldsigevent32))) -				return (set_errno(EFAULT)); -			ev.sigev_notify = ev32.sigev_notify; -			ev.sigev_signo = ev32.sigev_signo; +	/* +	 * A suitable slot was not found.  If possible, allocate (or resize) +	 * the p_itimer array and try again. +	 */ +	if (itp == NULL) { +		uint_t target_sz = _TIMER_ALLOC_INIT; +		itimer_t **itp_new; + +		if (p->p_itimer != NULL) { +			ASSERT(p->p_itimer_sz != 0); + +			target_sz = p->p_itimer_sz * 2; +		} +		/* +		 * Protect against exceeding the max or overflow +		 */ +		if (target_sz > timer_max || target_sz > INT_MAX || +		    target_sz < p->p_itimer_sz) { +			kmem_cache_free(clock_timer_cache, it); +			return (NULL); +		} +		mutex_exit(&p->p_lock); +		itp_new = kmem_zalloc(target_sz * sizeof (itimer_t *), +		    KM_SLEEP); +		mutex_enter(&p->p_lock); +		if (target_sz <= p->p_itimer_sz) {  			/* -			 * See comment in sigqueue32() on handling of 32-bit -			 * sigvals in a 64-bit kernel. +			 * A racing thread performed the resize while we were +			 * waiting outside p_lock.  Discard our now-useless +			 * allocation and retry.  			 
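The timer_alloc() retry above is a general grow-under-dropped-lock idiom: the sleeping allocation happens outside p_lock, and the size is re-checked afterwards so a racing thread's resize simply wins. A minimal sketch of the pattern, using illustrative names (grow_ptr_array, arrp, szp) that are not part of this patch:

static boolean_t
grow_ptr_array(kmutex_t *lock, void ***arrp, uint_t *szp, uint_t want)
{
	void **nval;

	ASSERT(MUTEX_HELD(lock));

	/* The KM_SLEEP allocation must not be done while holding the lock. */
	mutex_exit(lock);
	nval = kmem_zalloc(want * sizeof (void *), KM_SLEEP);
	mutex_enter(lock);

	if (want <= *szp) {
		/* A racing thread already grew the array; discard ours. */
		kmem_free(nval, want * sizeof (void *));
		return (B_FALSE);
	}

	if (*arrp != NULL) {
		bcopy(*arrp, nval, *szp * sizeof (void *));
		kmem_free(*arrp, *szp * sizeof (void *));
	}
	*arrp = nval;
	*szp = want;
	return (B_TRUE);
}

Either way the caller re-scans the array for a free slot, exactly as the retry label above does.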
*/ -			ev.sigev_value.sival_int = ev32.sigev_value.sival_int; -			if (ev.sigev_notify == SIGEV_PORT || -			    ev.sigev_notify == SIGEV_THREAD) { -				if (copyin((void *)(uintptr_t) -				    ev32.sigev_value.sival_ptr, -				    (void *)&tim_pnevp32, -				    sizeof (port_notify32_t))) -					return (set_errno(EFAULT)); -				tim_pnevp.portnfy_port = -				    tim_pnevp32.portnfy_port; -				tim_pnevp.portnfy_user = -				    (void *)(uintptr_t)tim_pnevp32.portnfy_user; +			kmem_free(itp_new, target_sz * sizeof (itimer_t *)); +			goto retry; +		} else { +			/* +			 * Instantiate the larger allocation and select the +			 * first fresh entry for use. +			 */ +			if (p->p_itimer != NULL) { +				uint_t old_sz; + +				old_sz = p->p_itimer_sz; +				bcopy(p->p_itimer, itp_new, +				    old_sz * sizeof (itimer_t *)); +				kmem_free(p->p_itimer, +				    old_sz * sizeof (itimer_t *)); + +				/* +				 * Short circuit to use the first free entry in +				 * the new allocation.  It's possible that +				 * other lower-indexed timers were freed while +				 * p_lock was dropped, but skipping over them +				 * is not harmful at all.  In the common case, +				 * we skip the need to walk over an array +				 * filled with timers before arriving at the +				 * slot we know is fresh from the allocation. +				 */ +				i = old_sz; +			} else { +				/* +				 * For processes lacking any existing timers, +				 * we can simply select the first entry. +				 */ +				i = 0;  			} -#endif +			p->p_itimer = itp_new; +			p->p_itimer_sz = target_sz;  		} -		switch (ev.sigev_notify) { -		case SIGEV_NONE: -			break; -		case SIGEV_SIGNAL: -			if (ev.sigev_signo < 1 || ev.sigev_signo >= NSIG) -				return (set_errno(EINVAL)); -			break; -		case SIGEV_THREAD: -		case SIGEV_PORT: -			break; -		default: -			return (set_errno(EINVAL)); -		} -	} else { -		/* -		 * Use the clock's default sigevent (this is a structure copy). -		 */ -		ev = backend->clk_default;  	} +	ASSERT(i <= INT_MAX); +	*id = (timer_t)i; +	return (it); +} + +/* + * Setup a timer + * + * This allocates an itimer_t (including a timer_t ID and slot in the process), + * wires it up according to the provided sigevent, and associates it with the + * desired clock backend.  Upon successful completion, the timer will be + * locked, preventing it from being armed via timer_settime() or deleted via + * timer_delete().  This gives the caller a chance to perform any last minute + * manipulations (such as configuring the IT_CALLBACK functionality and/or + * copying the timer_t out to userspace) before using timer_release() to unlock + * it or timer_delete_grabbed() to delete it. + */ +int +timer_setup(clock_backend_t *backend, struct sigevent *evp, port_notify_t *pnp, +    itimer_t **itp, timer_t *tidp) +{ +	proc_t *p = curproc; +	int error = 0; +	itimer_t *it; +	sigqueue_t *sigq; +	timer_t tid; +  	/* -	 * We'll allocate our timer and sigqueue now, before we grab p_lock. -	 * If we can't find an empty slot, we'll free them before returning. +	 * We'll allocate our sigqueue now, before we grab p_lock. +	 * If we can't find an empty slot, we'll free it before returning.  	 */ -	it = kmem_cache_alloc(clock_timer_cache, KM_SLEEP); -	bzero(it, sizeof (itimer_t)); -	mutex_init(&it->it_mutex, NULL, MUTEX_DEFAULT, NULL);  	sigq = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); -	mutex_enter(&p->p_lock); -  	/* -	 * If this is this process' first timer, we need to attempt to allocate -	 * an array of timerstr_t pointers.  
We drop p_lock to perform the -	 * allocation; if we return to discover that p_itimer is non-NULL, -	 * we will free our allocation and drive on. +	 * Allocate a timer and choose a slot for it. This acquires p_lock.  	 */ -	if ((itp = p->p_itimer) == NULL) { -		mutex_exit(&p->p_lock); -		itp = kmem_zalloc(timer_max * sizeof (itimer_t *), KM_SLEEP); -		mutex_enter(&p->p_lock); - -		if (p->p_itimer == NULL) -			p->p_itimer = itp; -		else { -			kmem_free(itp, timer_max * sizeof (itimer_t *)); -			itp = p->p_itimer; -		} -	} - -	for (i = 0; i < timer_max && itp[i] != NULL; i++) -		continue; +	it = timer_alloc(p, &tid); +	ASSERT(MUTEX_HELD(&p->p_lock)); -	if (i == timer_max) { -		/* -		 * We couldn't find a slot.  Drop p_lock, free the preallocated -		 * timer and sigqueue, and return an error. -		 */ +	if (it == NULL) {  		mutex_exit(&p->p_lock); -		kmem_cache_free(clock_timer_cache, it);  		kmem_free(sigq, sizeof (sigqueue_t)); - -		return (set_errno(EAGAIN)); +		return (EAGAIN);  	} -	ASSERT(i < timer_max && itp[i] == NULL); +	ASSERT(tid < p->p_itimer_sz && p->p_itimer[tid] == NULL); +	ASSERT(evp != NULL);  	/*  	 * If we develop other notification mechanisms, this will need  	 * to call into (yet another) backend.  	 */ -	sigq->sq_info.si_signo = ev.sigev_signo; -	if (evp == NULL) -		sigq->sq_info.si_value.sival_int = i; -	else -		sigq->sq_info.si_value = ev.sigev_value; +	sigq->sq_info.si_signo = evp->sigev_signo; +	sigq->sq_info.si_value = evp->sigev_value;  	sigq->sq_info.si_code = SI_TIMER;  	sigq->sq_info.si_pid = p->p_pid;  	sigq->sq_info.si_ctid = PRCTID(p);  	sigq->sq_info.si_zoneid = getzoneid(); -	sigq->sq_info.si_uid = crgetruid(cr); +	sigq->sq_info.si_uid = crgetruid(CRED());  	sigq->sq_func = timer_signal;  	sigq->sq_next = NULL;  	sigq->sq_backptr = it;  	it->it_sigq = sigq;  	it->it_backend = backend;  	it->it_lock = ITLK_LOCKED; -	itp[i] = it; - -	if (ev.sigev_notify == SIGEV_THREAD || -	    ev.sigev_notify == SIGEV_PORT) { +	if (evp->sigev_notify == SIGEV_THREAD || +	    evp->sigev_notify == SIGEV_PORT) {  		int port; +		port_kevent_t *pkevp = NULL; + +		ASSERT(pnp != NULL);  		/*  		 * This timer is programmed to use event port notification when @@ -638,18 +669,17 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid)  		 */  		it->it_flags |= IT_PORT; -		port = tim_pnevp.portnfy_port; +		port = pnp->portnfy_port;  		/* associate timer as event source with the port */  		error = port_associate_ksource(port, PORT_SOURCE_TIMER,  		    (port_source_t **)&it->it_portsrc, timer_close_port,  		    (void *)it, NULL);  		if (error) { -			itp[i] = NULL;		/* clear slot */  			mutex_exit(&p->p_lock);  			kmem_cache_free(clock_timer_cache, it);  			kmem_free(sigq, sizeof (sigqueue_t)); -			return (set_errno(error)); +			return (error);  		}  		/* allocate an event structure/slot */ @@ -658,23 +688,24 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid)  		if (error) {  			(void) port_dissociate_ksource(port, PORT_SOURCE_TIMER,  			    (port_source_t *)it->it_portsrc); -			itp[i] = NULL;		/* clear slot */  			mutex_exit(&p->p_lock);  			kmem_cache_free(clock_timer_cache, it);  			kmem_free(sigq, sizeof (sigqueue_t)); -			return (set_errno(error)); +			return (error);  		}  		/* initialize event data */ -		port_init_event(pkevp, i, tim_pnevp.portnfy_user, +		port_init_event(pkevp, tid, pnp->portnfy_user,  		    timer_port_callback, it);  		it->it_portev = pkevp;  		it->it_portfd = port;  	} else { -		if (ev.sigev_notify == SIGEV_SIGNAL) +		if 
(evp->sigev_notify == SIGEV_SIGNAL)  			it->it_flags |= IT_SIGNAL;  	} +	/* Populate the slot now that the timer is prepped. */ +	p->p_itimer[tid] = it;  	mutex_exit(&p->p_lock);  	/* @@ -687,17 +718,8 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid)  	it->it_lwp = ttolwp(curthread);  	it->it_proc = p; -	if (copyout(&i, tid, sizeof (timer_t)) != 0) { -		error = EFAULT; -		goto err; -	} - -	/* -	 * If we're here, then we have successfully created the timer; we -	 * just need to release the timer and return. -	 */ -	timer_release(p, it); - +	*itp = it; +	*tidp = tid;  	return (0);  err: @@ -708,11 +730,115 @@ err:  	 * impossible for a removal to be pending.  	 */  	ASSERT(!(it->it_lock & ITLK_REMOVE)); -	timer_delete_grabbed(p, i, it); +	timer_delete_grabbed(p, tid, it); -	return (set_errno(error)); +	return (error);  } + +int +timer_create(clockid_t clock, struct sigevent *evp, timer_t *tidp) +{ +	int error = 0; +	proc_t *p = curproc; +	clock_backend_t *backend; +	struct sigevent ev; +	itimer_t *it; +	timer_t tid; +	port_notify_t tim_pnevp; + +	if ((backend = CLOCK_BACKEND(clock)) == NULL) +		return (set_errno(EINVAL)); + +	if (evp != NULL) { +		/* +		 * short copyin() for binary compatibility +		 * fetch oldsigevent to determine how much to copy in. +		 */ +		if (get_udatamodel() == DATAMODEL_NATIVE) { +			if (copyin(evp, &ev, sizeof (struct oldsigevent))) +				return (set_errno(EFAULT)); + +			if (ev.sigev_notify == SIGEV_PORT || +			    ev.sigev_notify == SIGEV_THREAD) { +				if (copyin(ev.sigev_value.sival_ptr, &tim_pnevp, +				    sizeof (port_notify_t))) +					return (set_errno(EFAULT)); +			} +#ifdef	_SYSCALL32_IMPL +		} else { +			struct sigevent32 ev32; +			port_notify32_t tim_pnevp32; + +			if (copyin(evp, &ev32, sizeof (struct oldsigevent32))) +				return (set_errno(EFAULT)); +			ev.sigev_notify = ev32.sigev_notify; +			ev.sigev_signo = ev32.sigev_signo; +			/* +			 * See comment in sigqueue32() on handling of 32-bit +			 * sigvals in a 64-bit kernel. +			 */ +			ev.sigev_value.sival_int = ev32.sigev_value.sival_int; +			if (ev.sigev_notify == SIGEV_PORT || +			    ev.sigev_notify == SIGEV_THREAD) { +				if (copyin((void *)(uintptr_t) +				    ev32.sigev_value.sival_ptr, +				    (void *)&tim_pnevp32, +				    sizeof (port_notify32_t))) +					return (set_errno(EFAULT)); +				tim_pnevp.portnfy_port = +				    tim_pnevp32.portnfy_port; +				tim_pnevp.portnfy_user = +				    (void *)(uintptr_t)tim_pnevp32.portnfy_user; +			} +#endif +		} +		switch (ev.sigev_notify) { +		case SIGEV_NONE: +			break; +		case SIGEV_SIGNAL: +			if (ev.sigev_signo < 1 || ev.sigev_signo >= NSIG) +				return (set_errno(EINVAL)); +			break; +		case SIGEV_THREAD: +		case SIGEV_PORT: +			break; +		default: +			return (set_errno(EINVAL)); +		} +	} else { +		/* +		 * Use the clock's default sigevent (this is a structure copy). +		 */ +		ev = backend->clk_default; +	} + +	if ((error = timer_setup(backend, &ev, &tim_pnevp, &it, &tid)) != 0) { +		return (set_errno(error)); +	} + +	/* +	 * Populate si_value with the timer ID if no sigevent was passed in. +	 */ +	if (evp == NULL) { +		it->it_sigq->sq_info.si_value.sival_int = tid; +	} + +	if (copyout(&tid, tidp, sizeof (timer_t)) != 0) { +		timer_delete_grabbed(p, tid, it); +		return (set_errno(EFAULT)); +	} + +	/* +	 * If we're here, then we have successfully created the timer; we +	 * just need to release the timer and return. 
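Factoring timer_create() into timer_setup() plus a copyout/release tail is what opens the interface to in-kernel consumers: timer_setup() returns with the itimer_t still locked, so the caller can finish wiring it up before it can be armed or deleted. A hypothetical IT_CALLBACK consumer might look like the sketch below; timer_setup(), timer_release(), it_flags and it_cb_func are from this patch, everything else is illustrative, and the callback's duty to drop it_mutex is inferred from the ASSERT in timer_fire().

static void
example_timer_cb(itimer_t *it)
{
	/*
	 * timer_fire() invokes the callback with it_mutex held and asserts
	 * that it has been dropped by the time the callback returns.
	 */
	mutex_exit(&it->it_mutex);
	/* ... deferred work would be triggered here ... */
}

static int
example_callback_timer(clock_backend_t *be, itimer_t **itp, timer_t *tidp)
{
	struct sigevent ev;
	int err;

	bzero(&ev, sizeof (ev));
	ev.sigev_notify = SIGEV_NONE;

	if ((err = timer_setup(be, &ev, NULL, itp, tidp)) != 0)
		return (err);

	/* Still locked: the timer cannot be armed or deleted underneath us. */
	(*itp)->it_flags |= IT_CALLBACK;
	(*itp)->it_cb_func = example_timer_cb;

	timer_release(curproc, *itp);
	return (0);
}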
+	 */ +	timer_release(p, it); + +	return (0); +} + +  int  timer_gettime(timer_t tid, itimerspec_t *val)  { @@ -832,20 +958,23 @@ timer_getoverrun(timer_t tid)  void  timer_lwpexit(void)  { -	timer_t i; +	uint_t i;  	proc_t *p = curproc;  	klwp_t *lwp = ttolwp(curthread); -	itimer_t *it, **itp; +	itimer_t *it;  	ASSERT(MUTEX_HELD(&p->p_lock)); -	if ((itp = p->p_itimer) == NULL) +	if (p->p_itimer == NULL) {  		return; +	} -	for (i = 0; i < timer_max; i++) { -		if ((it = itp[i]) == NULL) +	for (i = 0; i < p->p_itimer_sz; i++) { +		if ((it = p->p_itimer[i]) == NULL) {  			continue; +		} +		/* This may drop p_lock temporarily. */  		timer_lock(p, it);  		if ((it->it_lock & ITLK_REMOVE) || it->it_lwp != lwp) { @@ -876,20 +1005,22 @@ timer_lwpexit(void)  void  timer_lwpbind()  { -	timer_t i; +	uint_t i;  	proc_t *p = curproc;  	klwp_t *lwp = ttolwp(curthread); -	itimer_t *it, **itp; +	itimer_t *it;  	ASSERT(MUTEX_HELD(&p->p_lock)); -	if ((itp = p->p_itimer) == NULL) +	if (p->p_itimer == NULL) {  		return; +	} -	for (i = 0; i < timer_max; i++) { -		if ((it = itp[i]) == NULL) +	for (i = 0; i < p->p_itimer_sz; i++) { +		if ((it = p->p_itimer[i]) == NULL)  			continue; +		/* This may drop p_lock temporarily. */  		timer_lock(p, it);  		if (!(it->it_lock & ITLK_REMOVE) && it->it_lwp == lwp) { @@ -911,16 +1042,19 @@ timer_lwpbind()  void  timer_exit(void)  { -	timer_t i; +	uint_t i;  	proc_t *p = curproc;  	ASSERT(p->p_itimer != NULL); +	ASSERT(p->p_itimer_sz != 0); -	for (i = 0; i < timer_max; i++) -		(void) timer_delete(i); +	for (i = 0; i < p->p_itimer_sz; i++) { +		(void) timer_delete((timer_t)i); +	} -	kmem_free(p->p_itimer, timer_max * sizeof (itimer_t *)); +	kmem_free(p->p_itimer, p->p_itimer_sz * sizeof (itimer_t *));  	p->p_itimer = NULL; +	p->p_itimer_sz = 0;  }  /* @@ -977,7 +1111,7 @@ timer_close_port(void *arg, int port, pid_t pid, int lastclose)  	for (tid = 0; tid < timer_max; tid++) {  		if ((it = timer_grab(p, tid)) == NULL)  			continue; -		if (it->it_portev) { +		if (it->it_flags & IT_PORT) {  			mutex_enter(&it->it_mutex);  			if (it->it_portfd == port) {  				port_kevent_t *pev; diff --git a/usr/src/uts/common/os/timers.c b/usr/src/uts/common/os/timers.c index 61acc6cf97..53be806026 100644 --- a/usr/src/uts/common/os/timers.c +++ b/usr/src/uts/common/os/timers.c @@ -22,6 +22,7 @@  /*   * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2016 Joyent, Inc.   */  /* @@ -1172,6 +1173,14 @@ timespectohz64(timespec_t *tv)  void  hrt2ts(hrtime_t hrt, timestruc_t *tsp)  { +#if defined(__amd64) +	/* +	 * The cleverness explained above is unecessary on x86_64 CPUs where +	 * modern compilers are able to optimize down to faster operations. +	 */ +	tsp->tv_sec = hrt / NANOSEC; +	tsp->tv_nsec = hrt % NANOSEC; +#else  	uint32_t sec, nsec, tmp;  	tmp = (uint32_t)(hrt >> 30); @@ -1193,20 +1202,28 @@ hrt2ts(hrtime_t hrt, timestruc_t *tsp)  	}  	tsp->tv_sec = (time_t)sec;  	tsp->tv_nsec = nsec; +#endif /* defined(__amd64) */  }  /*   * Convert from timestruc_t to hrtime_t. - * - * The code below is equivalent to: - * - *	hrt = tsp->tv_sec * NANOSEC + tsp->tv_nsec; - * - * but requires no integer multiply.   */  hrtime_t  ts2hrt(const timestruc_t *tsp)  { +#if defined(__amd64) || defined(__i386) +	/* +	 * On modern x86 CPUs, the simple version is faster. 
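The shift-and-subtract form kept in the #else branch below works because NANOSEC == 10^9 == 125 * 125 * 125 * 512, and a multiply by 125 needs no imul: x * 125 == (x << 7) - x - x - x. A quick userland check of the identity (not part of the kernel source):

#include <assert.h>
#include <stdint.h>

int
main(void)
{
	int64_t sec = 1234567, nsec = 987654321, hrt = sec;

	hrt = (hrt << 7) - hrt - hrt - hrt;	/* sec * 125 */
	hrt = (hrt << 7) - hrt - hrt - hrt;	/* sec * 125^2 */
	hrt = (hrt << 7) - hrt - hrt - hrt;	/* sec * 125^3 */
	hrt = (hrt << 9) + nsec;		/* sec * 10^9 + nsec */

	assert(hrt == sec * 1000000000LL + nsec);
	return (0);
}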
+	 */ +	return ((tsp->tv_sec * NANOSEC) + tsp->tv_nsec); +#else +	/* +	 * The code below is equivalent to: +	 * +	 *	hrt = tsp->tv_sec * NANOSEC + tsp->tv_nsec; +	 * +	 * but requires no integer multiply. +	 */  	hrtime_t hrt;  	hrt = tsp->tv_sec; @@ -1215,6 +1232,7 @@ ts2hrt(const timestruc_t *tsp)  	hrt = (hrt << 7) - hrt - hrt - hrt;  	hrt = (hrt << 9) + tsp->tv_nsec;  	return (hrt); +#endif /* defined(__amd64) || defined(__i386) */  }  /* @@ -1246,6 +1264,13 @@ tv2hrt(struct timeval *tvp)  void  hrt2tv(hrtime_t hrt, struct timeval *tvp)  { +#if defined(__amd64) +	/* +	 * Like hrt2ts, the simple version is faster on x86_64. +	 */ +	tvp->tv_sec = hrt / NANOSEC; +	tvp->tv_usec = (hrt % NANOSEC) / (NANOSEC / MICROSEC); +#else  	uint32_t sec, nsec, tmp;  	uint32_t q, r, t; @@ -1267,17 +1292,17 @@ hrt2tv(hrtime_t hrt, struct timeval *tvp)  		sec++;  	}  	tvp->tv_sec = (time_t)sec; -/* - * this routine is very similar to hr2ts, but requires microseconds - * instead of nanoseconds, so an interger divide by 1000 routine - * completes the conversion - */ +	/* +	 * this routine is very similar to hr2ts, but requires microseconds +	 * instead of nanoseconds, so an interger divide by 1000 routine +	 * completes the conversion +	 */  	t = (nsec >> 7) + (nsec >> 8) + (nsec >> 12);  	q = (nsec >> 1) + t + (nsec >> 15) + (t >> 11) + (t >> 14);  	q = q >> 9;  	r = nsec - q*1000;  	tvp->tv_usec = q + ((r + 24) >> 10); - +#endif /* defined(__amd64) */  }  int diff --git a/usr/src/uts/common/os/vmem.c b/usr/src/uts/common/os/vmem.c index b79b1b4cf7..e3da4df247 100644 --- a/usr/src/uts/common/os/vmem.c +++ b/usr/src/uts/common/os/vmem.c @@ -1621,7 +1621,7 @@ vmem_destroy(vmem_t *vmp)  	leaked = vmem_size(vmp, VMEM_ALLOC);  	if (leaked != 0) -		cmn_err(CE_WARN, "vmem_destroy('%s'): leaked %lu %s", +		cmn_err(CE_WARN, "!vmem_destroy('%s'): leaked %lu %s",  		    vmp->vm_name, leaked, (vmp->vm_cflags & VMC_IDENTIFIER) ?  		    "identifiers" : "bytes"); diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c index 5a5dc7d107..a8993524ac 100644 --- a/usr/src/uts/common/os/zone.c +++ b/usr/src/uts/common/os/zone.c @@ -21,7 +21,7 @@  /*   * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015, Joyent Inc. All rights reserved. + * Copyright 2016, Joyent Inc.   * Copyright (c) 2016 by Delphix. All rights reserved.   */ @@ -251,6 +251,8 @@  #include <sys/cpucaps.h>  #include <vm/seg.h>  #include <sys/mac.h> +#include <sys/rt.h> +#include <sys/fx.h>  /*   * This constant specifies the number of seconds that threads waiting for @@ -371,8 +373,12 @@ static char *zone_ref_subsys_names[] = {  rctl_hndl_t rc_zone_cpu_shares;  rctl_hndl_t rc_zone_locked_mem;  rctl_hndl_t rc_zone_max_swap; +rctl_hndl_t rc_zone_phys_mem;  rctl_hndl_t rc_zone_max_lofi;  rctl_hndl_t rc_zone_cpu_cap; +rctl_hndl_t rc_zone_cpu_baseline; +rctl_hndl_t rc_zone_cpu_burst_time; +rctl_hndl_t rc_zone_zfs_io_pri;  rctl_hndl_t rc_zone_nlwps;  rctl_hndl_t rc_zone_nprocs;  rctl_hndl_t rc_zone_shmmax; @@ -418,8 +424,9 @@ static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,   * Version 5 alters the zone_boot system call, and converts its old   *     bootargs parameter to be set by the zone_setattr API instead.   * Version 6 adds the flag argument to zone_create. + * Version 7 adds the requested zoneid to zone_create.   
*/ -static const int ZONE_SYSCALL_API_VERSION = 6; +static const int ZONE_SYSCALL_API_VERSION = 7;  /*   * Certain filesystems (such as NFS and autofs) need to know which zone @@ -1378,6 +1385,114 @@ static rctl_ops_t zone_cpu_cap_ops = {  /*ARGSUSED*/  static rctl_qty_t +zone_cpu_base_get(rctl_t *rctl, struct proc *p) +{ +	ASSERT(MUTEX_HELD(&p->p_lock)); +	return (cpucaps_zone_get_base(p->p_zone)); +} + +/* + * The zone cpu base is used to set the baseline CPU for the zone + * so we can track when the zone is bursting. + */ +/*ARGSUSED*/ +static int +zone_cpu_base_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, +    rctl_qty_t nv) +{ +	zone_t *zone = e->rcep_p.zone; + +	ASSERT(MUTEX_HELD(&p->p_lock)); +	ASSERT(e->rcep_t == RCENTITY_ZONE); + +	if (zone == NULL) +		return (0); + +	return (cpucaps_zone_set_base(zone, nv)); +} + +static rctl_ops_t zone_cpu_base_ops = { +	rcop_no_action, +	zone_cpu_base_get, +	zone_cpu_base_set, +	rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t +zone_cpu_burst_time_get(rctl_t *rctl, struct proc *p) +{ +	ASSERT(MUTEX_HELD(&p->p_lock)); +	return (cpucaps_zone_get_burst_time(p->p_zone)); +} + +/* + * The zone cpu burst time is used to set the amount of time CPU(s) can be + * bursting for the zone. + */ +/*ARGSUSED*/ +static int +zone_cpu_burst_time_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, +    rctl_qty_t nv) +{ +	zone_t *zone = e->rcep_p.zone; + +	ASSERT(MUTEX_HELD(&p->p_lock)); +	ASSERT(e->rcep_t == RCENTITY_ZONE); + +	if (zone == NULL) +		return (0); + +	return (cpucaps_zone_set_burst_time(zone, nv)); +} + +static rctl_ops_t zone_cpu_burst_time_ops = { +	rcop_no_action, +	zone_cpu_burst_time_get, +	zone_cpu_burst_time_set, +	rcop_no_test +}; + +/* + * zone.zfs-io-pri resource control support (IO priority). + */ +/*ARGSUSED*/ +static rctl_qty_t +zone_zfs_io_pri_get(rctl_t *rctl, struct proc *p) +{ +	ASSERT(MUTEX_HELD(&p->p_lock)); +	return (p->p_zone->zone_zfs_io_pri); +} + +/*ARGSUSED*/ +static int +zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, +    rctl_qty_t nv) +{ +	zone_t *zone = e->rcep_p.zone; + +	ASSERT(MUTEX_HELD(&p->p_lock)); +	ASSERT(e->rcep_t == RCENTITY_ZONE); + +	if (zone == NULL) +		return (0); + +	/* +	 * set priority to the new value. 
+	 */ +	zone->zone_zfs_io_pri = nv; +	return (0); +} + +static rctl_ops_t zone_zfs_io_pri_ops = { +	rcop_no_action, +	zone_zfs_io_pri_get, +	zone_zfs_io_pri_set, +	rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t  zone_lwps_usage(rctl_t *r, proc_t *p)  {  	rctl_qty_t nlwps; @@ -1704,6 +1819,39 @@ static rctl_ops_t zone_max_swap_ops = {  /*ARGSUSED*/  static rctl_qty_t +zone_phys_mem_usage(rctl_t *rctl, struct proc *p) +{ +	rctl_qty_t q; +	zone_t *z = p->p_zone; + +	ASSERT(MUTEX_HELD(&p->p_lock)); +	/* No additional lock because not enforced in the kernel */ +	q = z->zone_phys_mem; +	return (q); +} + +/*ARGSUSED*/ +static int +zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, +    rctl_qty_t nv) +{ +	ASSERT(MUTEX_HELD(&p->p_lock)); +	ASSERT(e->rcep_t == RCENTITY_ZONE); +	if (e->rcep_p.zone == NULL) +		return (0); +	e->rcep_p.zone->zone_phys_mem_ctl = nv; +	return (0); +} + +static rctl_ops_t zone_phys_mem_ops = { +	rcop_no_action, +	zone_phys_mem_usage, +	zone_phys_mem_set, +	rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t  zone_max_lofi_usage(rctl_t *rctl, struct proc *p)  {  	rctl_qty_t q; @@ -1797,6 +1945,20 @@ zone_lockedmem_kstat_update(kstat_t *ksp, int rw)  }  static int +zone_physmem_kstat_update(kstat_t *ksp, int rw) +{ +	zone_t *zone = ksp->ks_private; +	zone_kstat_t *zk = ksp->ks_data; + +	if (rw == KSTAT_WRITE) +		return (EACCES); + +	zk->zk_usage.value.ui64 = zone->zone_phys_mem; +	zk->zk_value.value.ui64 = zone->zone_phys_mem_ctl; +	return (0); +} + +static int  zone_nprocs_kstat_update(kstat_t *ksp, int rw)  {  	zone_t *zone = ksp->ks_private; @@ -1825,7 +1987,7 @@ zone_swapresv_kstat_update(kstat_t *ksp, int rw)  }  static kstat_t * -zone_kstat_create_common(zone_t *zone, char *name, +zone_rctl_kstat_create_common(zone_t *zone, char *name,      int (*updatefunc) (kstat_t *, int))  {  	kstat_t *ksp; @@ -1850,6 +2012,160 @@ zone_kstat_create_common(zone_t *zone, char *name,  	return (ksp);  } +static int +zone_vfs_kstat_update(kstat_t *ksp, int rw) +{ +	zone_t *zone = ksp->ks_private; +	zone_vfs_kstat_t *zvp = ksp->ks_data; +	kstat_io_t *kiop = &zone->zone_vfs_rwstats; + +	if (rw == KSTAT_WRITE) +		return (EACCES); + +	/* +	 * Extract the VFS statistics from the kstat_io_t structure used by +	 * kstat_runq_enter() and related functions.  Since the slow ops +	 * counters are updated directly by the VFS layer, there's no need to +	 * copy those statistics here. +	 * +	 * Note that kstat_runq_enter() and the related functions use +	 * gethrtime_unscaled(), so scale the time here. 
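The scaling note above reflects a general kstat_io_t convention: the hot path accumulates cheap gethrtime_unscaled() ticks and only the kstat reader converts them to nanoseconds. A minimal sketch of the idiom (illustrative function, not from this patch):

static hrtime_t
measure_ns(void (*op)(void *), void *arg)
{
	hrtime_t delta;

	delta = gethrtime_unscaled();
	op(arg);
	delta = gethrtime_unscaled() - delta;

	/* Convert unscaled ticks to nanoseconds only when reporting. */
	scalehrtime(&delta);
	return (delta);
}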
+	 */ +	zvp->zv_nread.value.ui64 = kiop->nread; +	zvp->zv_reads.value.ui64 = kiop->reads; +	zvp->zv_rtime.value.ui64 = kiop->rtime; +	zvp->zv_rcnt.value.ui64 = kiop->rcnt; +	zvp->zv_rlentime.value.ui64 = kiop->rlentime; +	zvp->zv_nwritten.value.ui64 = kiop->nwritten; +	zvp->zv_writes.value.ui64 = kiop->writes; +	zvp->zv_wtime.value.ui64 = kiop->wtime; +	zvp->zv_wcnt.value.ui64 = kiop->wcnt; +	zvp->zv_wlentime.value.ui64 = kiop->wlentime; + +	scalehrtime((hrtime_t *)&zvp->zv_rtime.value.ui64); +	scalehrtime((hrtime_t *)&zvp->zv_rlentime.value.ui64); +	scalehrtime((hrtime_t *)&zvp->zv_wtime.value.ui64); +	scalehrtime((hrtime_t *)&zvp->zv_wlentime.value.ui64); + +	return (0); +} + +static kstat_t * +zone_vfs_kstat_create(zone_t *zone) +{ +	kstat_t *ksp; +	zone_vfs_kstat_t *zvp; + +	if ((ksp = kstat_create_zone("zone_vfs", zone->zone_id, +	    zone->zone_name, "zone_vfs", KSTAT_TYPE_NAMED, +	    sizeof (zone_vfs_kstat_t) / sizeof (kstat_named_t), +	    KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) +		return (NULL); + +	if (zone->zone_id != GLOBAL_ZONEID) +		kstat_zone_add(ksp, GLOBAL_ZONEID); + +	zvp = ksp->ks_data = kmem_zalloc(sizeof (zone_vfs_kstat_t), KM_SLEEP); +	ksp->ks_data_size += strlen(zone->zone_name) + 1; +	ksp->ks_lock = &zone->zone_vfs_lock; +	zone->zone_vfs_stats = zvp; + +	/* The kstat "name" field is not large enough for a full zonename */ +	kstat_named_init(&zvp->zv_zonename, "zonename", KSTAT_DATA_STRING); +	kstat_named_setstr(&zvp->zv_zonename, zone->zone_name); +	kstat_named_init(&zvp->zv_nread, "nread", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_reads, "reads", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_rtime, "rtime", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_rcnt, "rcnt", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_rlentime, "rlentime", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_nwritten, "nwritten", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_writes, "writes", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_wtime, "wtime", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_wcnt, "wcnt", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_wlentime, "wlentime", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_10ms_ops, "10ms_ops", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_100ms_ops, "100ms_ops", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_1s_ops, "1s_ops", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_10s_ops, "10s_ops", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_delay_cnt, "delay_cnt", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_delay_time, "delay_time", KSTAT_DATA_UINT64); + +	ksp->ks_update = zone_vfs_kstat_update; +	ksp->ks_private = zone; + +	kstat_install(ksp); +	return (ksp); +} + +static int +zone_zfs_kstat_update(kstat_t *ksp, int rw) +{ +	zone_t *zone = ksp->ks_private; +	zone_zfs_kstat_t *zzp = ksp->ks_data; +	kstat_io_t *kiop = &zone->zone_zfs_rwstats; + +	if (rw == KSTAT_WRITE) +		return (EACCES); + +	/* +	 * Extract the ZFS statistics from the kstat_io_t structure used by +	 * kstat_runq_enter() and related functions.  Since the I/O throttle +	 * counters are updated directly by the ZFS layer, there's no need to +	 * copy those statistics here. +	 * +	 * Note that kstat_runq_enter() and the related functions use +	 * gethrtime_unscaled(), so scale the time here. 
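Because zone_vfs and zone_zfs are ordinary named kstats (made visible to the owning zone via kstat_zone_add()), they can be read from userland with libkstat or the kstat(1M) command. A hypothetical reader, assuming only the module and statistic names created above; link with -lkstat:

#include <kstat.h>
#include <stdio.h>

int
main(void)
{
	kstat_ctl_t *kc;
	kstat_t *ksp;
	kstat_named_t *kn;

	if ((kc = kstat_open()) == NULL)
		return (1);

	/* The instance is the zone ID; -1 matches the first one found. */
	if ((ksp = kstat_lookup(kc, "zone_vfs", -1, NULL)) != NULL &&
	    kstat_read(kc, ksp, NULL) != -1 &&
	    (kn = kstat_data_lookup(ksp, "reads")) != NULL) {
		(void) printf("zone %d reads: %llu\n", ksp->ks_instance,
		    (u_longlong_t)kn->value.ui64);
	}

	(void) kstat_close(kc);
	return (0);
}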
+	 */ +	zzp->zz_nread.value.ui64 = kiop->nread; +	zzp->zz_reads.value.ui64 = kiop->reads; +	zzp->zz_rtime.value.ui64 = kiop->rtime; +	zzp->zz_rlentime.value.ui64 = kiop->rlentime; +	zzp->zz_nwritten.value.ui64 = kiop->nwritten; +	zzp->zz_writes.value.ui64 = kiop->writes; + +	scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64); +	scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64); + +	return (0); +} + +static kstat_t * +zone_zfs_kstat_create(zone_t *zone) +{ +	kstat_t *ksp; +	zone_zfs_kstat_t *zzp; + +	if ((ksp = kstat_create_zone("zone_zfs", zone->zone_id, +	    zone->zone_name, "zone_zfs", KSTAT_TYPE_NAMED, +	    sizeof (zone_zfs_kstat_t) / sizeof (kstat_named_t), +	    KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) +		return (NULL); + +	if (zone->zone_id != GLOBAL_ZONEID) +		kstat_zone_add(ksp, GLOBAL_ZONEID); + +	zzp = ksp->ks_data = kmem_zalloc(sizeof (zone_zfs_kstat_t), KM_SLEEP); +	ksp->ks_data_size += strlen(zone->zone_name) + 1; +	ksp->ks_lock = &zone->zone_zfs_lock; +	zone->zone_zfs_stats = zzp; + +	/* The kstat "name" field is not large enough for a full zonename */ +	kstat_named_init(&zzp->zz_zonename, "zonename", KSTAT_DATA_STRING); +	kstat_named_setstr(&zzp->zz_zonename, zone->zone_name); +	kstat_named_init(&zzp->zz_nread, "nread", KSTAT_DATA_UINT64); +	kstat_named_init(&zzp->zz_reads, "reads", KSTAT_DATA_UINT64); +	kstat_named_init(&zzp->zz_rtime, "rtime", KSTAT_DATA_UINT64); +	kstat_named_init(&zzp->zz_rlentime, "rlentime", KSTAT_DATA_UINT64); +	kstat_named_init(&zzp->zz_nwritten, "nwritten", KSTAT_DATA_UINT64); +	kstat_named_init(&zzp->zz_writes, "writes", KSTAT_DATA_UINT64); +	kstat_named_init(&zzp->zz_waittime, "waittime", KSTAT_DATA_UINT64); + +	ksp->ks_update = zone_zfs_kstat_update; +	ksp->ks_private = zone; + +	kstat_install(ksp); +	return (ksp); +}  static int  zone_mcap_kstat_update(kstat_t *ksp, int rw) @@ -1860,11 +2176,19 @@ zone_mcap_kstat_update(kstat_t *ksp, int rw)  	if (rw == KSTAT_WRITE)  		return (EACCES); +	zmp->zm_rss.value.ui64 = zone->zone_phys_mem; +	zmp->zm_phys_cap.value.ui64 = zone->zone_phys_mem_ctl; +	zmp->zm_swap.value.ui64 = zone->zone_max_swap; +	zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl; +	zmp->zm_nover.value.ui64 = zone->zone_mcap_nover; +	zmp->zm_pagedout.value.ui64 = zone->zone_mcap_pagedout;  	zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;  	zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;  	zmp->zm_execpgin.value.ui64 = zone->zone_execpgin;  	zmp->zm_fspgin.value.ui64 = zone->zone_fspgin;  	zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail; +	zmp->zm_pf_throttle.value.ui64 = zone->zone_pf_throttle; +	zmp->zm_pf_throttle_usec.value.ui64 = zone->zone_pf_throttle_usec;  	return (0);  } @@ -1892,12 +2216,22 @@ zone_mcap_kstat_create(zone_t *zone)  	/* The kstat "name" field is not large enough for a full zonename */  	kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);  	kstat_named_setstr(&zmp->zm_zonename, zone->zone_name); +	kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64); +	kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64); +	kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64); +	kstat_named_init(&zmp->zm_swap_cap, "swapcap", KSTAT_DATA_UINT64); +	kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64); +	kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64);  	kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);  	kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);  	kstat_named_init(&zmp->zm_execpgin, "execpgin", 
KSTAT_DATA_UINT64);  	kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64);  	kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail",  	    KSTAT_DATA_UINT64); +	kstat_named_init(&zmp->zm_pf_throttle, "n_pf_throttle", +	    KSTAT_DATA_UINT64); +	kstat_named_init(&zmp->zm_pf_throttle_usec, "n_pf_throttle_usec", +	    KSTAT_DATA_UINT64);  	ksp->ks_update = zone_mcap_kstat_update;  	ksp->ks_private = zone; @@ -1935,6 +2269,8 @@ zone_misc_kstat_update(kstat_t *ksp, int rw)  	zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem;  	zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc; +	zmp->zm_mfseglim.value.ui32 = zone->zone_mfseglim; +  	zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp;  	zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid; @@ -1978,6 +2314,8 @@ zone_misc_kstat_create(zone_t *zone)  	    KSTAT_DATA_UINT32);  	kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32);  	kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32); +	kstat_named_init(&zmp->zm_mfseglim, "mapfail_seglim", +	    KSTAT_DATA_UINT32);  	kstat_named_init(&zmp->zm_nested_intp, "nested_interp",  	    KSTAT_DATA_UINT32);  	kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32); @@ -1993,13 +2331,25 @@ zone_misc_kstat_create(zone_t *zone)  static void  zone_kstat_create(zone_t *zone)  { -	zone->zone_lockedmem_kstat = zone_kstat_create_common(zone, +	zone->zone_lockedmem_kstat = zone_rctl_kstat_create_common(zone,  	    "lockedmem", zone_lockedmem_kstat_update); -	zone->zone_swapresv_kstat = zone_kstat_create_common(zone, +	zone->zone_swapresv_kstat = zone_rctl_kstat_create_common(zone,  	    "swapresv", zone_swapresv_kstat_update); -	zone->zone_nprocs_kstat = zone_kstat_create_common(zone, +	zone->zone_physmem_kstat = zone_rctl_kstat_create_common(zone, +	    "physicalmem", zone_physmem_kstat_update); +	zone->zone_nprocs_kstat = zone_rctl_kstat_create_common(zone,  	    "nprocs", zone_nprocs_kstat_update); +	if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) { +		zone->zone_vfs_stats = kmem_zalloc( +		    sizeof (zone_vfs_kstat_t), KM_SLEEP); +	} + +	if ((zone->zone_zfs_ksp = zone_zfs_kstat_create(zone)) == NULL) { +		zone->zone_zfs_stats = kmem_zalloc( +		    sizeof (zone_zfs_kstat_t), KM_SLEEP); +	} +  	if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {  		zone->zone_mcap_stats = kmem_zalloc(  		    sizeof (zone_mcap_kstat_t), KM_SLEEP); @@ -2031,8 +2381,15 @@ zone_kstat_delete(zone_t *zone)  	    sizeof (zone_kstat_t));  	zone_kstat_delete_common(&zone->zone_swapresv_kstat,  	    sizeof (zone_kstat_t)); +	zone_kstat_delete_common(&zone->zone_physmem_kstat, +	    sizeof (zone_kstat_t));  	zone_kstat_delete_common(&zone->zone_nprocs_kstat,  	    sizeof (zone_kstat_t)); + +	zone_kstat_delete_common(&zone->zone_vfs_ksp, +	    sizeof (zone_vfs_kstat_t)); +	zone_kstat_delete_common(&zone->zone_zfs_ksp, +	    sizeof (zone_zfs_kstat_t));  	zone_kstat_delete_common(&zone->zone_mcap_ksp,  	    sizeof (zone_mcap_kstat_t));  	zone_kstat_delete_common(&zone->zone_misc_ksp, @@ -2070,6 +2427,8 @@ zone_zsd_init(void)  	zone0.zone_locked_mem_ctl = UINT64_MAX;  	ASSERT(zone0.zone_max_swap == 0);  	zone0.zone_max_swap_ctl = UINT64_MAX; +	zone0.zone_phys_mem = 0; +	zone0.zone_phys_mem_ctl = UINT64_MAX;  	zone0.zone_max_lofi = 0;  	zone0.zone_max_lofi_ctl = UINT64_MAX;  	zone0.zone_shmmax = 0; @@ -2094,8 +2453,9 @@ zone_zsd_init(void)  	zone0.zone_initname = initname;  	zone0.zone_lockedmem_kstat = NULL;  	zone0.zone_swapresv_kstat = NULL; +	
zone0.zone_physmem_kstat = NULL;  	zone0.zone_nprocs_kstat = NULL; - +	zone0.zone_zfs_io_pri = 1;  	zone0.zone_stime = 0;  	zone0.zone_utime = 0;  	zone0.zone_wtime = 0; @@ -2206,6 +2566,21 @@ zone_init(void)  	    RCTL_GLOBAL_INFINITE,  	    MAXCAP, MAXCAP, &zone_cpu_cap_ops); +	rc_zone_cpu_baseline = rctl_register("zone.cpu-baseline", +	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | +	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, +	    MAXCAP, MAXCAP, &zone_cpu_base_ops); + +	rc_zone_cpu_burst_time = rctl_register("zone.cpu-burst-time", +	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | +	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, +	    INT_MAX, INT_MAX, &zone_cpu_burst_time_ops); + +	rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority", +	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | +	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, +	    16384, 16384, &zone_zfs_io_pri_ops); +  	rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,  	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,  	    INT_MAX, INT_MAX, &zone_lwps_ops); @@ -2247,6 +2622,20 @@ zone_init(void)  	rde = rctl_dict_lookup("zone.cpu-shares");  	(void) rctl_val_list_insert(&rde->rcd_default_value, dval); +	/* +	 * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach +	 * this at the head of the rctl_dict_entry for ``zone.zfs-io-priority'. +	 */ +	dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP); +	bzero(dval, sizeof (rctl_val_t)); +	dval->rcv_value = 1; +	dval->rcv_privilege = RCPRIV_PRIVILEGED; +	dval->rcv_flagaction = RCTL_LOCAL_NOACTION; +	dval->rcv_action_recip_pid = -1; + +	rde = rctl_dict_lookup("zone.zfs-io-priority"); +	(void) rctl_val_list_insert(&rde->rcd_default_value, dval); +  	rc_zone_locked_mem = rctl_register("zone.max-locked-memory",  	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |  	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, @@ -2257,6 +2646,11 @@ zone_init(void)  	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,  	    &zone_max_swap_ops); +	rc_zone_phys_mem = rctl_register("zone.max-physical-memory", +	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | +	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, +	    &zone_phys_mem_ops); +  	rc_zone_max_lofi = rctl_register("zone.max-lofi",  	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |  	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, @@ -2278,6 +2672,8 @@ zone_init(void)  	zone0.zone_ntasks = 1;  	mutex_exit(&p0.p_lock);  	zone0.zone_restart_init = B_TRUE; +	zone0.zone_reboot_on_init_exit = B_FALSE; +	zone0.zone_init_status = -1;  	zone0.zone_brand = &native_brand;  	rctl_prealloc_destroy(gp);  	/* @@ -2357,6 +2753,8 @@ zone_init(void)  static void  zone_free(zone_t *zone)  { +	zone_dl_t *zdl; +  	ASSERT(zone != global_zone);  	ASSERT(zone->zone_ntasks == 0);  	ASSERT(zone->zone_nlwps == 0); @@ -2385,6 +2783,19 @@ zone_free(zone_t *zone)  	list_destroy(&zone->zone_ref_list);  	zone_free_zsd(zone);  	zone_free_datasets(zone); + +	/* +	 * While dlmgmtd should have removed all of these, it could have left +	 * something behind or crashed. In which case it's not safe for us to +	 * assume that the list is empty which list_destroy() will ASSERT. We +	 * clean up for our userland comrades which may have crashed, or worse, +	 * been disabled by SMF. 
+	 */ +	while ((zdl = list_remove_head(&zone->zone_dl_list)) != NULL) { +		if (zdl->zdl_net != NULL) +			nvlist_free(zdl->zdl_net); +		kmem_free(zdl, sizeof (zone_dl_t)); +	}  	list_destroy(&zone->zone_dl_list);  	if (zone->zone_rootvp != NULL) @@ -2429,12 +2840,18 @@ zone_free(zone_t *zone)  static void  zone_status_set(zone_t *zone, zone_status_t status)  { +	timestruc_t now; +	uint64_t t;  	nvlist_t *nvl = NULL;  	ASSERT(MUTEX_HELD(&zone_status_lock));  	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&  	    status >= zone_status_get(zone)); +	/* Current time since Jan 1 1970 but consumers expect NS */ +	gethrestime(&now); +	t = (now.tv_sec * NANOSEC) + now.tv_nsec; +  	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||  	    nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||  	    nvlist_add_string(nvl, ZONE_CB_NEWSTATE, @@ -2442,7 +2859,7 @@ zone_status_set(zone_t *zone, zone_status_t status)  	    nvlist_add_string(nvl, ZONE_CB_OLDSTATE,  	    zone_status_table[zone->zone_status]) ||  	    nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) || -	    nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) || +	    nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, t) ||  	    sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,  	    ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {  #ifdef DEBUG @@ -2520,9 +2937,14 @@ zone_set_brand(zone_t *zone, const char *brand)  		return (EINVAL);  	} -	/* set up the brand specific data */ +	/* +	 * Set up the brand specific data. +	 * Note that it's possible that the hook has to drop the +	 * zone_status_lock and reaquire it before returning so we can't +	 * assume the lock has been held the entire time. +	 */  	zone->zone_brand = bp; -	ZBROP(zone)->b_init_brand_data(zone); +	ZBROP(zone)->b_init_brand_data(zone, &zone_status_lock);  	mutex_exit(&zone_status_lock);  	return (0); @@ -2594,14 +3016,65 @@ zone_set_initname(zone_t *zone, const char *zone_initname)  	return (0);  } +/* + * The zone_set_mcap_nover and zone_set_mcap_pageout functions are used + * to provide the physical memory capping kstats.  Since physical memory + * capping is currently implemented in userland, that code uses the setattr + * entry point to increment the kstats.  We always simply increment nover + * every time that setattr is called and we always add in the input value + * to zone_mcap_pagedout every time that is called. + */ +/*ARGSUSED*/  static int -zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap) +zone_set_mcap_nover(zone_t *zone, const uint64_t *zone_nover)  { -	uint64_t mcap; -	int err = 0; +	zone->zone_mcap_nover++; + +	return (0); +} + +static int +zone_set_mcap_pageout(zone_t *zone, const uint64_t *zone_pageout) +{ +	uint64_t pageout; +	int err; + +	if ((err = copyin(zone_pageout, &pageout, sizeof (uint64_t))) == 0) +		zone->zone_mcap_pagedout += pageout; + +	return (err); +} + +/* + * The zone_set_page_fault_delay function is used to set the number of usecs + * to throttle page faults.  This is normally 0 but can be set to a non-0 value + * by the user-land memory capping code when the zone is over its physcial + * memory cap. 
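Because the capping logic itself runs in userland, these handlers are only reachable through the zone_setattr(2) entry point. A hypothetical sketch of how a userland memory capper might report a pageout pass and set the fault throttle; the ZONE_ATTR_* names come from this patch, while the libc wrapper usage and the values shown are illustrative:

#include <sys/types.h>
#include <zone.h>

/* Report pages paged out and throttle page faults by 50 usec. */
void
mcap_report(zoneid_t zid, uint64_t pagedout)
{
	uint32_t delay_usec = 50;

	(void) zone_setattr(zid, ZONE_ATTR_PMCAP_PAGEOUT, &pagedout,
	    sizeof (pagedout));
	(void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY, &delay_usec,
	    sizeof (delay_usec));
}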
+ */ +static int +zone_set_page_fault_delay(zone_t *zone, const uint32_t *pfdelay) +{ +	uint32_t dusec; +	int err; + +	if ((err = copyin(pfdelay, &dusec, sizeof (uint32_t))) == 0) +		zone->zone_pg_flt_delay = dusec; + +	return (err); +} -	if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0) -		zone->zone_phys_mcap = mcap; +/* + * The zone_set_rss function is used to set the zone's RSS when we do the + * fast, approximate calculation in user-land. + */ +static int +zone_set_rss(zone_t *zone, const uint64_t *prss) +{ +	uint64_t rss; +	int err; + +	if ((err = copyin(prss, &rss, sizeof (uint64_t))) == 0) +		zone->zone_phys_mem = rss;  	return (err);  } @@ -3013,6 +3486,12 @@ getzoneid(void)  	return (curproc->p_zone->zone_id);  } +zoneid_t +getzonedid(void) +{ +	return (curproc->p_zone->zone_did); +} +  /*   * Internal versions of zone_find_by_*().  These don't zone_hold() or   * check the validity of a zone's state. @@ -3756,6 +4235,17 @@ zone_start_init(void)  	 */  	z->zone_proc_initpid = p->p_pid; +	if (z->zone_setup_app_contract == B_TRUE) { +		/* +		 * Normally a process cannot modify its own contract, but we're +		 * just starting the zone's init process and its contract is +		 * always initialized from the sys_process_tmpl template, so +		 * this is the simplest way to setup init's contract to kill +		 * the process if any other process in the contract exits. +		 */ +		p->p_ct_process->conp_ev_fatal |= CT_PR_EV_EXIT; +	} +  	/*  	 * We maintain zone_boot_err so that we can return the cause of the  	 * failure back to the caller of the zone_boot syscall. @@ -3784,9 +4274,54 @@ zone_start_init(void)  			lwp_exit();  		}  	} else { +		id_t cid = curthread->t_cid; +  		if (zone_status_get(z) == ZONE_IS_BOOTING)  			zone_status_set(z, ZONE_IS_RUNNING);  		mutex_exit(&zone_status_lock); + +		mutex_enter(&class_lock); +		ASSERT(cid < loaded_classes); +		if (strcmp(sclass[cid].cl_name, "FX") == 0 && +		    z->zone_fixed_hipri) { +			/* +			 * If the zone is using FX then by default all +			 * processes start at the lowest priority and stay +			 * there. We provide a mechanism for the zone to +			 * indicate that it should run at "high priority". In +			 * this case we setup init to run at the highest FX +			 * priority (which is one level higher than the +			 * non-fixed scheduling classes can use). +			 */ +			pcparms_t pcparms; + +			pcparms.pc_cid = cid; +			((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI; +			((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim = +			    FXMAXUPRI; +			((fxkparms_t *)pcparms.pc_clparms)->fx_cflags = +			    FX_DOUPRILIM | FX_DOUPRI; + +			mutex_enter(&pidlock); +			mutex_enter(&curproc->p_lock); + +			(void) parmsset(&pcparms, curthread); + +			mutex_exit(&curproc->p_lock); +			mutex_exit(&pidlock); +		} else if (strcmp(sclass[cid].cl_name, "RT") == 0) { +			/* +			 * zsched always starts the init lwp at priority +			 * minclsyspri - 1. This priority gets set in t_pri and +			 * is invalid for RT, but RT never uses t_pri. However +			 * t_pri is used by procfs, so we always see processes +			 * within an RT zone with an invalid priority value. +			 * We fix that up now. +			 */ +			curthread->t_pri = RTGPPRIO0; +		} +		mutex_exit(&class_lock); +  		/* cause the process to return to userland. 
*/  		lwp_rtt();  	} @@ -4272,8 +4807,9 @@ parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)  		error = EINVAL;  		name = nvpair_name(nvp); -		if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1) -		    != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) { +		if ((strncmp(name, "zone.", sizeof ("zone.") - 1) != 0 && +		    strncmp(name, "project.", sizeof ("project.") - 1) != 0) || +		    nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {  			goto out;  		}  		if ((hndl = rctl_hndl_lookup(name)) == -1) { @@ -4392,7 +4928,7 @@ zone_create(const char *zone_name, const char *zone_root,      caddr_t rctlbuf, size_t rctlbufsz,      caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,      int match, uint32_t doi, const bslabel_t *label, -    int flags) +    int flags, zoneid_t zone_did)  {  	struct zsched_arg zarg;  	nvlist_t *rctls = NULL; @@ -4464,6 +5000,7 @@ zone_create(const char *zone_name, const char *zone_root,  	zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);  	zone->zone_id = zoneid; +	zone->zone_did = zone_did;  	zone->zone_status = ZONE_IS_UNINITIALIZED;  	zone->zone_pool = pool_default;  	zone->zone_pool_mod = gethrtime(); @@ -4471,6 +5008,8 @@ zone_create(const char *zone_name, const char *zone_root,  	zone->zone_ncpus = 0;  	zone->zone_ncpus_online = 0;  	zone->zone_restart_init = B_TRUE; +	zone->zone_reboot_on_init_exit = B_FALSE; +	zone->zone_init_status = -1;  	zone->zone_brand = &native_brand;  	zone->zone_initname = NULL;  	mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL); @@ -4538,10 +5077,14 @@ zone_create(const char *zone_name, const char *zone_root,  	zone->zone_locked_mem_ctl = UINT64_MAX;  	zone->zone_max_swap = 0;  	zone->zone_max_swap_ctl = UINT64_MAX; +	zone->zone_phys_mem = 0; +	zone->zone_phys_mem_ctl = UINT64_MAX;  	zone->zone_max_lofi = 0;  	zone->zone_max_lofi_ctl = UINT64_MAX; -	zone0.zone_lockedmem_kstat = NULL; -	zone0.zone_swapresv_kstat = NULL; +	zone->zone_lockedmem_kstat = NULL; +	zone->zone_swapresv_kstat = NULL; +	zone->zone_physmem_kstat = NULL; +	zone->zone_zfs_io_pri = 1;  	/*  	 * Zsched initializes the rctls. @@ -4696,8 +5239,8 @@ zone_create(const char *zone_name, const char *zone_root,  	/*  	 * The process, task, and project rctls are probably wrong;  	 * we need an interface to get the default values of all rctls, -	 * and initialize zsched appropriately.  I'm not sure that that -	 * makes much of a difference, though. +	 * and initialize zsched appropriately. However, we allow zoneadmd +	 * to pass down both zone and project rctls for the zone's init.  	 
*/  	error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);  	if (error != 0) { @@ -4836,6 +5379,7 @@ zone_boot(zoneid_t zoneid)  static int  zone_empty(zone_t *zone)  { +	int cnt = 0;  	int waitstatus;  	/* @@ -4846,7 +5390,16 @@ zone_empty(zone_t *zone)  	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));  	while ((waitstatus = zone_status_timedwait_sig(zone,  	    ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) { -		killall(zone->zone_id); +		boolean_t force = B_FALSE; + +		/* Every 30 seconds, try harder */ +		if (cnt++ >= 30) { +			cmn_err(CE_WARN, "attempt to force kill zone %d\n", +			    zone->zone_id); +			force = B_TRUE; +			cnt = 0; +		} +		killall(zone->zone_id, force);  	}  	/*  	 * return EINTR if we were signaled @@ -5597,14 +6150,6 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)  				error = EFAULT;  		}  		break; -	case ZONE_ATTR_PHYS_MCAP: -		size = sizeof (zone->zone_phys_mcap); -		if (bufsize > size) -			bufsize = size; -		if (buf != NULL && -		    copyout(&zone->zone_phys_mcap, buf, bufsize) != 0) -			error = EFAULT; -		break;  	case ZONE_ATTR_SCHED_CLASS:  		mutex_enter(&class_lock); @@ -5666,6 +6211,23 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)  		}  		kmem_free(zbuf, bufsize);  		break; +	case ZONE_ATTR_DID: +		size = sizeof (zoneid_t); +		if (bufsize > size) +			bufsize = size; + +		if (buf != NULL && copyout(&zone->zone_did, buf, bufsize) != 0) +			error = EFAULT; +		break; +	case ZONE_ATTR_SCHED_FIXEDHI: +		size = sizeof (boolean_t); +		if (bufsize > size) +			bufsize = size; + +		if (buf != NULL && copyout(&zone->zone_fixed_hipri, buf, +		    bufsize) != 0) +			error = EFAULT; +		break;  	default:  		if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {  			size = bufsize; @@ -5697,10 +6259,11 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)  		return (set_errno(EPERM));  	/* -	 * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the -	 * global zone. +	 * Only the ZONE_ATTR_PMCAP_NOVER and ZONE_ATTR_PMCAP_PAGEOUT +	 * attributes can be set on the global zone.  	 */ -	if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) { +	if (zoneid == GLOBAL_ZONEID && +	    attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT) {  		return (set_errno(EINVAL));  	} @@ -5717,7 +6280,9 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)  	 * non-global zones.  	 
*/  	zone_status = zone_status_get(zone); -	if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) { +	if (attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT && +	    attr != ZONE_ATTR_PG_FLT_DELAY && attr != ZONE_ATTR_RSS && +	    zone_status > ZONE_IS_READY) {  		err = EINVAL;  		goto done;  	} @@ -5739,12 +6304,21 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)  	case ZONE_ATTR_FS_ALLOWED:  		err = zone_set_fs_allowed(zone, (const char *)buf);  		break; +	case ZONE_ATTR_PMCAP_NOVER: +		err = zone_set_mcap_nover(zone, (const uint64_t *)buf); +		break; +	case ZONE_ATTR_PMCAP_PAGEOUT: +		err = zone_set_mcap_pageout(zone, (const uint64_t *)buf); +		break; +	case ZONE_ATTR_PG_FLT_DELAY: +		err = zone_set_page_fault_delay(zone, (const uint32_t *)buf); +		break; +	case ZONE_ATTR_RSS: +		err = zone_set_rss(zone, (const uint64_t *)buf); +		break;  	case ZONE_ATTR_SECFLAGS:  		err = zone_set_secflags(zone, (psecflags_t *)buf);  		break; -	case ZONE_ATTR_PHYS_MCAP: -		err = zone_set_phys_mcap(zone, (const uint64_t *)buf); -		break;  	case ZONE_ATTR_SCHED_CLASS:  		err = zone_set_sched_class(zone, (const char *)buf);  		break; @@ -5772,6 +6346,22 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)  		err = zone_set_network(zoneid, zbuf);  		kmem_free(zbuf, bufsize);  		break; +	case ZONE_ATTR_APP_SVC_CT: +		if (bufsize != sizeof (boolean_t)) { +			err = EINVAL; +		} else { +			zone->zone_setup_app_contract = (boolean_t)buf; +			err = 0; +		} +		break; +	case ZONE_ATTR_SCHED_FIXEDHI: +		if (bufsize != sizeof (boolean_t)) { +			err = EINVAL; +		} else { +			zone->zone_fixed_hipri = (boolean_t)buf; +			err = 0; +		} +		break;  	default:  		if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))  			err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize); @@ -6475,6 +7065,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)  			zs.doi = zs32.doi;  			zs.label = (const bslabel_t *)(uintptr_t)zs32.label;  			zs.flags = zs32.flags; +			zs.zoneid = zs32.zoneid;  #else  			panic("get_udatamodel() returned bogus result\n");  #endif @@ -6485,7 +7076,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)  		    (caddr_t)zs.rctlbuf, zs.rctlbufsz,  		    (caddr_t)zs.zfsbuf, zs.zfsbufsz,  		    zs.extended_error, zs.match, zs.doi, -		    zs.label, zs.flags)); +		    zs.label, zs.flags, zs.zoneid));  	case ZONE_BOOT:  		return (zone_boot((zoneid_t)(uintptr_t)arg1));  	case ZONE_DESTROY: @@ -6586,6 +7177,7 @@ zone_ki_call_zoneadmd(struct zarg *zargp)  	bcopy(zone->zone_name, zone_name, zone_namelen);  	zoneid = zone->zone_id;  	uniqid = zone->zone_uniqid; +	arg.status = zone->zone_init_status;  	/*  	 * zoneadmd may be down, but at least we can empty out the zone.  	 * We can ignore the return value of zone_empty() since we're called @@ -6763,7 +7355,7 @@ zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)  	 * zone_ki_call_zoneadmd() will do a more thorough job of this  	 * later.  	 */ -	killall(zone->zone_id); +	killall(zone->zone_id, B_FALSE);  	/*  	 * Now, create the thread to contact zoneadmd and do the rest of the  	 * work.  This thread can't be created in our zone otherwise @@ -6826,16 +7418,15 @@ zone_shutdown_global(void)  }  /* - * Returns true if the named dataset is visible in the current zone. + * Returns true if the named dataset is visible in the specified zone.   * The 'write' parameter is set to 1 if the dataset is also writable.   
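Splitting out zone_dataset_visible_inzone() lets a caller ask the question about a zone other than its own; the existing zone_dataset_visible() below becomes a thin wrapper that passes curproc->p_zone. A hypothetical in-kernel caller, assuming the usual zone_find_by_id()/zone_rele() hold protocol:

static boolean_t
dataset_writable_in_zone(zoneid_t zid, const char *dataset)
{
	zone_t *zone;
	int write = 0;
	int visible;

	if ((zone = zone_find_by_id(zid)) == NULL)
		return (B_FALSE);

	visible = zone_dataset_visible_inzone(zone, dataset, &write);
	zone_rele(zone);

	return ((visible != 0 && write != 0) ? B_TRUE : B_FALSE);
}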
*/  int -zone_dataset_visible(const char *dataset, int *write) +zone_dataset_visible_inzone(zone_t *zone, const char *dataset, int *write)  {  	static int zfstype = -1;  	zone_dataset_t *zd;  	size_t len; -	zone_t *zone = curproc->p_zone;  	const char *name = NULL;  	vfs_t *vfsp = NULL; @@ -6903,7 +7494,8 @@ zone_dataset_visible(const char *dataset, int *write)  	vfs_list_read_lock();  	vfsp = zone->zone_vfslist;  	do { -		ASSERT(vfsp); +		if (vfsp == NULL) +			break;  		if (vfsp->vfs_fstype == zfstype) {  			name = refstr_value(vfsp->vfs_resource); @@ -6940,6 +7532,18 @@ zone_dataset_visible(const char *dataset, int *write)  }  /* + * Returns true if the named dataset is visible in the current zone. + * The 'write' parameter is set to 1 if the dataset is also writable. + */ +int +zone_dataset_visible(const char *dataset, int *write) +{ +	zone_t *zone = curproc->p_zone; + +	return (zone_dataset_visible_inzone(zone, dataset, write)); +} + +/*   * zone_find_by_any_path() -   *   * kernel-private routine similar to zone_find_by_path(), but which @@ -7164,6 +7768,13 @@ zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)  	mutex_exit(&zone->zone_lock);  	zone_rele(zone); +	/* +	 * Prevent returning negative nump values -- we should never +	 * have this many links anyways. +	 */ +	if (num > INT_MAX) +		return (set_errno(EOVERFLOW)); +  	/* Increased or decreased, caller should be notified. */  	if (num != dlcount) {  		if (copyout(&num, nump, sizeof (num)) != 0) | 
