diff options
Diffstat (limited to 'usr/src/uts/common/exec/elf')
-rw-r--r--  usr/src/uts/common/exec/elf/elf.c       | 473
-rw-r--r--  usr/src/uts/common/exec/elf/elf_notes.c |   2
2 files changed, 355 insertions, 120 deletions
diff --git a/usr/src/uts/common/exec/elf/elf.c b/usr/src/uts/common/exec/elf/elf.c index 73d302aaa5..53bbd078ba 100644 --- a/usr/src/uts/common/exec/elf/elf.c +++ b/usr/src/uts/common/exec/elf/elf.c @@ -26,7 +26,7 @@ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ /* - * Copyright 2019, Joyent, Inc. + * Copyright 2019 Joyent, Inc. * Copyright 2022 Oxide Computer Company */ @@ -94,7 +94,6 @@ static int mapelfexec(vnode_t *, Ehdr *, uint_t, caddr_t, Phdr **, Phdr **, Phdr **, Phdr **, Phdr *, caddr_t *, caddr_t *, intptr_t *, uintptr_t *, size_t, size_t *, size_t *); - #ifdef _ELF32_COMPAT /* Link against the non-compat instances when compiling the 32-bit version. */ extern size_t elf_datasz_max; @@ -181,12 +180,16 @@ elf_ctx_resize_scratch(elf_core_ctx_t *ctx, size_t sz) #endif /* _ELF32_COMPAT */ /* - * Map in the executable pointed to by vp. Returns 0 on success. + * Map in the executable pointed to by vp. Returns 0 on success. Note that + * this function currently has the maximum number of arguments allowed by + * modstubs on x86 (MAXNARG)! Do _not_ add to this function signature without + * adding to MAXNARG. (Better yet, do not add to this monster of a function + * signature!) 
*/ int mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr, - intptr_t *voffset, caddr_t exec_file, int *interp, caddr_t *bssbase, - caddr_t *brkbase, size_t *brksize, uintptr_t *lddatap) + intptr_t *voffset, caddr_t exec_file, char **interpp, caddr_t *bssbase, + caddr_t *brkbase, size_t *brksize, uintptr_t *lddatap, uintptr_t *minaddrp) { size_t len, phdrsize; struct vattr vat; @@ -197,12 +200,16 @@ mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr, Phdr *junk = NULL; Phdr *dynphdr = NULL; Phdr *dtrphdr = NULL; + char *interp = NULL; uintptr_t lddata, minaddr; size_t execsz; if (lddatap != NULL) *lddatap = 0; + if (minaddrp != NULL) + *minaddrp = (uintptr_t)NULL; + if (error = execpermissions(vp, &vat, args)) { uprintf("%s: Cannot execute %s\n", exec_file, args->pathname); return (error); @@ -234,24 +241,85 @@ mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr, return (error); } + if (minaddrp != NULL) + *minaddrp = minaddr; + /* - * Inform our caller if the executable needs an interpreter. + * If the executable requires an interpreter, determine its name. */ - *interp = (dynphdr == NULL) ? 0 : 1; + if (dynphdr != NULL) { + ssize_t resid; + + if (dynphdr->p_filesz > MAXPATHLEN || dynphdr->p_filesz == 0) { + uprintf("%s: Invalid interpreter\n", exec_file); + kmem_free(phdrbase, phdrsize); + return (ENOEXEC); + } + + interp = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + if ((error = vn_rdwr(UIO_READ, vp, interp, + (ssize_t)dynphdr->p_filesz, + (offset_t)dynphdr->p_offset, UIO_SYSSPACE, 0, + (rlim64_t)0, CRED(), &resid)) != 0 || resid != 0 || + interp[dynphdr->p_filesz - 1] != '\0') { + uprintf("%s: Cannot obtain interpreter pathname\n", + exec_file); + kmem_free(interp, MAXPATHLEN); + kmem_free(phdrbase, phdrsize); + return (error != 0 ? 
error : ENOEXEC); + } + } /* * If this is a statically linked executable, voffset should indicate * the address of the executable itself (it normally holds the address * of the interpreter). */ - if (ehdr->e_type == ET_EXEC && *interp == 0) + if (ehdr->e_type == ET_EXEC && interp == NULL) *voffset = minaddr; + /* + * If the caller has asked for the interpreter name, return it (it's + * up to the caller to free it); if the caller hasn't asked for it, + * free it ourselves. + */ + if (interpp != NULL) { + *interpp = interp; + } else if (interp != NULL) { + kmem_free(interp, MAXPATHLEN); + } + if (uphdr != NULL) { *uphdr_vaddr = uphdr->p_vaddr; if (uphdr->p_flags == 0) kmem_free(uphdr, sizeof (Phdr)); + } else if (ehdr->e_type == ET_DYN) { + /* + * If we don't have a uphdr, we'll apply the logic found + * in mapelfexec() and use the p_vaddr of the first PT_LOAD + * section as the base address of the object. + */ + const Phdr *phdr = (Phdr *)phdrbase; + const uint_t hsize = ehdr->e_phentsize; + uint_t i; + + for (i = nphdrs; i > 0; i--) { + if (phdr->p_type == PT_LOAD) { + *uphdr_vaddr = (uintptr_t)phdr->p_vaddr + + ehdr->e_phoff; + break; + } + + phdr = (Phdr *)((caddr_t)phdr + hsize); + } + + /* + * If we don't have a PT_LOAD segment, we should have returned + * ENOEXEC when elfsize() returned 0, above. 
+ */ + VERIFY(i > 0); } else { *uphdr_vaddr = (Addr)-1; } @@ -263,13 +331,13 @@ mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr, int elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, int level, size_t *execsz, int setid, caddr_t exec_file, cred_t *cred, - int brand_action) + int *brand_action) { caddr_t phdrbase = NULL; caddr_t bssbase = 0; caddr_t brkbase = 0; size_t brksize = 0; - size_t dlnsize; + size_t dlnsize, nsize = 0; aux_entry_t *aux; int error; ssize_t resid; @@ -349,7 +417,9 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS32-1); } else { args->to_model = DATAMODEL_LP64; - args->stk_prot &= ~PROT_EXEC; + if (!args->stk_prot_override) { + args->stk_prot &= ~PROT_EXEC; + } #if defined(__x86) args->dat_prot &= ~PROT_EXEC; #endif @@ -361,11 +431,25 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, #endif /* _LP64 */ /* - * We delay invoking the brand callback until we've figured out - * what kind of elf binary we're trying to run, 32-bit or 64-bit. - * We do this because now the brand library can just check - * args->to_model to see if the target is 32-bit or 64-bit without - * having do duplicate all the code above. + * We delay invoking the brand callback until we've figured out what + * kind of elf binary we're trying to run, 32-bit or 64-bit. We do this + * because now the brand library can just check args->to_model to see if + * the target is 32-bit or 64-bit without having do duplicate all the + * code above. + * + * We also give the brand a chance to indicate that based on the ELF + * OSABI of the target binary it should become unbranded and optionally + * indicate that it should be treated as existing in a specific prefix. + * + * Note that if a brand opts to go down this route it does not actually + * end up being debranded. 
In other words, future programs that exec + * will still be considered for branding unless this escape hatch is + * used. Consider the case of lx brand for example. If a user runs + * /native/usr/sbin/dtrace -c /bin/ls, the isaexec and normal executable + * of DTrace that's in /native will take this escape hatch and be run + * and interpreted using the normal system call table; however, the + * execution of a non-illumos binary in the form of /bin/ls will still + * be branded and be subject to all of the normal actions of the brand. * * The level checks associated with brand handling below are used to * prevent a loop since the brand elfexec function typically comes back @@ -373,8 +457,20 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, * handling in the #! interpreter code will increment the level before * calling gexec to run the final elfexec interpreter. */ + if ((level <= INTP_MAXDEPTH) && (*brand_action != EBA_NATIVE) && + (PROC_IS_BRANDED(p)) && (BROP(p)->b_native_exec != NULL)) { + if (BROP(p)->b_native_exec(ehdrp->e_ident[EI_OSABI], + &args->brand_nroot) == B_TRUE) { + ASSERT(ehdrp->e_ident[EI_OSABI]); + *brand_action = EBA_NATIVE; + /* Add one for the trailing '/' in the path */ + if (args->brand_nroot != NULL) + nsize = strlen(args->brand_nroot) + 1; + } + } + if ((level <= INTP_MAXDEPTH) && - (brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) { + (*brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) { error = BROP(p)->b_elfexec(vp, uap, args, idatap, level + 1, execsz, setid, exec_file, cred, brand_action); @@ -448,6 +544,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, * AT_BASE * AT_FLAGS * AT_PAGESZ + * AT_RANDOM (added in stk_copyout) * AT_SUN_AUXFLAGS * AT_SUN_HWCAP * AT_SUN_HWCAP2 @@ -456,7 +553,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, * AT_SUN_EXECNAME (added in stk_copyout) * AT_NULL * - * total == 10 + * total == 11 */ if (hasintp && hasu) { /* @@ -471,7 +568,7 
@@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, * * total = 5 */ - args->auxsize = (10 + 5) * sizeof (aux_entry_t); + args->auxsize = (11 + 5) * sizeof (aux_entry_t); } else if (hasintp) { /* * Has PT_INTERP but no PT_PHDR @@ -481,9 +578,9 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, * * total = 2 */ - args->auxsize = (10 + 2) * sizeof (aux_entry_t); + args->auxsize = (11 + 2) * sizeof (aux_entry_t); } else { - args->auxsize = 10 * sizeof (aux_entry_t); + args->auxsize = 11 * sizeof (aux_entry_t); } } else { args->auxsize = 0; @@ -497,6 +594,15 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, args->auxsize += sizeof (aux_entry_t); /* + * If this is a native binary that's been given a modified interpreter + * root, inform it that the native system exists at that root. + */ + if (args->brand_nroot != NULL) { + args->auxsize += sizeof (aux_entry_t); + } + + + /* * On supported kernels (x86_64) make room in the auxv for the * AT_SUN_COMMPAGE entry. This will go unpopulated on i86xpv systems * which do not provide such functionality. @@ -508,13 +614,24 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, args->auxsize += 3 * sizeof (aux_entry_t); #endif /* defined(__amd64) */ - if ((brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) { + /* + * If we have user credentials, we'll supply the following entries: + * AT_SUN_UID + * AT_SUN_RUID + * AT_SUN_GID + * AT_SUN_RGID + */ + if (cred != NULL) { + args->auxsize += 4 * sizeof (aux_entry_t); + } + + if ((*brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) { branded = 1; /* - * We will be adding 4 entries to the aux vectors. One for - * the the brandname and 3 for the brand specific aux vectors. + * We will be adding 5 entries to the aux vectors. One for + * the brandname and 4 for the brand specific aux vectors. 
*/ - args->auxsize += 4 * sizeof (aux_entry_t); + args->auxsize += 5 * sizeof (aux_entry_t); } /* If the binary has an explicit ASLR flag, it must be honoured */ @@ -595,7 +712,8 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, aux = bigwad->elfargs; /* * Move args to the user's stack. - * This can fill in the AT_SUN_PLATFORM and AT_SUN_EXECNAME aux entries. + * This can fill in the AT_SUN_PLATFORM, AT_SUN_EXECNAME and AT_RANDOM + * aux entries. */ if ((error = exec_args(uap, args, idatap, (void **)&aux)) != 0) { if (error == -1) { @@ -645,7 +763,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, char *p; struct vnode *nvp; - dlnsize = intphdr->p_filesz; + dlnsize = intphdr->p_filesz + nsize; /* * Make sure none of the component pieces of dlnsize result in @@ -656,10 +774,15 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, goto bad; } + if (nsize != 0) { + bcopy(args->brand_nroot, dlnp, nsize - 1); + dlnp[nsize - 1] = '/'; + } + /* * Read in "interpreter" pathname. */ - if ((error = vn_rdwr(UIO_READ, vp, dlnp, + if ((error = vn_rdwr(UIO_READ, vp, dlnp + nsize, (ssize_t)intphdr->p_filesz, (offset_t)intphdr->p_offset, UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), &resid)) != 0) { uprintf("%s: Cannot obtain interpreter pathname\n", @@ -842,8 +965,8 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, #endif /* defined(__amd64) */ /* - * Note: AT_SUN_PLATFORM and AT_SUN_EXECNAME were filled in via - * exec_args() + * Note: AT_SUN_PLATFORM, AT_SUN_EXECNAME and AT_RANDOM were + * filled in via exec_args() */ ADDAUX(aux, AT_BASE, voffset) ADDAUX(aux, AT_FLAGS, at_flags) @@ -871,7 +994,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, * malicious user within the zone from crafting a wrapper to * run native suid commands with unsecure libraries interposed. 
*/ - if ((brand_action == EBA_NATIVE) && (PROC_IS_BRANDED(p) && + if ((*brand_action == EBA_NATIVE) && (PROC_IS_BRANDED(p) && (setid &= ~EXECSETID_SETID) != 0)) auxf &= ~AF_SUN_SETUGID; @@ -886,6 +1009,17 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, ADDAUX(aux, AT_SUN_AUXFLAGS, auxf); /* + * Record information about the real and effective user and + * group IDs. + */ + if (cred != NULL) { + ADDAUX(aux, AT_SUN_UID, crgetuid(cred)); + ADDAUX(aux, AT_SUN_RUID, crgetruid(cred)); + ADDAUX(aux, AT_SUN_GID, crgetgid(cred)); + ADDAUX(aux, AT_SUN_RGID, crgetrgid(cred)); + } + + /* * Hardware capability flag word (performance hints) * Used for choosing faster library routines. * (Potentially different between 32-bit and 64-bit ABIs) @@ -912,6 +1046,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, ADDAUX(aux, AT_SUN_BRAND_AUX1, 0) ADDAUX(aux, AT_SUN_BRAND_AUX2, 0) ADDAUX(aux, AT_SUN_BRAND_AUX3, 0) + ADDAUX(aux, AT_SUN_BRAND_AUX4, 0) } /* @@ -1119,10 +1254,10 @@ getelfhead(vnode_t *vp, cred_t *credp, Ehdr *ehdr, uint_t *nshdrs, * We got here by the first two bytes in ident, * now read the entire ELF header. 
*/ - if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)ehdr, - sizeof (Ehdr), (offset_t)0, UIO_SYSSPACE, 0, - (rlim64_t)0, credp, &resid)) != 0) + if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)ehdr, sizeof (Ehdr), + (offset_t)0, UIO_SYSSPACE, 0, (rlim64_t)0, credp, &resid)) != 0) { return (error); + } /* * Since a separate version is compiled for handling 32-bit and @@ -1131,8 +1266,9 @@ getelfhead(vnode_t *vp, cred_t *credp, Ehdr *ehdr, uint_t *nshdrs, */ if (resid != 0 || ehdr->e_ident[EI_MAG2] != ELFMAG2 || - ehdr->e_ident[EI_MAG3] != ELFMAG3) + ehdr->e_ident[EI_MAG3] != ELFMAG3) { return (ENOEXEC); + } if ((ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) || #if defined(_ILP32) || defined(_ELF32_COMPAT) @@ -1141,8 +1277,9 @@ getelfhead(vnode_t *vp, cred_t *credp, Ehdr *ehdr, uint_t *nshdrs, ehdr->e_ident[EI_CLASS] != ELFCLASS64 || #endif !elfheadcheck(ehdr->e_ident[EI_DATA], ehdr->e_machine, - ehdr->e_flags)) + ehdr->e_flags)) { return (EINVAL); + } *nshdrs = ehdr->e_shnum; *shstrndx = ehdr->e_shstrndx; @@ -1162,9 +1299,8 @@ getelfhead(vnode_t *vp, cred_t *credp, Ehdr *ehdr, uint_t *nshdrs, if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)&shdr, sizeof (shdr), (offset_t)ehdr->e_shoff, UIO_SYSSPACE, 0, - (rlim64_t)0, credp, NULL)) != 0) { + (rlim64_t)0, credp, NULL)) != 0) return (error); - } if (*nshdrs == 0) *nshdrs = shdr.sh_size; @@ -1335,7 +1471,7 @@ mapelfexec( size_t *brksize) { Phdr *phdr; - int error, page, prot; + int error, page, prot, lastprot = 0; caddr_t addr = NULL; caddr_t minaddr = (caddr_t)UINTPTR_MAX; uint_t i; @@ -1343,9 +1479,11 @@ mapelfexec( boolean_t ptload = B_FALSE; off_t offset; const uint_t hsize = ehdr->e_phentsize; + uintptr_t lastaddr = 0; extern int use_brk_lpg; if (ehdr->e_type == ET_DYN) { + caddr_t vaddr; secflagset_t flags = 0; /* * Obtain the virtual address of a hole in the @@ -1357,23 +1495,65 @@ mapelfexec( map_addr(&addr, len, (offset_t)0, 1, flags); if (addr == NULL) return (ENOMEM); - *voffset = (intptr_t)addr; /* - * Calculate 
the minimum vaddr so it can be subtracted out. - * According to the ELF specification, since PT_LOAD sections - * must be sorted by increasing p_vaddr values, this is - * guaranteed to be the first PT_LOAD section. + * Despite the fact that mmapobj(2) refuses to load them, we + * need to support executing ET_DYN objects that have a + * non-NULL p_vaddr. When found in the wild, these objects + * are likely to be due to an old (and largely obviated) Linux + * facility, prelink(8), that rewrites shared objects to + * prefer specific (disjoint) virtual address ranges. (Yes, + * this is putatively for performance -- and yes, it has + * limited applicability, many edge conditions and grisly + * failure modes; even for Linux, it's insane.) As ELF + * mandates that the PT_LOAD segments be in p_vaddr order, we + * find the lowest p_vaddr by finding the first PT_LOAD + * segment. */ phdr = (Phdr *)phdrbase; for (i = nphdrs; i > 0; i--) { if (phdr->p_type == PT_LOAD) { - *voffset -= (uintptr_t)phdr->p_vaddr; + addr = (caddr_t)(uintptr_t)phdr->p_vaddr; break; } phdr = (Phdr *)((caddr_t)phdr + hsize); } + /* + * We have a non-zero p_vaddr in the first PT_LOAD segment -- + * presumably because we're directly executing a prelink(8)'d + * ld-linux.so. While we could correctly execute such an + * object without locating it at its desired p_vaddr (it is, + * after all, still relocatable), our inner antiquarian + * derives a perverse pleasure in accommodating the steampunk + * prelink(8) contraption -- goggles on! + */ + if ((vaddr = addr) != NULL) { + if (as_gap(curproc->p_as, len, &addr, &len, + AH_LO, NULL) == -1 || addr != vaddr) { + addr = NULL; + } + } + + if (addr == NULL) { + /* + * We either have a NULL p_vaddr (the common case, by + * many orders of magnitude) or we have a non-NULL + * p_vaddr and we were unable to obtain the specified + * VA range (presumably because it's an illegal + * address). Either way, obtain an address in which + * to map the interpreter. 
+ */ + map_addr(&addr, len, (offset_t)0, 1, 0); + if (addr == NULL) + return (ENOMEM); + } + + /* + * Our voffset is the difference between where we landed and + * where we wanted to be. + */ + *voffset = (uintptr_t)addr - (uintptr_t)vaddr; } else { *voffset = 0; } @@ -1437,6 +1617,41 @@ mapelfexec( if (addr < minaddr) minaddr = addr; + /* + * Segments need not correspond to page boundaries: + * they are permitted to share a page. If two PT_LOAD + * segments share the same page, and the permissions + * of the segments differ, the behavior is historically + * that the permissions of the latter segment are used + * for the page that the two segments share. This is + * also historically a non-issue: binaries generated + * by most anything will make sure that two PT_LOAD + * segments with differing permissions don't actually + * share any pages. However, there exist some crazy + * things out there (including at least an obscure + * Portuguese teaching language called G-Portugol) that + * actually do the wrong thing and expect it to work: + * they have a segment with execute permission share + * a page with a subsequent segment that does not + * have execute permissions and expect the resulting + * shared page to in fact be executable. To accommodate + * such broken link editors, we take advantage of a + * latitude explicitly granted to the loader: it is + * permitted to make _any_ PT_LOAD segment executable + * (provided that it is readable or writable). If we + * see that we're sharing a page and that the previous + * page was executable, we will add execute permissions + * to our segment. 
+ */ + if (btop(lastaddr) == btop((uintptr_t)addr) && + (phdr->p_flags & (PF_R | PF_W)) && + (lastprot & PROT_EXEC)) { + prot |= PROT_EXEC; + } + + lastaddr = (uintptr_t)addr + phdr->p_filesz; + lastprot = prot; + zfodsz = (size_t)phdr->p_memsz - phdr->p_filesz; offset = phdr->p_offset; @@ -1521,8 +1736,22 @@ mapelfexec( break; case PT_INTERP: - if (ptload) - goto bad; + /* + * The ELF specification is unequivocal about the + * PT_INTERP program header with respect to any PT_LOAD + * program header: "If it is present, it must precede + * any loadable segment entry." Linux, however, makes + * no attempt to enforce this -- which has allowed some + * binary editing tools to get away with generating + * invalid ELF binaries in the respect that PT_INTERP + * occurs after the first PT_LOAD program header. This + * is unfortunate (and of course, disappointing) but + * it's no worse than that: there is no reason that we + * can't process the PT_INTERP entry (if present) after + * one or more PT_LOAD entries. We therefore + * deliberately do not check ptload here and always + * store dyphdr to be the PT_INTERP program header. + */ *intphdr = phdr; break; @@ -1629,6 +1858,7 @@ elfnote(vnode_t *vp, offset_t *offsetp, int type, int descsz, void *desc, return (0); } + /* * Copy the section data from one vnode to the section of another vnode. */ @@ -1676,28 +1906,38 @@ elf_copy_scn(elf_core_ctx_t *ctx, const Shdr *src, vnode_t *src_vp, Shdr *dst) } /* + * The design of this check is intentional. + * In particular, we want to capture any sections that begin with '.debug_' for + * a few reasons: + * + * 1) Various revisions to the DWARF spec end up changing the set of section + * headers that exist. This ensures that we don't need to change the kernel + * to get a new version. + * + * 2) Other software uses .debug_ sections for things which aren't DWARF. This + * allows them to be captured as well. 
+ */ +#define IS_DEBUGSECTION(name) (strncmp(name, ".debug_", strlen(".debug_")) == 0) + +/* * Walk sections for a given ELF object, counting (or copying) those of * interest (CTF, symtab, strtab, .debug_*). */ -static int +static uint_t elf_process_obj_scns(elf_core_ctx_t *ctx, vnode_t *mvp, caddr_t saddr, - Shdr *v, uint_t idx, uint_t remain, shstrtab_t *shstrtab, uint_t *countp) + Shdr *v, uint_t idx, uint_t remain, shstrtab_t *shstrtab, int *errp) { Ehdr ehdr; const core_content_t content = ctx->ecc_content; cred_t *credp = ctx->ecc_credp; Shdr *ctf = NULL, *symtab = NULL, *strtab = NULL; uintptr_t off = 0; - uint_t nshdrs, shstrndx, nphdrs, count = 0; + uint_t nshdrs, shstrndx, nphdrs, ndebug, count = 0; u_offset_t *doffp = &ctx->ecc_doffset; boolean_t ctf_link = B_FALSE; caddr_t shbase; size_t shsize, shstrsize; char *shstrbase; - int error = 0; - const boolean_t justcounting = v == NULL; - - *countp = 0; if ((content & (CC_CONTENT_CTF | CC_CONTENT_SYMTAB | CC_CONTENT_DEBUG)) == 0) { @@ -1712,6 +1952,7 @@ elf_process_obj_scns(elf_core_ctx_t *ctx, vnode_t *mvp, caddr_t saddr, /* Starting at index 1 skips SHT_NULL which is expected at index 0 */ off = ehdr.e_shentsize; + ndebug = 0; for (uint_t i = 1; i < nshdrs; i++, off += ehdr.e_shentsize) { Shdr *shdr, *symchk = NULL, *strchk; const char *name; @@ -1739,51 +1980,8 @@ elf_process_obj_scns(elf_core_ctx_t *ctx, vnode_t *mvp, caddr_t saddr, strcmp(name, shstrtab_data[STR_SYMTAB]) == 0) { symchk = shdr; } else if ((content & CC_CONTENT_DEBUG) != 0 && - strncmp(name, ".debug_", strlen(".debug_")) == 0) { - /* - * The design of the above check is intentional. In - * particular, we want to capture any sections that - * begin with '.debug_' for a few reasons: - * - * 1) Various revisions to the DWARF spec end up - * changing the set of section headers that exist. This - * ensures that we don't need to change the kernel to - * get a new version. 
- * - * 2) Other software uses .debug_ sections for things - * which aren't DWARF. This allows them to be captured - * as well. - */ - count++; - - if (!justcounting) { - if (count > remain) { - error = ENOMEM; - goto done; - } - - elf_ctx_resize_scratch(ctx, shdr->sh_size); - - if (!shstrtab_ndx(shstrtab, - name, &v[idx].sh_name)) { - error = ENOMEM; - goto done; - } - - v[idx].sh_addr = (Addr)(uintptr_t)saddr; - v[idx].sh_type = shdr->sh_type; - v[idx].sh_addralign = shdr->sh_addralign; - *doffp = roundup(*doffp, v[idx].sh_addralign); - v[idx].sh_offset = *doffp; - v[idx].sh_size = shdr->sh_size; - v[idx].sh_link = 0; - v[idx].sh_entsize = shdr->sh_entsize; - v[idx].sh_info = shdr->sh_info; - - elf_copy_scn(ctx, shdr, mvp, &v[idx]); - idx++; - } - + IS_DEBUGSECTION(name)) { + ndebug++; continue; } else { continue; @@ -1815,24 +2013,19 @@ elf_process_obj_scns(elf_core_ctx_t *ctx, vnode_t *mvp, caddr_t saddr, count += 1; if (symtab != NULL) count += 2; - - if (count > remain) { - count = remain; - if (!justcounting) - error = ENOMEM; + count += ndebug; + if (v == NULL || count == 0 || count > remain) { + count = MIN(count, remain); goto done; } - if (justcounting) - goto done; - /* output CTF section */ if (ctf != NULL) { elf_ctx_resize_scratch(ctx, ctf->sh_size); if (!shstrtab_ndx(shstrtab, shstrtab_data[STR_CTF], &v[idx].sh_name)) { - error = ENOMEM; + *errp = ENOMEM; goto done; } v[idx].sh_addr = (Addr)(uintptr_t)saddr; @@ -1875,12 +2068,12 @@ elf_process_obj_scns(elf_core_ctx_t *ctx, vnode_t *mvp, caddr_t saddr, if (!shstrtab_ndx(shstrtab, shstrtab_data[symtab_type], &symtab_name)) { - error = ENOMEM; + *errp = ENOMEM; goto done; } if (!shstrtab_ndx(shstrtab, shstrtab_data[strtab_type], &strtab_name)) { - error = ENOMEM; + *errp = ENOMEM; goto done; } @@ -1915,14 +2108,52 @@ elf_process_obj_scns(elf_core_ctx_t *ctx, vnode_t *mvp, caddr_t saddr, idx++; } + if (ndebug == 0) + goto done; + + /* output DEBUG sections */ + off = 0; + for (uint_t i = 1; i < nshdrs; 
i++, off += ehdr.e_shentsize) { + const char *name; + Shdr *shdr; + + shdr = (Shdr *)(shbase + off); + if (shdr->sh_name >= shstrsize || shdr->sh_type == SHT_NULL) + continue; + + name = shstrbase + shdr->sh_name; + + if (!IS_DEBUGSECTION(name)) + continue; + + elf_ctx_resize_scratch(ctx, shdr->sh_size); + + if (!shstrtab_ndx(shstrtab, name, &v[idx].sh_name)) { + *errp = ENOMEM; + goto done; + } + + v[idx].sh_addr = (Addr)(uintptr_t)saddr; + v[idx].sh_type = shdr->sh_type; + v[idx].sh_addralign = shdr->sh_addralign; + *doffp = roundup(*doffp, v[idx].sh_addralign); + v[idx].sh_offset = *doffp; + v[idx].sh_size = shdr->sh_size; + v[idx].sh_link = 0; + v[idx].sh_entsize = shdr->sh_entsize; + v[idx].sh_info = shdr->sh_info; + + elf_copy_scn(ctx, shdr, mvp, &v[idx]); + idx++; + + if (--ndebug == 0) + break; + } + done: kmem_free(shstrbase, shstrsize); kmem_free(shbase, shsize); - - if (error == 0) - *countp = count; - - return (error); + return (count); } /* @@ -1979,8 +2210,9 @@ elf_process_scns(elf_core_ctx_t *ctx, Shdr *v, uint_t nv, uint_t *nshdrsp) if (seg->s_ops != &segvn_ops || SEGOP_GETVP(seg, seg->s_base, &mvp) != 0 || mvp == lastvp || mvp == NULL || mvp->v_type != VREG || - (segsize = pr_getsegsize(seg, 1)) == 0) + (segsize = pr_getsegsize(seg, 1)) == 0) { continue; + } eaddr = saddr + segsize; prot = pr_getprot(seg, 1, &tmp, &saddr, &naddr, eaddr); @@ -1993,8 +2225,8 @@ elf_process_scns(elf_core_ctx_t *ctx, Shdr *v, uint_t nv, uint_t *nshdrsp) if ((prot & (PROT_WRITE | PROT_EXEC)) != PROT_EXEC) continue; - error = elf_process_obj_scns(ctx, mvp, saddr, v, idx, remain, - &shstrtab, &count); + count = elf_process_obj_scns(ctx, mvp, saddr, v, idx, remain, + &shstrtab, &error); if (error != 0) goto done; @@ -2106,8 +2338,9 @@ top: * we overflow the 16 bits allotted to the program header count in * the ELF header, we'll need that program header at index zero. 
*/ - if (nshdrs == 0 && nphdrs >= PN_XNUM) + if (nshdrs == 0 && nphdrs >= PN_XNUM) { nshdrs = 1; + } /* * Allocate a buffer which is sized adequately to hold the ehdr, phdrs @@ -2556,7 +2789,7 @@ static struct modlexec modlexec = { extern int elf32exec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, int level, size_t *execsz, int setid, caddr_t exec_file, cred_t *cred, - int brand_action); + int *brand_action); extern int elf32core(vnode_t *vp, proc_t *p, cred_t *credp, rlim64_t rlimit, int sig, core_content_t content); diff --git a/usr/src/uts/common/exec/elf/elf_notes.c b/usr/src/uts/common/exec/elf/elf_notes.c index 78305cc076..0a0d405eba 100644 --- a/usr/src/uts/common/exec/elf/elf_notes.c +++ b/usr/src/uts/common/exec/elf/elf_notes.c @@ -347,11 +347,13 @@ write_elfnotes(proc_t *p, int sig, vnode_t *vp, offset_t offset, /* open file table */ + mutex_enter(&p->p_lock); vroot = PTOU(p)->u_rdir; if (vroot == NULL) vroot = rootdir; VN_HOLD(vroot); + mutex_exit(&p->p_lock); fip = P_FINFO(p); |