Diffstat (limited to 'usr/src/uts/common/brand/lx/syscall/lx_mem.c')
-rw-r--r-- | usr/src/uts/common/brand/lx/syscall/lx_mem.c | 1118 |
1 files changed, 1118 insertions, 0 deletions
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_mem.c b/usr/src/uts/common/brand/lx/syscall/lx_mem.c
new file mode 100644
index 0000000000..15351444c8
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_mem.c
@@ -0,0 +1,1118 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2019 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/mman.h>
+#include <sys/debug.h>
+#include <sys/sysmacros.h>
+#include <sys/policy.h>
+#include <sys/lx_brand.h>
+#include <sys/fcntl.h>
+#include <sys/pathname.h>
+#include <vm/seg_vn.h>
+#include <vm/seg_spt.h>
+#include <sys/shm_impl.h>
+#include <vm/as.h>
+
+/* From uts/common/os/grow.c */
+extern int mprotect(caddr_t, size_t, int);
+extern caddr_t smmap64(caddr_t, size_t, int, int, int, off_t);
+extern int munmap(caddr_t, size_t);
+/* From uts/common/syscall/close.c */
+extern int close(int);
+/* From uts/common/fs/proc/prsubr.c */
+extern uint_t pr_getprot(struct seg *, int, void **, caddr_t *, caddr_t *,
+    caddr_t);
+/* From uts/common/vm/seg_spt.c */
+extern struct seg_ops segspt_shmops;
+/* From uts/common/syscall/memcntl.c */
+extern int memcntl(caddr_t, size_t, int, caddr_t, int, int);
+/* From uts/common/os/grow.c */
+extern int smmap_common(caddr_t *, size_t, int, int, struct file *, offset_t);
+
+/*
+ * After Linux 2.6.8, an unprivileged process can lock memory up to its
+ * RLIMIT_MEMLOCK resource limit.
+ *
+ * memcntl() assumes we have PRIV_PROC_LOCK_MEMORY; without that privilege,
+ * the check in secpolicy_lock_memory() will fail when we attempt to lock
+ * memory. Thus, to support the Linux semantics, we bypass memcntl() and
+ * perform the locking operations directly.
+ */
+
+#define	LX_MADV_NORMAL		0
+#define	LX_MADV_RANDOM		1
+#define	LX_MADV_SEQUENTIAL	2
+#define	LX_MADV_WILLNEED	3
+#define	LX_MADV_DONTNEED	4
+#define	LX_MADV_FREE		8
+#define	LX_MADV_REMOVE		9
+#define	LX_MADV_DONTFORK	10
+#define	LX_MADV_DOFORK		11
+#define	LX_MADV_MERGEABLE	12
+#define	LX_MADV_UNMERGEABLE	13
+#define	LX_MADV_HUGEPAGE	14
+#define	LX_MADV_NOHUGEPAGE	15
+#define	LX_MADV_DONTDUMP	16
+#define	LX_MADV_DODUMP		17
+
+#define	LX_VALID_MSYNC		(MS_ASYNC|MS_INVALIDATE|MS_SYNC)
+
+#define	LX_PROT_GROWSDOWN	0x01000000
+#define	LX_PROT_GROWSUP		0x02000000
+
+/* Internal segment map flags */
+#define	LX_SM_READ		0x01
+#define	LX_SM_WRITE		0x02
+#define	LX_SM_EXEC		0x04
+#define	LX_SM_SHM		0x08
+#define	LX_SM_ANON		0x10
+#define	LX_SM_SHARED		0x20
+#define	LX_SM_NORESERVE		0x40
+
+/* For convenience */
+#define	LX_PROT_GROWMASK	(LX_PROT_GROWSUP|LX_PROT_GROWSDOWN)
+
+/* From lx_rlimit.c */
+extern void lx_get_rctl(char *, struct rlimit64 *);
+
+static int
+lx_mlock_common(int op, uintptr_t addr, size_t len)
+{
+	int err;
+	struct as *as = curproc->p_as;
+	const uintptr_t align_addr = addr & (uintptr_t)PAGEMASK;
+	const size_t align_len = P2ROUNDUP(len + (addr & PAGEOFFSET), PAGESIZE);
+
+	if (len == 0) {
+		/* Linux short-circuits to success on zero length */
+		return (0);
+	} else if ((align_addr + align_len) <= align_addr) {
+		/* Catch overflow (including when aligning len) */
+		return (set_errno(EINVAL));
+	}
+
+	err = as_ctl(as, (caddr_t)align_addr, align_len, op, 0, 0, NULL, 0);
+	if (err == EAGAIN)
+		err = ENOMEM;
+	return (err == 0 ? 0 : set_errno(err));
+}
+
+int
+lx_mlock(uintptr_t addr, size_t len)
+{
+	int err;
+
+	/*
+	 * If the caller is not privileged and either the limit is 0, or
+	 * the kernel version is earlier than 2.6.9, then fail with EPERM. See
+	 * LTP mlock2.c.
+	 */
+	if ((err = secpolicy_lock_memory(CRED())) != 0) {
+		struct rlimit64 rlim64;
+
+		lx_get_rctl("process.max-locked-memory", &rlim64);
+		if (rlim64.rlim_cur == 0 ||
+		    lx_kern_release_cmp(curzone, "2.6.9") < 0)
+			return (set_errno(err));
+	}
+
+	return (lx_mlock_common(MC_LOCK, addr, len));
+}
+
+int
+lx_munlock(uintptr_t addr, size_t len)
+{
+	return (lx_mlock_common(MC_UNLOCK, addr, len));
+}
+
+int
+lx_mlockall(int flags)
+{
+	int err;
+	struct as *as = curproc->p_as;
+
+	/*
+	 * If the caller is not privileged and either the limit is 0, or
+	 * the kernel version is earlier than 2.6.9, then fail with EPERM. See
+	 * LTP mlockall2.c.
+	 */
+	if ((err = secpolicy_lock_memory(CRED())) != 0) {
+		struct rlimit64 rlim64;
+
+		lx_get_rctl("process.max-locked-memory", &rlim64);
+		if (rlim64.rlim_cur == 0 ||
+		    lx_kern_release_cmp(curzone, "2.6.9") < 0)
+			return (set_errno(err));
+	}
+
+	if ((flags & ~(MCL_FUTURE | MCL_CURRENT)) || flags == 0)
+		return (set_errno(EINVAL));
+
+	err = as_ctl(as, 0, 0, MC_LOCKAS, 0, (uintptr_t)flags, NULL, 0);
+	if (err == EAGAIN)
+		err = ENOMEM;
+	return (err == 0 ? 0 : set_errno(err));
+}
+
+int
+lx_munlockall(void)
+{
+	int err;
+	struct as *as = curproc->p_as;
+
+	if (lx_kern_release_cmp(curzone, "2.6.9") < 0) {
+		if ((err = secpolicy_lock_memory(CRED())) != 0)
+			return (set_errno(err));
+	}
+
+	err = as_ctl(as, 0, 0, MC_UNLOCKAS, 0, 0, NULL, 0);
+	return (err == 0 ? 0 : set_errno(err));
+}
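The version- and rlimit-gated EPERM behavior above can be observed from user space. A minimal sketch, assuming a Linux system (or an lx zone) and an unprivileged process; everything here uses only standard interfaces:

#include <sys/mman.h>
#include <sys/resource.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
	struct rlimit rl;
	long pgsz = sysconf(_SC_PAGESIZE);
	void *buf;

	if (getrlimit(RLIMIT_MEMLOCK, &rl) != 0) {
		perror("getrlimit");
		return (1);
	}
	(void) printf("RLIMIT_MEMLOCK cur=%llu\n",
	    (unsigned long long)rl.rlim_cur);

	if ((buf = malloc(pgsz)) == NULL)
		return (1);

	/*
	 * On kernels >= 2.6.9 this succeeds for an unprivileged process
	 * as long as rlim_cur is nonzero; with rlim_cur == 0 and no
	 * privilege it fails with EPERM, as described in the comment above.
	 */
	if (mlock(buf, pgsz) != 0)
		perror("mlock");
	else
		(void) munlock(buf, pgsz);

	free(buf);
	return (0);
}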
+
+int
+lx_msync(uintptr_t addr, size_t len, int flags)
+{
+	const size_t align_len = P2ROUNDUP(len, PAGESIZE);
+
+	if ((addr & PAGEOFFSET) != 0 ||
+	    (flags & ~LX_VALID_MSYNC) != 0) {
+		return (set_errno(EINVAL));
+	} else if (len == 0) {
+		/* Linux short-circuits to success on zero length */
+		return (0);
+	} else if ((addr + align_len) < addr) {
+		/* Catch overflow (including when aligning len) */
+		return (set_errno(ENOMEM));
+	}
+
+	return (memcntl((caddr_t)addr, align_len, MC_SYNC,
+	    (caddr_t)(uintptr_t)flags, 0, 0));
+}
+
+int
+lx_madvise(uintptr_t addr, size_t len, int advice)
+{
+	int err;
+	const size_t align_len = P2ROUNDUP(len, PAGESIZE);
+
+	switch (advice) {
+	case LX_MADV_REMOVE:
+		/* approximately similar */
+		advice = MADV_FREE;
+		break;
+
+	case LX_MADV_DONTNEED:
+		/*
+		 * On Linux, MADV_DONTNEED implies an immediate purge of the
+		 * specified region. This is spuriously different from
+		 * (nearly) every other Unix, having apparently been done to
+		 * mimic the semantics on Digital Unix (!). This is bad enough
+		 * (MADV_FREE both has better semantics and results in better
+		 * performance), but it gets worse: Linux applications (and
+		 * notably, jemalloc) have managed to depend on the busted
+		 * semantics of MADV_DONTNEED on Linux. We implement these
+		 * semantics via MADV_PURGE -- and we translate our advice
+		 * accordingly.
+		 */
+		advice = MADV_PURGE;
+		break;
+
+	case LX_MADV_FREE:
+		advice = MADV_FREE;
+		break;
+
+	case LX_MADV_NORMAL:
+	case LX_MADV_RANDOM:
+	case LX_MADV_SEQUENTIAL:
+	case LX_MADV_WILLNEED:
+		/* These map directly to the illumos values */
+		break;
+
+	case LX_MADV_DONTFORK:
+	case LX_MADV_DOFORK:
+	case LX_MADV_HUGEPAGE:
+	case LX_MADV_NOHUGEPAGE:
+	case LX_MADV_DONTDUMP:
+	case LX_MADV_DODUMP:
+		/* harmless to pretend these work */
+		return (0);
+	default:
+		return (set_errno(EINVAL));
+	}
+
+	if ((addr & PAGEOFFSET) != 0) {
+		return (set_errno(EINVAL));
+	} else if (len == 0) {
+		/* Linux short-circuits to success on zero length */
+		return (0);
+	} else if ((addr + align_len) <= addr) {
+		/*
+		 * Catch overflow (including when aligning len). Unlike
+		 * similar syscalls, this is an EINVAL failure for madvise(2).
+		 */
+		return (set_errno(EINVAL));
+	}
+
+	err = memcntl((caddr_t)addr, align_len, MC_ADVISE,
+	    (caddr_t)(intptr_t)advice, 0, 0);
+	if (err == EBUSY) {
+		if (advice != MADV_PURGE) {
+			return (set_errno(EINVAL));
+		}
+		/*
+		 * If we received an EBUSY from a MADV_PURGE, we will now try
+		 * again with a MADV_DONTNEED: there are conditions (namely,
+		 * with locked mappings that haven't yet been faulted in) where
+		 * MADV_PURGE will fail but MADV_DONTNEED will succeed. If
+		 * this succeeds, we'll call the operation a success; if not,
+		 * we'll kick back EINVAL.
+		 */
+		advice = MADV_DONTNEED;
+		err = memcntl((caddr_t)addr, align_len, MC_ADVISE,
+		    (caddr_t)(intptr_t)advice, 0, 0);
+		if (err != 0) {
+			return (set_errno(EINVAL));
+		}
+		/* Clear the old errno since success was eventually achieved. */
+		ttolwp(curthread)->lwp_errno = 0;
+	}
+	return (err);
+}
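The MADV_DONTNEED purge semantics described above (and emulated here via MADV_PURGE) are easy to demonstrate from user space. A minimal sketch, assuming a Linux system: after the advice, the next touch of a private anonymous page reads back zero-filled memory rather than the old contents.

#include <sys/mman.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	long pgsz = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, pgsz, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return (1);

	(void) memset(p, 0xa5, pgsz);
	(void) printf("before: 0x%02x\n", (unsigned char)p[0]);	/* 0xa5 */

	/* On Linux this immediately discards the pages. */
	if (madvise(p, pgsz, MADV_DONTNEED) != 0)
		return (1);

	(void) printf("after:  0x%02x\n", (unsigned char)p[0]);	/* 0x00 */
	return (0);
}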
+
+int
+lx_mprotect(uintptr_t addr, size_t len, int prot)
+{
+	const size_t align_len = P2ROUNDUP(len, PAGESIZE);
+
+	/*
+	 * The flags for native mprotect(2) are essentially the same as those
+	 * on Linux, with the exception of PROT_GROWSUP/PROT_GROWSDOWN, for
+	 * which there is no native analog. Those flags are presently ignored,
+	 * unless they are both present, which represents an invalid argument.
+	 */
+	if ((prot & LX_PROT_GROWMASK) == LX_PROT_GROWMASK) {
+		return (set_errno(EINVAL));
+	}
+	prot &= ~(LX_PROT_GROWMASK);
+
+	if ((addr & PAGEOFFSET) != 0) {
+		return (set_errno(EINVAL));
+	} else if (len == 0) {
+		/* Linux short-circuits to success on zero length */
+		return (0);
+	} else if ((addr + align_len) <= addr) {
+		/* Catch overflow (including when aligning len) */
+		return (set_errno(ENOMEM));
+	}
+
+	return (mprotect((void *)addr, align_len, prot));
+}
+
+/*
+ * There are two forms of mmap, mmap() and mmap2(). The only difference is that
+ * the final argument to mmap2() specifies the file offset in pages, not bytes.
+ * Also, mmap2 is 32-bit only.
+ *
+ * Linux has a number of additional flags, but they are all deprecated. We also
+ * ignore the MAP_GROWSDOWN flag, which has no equivalent on Solaris.
+ *
+ * The Linux mmap() returns ENOMEM in some cases where illumos returns
+ * EOVERFLOW, so we translate the errno as necessary.
+ */
+
+#define	LX_MAP_ANONYMOUS	0x00020
+#define	LX_MAP_LOCKED		0x02000
+#define	LX_MAP_NORESERVE	0x04000
+#define	LX_MAP_32BIT		0x00040
+
+#define	ONE_GB			0x40000000
+
+static void lx_remap_anoncache_invalidate(uintptr_t, size_t);
+
+static int
+lx_ltos_mmap_flags(int flags)
+{
+	int new_flags;
+
+	new_flags = flags & (MAP_TYPE | MAP_FIXED);
+
+	if (flags & LX_MAP_ANONYMOUS)
+		new_flags |= MAP_ANONYMOUS;
+	if (flags & LX_MAP_NORESERVE)
+		new_flags |= MAP_NORESERVE;
+
+#if defined(_LP64)
+	if (flags & LX_MAP_32BIT)
+		new_flags |= MAP_32BIT;
+#endif
+
+	return (new_flags);
+}
+
+static void *
+lx_mmap_common(void *addr, size_t len, int prot, int flags, int fd, off64_t off)
+{
+	caddr_t ret;
+	lx_proc_data_t *lxpd = ptolxproc(curproc);
+
+	/*
+	 * Under Linux, the file descriptor is ignored when mapping zfod
+	 * anonymous memory. On illumos, we want the fd set to -1 for the
+	 * same functionality.
+	 */
+	if (flags & LX_MAP_ANONYMOUS)
+		fd = -1;
+
+	/*
+	 * We refuse, as a matter of principle, to overcommit memory.
+	 * Unfortunately, several bits of important and popular software expect
+	 * to be able to pre-allocate large amounts of virtual memory but then
+	 * probably never use it. One particularly bad example of this
+	 * practice is golang. Another is the JVM.
+	 *
+	 * In the interest of running software, unsafe or not, we fudge
+	 * something vaguely similar to overcommit by permanently enabling
+	 * MAP_NORESERVE unless MAP_LOCKED was requested:
+	 */
+	if (!(flags & LX_MAP_LOCKED)) {
+		flags |= LX_MAP_NORESERVE;
+	}
+
+	/*
+	 * This is totally insane. The NOTES section in the linux mmap(2) man
+	 * page claims that on some architectures, read protection may
+	 * automatically include exec protection. It has been observed on a
+	 * native linux system that the /proc/<pid>/maps file does indeed
+	 * show that segments mmap'd from userland (such as libraries mapped in
+	 * by the dynamic linker) all have the exec permission set, even for
+	 * data segments.
+	 *
+	 * This insanity is tempered by the fact that the behavior is disabled
+	 * for ELF binaries bearing a PT_GNU_STACK header which lacks PF_X
+	 * (which most do). Such a header will clear the READ_IMPLIES_EXEC
+	 * flag from the process personality.
+	 */
+	if (prot & PROT_READ) {
+		if ((lxpd->l_personality & LX_PER_READ_IMPLIES_EXEC) != 0) {
+			prot |= PROT_EXEC;
+		}
+	}
+
+	ret = smmap64(addr, len, prot, lx_ltos_mmap_flags(flags), fd, off);
+	if (ttolwp(curthread)->lwp_errno != 0) {
+		if (ttolwp(curthread)->lwp_errno == EOVERFLOW)
+			(void) set_errno(ENOMEM);
+		return ((void *)-1);
+	}
+
+	if (flags & LX_MAP_LOCKED) {
+		(void) lx_mlock_common(MC_LOCK, (uintptr_t)ret, len);
+		/* clear any errno from mlock */
+		ttolwp(curthread)->lwp_errno = 0;
+	}
+
+	/*
+	 * We have a new mapping; invalidate any cached anonymous regions that
+	 * overlap(ped) with it.
+	 */
+	mutex_enter(&lxpd->l_remap_anoncache_lock);
+	lx_remap_anoncache_invalidate((uintptr_t)ret, len);
+	mutex_exit(&lxpd->l_remap_anoncache_lock);
+
+	return (ret);
+}
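The READ_IMPLIES_EXEC behavior handled at the end of lx_mmap_common() can be reproduced on native Linux. A minimal sketch, assuming the personality(2) interface: once the flag is set, a PROT_READ mapping shows up in /proc/self/maps as r-xp rather than r--p.

#include <sys/mman.h>
#include <sys/personality.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
	long pgsz = sysconf(_SC_PAGESIZE);
	char cmd[64];
	void *p;

	/* Turn on READ_IMPLIES_EXEC for subsequent mappings. */
	(void) personality(personality(0xffffffff) | READ_IMPLIES_EXEC);

	p = mmap(NULL, pgsz, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return (1);

	/* The new segment is listed with the exec bit set. */
	(void) snprintf(cmd, sizeof (cmd), "grep %lx /proc/self/maps",
	    (unsigned long)p);
	return (system(cmd));
}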
+
+long
+lx_mmap(void *addr, size_t len, int prot, int flags, int fd, off64_t off)
+{
+	return ((ssize_t)lx_mmap_common(addr, len, prot, flags, fd, off));
+}
+
+long
+lx_mmap2(void *addr, size_t len, int prot, int flags,
+    int fd, off_t off)
+{
+	return ((ssize_t)lx_mmap_common(addr, len, prot, flags, fd,
+	    (off64_t)off * PAGESIZE));
+}
+
+long
+lx_munmap(void *addr, size_t len)
+{
+	lx_proc_data_t *lxpd = ptolxproc(curproc);
+
+	/*
+	 * Invalidate any cached anonymous regions that overlap(ped) with it.
+	 */
+	mutex_enter(&lxpd->l_remap_anoncache_lock);
+	lx_remap_anoncache_invalidate((uintptr_t)addr, len);
+	mutex_exit(&lxpd->l_remap_anoncache_lock);
+
+	return (munmap(addr, len));
+}
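A sketch of the page-unit offset conversion performed by lx_mmap2() above, assuming a 32-bit (i386) Linux build and a pre-existing scratch file f.dat (both assumptions, purely for illustration): the raw mmap2 syscall takes its final argument in 4096-byte pages, which is exactly what the (off64_t)off * PAGESIZE conversion undoes.

#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
#if defined(__i386__)
#include <sys/syscall.h>

int
main(void)
{
	int fd = open("f.dat", O_RDONLY);	/* hypothetical scratch file */
	void *p;

	if (fd < 0)
		return (1);

	/* The final argument is 1, i.e. a byte offset of 1 * 4096 = 4096. */
	p = (void *)syscall(SYS_mmap2, NULL, 4096, PROT_READ,
	    MAP_PRIVATE, fd, 1);
	return (p == MAP_FAILED ? 1 : 0);
}
#else
int
main(void)
{
	return (0);	/* mmap2 is 32-bit only */
}
#endif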
+
+#define	LX_MREMAP_MAYMOVE	1	/* mapping can be moved */
+#define	LX_MREMAP_FIXED		2	/* address is fixed */
+
+/*
+ * Unfortunately, the Linux mremap() manpage contains a statement that is, at
+ * best, grossly oversimplified: that mremap() "can be used to implement a
+ * very efficient realloc(3)." To the degree this is true at all, it is only
+ * true narrowly (namely, when large buffers are being expanded but can't be
+ * expanded in place due to virtual address space restrictions) -- but
+ * apparently, someone took this very literally, because variants of glibc
+ * appear to simply implement realloc() in terms of mremap(). This is
+ * unfortunate because absent intelligent usage, it forces realloc() to have
+ * an unnecessary interaction with the VM system for small expansions -- and if
+ * realloc() is itself abused (e.g., if a consumer repeatedly expands and
+ * contracts the same memory buffer), the net result can be less efficient
+ * than a much more naive realloc() implementation. And if native Linux is
+ * suboptimal in this case, we are deeply pathological, having not
+ * historically supported mremap() for anonymous mappings at all. To make
+ * this at least palatable, we not only support remap for anonymous mappings
+ * (see lx_remap_anon(), below), we also cache the metadata associated with
+ * these anonymous remappings to reduce the need to search our address space.
+ * We implement the anonymous metadata cache with l_remap_anoncache, an LRU
+ * cache of lx_segmap_t's that correspond to anonymous segments that have been
+ * resized (only anonymous mappings that have been remapped are cached). The
+ * cache is part of the process's lx-brand-specific data.
+ */
+
+/*
+ * Search our address space (as) mappings to find the specified mapping. This
+ * is derived from the procfs prgetmap() code. We implement the "reserved"
+ * behavior on the segment so as to accommodate the case where an mmap()'d and
+ * then ftruncate()'d file is being mremap()'d: we use the size of the
+ * mapping (which we need to validate old_size).
+ *
+ * Return 0 if the mapping is found, or an errno if there is a problem or the
+ * mapping is not found. If the mapping is found, we populate the mp, vpp and
+ * offp parameters with the results.
+ */
+static int
+lx_get_mapping(uintptr_t find_addr, size_t find_size, lx_segmap_t *mp,
+    vnode_t **vpp, offset_t *offp)
+{
+	struct as *as = curproc->p_as;
+	struct seg *seg;
+	uint_t prot;
+	caddr_t saddr, eaddr, naddr;
+
+	/* pr_getprot asserts that the as is held as a writer */
+	AS_LOCK_ENTER(as, RW_WRITER);
+
+	seg = as_segat(as, (caddr_t)find_addr);
+	if (seg == NULL || (seg->s_flags & S_HOLE) != 0) {
+		AS_LOCK_EXIT(as);
+		return (EFAULT);
+	}
+
+	/*
+	 * We're interested in the reserved space, so we use the size of the
+	 * segment itself.
+	 */
+	eaddr = seg->s_base + seg->s_size;
+	for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) {
+		uintptr_t vaddr;
+		size_t size;
+		struct vnode *vp;
+		void *tmp = NULL;
+
+		prot = pr_getprot(seg, 1, &tmp, &saddr, &naddr, eaddr);
+		if (saddr == naddr)
+			continue;
+
+		vaddr = (uintptr_t)saddr;
+		size = (uintptr_t)naddr - (uintptr_t)saddr;
+
+		if (vaddr == find_addr && find_size < size &&
+		    (find_size & PAGEOFFSET) != 0) {
+			/*
+			 * We found a mapping but the size being requested is
+			 * less than the mapping and not a multiple of our page
+			 * size. If it is an anonymous mapping, that likely
+			 * means the application did the initial mmap with this
+			 * odd size. We'll round up to the next page boundary
+			 * in this case.
+			 */
+			if (seg->s_ops == &segspt_shmops ||
+			    (seg->s_ops == &segvn_ops &&
+			    (SEGOP_GETVP(seg, saddr, &vp) != 0 ||
+			    vp == NULL))) {
+				/*
+				 * It's anonymous, round up the size.
+				 */
+				find_size = ptob(btopr(find_size));
+			}
+		}
+
+		/* Check if mapping matches our arguments */
+		if (vaddr == find_addr && size == find_size) {
+			struct vattr vattr;
+
+			mp->lxsm_vaddr = vaddr;
+			mp->lxsm_size = size;
+			mp->lxsm_flags = 0;
+
+			*offp = SEGOP_GETOFFSET(seg, saddr);
+
+			if (prot & PROT_READ)
+				mp->lxsm_flags |= LX_SM_READ;
+			if (prot & PROT_WRITE)
+				mp->lxsm_flags |= LX_SM_WRITE;
+			if (prot & PROT_EXEC)
+				mp->lxsm_flags |= LX_SM_EXEC;
+			if (SEGOP_GETTYPE(seg, saddr) & MAP_SHARED)
+				mp->lxsm_flags |= LX_SM_SHARED;
+			if (SEGOP_GETTYPE(seg, saddr) & MAP_NORESERVE)
+				mp->lxsm_flags |= LX_SM_NORESERVE;
+			if (seg->s_ops == &segspt_shmops ||
+			    (seg->s_ops == &segvn_ops &&
+			    (SEGOP_GETVP(seg, saddr, &vp) != 0 ||
+			    vp == NULL)))
+				mp->lxsm_flags |= LX_SM_ANON;
+
+			if (seg->s_ops == &segspt_shmops) {
+				mp->lxsm_flags |= LX_SM_SHM;
+			} else if ((mp->lxsm_flags & LX_SM_SHARED) &&
+			    curproc->p_segacct && shmgetid(curproc,
+			    seg->s_base) != SHMID_NONE) {
+				mp->lxsm_flags |= LX_SM_SHM;
+			}
+
+			vattr.va_mask = AT_FSID | AT_NODEID;
+			if (seg->s_ops == &segvn_ops &&
+			    SEGOP_GETVP(seg, saddr, &vp) == 0 &&
+			    vp != NULL && vp->v_type == VREG &&
+			    VOP_GETATTR(vp, &vattr, 0, CRED(),
+			    NULL) == 0) {
+				VN_HOLD(vp);
+				*vpp = vp;
+			} else {
+				*vpp = NULL;
+			}
+
+			AS_LOCK_EXIT(as);
+			return (0);
+		}
+
+		if (vaddr <= find_addr &&
+		    find_addr + find_size < vaddr + size) {
+			/*
+			 * We have a mismatch, but our specified range is a
+			 * subset of the actual segment; this is EINVAL.
+			 */
+			AS_LOCK_EXIT(as);
+			DTRACE_PROBE2(lx__mremap__badsubset, caddr_t,
+			    vaddr, size_t, size);
+			return (EINVAL);
+		}
+	}
+
+	AS_LOCK_EXIT(as);
+	return (EFAULT);
+}
+
+static void
+lx_remap_anoncache_invalidate(uintptr_t addr, size_t size)
+{
+	lx_proc_data_t *lxpd = ptolxproc(curproc);
+	uint_t i;
+
+	ASSERT(MUTEX_HELD(&lxpd->l_remap_anoncache_lock));
+
+	if (lxpd->l_remap_anoncache_generation == 0)
+		return;
+
+	for (i = 0; i < LX_REMAP_ANONCACHE_NENTRIES; i++) {
+		lx_segmap_t *map = &lxpd->l_remap_anoncache[i];
+
+		/*
+		 * If the ranges overlap at all, we zap it.
+		 */
+		if (addr < map->lxsm_vaddr + map->lxsm_size &&
+		    map->lxsm_vaddr < addr + size) {
+			bzero(map, sizeof (lx_segmap_t));
+		}
+	}
+}
+
+static void
+lx_remap_anoncache_load(lx_segmap_t *map, size_t size)
+{
+	lx_proc_data_t *lxpd = ptolxproc(curproc);
+	uint64_t oldest = UINT64_MAX;
+	lx_segmap_t *evict = NULL;
+	uint_t i;
+
+	ASSERT(MUTEX_HELD(&lxpd->l_remap_anoncache_lock));
+
+	for (i = 0; i < LX_REMAP_ANONCACHE_NENTRIES; i++) {
+		lx_segmap_t *cp = &lxpd->l_remap_anoncache[i];
+
+		if (cp->lxsm_vaddr == map->lxsm_vaddr) {
+			/*
+			 * We're already in the cache -- we just need to update
+			 * our LRU field and size to reflect the hit.
+			 */
+			cp->lxsm_lru = lxpd->l_remap_anoncache_generation++;
+			cp->lxsm_size = size;
+			return;
+		}
+
+		if (cp->lxsm_vaddr == 0) {
+			evict = cp;
+			break;
+		}
+
+		if (cp->lxsm_lru < oldest) {
+			oldest = cp->lxsm_lru;
+			evict = cp;
+		}
+	}
+
+	/* Update the entry we're evicting */
+	ASSERT(evict != NULL);
+	evict->lxsm_vaddr = map->lxsm_vaddr;
+	evict->lxsm_size = size;
+	evict->lxsm_flags = map->lxsm_flags;
+	evict->lxsm_lru = lxpd->l_remap_anoncache_generation++;
+}
+
+static int lx_u2u_copy(void *, void *, size_t);
+
+/*
+ * As part of lx_mremap() (see below) and to accommodate heavy realloc() use
+ * cases (see the discussion of the l_remap_anoncache, above), we allow
+ * anonymous segments to be "remapped" in that we are willing to truncate them
+ * or append to them (as much as that's allowed by virtual address space
+ * usage). If we fall out of these cases, we take the more expensive option
+ * of actually copying the data to a new segment -- but we locate the address
+ * in a portion of the address space that should give us plenty of VA space to
+ * expand.
+ *
+ * We return the address of the mapping or set errno if there is a problem.
+ */
+static long
+lx_remap_anon(lx_segmap_t *mapin, size_t new_size, uint_t flags,
+    uintptr_t new_addr)
+{
+	lx_segmap_t m;
+	int mflags = MAP_ANON;
+	int prot = 0;
+	void *addr, *hint = NULL;
+
+	ASSERT(MUTEX_HELD(&ptolxproc(curproc)->l_remap_anoncache_lock));
+
+	/*
+	 * Make a copy of the input lx_segmap_t argument since it might be
+	 * a reference into the anon cache, and we're manipulating cache
+	 * entries during this function.
+	 */
+	m = *mapin;
+
+	/*
+	 * If our new size is less than our old size and we're either not
+	 * being ordered to move it or the address we're being ordered to
+	 * move it to is our current address, we can just act as Procrustes
+	 * and chop off anything larger than the new size.
+	 */
+	if (new_size < m.lxsm_size && (!(flags & LX_MREMAP_FIXED) ||
+	    new_addr == m.lxsm_vaddr)) {
+		if (munmap((void *)(m.lxsm_vaddr + new_size),
+		    m.lxsm_size - new_size) != 0) {
+			return (set_errno(EINVAL));
+		}
+
+		lx_remap_anoncache_load(&m, new_size);
+		return (m.lxsm_vaddr);
+	}
+
+	if (m.lxsm_flags & LX_SM_SHM)
+		return (set_errno(EINVAL));
+
+	if (m.lxsm_flags & LX_SM_WRITE)
+		prot |= PROT_WRITE;
+
+	if (m.lxsm_flags & LX_SM_READ)
+		prot |= PROT_READ;
+
+	if (m.lxsm_flags & LX_SM_EXEC)
+		prot |= PROT_EXEC;
+
+	mflags |= (m.lxsm_flags & LX_SM_SHARED) ? MAP_SHARED : MAP_PRIVATE;
+
+	if (m.lxsm_flags & LX_SM_NORESERVE)
+		mflags |= MAP_NORESERVE;
+
+	/*
+	 * If we're not being told where to move it, let's try to expand our
+	 * mapping in place by adding a fixed mapping after it.
+	 */
+	if (!(flags & LX_MREMAP_FIXED)) {
+		void *tmp_addr = (void *)(m.lxsm_vaddr + m.lxsm_size);
+
+		ASSERT(new_size > m.lxsm_size);
+		addr = smmap64(tmp_addr, new_size - m.lxsm_size, prot,
+		    mflags, -1, 0);
+		if (ttolwp(curthread)->lwp_errno != 0) {
+			/* There is no place to mmap some extra anon */
+			return (set_errno(EINVAL));
+		}
+
+		if (addr == tmp_addr) {
+			/* The expansion worked */
+			lx_remap_anoncache_load(&m, new_size);
+			return (m.lxsm_vaddr);
+		}
+
+		/*
+		 * Our advisory address was not followed -- which, as a
+		 * practical matter, means that the range conflicted with an
+		 * extant mapping. Unmap wherever our attempted expansion
+		 * landed, and drop into the relocation case.
+		 */
+		(void) munmap(addr, new_size - m.lxsm_size);
+	}
+
+	lx_remap_anoncache_invalidate(m.lxsm_vaddr, m.lxsm_size);
+
+	/*
+	 * If we're here, we actually need to move this mapping -- so if we
+	 * can't move it, we're done.
+	 */
+	if (!(flags & LX_MREMAP_MAYMOVE))
+		return (set_errno(ENOMEM));
+
+	/*
+	 * If this is a shared anonymous mapping, we can't remap it.
+	 */
+	if (m.lxsm_flags & LX_SM_SHARED)
+		return (set_errno(EINVAL));
+
+	if (flags & LX_MREMAP_FIXED) {
+		mflags |= MAP_FIXED;
+		hint = (void *)new_addr;
+	} else {
+		/*
+		 * Search our address space for a gap to remap into. To give
+		 * ourselves plenty of room for further mremap() expansion,
+		 * we'll multiply our new size by 16 and look for a gap at
+		 * least that big. Historically we looked for an empty gap
+		 * around the 2GB region, so we start our search for the lowest
+		 * gap in that vicinity.
+		 */
+		caddr_t base;
+		size_t upper;
+
+		base = (caddr_t)ONE_GB;
+		upper = (uintptr_t)USERLIMIT - (uintptr_t)base;
+
+		if (as_gap(curproc->p_as, (new_size << 4UL), &base, &upper,
+		    AH_LO, NULL) != -1)
+			hint = base;
+	}
+
+	addr = smmap64(hint, new_size, prot, mflags, -1, 0);
+	if (ttolwp(curthread)->lwp_errno != 0) {
+		return (ttolwp(curthread)->lwp_errno);
+	}
+
+	if (lx_u2u_copy((void *)m.lxsm_vaddr, addr, m.lxsm_size) != 0) {
+		/* We couldn't complete the relocation, backout & fail */
+		(void) munmap(addr, new_size);
+		return (set_errno(ENOMEM));
+	}
+
+	(void) munmap((void *)m.lxsm_vaddr, m.lxsm_size);
+
+	/*
+	 * Add the relocated mapping to the cache.
+	 */
+	m.lxsm_vaddr = (uintptr_t)addr;
+	lx_remap_anoncache_load(&m, new_size);
+
+	return ((long)addr);
+}
+
+/*
+ * We don't have a native mremap() (nor do we particularly want one), so
+ * we emulate it strictly in lx. The idea is simple: we just want to
+ * mmap() the underlying object with the new size and rip down the old mapping.
+ * However, this is slightly complicated because we don't actually have the
+ * file descriptor that corresponds to the resized mapping. So to get a file
+ * descriptor, we may have to search our address space for the mapping and use
+ * the associated vnode to create a file descriptor. Assuming that this
+ * succeeds, we then mmap() it and rip down the original mapping. There are
+ * clearly many reasons why this might fail; absent a more apt errno (e.g.,
+ * ENOMEM in some cases), we return EINVAL to denote these cases.
+ */
+long
+lx_mremap(uintptr_t old_addr, size_t old_size, size_t new_size, int flags,
+    uintptr_t new_addr)
+{
+	int prot = 0, oflags, mflags = 0, i, res;
+	lx_segmap_t map, *mp;
+	int rval = 0;
+	lx_proc_data_t *lxpd;
+	offset_t off;
+	struct vnode *vp = NULL;
+	file_t *fp;
+	caddr_t naddr;
+
+	if (flags & LX_MREMAP_FIXED) {
+		/* MREMAP_FIXED requires MREMAP_MAYMOVE */
+		if ((flags & LX_MREMAP_MAYMOVE) == 0)
+			return (set_errno(EINVAL));
+
+		if (new_addr & PAGEOFFSET)
+			return (set_errno(EINVAL));
+
+		mflags |= MAP_FIXED;
+	} else {
+		if (new_size == old_size)
+			return (old_addr);
+
+		/* new_addr is optional and only valid when LX_MREMAP_FIXED. */
+		new_addr = (uintptr_t)NULL;
+	}
+
+	if (old_addr & PAGEOFFSET)
+		return (set_errno(EINVAL));
+
+	if (new_size == 0)
+		return (set_errno(EINVAL));
+
+	/*
+	 * First consult the anoncache; if we find the segment there, we'll
+	 * drop straight into lx_remap_anon() and save ourselves the pain of
+	 * searching our address space.
+	 */
+	lxpd = ptolxproc(curproc);
+	mutex_enter(&lxpd->l_remap_anoncache_lock);
+
+	for (i = 0; i < LX_REMAP_ANONCACHE_NENTRIES; i++) {
+		long rv;
+
+		mp = &lxpd->l_remap_anoncache[i];
+
+		if (mp->lxsm_vaddr != old_addr)
+			continue;
+
+		if (mp->lxsm_size != old_size)
+			continue;
+
+		/*
+		 * lx_remap_anon will either:
+		 * a) expand/contract in place, returning old_addr
+		 * b) relocate & expand the mapping, returning a new address
+		 * c) there will be an error of some sort and errno will be set
+		 */
+		rv = lx_remap_anon(mp, new_size, flags, new_addr);
+		mutex_exit(&lxpd->l_remap_anoncache_lock);
+		return (rv);
+	}
+
+	mutex_exit(&lxpd->l_remap_anoncache_lock);
+
+	/*
+	 * Search our address space to find the specified mapping.
+	 */
+	if ((res = lx_get_mapping(old_addr, old_size, &map, &vp, &off)) > 0)
+		return (set_errno(res));
+
+	/*
+	 * We found the mapping.
+	 */
+	mp = &map;
+	DTRACE_PROBE1(lx__mremap__seg, lx_segmap_t *, mp);
+
+	if (mp->lxsm_flags & LX_SM_SHM) {
+		/*
+		 * If this is either ISM or System V shared memory, we're not
+		 * going to remap it.
+		 */
+		rval = set_errno(EINVAL);
+		goto out;
+	}
+
+	if (mp->lxsm_flags & LX_SM_ANON) {
+		/*
+		 * This is an anonymous mapping -- which is the one case in
+		 * which we perform something that approaches a true remap.
+		 */
+		long rv;
+
+		if (vp != NULL)
+			VN_RELE(vp);
+		mutex_enter(&lxpd->l_remap_anoncache_lock);
+		rv = lx_remap_anon(mp, new_size, flags, new_addr);
+		mutex_exit(&lxpd->l_remap_anoncache_lock);
+		return (rv);
+	}
+
+	/* The rest of the code is for a 'named' mapping */
+
+	if (!(flags & LX_MREMAP_MAYMOVE)) {
+		/*
+		 * If we're not allowed to move this mapping, we're going to
+		 * act as if we can't expand it.
+		 */
+		rval = set_errno(ENOMEM);
+		goto out;
+	}
+
+	if (!(mp->lxsm_flags & LX_SM_SHARED)) {
+		/*
+		 * If this is a private mapping, we're not going to remap it.
+		 */
+		rval = set_errno(EINVAL);
+		goto out;
+	}
+
+	oflags = (mp->lxsm_flags & LX_SM_WRITE) ? (FWRITE | FREAD) : FREAD;
+	if (vp == NULL) {
+		/*
+		 * If vp is NULL, the path might not exist. We're going to kick
+		 * it back with EINVAL.
+		 */
+		rval = set_errno(EINVAL);
+		goto out;
+	}
+
+	/* falloc cannot fail with a NULL fdp. */
+	VERIFY0(falloc(vp, oflags, &fp, NULL));
+	mutex_exit(&fp->f_tlock);
+
+	if (mp->lxsm_flags & LX_SM_WRITE)
+		prot |= PROT_WRITE;
+
+	if (mp->lxsm_flags & LX_SM_READ)
+		prot |= PROT_READ;
+
+	if (mp->lxsm_flags & LX_SM_EXEC)
+		prot |= PROT_EXEC;
+
+	mflags |= MAP_SHARED;
+
+	/*
+	 * We're using smmap_common to pass the fp directly, instead of
+	 * initializing a temporary file descriptor for smmap64(), so as to
+	 * prevent any inadvertent use of that temporary fd within the
+	 * application.
+	 */
+	naddr = (caddr_t)new_addr;
+	rval = smmap_common(&naddr, new_size, prot, mflags, fp, off);
+
+	mutex_enter(&fp->f_tlock);
+	unfalloc(fp);
+
+	if (rval != 0) {
+		rval = set_errno(ENOMEM);
+		goto out;
+	}
+
+	/*
+	 * Our mapping succeeded; we're now going to rip down the old mapping.
+	 */
+	(void) munmap((void *)old_addr, old_size);
+
+out:
+	if (vp != NULL)
+		VN_RELE(vp);
+
+	if (rval == 0)
+		return ((long)naddr);
+	return ((long)rval);
+}
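The flag rules enforced at the top of lx_mremap() mirror Linux. A minimal user-space sketch, assuming a Linux system: MREMAP_FIXED without MREMAP_MAYMOVE is rejected with EINVAL, while MREMAP_MAYMOVE lets the kernel grow the mapping in place or relocate it.

#define _GNU_SOURCE
#include <sys/mman.h>
#include <errno.h>
#include <stdio.h>

int
main(void)
{
	size_t old_sz = 4096, new_sz = 8 * 4096;
	void *p, *q;

	p = mmap(NULL, old_sz, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return (1);

	/* MREMAP_FIXED without MREMAP_MAYMOVE is rejected outright. */
	q = mremap(p, old_sz, new_sz, MREMAP_FIXED, (void *)0x10000000);
	(void) printf("fixed-without-maymove: %s\n",
	    q == MAP_FAILED && errno == EINVAL ? "EINVAL" : "?");

	/* The kernel may grow in place or relocate the mapping. */
	q = mremap(p, old_sz, new_sz, MREMAP_MAYMOVE);
	(void) printf("grown mapping at %p (was %p)\n", q, p);
	return (q == MAP_FAILED ? 1 : 0);
}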
+
+#pragma GCC diagnostic ignored "-Wclobbered"
+/*
+ * During mremap we had to relocate the initial anonymous mapping to a new
+ * location (a new anonymous mapping). Copy the user-level data from the first
+ * mapping to the second mapping.
+ *
+ * We have to lock both sides to ensure there is no fault. We do this in 16MB
+ * chunks at a time and we do not concern ourselves with the zone's
+ * max-locked-memory rctl.
+ *
+ * Keep this function at the end since we're disabling the compiler's "clobber"
+ * check due to the on_fault call.
+ */
+static int
+lx_u2u_copy(void *src, void *dst, size_t len)
+{
+	size_t mlen;
+	caddr_t sp, dp;
+	int err;
+	page_t **ppa_src, **ppa_dst;
+	label_t ljb;
+	struct as *p_as = curproc->p_as;
+
+	/* Both sides should be page aligned since they're from smmap64 */
+	ASSERT(((uintptr_t)src & PAGEOFFSET) == 0);
+	ASSERT(((uintptr_t)dst & PAGEOFFSET) == 0);
+	/* Both came from mmap, so they should be valid user pointers */
+	ASSERT((uintptr_t)src < USERLIMIT && (uintptr_t)dst < USERLIMIT);
+
+	sp = src;
+	dp = dst;
+
+	do {
+		mlen = MIN(len, 16 * 1024 * 1024);
+
+		err = as_pagelock(p_as, &ppa_src, sp, mlen, S_READ);
+		if (err != 0) {
+			return (err);
+		}
+		err = as_pagelock(p_as, &ppa_dst, dp, mlen, S_WRITE);
+		if (err != 0) {
+			as_pageunlock(p_as, ppa_src, sp, mlen, S_READ);
+			return (err);
+		}
+
+		DTRACE_PROBE3(lx__mremap__copy, void *, sp, void *, dp,
+		    size_t, mlen);
+
+		/* on_fault calls smap_disable */
+		if (on_fault(&ljb)) {
+			/*
+			 * Given that the pages are locked and smap is disabled,
+			 * we really should never get here. If we somehow do
+			 * get here, the copy fails just as if we could not
+			 * lock the pages to begin with.
+			 */
+			as_pageunlock(p_as, ppa_dst, dp, mlen, S_WRITE);
+			as_pageunlock(p_as, ppa_src, sp, mlen, S_READ);
+			return (EFAULT);
+		}
+		ucopy(sp, dp, mlen);
+		no_fault();	/* calls smap_enable */
+
+		as_pageunlock(p_as, ppa_dst, dp, mlen, S_WRITE);
+		as_pageunlock(p_as, ppa_src, sp, mlen, S_READ);
+
+		len -= mlen;
+		sp += mlen;
+		dp += mlen;
+	} while (len > 0);
+
+	return (0);
+}
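Finally, a user-space check of the guarantee lx_u2u_copy() provides in the kernel: when mremap() has to relocate an anonymous mapping, the old contents appear intact at the new address. A minimal sketch, assuming a Linux system (whether the mapping actually moves is up to the kernel):

#define _GNU_SOURCE
#include <sys/mman.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	size_t old_sz = 4096, new_sz = 64 * 1024 * 1024;
	char *p, *q;

	p = mmap(NULL, old_sz, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return (1);
	(void) memset(p, 0x5a, old_sz);

	/* A large growth makes relocation likely (though not certain). */
	q = mremap(p, old_sz, new_sz, MREMAP_MAYMOVE);
	if (q == MAP_FAILED)
		return (1);

	(void) printf("moved=%s first byte=0x%02x\n",
	    q != p ? "yes" : "no", (unsigned char)q[0]);	/* 0x5a */
	return (0);
}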