Diffstat (limited to 'usr/src/uts/common/brand/lx/syscall/lx_mem.c')
-rw-r--r-- | usr/src/uts/common/brand/lx/syscall/lx_mem.c | 1118 |
1 files changed, 1118 insertions, 0 deletions
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_mem.c b/usr/src/uts/common/brand/lx/syscall/lx_mem.c
new file mode 100644
index 0000000000..15351444c8
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_mem.c
@@ -0,0 +1,1118 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2019 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/mman.h>
+#include <sys/debug.h>
+#include <sys/sysmacros.h>
+#include <sys/policy.h>
+#include <sys/lx_brand.h>
+#include <sys/fcntl.h>
+#include <sys/pathname.h>
+#include <vm/seg_vn.h>
+#include <vm/seg_spt.h>
+#include <sys/shm_impl.h>
+#include <vm/as.h>
+
+/* From uts/common/os/grow.c */
+extern int mprotect(caddr_t, size_t, int);
+extern caddr_t smmap64(caddr_t, size_t, int, int, int, off_t);
+extern int munmap(caddr_t, size_t);
+/* From uts/common/syscall/close.c */
+extern int close(int);
+/* From uts/common/fs/proc/prsubr.c */
+extern uint_t pr_getprot(struct seg *, int, void **, caddr_t *, caddr_t *,
+    caddr_t);
+/* From uts/common/vm/seg_spt.c */
+extern struct seg_ops segspt_shmops;
+/* From uts/common/syscall/memcntl.c */
+extern int memcntl(caddr_t, size_t, int, caddr_t, int, int);
+/* From uts/common/os/grow.c */
+extern int smmap_common(caddr_t *, size_t, int, int, struct file *, offset_t);
+
+/*
+ * After Linux 2.6.8, an unprivileged process can lock memory up to its
+ * RLIMIT_MEMLOCK resource limit.
+ *
+ * memcntl() assumes we have PRIV_PROC_LOCK_MEMORY; without that privilege,
+ * the check in secpolicy_lock_memory() will fail when we attempt to lock
+ * memory. Thus, to support the Linux semantics, we bypass memcntl() and
+ * perform the locking operations directly.
+ */
+
+#define	LX_MADV_NORMAL		0
+#define	LX_MADV_RANDOM		1
+#define	LX_MADV_SEQUENTIAL	2
+#define	LX_MADV_WILLNEED	3
+#define	LX_MADV_DONTNEED	4
+#define	LX_MADV_FREE		8
+#define	LX_MADV_REMOVE		9
+#define	LX_MADV_DONTFORK	10
+#define	LX_MADV_DOFORK		11
+#define	LX_MADV_MERGEABLE	12
+#define	LX_MADV_UNMERGEABLE	13
+#define	LX_MADV_HUGEPAGE	14
+#define	LX_MADV_NOHUGEPAGE	15
+#define	LX_MADV_DONTDUMP	16
+#define	LX_MADV_DODUMP		17
+
+#define	LX_VALID_MSYNC		(MS_ASYNC|MS_INVALIDATE|MS_SYNC)
+
+#define	LX_PROT_GROWSDOWN	0x01000000
+#define	LX_PROT_GROWSUP		0x02000000
+
+/* Internal segment map flags */
+#define	LX_SM_READ		0x01
+#define	LX_SM_WRITE		0x02
+#define	LX_SM_EXEC		0x04
+#define	LX_SM_SHM		0x08
+#define	LX_SM_ANON		0x10
+#define	LX_SM_SHARED		0x20
+#define	LX_SM_NORESERVE		0x40
+
+/* For convenience */
+#define	LX_PROT_GROWMASK	(LX_PROT_GROWSUP|LX_PROT_GROWSDOWN)
+
+/* From lx_rlimit.c */
+extern void lx_get_rctl(char *, struct rlimit64 *);
+
+static int
+lx_mlock_common(int op, uintptr_t addr, size_t len)
+{
+	int err;
+	struct as *as = curproc->p_as;
+	const uintptr_t align_addr = addr & (uintptr_t)PAGEMASK;
+	const size_t align_len = P2ROUNDUP(len + (addr & PAGEOFFSET), PAGESIZE);
+
+	if (len == 0) {
+		/* Linux short-circuits to success on zero length */
+		return (0);
+	} else if ((align_addr + align_len) <= align_addr) {
+		/* Catch overflow (including when aligning len) */
+		return (set_errno(EINVAL));
+	}
+
+	err = as_ctl(as, (caddr_t)align_addr, align_len, op, 0, 0, NULL, 0);
+	if (err == EAGAIN)
+		err = ENOMEM;
+	return (err == 0 ? 0 : set_errno(err));
+}
+
+int
+lx_mlock(uintptr_t addr, size_t len)
+{
+	int err;
+
+	/*
+	 * If the caller is not privileged and either the limit is 0, or
+	 * the kernel version is earlier than 2.6.9, then fail with EPERM. See
+	 * LTP mlock2.c.
+	 */
+	if ((err = secpolicy_lock_memory(CRED())) != 0) {
+		struct rlimit64 rlim64;
+
+		lx_get_rctl("process.max-locked-memory", &rlim64);
+		if (rlim64.rlim_cur == 0 ||
+		    lx_kern_release_cmp(curzone, "2.6.9") < 0)
+			return (set_errno(err));
+	}
+
+	return (lx_mlock_common(MC_LOCK, addr, len));
+}
+
+int
+lx_munlock(uintptr_t addr, size_t len)
+{
+	return (lx_mlock_common(MC_UNLOCK, addr, len));
+}
+
+int
+lx_mlockall(int flags)
+{
+	int err;
+	struct as *as = curproc->p_as;
+
+	/*
+	 * If the caller is not privileged and either the limit is 0, or
+	 * the kernel version is earlier than 2.6.9, then fail with EPERM. See
+	 * LTP mlockall2.c.
+	 */
+	if ((err = secpolicy_lock_memory(CRED())) != 0) {
+		struct rlimit64 rlim64;
+
+		lx_get_rctl("process.max-locked-memory", &rlim64);
+		if (rlim64.rlim_cur == 0 ||
+		    lx_kern_release_cmp(curzone, "2.6.9") < 0)
+			return (set_errno(err));
+	}
+
+	if ((flags & ~(MCL_FUTURE | MCL_CURRENT)) || flags == 0)
+		return (set_errno(EINVAL));
+
+	err = as_ctl(as, 0, 0, MC_LOCKAS, 0, (uintptr_t)flags, NULL, 0);
+	if (err == EAGAIN)
+		err = ENOMEM;
+	return (err == 0 ? 0 : set_errno(err));
+}
+
+int
+lx_munlockall(void)
+{
+	int err;
+	struct as *as = curproc->p_as;
+
+	if (lx_kern_release_cmp(curzone, "2.6.9") < 0) {
+		if ((err = secpolicy_lock_memory(CRED())) != 0)
+			return (set_errno(err));
+	}
+
+	err = as_ctl(as, 0, 0, MC_UNLOCKAS, 0, 0, NULL, 0);
+	return (err == 0 ? 0 : set_errno(err));
+}
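The version- and rlimit-gated EPERM behavior above can be observed from user space. A minimal sketch, assuming a Linux system (or an lx zone) and an unprivileged process; everything here uses only standard interfaces:

#include <sys/mman.h>
#include <sys/resource.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
	struct rlimit rl;
	long pgsz = sysconf(_SC_PAGESIZE);
	void *buf;

	if (getrlimit(RLIMIT_MEMLOCK, &rl) != 0) {
		perror("getrlimit");
		return (1);
	}
	(void) printf("RLIMIT_MEMLOCK cur=%llu\n",
	    (unsigned long long)rl.rlim_cur);

	if ((buf = malloc(pgsz)) == NULL)
		return (1);

	/*
	 * On kernels >= 2.6.9 this succeeds for an unprivileged process
	 * as long as rlim_cur is nonzero; with rlim_cur == 0 and no
	 * privilege it fails with EPERM, as described in the comment above.
	 */
	if (mlock(buf, pgsz) != 0)
		perror("mlock");
	else
		(void) munlock(buf, pgsz);

	free(buf);
	return (0);
}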
+
+int
+lx_msync(uintptr_t addr, size_t len, int flags)
+{
+	const size_t align_len = P2ROUNDUP(len, PAGESIZE);
+
+	if ((addr & PAGEOFFSET) != 0 ||
+	    (flags & ~LX_VALID_MSYNC) != 0) {
+		return (set_errno(EINVAL));
+	} else if (len == 0) {
+		/* Linux short-circuits to success on zero length */
+		return (0);
+	} else if ((addr + align_len) < addr) {
+		/* Catch overflow (including when aligning len) */
+		return (set_errno(ENOMEM));
+	}
+
+	return (memcntl((caddr_t)addr, align_len, MC_SYNC,
+	    (caddr_t)(uintptr_t)flags, 0, 0));
+}
+
+int
+lx_madvise(uintptr_t addr, size_t len, int advice)
+{
+	int err;
+	const size_t align_len = P2ROUNDUP(len, PAGESIZE);
+
+	switch (advice) {
+	case LX_MADV_REMOVE:
+		/* approximately similar */
+		advice = MADV_FREE;
+		break;
+
+	case LX_MADV_DONTNEED:
+		/*
+		 * On Linux, MADV_DONTNEED implies an immediate purge of the
+		 * specified region. This is spuriously different from
+		 * (nearly) every other Unix, having apparently been done to
+		 * mimic the semantics on Digital Unix (!). This is bad enough
+		 * (MADV_FREE both has better semantics and results in better
+		 * performance), but it gets worse: Linux applications (and
+		 * notably, jemalloc) have managed to depend on the busted
+		 * semantics of MADV_DONTNEED on Linux. We implement these
+		 * semantics via MADV_PURGE -- and we translate our advice
+		 * accordingly.
+		 */
+		advice = MADV_PURGE;
+		break;
+
+	case LX_MADV_FREE:
+		advice = MADV_FREE;
+		break;
+
+	case LX_MADV_NORMAL:
+	case LX_MADV_RANDOM:
+	case LX_MADV_SEQUENTIAL:
+	case LX_MADV_WILLNEED:
+		/* These map directly to the illumos values */
+		break;
+
+	case LX_MADV_DONTFORK:
+	case LX_MADV_DOFORK:
+	case LX_MADV_HUGEPAGE:
+	case LX_MADV_NOHUGEPAGE:
+	case LX_MADV_DONTDUMP:
+	case LX_MADV_DODUMP:
+		/* harmless to pretend these work */
+		return (0);
+	default:
+		return (set_errno(EINVAL));
+	}
+
+	if ((addr & PAGEOFFSET) != 0) {
+		return (set_errno(EINVAL));
+	} else if (len == 0) {
+		/* Linux short-circuits to success on zero length */
+		return (0);
+	} else if ((addr + align_len) <= addr) {
+		/*
+		 * Catch overflow (including when aligning len). Unlike
+		 * similar syscalls, this is an EINVAL failure for madvise(2).
+		 */
+		return (set_errno(EINVAL));
+	}
+
+	err = memcntl((caddr_t)addr, align_len, MC_ADVISE,
+	    (caddr_t)(intptr_t)advice, 0, 0);
+	if (err == EBUSY) {
+		if (advice != MADV_PURGE) {
+			return (set_errno(EINVAL));
+		}
+		/*
+		 * If we received an EBUSY from a MADV_PURGE, we will now try
+		 * again with a MADV_DONTNEED: there are conditions (namely,
+		 * with locked mappings that haven't yet been faulted in) where
+		 * MADV_PURGE will fail but MADV_DONTNEED will succeed. If
+		 * this succeeds, we'll call the operation a success; if not,
+		 * we'll kick back EINVAL.
+		 */
+		advice = MADV_DONTNEED;
+		err = memcntl((caddr_t)addr, align_len, MC_ADVISE,
+		    (caddr_t)(intptr_t)advice, 0, 0);
+		if (err != 0) {
+			return (set_errno(EINVAL));
+		}
+		/* Clear the old errno since success was eventually achieved. */
+		ttolwp(curthread)->lwp_errno = 0;
+	}
+	return (err);
+}
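The MADV_DONTNEED purge semantics described above (and emulated here via MADV_PURGE) are easy to demonstrate from user space. A minimal sketch, assuming a Linux system: after the advice, the next touch of a private anonymous page reads back zero-filled memory rather than the old contents.

#include <sys/mman.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	long pgsz = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, pgsz, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return (1);

	(void) memset(p, 0xa5, pgsz);
	(void) printf("before: 0x%02x\n", (unsigned char)p[0]);	/* 0xa5 */

	/* On Linux this immediately discards the pages. */
	if (madvise(p, pgsz, MADV_DONTNEED) != 0)
		return (1);

	(void) printf("after:  0x%02x\n", (unsigned char)p[0]);	/* 0x00 */
	return (0);
}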
+
+int
+lx_mprotect(uintptr_t addr, size_t len, int prot)
+{
+	const size_t align_len = P2ROUNDUP(len, PAGESIZE);
+
+	/*
+	 * The flags for native mprotect(2) are essentially the same as those
+	 * on Linux, with the exception of PROT_GROWSUP/PROT_GROWSDOWN, for
+	 * which there is no native analog. Those flags are presently ignored,
+	 * unless they are both present, which represents an invalid argument.
+	 */
+	if ((prot & LX_PROT_GROWMASK) == LX_PROT_GROWMASK) {
+		return (set_errno(EINVAL));
+	}
+	prot &= ~(LX_PROT_GROWMASK);
+
+	if ((addr & PAGEOFFSET) != 0) {
+		return (set_errno(EINVAL));
+	} else if (len == 0) {
+		/* Linux short-circuits to success on zero length */
+		return (0);
+	} else if ((addr + align_len) <= addr) {
+		/* Catch overflow (including when aligning len) */
+		return (set_errno(ENOMEM));
+	}
+
+	return (mprotect((void *)addr, align_len, prot));
+}
+
+/*
+ * There are two forms of mmap, mmap() and mmap2(). The only difference is that
+ * the final argument to mmap2() specifies the file offset in pages, not bytes.
+ * Also, mmap2 is 32-bit only.
+ *
+ * Linux has a number of additional flags, but they are all deprecated. We also
+ * ignore the MAP_GROWSDOWN flag, which has no equivalent on Solaris.
+ *
+ * The Linux mmap() returns ENOMEM in some cases where illumos returns
+ * EOVERFLOW, so we translate the errno as necessary.
+ */
+
+#define	LX_MAP_ANONYMOUS	0x00020
+#define	LX_MAP_LOCKED		0x02000
+#define	LX_MAP_NORESERVE	0x04000
+#define	LX_MAP_32BIT		0x00040
+
+#define	ONE_GB			0x40000000
+
+static void lx_remap_anoncache_invalidate(uintptr_t, size_t);
+
+static int
+lx_ltos_mmap_flags(int flags)
+{
+	int new_flags;
+
+	new_flags = flags & (MAP_TYPE | MAP_FIXED);
+
+	if (flags & LX_MAP_ANONYMOUS)
+		new_flags |= MAP_ANONYMOUS;
+	if (flags & LX_MAP_NORESERVE)
+		new_flags |= MAP_NORESERVE;
+
+#if defined(_LP64)
+	if (flags & LX_MAP_32BIT)
+		new_flags |= MAP_32BIT;
+#endif
+
+	return (new_flags);
+}
+
+static void *
+lx_mmap_common(void *addr, size_t len, int prot, int flags, int fd, off64_t off)
+{
+	caddr_t ret;
+	lx_proc_data_t *lxpd = ptolxproc(curproc);
+
+	/*
+	 * Under Linux, the file descriptor is ignored when mapping zfod
+	 * anonymous memory. On illumos, we want the fd set to -1 for the
+	 * same functionality.
+	 */
+	if (flags & LX_MAP_ANONYMOUS)
+		fd = -1;
+
+	/*
+	 * We refuse, as a matter of principle, to overcommit memory.
+	 * Unfortunately, several bits of important and popular software expect
+	 * to be able to pre-allocate large amounts of virtual memory but then
+	 * probably never use it. One particularly bad example of this
+	 * practice is golang. Another is the JVM.
+	 *
+	 * In the interest of running software, unsafe or not, we fudge
+	 * something vaguely similar to overcommit by permanently enabling
+	 * MAP_NORESERVE unless MAP_LOCKED was requested:
+	 */
+	if (!(flags & LX_MAP_LOCKED)) {
+		flags |= LX_MAP_NORESERVE;
+	}
+
+	/*
+	 * This is totally insane. The NOTES section in the linux mmap(2) man
+	 * page claims that on some architectures, read protection may
+	 * automatically include exec protection. It has been observed on a
+	 * native linux system that the /proc/<pid>/maps file does indeed
+	 * show that segments mmap'd from userland (such as libraries mapped in
+	 * by the dynamic linker) all have the exec permission set, even for
+	 * data segments.
+	 *
+	 * This insanity is tempered by the fact that the behavior is disabled
+	 * for ELF binaries bearing a PT_GNU_STACK header which lacks PF_X
+	 * (which most do). Such a header will clear the READ_IMPLIES_EXEC
+	 * flag from the process personality.
+	 */
+	if (prot & PROT_READ) {
+		if ((lxpd->l_personality & LX_PER_READ_IMPLIES_EXEC) != 0) {
+			prot |= PROT_EXEC;
+		}
+	}
+
+	ret = smmap64(addr, len, prot, lx_ltos_mmap_flags(flags), fd, off);
+	if (ttolwp(curthread)->lwp_errno != 0) {
+		if (ttolwp(curthread)->lwp_errno == EOVERFLOW)
+			(void) set_errno(ENOMEM);
+		return ((void *)-1);
+	}
+
+	if (flags & LX_MAP_LOCKED) {
+		(void) lx_mlock_common(MC_LOCK, (uintptr_t)ret, len);
+		/* clear any errno from mlock */
+		ttolwp(curthread)->lwp_errno = 0;
+	}
+
+	/*
+	 * We have a new mapping; invalidate any cached anonymous regions that
+	 * overlap(ped) with it.
+	 */
+	mutex_enter(&lxpd->l_remap_anoncache_lock);
+	lx_remap_anoncache_invalidate((uintptr_t)ret, len);
+	mutex_exit(&lxpd->l_remap_anoncache_lock);
+
+	return (ret);
+}
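The READ_IMPLIES_EXEC behavior handled at the end of lx_mmap_common() can be reproduced on native Linux. A minimal sketch, assuming the personality(2) interface: once the flag is set, a PROT_READ mapping shows up in /proc/self/maps as r-xp rather than r--p.

#include <sys/mman.h>
#include <sys/personality.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
	long pgsz = sysconf(_SC_PAGESIZE);
	char cmd[64];
	void *p;

	/* Turn on READ_IMPLIES_EXEC for subsequent mappings. */
	(void) personality(personality(0xffffffff) | READ_IMPLIES_EXEC);

	p = mmap(NULL, pgsz, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return (1);

	/* The new segment is listed with the exec bit set. */
	(void) snprintf(cmd, sizeof (cmd), "grep %lx /proc/self/maps",
	    (unsigned long)p);
	return (system(cmd));
}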
+
+long
+lx_mmap(void *addr, size_t len, int prot, int flags, int fd, off64_t off)
+{
+	return ((ssize_t)lx_mmap_common(addr, len, prot, flags, fd, off));
+}
+
+long
+lx_mmap2(void *addr, size_t len, int prot, int flags,
+    int fd, off_t off)
+{
+	return ((ssize_t)lx_mmap_common(addr, len, prot, flags, fd,
+	    (off64_t)off * PAGESIZE));
+}
+
+long
+lx_munmap(void *addr, size_t len)
+{
+	lx_proc_data_t *lxpd = ptolxproc(curproc);
+
+	/*
+	 * Invalidate any cached anonymous regions that overlap(ped) with it.
+	 */
+	mutex_enter(&lxpd->l_remap_anoncache_lock);
+	lx_remap_anoncache_invalidate((uintptr_t)addr, len);
+	mutex_exit(&lxpd->l_remap_anoncache_lock);
+
+	return (munmap(addr, len));
+}
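A sketch of the page-unit offset conversion performed by lx_mmap2() above, assuming a 32-bit (i386) Linux build and a pre-existing scratch file f.dat (both assumptions, purely for illustration): the raw mmap2 syscall takes its final argument in 4096-byte pages, which is exactly what the (off64_t)off * PAGESIZE conversion undoes.

#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
#if defined(__i386__)
#include <sys/syscall.h>

int
main(void)
{
	int fd = open("f.dat", O_RDONLY);	/* hypothetical scratch file */
	void *p;

	if (fd < 0)
		return (1);

	/* The final argument is 1, i.e. a byte offset of 1 * 4096 = 4096. */
	p = (void *)syscall(SYS_mmap2, NULL, 4096, PROT_READ,
	    MAP_PRIVATE, fd, 1);
	return (p == MAP_FAILED ? 1 : 0);
}
#else
int
main(void)
{
	return (0);	/* mmap2 is 32-bit only */
}
#endif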
+
+#define	LX_MREMAP_MAYMOVE	1	/* mapping can be moved */
+#define	LX_MREMAP_FIXED		2	/* address is fixed */
+
+/*
+ * Unfortunately, the Linux mremap() manpage contains a statement that is, at
+ * best, grossly oversimplified: that mremap() "can be used to implement a
+ * very efficient realloc(3)." To the degree this is true at all, it is only
+ * true narrowly (namely, when large buffers are being expanded but can't be
+ * expanded in place due to virtual address space restrictions) -- but
+ * apparently, someone took this very literally, because variants of glibc
+ * appear to simply implement realloc() in terms of mremap(). This is
+ * unfortunate because absent intelligent usage, it forces realloc() to have
+ * an unnecessary interaction with the VM system for small expansions -- and if
+ * realloc() is itself abused (e.g., if a consumer repeatedly expands and
+ * contracts the same memory buffer), the net result can be less efficient
+ * than a much more naive realloc() implementation. And if native Linux is
+ * suboptimal in this case, we are deeply pathological, having not
+ * historically supported mremap() for anonymous mappings at all. To make
+ * this at least palatable, we not only support remap for anonymous mappings
+ * (see lx_remap_anon(), below), we also cache the metadata associated with
+ * these anonymous remappings to reduce the need to search our address space.
+ * We implement the anonymous metadata cache with l_remap_anoncache, an LRU
+ * cache of lx_segmap_t's that correspond to anonymous segments that have been
+ * resized (only anonymous mappings that have been remapped are cached). The
+ * cache is part of the process's lx-brand-specific data.
+ */
+
+/*
+ * Search our address space (as) mappings to find the specified mapping. This
+ * is derived from the procfs prgetmap() code. We implement the "reserved"
+ * behavior on the segment so as to accommodate the case where an mmap()'d and
+ * then ftruncate()'d file is being mremap()'d: we use the size of the
+ * mapping (which we need to validate old_size).
+ *
+ * Return 0 if the mapping is found, or an errno if there is a problem or the
+ * mapping is not found. If the mapping is found, we populate the mp, vpp and
+ * offp parameters with the results.
+ */
+static int
+lx_get_mapping(uintptr_t find_addr, size_t find_size, lx_segmap_t *mp,
+    vnode_t **vpp, offset_t *offp)
+{
+	struct as *as = curproc->p_as;
+	struct seg *seg;
+	uint_t prot;
+	caddr_t saddr, eaddr, naddr;
+
+	/* pr_getprot asserts that the as is held as a writer */
+	AS_LOCK_ENTER(as, RW_WRITER);
+
+	seg = as_segat(as, (caddr_t)find_addr);
+	if (seg == NULL || (seg->s_flags & S_HOLE) != 0) {
+		AS_LOCK_EXIT(as);
+		return (EFAULT);
+	}
+
+	/*
+	 * We're interested in the reserved space, so we use the size of the
+	 * segment itself.
+	 */
+	eaddr = seg->s_base + seg->s_size;
+	for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) {
+		uintptr_t vaddr;
+		size_t size;
+		struct vnode *vp;
+		void *tmp = NULL;
+
+		prot = pr_getprot(seg, 1, &tmp, &saddr, &naddr, eaddr);
+		if (saddr == naddr)
+			continue;
+
+		vaddr = (uintptr_t)saddr;
+		size = (uintptr_t)naddr - (uintptr_t)saddr;
+
+		if (vaddr == find_addr && find_size < size &&
+		    (find_size & PAGEOFFSET) != 0) {
+			/*
+			 * We found a mapping but the size being requested is
+			 * less than the mapping and not a multiple of our page
+			 * size. If it is an anonymous mapping, that likely
+			 * means the application did the initial mmap with this
+			 * odd size. We'll round up to the next page boundary
+			 * in this case.
+			 */
+			if (seg->s_ops == &segspt_shmops ||
+			    (seg->s_ops == &segvn_ops &&
+			    (SEGOP_GETVP(seg, saddr, &vp) != 0 ||
+			    vp == NULL))) {
+				/*
+				 * It's anonymous, round up the size.
+				 */
+				find_size = ptob(btopr(find_size));
+			}
+		}
+
+		/* Check if mapping matches our arguments */
+		if (vaddr == find_addr && size == find_size) {
+			struct vattr vattr;
+
+			mp->lxsm_vaddr = vaddr;
+			mp->lxsm_size = size;
+			mp->lxsm_flags = 0;
+
+			*offp = SEGOP_GETOFFSET(seg, saddr);
+
+			if (prot & PROT_READ)
+				mp->lxsm_flags |= LX_SM_READ;
+			if (prot & PROT_WRITE)
+				mp->lxsm_flags |= LX_SM_WRITE;
+			if (prot & PROT_EXEC)
+				mp->lxsm_flags |= LX_SM_EXEC;
+			if (SEGOP_GETTYPE(seg, saddr) & MAP_SHARED)
+				mp->lxsm_flags |= LX_SM_SHARED;
+			if (SEGOP_GETTYPE(seg, saddr) & MAP_NORESERVE)
+				mp->lxsm_flags |= LX_SM_NORESERVE;
+			if (seg->s_ops == &segspt_shmops ||
+			    (seg->s_ops == &segvn_ops &&
+			    (SEGOP_GETVP(seg, saddr, &vp) != 0 ||
+			    vp == NULL)))
+				mp->lxsm_flags |= LX_SM_ANON;
+
+			if (seg->s_ops == &segspt_shmops) {
+				mp->lxsm_flags |= LX_SM_SHM;
+			} else if ((mp->lxsm_flags & LX_SM_SHARED) &&
+			    curproc->p_segacct && shmgetid(curproc,
+			    seg->s_base) != SHMID_NONE) {
+				mp->lxsm_flags |= LX_SM_SHM;
+			}
+
+			vattr.va_mask = AT_FSID | AT_NODEID;
+			if (seg->s_ops == &segvn_ops &&
+			    SEGOP_GETVP(seg, saddr, &vp) == 0 &&
+			    vp != NULL && vp->v_type == VREG &&
+			    VOP_GETATTR(vp, &vattr, 0, CRED(),
+			    NULL) == 0) {
+				VN_HOLD(vp);
+				*vpp = vp;
+			} else {
+				*vpp = NULL;
+			}
+
+			AS_LOCK_EXIT(as);
+			return (0);
+		}
+
+		if (vaddr <= find_addr &&
+		    find_addr + find_size < vaddr + size) {
+			/*
+			 * We have a mismatch, but our specified range is a
+			 * subset of the actual segment; this is EINVAL.
+			 */
+			AS_LOCK_EXIT(as);
+			DTRACE_PROBE2(lx__mremap__badsubset, caddr_t,
+			    vaddr, size_t, size);
+			return (EINVAL);
+		}
+	}
+
+	AS_LOCK_EXIT(as);
+	return (EFAULT);
+}
+
+static void
+lx_remap_anoncache_invalidate(uintptr_t addr, size_t size)
+{
+	lx_proc_data_t *lxpd = ptolxproc(curproc);
+	uint_t i;
+
+	ASSERT(MUTEX_HELD(&lxpd->l_remap_anoncache_lock));
+
+	if (lxpd->l_remap_anoncache_generation == 0)
+		return;
+
+	for (i = 0; i < LX_REMAP_ANONCACHE_NENTRIES; i++) {
+		lx_segmap_t *map = &lxpd->l_remap_anoncache[i];
+
+		/*
+		 * If the ranges overlap at all, we zap it.
+		 */
+		if (addr < map->lxsm_vaddr + map->lxsm_size &&
+		    map->lxsm_vaddr < addr + size) {
+			bzero(map, sizeof (lx_segmap_t));
+		}
+	}
+}
+
+static void
+lx_remap_anoncache_load(lx_segmap_t *map, size_t size)
+{
+	lx_proc_data_t *lxpd = ptolxproc(curproc);
+	uint64_t oldest = UINT64_MAX;
+	lx_segmap_t *evict = NULL;
+	uint_t i;
+
+	ASSERT(MUTEX_HELD(&lxpd->l_remap_anoncache_lock));
+
+	for (i = 0; i < LX_REMAP_ANONCACHE_NENTRIES; i++) {
+		lx_segmap_t *cp = &lxpd->l_remap_anoncache[i];
+
+		if (cp->lxsm_vaddr == map->lxsm_vaddr) {
+			/*
+			 * We're already in the cache -- we just need to update
+			 * our LRU field and size to reflect the hit.
+			 */
+			cp->lxsm_lru = lxpd->l_remap_anoncache_generation++;
+			cp->lxsm_size = size;
+			return;
+		}
+
+		if (cp->lxsm_vaddr == 0) {
+			evict = cp;
+			break;
+		}
+
+		if (cp->lxsm_lru < oldest) {
+			oldest = cp->lxsm_lru;
+			evict = cp;
+		}
+	}
+
+	/* Update the entry we're evicting */
+	ASSERT(evict != NULL);
+	evict->lxsm_vaddr = map->lxsm_vaddr;
+	evict->lxsm_size = size;
+	evict->lxsm_flags = map->lxsm_flags;
+	evict->lxsm_lru = lxpd->l_remap_anoncache_generation++;
+}
+
+static int lx_u2u_copy(void *, void *, size_t);
+
+/*
+ * As part of lx_mremap() (see below) and to accommodate heavy realloc() use
+ * cases (see the discussion of the l_remap_anoncache, above), we allow
+ * anonymous segments to be "remapped" in that we are willing to truncate them
+ * or append to them (as much as that's allowed by virtual address space
+ * usage). If we fall out of these cases, we take the more expensive option
+ * of actually copying the data to a new segment -- but we locate the address
+ * in a portion of the address space that should give us plenty of VA space to
+ * expand.
+ *
+ * We return the address of the mapping or set errno if there is a problem.
+ */
+static long
+lx_remap_anon(lx_segmap_t *mapin, size_t new_size, uint_t flags,
+    uintptr_t new_addr)
+{
+	lx_segmap_t m;
+	int mflags = MAP_ANON;
+	int prot = 0;
+	void *addr, *hint = NULL;
+
+	ASSERT(MUTEX_HELD(&ptolxproc(curproc)->l_remap_anoncache_lock));
+
+	/*
+	 * Make a copy of the input lx_segmap_t argument since it might be
+	 * a reference into the anon cache, and we're manipulating cache
+	 * entries during this function.
+	 */
+	m = *mapin;
+
+	/*
+	 * If our new size is less than our old size and we're either not
+	 * being ordered to move it or the address we're being ordered to
+	 * move it to is our current address, we can just act as Procrustes
+	 * and chop off anything larger than the new size.
+	 */
+	if (new_size < m.lxsm_size && (!(flags & LX_MREMAP_FIXED) ||
+	    new_addr == m.lxsm_vaddr)) {
+		if (munmap((void *)(m.lxsm_vaddr + new_size),
+		    m.lxsm_size - new_size) != 0) {
+			return (set_errno(EINVAL));
+		}
+
+		lx_remap_anoncache_load(&m, new_size);
+		return (m.lxsm_vaddr);
+	}
+
+	if (m.lxsm_flags & LX_SM_SHM)
+		return (set_errno(EINVAL));
+
+	if (m.lxsm_flags & LX_SM_WRITE)
+		prot |= PROT_WRITE;
+
+	if (m.lxsm_flags & LX_SM_READ)
+		prot |= PROT_READ;
+
+	if (m.lxsm_flags & LX_SM_EXEC)
+		prot |= PROT_EXEC;
+
+	mflags |= (m.lxsm_flags & LX_SM_SHARED) ? MAP_SHARED : MAP_PRIVATE;
+
+	if (m.lxsm_flags & LX_SM_NORESERVE)
+		mflags |= MAP_NORESERVE;
+
+	/*
+	 * If we're not being told where to move it, let's try to expand our
+	 * mapping in place by adding a fixed mapping after it.
+	 */
+	if (!(flags & LX_MREMAP_FIXED)) {
+		void *tmp_addr = (void *)(m.lxsm_vaddr + m.lxsm_size);
+
+		ASSERT(new_size > m.lxsm_size);
+		addr = smmap64(tmp_addr, new_size - m.lxsm_size, prot,
+		    mflags, -1, 0);
+		if (ttolwp(curthread)->lwp_errno != 0) {
+			/* There is no place to mmap some extra anon */
+			return (set_errno(EINVAL));
+		}
+
+		if (addr == tmp_addr) {
+			/* The expansion worked */
+			lx_remap_anoncache_load(&m, new_size);
+			return (m.lxsm_vaddr);
+		}
+
+		/*
+		 * Our advisory address was not followed -- which, as a
+		 * practical matter, means that the range conflicted with an
+		 * extant mapping. Unmap wherever our attempted expansion
+		 * landed, and drop into the relocation case.
+		 */
+		(void) munmap(addr, new_size - m.lxsm_size);
+	}
+
+	lx_remap_anoncache_invalidate(m.lxsm_vaddr, m.lxsm_size);
+
+	/*
+	 * If we're here, we actually need to move this mapping -- so if we
+	 * can't move it, we're done.
+	 */
+	if (!(flags & LX_MREMAP_MAYMOVE))
+		return (set_errno(ENOMEM));
+
+	/*
+	 * If this is a shared anonymous mapping, we can't remap it.
+	 */
+	if (m.lxsm_flags & LX_SM_SHARED)
+		return (set_errno(EINVAL));
+
+	if (flags & LX_MREMAP_FIXED) {
+		mflags |= MAP_FIXED;
+		hint = (void *)new_addr;
+	} else {
+		/*
+		 * Search our address space for a gap to remap into. To give
+		 * ourselves plenty of room for further mremap() expansion,
+		 * we'll multiply our new size by 16 and look for a gap at
+		 * least that big. Historically we looked for an empty gap
+		 * around the 2GB region, so we start our search for the lowest
+		 * gap in that vicinity.
+		 */
+		caddr_t base;
+		size_t upper;
+
+		base = (caddr_t)ONE_GB;
+		upper = (uintptr_t)USERLIMIT - (uintptr_t)base;
+
+		if (as_gap(curproc->p_as, (new_size << 4UL), &base, &upper,
+		    AH_LO, NULL) != -1)
+			hint = base;
+	}
+
+	addr = smmap64(hint, new_size, prot, mflags, -1, 0);
+	if (ttolwp(curthread)->lwp_errno != 0) {
+		return (ttolwp(curthread)->lwp_errno);
+	}
+
+	if (lx_u2u_copy((void *)m.lxsm_vaddr, addr, m.lxsm_size) != 0) {
+		/* We couldn't complete the relocation, backout & fail */
+		(void) munmap(addr, new_size);
+		return (set_errno(ENOMEM));
+	}
+
+	(void) munmap((void *)m.lxsm_vaddr, m.lxsm_size);
+
+	/*
+	 * Add the relocated mapping to the cache.
+	 */
+	m.lxsm_vaddr = (uintptr_t)addr;
+	lx_remap_anoncache_load(&m, new_size);
+
+	return ((long)addr);
+}
+
+/*
+ * We don't have a native mremap() (nor do we particularly want one), so
+ * we emulate it strictly in lx. The idea is simple: we just want to
+ * mmap() the underlying object with the new size and rip down the old mapping.
+ * However, this is slightly complicated because we don't actually have the
+ * file descriptor that corresponds to the resized mapping. So to get a file
+ * descriptor, we may have to search our address space for the mapping and use
+ * the associated vnode to create a file descriptor. Assuming that this
+ * succeeds, we then mmap() it and rip down the original mapping. There are
+ * clearly many reasons why this might fail; absent a more apt errno (e.g.,
+ * ENOMEM in some cases), we return EINVAL to denote these cases.
+ */
+long
+lx_mremap(uintptr_t old_addr, size_t old_size, size_t new_size, int flags,
+    uintptr_t new_addr)
+{
+	int prot = 0, oflags, mflags = 0, i, res;
+	lx_segmap_t map, *mp;
+	int rval = 0;
+	lx_proc_data_t *lxpd;
+	offset_t off;
+	struct vnode *vp = NULL;
+	file_t *fp;
+	caddr_t naddr;
+
+	if (flags & LX_MREMAP_FIXED) {
+		/* MREMAP_FIXED requires MREMAP_MAYMOVE */
+		if ((flags & LX_MREMAP_MAYMOVE) == 0)
+			return (set_errno(EINVAL));
+
+		if (new_addr & PAGEOFFSET)
+			return (set_errno(EINVAL));
+
+		mflags |= MAP_FIXED;
+	} else {
+		if (new_size == old_size)
+			return (old_addr);
+
+		/* new_addr is optional and only valid when LX_MREMAP_FIXED. */
+		new_addr = (uintptr_t)NULL;
+	}
+
+	if (old_addr & PAGEOFFSET)
+		return (set_errno(EINVAL));
+
+	if (new_size == 0)
+		return (set_errno(EINVAL));
+
+	/*
+	 * First consult the anoncache; if we find the segment there, we'll
+	 * drop straight into lx_remap_anon() and save ourselves the pain of
+	 * searching our address space.
+	 */
+	lxpd = ptolxproc(curproc);
+	mutex_enter(&lxpd->l_remap_anoncache_lock);
+
+	for (i = 0; i < LX_REMAP_ANONCACHE_NENTRIES; i++) {
+		long rv;
+
+		mp = &lxpd->l_remap_anoncache[i];
+
+		if (mp->lxsm_vaddr != old_addr)
+			continue;
+
+		if (mp->lxsm_size != old_size)
+			continue;
+
+		/*
+		 * lx_remap_anon will either:
+		 * a) expand/contract in place, returning old_addr
+		 * b) relocate & expand the mapping, returning a new address
+		 * c) there will be an error of some sort and errno will be set
+		 */
+		rv = lx_remap_anon(mp, new_size, flags, new_addr);
+		mutex_exit(&lxpd->l_remap_anoncache_lock);
+		return (rv);
+	}
+
+	mutex_exit(&lxpd->l_remap_anoncache_lock);
+
+	/*
+	 * Search our address space to find the specified mapping.
+	 */
+	if ((res = lx_get_mapping(old_addr, old_size, &map, &vp, &off)) > 0)
+		return (set_errno(res));
+
+	/*
+	 * We found the mapping.
+	 */
+	mp = &map;
+	DTRACE_PROBE1(lx__mremap__seg, lx_segmap_t *, mp);
+
+	if (mp->lxsm_flags & LX_SM_SHM) {
+		/*
+		 * If this is either ISM or System V shared memory, we're not
+		 * going to remap it.
+		 */
+		rval = set_errno(EINVAL);
+		goto out;
+	}
+
+	if (mp->lxsm_flags & LX_SM_ANON) {
+		/*
+		 * This is an anonymous mapping -- which is the one case in
+		 * which we perform something that approaches a true remap.
+		 */
+		long rv;
+
+		if (vp != NULL)
+			VN_RELE(vp);
+		mutex_enter(&lxpd->l_remap_anoncache_lock);
+		rv = lx_remap_anon(mp, new_size, flags, new_addr);
+		mutex_exit(&lxpd->l_remap_anoncache_lock);
+		return (rv);
+	}
+
+	/* The rest of the code is for a 'named' mapping */
+
+	if (!(flags & LX_MREMAP_MAYMOVE)) {
+		/*
+		 * If we're not allowed to move this mapping, we're going to
+		 * act as if we can't expand it.
+		 */
+		rval = set_errno(ENOMEM);
+		goto out;
+	}
+
+	if (!(mp->lxsm_flags & LX_SM_SHARED)) {
+		/*
+		 * If this is a private mapping, we're not going to remap it.
+		 */
+		rval = set_errno(EINVAL);
+		goto out;
+	}
+
+	oflags = (mp->lxsm_flags & LX_SM_WRITE) ? (FWRITE | FREAD) : FREAD;
+	if (vp == NULL) {
+		/*
+		 * If vp is NULL, the path might not exist. We're going to kick
+		 * it back with EINVAL.
+		 */
+		rval = set_errno(EINVAL);
+		goto out;
+	}
+
+	/* falloc cannot fail with a NULL fdp. */
+	VERIFY0(falloc(vp, oflags, &fp, NULL));
+	mutex_exit(&fp->f_tlock);
+
+	if (mp->lxsm_flags & LX_SM_WRITE)
+		prot |= PROT_WRITE;
+
+	if (mp->lxsm_flags & LX_SM_READ)
+		prot |= PROT_READ;
+
+	if (mp->lxsm_flags & LX_SM_EXEC)
+		prot |= PROT_EXEC;
+
+	mflags |= MAP_SHARED;
+
+	/*
+	 * We're using smmap_common to pass the fp directly, instead of
+	 * initializing a temporary file descriptor for smmap64(), so as to
+	 * prevent any inadvertent use of that temporary fd within the
+	 * application.
+	 */
+	naddr = (caddr_t)new_addr;
+	rval = smmap_common(&naddr, new_size, prot, mflags, fp, off);
+
+	mutex_enter(&fp->f_tlock);
+	unfalloc(fp);
+
+	if (rval != 0) {
+		rval = set_errno(ENOMEM);
+		goto out;
+	}
+
+	/*
+	 * Our mapping succeeded; we're now going to rip down the old mapping.
+	 */
+	(void) munmap((void *)old_addr, old_size);
+
+out:
+	if (vp != NULL)
+		VN_RELE(vp);
+
+	if (rval == 0)
+		return ((long)naddr);
+	return ((long)rval);
+}
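The flag rules enforced at the top of lx_mremap() mirror Linux. A minimal user-space sketch, assuming a Linux system: MREMAP_FIXED without MREMAP_MAYMOVE is rejected with EINVAL, while MREMAP_MAYMOVE lets the kernel grow the mapping in place or relocate it.

#define _GNU_SOURCE
#include <sys/mman.h>
#include <errno.h>
#include <stdio.h>

int
main(void)
{
	size_t old_sz = 4096, new_sz = 8 * 4096;
	void *p, *q;

	p = mmap(NULL, old_sz, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return (1);

	/* MREMAP_FIXED without MREMAP_MAYMOVE is rejected outright. */
	q = mremap(p, old_sz, new_sz, MREMAP_FIXED, (void *)0x10000000);
	(void) printf("fixed-without-maymove: %s\n",
	    q == MAP_FAILED && errno == EINVAL ? "EINVAL" : "?");

	/* The kernel may grow in place or relocate the mapping. */
	q = mremap(p, old_sz, new_sz, MREMAP_MAYMOVE);
	(void) printf("grown mapping at %p (was %p)\n", q, p);
	return (q == MAP_FAILED ? 1 : 0);
}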
+
+#pragma GCC diagnostic ignored "-Wclobbered"
+/*
+ * During mremap we had to relocate the initial anonymous mapping to a new
+ * location (a new anonymous mapping). Copy the user-level data from the first
+ * mapping to the second mapping.
+ *
+ * We have to lock both sides to ensure there is no fault. We do this in 16MB
+ * chunks at a time and we do not concern ourselves with the zone's
+ * max-locked-memory rctl.
+ *
+ * Keep this function at the end since we're disabling the compiler's "clobber"
+ * check due to the on_fault call.
+ */
+static int
+lx_u2u_copy(void *src, void *dst, size_t len)
+{
+	size_t mlen;
+	caddr_t sp, dp;
+	int err;
+	page_t **ppa_src, **ppa_dst;
+	label_t ljb;
+	struct as *p_as = curproc->p_as;
+
+	/* Both sides should be page aligned since they're from smmap64 */
+	ASSERT(((uintptr_t)src & PAGEOFFSET) == 0);
+	ASSERT(((uintptr_t)dst & PAGEOFFSET) == 0);
+	/* Both came from mmap, so they should be valid user pointers */
+	ASSERT((uintptr_t)src < USERLIMIT && (uintptr_t)dst < USERLIMIT);
+
+	sp = src;
+	dp = dst;
+
+	do {
+		mlen = MIN(len, 16 * 1024 * 1024);
+
+		err = as_pagelock(p_as, &ppa_src, sp, mlen, S_READ);
+		if (err != 0) {
+			return (err);
+		}
+		err = as_pagelock(p_as, &ppa_dst, dp, mlen, S_WRITE);
+		if (err != 0) {
+			as_pageunlock(p_as, ppa_src, sp, mlen, S_READ);
+			return (err);
+		}
+
+		DTRACE_PROBE3(lx__mremap__copy, void *, sp, void *, dp,
+		    size_t, mlen);
+
+		/* on_fault calls smap_disable */
+		if (on_fault(&ljb)) {
+			/*
+			 * Given that the pages are locked and smap is disabled,
+			 * we really should never get here. If we somehow do
+			 * get here, the copy fails just as if we could not
+			 * lock the pages to begin with.
+			 */
+			as_pageunlock(p_as, ppa_dst, dp, mlen, S_WRITE);
+			as_pageunlock(p_as, ppa_src, sp, mlen, S_READ);
+			return (EFAULT);
+		}
+		ucopy(sp, dp, mlen);
+		no_fault();	/* calls smap_enable */
+
+		as_pageunlock(p_as, ppa_dst, dp, mlen, S_WRITE);
+		as_pageunlock(p_as, ppa_src, sp, mlen, S_READ);
+
+		len -= mlen;
+		sp += mlen;
+		dp += mlen;
+	} while (len > 0);
+
+	return (0);
+}
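Finally, a user-space check of the guarantee lx_u2u_copy() provides in the kernel: when mremap() has to relocate an anonymous mapping, the old contents appear intact at the new address. A minimal sketch, assuming a Linux system (whether the mapping actually moves is up to the kernel):

#define _GNU_SOURCE
#include <sys/mman.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	size_t old_sz = 4096, new_sz = 64 * 1024 * 1024;
	char *p, *q;

	p = mmap(NULL, old_sz, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return (1);
	(void) memset(p, 0x5a, old_sz);

	/* A large growth makes relocation likely (though not certain). */
	q = mremap(p, old_sz, new_sz, MREMAP_MAYMOVE);
	if (q == MAP_FAILED)
		return (1);

	(void) printf("moved=%s first byte=0x%02x\n",
	    q != p ? "yes" : "no", (unsigned char)q[0]);	/* 0x5a */
	return (0);
}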