diff options
Diffstat (limited to 'usr/src/uts/common/syscall')
-rw-r--r-- | usr/src/uts/common/syscall/brandsys.c | 8 | ||||
-rw-r--r-- | usr/src/uts/common/syscall/chdir.c | 29 | ||||
-rw-r--r-- | usr/src/uts/common/syscall/fcntl.c | 3 | ||||
-rw-r--r-- | usr/src/uts/common/syscall/memcntl.c | 8 | ||||
-rw-r--r-- | usr/src/uts/common/syscall/open.c | 8 | ||||
-rw-r--r-- | usr/src/uts/common/syscall/poll.c | 358 | ||||
-rw-r--r-- | usr/src/uts/common/syscall/rusagesys.c | 14 | ||||
-rw-r--r-- | usr/src/uts/common/syscall/rw.c | 375 | ||||
-rw-r--r-- | usr/src/uts/common/syscall/sendfile.c | 19 | ||||
-rw-r--r-- | usr/src/uts/common/syscall/stat.c | 2 | ||||
-rw-r--r-- | usr/src/uts/common/syscall/sysconfig.c | 27 | ||||
-rw-r--r-- | usr/src/uts/common/syscall/uadmin.c | 10 | ||||
-rw-r--r-- | usr/src/uts/common/syscall/umount.c | 11 |
13 files changed, 533 insertions, 339 deletions
diff --git a/usr/src/uts/common/syscall/brandsys.c b/usr/src/uts/common/syscall/brandsys.c index 9b4bd38baa..245ef9f14f 100644 --- a/usr/src/uts/common/syscall/brandsys.c +++ b/usr/src/uts/common/syscall/brandsys.c @@ -23,7 +23,9 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright 2016 Joyent, Inc. + */ #include <sys/brand.h> #include <sys/systm.h> @@ -35,7 +37,7 @@ */ int64_t brandsys(int cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, - uintptr_t arg4, uintptr_t arg5, uintptr_t arg6) + uintptr_t arg4) { struct proc *p = curthread->t_procp; int64_t rval = 0; @@ -49,7 +51,7 @@ brandsys(int cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, return (set_errno(ENOSYS)); if ((err = ZBROP(p->p_zone)->b_brandsys(cmd, &rval, arg1, arg2, arg3, - arg4, arg5, arg6)) != 0) + arg4)) != 0) return (set_errno(err)); return (rval); diff --git a/usr/src/uts/common/syscall/chdir.c b/usr/src/uts/common/syscall/chdir.c index 84c924f570..deb5532b50 100644 --- a/usr/src/uts/common/syscall/chdir.c +++ b/usr/src/uts/common/syscall/chdir.c @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -62,7 +63,7 @@ /* * Change current working directory ("."). */ -static int chdirec(vnode_t *, int ischroot, int do_traverse); +static int chdirec(vnode_t *, boolean_t ischroot, boolean_t do_traverse); int chdir(char *fname) @@ -78,7 +79,7 @@ lookup: return (set_errno(error)); } - error = chdirec(vp, 0, 1); + error = chdirec(vp, B_FALSE, B_TRUE); if (error) { if ((error == ESTALE) && fs_need_estale_retry(estale_retry++)) goto lookup; @@ -102,7 +103,7 @@ fchdir(int fd) vp = fp->f_vnode; VN_HOLD(vp); releasef(fd); - error = chdirec(vp, 0, 0); + error = chdirec(vp, B_FALSE, B_FALSE); if (error) return (set_errno(error)); return (0); @@ -125,7 +126,7 @@ lookup: return (set_errno(error)); } - error = chdirec(vp, 1, 1); + error = chdirec(vp, B_TRUE, B_TRUE); if (error) { if ((error == ESTALE) && fs_need_estale_retry(estale_retry++)) goto lookup; @@ -152,18 +153,18 @@ fchroot(int fd) vp = fp->f_vnode; VN_HOLD(vp); releasef(fd); - error = chdirec(vp, 1, 0); + error = chdirec(vp, B_TRUE, B_FALSE); if (error) return (set_errno(error)); return (0); } static int -chdirec(vnode_t *vp, int ischroot, int do_traverse) +chdirec_common(proc_t *pp, vnode_t *vp, boolean_t ischroot, + boolean_t do_traverse) { int error; vnode_t *oldvp; - proc_t *pp = curproc; vnode_t **vpp; refstr_t *cwd; int newcwd = 1; @@ -194,7 +195,7 @@ chdirec(vnode_t *vp, int ischroot, int do_traverse) if (ischroot) { struct vattr tattr; struct vattr rattr; - vnode_t *zonevp = curproc->p_zone->zone_rootvp; + vnode_t *zonevp = pp->p_zone->zone_rootvp; tattr.va_mask = AT_FSID|AT_NODEID; if (error = VOP_GETATTR(vp, &tattr, 0, CRED(), NULL)) @@ -243,3 +244,15 @@ bad: VN_RELE(vp); return (error); } + +int +chdir_proc(proc_t *pp, vnode_t *vp, boolean_t ischroot, boolean_t do_traverse) +{ + return (chdirec_common(pp, vp, ischroot, do_traverse)); +} + +static int +chdirec(vnode_t *vp, boolean_t ischroot, boolean_t do_traverse) +{ + return (chdirec_common(curproc, vp, ischroot, do_traverse)); +} diff --git a/usr/src/uts/common/syscall/fcntl.c b/usr/src/uts/common/syscall/fcntl.c index e9a97da97c..14e3b7018e 100644 --- a/usr/src/uts/common/syscall/fcntl.c +++ b/usr/src/uts/common/syscall/fcntl.c @@ -54,7 +54,8 @@ #include <sys/cmn_err.h> -static int flock_check(vnode_t *, flock64_t *, offset_t, offset_t); +/* This is global so that it can be used by brand emulation. */ +int flock_check(vnode_t *, flock64_t *, offset_t, offset_t); static int flock_get_start(vnode_t *, flock64_t *, offset_t, u_offset_t *); static void fd_too_big(proc_t *); diff --git a/usr/src/uts/common/syscall/memcntl.c b/usr/src/uts/common/syscall/memcntl.c index 1ee4b6a395..721f884a7e 100644 --- a/usr/src/uts/common/syscall/memcntl.c +++ b/usr/src/uts/common/syscall/memcntl.c @@ -115,13 +115,17 @@ memcntl(caddr_t addr, size_t len, int cmd, caddr_t arg, int attr, int mask) * MS_SYNC used to be defined to be zero but is now non-zero. * For binary compatibility we still accept zero * (the absence of MS_ASYNC) to mean the same thing. + * Binary compatibility is not an issue for MS_INVALCURPROC. */ iarg = (uintptr_t)arg; if ((iarg & ~MS_INVALIDATE) == 0) iarg |= MS_SYNC; - if (((iarg & ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE)) != 0) || - ((iarg & (MS_SYNC|MS_ASYNC)) == (MS_SYNC|MS_ASYNC))) { + if (((iarg & + ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE|MS_INVALCURPROC)) != 0) || + ((iarg & (MS_SYNC|MS_ASYNC)) == (MS_SYNC|MS_ASYNC)) || + ((iarg & (MS_INVALIDATE|MS_INVALCURPROC)) == + (MS_INVALIDATE|MS_INVALCURPROC))) { error = set_errno(EINVAL); } else { error = as_ctl(as, addr, len, cmd, attr, iarg, NULL, 0); diff --git a/usr/src/uts/common/syscall/open.c b/usr/src/uts/common/syscall/open.c index edb04c824b..874e31869c 100644 --- a/usr/src/uts/common/syscall/open.c +++ b/usr/src/uts/common/syscall/open.c @@ -74,12 +74,12 @@ copen(int startfd, char *fname, int filemode, int createmode) if (filemode & (FSEARCH|FEXEC)) { /* - * Must be one or the other and neither FREAD nor FWRITE + * Must be one or the other. * Must not be any of FAPPEND FCREAT FTRUNC FXATTR FXATTRDIROPEN - * XXX: Should these just be silently ignored? + * XXX: Should these just be silently ignored like we + * silently ignore FREAD|FWRITE? */ - if ((filemode & (FREAD|FWRITE)) || - (filemode & (FSEARCH|FEXEC)) == (FSEARCH|FEXEC) || + if ((filemode & (FSEARCH|FEXEC)) == (FSEARCH|FEXEC) || (filemode & (FAPPEND|FCREAT|FTRUNC|FXATTR|FXATTRDIROPEN))) return (set_errno(EINVAL)); } diff --git a/usr/src/uts/common/syscall/poll.c b/usr/src/uts/common/syscall/poll.c index 6ffef9e3ec..2eb50323a0 100644 --- a/usr/src/uts/common/syscall/poll.c +++ b/usr/src/uts/common/syscall/poll.c @@ -29,7 +29,7 @@ /* * Copyright (c) 2012, 2016 by Delphix. All rights reserved. - * Copyright 2015, Joyent, Inc. + * Copyright (c) 2017, Joyent, Inc. */ /* @@ -317,20 +317,57 @@ polllock(pollhead_t *php, kmutex_t *lp) return (0); } -static int -poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) +int +poll_copyin(pollstate_t *ps, pollfd_t *fds, nfds_t nfds) +{ + pollfd_t *pollfdp; + nfds_t old_nfds; + + /* + * NOTE: for performance, buffers are saved across poll() calls. + * The theory is that if a process polls heavily, it tends to poll + * on the same set of descriptors. Therefore, we only reallocate + * buffers when nfds changes. There is no hysteresis control, + * because there is no data to suggest that this is necessary; + * the penalty of reallocating is not *that* great in any event. + */ + old_nfds = ps->ps_nfds; + if (nfds != old_nfds) { + kmem_free(ps->ps_pollfd, old_nfds * sizeof (pollfd_t)); + pollfdp = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP); + ps->ps_pollfd = pollfdp; + ps->ps_nfds = nfds; + } + + pollfdp = ps->ps_pollfd; + if (copyin(fds, pollfdp, nfds * sizeof (pollfd_t))) { + return (EFAULT); + } + + if (fds == NULL) { + /* + * If the process has page 0 mapped, then the copyin() above + * will succeed even if fds is NULL. However, our cached + * poll lists are keyed by the address of the passed-in fds + * structure, and we use the value NULL to indicate an unused + * poll cache list entry. As such, we elect not to support + * NULL as a valid (user) memory address and fail the poll() + * call. + */ + return (EFAULT); + } + return (0); +} + +int +poll_common(pollstate_t *ps, pollfd_t *fds, nfds_t nfds, timespec_t *tsp, + int *fdcnt) { kthread_t *t = curthread; - klwp_t *lwp = ttolwp(t); - proc_t *p = ttoproc(t); - int fdcnt = 0; - int i; hrtime_t deadline; /* hrtime value when we want to return */ pollfd_t *pollfdp; - pollstate_t *ps; pollcache_t *pcp; int error = 0; - nfds_t old_nfds; int cacheindex = 0; /* which cache set is used */ /* @@ -340,33 +377,34 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) deadline = -1; } else if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) { deadline = 0; + } else if (tsp->tv_sec >= HRTIME_MAX/NANOSEC) { + /* Use an indefinite timeout if tv_sec would cause overflow */ + deadline = -1; } else { + /* + * The above check, when combined with the protections offered + * by itimerspecfix (ensuring that neither field is negative + * and that tv_nsec represents less than a whole second), will + * prevent overflow during the conversion from timespec_t to + * uhrtime_t. + */ + uhrtime_t utime = tsp->tv_sec * NANOSEC; + utime += tsp->tv_nsec; + /* They must wait at least a tick. */ - deadline = ((hrtime_t)tsp->tv_sec * NANOSEC) + tsp->tv_nsec; - deadline = MAX(deadline, nsec_per_tick); - deadline += gethrtime(); - } + utime = MAX(utime, nsec_per_tick); - /* - * Reset our signal mask, if requested. - */ - if (ksetp != NULL) { - mutex_enter(&p->p_lock); - schedctl_finish_sigblock(t); - lwp->lwp_sigoldmask = t->t_hold; - t->t_hold = *ksetp; - t->t_flag |= T_TOMASK; /* - * Call cv_reltimedwait_sig() just to check for signals. - * We will return immediately with either 0 or -1. + * Since utime has an upper bound of HRTIME_MAX, adding the + * gethrtime() result cannot incur an overflow as the unsigned + * type has an adequate bound. */ - if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0, - TR_CLOCK_TICK)) { - mutex_exit(&p->p_lock); - error = EINTR; - goto pollout; + utime += (uhrtime_t)gethrtime(); + if (utime > HRTIME_MAX) { + deadline = -1; + } else { + deadline = (hrtime_t)utime; } - mutex_exit(&p->p_lock); } /* @@ -374,6 +412,7 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) * If yes then bypass all the other stuff and make it sleep. */ if (nfds == 0) { + *fdcnt = 0; /* * Sleep until we have passed the requested future * time or until interrupted by a signal. @@ -385,66 +424,14 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) &t->t_delay_lock, deadline)) > 0) continue; mutex_exit(&t->t_delay_lock); - error = (error == 0) ? EINTR : 0; + return ((error == 0) ? EINTR : 0); } - goto pollout; - } - - if (nfds > p->p_fno_ctl) { - mutex_enter(&p->p_lock); - (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE], - p->p_rctls, p, RCA_SAFE); - mutex_exit(&p->p_lock); - error = EINVAL; - goto pollout; - } - - /* - * Need to allocate memory for pollstate before anything because - * the mutex and cv are created in this space - */ - ps = pollstate_create(); - - if (ps->ps_pcache == NULL) - ps->ps_pcache = pcache_alloc(); - pcp = ps->ps_pcache; - - /* - * NOTE: for performance, buffers are saved across poll() calls. - * The theory is that if a process polls heavily, it tends to poll - * on the same set of descriptors. Therefore, we only reallocate - * buffers when nfds changes. There is no hysteresis control, - * because there is no data to suggest that this is necessary; - * the penalty of reallocating is not *that* great in any event. - */ - old_nfds = ps->ps_nfds; - if (nfds != old_nfds) { - - kmem_free(ps->ps_pollfd, old_nfds * sizeof (pollfd_t)); - pollfdp = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP); - ps->ps_pollfd = pollfdp; - ps->ps_nfds = nfds; + return (0); } + VERIFY(ps != NULL); pollfdp = ps->ps_pollfd; - if (copyin(fds, pollfdp, nfds * sizeof (pollfd_t))) { - error = EFAULT; - goto pollout; - } - - if (fds == NULL) { - /* - * If the process has page 0 mapped, then the copyin() above - * will succeed even if fds is NULL. However, our cached - * poll lists are keyed by the address of the passed-in fds - * structure, and we use the value NULL to indicate an unused - * poll cache list entry. As such, we elect not to support - * NULL as a valid (user) memory address and fail the poll() - * call. - */ - error = EINVAL; - goto pollout; - } + VERIFY(pollfdp != NULL); /* * If this thread polls for the first time, allocate ALL poll @@ -460,10 +447,10 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) /* * poll and cache this poll fd list in ps_pcacheset[0]. */ - error = pcacheset_cache_list(ps, fds, &fdcnt, cacheindex); - if (fdcnt || error) { + error = pcacheset_cache_list(ps, fds, fdcnt, cacheindex); + if (error || *fdcnt) { mutex_exit(&ps->ps_lock); - goto pollout; + return (error); } } else { pollcacheset_t *pcset = ps->ps_pcacheset; @@ -488,11 +475,11 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) * the callee will guarantee the consistency * of cached poll list and cache content. */ - error = pcacheset_resolve(ps, nfds, &fdcnt, + error = pcacheset_resolve(ps, nfds, fdcnt, cacheindex); if (error) { mutex_exit(&ps->ps_lock); - goto pollout; + return (error); } break; } @@ -509,11 +496,11 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) * found an unused entry. Use it to cache * this poll list. */ - error = pcacheset_cache_list(ps, fds, &fdcnt, + error = pcacheset_cache_list(ps, fds, fdcnt, cacheindex); - if (fdcnt || error) { + if (error || *fdcnt) { mutex_exit(&ps->ps_lock); - goto pollout; + return (error); } break; } @@ -527,10 +514,10 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) cacheindex = pcacheset_replace(ps); ASSERT(cacheindex < ps->ps_nsets); pcset[cacheindex].pcs_usradr = (uintptr_t)fds; - error = pcacheset_resolve(ps, nfds, &fdcnt, cacheindex); + error = pcacheset_resolve(ps, nfds, fdcnt, cacheindex); if (error) { mutex_exit(&ps->ps_lock); - goto pollout; + return (error); } } } @@ -548,8 +535,8 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) mutex_enter(&pcp->pc_lock); for (;;) { pcp->pc_flag = 0; - error = pcache_poll(pollfdp, ps, nfds, &fdcnt, cacheindex); - if (fdcnt || error) { + error = pcache_poll(pollfdp, ps, nfds, fdcnt, cacheindex); + if (error || *fdcnt) { mutex_exit(&pcp->pc_lock); mutex_exit(&ps->ps_lock); break; @@ -595,13 +582,116 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) mutex_enter(&pcp->pc_lock); } + return (error); +} + +/* + * This is the system call trap that poll(), + * select() and pselect() are built upon. + * It is a private interface between libc and the kernel. + */ +int +pollsys(pollfd_t *fds, nfds_t nfds, timespec_t *timeoutp, sigset_t *setp) +{ + kthread_t *t = curthread; + klwp_t *lwp = ttolwp(t); + proc_t *p = ttoproc(t); + timespec_t ts; + timespec_t *tsp; + k_sigset_t kset; + pollstate_t *ps = NULL; + pollfd_t *pollfdp = NULL; + int error = 0, fdcnt = 0; + + /* + * Copy in timeout + */ + if (timeoutp == NULL) { + tsp = NULL; + } else { + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyin(timeoutp, &ts, sizeof (ts))) + return (set_errno(EFAULT)); + } else { + timespec32_t ts32; + + if (copyin(timeoutp, &ts32, sizeof (ts32))) + return (set_errno(EFAULT)); + TIMESPEC32_TO_TIMESPEC(&ts, &ts32) + } + + if (itimerspecfix(&ts)) + return (set_errno(EINVAL)); + tsp = &ts; + } + + /* + * Copy in and reset signal mask, if requested. + */ + if (setp != NULL) { + sigset_t set; + + if (copyin(setp, &set, sizeof (set))) + return (set_errno(EFAULT)); + sigutok(&set, &kset); + + mutex_enter(&p->p_lock); + schedctl_finish_sigblock(t); + lwp->lwp_sigoldmask = t->t_hold; + t->t_hold = kset; + t->t_flag |= T_TOMASK; + /* + * Call cv_reltimedwait_sig() just to check for signals. + * We will return immediately with either 0 or -1. + */ + if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0, + TR_CLOCK_TICK)) { + mutex_exit(&p->p_lock); + error = EINTR; + goto pollout; + } + mutex_exit(&p->p_lock); + } + + /* + * Initialize pollstate and copy in pollfd data if present. + * If nfds == 0, we will skip all of the copying and check steps and + * proceed directly into poll_common to process the supplied timeout. + */ + if (nfds != 0) { + if (nfds > p->p_fno_ctl) { + mutex_enter(&p->p_lock); + (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE], + p->p_rctls, p, RCA_SAFE); + mutex_exit(&p->p_lock); + error = EINVAL; + goto pollout; + } + + /* + * Need to allocate memory for pollstate before anything + * because the mutex and cv are created in this space + */ + ps = pollstate_create(); + if (ps->ps_pcache == NULL) + ps->ps_pcache = pcache_alloc(); + + if ((error = poll_copyin(ps, fds, nfds)) != 0) + goto pollout; + pollfdp = ps->ps_pollfd; + } + + /* + * Perform the actual poll. + */ + error = poll_common(ps, fds, nfds, tsp, &fdcnt); + pollout: /* - * If we changed the signal mask but we received - * no signal then restore the signal mask. - * Otherwise psig() will deal with the signal mask. + * If we changed the signal mask but we received no signal then restore + * the signal mask. Otherwise psig() will deal with the signal mask. */ - if (ksetp != NULL) { + if (setp != NULL) { mutex_enter(&p->p_lock); if (lwp->lwp_cursig == 0) { t->t_hold = lwp->lwp_sigoldmask; @@ -612,12 +702,10 @@ pollout: if (error) return (set_errno(error)); - /* * Copy out the events and return the fdcnt to the user. */ - if (nfds != 0 && - copyout(pollfdp, fds, nfds * sizeof (pollfd_t))) + if (nfds != 0 && copyout(pollfdp, fds, nfds * sizeof (pollfd_t))) return (set_errno(EFAULT)); #ifdef DEBUG @@ -625,7 +713,7 @@ pollout: * Another sanity check: */ if (fdcnt) { - int reventcnt = 0; + int i, reventcnt = 0; for (i = 0; i < nfds; i++) { if (pollfdp[i].fd < 0) { @@ -638,6 +726,8 @@ pollout: } ASSERT(fdcnt == reventcnt); } else { + int i; + for (i = 0; i < nfds; i++) { ASSERT(pollfdp[i].revents == 0); } @@ -648,52 +738,6 @@ pollout: } /* - * This is the system call trap that poll(), - * select() and pselect() are built upon. - * It is a private interface between libc and the kernel. - */ -int -pollsys(pollfd_t *fds, nfds_t nfds, timespec_t *timeoutp, sigset_t *setp) -{ - timespec_t ts; - timespec_t *tsp; - sigset_t set; - k_sigset_t kset; - k_sigset_t *ksetp; - model_t datamodel = get_udatamodel(); - - if (timeoutp == NULL) - tsp = NULL; - else { - if (datamodel == DATAMODEL_NATIVE) { - if (copyin(timeoutp, &ts, sizeof (ts))) - return (set_errno(EFAULT)); - } else { - timespec32_t ts32; - - if (copyin(timeoutp, &ts32, sizeof (ts32))) - return (set_errno(EFAULT)); - TIMESPEC32_TO_TIMESPEC(&ts, &ts32) - } - - if (itimerspecfix(&ts)) - return (set_errno(EINVAL)); - tsp = &ts; - } - - if (setp == NULL) - ksetp = NULL; - else { - if (copyin(setp, &set, sizeof (set))) - return (set_errno(EFAULT)); - sigutok(&set, &kset); - ksetp = &kset; - } - - return (poll_common(fds, nfds, tsp, ksetp)); -} - -/* * Clean up any state left around by poll(2). Called when a thread exits. */ void @@ -1277,8 +1321,8 @@ pcache_insert(pollstate_t *ps, file_t *fp, pollfd_t *pollfdp, int *fdcntp, * be OK too. */ ASSERT(curthread->t_pollcache == NULL); - error = VOP_POLL(fp->f_vnode, pollfdp->events, 0, &pollfdp->revents, - &memphp, NULL); + error = VOP_POLL(fp->f_vnode, pollfdp->events | ps->ps_implicit_ev, 0, + &pollfdp->revents, &memphp, NULL); if (error) { return (error); } @@ -1992,7 +2036,8 @@ retry: * flag. */ ASSERT(curthread->t_pollcache == NULL); - error = VOP_POLL(fp->f_vnode, pollfdp[entry].events, 0, + error = VOP_POLL(fp->f_vnode, + pollfdp[entry].events | ps->ps_implicit_ev, 0, &pollfdp[entry].revents, &php, NULL); /* * releasef after completely done with this cached @@ -2291,6 +2336,7 @@ pollstate_create() } else { ASSERT(ps->ps_depth == 0); ASSERT(ps->ps_flags == 0); + ASSERT(ps->ps_implicit_ev == 0); ASSERT(ps->ps_pc_stack[0] == 0); } return (ps); @@ -3025,7 +3071,7 @@ plist_chkdupfd(file_t *fp, polldat_t *pdp, pollstate_t *psp, pollfd_t *pollfdp, php = NULL; ASSERT(curthread->t_pollcache == NULL); error = VOP_POLL(fp->f_vnode, - pollfdp[i].events, 0, + pollfdp[i].events | psp->ps_implicit_ev, 0, &pollfdp[i].revents, &php, NULL); if (error) { return (error); diff --git a/usr/src/uts/common/syscall/rusagesys.c b/usr/src/uts/common/syscall/rusagesys.c index 3e0e63f4c0..417c629168 100644 --- a/usr/src/uts/common/syscall/rusagesys.c +++ b/usr/src/uts/common/syscall/rusagesys.c @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2014 Joyent, Inc. All rights reserved. */ /* @@ -257,6 +258,19 @@ rusagesys(int code, void *arg1, void *arg2, void *arg3, void *arg4) case _RUSAGESYS_GETVMUSAGE: return (vm_getusage((uint_t)(uintptr_t)arg1, (time_t)arg2, (vmusage_t *)arg3, (size_t *)arg4, 0)); + case _RUSAGESYS_INVALMAP: + /* + * SPARC sfmmu hat does not support HAT_CURPROC_PGUNLOAD + * handling so callers on SPARC should get simple sync + * handling with invalidation to all processes. + */ +#if defined(__sparc) + return (memcntl((caddr_t)arg2, (size_t)arg3, MC_SYNC, + (caddr_t)(MS_ASYNC | MS_INVALIDATE), 0, 0)); +#else + return (vm_map_inval((pid_t)(uintptr_t)arg1, (caddr_t)arg2, + (size_t)arg3)); +#endif default: return (set_errno(EINVAL)); } diff --git a/usr/src/uts/common/syscall/rw.c b/usr/src/uts/common/syscall/rw.c index a28894b2c9..23f03e841d 100644 --- a/usr/src/uts/common/syscall/rw.c +++ b/usr/src/uts/common/syscall/rw.c @@ -22,7 +22,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright (c) 2015, Joyent, Inc. All rights reserved. + * Copyright 2017, Joyent, Inc. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -50,6 +50,7 @@ #include <sys/debug.h> #include <sys/rctl.h> #include <sys/nbmlock.h> +#include <sys/limits.h> #define COPYOUT_MAX_CACHE (1<<17) /* 128K */ @@ -607,19 +608,12 @@ out: return (bcount); } -/* - * XXX -- The SVID refers to IOV_MAX, but doesn't define it. Grrrr.... - * XXX -- However, SVVS expects readv() and writev() to fail if - * XXX -- iovcnt > 16 (yes, it's hard-coded in the SVVS source), - * XXX -- so I guess that's the "interface". - */ -#define DEF_IOV_MAX 16 - ssize_t readv(int fdes, struct iovec *iovp, int iovcnt) { struct uio auio; - struct iovec aiov[DEF_IOV_MAX]; + struct iovec buf[IOV_MAX_STACK], *aiov = buf; + int aiovlen = 0; file_t *fp; register vnode_t *vp; struct cpu *cp; @@ -630,9 +624,14 @@ readv(int fdes, struct iovec *iovp, int iovcnt) u_offset_t fileoff; int in_crit = 0; - if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX) + if (iovcnt <= 0 || iovcnt > IOV_MAX) return (set_errno(EINVAL)); + if (iovcnt > IOV_MAX_STACK) { + aiovlen = iovcnt * sizeof (iovec_t); + aiov = kmem_alloc(aiovlen, KM_SLEEP); + } + #ifdef _SYSCALL32_IMPL /* * 32-bit callers need to have their iovec expanded, @@ -640,36 +639,63 @@ readv(int fdes, struct iovec *iovp, int iovcnt) * of data in a single call. */ if (get_udatamodel() == DATAMODEL_ILP32) { - struct iovec32 aiov32[DEF_IOV_MAX]; + struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32; + int aiov32len; ssize32_t count32; - if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32))) + aiov32len = iovcnt * sizeof (iovec32_t); + if (aiovlen != 0) + aiov32 = kmem_alloc(aiov32len, KM_SLEEP); + + if (copyin(iovp, aiov32, aiov32len)) { + if (aiovlen != 0) { + kmem_free(aiov32, aiov32len); + kmem_free(aiov, aiovlen); + } return (set_errno(EFAULT)); + } count32 = 0; for (i = 0; i < iovcnt; i++) { ssize32_t iovlen32 = aiov32[i].iov_len; count32 += iovlen32; - if (iovlen32 < 0 || count32 < 0) + if (iovlen32 < 0 || count32 < 0) { + if (aiovlen != 0) { + kmem_free(aiov32, aiov32len); + kmem_free(aiov, aiovlen); + } return (set_errno(EINVAL)); + } aiov[i].iov_len = iovlen32; aiov[i].iov_base = (caddr_t)(uintptr_t)aiov32[i].iov_base; } + + if (aiovlen != 0) + kmem_free(aiov32, aiov32len); } else #endif - if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec))) + if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EFAULT)); + } count = 0; for (i = 0; i < iovcnt; i++) { ssize_t iovlen = aiov[i].iov_len; count += iovlen; - if (iovlen < 0 || count < 0) + if (iovlen < 0 || count < 0) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EINVAL)); + } } - if ((fp = getf(fdes)) == NULL) + if ((fp = getf(fdes)) == NULL) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EBADF)); + } if (((fflag = fp->f_flag) & FREAD) == 0) { error = EBADF; goto out; @@ -768,6 +794,8 @@ out: if (in_crit) nbl_end_crit(vp); releasef(fdes); + if (aiovlen != 0) + kmem_free(aiov, aiovlen); if (error) return (set_errno(error)); return (count); @@ -777,7 +805,8 @@ ssize_t writev(int fdes, struct iovec *iovp, int iovcnt) { struct uio auio; - struct iovec aiov[DEF_IOV_MAX]; + struct iovec buf[IOV_MAX_STACK], *aiov = buf; + int aiovlen = 0; file_t *fp; register vnode_t *vp; struct cpu *cp; @@ -788,9 +817,14 @@ writev(int fdes, struct iovec *iovp, int iovcnt) u_offset_t fileoff; int in_crit = 0; - if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX) + if (iovcnt <= 0 || iovcnt > IOV_MAX) return (set_errno(EINVAL)); + if (iovcnt > IOV_MAX_STACK) { + aiovlen = iovcnt * sizeof (iovec_t); + aiov = kmem_alloc(aiovlen, KM_SLEEP); + } + #ifdef _SYSCALL32_IMPL /* * 32-bit callers need to have their iovec expanded, @@ -798,36 +832,62 @@ writev(int fdes, struct iovec *iovp, int iovcnt) * of data in a single call. */ if (get_udatamodel() == DATAMODEL_ILP32) { - struct iovec32 aiov32[DEF_IOV_MAX]; + struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32; + int aiov32len; ssize32_t count32; - if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32))) + aiov32len = iovcnt * sizeof (iovec32_t); + if (aiovlen != 0) + aiov32 = kmem_alloc(aiov32len, KM_SLEEP); + + if (copyin(iovp, aiov32, aiov32len)) { + if (aiovlen != 0) { + kmem_free(aiov32, aiov32len); + kmem_free(aiov, aiovlen); + } return (set_errno(EFAULT)); + } count32 = 0; for (i = 0; i < iovcnt; i++) { ssize32_t iovlen = aiov32[i].iov_len; count32 += iovlen; - if (iovlen < 0 || count32 < 0) + if (iovlen < 0 || count32 < 0) { + if (aiovlen != 0) { + kmem_free(aiov32, aiov32len); + kmem_free(aiov, aiovlen); + } return (set_errno(EINVAL)); + } aiov[i].iov_len = iovlen; aiov[i].iov_base = (caddr_t)(uintptr_t)aiov32[i].iov_base; } + if (aiovlen != 0) + kmem_free(aiov32, aiov32len); } else #endif - if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec))) + if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EFAULT)); + } count = 0; for (i = 0; i < iovcnt; i++) { ssize_t iovlen = aiov[i].iov_len; count += iovlen; - if (iovlen < 0 || count < 0) + if (iovlen < 0 || count < 0) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EINVAL)); + } } - if ((fp = getf(fdes)) == NULL) + if ((fp = getf(fdes)) == NULL) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EBADF)); + } if (((fflag = fp->f_flag) & FWRITE) == 0) { error = EBADF; goto out; @@ -917,6 +977,8 @@ out: if (in_crit) nbl_end_crit(vp); releasef(fdes); + if (aiovlen != 0) + kmem_free(aiov, aiovlen); if (error) return (set_errno(error)); return (count); @@ -927,7 +989,8 @@ preadv(int fdes, struct iovec *iovp, int iovcnt, off_t offset, off_t extended_offset) { struct uio auio; - struct iovec aiov[DEF_IOV_MAX]; + struct iovec buf[IOV_MAX_STACK], *aiov = buf; + int aiovlen = 0; file_t *fp; register vnode_t *vp; struct cpu *cp; @@ -936,25 +999,35 @@ preadv(int fdes, struct iovec *iovp, int iovcnt, off_t offset, int error = 0; int i; + /* + * In a 64-bit kernel, this interface supports native 64-bit + * applications as well as 32-bit applications using both standard and + * large-file access. For 32-bit large-file aware applications, the + * offset is passed as two parameters which are joined into the actual + * offset used. The 64-bit libc always passes 0 for the extended_offset. + * Note that off_t is a signed value, but the preadv/pwritev API treats + * the offset as a position in the file for the operation, so passing + * a negative value will likely fail the maximum offset checks below + * because we convert it to an unsigned value which will be larger than + * the maximum valid offset. + */ #if defined(_SYSCALL32_IMPL) || defined(_ILP32) u_offset_t fileoff = ((u_offset_t)extended_offset << 32) | (u_offset_t)offset; #else /* _SYSCALL32_IMPL || _ILP32 */ u_offset_t fileoff = (u_offset_t)(ulong_t)offset; #endif /* _SYSCALL32_IMPR || _ILP32 */ -#ifdef _SYSCALL32_IMPL - const u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 && - extended_offset == 0? - MAXOFF32_T : MAXOFFSET_T; -#else /* _SYSCALL32_IMPL */ - const u_offset_t maxoff = MAXOFF32_T; -#endif /* _SYSCALL32_IMPL */ int in_crit = 0; - if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX) + if (iovcnt <= 0 || iovcnt > IOV_MAX) return (set_errno(EINVAL)); + if (iovcnt > IOV_MAX_STACK) { + aiovlen = iovcnt * sizeof (iovec_t); + aiov = kmem_alloc(aiovlen, KM_SLEEP); + } + #ifdef _SYSCALL32_IMPL /* * 32-bit callers need to have their iovec expanded, @@ -962,61 +1035,104 @@ preadv(int fdes, struct iovec *iovp, int iovcnt, off_t offset, * of data in a single call. */ if (get_udatamodel() == DATAMODEL_ILP32) { - struct iovec32 aiov32[DEF_IOV_MAX]; + struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32; + int aiov32len; ssize32_t count32; - if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32))) + aiov32len = iovcnt * sizeof (iovec32_t); + if (aiovlen != 0) + aiov32 = kmem_alloc(aiov32len, KM_SLEEP); + + if (copyin(iovp, aiov32, aiov32len)) { + if (aiovlen != 0) { + kmem_free(aiov32, aiov32len); + kmem_free(aiov, aiovlen); + } return (set_errno(EFAULT)); + } count32 = 0; for (i = 0; i < iovcnt; i++) { ssize32_t iovlen32 = aiov32[i].iov_len; count32 += iovlen32; - if (iovlen32 < 0 || count32 < 0) + if (iovlen32 < 0 || count32 < 0) { + if (aiovlen != 0) { + kmem_free(aiov32, aiov32len); + kmem_free(aiov, aiovlen); + } return (set_errno(EINVAL)); + } aiov[i].iov_len = iovlen32; aiov[i].iov_base = (caddr_t)(uintptr_t)aiov32[i].iov_base; } + if (aiovlen != 0) + kmem_free(aiov32, aiov32len); } else #endif /* _SYSCALL32_IMPL */ - if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec))) + if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EFAULT)); + } count = 0; for (i = 0; i < iovcnt; i++) { ssize_t iovlen = aiov[i].iov_len; count += iovlen; - if (iovlen < 0 || count < 0) + if (iovlen < 0 || count < 0) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EINVAL)); + } } - if ((bcount = (ssize_t)count) < 0) + if ((bcount = count) < 0) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EINVAL)); - if ((fp = getf(fdes)) == NULL) + } + if ((fp = getf(fdes)) == NULL) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EBADF)); + } if (((fflag = fp->f_flag) & FREAD) == 0) { error = EBADF; goto out; } vp = fp->f_vnode; rwflag = 0; - if (vp->v_type == VREG) { + /* + * Behaviour is same as read(2). Please see comments in read(2). + */ + if (vp->v_type == VREG) { if (bcount == 0) goto out; - /* - * return EINVAL for offsets that cannot be - * represented in an off_t. - */ - if (fileoff > maxoff) { - error = EINVAL; + /* Handle offset past maximum offset allowed for file. */ + if (fileoff >= OFFSET_MAX(fp)) { + struct vattr va; + va.va_mask = AT_SIZE; + + error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL); + if (error == 0) { + if (fileoff >= va.va_size) { + count = 0; + } else { + error = EOVERFLOW; + } + } goto out; } - if (fileoff + bcount > maxoff) - bcount = (ssize_t)((u_offset_t)maxoff - fileoff); + ASSERT(bcount == count); + + /* Note: modified count used in nbl_conflict() call below. */ + if ((fileoff + count) > OFFSET_MAX(fp)) + count = (ssize_t)(OFFSET_MAX(fp) - fileoff); + } else if (vp->v_type == VFIFO) { error = ESPIPE; goto out; @@ -1033,8 +1149,7 @@ preadv(int fdes, struct iovec *iovp, int iovcnt, off_t offset, error = nbl_svmand(vp, fp->f_cred, &svmand); if (error != 0) goto out; - if (nbl_conflict(vp, NBL_WRITE, fileoff, count, svmand, - NULL)) { + if (nbl_conflict(vp, NBL_WRITE, fileoff, count, svmand, NULL)) { error = EACCES; goto out; } @@ -1042,33 +1157,6 @@ preadv(int fdes, struct iovec *iovp, int iovcnt, off_t offset, (void) VOP_RWLOCK(vp, rwflag, NULL); - /* - * Behaviour is same as read(2). Please see comments in - * read(2). - */ - - if ((vp->v_type == VREG) && (fileoff >= OFFSET_MAX(fp))) { - struct vattr va; - va.va_mask = AT_SIZE; - if ((error = - VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL))) { - VOP_RWUNLOCK(vp, rwflag, NULL); - goto out; - } - if (fileoff >= va.va_size) { - VOP_RWUNLOCK(vp, rwflag, NULL); - count = 0; - goto out; - } else { - VOP_RWUNLOCK(vp, rwflag, NULL); - error = EOVERFLOW; - goto out; - } - } - if ((vp->v_type == VREG) && - (fileoff + count > OFFSET_MAX(fp))) { - count = (ssize_t)(OFFSET_MAX(fp) - fileoff); - } auio.uio_loffset = fileoff; auio.uio_iov = aiov; auio.uio_iovcnt = iovcnt; @@ -1099,6 +1187,8 @@ out: if (in_crit) nbl_end_crit(vp); releasef(fdes); + if (aiovlen != 0) + kmem_free(aiov, aiovlen); if (error) return (set_errno(error)); return (count); @@ -1109,7 +1199,8 @@ pwritev(int fdes, struct iovec *iovp, int iovcnt, off_t offset, off_t extended_offset) { struct uio auio; - struct iovec aiov[DEF_IOV_MAX]; + struct iovec buf[IOV_MAX_STACK], *aiov = buf; + int aiovlen = 0; file_t *fp; register vnode_t *vp; struct cpu *cp; @@ -1118,25 +1209,26 @@ pwritev(int fdes, struct iovec *iovp, int iovcnt, off_t offset, int error = 0; int i; + /* + * See the comment in preadv for how the offset is handled. + */ #if defined(_SYSCALL32_IMPL) || defined(_ILP32) u_offset_t fileoff = ((u_offset_t)extended_offset << 32) | (u_offset_t)offset; #else /* _SYSCALL32_IMPL || _ILP32 */ u_offset_t fileoff = (u_offset_t)(ulong_t)offset; #endif /* _SYSCALL32_IMPR || _ILP32 */ -#ifdef _SYSCALL32_IMPL - const u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 && - extended_offset == 0? - MAXOFF32_T : MAXOFFSET_T; -#else /* _SYSCALL32_IMPL */ - const u_offset_t maxoff = MAXOFF32_T; -#endif /* _SYSCALL32_IMPL */ int in_crit = 0; - if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX) + if (iovcnt <= 0 || iovcnt > IOV_MAX) return (set_errno(EINVAL)); + if (iovcnt > IOV_MAX_STACK) { + aiovlen = iovcnt * sizeof (iovec_t); + aiov = kmem_alloc(aiovlen, KM_SLEEP); + } + #ifdef _SYSCALL32_IMPL /* * 32-bit callers need to have their iovec expanded, @@ -1144,58 +1236,92 @@ pwritev(int fdes, struct iovec *iovp, int iovcnt, off_t offset, * of data in a single call. */ if (get_udatamodel() == DATAMODEL_ILP32) { - struct iovec32 aiov32[DEF_IOV_MAX]; + struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32; + int aiov32len; ssize32_t count32; - if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32))) + aiov32len = iovcnt * sizeof (iovec32_t); + if (aiovlen != 0) + aiov32 = kmem_alloc(aiov32len, KM_SLEEP); + + if (copyin(iovp, aiov32, aiov32len)) { + if (aiovlen != 0) { + kmem_free(aiov32, aiov32len); + kmem_free(aiov, aiovlen); + } return (set_errno(EFAULT)); + } count32 = 0; for (i = 0; i < iovcnt; i++) { ssize32_t iovlen32 = aiov32[i].iov_len; count32 += iovlen32; - if (iovlen32 < 0 || count32 < 0) + if (iovlen32 < 0 || count32 < 0) { + if (aiovlen != 0) { + kmem_free(aiov32, aiov32len); + kmem_free(aiov, aiovlen); + } return (set_errno(EINVAL)); + } aiov[i].iov_len = iovlen32; aiov[i].iov_base = (caddr_t)(uintptr_t)aiov32[i].iov_base; } + if (aiovlen != 0) + kmem_free(aiov32, aiov32len); } else #endif /* _SYSCALL32_IMPL */ - if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec))) + if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EFAULT)); + } count = 0; for (i = 0; i < iovcnt; i++) { ssize_t iovlen = aiov[i].iov_len; count += iovlen; - if (iovlen < 0 || count < 0) + if (iovlen < 0 || count < 0) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EINVAL)); + } } - if ((bcount = (ssize_t)count) < 0) + if ((bcount = count) < 0) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EINVAL)); - if ((fp = getf(fdes)) == NULL) + } + if ((fp = getf(fdes)) == NULL) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EBADF)); + } if (((fflag = fp->f_flag) & FWRITE) == 0) { error = EBADF; goto out; } vp = fp->f_vnode; rwflag = 1; - if (vp->v_type == VREG) { + /* + * The kernel's write(2) code checks the rctl & OFFSET_MAX and returns + * EFBIG when fileoff exceeds either limit. We do the same. + */ + if (vp->v_type == VREG) { if (bcount == 0) goto out; /* - * return EINVAL for offsets that cannot be - * represented in an off_t. + * Don't allow pwritev to cause file size to exceed the proper + * offset limit. */ - if (fileoff > maxoff) { - error = EINVAL; + if (fileoff >= OFFSET_MAX(fp)) { + error = EFBIG; goto out; } + /* * Take appropriate action if we are trying * to write above the resource limit. @@ -1218,17 +1344,13 @@ pwritev(int fdes, struct iovec *iovp, int iovcnt, off_t offset, error = EFBIG; goto out; } - /* - * Don't allow pwritev to cause file sizes to exceed - * maxoff. - */ - if (fileoff == maxoff) { - error = EFBIG; - goto out; - } - if (fileoff + bcount > maxoff) - bcount = (ssize_t)((u_offset_t)maxoff - fileoff); + ASSERT(bcount == count); + + /* Note: modified count used in nbl_conflict() call below. */ + if ((fileoff + count) > OFFSET_MAX(fp)) + count = (ssize_t)(OFFSET_MAX(fp) - fileoff); + } else if (vp->v_type == VFIFO) { error = ESPIPE; goto out; @@ -1245,8 +1367,7 @@ pwritev(int fdes, struct iovec *iovp, int iovcnt, off_t offset, error = nbl_svmand(vp, fp->f_cred, &svmand); if (error != 0) goto out; - if (nbl_conflict(vp, NBL_WRITE, fileoff, count, svmand, - NULL)) { + if (nbl_conflict(vp, NBL_WRITE, fileoff, count, svmand, NULL)) { error = EACCES; goto out; } @@ -1254,34 +1375,6 @@ pwritev(int fdes, struct iovec *iovp, int iovcnt, off_t offset, (void) VOP_RWLOCK(vp, rwflag, NULL); - - /* - * Behaviour is same as write(2). Please see comments for - * write(2). - */ - - if (vp->v_type == VREG) { - if (fileoff >= curproc->p_fsz_ctl) { - VOP_RWUNLOCK(vp, rwflag, NULL); - mutex_enter(&curproc->p_lock); - /* see above rctl_action comment */ - (void) rctl_action( - rctlproc_legacy[RLIMIT_FSIZE], - curproc->p_rctls, - curproc, RCA_UNSAFE_SIGINFO); - mutex_exit(&curproc->p_lock); - error = EFBIG; - goto out; - } - if (fileoff >= OFFSET_MAX(fp)) { - VOP_RWUNLOCK(vp, rwflag, NULL); - error = EFBIG; - goto out; - } - if (fileoff + count > OFFSET_MAX(fp)) - count = (ssize_t)(OFFSET_MAX(fp) - fileoff); - } - auio.uio_loffset = fileoff; auio.uio_iov = aiov; auio.uio_iovcnt = iovcnt; @@ -1308,6 +1401,8 @@ out: if (in_crit) nbl_end_crit(vp); releasef(fdes); + if (aiovlen != 0) + kmem_free(aiov, aiovlen); if (error) return (set_errno(error)); return (count); diff --git a/usr/src/uts/common/syscall/sendfile.c b/usr/src/uts/common/syscall/sendfile.c index 822f14416c..f42c907cf3 100644 --- a/usr/src/uts/common/syscall/sendfile.c +++ b/usr/src/uts/common/syscall/sendfile.c @@ -82,7 +82,7 @@ extern sotpi_info_t *sotpi_sototpi(struct sonode *); * 64 bit kernel or 32 bit kernel. For 32 bit apps, we can't transfer * more than 2GB of data. */ -int +static int sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv, int copy_cnt, ssize32_t *count) { @@ -343,7 +343,7 @@ sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv, return (0); } -ssize32_t +static ssize32_t sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt, size32_t *xferred, int fildes) { @@ -390,7 +390,7 @@ sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt, } #endif -int +static int sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count) { @@ -680,7 +680,7 @@ sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, } -int +static int sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, int copy_cnt, ssize_t *count) { @@ -1160,6 +1160,17 @@ sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt, } else { maxblk = (int)vp->v_stream->sd_maxblk; } + + /* + * We need to make sure that the socket that we're sending on + * supports sendfile behavior. sockfs doesn't know that the APIs + * we want to use are coming from sendfile, so we can't rely on + * it to check for us. + */ + if ((so->so_mode & SM_SENDFILESUPP) == 0) { + error = EOPNOTSUPP; + goto err; + } break; case VREG: break; diff --git a/usr/src/uts/common/syscall/stat.c b/usr/src/uts/common/syscall/stat.c index 4085104cc7..93f26121bc 100644 --- a/usr/src/uts/common/syscall/stat.c +++ b/usr/src/uts/common/syscall/stat.c @@ -61,7 +61,7 @@ * to VOP_GETATTR */ -static int +int cstatat_getvp(int fd, char *name, int follow, vnode_t **vp, cred_t **cred) { vnode_t *startvp; diff --git a/usr/src/uts/common/syscall/sysconfig.c b/usr/src/uts/common/syscall/sysconfig.c index 03f2fabe13..92daeed703 100644 --- a/usr/src/uts/common/syscall/sysconfig.c +++ b/usr/src/uts/common/syscall/sysconfig.c @@ -22,6 +22,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -111,6 +112,9 @@ sysconfig(int which) case _CONFIG_NPROC_MAX: return (max_ncpus); + case _CONFIG_NPROC_NCPU: + return (NCPU); /* Private sysconfig for direct NCPU access */ + case _CONFIG_STACK_PROT: return (curproc->p_stkprot & ~PROT_USER); @@ -170,8 +174,8 @@ sysconfig(int which) * even though rcapd can be used on the global zone too. */ if (!INGLOBALZONE(curproc) && - curproc->p_zone->zone_phys_mcap != 0) - return (MIN(btop(curproc->p_zone->zone_phys_mcap), + curproc->p_zone->zone_phys_mem_ctl != UINT64_MAX) + return (MIN(btop(curproc->p_zone->zone_phys_mem_ctl), physinstalled)); return (physinstalled); @@ -179,26 +183,23 @@ sysconfig(int which) case _CONFIG_AVPHYS_PAGES: /* * If the non-global zone has a phys. memory cap, use - * the phys. memory cap - zone's current rss. We always + * the phys. memory cap - zone's rss. We always * report the system-wide value for the global zone, even - * though rcapd can be used on the global zone too. + * though memory capping can be used on the global zone too. + * We use the cached value for the RSS since vm_getusage() + * is so expensive and we don't need this value to be exact. */ if (!INGLOBALZONE(curproc) && - curproc->p_zone->zone_phys_mcap != 0) { + curproc->p_zone->zone_phys_mem_ctl != UINT64_MAX) { pgcnt_t cap, rss, free; - vmusage_t in_use; - size_t cnt = 1; - cap = btop(curproc->p_zone->zone_phys_mcap); + cap = btop(curproc->p_zone->zone_phys_mem_ctl); if (cap > physinstalled) return (freemem); - if (vm_getusage(VMUSAGE_ZONE, 1, &in_use, &cnt, - FKIOCTL) != 0) - in_use.vmu_rss_all = 0; - rss = btop(in_use.vmu_rss_all); + rss = btop(curproc->p_zone->zone_phys_mem); /* - * Because rcapd implements a soft cap, it is possible + * Because this is a soft cap, it is possible * for rss to be temporarily over the cap. */ if (cap > rss) diff --git a/usr/src/uts/common/syscall/uadmin.c b/usr/src/uts/common/syscall/uadmin.c index 2dda4001bf..7aac5b52a7 100644 --- a/usr/src/uts/common/syscall/uadmin.c +++ b/usr/src/uts/common/syscall/uadmin.c @@ -22,7 +22,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2013 Joyent, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. */ #include <sys/param.h> @@ -78,7 +78,7 @@ volatile int fastreboot_dryrun = 0; * system with many zones. */ void -killall(zoneid_t zoneid) +killall(zoneid_t zoneid, boolean_t force) { proc_t *p; @@ -108,7 +108,7 @@ killall(zoneid_t zoneid) p->p_stat != SIDL && p->p_stat != SZOMB) { mutex_enter(&p->p_lock); - if (sigismember(&p->p_sig, SIGKILL)) { + if (!force && sigismember(&p->p_sig, SIGKILL)) { mutex_exit(&p->p_lock); p = p->p_next; } else { @@ -245,12 +245,13 @@ kadmin(int cmd, int fcn, void *mdep, cred_t *credp) */ zone_shutdown_global(); - killall(ALL_ZONES); + killall(ALL_ZONES, B_FALSE); /* * If we are calling kadmin() from a kernel context then we * do not release these resources. */ if (ttoproc(curthread) != &p0) { + mutex_enter(&curproc->p_lock); VN_RELE(PTOU(curproc)->u_cdir); if (PTOU(curproc)->u_rdir) VN_RELE(PTOU(curproc)->u_rdir); @@ -260,6 +261,7 @@ kadmin(int cmd, int fcn, void *mdep, cred_t *credp) PTOU(curproc)->u_cdir = rootdir; PTOU(curproc)->u_rdir = NULL; PTOU(curproc)->u_cwd = NULL; + mutex_exit(&curproc->p_lock); } /* diff --git a/usr/src/uts/common/syscall/umount.c b/usr/src/uts/common/syscall/umount.c index a2deedb163..b25f89b6d5 100644 --- a/usr/src/uts/common/syscall/umount.c +++ b/usr/src/uts/common/syscall/umount.c @@ -22,6 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -125,6 +126,7 @@ umount2(char *pathp, int flag) struct pathname pn; struct vfs *vfsp; int error; + boolean_t altroot; /* * Some flags are disallowed through the system call interface. @@ -154,9 +156,12 @@ umount2(char *pathp, int flag) * isn't in an environment with an alternate root (to the zone's root) * directory, i.e. chroot(2). */ - if (secpolicy_fs_unmount(CRED(), NULL) != 0 || - (PTOU(curproc)->u_rdir != NULL && - PTOU(curproc)->u_rdir != curproc->p_zone->zone_rootvp) || + mutex_enter(&curproc->p_lock); + altroot = (PTOU(curproc)->u_rdir != NULL && + PTOU(curproc)->u_rdir != curproc->p_zone->zone_rootvp); + mutex_exit(&curproc->p_lock); + + if (secpolicy_fs_unmount(CRED(), NULL) != 0 || altroot || (vfsp = vfs_mntpoint2vfsp(pn.pn_path)) == NULL) { vnode_t *fsrootvp; |