author		stevel@tonic-gate <none@none>	2005-06-14 00:00:00 -0700
committer	stevel@tonic-gate <none@none>	2005-06-14 00:00:00 -0700
commit		7c478bd95313f5f23a4c958a745db2134aa03244 (patch)
tree		c871e58545497667cbb4b0a4f2daf204743e1fe7 /usr/src/uts/common/syscall
download	illumos-joyent-7c478bd95313f5f23a4c958a745db2134aa03244.tar.gz
OpenSolaris Launch
Diffstat (limited to 'usr/src/uts/common/syscall')
95 files changed, 31467 insertions, 0 deletions
diff --git a/usr/src/uts/common/syscall/SYSCALL.README b/usr/src/uts/common/syscall/SYSCALL.README
new file mode 100644
index 0000000000..2850b2f947
--- /dev/null
+++ b/usr/src/uts/common/syscall/SYSCALL.README
@@ -0,0 +1,306 @@
+
+CDDL HEADER START
+
+The contents of this file are subject to the terms of the
+Common Development and Distribution License, Version 1.0 only
+(the "License"). You may not use this file except in compliance
+with the License.
+
+You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+or http://www.opensolaris.org/os/licensing.
+See the License for the specific language governing permissions
+and limitations under the License.
+
+When distributing Covered Code, include this CDDL HEADER in each
+file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+If applicable, add the following below this CDDL HEADER, with the
+fields enclosed by brackets "[]" replaced with your own identifying
+information: Portions Copyright [yyyy] [name of copyright owner]
+
+CDDL HEADER END
+
+Copyright 2000 Sun Microsystems, Inc.  All rights reserved.
+Use is subject to license terms.
+
+ident	"%Z%%M%	%I%	%E% SMI"
+
+System Call Files
+------ ---- -----
+
+The universal dumping grounds for system calls in Solaris 2.x,
+common/os/scalls.c and common/fs/vncalls.c, have been sub-divided into
+smaller files.
+
+The old files had become quite large, and contained much completely
+unrelated code. From a software engineering standpoint, it didn't seem
+like a good idea to permit system calls or underlying routines
+to be cognizant of the internal interfaces and underlying routines
+of unrelated system calls.
+
+From a practical standpoint, recompiling all of scalls.c or vncalls.c
+after making only a small change in one system call seemed like
+cruel and unusual punishment. Also, running "bringover" after
+changing scalls.c or vncalls.c in one's own environment had a
+high probability of encountering a conflict.
+
+In an attempt to improve maintainability, we have split these files
+and created new directories to hold the results. One hopes that this
+new organization will prove easier to maintain and change.
+
+The principles listed below guided the split-up. Please try to adhere
+to them if you add new system calls.
+
+
+1) System calls now live in directories called "syscall". Architecture
+   independent system calls live in common/syscall and architecture
+   dependent system calls live in sparc/syscall or i86/syscall.
+
+2) Most system calls have their own separate file. We try to keep
+   these files as small as possible.
+
+3) Unrelated system calls should NEVER be put in the same file. Do
+   not consider any of these files "dumping grounds" for new system
+   call work.
+
+4) Some files DO contain more than one system call. This occurs
+   under the following restricted conditions:
+
+	o System calls that are internally related, either because
+	  they alone call a set of static functions to do the dirty
+	  work, or because they access locally-defined static data.
+	  The system calls in sigqueue.c and lwpsys.c are examples
+	  of the first case; lwp_sobj.c is an example of the second.
+
+	o Fairly trivial pairs of "get-" and "set-" operation system
+	  calls. The file rlimit.c, containing getrlimit() and
+	  setrlimit(), is a case in point.
+
+	o System calls that are basically "variations on a theme,"
+	  such as the different forms of stat in stat.c.
+
+5) If a number of system calls make use of a local function, or,
+   if a function is used more widely than in a few system calls,
+   then perhaps this function needs to be moved to one of the
+   kernel-implementation files in common/os or common/fs. For
+   example, this was done with the functions namesetattr and
+   fdsetattr, which were used by several different system calls.
+   These functions were moved into common/os/fio.c, where they
+   seemed to fit better.
+
+-------------------------------------------------------------------
+System Call Reorganization
+------ ---- --------------
+
+The system calls in common/os/scalls.c and common/fs/vncalls.c
+have been broken up into smaller files. In addition, system
+calls that previously resided in <arch>/os/archdep.c have
+been removed from that file. The table below describes the
+manner in which the files have been split up.
+
+The original syscall files have not been deleted, but have been
+renamed to reflect their diminished contents. The file scalls.c
+has been renamed to ssig.c, and vncalls.c has been renamed to
+poll.c.
+
+
+Syscall Entry Point	Old File	New File
+-------------------	--- ----	--- ----
+gtime			scalls.c	common/syscall/time.c
+stime			scalls.c	common/syscall/time.c
+
+adjtime			scalls.c	common/syscall/adjtime.c
+
+times			scalls.c	common/syscall/times.c
+
+sysconfig		scalls.c	common/syscall/sysconfig.c
+
+setuid			scalls.c	common/syscall/uid.c
+getuid			scalls.c	common/syscall/uid.c
+seteuid			scalls.c	common/syscall/uid.c
+
+setgid			scalls.c	common/syscall/gid.c
+getgid			scalls.c	common/syscall/gid.c
+setegid			scalls.c	common/syscall/gid.c
+
+getpid			scalls.c	common/syscall/getpid.c
+
+setgroups		scalls.c	common/syscall/groups.c
+getgroups		scalls.c	common/syscall/groups.c
+
+setpgrp			scalls.c	common/syscall/pgrpsys.c
+
+pause			scalls.c	common/syscall/pause.c
+
+ssig			scalls.c	common/syscall/ssig.c
+
+sigtimedwait		scalls.c	common/syscall/sigtimedwait.c
+
+sigsuspend		scalls.c	common/syscall/sigsuspend.c
+
+sigaltstack		scalls.c	common/syscall/sigaltstack.c
+
+sigpending		scalls.c	common/syscall/sigpending.c
+
+sigprocmask		scalls.c	common/syscall/sigprocmask.c
+
+sigaction		scalls.c	common/syscall/sigaction.c
+
+kill			scalls.c	common/syscall/sigqueue.c
+sigqueue		scalls.c	common/syscall/sigqueue.c
+
+sigsendsys		scalls.c	common/syscall/sigsendset.c
+
+profil			scalls.c	common/syscall/profil.c
+
+alarm			scalls.c	common/syscall/alarm.c
+
+umask			scalls.c	common/syscall/umask.c
+
+ulimit			scalls.c	common/syscall/rlimit.c
+getrlimit		scalls.c	common/syscall/rlimit.c
+setrlimit		scalls.c	common/syscall/rlimit.c
+
+utssys			scalls.c	common/syscall/utssys.c
+
+uname			scalls.c	common/syscall/uname.c
+
+uadmin			scalls.c	common/syscall/uadmin.c
+
+systeminfo		scalls.c	common/syscall/systeminfo.c
+
+syslwp_create		scalls.c	common/syscall/lwp_create.c
+syslwp_exit		scalls.c	common/syscall/lwp_create.c
+
+syslwp_suspend		scalls.c	common/syscall/lwpsys.c
+syslwp_continue		scalls.c	common/syscall/lwpsys.c
+lwp_kill		scalls.c	common/syscall/lwpsys.c
+lwp_wait		scalls.c	common/syscall/lwpsys.c
+
+yield			scalls.c	common/syscall/yield.c
+
+lwp_self		scalls.c	common/syscall/lwp_self.c
+
+lwp_info		scalls.c	common/syscall/lwp_info.c
+
+lwp_mutex_lock		scalls.c	common/syscall/lwp_sobj.c
+lwp_mutex_unlock	scalls.c	common/syscall/lwp_sobj.c
+lwp_cond_wait		scalls.c	common/syscall/lwp_sobj.c
+lwp_cond_signal		scalls.c	common/syscall/lwp_sobj.c
+lwp_cond_broadcast	scalls.c	common/syscall/lwp_sobj.c
+lwp_sema_p		scalls.c	common/syscall/lwp_sobj.c
+lwp_sema_v		scalls.c	common/syscall/lwp_sobj.c
+
+open			vncalls.c	common/syscall/open.c
+creat			vncalls.c	common/syscall/open.c
+
+close			vncalls.c	common/syscall/close.c
+
+read			vncalls.c	common/syscall/rw.c
+write			vncalls.c	common/syscall/rw.c
+pread			vncalls.c	common/syscall/rw.c
+pwrite			vncalls.c	common/syscall/rw.c
+readv			vncalls.c	common/syscall/rw.c
+writev			vncalls.c	common/syscall/rw.c
+
+chdir			vncalls.c	common/syscall/chdir.c
+fchdir			vncalls.c	common/syscall/chdir.c
+chroot			vncalls.c	common/syscall/chdir.c
+fchroot			vncalls.c	common/syscall/chdir.c
+
+mknod			vncalls.c	common/syscall/mknod.c
+xmknod			vncalls.c	common/syscall/mknod.c
+
+mkdir			vncalls.c	common/syscall/mkdir.c
+
+link			vncalls.c	common/syscall/link.c
+
+rename			vncalls.c	common/syscall/rename.c
+
+symlink			vncalls.c	common/syscall/symlink.c
+
+unlink			vncalls.c	common/syscall/unlink.c
+
+rmdir			vncalls.c	common/syscall/rmdir.c
+
+getdents		vncalls.c	common/syscall/getdents.c
+
+lseek			vncalls.c	common/syscall/lseek.c
+llseek			vncalls.c	common/syscall/lseek.c
+
+access			vncalls.c	common/syscall/access.c
+
+stat			vncalls.c	common/syscall/stat.c
+lstat			vncalls.c	common/syscall/stat.c
+fstat			vncalls.c	common/syscall/stat.c
+xstat			vncalls.c	common/syscall/stat.c
+lxstat			vncalls.c	common/syscall/stat.c
+fxstat			vncalls.c	common/syscall/stat.c
+
+fpathconf		vncalls.c	common/syscall/pathconf.c
+pathconf		vncalls.c	common/syscall/pathconf.c
+
+readlink		vncalls.c	common/syscall/readlink.c
+
+chmod			vncalls.c	common/syscall/chmod.c
+fchmod			vncalls.c	common/syscall/chmod.c
+
+chown			vncalls.c	common/syscall/chown.c
+lchown			vncalls.c	common/syscall/chown.c
+fchown			vncalls.c	common/syscall/chown.c
+
+utime			vncalls.c	common/syscall/utime.c
+utimes			vncalls.c	common/syscall/utime.c
+
+fdsync			vncalls.c	common/syscall/fdsync.c
+
+fcntl			vncalls.c	common/syscall/fcntl.c
+
+dup			vncalls.c	common/syscall/dup.c
+
+ioctl			vncalls.c	common/syscall/ioctl.c
+stty			vncalls.c	common/syscall/ioctl.c
+gtty			vncalls.c	common/syscall/ioctl.c
+
+poll			vncalls.c	common/syscall/poll.c
+
+acl			vncalls.c	common/syscall/acl.c
+facl			vncalls.c	common/syscall/acl.c
+
+mount			vfs.c		common/syscall/mount.c
+
+statfs			vfs.c		common/syscall/statfs.c
+fstatfs			vfs.c		common/syscall/statfs.c
+
+statvfs			vfs.c		common/syscall/statvfs.c
+fstatvfs		vfs.c		common/syscall/statvfs.c
+
+sync			vfs.c		common/syscall/sync.c
+
+sysfs			vfs.c		common/syscall/sysfs.c
+
+umount			vfs.c		common/syscall/umount.c
+
+nice			priocntl.c	common/syscall/nice.c
+
+pipe			os/pipe.c	common/syscall/pipe.c
+
+msgsys			os/msg.c	common/syscall/msg.c
+
+semsys			os/sem.c	common/syscall/sem.c
+
+shmsys			os/shm.c	common/syscall/shm.c
+
+getcontext		sparc/archdep.c	sparc/syscall/getcontext.c
+lwp_getprivate		sparc/archdep.c	sparc/syscall/lwp_private.c
+lwp_setprivate		sparc/archdep.c	sparc/syscall/lwp_private.c
+
+getcontext		i86/archdep.c	i86/syscall/getcontext.c
+lwp_getprivate		i86/archdep.c	i86/syscall/lwp_private.c
+lwp_setprivate		i86/archdep.c	i86/syscall/lwp_private.c
+
+-----------------------------------------------------------------
+
+Most of the system calls in this directory have been converted
+to use C-style argument passing, instead of the old uap-pointer
+method. This usually makes the system calls faster and more
+"natural" in implementation.
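The closing paragraph of the README is worth a concrete illustration. The
sketch below is schematic, not the literal kernel code: umask_args,
old_umask, new_umask, and cur_cmask are hypothetical names, and rval_t is
stubbed out so the fragment stands alone. It contrasts the old uap-pointer
convention, where every argument access indirects through a packed
argument structure, with C-style argument passing, where the handler is an
ordinary C function:

typedef struct { int r_val1; } rval_t;	/* stand-in for the kernel's rval_t */

static int cur_cmask;			/* stand-in for the per-process mask */

/*
 * Old uap-pointer style (schematic): the dispatcher hands the handler
 * a pointer to a packed argument structure, and results go back
 * through a separate rval pointer.
 */
struct umask_args {
	int	mask;
};

static int
old_umask(struct umask_args *uap, rval_t *rvp)
{
	rvp->r_val1 = cur_cmask;	/* old mask is the return value */
	cur_cmask = uap->mask & 0777;
	return (0);
}

/*
 * C-style argument passing (schematic): arguments arrive directly in
 * registers or on the stack, and the result is returned directly, so
 * no uap dereferences are needed.
 */
static int
new_umask(int mask)
{
	int omask = cur_cmask;

	cur_cmask = mask & 0777;
	return (omask);
}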
diff --git a/usr/src/uts/common/syscall/access.c b/usr/src/uts/common/syscall/access.c new file mode 100644 index 0000000000..e13a754cc5 --- /dev/null +++ b/usr/src/uts/common/syscall/access.c @@ -0,0 +1,114 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/isa_defs.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/cred_impl.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/pathname.h> +#include <sys/vnode.h> +#include <sys/uio.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> + +/* + * Determine accessibility of file. + */ + +#define E_OK 010 /* use effective ids */ +#define R_OK 004 +#define W_OK 002 +#define X_OK 001 + +int +access(char *fname, int fmode) +{ + vnode_t *vp; + cred_t *tmpcr; + int error; + int mode; + int eok; + cred_t *cr; + + if (fmode & ~(E_OK|R_OK|W_OK|X_OK)) + return (set_errno(EINVAL)); + + mode = ((fmode & (R_OK|W_OK|X_OK)) << 6); + + cr = CRED(); + + /* OK to use effective uid/gid, i.e., no need to crdup(CRED())? */ + eok = (fmode & E_OK) || + (cr->cr_uid == cr->cr_ruid && cr->cr_gid == cr->cr_rgid); + + if (eok) + tmpcr = cr; + else { + tmpcr = crdup(cr); + tmpcr->cr_uid = cr->cr_ruid; + tmpcr->cr_gid = cr->cr_rgid; + tmpcr->cr_ruid = cr->cr_uid; + tmpcr->cr_rgid = cr->cr_gid; + } + +lookup: + if (error = lookupname(fname, UIO_USERSPACE, FOLLOW, NULLVPP, &vp)) { + if (error == ESTALE) + goto lookup; + if (!eok) + crfree(tmpcr); + return (set_errno(error)); + } + + if (mode) { + error = VOP_ACCESS(vp, mode, 0, tmpcr); + if (error) { + if (error == ESTALE) { + VN_RELE(vp); + goto lookup; + } + (void) set_errno(error); + } + } + + if (!eok) + crfree(tmpcr); + VN_RELE(vp); + return (error); +} diff --git a/usr/src/uts/common/syscall/acctctl.c b/usr/src/uts/common/syscall/acctctl.c new file mode 100644 index 0000000000..8c134b0a62 --- /dev/null +++ b/usr/src/uts/common/syscall/acctctl.c @@ -0,0 +1,620 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/proc.h> +#include <sys/systm.h> +#include <sys/param.h> +#include <sys/kmem.h> +#include <sys/sysmacros.h> +#include <sys/types.h> +#include <sys/cmn_err.h> +#include <sys/user.h> +#include <sys/cred.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/pathname.h> +#include <sys/modctl.h> +#include <sys/acctctl.h> +#include <sys/bitmap.h> +#include <sys/exacct.h> +#include <sys/policy.h> + +/* + * acctctl(2) + * + * acctctl() provides the administrative interface to the extended accounting + * subsystem. The process and task accounting facilities are configurable: + * resources can be individually specified for recording in the appropriate + * accounting file. + * + * The current implementation of acctctl() requires that the process and task + * and flow files be distinct across all zones. + * + * Locking + * Each accounting species has an ac_info_t which contains a mutex, + * used to protect the ac_info_t's contents, and to serialize access to the + * appropriate file. + */ + +static list_t exacct_globals_list; +static kmutex_t exacct_globals_list_lock; + +static int +ac_state_set(ac_info_t *info, void *buf, size_t bufsz) +{ + int state; + + if (buf == NULL || (bufsz != sizeof (int))) + return (EINVAL); + + if (copyin(buf, &state, bufsz) != 0) + return (EFAULT); + + if (state != AC_ON && state != AC_OFF) + return (EINVAL); + + mutex_enter(&info->ac_lock); + info->ac_state = state; + mutex_exit(&info->ac_lock); + return (0); +} + +static int +ac_state_get(ac_info_t *info, void *buf, size_t bufsz) +{ + if (buf == NULL || (bufsz != sizeof (int))) + return (EINVAL); + + mutex_enter(&info->ac_lock); + if (copyout(&info->ac_state, buf, bufsz) != 0) { + mutex_exit(&info->ac_lock); + return (EFAULT); + } + mutex_exit(&info->ac_lock); + return (0); +} + +static boolean_t +ac_file_in_use(vnode_t *vp) +{ + boolean_t in_use = B_FALSE; + struct exacct_globals *acg; + + if (vp == NULL) + return (B_FALSE); + mutex_enter(&exacct_globals_list_lock); + /* + * Start off by grabbing all locks. + */ + for (acg = list_head(&exacct_globals_list); acg != NULL; + acg = list_next(&exacct_globals_list, acg)) { + mutex_enter(&acg->ac_proc.ac_lock); + mutex_enter(&acg->ac_task.ac_lock); + mutex_enter(&acg->ac_flow.ac_lock); + } + + for (acg = list_head(&exacct_globals_list); !in_use && acg != NULL; + acg = list_next(&exacct_globals_list, acg)) { + /* + * We need to verify that we aren't already using this file for + * accounting in any zone. + */ + if (vn_compare(acg->ac_proc.ac_vnode, vp) || + vn_compare(acg->ac_task.ac_vnode, vp) || + vn_compare(acg->ac_flow.ac_vnode, vp)) + in_use = B_TRUE; + } + + /* + * Drop all locks. 
+ */ + for (acg = list_head(&exacct_globals_list); acg != NULL; + acg = list_next(&exacct_globals_list, acg)) { + mutex_exit(&acg->ac_proc.ac_lock); + mutex_exit(&acg->ac_task.ac_lock); + mutex_exit(&acg->ac_flow.ac_lock); + } + mutex_exit(&exacct_globals_list_lock); + return (in_use); +} + +static int +ac_file_set(ac_info_t *info, void *ubuf, size_t bufsz) +{ + int error = 0; + void *kbuf; + void *namebuf; + int namelen; + vnode_t *vp; + void *hdr; + size_t hdrsize; + + if (ubuf == NULL) { + mutex_enter(&info->ac_lock); + + /* + * Closing accounting file + */ + if (info->ac_vnode != NULL) { + error = VOP_CLOSE(info->ac_vnode, FWRITE, 1, 0, CRED()); + if (error) { + mutex_exit(&info->ac_lock); + return (error); + } + VN_RELE(info->ac_vnode); + info->ac_vnode = NULL; + } + if (info->ac_file != NULL) { + kmem_free(info->ac_file, strlen(info->ac_file) + 1); + info->ac_file = NULL; + } + + mutex_exit(&info->ac_lock); + return (error); + } + + if (bufsz < 2 || bufsz > MAXPATHLEN) + return (EINVAL); + + /* + * We have to copy in the whole buffer since we can't tell the length + * of the string in user's address space. + */ + kbuf = kmem_zalloc(bufsz, KM_SLEEP); + if ((error = copyinstr((char *)ubuf, (char *)kbuf, bufsz, NULL)) != 0) { + kmem_free(kbuf, bufsz); + return (error); + } + if (*((char *)kbuf) != '/') { + kmem_free(kbuf, bufsz); + return (EINVAL); + } + + /* + * Now, allocate the space where we are going to save the + * name of the accounting file and kmem_free kbuf. We have to do this + * now because it is not good to sleep in kmem_alloc() while + * holding ac_info's lock. + */ + namelen = strlen(kbuf) + 1; + namebuf = kmem_alloc(namelen, KM_SLEEP); + (void) strcpy(namebuf, kbuf); + kmem_free(kbuf, bufsz); + + /* + * Check if this file already exists. + */ + error = lookupname(namebuf, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); + + /* + * Check if the file is already in use. + */ + if (!error) { + if (ac_file_in_use(vp)) { + /* + * If we're already using it then return EBUSY + */ + kmem_free(namebuf, namelen); + VN_RELE(vp); + return (EBUSY); + } + VN_RELE(vp); + } + + /* + * Now, grab info's ac_lock and try to set up everything. + */ + mutex_enter(&info->ac_lock); + + if ((error = vn_open(namebuf, UIO_SYSSPACE, + FCREAT | FWRITE | FTRUNC, 0600, &vp, CRCREAT, 0)) != 0) { + mutex_exit(&info->ac_lock); + kmem_free(namebuf, namelen); + return (error); + } + + if (vp->v_type != VREG) { + VN_RELE(vp); + mutex_exit(&info->ac_lock); + kmem_free(namebuf, namelen); + return (EACCES); + } + + if (info->ac_vnode != NULL) { + /* + * Switch from an old file to a new file by swapping + * their vnode pointers. + */ + vnode_t *oldvp; + oldvp = info->ac_vnode; + info->ac_vnode = vp; + vp = oldvp; + } else { + /* + * Start writing accounting records to a new file. + */ + info->ac_vnode = vp; + vp = NULL; + } + if (vp) { + /* + * We still need to close the old file. + */ + if ((error = VOP_CLOSE(vp, FWRITE, 1, 0, CRED())) != 0) { + VN_RELE(vp); + mutex_exit(&info->ac_lock); + kmem_free(namebuf, namelen); + return (error); + } + VN_RELE(vp); + if (info->ac_file != NULL) { + kmem_free(info->ac_file, + strlen(info->ac_file) + 1); + info->ac_file = NULL; + } + } + /* + * Finally, point ac_file to the filename string and release the lock. + */ + info->ac_file = namebuf; + mutex_exit(&info->ac_lock); + + /* + * Create and write an exacct header to the file. 
+ */ + hdr = exacct_create_header(&hdrsize); + error = exacct_write_header(info, hdr, hdrsize); + + return (error); +} + +static int +ac_file_get(ac_info_t *info, void *buf, size_t bufsz) +{ + int error = 0; + vnode_t *vnode; + char *file; + + mutex_enter(&info->ac_lock); + file = info->ac_file; + vnode = info->ac_vnode; + + if (file == NULL || vnode == NULL) { + mutex_exit(&info->ac_lock); + return (ENOTACTIVE); + } + + if (strlen(file) >= bufsz) + error = ENOMEM; + else + error = copyoutstr(file, buf, MAXPATHLEN, NULL); + + mutex_exit(&info->ac_lock); + return (error); +} + +static int +ac_res_set(ac_info_t *info, void *buf, size_t bufsz, int maxres) +{ + ac_res_t *res; + ac_res_t *tmp; + ulong_t *maskp; + int id; + uint_t counter = 0; + + /* + * Validate that a non-zero buffer, sized within limits and to an + * integral number of ac_res_t's has been specified. + */ + if (bufsz == 0 || + bufsz > sizeof (ac_res_t) * (AC_MAX_RES + 1) || + (bufsz / sizeof (ac_res_t)) * sizeof (ac_res_t) != bufsz) + return (EINVAL); + + tmp = res = kmem_alloc(bufsz, KM_SLEEP); + if (copyin(buf, res, bufsz) != 0) { + kmem_free(res, bufsz); + return (EFAULT); + } + + maskp = (ulong_t *)&info->ac_mask; + + mutex_enter(&info->ac_lock); + while ((id = tmp->ar_id) != AC_NONE && counter < maxres + 1) { + if (id > maxres || id < 0) { + mutex_exit(&info->ac_lock); + kmem_free(res, bufsz); + return (EINVAL); + } + if (tmp->ar_state == AC_ON) { + BT_SET(maskp, id); + } else if (tmp->ar_state == AC_OFF) { + BT_CLEAR(maskp, id); + } else { + mutex_exit(&info->ac_lock); + kmem_free(res, bufsz); + return (EINVAL); + } + tmp++; + counter++; + } + mutex_exit(&info->ac_lock); + kmem_free(res, bufsz); + return (0); +} + +static int +ac_res_get(ac_info_t *info, void *buf, size_t bufsz, int maxres) +{ + int error = 0; + ac_res_t *res; + ac_res_t *tmp; + size_t ressz = sizeof (ac_res_t) * (maxres + 1); + ulong_t *maskp; + int id; + + if (bufsz < ressz) + return (EINVAL); + tmp = res = kmem_alloc(ressz, KM_SLEEP); + + mutex_enter(&info->ac_lock); + maskp = (ulong_t *)&info->ac_mask; + for (id = 1; id <= maxres; id++) { + tmp->ar_id = id; + tmp->ar_state = BT_TEST(maskp, id); + tmp++; + } + tmp->ar_id = AC_NONE; + tmp->ar_state = AC_OFF; + mutex_exit(&info->ac_lock); + error = copyout(res, buf, ressz); + kmem_free(res, ressz); + return (error); +} + +/* + * acctctl() + * + * Overview + * acctctl() is the entry point for the acctctl(2) system call. + * + * Return values + * On successful completion, return 0; otherwise -1 is returned and errno is + * set appropriately. + * + * Caller's context + * Called from the system call path. + */ +int +acctctl(int cmd, void *buf, size_t bufsz) +{ + int error = 0; + int mode = AC_MODE(cmd); + int option = AC_OPTION(cmd); + int maxres; + ac_info_t *info; + zone_t *zone = curproc->p_zone; + struct exacct_globals *acg; + + acg = zone_getspecific(exacct_zone_key, zone); + /* + * exacct_zone_key and associated per-zone state were initialized when + * the module was loaded. + */ + ASSERT(exacct_zone_key != ZONE_KEY_UNINITIALIZED); + ASSERT(acg != NULL); + + switch (mode) { /* sanity check */ + case AC_TASK: + info = &acg->ac_task; + maxres = AC_TASK_MAX_RES; + break; + case AC_PROC: + info = &acg->ac_proc; + maxres = AC_PROC_MAX_RES; + break; + case AC_FLOW: + /* + * Flow accounting isn't currently configurable in non-global + * zones, but we have this field on a per-zone basis for future + * expansion as well as the ability to return default "unset" + * values for the various AC_*_GET queries. 
AC_*_SET commands + * fail with EPERM for AC_FLOW in non-global zones. + */ + info = &acg->ac_flow; + maxres = AC_FLOW_MAX_RES; + break; + default: + return (set_errno(EINVAL)); + } + + switch (option) { + case AC_STATE_SET: + if ((error = secpolicy_acct(CRED())) != 0) + break; + if (mode == AC_FLOW && getzoneid() != GLOBAL_ZONEID) { + error = EPERM; + break; + } + error = ac_state_set(info, buf, bufsz); + break; + case AC_STATE_GET: + error = ac_state_get(info, buf, bufsz); + break; + case AC_FILE_SET: + if ((error = secpolicy_acct(CRED())) != 0) + break; + if (mode == AC_FLOW && getzoneid() != GLOBAL_ZONEID) { + error = EPERM; + break; + } + error = ac_file_set(info, buf, bufsz); + break; + case AC_FILE_GET: + error = ac_file_get(info, buf, bufsz); + break; + case AC_RES_SET: + if ((error = secpolicy_acct(CRED())) != 0) + break; + if (mode == AC_FLOW && getzoneid() != GLOBAL_ZONEID) { + error = EPERM; + break; + } + error = ac_res_set(info, buf, bufsz, maxres); + break; + case AC_RES_GET: + error = ac_res_get(info, buf, bufsz, maxres); + break; + default: + return (set_errno(EINVAL)); + } + if (error) + return (set_errno(error)); + return (0); +} + +static struct sysent ac_sysent = { + 3, + SE_NOUNLOAD | SE_ARGC | SE_32RVAL1, + acctctl +}; + +static struct modlsys modlsys = { + &mod_syscallops, + "acctctl system call", + &ac_sysent +}; + +#ifdef _SYSCALL32_IMPL +static struct modlsys modlsys32 = { + &mod_syscallops32, + "32-bit acctctl system call", + &ac_sysent +}; +#endif + +static struct modlinkage modlinkage = { + MODREV_1, + &modlsys, +#ifdef _SYSCALL32_IMPL + &modlsys32, +#endif + NULL +}; + +/* ARGSUSED */ +static void * +exacct_zone_init(zoneid_t zoneid) +{ + struct exacct_globals *acg; + + acg = kmem_zalloc(sizeof (*acg), KM_SLEEP); + mutex_enter(&exacct_globals_list_lock); + list_insert_tail(&exacct_globals_list, acg); + mutex_exit(&exacct_globals_list_lock); + return (acg); +} + +static void +exacct_free_info(ac_info_t *info) +{ + mutex_enter(&info->ac_lock); + if (info->ac_vnode) { + (void) VOP_CLOSE(info->ac_vnode, FWRITE, 1, 0, kcred); + VN_RELE(info->ac_vnode); + kmem_free(info->ac_file, strlen(info->ac_file) + 1); + } + info->ac_state = AC_OFF; + info->ac_vnode = NULL; + info->ac_file = NULL; + mutex_exit(&info->ac_lock); +} + +/* ARGSUSED */ +static void +exacct_zone_shutdown(zoneid_t zoneid, void *data) +{ + struct exacct_globals *acg = data; + + /* + * The accounting files need to be closed during shutdown rather than + * destroy, since otherwise the filesystem they reside on may fail to + * unmount, thus causing the entire zone halt/reboot to fail. 
+ */ + exacct_free_info(&acg->ac_proc); + exacct_free_info(&acg->ac_task); + exacct_free_info(&acg->ac_flow); +} + +/* ARGSUSED */ +static void +exacct_zone_fini(zoneid_t zoneid, void *data) +{ + struct exacct_globals *acg = data; + + mutex_enter(&exacct_globals_list_lock); + list_remove(&exacct_globals_list, acg); + mutex_exit(&exacct_globals_list_lock); + + mutex_destroy(&acg->ac_proc.ac_lock); + mutex_destroy(&acg->ac_task.ac_lock); + mutex_destroy(&acg->ac_flow.ac_lock); + kmem_free(acg, sizeof (*acg)); +} + +int +_init() +{ + int error; + + mutex_init(&exacct_globals_list_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&exacct_globals_list, sizeof (struct exacct_globals), + offsetof(struct exacct_globals, ac_link)); + zone_key_create(&exacct_zone_key, exacct_zone_init, + exacct_zone_shutdown, exacct_zone_fini); + + if ((error = mod_install(&modlinkage)) != 0) { + (void) zone_key_delete(exacct_zone_key); + exacct_zone_key = ZONE_KEY_UNINITIALIZED; + mutex_destroy(&exacct_globals_list_lock); + list_destroy(&exacct_globals_list); + } + return (error); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini() +{ + return (EBUSY); +} diff --git a/usr/src/uts/common/syscall/acl.c b/usr/src/uts/common/syscall/acl.c new file mode 100644 index 0000000000..a52184ec2e --- /dev/null +++ b/usr/src/uts/common/syscall/acl.c @@ -0,0 +1,430 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/isa_defs.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/cred.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/fcntl.h> +#include <sys/pathname.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/mode.h> +#include <sys/uio.h> +#include <sys/kmem.h> +#include <sys/filio.h> +#include <sys/acl.h> +#include <sys/cmn_err.h> + +#include <sys/unistd.h> +#include <sys/debug.h> + +static int cacl(int cmd, int nentries, void *aclbufp, + vnode_t *vp, int *rv); + +/* + * Get/Set ACL of a file. 
+ */ +int +acl(const char *fname, int cmd, int nentries, void *aclbufp) +{ + struct vnode *vp; + int error; + int rv = 0; + + /* Sanity check arguments */ + if (fname == NULL) + return (set_errno(EINVAL)); +lookup: + error = lookupname((char *)fname, UIO_USERSPACE, FOLLOW, NULLVPP, &vp); + if (error) { + if (error == ESTALE) + goto lookup; + return (set_errno(error)); + } + + error = cacl(cmd, nentries, aclbufp, vp, &rv); + VN_RELE(vp); + if (error) { + if (error == ESTALE) + goto lookup; + return (set_errno(error)); + } + return (rv); +} + +/* + * Get/Set ACL of a file with facl system call. + */ +int +facl(int fdes, int cmd, int nentries, void *aclbufp) +{ + file_t *fp; + int error; + int rv = 0; + + if ((fp = getf(fdes)) == NULL) + return (set_errno(EBADF)); +#ifdef C2_AUDIT + if (fp->f_flag & FREVOKED) { + releasef(fdes); + return (set_errno(EBADF)); + } +#endif /* C2_AUDIT */ + + error = cacl(cmd, nentries, aclbufp, fp->f_vnode, &rv); + releasef(fdes); + + if (error) + return (set_errno(error)); + return (rv); +} + + +/* + * Common code for acl() and facl(). + */ +static int +cacl(int cmd, int nentries, void *aclbufp, vnode_t *vp, int *rv) +{ + int error; + int aclbsize; /* size of acl list in bytes */ + int dfaclbsize; /* size of default acl list in bytes */ + int numacls; + caddr_t uaddrp; + aclent_t *aclp, *aaclp; + vsecattr_t vsecattr; + + ASSERT(vp); + + bzero(&vsecattr, sizeof (vsecattr_t)); + + switch (cmd) { + + case ACE_GETACLCNT: + case GETACLCNT: + if (cmd == GETACLCNT) + vsecattr.vsa_mask = VSA_ACLCNT | VSA_DFACLCNT; + else + vsecattr.vsa_mask = VSA_ACECNT; + if (error = VOP_GETSECATTR(vp, &vsecattr, 0, CRED())) + return (error); + *rv = vsecattr.vsa_aclcnt + vsecattr.vsa_dfaclcnt; + if (vsecattr.vsa_aclcnt && vsecattr.vsa_aclentp) { + kmem_free(vsecattr.vsa_aclentp, + vsecattr.vsa_aclcnt * sizeof (aclent_t)); + } + if (vsecattr.vsa_dfaclcnt && vsecattr.vsa_dfaclentp) { + kmem_free(vsecattr.vsa_dfaclentp, + vsecattr.vsa_dfaclcnt * sizeof (aclent_t)); + } + break; + case GETACL: + /* + * Minimum ACL size is three entries so might as well + * bail out here. + */ + if (nentries < 3) + return (EINVAL); + /* + * NULL output buffer is also a pretty easy bail out. 
+ */ + if (aclbufp == NULL) + return (EFAULT); + vsecattr.vsa_mask = VSA_ACL | VSA_ACLCNT | VSA_DFACL | + VSA_DFACLCNT; + if (error = VOP_GETSECATTR(vp, &vsecattr, 0, CRED())) + return (error); + /* Check user's buffer is big enough */ + numacls = vsecattr.vsa_aclcnt + vsecattr.vsa_dfaclcnt; + aclbsize = vsecattr.vsa_aclcnt * sizeof (aclent_t); + dfaclbsize = vsecattr.vsa_dfaclcnt * sizeof (aclent_t); + if (numacls > nentries) { + error = ENOSPC; + goto errout; + } + /* Sort the acl & default acl lists */ + if (vsecattr.vsa_aclcnt > 1) + ksort((caddr_t)vsecattr.vsa_aclentp, + vsecattr.vsa_aclcnt, sizeof (aclent_t), cmp2acls); + if (vsecattr.vsa_dfaclcnt > 1) + ksort((caddr_t)vsecattr.vsa_dfaclentp, + vsecattr.vsa_dfaclcnt, sizeof (aclent_t), cmp2acls); + /* Copy out acl's */ + uaddrp = (caddr_t)aclbufp; + if (aclbsize > 0) { /* bug #1262490 */ + if (copyout(vsecattr.vsa_aclentp, uaddrp, aclbsize)) { + error = EFAULT; + goto errout; + } + } + /* Copy out default acl's */ + if (dfaclbsize > 0) { + uaddrp += aclbsize; + if (copyout(vsecattr.vsa_dfaclentp, + uaddrp, dfaclbsize)) { + error = EFAULT; + goto errout; + } + } + *rv = numacls; + if (vsecattr.vsa_aclcnt) { + kmem_free(vsecattr.vsa_aclentp, + vsecattr.vsa_aclcnt * sizeof (aclent_t)); + } + if (vsecattr.vsa_dfaclcnt) { + kmem_free(vsecattr.vsa_dfaclentp, + vsecattr.vsa_dfaclcnt * sizeof (aclent_t)); + } + break; + + case ACE_GETACL: + if (nentries < 3) + return (EINVAL); + + if (aclbufp == NULL) + return (EFAULT); + + vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT; + if (error = VOP_GETSECATTR(vp, &vsecattr, 0, CRED())) + return (error); + + aclbsize = vsecattr.vsa_aclcnt * sizeof (ace_t); + if (vsecattr.vsa_aclcnt > nentries) { + error = ENOSPC; + goto errout; + } + + if (aclbsize > 0) { + if ((error = copyout(vsecattr.vsa_aclentp, + aclbufp, aclbsize)) != 0) { + goto errout; + } + } + + *rv = vsecattr.vsa_aclcnt; + if (vsecattr.vsa_aclcnt) { + kmem_free(vsecattr.vsa_aclentp, + vsecattr.vsa_aclcnt * sizeof (ace_t)); + } + break; + + case SETACL: + /* + * Minimum ACL size is three entries so might as well + * bail out here. Also limit request size to prevent user + * from allocating too much kernel memory. Maximum size + * is MAX_ACL_ENTRIES for the ACL part and MAX_ACL_ENTRIES + * for the default ACL part. (bug 4058667) + */ + if (nentries < 3 || nentries > (MAX_ACL_ENTRIES * 2)) + return (EINVAL); + /* + * NULL output buffer is also an easy bail out. 
+ */ + if (aclbufp == NULL) + return (EFAULT); + vsecattr.vsa_mask = VSA_ACL; + aclbsize = nentries * sizeof (aclent_t); + vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP); + aaclp = vsecattr.vsa_aclentp; + vsecattr.vsa_aclcnt = nentries; + uaddrp = (caddr_t)aclbufp; + if (copyin(uaddrp, vsecattr.vsa_aclentp, aclbsize)) { + kmem_free(aaclp, aclbsize); + return (EFAULT); + } + /* Sort the acl list */ + ksort((caddr_t)vsecattr.vsa_aclentp, + vsecattr.vsa_aclcnt, sizeof (aclent_t), cmp2acls); + + /* Break into acl and default acl lists */ + for (numacls = 0, aclp = vsecattr.vsa_aclentp; + numacls < vsecattr.vsa_aclcnt; + aclp++, numacls++) { + if (aclp->a_type & ACL_DEFAULT) + break; + } + + /* Find where defaults start (if any) */ + if (numacls < vsecattr.vsa_aclcnt) { + vsecattr.vsa_mask |= VSA_DFACL; + vsecattr.vsa_dfaclcnt = nentries - numacls; + vsecattr.vsa_dfaclentp = aclp; + vsecattr.vsa_aclcnt = numacls; + } + /* Adjust if they're all defaults */ + if (vsecattr.vsa_aclcnt == 0) { + vsecattr.vsa_mask &= ~VSA_ACL; + vsecattr.vsa_aclentp = NULL; + } + /* Only directories can have defaults */ + if (vsecattr.vsa_dfaclcnt && vp->v_type != VDIR) { + kmem_free(aaclp, aclbsize); + return (ENOTDIR); + } + (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); + if (error = VOP_SETSECATTR(vp, &vsecattr, 0, CRED())) { + kmem_free(aaclp, aclbsize); + VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); + return (error); + } + + /* + * Should return 0 upon success according to the man page + * and SVR4 semantics. (Bug #1214399: SETACL returns wrong rc) + */ + *rv = 0; + kmem_free(aaclp, aclbsize); + VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); + break; + + case ACE_SETACL: + if (nentries < 3 || nentries > (MAX_ACL_ENTRIES * 2)) + return (EINVAL); + + if (aclbufp == NULL) + return (EFAULT); + + vsecattr.vsa_mask = VSA_ACE; + aclbsize = nentries * sizeof (ace_t); + vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP); + aaclp = vsecattr.vsa_aclentp; + vsecattr.vsa_aclcnt = nentries; + uaddrp = (caddr_t)aclbufp; + if (copyin(uaddrp, vsecattr.vsa_aclentp, aclbsize)) { + kmem_free(aaclp, aclbsize); + return (EFAULT); + } + (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); + if (error = VOP_SETSECATTR(vp, &vsecattr, 0, CRED())) { + kmem_free(aaclp, aclbsize); + VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); + return (error); + } + *rv = 0; + kmem_free(aaclp, aclbsize); + VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); + break; + + default: + return (EINVAL); + } + + return (0); + +errout: + if (aclbsize && vsecattr.vsa_aclentp) + kmem_free(vsecattr.vsa_aclentp, aclbsize); + if (dfaclbsize && vsecattr.vsa_dfaclentp) + kmem_free(vsecattr.vsa_dfaclentp, dfaclbsize); + return (error); +} + + +/* + * Generic shellsort, from K&R (1st ed, p 58.), somewhat modified. 
+ * v = Ptr to array/vector of objs + * n = # objs in the array + * s = size of each obj (must be multiples of a word size) + * f = ptr to function to compare two objs + * returns (-1 = less than, 0 = equal, 1 = greater than + */ +void +ksort(caddr_t v, int n, int s, int (*f)()) +{ + int g, i, j, ii; + unsigned int *p1, *p2; + unsigned int tmp; + + /* No work to do */ + if (v == NULL || n <= 1) + return; + + /* Sanity check on arguments */ + ASSERT(((uintptr_t)v & 0x3) == 0 && (s & 0x3) == 0); + ASSERT(s > 0); + for (g = n / 2; g > 0; g /= 2) { + for (i = g; i < n; i++) { + for (j = i - g; j >= 0 && + (*f)(v + j * s, v + (j + g) * s) == 1; + j -= g) { + p1 = (unsigned *)(v + j * s); + p2 = (unsigned *)(v + (j + g) * s); + for (ii = 0; ii < s / 4; ii++) { + tmp = *p1; + *p1++ = *p2; + *p2++ = tmp; + } + } + } + } +} + +/* + * Compare two acls, all fields. Returns: + * -1 (less than) + * 0 (equal) + * +1 (greater than) + */ +int +cmp2acls(void *a, void *b) +{ + aclent_t *x = (aclent_t *)a; + aclent_t *y = (aclent_t *)b; + + /* Compare types */ + if (x->a_type < y->a_type) + return (-1); + if (x->a_type > y->a_type) + return (1); + /* Equal types; compare id's */ + if (x->a_id < y->a_id) + return (-1); + if (x->a_id > y->a_id) + return (1); + /* Equal ids; compare perms */ + if (x->a_perm < y->a_perm) + return (-1); + if (x->a_perm > y->a_perm) + return (1); + /* Totally equal */ + return (0); +} diff --git a/usr/src/uts/common/syscall/adjtime.c b/usr/src/uts/common/syscall/adjtime.c new file mode 100644 index 0000000000..dc2dde5306 --- /dev/null +++ b/usr/src/uts/common/syscall/adjtime.c @@ -0,0 +1,108 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +/* + * Copyright 1999,2001-2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" /* from SVr4.0 1.78 */ + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/tuneable.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/time.h> +#include <sys/debug.h> +#include <sys/model.h> +#include <sys/policy.h> + +int +adjtime(struct timeval *delta, struct timeval *olddelta) +{ + struct timeval atv, oatv; + int64_t ndelta; + int64_t old_delta; + int s; + model_t datamodel = get_udatamodel(); + + if (secpolicy_settime(CRED()) != 0) + return (set_errno(EPERM)); + + if (datamodel == DATAMODEL_NATIVE) { + if (copyin(delta, &atv, sizeof (atv))) + return (set_errno(EFAULT)); + } else { + struct timeval32 atv32; + + if (copyin(delta, &atv32, sizeof (atv32))) + return (set_errno(EFAULT)); + TIMEVAL32_TO_TIMEVAL(&atv, &atv32); + } + + if (atv.tv_usec <= -MICROSEC || atv.tv_usec >= MICROSEC) + return (set_errno(EINVAL)); + + /* + * The SVID specifies that if delta is 0, then there is + * no effect upon time correction, just return olddelta. + */ + ndelta = (int64_t)atv.tv_sec * NANOSEC + atv.tv_usec * 1000; + mutex_enter(&tod_lock); + s = hr_clock_lock(); + old_delta = timedelta; + if (ndelta) + timedelta = ndelta; + /* + * Always set tod_needsync on all adjtime() calls, since it implies + * someone is watching over us and keeping the local clock in sync. + */ + tod_needsync = 1; + hr_clock_unlock(s); + mutex_exit(&tod_lock); + + if (olddelta) { + oatv.tv_sec = old_delta / NANOSEC; + oatv.tv_usec = (old_delta % NANOSEC) / 1000; + if (datamodel == DATAMODEL_NATIVE) { + if (copyout(&oatv, olddelta, sizeof (oatv))) + return (set_errno(EFAULT)); + } else { + struct timeval32 oatv32; + + if (TIMEVAL_OVERFLOW(&oatv)) + return (set_errno(EOVERFLOW)); + + TIMEVAL_TO_TIMEVAL32(&oatv32, &oatv); + + if (copyout(&oatv32, olddelta, sizeof (oatv32))) + return (set_errno(EFAULT)); + } + } + return (0); +} diff --git a/usr/src/uts/common/syscall/alarm.c b/usr/src/uts/common/syscall/alarm.c new file mode 100644 index 0000000000..15027cdd82 --- /dev/null +++ b/usr/src/uts/common/syscall/alarm.c @@ -0,0 +1,87 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +/* + * Copyright (c) 1999-2001 by Sun Microsystems, Inc. + * All rights reserved. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/signal.h> +#include <sys/proc.h> +#include <sys/time.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> + +static void +sigalarm2proc(void *arg) +{ + proc_t *p = arg; + + mutex_enter(&p->p_lock); + p->p_alarmid = 0; + sigtoproc(p, NULL, SIGALRM); + mutex_exit(&p->p_lock); +} + +int +alarm(int deltat) +{ + proc_t *p = ttoproc(curthread); + clock_t del = 0; + clock_t ret; + timeout_id_t tmp_id; + + /* + * We must single-thread this code relative to other + * lwps in the same process also performing an alarm(). + * The mutex dance in the while loop is necessary because + * we cannot call untimeout() while holding a lock that + * is grabbed by the timeout function, sigalarm2proc(). + * We can, however, hold p->p_lock across realtime_timeout(). + */ + mutex_enter(&p->p_lock); + while ((tmp_id = p->p_alarmid) != 0) { + p->p_alarmid = 0; + mutex_exit(&p->p_lock); + del = untimeout(tmp_id); + mutex_enter(&p->p_lock); + } + + if (del < 0) + ret = 0; + else + ret = (del + hz - 1) / hz; /* convert to seconds */ + if (deltat) + p->p_alarmid = realtime_timeout(sigalarm2proc, p, deltat * hz); + mutex_exit(&p->p_lock); + return (ret); +} diff --git a/usr/src/uts/common/syscall/auditsys.c b/usr/src/uts/common/syscall/auditsys.c new file mode 100644 index 0000000000..2beaf4fc7e --- /dev/null +++ b/usr/src/uts/common/syscall/auditsys.c @@ -0,0 +1,69 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 1994,2002-2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/policy.h> + +#include <c2/audit.h> + +/*ARGSUSED1*/ +int +auditsys(struct auditcalls *uap, rval_t *rvp) +{ + int err; + + /* + * this ugly hack is because auditsys returns 0 for + * all cases except audit_active == 0 and + * uap->code == BSM_AUDITCTRL || BSM_AUDITON || default) + */ + + switch (uap->code) { + case BSM_GETAUID: + case BSM_SETAUID: + case BSM_GETAUDIT: + case BSM_SETAUDIT: + case BSM_AUDIT: + case BSM_AUDITSVC: + return (0); + case BSM_AUDITCTL: + case BSM_AUDITON: + if ((int)uap->a1 == A_GETCOND) + err = secpolicy_audit_getattr(CRED()); + else + /* FALLTHROUGH */ + default: + /* Return a different error when not privileged */ + err = secpolicy_audit_config(CRED()); + if (err == 0) + return (EINVAL); + else + return (err); + } +} diff --git a/usr/src/uts/common/syscall/chdir.c b/usr/src/uts/common/syscall/chdir.c new file mode 100644 index 0000000000..a8b28f9589 --- /dev/null +++ b/usr/src/uts/common/syscall/chdir.c @@ -0,0 +1,247 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/isa_defs.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/cred.h> +#include <sys/user.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/fcntl.h> +#include <sys/pathname.h> +#include <sys/var.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/mode.h> +#include <sys/proc.h> +#include <sys/uio.h> +#include <sys/ioreq.h> +#include <sys/poll.h> +#include <sys/kmem.h> +#include <sys/filio.h> +#include <sys/cmn_err.h> +#include <sys/policy.h> +#include <sys/zone.h> + +#include <sys/debug.h> +#include <c2/audit.h> + +/* + * Change current working directory ("."). + */ +static int chdirec(vnode_t *, int ischroot, int do_traverse); + +int +chdir(char *fname) +{ + vnode_t *vp; + int error; + +lookup: + if (error = lookupname(fname, UIO_USERSPACE, FOLLOW, NULLVPP, &vp)) { + if (error == ESTALE) + goto lookup; + return (set_errno(error)); + } + + error = chdirec(vp, 0, 1); + if (error) { + if (error == ESTALE) + goto lookup; + return (set_errno(error)); + } + return (0); +} + +/* + * File-descriptor based version of 'chdir'. 
+ */ +int +fchdir(int fd) +{ + vnode_t *vp; + file_t *fp; + int error; + + if ((fp = getf(fd)) == NULL) + return (set_errno(EBADF)); + vp = fp->f_vnode; + VN_HOLD(vp); + releasef(fd); + error = chdirec(vp, 0, 0); + if (error) + return (set_errno(error)); + return (0); +} + +/* + * Change notion of root ("/") directory. + */ +int +chroot(char *fname) +{ + vnode_t *vp; + int error; + +lookup: + if (error = lookupname(fname, UIO_USERSPACE, FOLLOW, NULLVPP, &vp)) { + if (error == ESTALE) + goto lookup; + return (set_errno(error)); + } + + error = chdirec(vp, 1, 1); + if (error) { + if (error == ESTALE) + goto lookup; + return (set_errno(error)); + } + return (0); +} + +/* + * ++++++++++++++++++++++++ + * ++ SunOS4.1 Buyback ++ + * ++++++++++++++++++++++++ + * Change root directory with a user given fd + */ +int +fchroot(int fd) +{ + vnode_t *vp; + file_t *fp; + int error; + + if ((fp = getf(fd)) == NULL) + return (set_errno(EBADF)); + vp = fp->f_vnode; + VN_HOLD(vp); + releasef(fd); + error = chdirec(vp, 1, 0); + if (error) + return (set_errno(error)); + return (0); +} + +static int +chdirec(vnode_t *vp, int ischroot, int do_traverse) +{ + int error; + vnode_t *oldvp; + proc_t *pp = curproc; + vnode_t **vpp; + refstr_t *cwd; + int newcwd = 1; + + if (vp->v_type != VDIR) { + error = ENOTDIR; + goto bad; + } + if (error = VOP_ACCESS(vp, VEXEC, 0, CRED())) + goto bad; + + /* + * The VOP_ACCESS() may have covered 'vp' with a new filesystem, + * if 'vp' is an autoFS vnode. Traverse the mountpoint so + * that we don't end up with a covered current directory. + */ + if (vn_mountedvfs(vp) != NULL && do_traverse) { + if (error = traverse(&vp)) + goto bad; + } + + /* + * Special chroot semantics: chroot is allowed if privileged + * or if the target is really a loopback mount of the root (or + * root of the zone) as determined by comparing dev and inode + * numbers + */ + if (ischroot) { + struct vattr tattr; + struct vattr rattr; + vnode_t *zonevp = curproc->p_zone->zone_rootvp; + + tattr.va_mask = AT_FSID|AT_NODEID; + if (error = VOP_GETATTR(vp, &tattr, 0, CRED())) + goto bad; + + rattr.va_mask = AT_FSID|AT_NODEID; + if (error = VOP_GETATTR(zonevp, &rattr, 0, CRED())) + goto bad; + + if ((tattr.va_fsid != rattr.va_fsid || + tattr.va_nodeid != rattr.va_nodeid) && + (error = secpolicy_chroot(CRED())) != 0) + goto bad; + + vpp = &PTOU(pp)->u_rdir; + } else { + vpp = &PTOU(pp)->u_cdir; + } + +#ifdef C2_AUDIT + if (audit_active) /* update abs cwd/root path see c2audit.c */ + audit_chdirec(vp, vpp); +#endif + + mutex_enter(&pp->p_lock); + /* + * This bit of logic prevents us from overwriting u_cwd if we are + * changing to the same directory. We set the cwd to NULL so that we + * don't try to do the lookup on the next call to getcwd(). + */ + if (!ischroot && *vpp != NULL && vp != NULL && VN_CMP(*vpp, vp)) + newcwd = 0; + + oldvp = *vpp; + *vpp = vp; + if ((cwd = PTOU(pp)->u_cwd) != NULL && newcwd) + PTOU(pp)->u_cwd = NULL; + mutex_exit(&pp->p_lock); + + if (cwd && newcwd) + refstr_rele(cwd); + if (oldvp) + VN_RELE(oldvp); + return (0); + +bad: + VN_RELE(vp); + return (error); +} diff --git a/usr/src/uts/common/syscall/chmod.c b/usr/src/uts/common/syscall/chmod.c new file mode 100644 index 0000000000..8fb42e0843 --- /dev/null +++ b/usr/src/uts/common/syscall/chmod.c @@ -0,0 +1,81 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). 
You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 1989 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/isa_defs.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/dirent.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/fcntl.h> +#include <sys/pathname.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/mode.h> +#include <sys/uio.h> +#include <sys/filio.h> +#include <sys/debug.h> + +extern int namesetattr(char *, enum symfollow, vattr_t *, int); +extern int fdsetattr(int, vattr_t *); + +/* + * Change mode of file given path name. + */ +int +chmod(char *fname, int fmode) +{ + struct vattr vattr; + + vattr.va_mode = fmode & MODEMASK; + vattr.va_mask = AT_MODE; + return (namesetattr(fname, FOLLOW, &vattr, 0)); +} + +/* + * Change mode of file given file descriptor. + */ +int +fchmod(int fd, int fmode) +{ + struct vattr vattr; + + vattr.va_mode = fmode & MODEMASK; + vattr.va_mask = AT_MODE; + return (fdsetattr(fd, &vattr)); +} diff --git a/usr/src/uts/common/syscall/chown.c b/usr/src/uts/common/syscall/chown.c new file mode 100644 index 0000000000..7dc7fc663e --- /dev/null +++ b/usr/src/uts/common/syscall/chown.c @@ -0,0 +1,181 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/isa_defs.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/fcntl.h> +#include <sys/pathname.h> +#include <sys/var.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/mode.h> +#include <sys/proc.h> +#include <sys/uio.h> +#include <sys/filio.h> +#include <sys/fcntl.h> +#include <sys/debug.h> +#include <c2/audit.h> + +/* + * nmflag has the following values + * + * 1 - Always do lookup. i.e. chown, lchown. + * 2 - Name is optional i.e. fchownat + * 0 - Don't lookup name, vp is in file_p. i.e. fchown + * + */ +int +cfchownat(int fd, char *name, int nmflag, uid_t uid, gid_t gid, int flags) +{ + vnode_t *startvp, *vp; + file_t *filefp; + struct vattr vattr; + int error = 0; + char startchar; + + if (uid < -1 || uid > MAXUID || gid < -1 || gid > MAXUID) + return (set_errno(EINVAL)); + vattr.va_uid = uid; + vattr.va_gid = gid; + vattr.va_mask = 0; + if (vattr.va_uid != -1) + vattr.va_mask |= AT_UID; + if (vattr.va_gid != -1) + vattr.va_mask |= AT_GID; + + + if (fd == AT_FDCWD && name == NULL) + return (set_errno(EFAULT)); + + if (nmflag == 1 || (nmflag == 2 && name != NULL)) { + if (copyin(name, &startchar, sizeof (char))) + return (set_errno(EFAULT)); + } else + startchar = '\0'; + + + if (fd == AT_FDCWD) + startvp = NULL; + else { + /* + * only get fd if not doing absolute lookup + */ + if (startchar != '/' || nmflag == 0) { + if ((filefp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } + startvp = filefp->f_vnode; + VN_HOLD(startvp); + releasef(fd); + } else { + startvp = NULL; + } + } + +#if C2_AUDIT + if ((nmflag == 2) && audit_active) + audit_setfsat_path(1); +#endif /* C2_AUDIT */ + + /* + * Do lookups for chown, lchown and fchownat when name not NULL + */ + if ((nmflag == 2 && name != NULL) || nmflag == 1) { + if (error = lookupnameat(name, UIO_USERSPACE, + (flags == AT_SYMLINK_NOFOLLOW) ? + NO_FOLLOW : FOLLOW, + NULLVPP, &vp, startvp)) { + if (startvp != NULL) + VN_RELE(startvp); + return (set_errno(error)); + } + } else { + vp = startvp; + ASSERT(vp); + VN_HOLD(vp); + } + + if (vn_is_readonly(vp)) { + error = EROFS; + } else { + error = VOP_SETATTR(vp, &vattr, 0, CRED(), NULL); + } + + if (startvp != NULL) + VN_RELE(startvp); + if (vp != NULL) + VN_RELE(vp); + + if (error != 0) + return (set_errno(error)); + else + return (error); +} +/* + * Change ownership of file given file name. + */ +int +chown(char *fname, uid_t uid, gid_t gid) +{ + return (cfchownat(AT_FDCWD, fname, 1, uid, gid, 0)); +} + +int +lchown(char *fname, uid_t uid, gid_t gid) +{ + return (cfchownat(AT_FDCWD, fname, 1, uid, gid, AT_SYMLINK_NOFOLLOW)); +} + +/* + * Change ownership of file given file descriptor. + */ +int +fchown(int fd, uid_t uid, uid_t gid) +{ + return (cfchownat(fd, NULL, 0, uid, gid, 0)); +} + +int +fchownat(int fd, char *name, uid_t uid, gid_t gid, int flags) +{ + return (cfchownat(fd, name, 2, uid, gid, flags)); + +} diff --git a/usr/src/uts/common/syscall/cladm.c b/usr/src/uts/common/syscall/cladm.c new file mode 100644 index 0000000000..e2e034d93d --- /dev/null +++ b/usr/src/uts/common/syscall/cladm.c @@ -0,0 +1,100 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. 
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 1998 by Sun Microsystems, Inc.
+ * All rights reserved.
+ */
+
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/cladm.h>
+
+/*
+ * cladm(2) cluster administration system call.
+ */
+int
+cladm(int fac, int cmd, void *arg)
+{
+	int error = 0;
+	int copyout_bootflags;
+
+	switch (fac) {
+	case CL_INITIALIZE:
+		if (cmd != CL_GET_BOOTFLAG) {
+			error = EINVAL;
+			break;
+		}
+
+		/*
+		 * The CLUSTER_INSTALLING and CLUSTER_DCS_ENABLED bootflags are
+		 * internal flags. We do not want to expose these to the user
+		 * level.
+		 */
+		copyout_bootflags = (cluster_bootflags &
+		    ~(CLUSTER_INSTALLING | CLUSTER_DCS_ENABLED));
+		if (copyout(&copyout_bootflags, arg, sizeof (int))) {
+			error = EFAULT;
+		}
+		break;
+
+	case CL_CONFIG:
+		/*
+		 * We handle CL_NODEID here so that the node number
+		 * can be returned if the system is configured as part
+		 * of a cluster but not booted as part of the cluster.
+		 */
+		if (cmd == CL_NODEID) {
+			nodeid_t nid;
+
+			/* return error if not configured as a cluster */
+			if (!(cluster_bootflags & CLUSTER_CONFIGURED)) {
+				error = ENOSYS;
+				break;
+			}
+
+			nid = clconf_get_nodeid();
+			error = copyout(&nid, arg, sizeof (nid));
+			break;
+		}
+		/* FALLTHROUGH */
+
+	default:
+		if ((cluster_bootflags & (CLUSTER_CONFIGURED|CLUSTER_BOOTED)) !=
+		    (CLUSTER_CONFIGURED|CLUSTER_BOOTED)) {
+			error = EINVAL;
+			break;
+		}
+		error = cladmin(fac, cmd, arg);
+		/*
+		 * error will be -1 if the cladm module cannot be loaded;
+		 * otherwise, it is the errno value returned
+		 * (see {i86,sparc}/ml/modstubs.s).
+		 */
+		if (error < 0)
+			error = ENOSYS;
+		break;
+	}
+
+	return (error ? set_errno(error) : 0);
+}
diff --git a/usr/src/uts/common/syscall/close.c b/usr/src/uts/common/syscall/close.c
new file mode 100644
index 0000000000..dd79ccb10e
--- /dev/null
+++ b/usr/src/uts/common/syscall/close.c
@@ -0,0 +1,58 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 1998 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/isa_defs.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/debug.h> + +/* + * Close a file. + */ + +int +close(int fdes) +{ + int error; + + if ((error = closeandsetf(fdes, NULL)) != 0) + return (set_errno(error)); + return (0); +} diff --git a/usr/src/uts/common/syscall/corectl.c b/usr/src/uts/common/syscall/corectl.c new file mode 100644 index 0000000000..9e67ae545a --- /dev/null +++ b/usr/src/uts/common/syscall/corectl.c @@ -0,0 +1,558 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/proc.h> +#include <sys/systm.h> +#include <sys/param.h> +#include <sys/atomic.h> +#include <sys/kmem.h> +#include <sys/sysmacros.h> +#include <sys/procset.h> +#include <sys/corectl.h> +#include <sys/zone.h> +#include <sys/cmn_err.h> +#include <sys/policy.h> + +/* + * Core File Settings + * ------------------ + * + * A process's core file path and content live in separate reference-counted + * structures. The corectl_content_t structure is fairly straightforward -- + * the only subtlety is that we only really _need_ the mutex on architectures + * on which 64-bit memory operations are not atomic. The corectl_path_t + * structure is slightly trickier in that it contains a refstr_t rather than + * just a char * string. This is to allow consumers of the data in that + * structure (the core dumping sub-system for example) to safely use the + * string without holding any locks on it in light of updates. + * + * At system boot, init_core() sets init(1M)'s core file path and content to + * the same value as the fields core_default_path and core_default_content + * respectively (for the global zone). All subsequent children of init(1M) + * reference those same settings. During boot coreadm(1M) is invoked with + * the -u option to update the system settings from /etc/coreadm.conf. This + * has the effect of also changing the values in core_default_path and + * core_default_content which updates the core file settings for all + * processes in the zone. Each zone has different default settings; when + * processes enter a non-global zone, their core file path and content are + * set to the zone's default path and content. 
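+ *
+ * As an illustrative sketch of the reference-counting convention described
+ * above (an example added for clarity, not code from this file): a consumer
+ * such as the core dumping code takes a hold on the current path value and
+ * may then use the string without further locking:
+ *
+ *	rp = corectl_path_value(p->p_corefile);	(takes a hold on rp)
+ *	... refstr_value(rp) is stable here ...
+ *	refstr_rele(rp);			(drops the hold)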
+ * + * Processes that have their core file settings explicitly overridden using + * coreadm(1M) no longer reference core_default_path or core_default_content + * so subsequent changes to the default will not affect them. + */ + +zone_key_t core_zone_key; + +static int set_proc_info(pid_t pid, const char *path, core_content_t content); + +static corectl_content_t * +corectl_content_alloc(core_content_t cc) +{ + corectl_content_t *ccp; + + ccp = kmem_zalloc(sizeof (corectl_content_t), KM_SLEEP); + ccp->ccc_content = cc; + ccp->ccc_refcnt = 1; + + return (ccp); +} + +core_content_t +corectl_content_value(corectl_content_t *ccp) +{ + core_content_t content; + + mutex_enter(&ccp->ccc_mtx); + content = ccp->ccc_content; + mutex_exit(&ccp->ccc_mtx); + + return (content); +} + +static void +corectl_content_set(corectl_content_t *ccp, core_content_t content) +{ + mutex_enter(&ccp->ccc_mtx); + ccp->ccc_content = content; + mutex_exit(&ccp->ccc_mtx); +} + +void +corectl_content_hold(corectl_content_t *ccp) +{ + atomic_add_32(&ccp->ccc_refcnt, 1); +} + +void +corectl_content_rele(corectl_content_t *ccp) +{ + if (atomic_add_32_nv(&ccp->ccc_refcnt, -1) == 0) + kmem_free(ccp, sizeof (corectl_content_t)); +} + + +static corectl_path_t * +corectl_path_alloc(const char *path) +{ + corectl_path_t *ccp; + + ccp = kmem_zalloc(sizeof (corectl_path_t), KM_SLEEP); + ccp->ccp_path = refstr_alloc(path); + ccp->ccp_refcnt = 1; + + return (ccp); +} + +refstr_t * +corectl_path_value(corectl_path_t *ccp) +{ + refstr_t *path; + + mutex_enter(&ccp->ccp_mtx); + refstr_hold(path = ccp->ccp_path); + mutex_exit(&ccp->ccp_mtx); + + return (path); +} + +static void +corectl_path_set(corectl_path_t *ccp, const char *path) +{ + refstr_t *npath = refstr_alloc(path); + + mutex_enter(&ccp->ccp_mtx); + refstr_rele(ccp->ccp_path); + ccp->ccp_path = npath; + mutex_exit(&ccp->ccp_mtx); +} + +void +corectl_path_hold(corectl_path_t *ccp) +{ + atomic_add_32(&ccp->ccp_refcnt, 1); +} + +void +corectl_path_rele(corectl_path_t *ccp) +{ + if (atomic_add_32_nv(&ccp->ccp_refcnt, -1) == 0) { + refstr_rele(ccp->ccp_path); + kmem_free(ccp, sizeof (corectl_path_t)); + } +} + +/* + * Constructor routine to be called when a zone is created. + */ +/*ARGSUSED*/ +static void * +core_init_zone(zoneid_t zoneid) +{ + struct core_globals *cg; + + cg = kmem_alloc(sizeof (*cg), KM_SLEEP); + mutex_init(&cg->core_lock, NULL, MUTEX_DEFAULT, NULL); + cg->core_file = NULL; + cg->core_options = CC_PROCESS_PATH; + cg->core_content = CC_CONTENT_DEFAULT; + cg->core_rlimit = RLIM64_INFINITY; + cg->core_default_path = corectl_path_alloc("core"); + cg->core_default_content = corectl_content_alloc(CC_CONTENT_DEFAULT); + + return (cg); +} + +/* + * Destructor routine to be called when a zone is destroyed. + */ +/*ARGSUSED*/ +static void +core_free_zone(zoneid_t zoneid, void *arg) +{ + struct core_globals *cg = arg; + + if (cg == NULL) + return; + if (cg->core_file != NULL) + refstr_rele(cg->core_file); + corectl_path_rele(cg->core_default_path); + corectl_content_rele(cg->core_default_content); + kmem_free(cg, sizeof (*cg)); +} + +/* + * Called once, from icode(), to set init's core file path and content. + */ +void +init_core(void) +{ + struct core_globals *cg; + + zone_key_create(&core_zone_key, core_init_zone, NULL, core_free_zone); + + /* + * zone_key_create will have called core_init_zone for the + * global zone, which sets up the default path and content + * variables. 
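+	 * (Illustrative restatement: each zone created afterwards gets its
+	 * own core_globals from core_init_zone(), and core_free_zone()
+	 * tears that state down again when the zone is destroyed.)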
+ */ + cg = zone_getspecific(core_zone_key, global_zone); + ASSERT(cg != NULL); + + corectl_path_hold(cg->core_default_path); + corectl_content_hold(cg->core_default_content); + + curproc->p_corefile = cg->core_default_path; + curproc->p_content = cg->core_default_content; +} + +int +corectl(int subcode, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3) +{ + int error = 0; + proc_t *p; + refstr_t *rp; + size_t size; + char *path; + core_content_t content = CC_CONTENT_INVALID; + struct core_globals *cg; + zone_t *zone = curproc->p_zone; + + cg = zone_getspecific(core_zone_key, zone); + ASSERT(cg != NULL); + + switch (subcode) { + case CC_SET_OPTIONS: + if ((error = secpolicy_coreadm(CRED())) == 0) { + if (arg1 & ~CC_OPTIONS) + error = EINVAL; + else + cg->core_options = (uint32_t)arg1; + } + break; + + case CC_GET_OPTIONS: + return (cg->core_options); + + case CC_GET_GLOBAL_PATH: + case CC_GET_DEFAULT_PATH: + case CC_GET_PROCESS_PATH: + if (subcode == CC_GET_GLOBAL_PATH) { + mutex_enter(&cg->core_lock); + if ((rp = cg->core_file) != NULL) + refstr_hold(rp); + mutex_exit(&cg->core_lock); + } else if (subcode == CC_GET_DEFAULT_PATH) { + rp = corectl_path_value(cg->core_default_path); + } else { + rp = NULL; + mutex_enter(&pidlock); + if ((p = prfind((pid_t)arg3)) == NULL || + p->p_stat == SIDL) { + mutex_exit(&pidlock); + error = ESRCH; + } else { + mutex_enter(&p->p_lock); + mutex_exit(&pidlock); + mutex_enter(&p->p_crlock); + if (!hasprocperm(p->p_cred, CRED())) + error = EPERM; + else if (p->p_corefile != NULL) + rp = corectl_path_value(p->p_corefile); + mutex_exit(&p->p_crlock); + mutex_exit(&p->p_lock); + } + } + if (rp == NULL) { + if (error == 0 && suword8((void *)arg1, 0)) + error = EFAULT; + } else { + error = copyoutstr(refstr_value(rp), (char *)arg1, + (size_t)arg2, NULL); + refstr_rele(rp); + } + break; + + case CC_SET_GLOBAL_PATH: + case CC_SET_DEFAULT_PATH: + if ((error = secpolicy_coreadm(CRED())) != 0) + break; + + /* FALLTHROUGH */ + case CC_SET_PROCESS_PATH: + if ((size = MIN((size_t)arg2, MAXPATHLEN)) == 0) { + error = EINVAL; + break; + } + path = kmem_alloc(size, KM_SLEEP); + error = copyinstr((char *)arg1, path, size, NULL); + if (error == 0) { + if (subcode == CC_SET_PROCESS_PATH) { + error = set_proc_info((pid_t)arg3, path, 0); + } else if (subcode == CC_SET_DEFAULT_PATH) { + corectl_path_set(cg->core_default_path, path); + } else if (*path != '\0' && *path != '/') { + error = EINVAL; + } else { + refstr_t *nrp = refstr_alloc(path); + + mutex_enter(&cg->core_lock); + rp = cg->core_file; + if (*path == '\0') + cg->core_file = NULL; + else + refstr_hold(cg->core_file = nrp); + mutex_exit(&cg->core_lock); + + if (rp != NULL) + refstr_rele(rp); + + refstr_rele(nrp); + } + } + kmem_free(path, size); + break; + + case CC_SET_GLOBAL_CONTENT: + case CC_SET_DEFAULT_CONTENT: + if ((error = secpolicy_coreadm(CRED())) != 0) + break; + + /* FALLTHROUGH */ + case CC_SET_PROCESS_CONTENT: + error = copyin((void *)arg1, &content, sizeof (content)); + if (error != 0) + break; + + /* + * If any unknown bits are set, don't let this charade + * continue. 
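+		 * For instance (illustrative): a mask such as
+		 * (CC_CONTENT_STACK | CC_CONTENT_HEAP) passes this check,
+		 * while any bit outside CC_CONTENT_ALL fails the call with
+		 * EINVAL.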
+ */ + if (content & ~CC_CONTENT_ALL) { + error = EINVAL; + break; + } + + if (subcode == CC_SET_PROCESS_CONTENT) { + error = set_proc_info((pid_t)arg2, NULL, content); + } else if (subcode == CC_SET_DEFAULT_CONTENT) { + corectl_content_set(cg->core_default_content, content); + } else { + mutex_enter(&cg->core_lock); + cg->core_content = content; + mutex_exit(&cg->core_lock); + } + + break; + + case CC_GET_GLOBAL_CONTENT: + content = cg->core_content; + error = copyout(&content, (void *)arg1, sizeof (content)); + break; + + case CC_GET_DEFAULT_CONTENT: + content = corectl_content_value(cg->core_default_content); + error = copyout(&content, (void *)arg1, sizeof (content)); + break; + + case CC_GET_PROCESS_CONTENT: + mutex_enter(&pidlock); + if ((p = prfind((pid_t)arg2)) == NULL || p->p_stat == SIDL) { + mutex_exit(&pidlock); + error = ESRCH; + break; + } + + mutex_enter(&p->p_lock); + mutex_exit(&pidlock); + mutex_enter(&p->p_crlock); + if (!hasprocperm(p->p_cred, CRED())) + error = EPERM; + else if (p->p_content == NULL) + content = CC_CONTENT_NONE; + else + content = corectl_content_value(p->p_content); + mutex_exit(&p->p_crlock); + mutex_exit(&p->p_lock); + + if (error == 0) + error = copyout(&content, (void *)arg1, + sizeof (content)); + break; + + default: + error = EINVAL; + break; + } + + if (error) + return (set_errno(error)); + return (0); +} + +typedef struct { + int cc_count; + corectl_path_t *cc_path; + corectl_content_t *cc_content; +} counter_t; + +static int +set_one_proc_info(proc_t *p, counter_t *counterp) +{ + corectl_path_t *corefile; + corectl_content_t *content; + + mutex_enter(&p->p_crlock); + + if (!(p->p_flag & SSYS) && hasprocperm(p->p_cred, CRED())) { + mutex_exit(&p->p_crlock); + counterp->cc_count++; + if (counterp->cc_path != NULL) { + corectl_path_hold(counterp->cc_path); + mutex_enter(&p->p_lock); + corefile = p->p_corefile; + p->p_corefile = counterp->cc_path; + mutex_exit(&p->p_lock); + if (corefile != NULL) + corectl_path_rele(corefile); + } else { + corectl_content_hold(counterp->cc_content); + mutex_enter(&p->p_lock); + content = p->p_content; + p->p_content = counterp->cc_content; + mutex_exit(&p->p_lock); + if (content != NULL) + corectl_content_rele(content); + } + } else { + mutex_exit(&p->p_crlock); + } + + return (0); +} + +static int +set_proc_info(pid_t pid, const char *path, core_content_t content) +{ + proc_t *p; + counter_t counter; + int error = 0; + + counter.cc_count = 0; + /* + * Only one of the core file path or content can be set at a time. 
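+	 * Callers therefore pass either a path, e.g. (illustrative)
+	 * set_proc_info(pid, "core", 0), or a content mask, e.g.
+	 * set_proc_info(pid, NULL, CC_CONTENT_DEFAULT), never both.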
+ */ + if (path != NULL) { + counter.cc_path = corectl_path_alloc(path); + counter.cc_content = NULL; + } else { + counter.cc_path = NULL; + counter.cc_content = corectl_content_alloc(content); + } + + if (pid == -1) { + procset_t set; + + setprocset(&set, POP_AND, P_ALL, P_MYID, P_ALL, P_MYID); + error = dotoprocs(&set, set_one_proc_info, (char *)&counter); + if (error == 0 && counter.cc_count == 0) + error = EPERM; + } else if (pid > 0) { + mutex_enter(&pidlock); + if ((p = prfind(pid)) == NULL || p->p_stat == SIDL) { + error = ESRCH; + } else { + (void) set_one_proc_info(p, &counter); + if (counter.cc_count == 0) + error = EPERM; + } + mutex_exit(&pidlock); + } else { + int nfound = 0; + pid_t pgid; + + if (pid == 0) + pgid = curproc->p_pgrp; + else + pgid = -pid; + + mutex_enter(&pidlock); + for (p = pgfind(pgid); p != NULL; p = p->p_pglink) { + if (p->p_stat != SIDL) { + nfound++; + (void) set_one_proc_info(p, &counter); + } + } + mutex_exit(&pidlock); + if (nfound == 0) + error = ESRCH; + else if (counter.cc_count == 0) + error = EPERM; + } + + if (path != NULL) + corectl_path_rele(counter.cc_path); + else + corectl_content_rele(counter.cc_content); + + if (error) + return (set_errno(error)); + return (0); +} + +/* + * Give current process the default core settings for its current zone; + * used for processes entering a zone via zone_enter. + */ +void +set_core_defaults(void) +{ + proc_t *p = curproc; + struct core_globals *cg; + corectl_path_t *oldpath, *newpath; + corectl_content_t *oldcontent, *newcontent; + + cg = zone_getspecific(core_zone_key, p->p_zone); + + /* make local copies of default values to protect against change */ + newpath = cg->core_default_path; + newcontent = cg->core_default_content; + + corectl_path_hold(newpath); + corectl_content_hold(newcontent); + mutex_enter(&p->p_lock); + oldpath = p->p_corefile; + p->p_corefile = newpath; + oldcontent = p->p_content; + p->p_content = newcontent; + mutex_exit(&p->p_lock); + if (oldpath != NULL) + corectl_path_rele(oldpath); + if (oldcontent != NULL) + corectl_content_rele(oldcontent); +} diff --git a/usr/src/uts/common/syscall/exacctsys.c b/usr/src/uts/common/syscall/exacctsys.c new file mode 100644 index 0000000000..af54737c57 --- /dev/null +++ b/usr/src/uts/common/syscall/exacctsys.c @@ -0,0 +1,406 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/acctctl.h>
+#include <sys/cmn_err.h>
+#include <sys/cred.h>
+#include <sys/errno.h>
+#include <sys/exacct.h>
+#include <sys/modctl.h>
+#include <sys/procset.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/task.h>
+#include <sys/types.h>
+#include <sys/user.h>
+#include <sys/policy.h>
+
+/*
+ * getacct(2), putacct(2), and wracct(2) system calls
+ *
+ * The extended accounting subsystem provides three root-privileged system
+ * calls for interacting with the actual resource data associated with each
+ * task or process. getacct() copies a packed exacct record reflecting the
+ * resource usage out to the buffer provided by the user. wracct() writes a
+ * record to the appropriate extended accounting file. putacct() takes the
+ * buffer provided by the user, and appends a "tag" record associated with the
+ * specified task or process that encapsulates the user data. All three of
+ * these functions exit early if extended accounting is not active for the
+ * requested entity type.
+ *
+ * Locking
+ * Under the terminology introduced in os/task.c, all three of these system
+ * calls are task observers when executing on an existing task.
+ */
+
+/*
+ * getacct_callback() is used to copyout the buffer with accounting records
+ * from the kernel back to the user. It also sets actual to the size of the
+ * kernel buffer--the required minimum size for a successful outbound copy.
+ */
+/* ARGSUSED */
+static int
+getacct_callback(ac_info_t *unused, void *ubuf, size_t usize, void *kbuf,
+    size_t ksize, size_t *actual)
+{
+	size_t size = MIN(usize, ksize);
+
+	if (ubuf != NULL && copyout(kbuf, ubuf, size) != 0)
+		return (EFAULT);
+	*actual = ksize;
+	return (0);
+}
+
+static int
+getacct_task(ac_info_t *ac_task, taskid_t tkid, void *buf, size_t bufsize,
+    size_t *sizep)
+{
+	task_t *tk;
+	int error;
+
+	mutex_enter(&ac_task->ac_lock);
+	if (ac_task->ac_state == AC_OFF) {
+		mutex_exit(&ac_task->ac_lock);
+		return (ENOTACTIVE);
+	}
+	mutex_exit(&ac_task->ac_lock);
+
+	if ((tk = task_hold_by_id(tkid)) == NULL)
+		return (ESRCH);
+	error = exacct_assemble_task_usage(ac_task, tk,
+	    getacct_callback, buf, bufsize, sizep, EW_PARTIAL);
+	task_rele(tk);
+
+	return (error);
+}
+
+static int
+getacct_proc(ac_info_t *ac_proc, pid_t pid, void *buf, size_t bufsize,
+    size_t *sizep)
+{
+	proc_t *p;
+	proc_usage_t *pu;
+	ulong_t mask[AC_MASK_SZ];
+	ulong_t *ac_mask = &mask[0];
+	int error;
+
+	mutex_enter(&ac_proc->ac_lock);
+	if (ac_proc->ac_state == AC_OFF) {
+		mutex_exit(&ac_proc->ac_lock);
+		return (ENOTACTIVE);
+	}
+	bt_copy(&ac_proc->ac_mask[0], ac_mask, AC_MASK_SZ);
+	mutex_exit(&ac_proc->ac_lock);
+
+	pu = kmem_zalloc(sizeof (proc_usage_t), KM_SLEEP);
+	pu->pu_command = kmem_zalloc(MAXCOMLEN + 1, KM_SLEEP);
+
+	mutex_enter(&pidlock);
+	if ((p = prfind(pid)) == NULL) {
+		mutex_exit(&pidlock);
+		kmem_free(pu->pu_command, MAXCOMLEN + 1);
+		kmem_free(pu, sizeof (proc_usage_t));
+		return (ESRCH);
+	}
+	mutex_enter(&p->p_lock);
+	mutex_exit(&pidlock);
+
+	exacct_calculate_proc_usage(p, pu, ac_mask, EW_PARTIAL, 0);
+	mutex_exit(&p->p_lock);
+
+	error = exacct_assemble_proc_usage(ac_proc, pu,
+	    getacct_callback, buf, bufsize, sizep, EW_PARTIAL);
+
+	kmem_free(pu->pu_command, MAXCOMLEN + 1);
+	kmem_free(pu, sizeof (proc_usage_t));
+
+	return (error);
+}
+
+static ssize_t
+getacct(idtype_t idtype, id_t id, void *buf, size_t bufsize)
+{
+	size_t size = 0;
+	int error;
+	struct exacct_globals *acg;
+
+	if (bufsize > EXACCT_MAX_BUFSIZE)
bufsize = EXACCT_MAX_BUFSIZE; + + acg = zone_getspecific(exacct_zone_key, curproc->p_zone); + switch (idtype) { + case P_PID: + error = getacct_proc(&acg->ac_proc, id, buf, bufsize, &size); + break; + case P_TASKID: + error = getacct_task(&acg->ac_task, id, buf, bufsize, &size); + break; + default: + error = EINVAL; + break; + } + return (error == 0 ? (ssize_t)size : set_errno(error)); +} + +static int +putacct(idtype_t idtype, id_t id, void *buf, size_t bufsize, int flags) +{ + int error; + taskid_t tkid; + proc_t *p; + task_t *tk; + void *kbuf; + struct exacct_globals *acg; + + if (bufsize == 0 || bufsize > EXACCT_MAX_BUFSIZE) + return (set_errno(EINVAL)); + + kbuf = kmem_alloc(bufsize, KM_SLEEP); + if (copyin(buf, kbuf, bufsize) != 0) { + error = EFAULT; + goto out; + } + + acg = zone_getspecific(exacct_zone_key, curproc->p_zone); + switch (idtype) { + case P_PID: + mutex_enter(&pidlock); + if ((p = prfind(id)) == NULL) { + mutex_exit(&pidlock); + error = ESRCH; + } else { + zone_t *zone = p->p_zone; + + tkid = p->p_task->tk_tkid; + zone_hold(zone); + mutex_exit(&pidlock); + + error = exacct_tag_proc(&acg->ac_proc, id, tkid, kbuf, + bufsize, flags, zone->zone_nodename); + zone_rele(zone); + } + break; + case P_TASKID: + if ((tk = task_hold_by_id(id)) != NULL) { + error = exacct_tag_task(&acg->ac_task, tk, kbuf, + bufsize, flags); + task_rele(tk); + } else { + error = ESRCH; + } + break; + default: + error = EINVAL; + break; + } +out: + kmem_free(kbuf, bufsize); + return (error == 0 ? error : set_errno(error)); +} + +static int +wracct_task(ac_info_t *ac_task, taskid_t tkid, int flag, size_t *sizep) +{ + task_t *tk; + int error; + + mutex_enter(&ac_task->ac_lock); + if (ac_task->ac_state == AC_OFF || ac_task->ac_vnode == NULL) { + mutex_exit(&ac_task->ac_lock); + return (ENOTACTIVE); + } + mutex_exit(&ac_task->ac_lock); + + if ((tk = task_hold_by_id(tkid)) == NULL) + return (ESRCH); + error = exacct_assemble_task_usage(ac_task, tk, exacct_commit_callback, + NULL, 0, sizep, flag); + task_rele(tk); + + return (error); +} + +static int +wracct_proc(ac_info_t *ac_proc, pid_t pid, int flag, size_t *sizep) +{ + proc_t *p; + proc_usage_t *pu; + ulong_t mask[AC_MASK_SZ]; + ulong_t *ac_mask = &mask[0]; + int error; + + mutex_enter(&ac_proc->ac_lock); + if (ac_proc->ac_state == AC_OFF || ac_proc->ac_vnode == NULL) { + mutex_exit(&ac_proc->ac_lock); + return (ENOTACTIVE); + } + bt_copy(&ac_proc->ac_mask[0], ac_mask, AC_MASK_SZ); + mutex_exit(&ac_proc->ac_lock); + + pu = kmem_zalloc(sizeof (proc_usage_t), KM_SLEEP); + pu->pu_command = kmem_zalloc(MAXCOMLEN + 1, KM_SLEEP); + + mutex_enter(&pidlock); + if ((p = prfind(pid)) == NULL) { + mutex_exit(&pidlock); + kmem_free(pu->pu_command, MAXCOMLEN + 1); + kmem_free(pu, sizeof (proc_usage_t)); + return (ESRCH); + } + mutex_enter(&p->p_lock); + mutex_exit(&pidlock); + exacct_calculate_proc_usage(p, pu, ac_mask, flag, 0); + mutex_exit(&p->p_lock); + + error = exacct_assemble_proc_usage(ac_proc, pu, + exacct_commit_callback, NULL, 0, sizep, flag); + + kmem_free(pu->pu_command, MAXCOMLEN + 1); + kmem_free(pu, sizeof (proc_usage_t)); + + return (error); +} + +static int +wracct(idtype_t idtype, id_t id, int flags) +{ + int error; + size_t size = 0; + struct exacct_globals *acg; + + /* + * Validate flags. 
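+	 * Only EW_PARTIAL and EW_INTERVAL are accepted. As an illustrative
+	 * example, wracct(P_TASKID, id, EW_INTERVAL) requests an interval
+	 * record, while the same flag with P_PID is rejected further below
+	 * with ENOTSUP.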
+ */ + switch (flags) { + case EW_PARTIAL: + case EW_INTERVAL: + break; + default: + return (set_errno(EINVAL)); + } + + acg = zone_getspecific(exacct_zone_key, curproc->p_zone); + switch (idtype) { + case P_PID: + if (flags == EW_INTERVAL) + return (set_errno(ENOTSUP)); + error = wracct_proc(&acg->ac_proc, id, flags, &size); + break; + case P_TASKID: + error = wracct_task(&acg->ac_task, id, flags, &size); + break; + default: + error = EINVAL; + break; + } + + return (error == 0 ? error : set_errno(error)); +} + +static long +exacct(int code, idtype_t idtype, id_t id, void *buf, size_t bufsize, + int flags) +{ + if (secpolicy_acct(CRED()) != 0) + return (set_errno(EPERM)); + + if (exacct_zone_key == ZONE_KEY_UNINITIALIZED) + return (set_errno(ENOTACTIVE)); + + switch (code) { + case 0: + return (getacct(idtype, id, buf, bufsize)); + case 1: + return (putacct(idtype, id, buf, bufsize, flags)); + case 2: + return (wracct(idtype, id, flags)); + default: + return (set_errno(EINVAL)); + } +} + +#if defined(_LP64) +#define SE_LRVAL SE_64RVAL +#else +#define SE_LRVAL SE_32RVAL1 +#endif + +static struct sysent exacctsys_sysent = { + 6, + SE_NOUNLOAD | SE_ARGC | SE_LRVAL, + (int (*)())exacct +}; + +static struct modlsys modlsys = { + &mod_syscallops, + "extended accounting facility", + &exacctsys_sysent +}; + +#ifdef _SYSCALL32_IMPL + +static struct sysent exacctsys_sysent32 = { + 6, + SE_NOUNLOAD | SE_ARGC | SE_32RVAL1, + (int (*)())exacct +}; + +static struct modlsys modlsys32 = { + &mod_syscallops32, + "32-bit extended accounting facility", + &exacctsys_sysent32 +}; + +#endif + +static struct modlinkage modlinkage = { + MODREV_1, + &modlsys, +#ifdef _SYSCALL32_IMPL + &modlsys32, +#endif + NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_fini(void) +{ + return (mod_remove(&modlinkage)); +} + +int +_info(struct modinfo *mip) +{ + return (mod_info(&modlinkage, mip)); +} diff --git a/usr/src/uts/common/syscall/fcntl.c b/usr/src/uts/common/syscall/fcntl.c new file mode 100644 index 0000000000..39e0f7f6bd --- /dev/null +++ b/usr/src/uts/common/syscall/fcntl.c @@ -0,0 +1,802 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* ONC_PLUS EXTRACT START */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" +/* ONC_PLUS EXTRACT END */ + +#include <sys/param.h> +#include <sys/isa_defs.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/fcntl.h> +/* ONC_PLUS EXTRACT START */ +#include <sys/flock.h> +/* ONC_PLUS EXTRACT END */ +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/mode.h> +#include <sys/proc.h> +#include <sys/filio.h> +#include <sys/share.h> +#include <sys/debug.h> +#include <sys/rctl.h> +#include <sys/nbmlock.h> + +/* ONC_PLUS EXTRACT START */ +static int flock_check(vnode_t *, flock64_t *, offset_t, offset_t); +static int flock_get_start(vnode_t *, flock64_t *, offset_t, u_offset_t *); +static void fd_too_big(proc_t *); + +/* + * File control. + */ +int +fcntl(int fdes, int cmd, intptr_t arg) +{ + int iarg; + int error = 0; + int retval; + proc_t *p; + file_t *fp; + vnode_t *vp; + u_offset_t offset; + u_offset_t start; + struct vattr vattr; + int in_crit; + int flag; + struct flock sbf; + struct flock64 bf; + struct o_flock obf; + struct flock64_32 bf64_32; + struct fshare fsh; + struct shrlock shr; + struct shr_locowner shr_own; + offset_t maxoffset; + model_t datamodel; + +#if defined(_ILP32) && !defined(lint) && defined(_SYSCALL32) + ASSERT(sizeof (struct flock) == sizeof (struct flock32)); + ASSERT(sizeof (struct flock64) == sizeof (struct flock64_32)); +#endif +#if defined(_LP64) && !defined(lint) && defined(_SYSCALL32) + ASSERT(sizeof (struct flock) == sizeof (struct flock64_64)); + ASSERT(sizeof (struct flock64) == sizeof (struct flock64_64)); +#endif + + /* + * First, for speed, deal with the subset of cases + * that do not require getf() / releasef(). + */ + switch (cmd) { + case F_GETFD: + if ((error = f_getfd_error(fdes, &flag)) == 0) + retval = flag; + goto out; + + case F_SETFD: + error = f_setfd_error(fdes, (int)arg); + retval = 0; + goto out; + + case F_GETFL: + if ((error = f_getfl(fdes, &flag)) == 0) + retval = (flag & (FMASK | FASYNC)) + FOPEN; + goto out; + + case F_GETXFL: + if ((error = f_getfl(fdes, &flag)) == 0) + retval = flag + FOPEN; + goto out; + } + + /* + * Second, for speed, deal with the subset of cases that + * require getf() / releasef() but do not require copyin. + */ + if ((fp = getf(fdes)) == NULL) { + error = EBADF; + goto out; + } + iarg = (int)arg; + + switch (cmd) { +/* ONC_PLUS EXTRACT END */ + + case F_DUPFD: + p = curproc; + if ((uint_t)iarg >= p->p_fno_ctl) { + if (iarg >= 0) + fd_too_big(p); + error = EINVAL; + } else if ((retval = ufalloc_file(iarg, fp)) == -1) { + error = EMFILE; + } else { + mutex_enter(&fp->f_tlock); + fp->f_count++; + mutex_exit(&fp->f_tlock); + } + goto done; + + case F_DUP2FD: + p = curproc; + if (fdes == iarg) { + retval = iarg; + } else if ((uint_t)iarg >= p->p_fno_ctl) { + if (iarg >= 0) + fd_too_big(p); + error = EBADF; + } else { + /* + * We can't hold our getf(fdes) across the call to + * closeandsetf() because it creates a window for + * deadlock: if one thread is doing dup2(a, b) while + * another is doing dup2(b, a), each one will block + * waiting for the other to call releasef(). The + * solution is to increment the file reference count + * (which we have to do anyway), then releasef(fdes), + * then closeandsetf(). Incrementing f_count ensures + * that fp won't disappear after we call releasef(). 
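+			 *
+			 * A hypothetical interleaving of the deadlock being
+			 * avoided (illustrative, threads T1 and T2 on fds
+			 * a and b):
+			 *
+			 *	T1: getf(a)		T2: getf(b)
+			 *	T1: closeandsetf(b)	T2: closeandsetf(a)
+			 *
+			 * at which point each thread would wait forever for
+			 * the other's releasef().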
+ */ + mutex_enter(&fp->f_tlock); + fp->f_count++; + mutex_exit(&fp->f_tlock); + releasef(fdes); + (void) closeandsetf(iarg, fp); + retval = iarg; + goto out; + } + goto done; + + case F_SETFL: + vp = fp->f_vnode; + flag = fp->f_flag; + if ((iarg & (FNONBLOCK|FNDELAY)) == (FNONBLOCK|FNDELAY)) + iarg &= ~FNDELAY; + if ((error = VOP_SETFL(vp, flag, iarg, fp->f_cred)) == 0) { + iarg &= FMASK; + mutex_enter(&fp->f_tlock); + fp->f_flag &= ~FMASK | (FREAD|FWRITE); + fp->f_flag |= (iarg - FOPEN) & ~(FREAD|FWRITE); + mutex_exit(&fp->f_tlock); + } + retval = 0; + goto done; + } + + /* + * Finally, deal with the expensive cases. + */ + retval = 0; + in_crit = 0; + maxoffset = MAXOFF_T; + datamodel = DATAMODEL_NATIVE; +#if defined(_SYSCALL32_IMPL) + if ((datamodel = get_udatamodel()) == DATAMODEL_ILP32) + maxoffset = MAXOFF32_T; +#endif + + vp = fp->f_vnode; + flag = fp->f_flag; + offset = fp->f_offset; + + switch (cmd) { +/* ONC_PLUS EXTRACT START */ + /* + * The file system and vnode layers understand and implement + * locking with flock64 structures. So here once we pass through + * the test for compatibility as defined by LFS API, (for F_SETLK, + * F_SETLKW, F_GETLK, F_GETLKW, F_FREESP) we transform + * the flock structure to a flock64 structure and send it to the + * lower layers. Similarly in case of GETLK the returned flock64 + * structure is transformed to a flock structure if everything fits + * in nicely, otherwise we return EOVERFLOW. + */ + + case F_GETLK: + case F_O_GETLK: + case F_SETLK: + case F_SETLKW: + case F_SETLK_NBMAND: + + /* + * Copy in input fields only. + */ + + if (cmd == F_O_GETLK) { + if (datamodel != DATAMODEL_ILP32) { + error = EINVAL; + break; + } + + if (copyin((void *)arg, &obf, sizeof (obf))) { + error = EFAULT; + break; + } + bf.l_type = obf.l_type; + bf.l_whence = obf.l_whence; + bf.l_start = (off64_t)obf.l_start; + bf.l_len = (off64_t)obf.l_len; + bf.l_sysid = (int)obf.l_sysid; + bf.l_pid = obf.l_pid; + } else if (datamodel == DATAMODEL_NATIVE) { + if (copyin((void *)arg, &sbf, sizeof (sbf))) { + error = EFAULT; + break; + } + /* + * XXX In an LP64 kernel with an LP64 application + * there's no need to do a structure copy here + * struct flock == struct flock64. However, + * we did it this way to avoid more conditional + * compilation. + */ + bf.l_type = sbf.l_type; + bf.l_whence = sbf.l_whence; + bf.l_start = (off64_t)sbf.l_start; + bf.l_len = (off64_t)sbf.l_len; + bf.l_sysid = sbf.l_sysid; + bf.l_pid = sbf.l_pid; + } +#if defined(_SYSCALL32_IMPL) + else { + struct flock32 sbf32; + if (copyin((void *)arg, &sbf32, sizeof (sbf32))) { + error = EFAULT; + break; + } + bf.l_type = sbf32.l_type; + bf.l_whence = sbf32.l_whence; + bf.l_start = (off64_t)sbf32.l_start; + bf.l_len = (off64_t)sbf32.l_len; + bf.l_sysid = sbf32.l_sysid; + bf.l_pid = sbf32.l_pid; + } +#endif /* _SYSCALL32_IMPL */ + + /* + * 64-bit support: check for overflow for 32-bit lock ops + */ + if ((error = flock_check(vp, &bf, offset, maxoffset)) != 0) + break; + + /* + * Not all of the filesystems understand F_O_GETLK, and + * there's no need for them to know. Map it to F_GETLK. + */ + if ((error = VOP_FRLOCK(vp, (cmd == F_O_GETLK) ? F_GETLK : cmd, + &bf, flag, offset, NULL, fp->f_cred)) != 0) + break; + + /* + * If command is GETLK and no lock is found, only + * the type field is changed. 
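+		 * (Illustrative: a probe such as fcntl(fd, F_GETLK, &fl)
+		 * on an unlocked range returns with fl.l_type == F_UNLCK
+		 * and leaves the caller's other input fields as they were.)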
+ */ + if ((cmd == F_O_GETLK || cmd == F_GETLK) && + bf.l_type == F_UNLCK) { + /* l_type always first entry, always a short */ + if (copyout(&bf.l_type, &((struct flock *)arg)->l_type, + sizeof (bf.l_type))) + error = EFAULT; + break; + } + + if (cmd == F_O_GETLK) { + /* + * Return an SVR3 flock structure to the user. + */ + obf.l_type = (int16_t)bf.l_type; + obf.l_whence = (int16_t)bf.l_whence; + obf.l_start = (int32_t)bf.l_start; + obf.l_len = (int32_t)bf.l_len; + if (bf.l_sysid > SHRT_MAX || bf.l_pid > SHRT_MAX) { + /* + * One or both values for the above fields + * is too large to store in an SVR3 flock + * structure. + */ + error = EOVERFLOW; + break; + } + obf.l_sysid = (int16_t)bf.l_sysid; + obf.l_pid = (int16_t)bf.l_pid; + if (copyout(&obf, (void *)arg, sizeof (obf))) + error = EFAULT; + } else if (cmd == F_GETLK) { + /* + * Copy out SVR4 flock. + */ + int i; + + if (bf.l_start > maxoffset || bf.l_len > maxoffset) { + error = EOVERFLOW; + break; + } + + if (datamodel == DATAMODEL_NATIVE) { + for (i = 0; i < 4; i++) + sbf.l_pad[i] = 0; + /* + * XXX In an LP64 kernel with an LP64 + * application there's no need to do a + * structure copy here as currently + * struct flock == struct flock64. + * We did it this way to avoid more + * conditional compilation. + */ + sbf.l_type = bf.l_type; + sbf.l_whence = bf.l_whence; + sbf.l_start = (off_t)bf.l_start; + sbf.l_len = (off_t)bf.l_len; + sbf.l_sysid = bf.l_sysid; + sbf.l_pid = bf.l_pid; + if (copyout(&sbf, (void *)arg, sizeof (sbf))) + error = EFAULT; + } +#if defined(_SYSCALL32_IMPL) + else { + struct flock32 sbf32; + if (bf.l_start > MAXOFF32_T || + bf.l_len > MAXOFF32_T) { + error = EOVERFLOW; + break; + } + for (i = 0; i < 4; i++) + sbf32.l_pad[i] = 0; + sbf32.l_type = (int16_t)bf.l_type; + sbf32.l_whence = (int16_t)bf.l_whence; + sbf32.l_start = (off32_t)bf.l_start; + sbf32.l_len = (off32_t)bf.l_len; + sbf32.l_sysid = (int32_t)bf.l_sysid; + sbf32.l_pid = (pid32_t)bf.l_pid; + if (copyout(&sbf32, + (void *)arg, sizeof (sbf32))) + error = EFAULT; + } +#endif + } + break; +/* ONC_PLUS EXTRACT END */ + + case F_CHKFL: + /* + * This is for internal use only, to allow the vnode layer + * to validate a flags setting before applying it. User + * programs can't issue it. + */ + error = EINVAL; + break; + + case F_ALLOCSP: + case F_FREESP: + if ((flag & FWRITE) == 0) { + error = EBADF; + break; + } + if (vp->v_type != VREG) { + error = EINVAL; + break; + } + +#if defined(_ILP32) || defined(_SYSCALL32_IMPL) + if (datamodel == DATAMODEL_ILP32) { + struct flock32 sbf32; + /* + * For compatibility we overlay an SVR3 flock on an SVR4 + * flock. This works because the input field offsets + * in "struct flock" were preserved. + */ + if (copyin((void *)arg, &sbf32, sizeof (sbf32))) { + error = EFAULT; + break; + } else { + bf.l_type = sbf32.l_type; + bf.l_whence = sbf32.l_whence; + bf.l_start = (off64_t)sbf32.l_start; + bf.l_len = (off64_t)sbf32.l_len; + bf.l_sysid = sbf32.l_sysid; + bf.l_pid = sbf32.l_pid; + } + } +#endif /* _ILP32 || _SYSCALL32_IMPL */ + +#if defined(_LP64) + if (datamodel == DATAMODEL_LP64) { + if (copyin((void *)arg, &bf, sizeof (bf))) { + error = EFAULT; + break; + } + } +#endif + + if ((error = flock_check(vp, &bf, offset, maxoffset)) != 0) + break; + + if (vp->v_type == VREG && bf.l_len == 0 && + bf.l_start > OFFSET_MAX(fp)) { + error = EFBIG; + break; + } + + /* + * Make sure that there are no conflicting non-blocking + * mandatory locks in the region being manipulated. If + * there are such locks then return EACCES. 
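+		 * (Illustrative: on a file system mounted with the nbmand
+		 * option, an F_FREESP request over a range covered by some
+		 * other process's mandatory lock fails here with EACCES
+		 * instead of blocking.)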
+ */ + if ((error = flock_get_start(vp, &bf, offset, &start)) != 0) + break; + + if (nbl_need_check(vp)) { + u_offset_t begin; + ssize_t length; + + nbl_start_crit(vp, RW_READER); + in_crit = 1; + vattr.va_mask = AT_SIZE; + if ((error = VOP_GETATTR(vp, &vattr, 0, CRED())) != 0) + break; + begin = start > vattr.va_size ? vattr.va_size : start; + length = vattr.va_size > start ? vattr.va_size - start : + start - vattr.va_size; + if (nbl_conflict(vp, NBL_WRITE, begin, length, 0)) { + error = EACCES; + break; + } + } + error = VOP_SPACE(vp, cmd, &bf, flag, offset, fp->f_cred, NULL); + break; + +#if !defined(_LP64) || defined(_SYSCALL32_IMPL) +/* ONC_PLUS EXTRACT START */ + case F_GETLK64: + case F_SETLK64: + case F_SETLKW64: + case F_SETLK64_NBMAND: + /* + * Large Files: Here we set cmd as *LK and send it to + * lower layers. *LK64 is only for the user land. + * Most of the comments described above for F_SETLK + * applies here too. + * Large File support is only needed for ILP32 apps! + */ + if (datamodel != DATAMODEL_ILP32) { + error = EINVAL; + break; + } + + if (cmd == F_GETLK64) + cmd = F_GETLK; + else if (cmd == F_SETLK64) + cmd = F_SETLK; + else if (cmd == F_SETLKW64) + cmd = F_SETLKW; + else if (cmd == F_SETLK64_NBMAND) + cmd = F_SETLK_NBMAND; + + /* + * Note that the size of flock64 is different in the ILP32 + * and LP64 models, due to the sucking l_pad field. + * We do not want to assume that the flock64 structure is + * laid out in the same in ILP32 and LP64 environments, so + * we will copy in the ILP32 version of flock64 explicitly + * and copy it to the native flock64 structure. + */ + + if (copyin((void *)arg, &bf64_32, sizeof (bf64_32))) { + error = EFAULT; + break; + } + bf.l_type = (short)bf64_32.l_type; + bf.l_whence = (short)bf64_32.l_whence; + bf.l_start = bf64_32.l_start; + bf.l_len = bf64_32.l_len; + bf.l_sysid = (int)bf64_32.l_sysid; + bf.l_pid = (pid_t)bf64_32.l_pid; + + if ((error = flock_check(vp, &bf, offset, MAXOFFSET_T)) != 0) + break; + + if ((error = VOP_FRLOCK(vp, cmd, &bf, flag, offset, + NULL, fp->f_cred)) != 0) + break; + + if ((cmd == F_GETLK) && bf.l_type == F_UNLCK) { + if (copyout(&bf.l_type, &((struct flock *)arg)->l_type, + sizeof (bf.l_type))) + error = EFAULT; + break; + } + + if (cmd == F_GETLK) { + int i; + + /* + * We do not want to assume that the flock64 structure + * is laid out in the same in ILP32 and LP64 + * environments, so we will copy out the ILP32 version + * of flock64 explicitly after copying the native + * flock64 structure to it. + */ + for (i = 0; i < 4; i++) + bf64_32.l_pad[i] = 0; + bf64_32.l_type = (int16_t)bf.l_type; + bf64_32.l_whence = (int16_t)bf.l_whence; + bf64_32.l_start = bf.l_start; + bf64_32.l_len = bf.l_len; + bf64_32.l_sysid = (int32_t)bf.l_sysid; + bf64_32.l_pid = (pid32_t)bf.l_pid; + if (copyout(&bf64_32, (void *)arg, sizeof (bf64_32))) + error = EFAULT; + } + break; +/* ONC_PLUS EXTRACT END */ + + case F_FREESP64: + if (datamodel != DATAMODEL_ILP32) { + error = EINVAL; + break; + } + cmd = F_FREESP; + if ((flag & FWRITE) == 0) + error = EBADF; + else if (vp->v_type != VREG) + error = EINVAL; + else if (copyin((void *)arg, &bf64_32, sizeof (bf64_32))) + error = EFAULT; + else { + /* + * Note that the size of flock64 is different in + * the ILP32 and LP64 models, due to the l_pad field. + * We do not want to assume that the flock64 structure + * is laid out the same in ILP32 and LP64 + * environments, so we will copy in the ILP32 + * version of flock64 explicitly and copy it to + * the native flock64 structure. 
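+			 * (That is, sizeof (struct flock64_32) as seen by
+			 * an ILP32 caller need not equal the LP64 kernel's
+			 * sizeof (struct flock64), so the fields are copied
+			 * one at a time below rather than as a single block.)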
+ */ + bf.l_type = (short)bf64_32.l_type; + bf.l_whence = (short)bf64_32.l_whence; + bf.l_start = bf64_32.l_start; + bf.l_len = bf64_32.l_len; + bf.l_sysid = (int)bf64_32.l_sysid; + bf.l_pid = (pid_t)bf64_32.l_pid; + + if ((error = flock_check(vp, &bf, offset, + MAXOFFSET_T)) != 0) + break; + + if (vp->v_type == VREG && bf.l_len == 0 && + bf.l_start > OFFSET_MAX(fp)) { + error = EFBIG; + break; + } + /* + * Make sure that there are no conflicting non-blocking + * mandatory locks in the region being manipulated. If + * there are such locks then return EACCES. + */ + if ((error = flock_get_start(vp, &bf, offset, + &start)) != 0) + break; + if (nbl_need_check(vp)) { + u_offset_t begin; + ssize_t length; + + nbl_start_crit(vp, RW_READER); + in_crit = 1; + vattr.va_mask = AT_SIZE; + if ((error = VOP_GETATTR(vp, &vattr, 0, + CRED())) != 0) + break; + begin = start > vattr.va_size ? + vattr.va_size : start; + length = vattr.va_size > start ? + vattr.va_size - start : + start - vattr.va_size; + if (nbl_conflict(vp, NBL_WRITE, begin, + length, 0)) { + error = EACCES; + break; + } + } + error = VOP_SPACE(vp, cmd, &bf, flag, offset, + fp->f_cred, NULL); + } + break; +#endif /* !_LP64 || _SYSCALL32_IMPL */ + +/* ONC_PLUS EXTRACT START */ + case F_SHARE: + case F_SHARE_NBMAND: + case F_UNSHARE: + + /* + * Copy in input fields only. + */ + if (copyin((void *)arg, &fsh, sizeof (fsh))) { + error = EFAULT; + break; + } + + /* + * Local share reservations always have this simple form + */ + shr.s_access = fsh.f_access; + shr.s_deny = fsh.f_deny; + shr.s_sysid = 0; + shr.s_pid = ttoproc(curthread)->p_pid; + shr_own.sl_pid = shr.s_pid; + shr_own.sl_id = fsh.f_id; + shr.s_own_len = sizeof (shr_own); + shr.s_owner = (caddr_t)&shr_own; + error = VOP_SHRLOCK(vp, cmd, &shr, flag, fp->f_cred); +/* ONC_PLUS EXTRACT END */ + break; + + default: + error = EINVAL; + break; + } + + if (in_crit) + nbl_end_crit(vp); + +done: + releasef(fdes); +out: + if (error) + return (set_errno(error)); + return (retval); +} + +int +dup(int fd) +{ + return (fcntl(fd, F_DUPFD, 0)); +} + +/* ONC_PLUS EXTRACT START */ +int +flock_check(vnode_t *vp, flock64_t *flp, offset_t offset, offset_t max) +{ + struct vattr vattr; + int error; + u_offset_t start, end; + + /* + * Determine the starting point of the request + */ + switch (flp->l_whence) { + case 0: /* SEEK_SET */ + start = (u_offset_t)flp->l_start; + if (start > max) + return (EINVAL); + break; + case 1: /* SEEK_CUR */ + if (flp->l_start > (max - offset)) + return (EOVERFLOW); + start = (u_offset_t)(flp->l_start + offset); + if (start > max) + return (EINVAL); + break; + case 2: /* SEEK_END */ + vattr.va_mask = AT_SIZE; + if (error = VOP_GETATTR(vp, &vattr, 0, CRED())) + return (error); + if (flp->l_start > (max - (offset_t)vattr.va_size)) + return (EOVERFLOW); + start = (u_offset_t)(flp->l_start + (offset_t)vattr.va_size); + if (start > max) + return (EINVAL); + break; + default: + return (EINVAL); + } + + /* + * Determine the range covered by the request. + */ + if (flp->l_len == 0) + end = MAXEND; + else if ((offset_t)flp->l_len > 0) { + if (flp->l_len > (max - start + 1)) + return (EOVERFLOW); + end = (u_offset_t)(start + (flp->l_len - 1)); + ASSERT(end <= max); + } else { + /* + * Negative length; why do we even allow this ? + * Because this allows easy specification of + * the last n bytes of the file. 
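+		 * (Illustrative, following the arithmetic below: with
+		 * l_whence = SEEK_END, l_start = 0 and l_len = -n, the
+		 * request covers the n bytes ending at the end-of-file
+		 * offset.)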
+ */ + end = start; + start += (u_offset_t)flp->l_len; + (start)++; + if (start > max) + return (EINVAL); + ASSERT(end <= max); + } + ASSERT(start <= max); + if (flp->l_type == F_UNLCK && flp->l_len > 0 && + end == (offset_t)max) { + flp->l_len = 0; + } + if (start > end) + return (EINVAL); + return (0); +} + +static int +flock_get_start(vnode_t *vp, flock64_t *flp, offset_t offset, u_offset_t *start) +{ + struct vattr vattr; + int error; + + /* + * Determine the starting point of the request. Assume that it is + * a valid starting point. + */ + switch (flp->l_whence) { + case 0: /* SEEK_SET */ + *start = (u_offset_t)flp->l_start; + break; + case 1: /* SEEK_CUR */ + *start = (u_offset_t)(flp->l_start + offset); + break; + case 2: /* SEEK_END */ + vattr.va_mask = AT_SIZE; + if (error = VOP_GETATTR(vp, &vattr, 0, CRED())) + return (error); + *start = (u_offset_t)(flp->l_start + (offset_t)vattr.va_size); + break; + default: + return (EINVAL); + } + + return (0); +} + +/* + * Take rctl action when the requested file descriptor is too big. + */ +static void +fd_too_big(proc_t *p) +{ + mutex_enter(&p->p_lock); + (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE], + p->p_rctls, p, RCA_SAFE); + mutex_exit(&p->p_lock); +} +/* ONC_PLUS EXTRACT END */ diff --git a/usr/src/uts/common/syscall/fdsync.c b/usr/src/uts/common/syscall/fdsync.c new file mode 100644 index 0000000000..9951eb8727 --- /dev/null +++ b/usr/src/uts/common/syscall/fdsync.c @@ -0,0 +1,74 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 1998 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/isa_defs.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/cred.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/mode.h> +#include <sys/debug.h> + +/* + * Flush output pending for file. + */ +int +fdsync(int fd, int flag) +{ + file_t *fp; + register int error; + int syncflag; + + if ((fp = getf(fd)) != NULL) { + /* + * This flag will determine the file sync + * or data sync. 
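+		 * (An assumption for illustration, not stated here: the
+		 * libc wrappers fsync(3C) and fdatasync(3C) are the usual
+		 * callers, requesting the file-sync and data-sync flavors
+		 * respectively.)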
+ * FSYNC : file sync + * FDSYNC : data sync + */ + syncflag = flag & (FSYNC|FDSYNC); + + if (error = VOP_FSYNC(fp->f_vnode, syncflag, fp->f_cred)) + (void) set_errno(error); + releasef(fd); + } else + error = set_errno(EBADF); + return (error); +} diff --git a/usr/src/uts/common/syscall/fsat.c b/usr/src/uts/common/syscall/fsat.c new file mode 100644 index 0000000000..5e78a738c7 --- /dev/null +++ b/usr/src/uts/common/syscall/fsat.c @@ -0,0 +1,162 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/fcntl.h> +#include <sys/stat.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/time.h> +#include <sys/systm.h> +#include <sys/debug.h> + +extern int openat(int, char *, int, int); +extern int renameat(int, char *, int, char *); +extern int unlinkat(int, char *, int); +extern int fchownat(int, char *, uid_t, gid_t, int); +extern int fstatat(int, char *, struct stat *, int); +extern int futimesat(int, char *, struct timeval *); +#if defined(_SYSCALL32_IMPL) || defined(_ILP32) +extern int fstatat64_32(int, char *, struct stat64_32 *, int); +extern int fstatat32(int, char *, struct stat32 *, int); +extern int openat32(int, char *, int, int); +extern int fstatat64(int, char *, struct stat64 *, int); +extern int openat64(int, char *, int, int); +extern int fstatat64_32(int, char *, struct stat64_32 *, int); +#endif + + +/* + * Handle all of the *at system calls + * + * subcodes: + * 0 - openat + * 1 - openat64 + * 2 - fstatat64 + * 3 - fstatat + * 4 - fchownat + * 5 - unlinkat + * 6 - futimesat + * 7 - renameat + * + * The code for handling the at functionality exists in the file where the + * base syscall is defined. 
For example openat is in open.c + */ + +#if defined(_SYSCALL32_IMPL) || defined(_ILP32) + +int +fsat32(int code, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, + uintptr_t arg4, uintptr_t arg5) +{ + switch (code) { + + case 0: /* openat */ +#if defined(_LP64) + return (openat32((int)arg1, (char *)arg2, + (int)arg3, (int)arg4)); +#else + return (openat((int)arg1, (char *)arg2, + (int)arg3, (int)arg4)); +#endif + case 1: /* openat64 */ + return (openat64((int)arg1, (char *)arg2, + (int)arg3, (int)arg4)); + case 2: /* fstatat64 */ +#if defined(_LP64) + return (fstatat64_32((int)arg1, (char *)arg2, + (struct stat64_32 *)arg3, (int)arg4)); +#else + return (fstatat64((int)arg1, (char *)arg2, + (struct stat64 *)arg3, (int)arg4)); +#endif + case 3: /* fstatat */ +#if defined(_LP64) + return (fstatat32((int)arg1, (char *)arg2, + (struct stat32 *)arg3, (int)arg4)); +#else + return (fstatat((int)arg1, (char *)arg2, + (struct stat *)arg3, (int)arg4)); +#endif + case 4: /* fchownat */ + return (fchownat((int)arg1, (char *)arg2, + (uid_t)arg3, (gid_t)arg4, (int)arg5)); + case 5: /* unlinkat */ + return (unlinkat((int)arg1, (char *)arg2, (int)arg3)); + case 6: /* futimesat */ + return (futimesat((int)arg1, + (char *)arg2, (struct timeval *)arg3)); + case 7: /* renameat */ + return (renameat((int)arg1, (char *)arg2, (int)arg3, + (char *)arg4)); + default: + return (set_errno(EINVAL)); + } +} + +#endif + +/* + * For 64 kernels, use fsat64 + */ + +#if defined(_LP64) + +int +fsat64(int code, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, + uintptr_t arg4, uintptr_t arg5) +{ + switch (code) { + + case 0: /* openat */ + return (openat((int)arg1, (char *)arg2, + (int)arg3, (int)arg4)); + case 1: /* openat64 */ + return (set_errno(ENOSYS)); + case 2: /* fstatat64 */ + return (set_errno(ENOSYS)); + case 3: /* fstatat */ + return (fstatat((int)arg1, (char *)arg2, + (struct stat *)arg3, (int)arg4)); + case 4: /* fchownat */ + return (fchownat((int)arg1, (char *)arg2, + (uid_t)arg3, (gid_t)arg4, (int)arg5)); + case 5: /* unlinkat */ + return (unlinkat((int)arg1, (char *)arg2, (int)arg3)); + case 6: /* futimesat */ + return (futimesat((int)arg1, + (char *)arg2, (struct timeval *)arg3)); + case 7: /* renameat */ + return (renameat((int)arg1, (char *)arg2, (int)arg3, + (char *)arg4)); + default: + return (set_errno(EINVAL)); + } +} +#endif diff --git a/usr/src/uts/common/syscall/getcwd.c b/usr/src/uts/common/syscall/getcwd.c new file mode 100644 index 0000000000..f0ce066115 --- /dev/null +++ b/usr/src/uts/common/syscall/getcwd.c @@ -0,0 +1,81 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/copyops.h> +#include <sys/errno.h> +#include <sys/kmem.h> +#include <sys/param.h> +#include <sys/pathname.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/vnode.h> + +int +getcwd(char *buf, size_t buflen) +{ + int ret; + char *kbuf; + size_t kbuflen; + + /* + * The user should be able to specify any size buffer, but we don't want + * to arbitrarily allocate huge kernel buffers just because the user + * requests it. So we'll start with MAXPATHLEN (which should hold any + * normal path), and only increase it if we fail with ERANGE. + */ + kbuflen = MIN(buflen, MAXPATHLEN); + + for (;;) { + kbuf = kmem_alloc(kbuflen, KM_SLEEP); + + if ((ret = dogetcwd(kbuf, kbuflen)) == 0) + ret = copyout(kbuf, buf, strlen(kbuf) + 1); + + kmem_free(kbuf, kbuflen); + + if (ret == ENAMETOOLONG) { + /* + * If the user's buffer really was too small, give up. + * For some reason, getcwd() uses ERANGE for this case. + */ + if (kbuflen == buflen) { + ret = ERANGE; + break; + } + kbuflen = MIN(kbuflen * 2, buflen); + } else { + break; + } + } + + if (ret) + return (set_errno(ret)); + + return (ret); +} diff --git a/usr/src/uts/common/syscall/getdents.c b/usr/src/uts/common/syscall/getdents.c new file mode 100644 index 0000000000..fe97a02621 --- /dev/null +++ b/usr/src/uts/common/syscall/getdents.c @@ -0,0 +1,236 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/isa_defs.h> +#include <sys/types.h> +#include <sys/inttypes.h> +#include <sys/sysmacros.h> +#include <sys/cred.h> +#include <sys/dirent.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/mode.h> +#include <sys/uio.h> +#include <sys/ioreq.h> +#include <sys/filio.h> +#include <sys/debug.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> + +#if defined(_SYSCALL32_IMPL) || defined(_ILP32) + +/* + * Get directory entries in a file system-independent format. + * + * The 32-bit version of this function now allocates a buffer to grab the + * directory entries in dirent64 formats from VOP_READDIR routines. + * The dirent64 structures are converted to dirent32 structures and + * copied to the user space. 
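+ *
+ * Sketch of the conversion done below (illustrative): each dirent64
+ * entry { d_ino, d_off, d_reclen, d_name } is rewritten as a dirent32
+ * entry with 32-bit d_ino and d_off, and the call fails with EOVERFLOW
+ * if an inode number or directory offset does not fit in 32 bits.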
+ * + * Both 32-bit and 64-bit versions of libc use getdents64() and therefore + * we don't expect any major performance impact due to the extra kmem_alloc's + * and copying done in this routine. + */ + +#define MAXGETDENTS_SIZE (64 * 1024) + +/* + * Native 32-bit system call for non-large-file applications. + */ +int +getdents32(int fd, void *buf, size_t count) +{ + vnode_t *vp; + file_t *fp; + struct uio auio; + struct iovec aiov; + register int error; + int sink; + char *newbuf; + char *obuf; + int bufsize; + int osize, nsize; + struct dirent64 *dp; + struct dirent32 *op; + + if (count < sizeof (struct dirent32)) + return (set_errno(EINVAL)); + + if ((fp = getf(fd)) == NULL) + return (set_errno(EBADF)); + vp = fp->f_vnode; + if (vp->v_type != VDIR) { + releasef(fd); + return (set_errno(ENOTDIR)); + } + + /* + * Don't let the user overcommit kernel resources. + */ + if (count > MAXGETDENTS_SIZE) + count = MAXGETDENTS_SIZE; + + bufsize = count; + newbuf = kmem_alloc(bufsize, KM_SLEEP); + obuf = kmem_alloc(bufsize, KM_SLEEP); + + aiov.iov_base = newbuf; + aiov.iov_len = count; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_loffset = fp->f_offset; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_resid = count; + auio.uio_fmode = 0; + auio.uio_extflg = UIO_COPY_CACHED; + (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL); + error = VOP_READDIR(vp, &auio, fp->f_cred, &sink); + VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL); + if (error) + goto out; + count = count - auio.uio_resid; + fp->f_offset = auio.uio_loffset; + + dp = (struct dirent64 *)newbuf; + op = (struct dirent32 *)obuf; + osize = 0; + nsize = 0; + + while (nsize < count) { + uint32_t reclen, namlen; + + /* + * This check ensures that the 64 bit d_ino and d_off + * fields will fit into their 32 bit equivalents. + * + * Although d_off is a signed value, the check is done + * against the full 32 bits because certain file systems, + * NFS for one, allow directory cookies to use the full + * 32 bits. We use uint64_t because there is no exact + * unsigned analog to the off64_t type of dp->d_off. + */ + if (dp->d_ino > (ino64_t)UINT32_MAX || + dp->d_off > (uint64_t)UINT32_MAX) { + error = EOVERFLOW; + goto out; + } + op->d_ino = (ino32_t)dp->d_ino; + op->d_off = (off32_t)dp->d_off; + namlen = strlen(dp->d_name); + reclen = DIRENT32_RECLEN(namlen); + op->d_reclen = (uint16_t)reclen; + + /* use strncpy(9f) to zero out uninitialized bytes */ + + (void) strncpy(op->d_name, dp->d_name, + DIRENT32_NAMELEN(reclen)); + nsize += (uint_t)dp->d_reclen; + osize += (uint_t)op->d_reclen; + dp = (struct dirent64 *)((char *)dp + (uint_t)dp->d_reclen); + op = (struct dirent32 *)((char *)op + (uint_t)op->d_reclen); + } + + ASSERT(osize <= count); + ASSERT((char *)op <= (char *)obuf + bufsize); + ASSERT((char *)dp <= (char *)newbuf + bufsize); + + if ((error = copyout(obuf, buf, osize)) < 0) + error = EFAULT; +out: + kmem_free(newbuf, bufsize); + kmem_free(obuf, bufsize); + + if (error) { + releasef(fd); + return (set_errno(error)); + } + + releasef(fd); + return (osize); +} + +#endif /* _SYSCALL32 || _ILP32 */ + +int +getdents64(int fd, void *buf, size_t count) +{ + vnode_t *vp; + file_t *fp; + struct uio auio; + struct iovec aiov; + register int error; + int sink; + + if (count < sizeof (struct dirent64)) + return (set_errno(EINVAL)); + + /* + * Don't let the user overcommit kernel resources. 
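+	 * Requests beyond MAXGETDENTS_SIZE (64K) are silently clipped
+	 * rather than failed; a reader simply calls again for the rest.
+	 * Userland sketch, for illustration only:
+	 *
+	 *	while ((n = getdents64(fd, buf, sizeof (buf))) > 0) {
+	 *		for (off = 0; off < n; off += dp->d_reclen) {
+	 *			dp = (struct dirent64 *)&buf[off];
+	 *			...
+	 *		}
+	 *	}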
+ */ + if (count > MAXGETDENTS_SIZE) + count = MAXGETDENTS_SIZE; + + if ((fp = getf(fd)) == NULL) + return (set_errno(EBADF)); + vp = fp->f_vnode; + if (vp->v_type != VDIR) { + releasef(fd); + return (set_errno(ENOTDIR)); + } + aiov.iov_base = buf; + aiov.iov_len = count; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_loffset = fp->f_offset; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_resid = count; + auio.uio_fmode = 0; + auio.uio_extflg = UIO_COPY_CACHED; + (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL); + error = VOP_READDIR(vp, &auio, fp->f_cred, &sink); + VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL); + if (error) { + releasef(fd); + return (set_errno(error)); + } + count = count - auio.uio_resid; + fp->f_offset = auio.uio_loffset; + releasef(fd); + return (count); +} diff --git a/usr/src/uts/common/syscall/getloadavg.c b/usr/src/uts/common/syscall/getloadavg.c new file mode 100644 index 0000000000..c669f9b8ba --- /dev/null +++ b/usr/src/uts/common/syscall/getloadavg.c @@ -0,0 +1,68 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/loadavg.h> +#include <sys/zone.h> +#include <sys/pool_pset.h> + +/* + * Extract elements of the raw avenrun array from the kernel for the + * implementation of getloadavg(3c) + */ +int +getloadavg(int *buf, int nelem) +{ + int *loadbuf = &avenrun[0]; + int loadavg[LOADAVG_NSTATS]; + int error; + + if (nelem < 0) + return (set_errno(EINVAL)); + if (nelem > LOADAVG_NSTATS) + nelem = LOADAVG_NSTATS; + + if (!INGLOBALZONE(curproc)) { + mutex_enter(&cpu_lock); + if (pool_pset_enabled()) { + psetid_t psetid = zone_pset_get(curproc->p_zone); + + error = cpupart_get_loadavg(psetid, &loadavg[0], nelem); + ASSERT(error == 0); /* pset isn't going anywhere */ + loadbuf = &loadavg[0]; + } + mutex_exit(&cpu_lock); + } + + error = copyout(loadbuf, buf, nelem * sizeof (avenrun[0])); + if (error) + return (set_errno(EFAULT)); + return (nelem); +} diff --git a/usr/src/uts/common/syscall/getpagesizes.c b/usr/src/uts/common/syscall/getpagesizes.c new file mode 100644 index 0000000000..d53e9a9936 --- /dev/null +++ b/usr/src/uts/common/syscall/getpagesizes.c @@ -0,0 +1,122 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/systm.h> +#include <vm/page.h> +#include <sys/errno.h> + +/* + * Return supported page sizes. + */ +int +getpagesizes(size_t *buf, int nelem) +{ + int i, pagesizes = page_num_user_pagesizes(); + size_t *pgsza; + + if (nelem < 0) { + return (set_errno(EINVAL)); + } + if (nelem == 0 && buf != NULL) { + return (set_errno(EINVAL)); + } + if (nelem == 0 && buf == NULL) { + return (pagesizes); + } + if (buf == NULL) { + return (set_errno(EINVAL)); + } + if (nelem > pagesizes) { + nelem = pagesizes; + } + pgsza = kmem_alloc(sizeof (*pgsza) * nelem, KM_SLEEP); + for (i = 0; i < nelem; i++) { + pgsza[i] = page_get_user_pagesize(i); + } + if (copyout(pgsza, buf, nelem * sizeof (*pgsza)) != 0) { + kmem_free(pgsza, sizeof (*pgsza) * nelem); + return (set_errno(EFAULT)); + } + kmem_free(pgsza, sizeof (*pgsza) * nelem); + return (nelem); +} + +#if defined(_SYSCALL32_IMPL) + +/* + * Some future platforms will support page sizes larger than + * a 32-bit address space. + */ +int +getpagesizes32(size32_t *buf, int nelem) +{ + int i, pagesizes = page_num_user_pagesizes(); + size32_t *pgsza32; + size_t pgsz; + int rc; + + if (nelem < 0) { + return (set_errno(EINVAL)); + } + if (nelem == 0 && buf != NULL) { + return (set_errno(EINVAL)); + } + + pgsza32 = kmem_alloc(sizeof (*pgsza32) * pagesizes, KM_SLEEP); + for (i = 0; i < pagesizes; i++) { + pgsz = page_get_user_pagesize(i); + pgsza32[i] = (size32_t)pgsz; + if (pgsz > (size32_t)-1) { + pagesizes = i - 1; + break; + } + } + ASSERT(pagesizes > 0); + ASSERT(page_get_user_pagesize(pagesizes - 1) <= (size32_t)-1); + if (nelem > pagesizes) { + nelem = pagesizes; + } + if (nelem == 0 && buf == NULL) { + rc = pagesizes; + goto done; + } + if (buf == NULL) { + rc = set_errno(EINVAL); + goto done; + } + if (copyout(pgsza32, buf, nelem * sizeof (*pgsza32)) != 0) { + rc = set_errno(EFAULT); + goto done; + } + rc = nelem; +done: + kmem_free(pgsza32, sizeof (*pgsza32) * page_num_user_pagesizes()); + return (rc); +} +#endif diff --git a/usr/src/uts/common/syscall/getpid.c b/usr/src/uts/common/syscall/getpid.c new file mode 100644 index 0000000000..d061fe3b8b --- /dev/null +++ b/usr/src/uts/common/syscall/getpid.c @@ -0,0 +1,56 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/debug.h> +#include <sys/zone.h> + + +int64_t +getpid(void) +{ + rval_t r; + proc_t *p; + + p = ttoproc(curthread); + r.r_val1 = p->p_pid; + if (p->p_flag & SZONETOP) + r.r_val2 = curproc->p_zone->zone_zsched->p_pid; + else + r.r_val2 = p->p_ppid; + return (r.r_vals); +} diff --git a/usr/src/uts/common/syscall/gid.c b/usr/src/uts/common/syscall/gid.c new file mode 100644 index 0000000000..1cd5a4fd24 --- /dev/null +++ b/usr/src/uts/common/syscall/gid.c @@ -0,0 +1,235 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 1994,2001-2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/cred_impl.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/debug.h> +#include <sys/policy.h> + + +int +setgid(gid_t gid) +{ + register proc_t *p; + int error; + int do_nocd = 0; + cred_t *cr, *newcr; + + if (gid < 0 || gid > MAXUID) + return (set_errno(EINVAL)); + + /* + * Need to pre-allocate the new cred structure before grabbing + * the p_crlock mutex. + */ + newcr = cralloc(); + p = ttoproc(curthread); + mutex_enter(&p->p_crlock); + cr = p->p_cred; + + if ((gid == cr->cr_rgid || gid == cr->cr_sgid) && + secpolicy_allow_setid(cr, -1, B_TRUE) != 0) { + error = 0; + crcopy_to(cr, newcr); + p->p_cred = newcr; + newcr->cr_gid = gid; + } else if ((error = secpolicy_allow_setid(cr, -1, B_FALSE)) == 0) { + /* + * A privileged process that makes itself look like a + * set-gid process must be marked to produce no core dump. 
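+		 * (This is the SNOCD flag set further below; it keeps
+		 * a core dump from exposing state the process acquired
+		 * while it still held privilege.)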
+ */ + if (cr->cr_gid != gid || + cr->cr_rgid != gid || + cr->cr_sgid != gid) + do_nocd = 1; + crcopy_to(cr, newcr); + p->p_cred = newcr; + newcr->cr_gid = gid; + newcr->cr_rgid = gid; + newcr->cr_sgid = gid; + } else + crfree(newcr); + + mutex_exit(&p->p_crlock); + + if (error == 0) { + if (do_nocd) { + mutex_enter(&p->p_lock); + p->p_flag |= SNOCD; + mutex_exit(&p->p_lock); + } + crset(p, newcr); /* broadcast to process threads */ + return (0); + } + return (set_errno(error)); +} + +int64_t +getgid(void) +{ + rval_t r; + cred_t *cr; + + cr = curthread->t_cred; + r.r_val1 = cr->cr_rgid; + r.r_val2 = cr->cr_gid; + return (r.r_vals); +} + +int +setegid(gid_t gid) +{ + register proc_t *p; + register cred_t *cr, *newcr; + int error = EPERM; + int do_nocd = 0; + + if (gid < 0 || gid > MAXUID) + return (set_errno(EINVAL)); + + /* + * Need to pre-allocate the new cred structure before grabbing + * the p_crlock mutex. + */ + newcr = cralloc(); + p = ttoproc(curthread); + mutex_enter(&p->p_crlock); + cr = p->p_cred; + if (gid == cr->cr_rgid || gid == cr->cr_gid || gid == cr->cr_sgid || + (error = secpolicy_allow_setid(cr, -1, B_FALSE)) == 0) { + /* + * A privileged process that makes itself look like a + * set-gid process must be marked to produce no core dump. + */ + if (cr->cr_gid != gid && error == 0) + do_nocd = 1; + error = 0; + crcopy_to(cr, newcr); + p->p_cred = newcr; + newcr->cr_gid = gid; + } else + crfree(newcr); + + mutex_exit(&p->p_crlock); + + if (error == 0) { + if (do_nocd) { + mutex_enter(&p->p_lock); + p->p_flag |= SNOCD; + mutex_exit(&p->p_lock); + } + crset(p, newcr); /* broadcast to process threads */ + return (0); + } + return (set_errno(error)); +} + +/* + * Buy-back from SunOS 4.x + * + * Like setgid() and setegid() combined -except- that non-root users + * can change cr_rgid to cr_gid, and the semantics of cr_sgid are + * subtly different. + */ +int +setregid(gid_t rgid, gid_t egid) +{ + proc_t *p; + int error = EPERM; + int do_nocd = 0; + cred_t *cr, *newcr; + + if ((rgid != -1 && (rgid < 0 || rgid > MAXUID)) || + (egid != -1 && (egid < 0 || egid > MAXUID))) + return (set_errno(EINVAL)); + + /* + * Need to pre-allocate the new cred structure before grabbing + * the p_crlock mutex. + */ + newcr = cralloc(); + + p = ttoproc(curthread); + mutex_enter(&p->p_crlock); + cr = p->p_cred; + + if ((rgid == -1 || + rgid == cr->cr_rgid || rgid == cr->cr_gid || rgid == cr->cr_sgid) && + (egid == -1 || egid == cr->cr_rgid || egid == cr->cr_gid || + egid == cr->cr_sgid) || + (error = secpolicy_allow_setid(cr, -1, B_FALSE)) == 0) { + crhold(cr); + crcopy_to(cr, newcr); + p->p_cred = newcr; + + if (egid != -1) + newcr->cr_gid = egid; + if (rgid != -1) + newcr->cr_rgid = rgid; + /* + * "If the real gid is being changed, or the effective gid is + * being changed to a value not equal to the real gid, the + * saved gid is set to the new effective gid." + */ + if (rgid != -1 || + (egid != -1 && newcr->cr_gid != newcr->cr_rgid)) + newcr->cr_sgid = newcr->cr_gid; + /* + * A privileged process that makes itself look like a + * set-gid process must be marked to produce no core dump. 
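+		 *
+		 * A worked example of the saved-gid rule quoted above:
+		 * starting from rgid=100, egid=0, sgid=0,
+		 * setregid(-1, 100) drops the effective gid but leaves
+		 * sgid at 0, since the new effective gid equals the
+		 * real gid; a later setregid(-1, 0) is then still
+		 * permitted because 0 matches the saved gid.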
+ */ + if ((cr->cr_gid != newcr->cr_gid || + cr->cr_rgid != newcr->cr_rgid || + cr->cr_sgid != newcr->cr_sgid) && error == 0) + do_nocd = 1; + error = 0; + crfree(cr); + } + mutex_exit(&p->p_crlock); + + if (error == 0) { + if (do_nocd) { + mutex_enter(&p->p_lock); + p->p_flag |= SNOCD; + mutex_exit(&p->p_lock); + } + crset(p, newcr); /* broadcast to process threads */ + return (0); + } + crfree(newcr); + return (set_errno(error)); +} diff --git a/usr/src/uts/common/syscall/groups.c b/usr/src/uts/common/syscall/groups.c new file mode 100644 index 0000000000..88e3777afd --- /dev/null +++ b/usr/src/uts/common/syscall/groups.c @@ -0,0 +1,128 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T + * Copyright 2001-2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + +#pragma ident "%Z%%M% %I% %E% SMI" /* from SVr4.0 1.78 */ + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/cred_impl.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/debug.h> +#include <sys/kmem.h> +#include <sys/policy.h> + +int +setgroups(int gidsetsize, gid_t *gidset) +{ + proc_t *p; + cred_t *cr, *newcr; + int i; + int n = gidsetsize; + gid_t *groups = NULL; + int error; + + /* Perform the cheapest tests before grabbing p_crlock */ + if (n > ngroups_max || n < 0) + return (set_errno(EINVAL)); + + if (n != 0) { + groups = kmem_alloc(n * sizeof (gid_t), KM_SLEEP); + + if (copyin(gidset, groups, n * sizeof (gid_t)) != 0) { + kmem_free(groups, n * sizeof (gid_t)); + return (set_errno(EFAULT)); + } + + for (i = 0; i < n; i++) { + if (groups[i] < 0 || groups[i] > MAXUID) { + kmem_free(groups, n * sizeof (gid_t)); + return (set_errno(EINVAL)); + } + } + } + + /* + * Need to pre-allocate the new cred structure before acquiring + * the p_crlock mutex. + */ + newcr = cralloc(); + p = ttoproc(curthread); + mutex_enter(&p->p_crlock); + cr = p->p_cred; + + if ((error = secpolicy_allow_setid(cr, -1, B_FALSE)) != 0) { + mutex_exit(&p->p_crlock); + if (groups != NULL) + kmem_free(groups, n * sizeof (gid_t)); + crfree(newcr); + return (set_errno(error)); + } + + crdup_to(cr, newcr); + + if (n != 0) { + bcopy(groups, newcr->cr_groups, n * sizeof (gid_t)); + kmem_free(groups, n * sizeof (gid_t)); + } + + newcr->cr_ngroups = n; + + p->p_cred = newcr; + crhold(newcr); /* hold for the current thread */ + crfree(cr); /* free the old one */ + mutex_exit(&p->p_crlock); + + /* + * Broadcast new cred to process threads (including the current one). 
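+	 * (crset() is what actually hands the new cred to the threads;
+	 * a thread picks it up, at the latest, on its next entry into
+	 * the kernel.)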
+ */ + crset(p, newcr); + + return (0); +} + +int +getgroups(int gidsetsize, gid_t *gidset) +{ + struct cred *cr; + int n; + + cr = curthread->t_cred; + n = (int)cr->cr_ngroups; + + if (gidsetsize != 0) { + if (gidsetsize < n) + return (set_errno(EINVAL)); + if (copyout(cr->cr_groups, gidset, n * sizeof (gid_t))) + return (set_errno(EFAULT)); + } + + return (n); +} diff --git a/usr/src/uts/common/syscall/ioctl.c b/usr/src/uts/common/syscall/ioctl.c new file mode 100644 index 0000000000..c4b514d4de --- /dev/null +++ b/usr/src/uts/common/syscall/ioctl.c @@ -0,0 +1,169 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2001 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/isa_defs.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/cred.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/ttold.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/mode.h> +#include <sys/proc.h> +#include <sys/uio.h> +#include <sys/kmem.h> +#include <sys/filio.h> +#include <sys/sunddi.h> +#include <sys/debug.h> +#include <sys/int_limits.h> +#include <sys/model.h> + +/* + * I/O control. + */ + +int +ioctl(int fdes, int cmd, intptr_t arg) +{ + file_t *fp; + int error = 0; + vnode_t *vp; + struct vattr vattr; + int32_t flag; + int rv = 0; + + if ((fp = getf(fdes)) == NULL) + return (set_errno(EBADF)); + vp = fp->f_vnode; + + if (vp->v_type == VREG || vp->v_type == VDIR) { + /* + * Handle these two ioctls for regular files and + * directories. All others will usually be failed + * with ENOTTY by the VFS-dependent code. System V + * always failed all ioctls on regular files, but SunOS + * supported these. + */ + switch (cmd) { + case FIONREAD: { + /* + * offset is int32_t because that is what FIONREAD + * is defined in terms of. We cap at INT_MAX as in + * other cases for this ioctl. 
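+			 * A typical userland use, sketched here for
+			 * illustration only:
+			 *
+			 *	int pending;
+			 *
+			 *	if (ioctl(fd, FIONREAD, &pending) == 0)
+			 *		... pending bytes remain before
+			 *		    end of file ...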
+ */ + int32_t offset; + + vattr.va_mask = AT_SIZE; + error = VOP_GETATTR(vp, &vattr, 0, fp->f_cred); + if (error) { + releasef(fdes); + return (set_errno(error)); + } + offset = MIN(vattr.va_size - fp->f_offset, INT_MAX); + if (copyout(&offset, (caddr_t)arg, sizeof (offset))) { + releasef(fdes); + return (set_errno(EFAULT)); + } + releasef(fdes); + return (0); + } + + case FIONBIO: + if (copyin((caddr_t)arg, &flag, sizeof (flag))) { + releasef(fdes); + return (set_errno(EFAULT)); + } + mutex_enter(&fp->f_tlock); + if (flag) + fp->f_flag |= FNDELAY; + else + fp->f_flag &= ~FNDELAY; + mutex_exit(&fp->f_tlock); + releasef(fdes); + return (0); + + default: + break; + } + } + + /* + * ioctl() now passes in the model information in some high bits. + */ + flag = fp->f_flag | get_udatamodel(); + error = VOP_IOCTL(fp->f_vnode, cmd, arg, flag, CRED(), &rv); + if (error != 0) { + releasef(fdes); + return (set_errno(error)); + } + switch (cmd) { + case FIONBIO: + if (copyin((caddr_t)arg, &flag, sizeof (flag))) { + releasef(fdes); + return (set_errno(EFAULT)); + } + mutex_enter(&fp->f_tlock); + if (flag) + fp->f_flag |= FNDELAY; + else + fp->f_flag &= ~FNDELAY; + mutex_exit(&fp->f_tlock); + break; + + default: + break; + } + releasef(fdes); + return (rv); +} + +/* + * Old stty and gtty. (Still.) + */ +int +stty(int fdes, intptr_t arg) +{ + return (ioctl(fdes, TIOCSETP, arg)); +} + +int +gtty(int fdes, intptr_t arg) +{ + return (ioctl(fdes, TIOCGETP, arg)); +} diff --git a/usr/src/uts/common/syscall/issetugid.c b/usr/src/uts/common/syscall/issetugid.c new file mode 100644 index 0000000000..4c734a784a --- /dev/null +++ b/usr/src/uts/common/syscall/issetugid.c @@ -0,0 +1,40 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2001 by Sun Microsystems, Inc. + * All rights reserved. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/proc.h> + +/* + * System call returns true if the process was the result of exec'ing a set-uid + * or set-gid executable or was exec'ed with mismatch between real and + * effective uids or gids; false in all other cases. + */ +int +issetugid(void) +{ + return ((curproc->p_flag & SUGID) != 0); +} diff --git a/usr/src/uts/common/syscall/lgrpsys.c b/usr/src/uts/common/syscall/lgrpsys.c new file mode 100644 index 0000000000..09b9818ad6 --- /dev/null +++ b/usr/src/uts/common/syscall/lgrpsys.c @@ -0,0 +1,2105 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * lgroup system calls + */ + +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/sunddi.h> +#include <sys/systm.h> +#include <sys/mman.h> +#include <sys/cpupart.h> +#include <sys/lgrp.h> +#include <sys/lgrp_user.h> +#include <sys/promif.h> /* for prom_printf() */ +#include <sys/sysmacros.h> + +#include <vm/as.h> + + +/* definitions for mi_validity */ +#define VALID_ADDR 1 +#define VALID_REQ 2 + +/* + * run through the given number of addresses and requests and return the + * corresponding memory information for each address + */ +static int +meminfo(int addr_count, struct meminfo *mip) +{ + size_t in_size, out_size, req_size, val_size; + struct as *as; + struct hat *hat; + int i, j, out_idx, info_count; + lgrp_t *lgrp; + pfn_t pfn; + ssize_t pgsz; + int *req_array, *val_array; + uint64_t *in_array, *out_array; + uint64_t addr, paddr; + uintptr_t vaddr; + int ret = 0; + struct meminfo minfo; +#if defined(_SYSCALL32_IMPL) + struct meminfo32 minfo32; +#endif + + /* + * Make sure that there is at least one address to translate and + * limit how many virtual addresses the kernel can do per call + */ + if (addr_count < 1) + return (set_errno(EINVAL)); + else if (addr_count > MAX_MEMINFO_CNT) + addr_count = MAX_MEMINFO_CNT; + + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyin(mip, &minfo, sizeof (struct meminfo))) + return (set_errno(EFAULT)); + } +#if defined(_SYSCALL32_IMPL) + else { + bzero(&minfo, sizeof (minfo)); + if (copyin(mip, &minfo32, sizeof (struct meminfo32))) + return (set_errno(EFAULT)); + minfo.mi_inaddr = (const uint64_t *)(uintptr_t) + minfo32.mi_inaddr; + minfo.mi_info_req = (const uint_t *)(uintptr_t) + minfo32.mi_info_req; + minfo.mi_info_count = minfo32.mi_info_count; + minfo.mi_outdata = (uint64_t *)(uintptr_t) + minfo32.mi_outdata; + minfo.mi_validity = (uint_t *)(uintptr_t) + minfo32.mi_validity; + } +#endif + /* + * all the input parameters have been copied in:- + * addr_count - number of input addresses + * minfo.mi_inaddr - array of input addresses + * minfo.mi_info_req - array of types of information requested + * minfo.mi_info_count - no. of pieces of info requested for each addr + * minfo.mi_outdata - array into which the results are placed + * minfo.mi_validity - array containing bitwise result codes; 0th bit + * evaluates validity of corresponding input + * address, 1st bit validity of response to first + * member of info_req, etc. 
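+	 *
+	 * For example, with mi_info_count == 2 a fully answered address
+	 * ends up with validity (VALID_ADDR | VALID_REQ << 0 |
+	 * VALID_REQ << 1) == 0x7, while an unmapped address stays 0.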
+ */ + + /* make sure mi_info_count is within limit */ + info_count = minfo.mi_info_count; + if (info_count < 1 || info_count > MAX_MEMINFO_REQ) + return (set_errno(EINVAL)); + + /* + * allocate buffer in_array for the input addresses and copy them in + */ + in_size = sizeof (uint64_t) * addr_count; + in_array = kmem_alloc(in_size, KM_SLEEP); + if (copyin(minfo.mi_inaddr, in_array, in_size)) { + kmem_free(in_array, in_size); + return (set_errno(EFAULT)); + } + + /* + * allocate buffer req_array for the input info_reqs and copy them in + */ + req_size = sizeof (uint_t) * info_count; + req_array = kmem_alloc(req_size, KM_SLEEP); + if (copyin(minfo.mi_info_req, req_array, req_size)) { + kmem_free(req_array, req_size); + kmem_free(in_array, in_size); + return (set_errno(EFAULT)); + } + + /* + * allocate buffer out_array which holds the results and will have + * to be copied out later + */ + out_size = sizeof (uint64_t) * addr_count * info_count; + out_array = kmem_alloc(out_size, KM_SLEEP); + + /* + * allocate buffer val_array which holds the validity bits and will + * have to be copied out later + */ + val_size = sizeof (uint_t) * addr_count; + val_array = kmem_alloc(val_size, KM_SLEEP); + + if ((req_array[0] & MEMINFO_MASK) == MEMINFO_PLGRP) { + /* find the corresponding lgroup for each physical address */ + for (i = 0; i < addr_count; i++) { + paddr = in_array[i]; + pfn = btop(paddr); + lgrp = lgrp_pfn_to_lgrp(pfn); + if (lgrp) { + out_array[i] = lgrp->lgrp_id; + val_array[i] = VALID_ADDR | VALID_REQ; + } else { + out_array[i] = NULL; + val_array[i] = 0; + } + } + } else { + /* get the corresponding memory info for each virtual address */ + as = curproc->p_as; + + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + hat = as->a_hat; + for (i = out_idx = 0; i < addr_count; i++, out_idx += + info_count) { + addr = in_array[i]; + vaddr = (uintptr_t)(addr & ~PAGEOFFSET); + if (!as_segat(as, (caddr_t)vaddr)) { + val_array[i] = 0; + continue; + } + val_array[i] = VALID_ADDR; + pfn = hat_getpfnum(hat, (caddr_t)vaddr); + if (pfn != PFN_INVALID) { + paddr = (uint64_t)((pfn << PAGESHIFT) | + (addr & PAGEOFFSET)); + for (j = 0; j < info_count; j++) { + switch (req_array[j] & MEMINFO_MASK) { + case MEMINFO_VPHYSICAL: + /* + * return the physical address + * corresponding to the input + * virtual address + */ + out_array[out_idx + j] = paddr; + val_array[i] |= VALID_REQ << j; + break; + case MEMINFO_VLGRP: + /* + * return the lgroup of physical + * page corresponding to the + * input virtual address + */ + lgrp = lgrp_pfn_to_lgrp(pfn); + if (lgrp) { + out_array[out_idx + j] = + lgrp->lgrp_id; + val_array[i] |= + VALID_REQ << j; + } + break; + case MEMINFO_VPAGESIZE: + /* + * return the size of physical + * page corresponding to the + * input virtual address + */ + pgsz = hat_getpagesize(hat, + (caddr_t)vaddr); + if (pgsz != -1) { + out_array[out_idx + j] = + pgsz; + val_array[i] |= + VALID_REQ << j; + } + break; + case MEMINFO_VREPLCNT: + /* + * for future use:- + * return the no. 
replicated + * physical pages corresponding + * to the input virtual address, + * so it is always 0 at the + * moment + */ + out_array[out_idx + j] = 0; + val_array[i] |= VALID_REQ << j; + break; + case MEMINFO_VREPL: + /* + * for future use:- + * return the nth physical + * replica of the specified + * virtual address + */ + break; + case MEMINFO_VREPL_LGRP: + /* + * for future use:- + * return the lgroup of nth + * physical replica of the + * specified virtual address + */ + break; + case MEMINFO_PLGRP: + /* + * this is for physical address + * only, shouldn't mix with + * virtual address + */ + break; + default: + break; + } + } + } + } + AS_LOCK_EXIT(as, &as->a_lock); + } + + /* copy out the results and validity bits and free the buffers */ + if ((copyout(out_array, minfo.mi_outdata, out_size) != 0) || + (copyout(val_array, minfo.mi_validity, val_size) != 0)) + ret = set_errno(EFAULT); + + kmem_free(in_array, in_size); + kmem_free(out_array, out_size); + kmem_free(req_array, req_size); + kmem_free(val_array, val_size); + + return (ret); +} + + +/* + * Initialize lgroup affinities for thread + */ +void +lgrp_affinity_init(lgrp_affinity_t **bufaddr) +{ + if (bufaddr) + *bufaddr = NULL; +} + + +/* + * Free lgroup affinities for thread and set to NULL + * just in case thread gets recycled + */ +void +lgrp_affinity_free(lgrp_affinity_t **bufaddr) +{ + if (bufaddr && *bufaddr) { + kmem_free(*bufaddr, nlgrpsmax * sizeof (lgrp_affinity_t)); + *bufaddr = NULL; + } +} + + +#define P_ANY -2 /* cookie specifying any ID */ + + +/* + * Find LWP with given ID in specified process and get its affinity for + * specified lgroup + */ +lgrp_affinity_t +lgrp_affinity_get_thread(proc_t *p, id_t lwpid, lgrp_id_t lgrp) +{ + lgrp_affinity_t aff; + int found; + kthread_t *t; + + ASSERT(MUTEX_HELD(&p->p_lock)); + + aff = LGRP_AFF_NONE; + found = 0; + t = p->p_tlist; + /* + * The process may be executing in proc_exit() and its p->p_list may be + * already NULL. 
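+	 * (That is the p->p_tlist tested just below; ESRCH is returned
+	 * in that case.)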
+ */ + if (t == NULL) + return (set_errno(ESRCH)); + + do { + if (t->t_tid == lwpid || lwpid == P_ANY) { + thread_lock(t); + /* + * Check to see whether caller has permission to set + * affinity for LWP + */ + if (t->t_cid == 0 || !hasprocperm(t->t_cred, CRED())) { + thread_unlock(t); + return (set_errno(EPERM)); + } + + if (t->t_lgrp_affinity) + aff = t->t_lgrp_affinity[lgrp]; + thread_unlock(t); + found = 1; + break; + } + } while ((t = t->t_forw) != p->p_tlist); + if (!found) + aff = set_errno(ESRCH); + + return (aff); +} + + +/* + * Get lgroup affinity for given LWP + */ +lgrp_affinity_t +lgrp_affinity_get(lgrp_affinity_args_t *ap) +{ + lgrp_affinity_t aff; + lgrp_affinity_args_t args; + id_t id; + idtype_t idtype; + lgrp_id_t lgrp; + proc_t *p; + kthread_t *t; + + /* + * Copyin arguments + */ + if (copyin(ap, &args, sizeof (lgrp_affinity_args_t)) != 0) + return (set_errno(EFAULT)); + + id = args.id; + idtype = args.idtype; + lgrp = args.lgrp; + + /* + * Check for invalid lgroup + */ + if (lgrp < 0 || lgrp == LGRP_NONE) + return (set_errno(EINVAL)); + + /* + * Check for existing lgroup + */ + if (lgrp > lgrp_alloc_max) + return (set_errno(ESRCH)); + + /* + * Get lgroup affinity for given LWP or process + */ + switch (idtype) { + + case P_LWPID: + /* + * LWP in current process + */ + p = curproc; + mutex_enter(&p->p_lock); + if (id != P_MYID) /* different thread */ + aff = lgrp_affinity_get_thread(p, id, lgrp); + else { /* current thread */ + aff = LGRP_AFF_NONE; + t = curthread; + thread_lock(t); + if (t->t_lgrp_affinity) + aff = t->t_lgrp_affinity[lgrp]; + thread_unlock(t); + } + mutex_exit(&p->p_lock); + break; + + case P_PID: + /* + * Process + */ + mutex_enter(&pidlock); + + if (id == P_MYID) + p = curproc; + else { + p = prfind(id); + if (p == NULL) { + mutex_exit(&pidlock); + return (set_errno(ESRCH)); + } + } + + mutex_enter(&p->p_lock); + aff = lgrp_affinity_get_thread(p, P_ANY, lgrp); + mutex_exit(&p->p_lock); + + mutex_exit(&pidlock); + break; + + default: + aff = set_errno(EINVAL); + break; + } + + return (aff); +} + + +/* + * Find lgroup for which this thread has most affinity in specified partition + */ +lpl_t * +lgrp_affinity_best(kthread_t *t, struct cpupart *cpupart, lgrp_id_t start) +{ + lgrp_affinity_t *affs; + lgrp_affinity_t best_aff; + lpl_t *best_lpl; + lgrp_id_t home; + lgrp_id_t lgrpid; + lpl_t *lpl; + + ASSERT(t != NULL); + ASSERT((MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0) || + (MUTEX_HELD(&ttoproc(t)->p_lock) && THREAD_LOCK_HELD(t))); + ASSERT(cpupart != NULL); + + if (t->t_lgrp_affinity == NULL) + return (NULL); + + affs = t->t_lgrp_affinity; + + /* + * Thread bound to CPU + */ + if (t->t_bind_cpu != PBIND_NONE) { + cpu_t *cp; + + /* + * See whether thread has more affinity for root lgroup + * than lgroup containing CPU + */ + cp = cpu[t->t_bind_cpu]; + lpl = cp->cpu_lpl; + lgrpid = LGRP_ROOTID; + if (affs[lgrpid] > affs[lpl->lpl_lgrpid]) + return (&cpupart->cp_lgrploads[lgrpid]); + return (lpl); + } + + /* + * Start searching at given lgroup + */ + ASSERT(start >= 0 && start <= lgrp_alloc_max); + lgrpid = start; + + /* + * Begin with home as best lgroup if it's root or in this pset + * Otherwise, use starting lgroup given above as best first. + */ + home = t->t_lpl->lpl_lgrpid; + if (LGRP_CPUS_IN_PART(home, cpupart)) + best_lpl = &cpupart->cp_lgrploads[home]; + else + best_lpl = &cpupart->cp_lgrploads[lgrpid]; + + best_aff = affs[best_lpl->lpl_lgrpid]; + + do { + /* + * Skip any lgroups that don't have CPU resources + * in this processor set. 
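+		 * The scan is circular: with lgrp_alloc_max == 3 and
+		 * start == 2, for example, the IDs are visited in the
+		 * order 2, 3, 0, 1.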
+ */ + if (!LGRP_CPUS_IN_PART(lgrpid, cpupart)) { + if (++lgrpid > lgrp_alloc_max) + lgrpid = 0; /* wrap the search */ + continue; + } + + /* + * Find lgroup with most affinity + */ + lpl = &cpupart->cp_lgrploads[lgrpid]; + if (affs[lgrpid] > best_aff) { + best_aff = affs[lgrpid]; + best_lpl = lpl; + } + + if (++lgrpid > lgrp_alloc_max) + lgrpid = 0; /* wrap the search */ + + } while (lgrpid != start); + + /* + * No lgroup (in this pset) with any affinity + */ + if (best_aff == LGRP_AFF_NONE) + return (NULL); + + lgrpid = best_lpl->lpl_lgrpid; + ASSERT(LGRP_CPUS_IN_PART(lgrpid, cpupart) && best_lpl->lpl_ncpu > 0); + + return (best_lpl); +} + + +/* + * Set thread's affinity for given lgroup + */ +int +lgrp_affinity_set_thread(kthread_t *t, lgrp_id_t lgrp, lgrp_affinity_t aff, + lgrp_affinity_t **aff_buf) +{ + lpl_t *best_lpl; + lgrp_id_t home; + int retval; + lgrp_id_t start; + + ASSERT(t != NULL); + ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)); + + retval = 0; + + thread_lock(t); + + /* + * Check to see whether caller has permission to set affinity for + * thread + */ + if (t->t_cid == 0 || !hasprocperm(t->t_cred, CRED())) { + thread_unlock(t); + return (set_errno(EPERM)); + } + + if (t->t_lgrp_affinity == NULL) { + if (aff == LGRP_AFF_NONE) { + thread_unlock(t); + return (0); + } + ASSERT(aff_buf != NULL && *aff_buf != NULL); + t->t_lgrp_affinity = *aff_buf; + *aff_buf = NULL; + } + + t->t_lgrp_affinity[lgrp] = aff; + + /* + * Select a new home if the thread's affinity is being cleared + */ + if (aff == LGRP_AFF_NONE) { + lgrp_move_thread(t, lgrp_choose(t, t->t_cpupart), 1); + thread_unlock(t); + return (retval); + } + + /* + * Find lgroup for which thread has most affinity, + * starting after home + */ + home = t->t_lpl->lpl_lgrpid; + start = home + 1; + if (start > lgrp_alloc_max) + start = 0; + + best_lpl = lgrp_affinity_best(t, t->t_cpupart, start); + + /* + * Rehome if found lgroup with more affinity than home + */ + if (best_lpl != NULL && best_lpl != t->t_lpl) + lgrp_move_thread(t, best_lpl, 1); + + thread_unlock(t); + + return (retval); +} + + +/* + * Set process' affinity for specified lgroup + */ +int +lgrp_affinity_set_proc(proc_t *p, lgrp_id_t lgrp, lgrp_affinity_t aff, + lgrp_affinity_t **aff_buf_array) +{ + lgrp_affinity_t *buf; + int err = 0; + int i; + int retval; + kthread_t *t; + + ASSERT(MUTEX_HELD(&pidlock) && MUTEX_HELD(&p->p_lock)); + ASSERT(aff_buf_array != NULL); + + i = 0; + t = p->p_tlist; + if (t != NULL) { + do { + /* + * Set lgroup affinity for thread + */ + buf = aff_buf_array[i]; + retval = lgrp_affinity_set_thread(t, lgrp, aff, &buf); + + if (err == 0 && retval != 0) + err = retval; + + /* + * Advance pointer to next buffer + */ + if (buf == NULL) { + ASSERT(i < p->p_lwpcnt); + aff_buf_array[i] = NULL; + i++; + } + + } while ((t = t->t_forw) != p->p_tlist); + } + return (err); +} + + +/* + * Set LWP's or process' affinity for specified lgroup + * + * When setting affinities, pidlock, process p_lock, and thread_lock() + * need to be held in that order to protect target thread's pset, process, + * process contents, and thread contents. thread_lock() does splhigh(), + * so it ends up having similiar effect as kpreempt_disable(), so it will + * protect calls to lgrp_move_thread() and lgrp_choose() from pset changes. 
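+ *
+ * A sketch of the resulting order for the process (P_PID) case:
+ *
+ *	mutex_enter(&pidlock);
+ *	mutex_enter(&p->p_lock);
+ *	thread_lock(t);
+ *	... update t->t_lgrp_affinity, possibly rehome t ...
+ *	thread_unlock(t);
+ *	mutex_exit(&p->p_lock);
+ *	mutex_exit(&pidlock);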
+ */ +int +lgrp_affinity_set(lgrp_affinity_args_t *ap) +{ + lgrp_affinity_t aff; + lgrp_affinity_t *aff_buf; + lgrp_affinity_args_t args; + id_t id; + idtype_t idtype; + lgrp_id_t lgrp; + int nthreads; + proc_t *p; + int retval; + + /* + * Copyin arguments + */ + if (copyin(ap, &args, sizeof (lgrp_affinity_args_t)) != 0) + return (set_errno(EFAULT)); + + idtype = args.idtype; + id = args.id; + lgrp = args.lgrp; + aff = args.aff; + + /* + * Check for invalid lgroup + */ + if (lgrp < 0 || lgrp == LGRP_NONE) + return (set_errno(EINVAL)); + + /* + * Check for existing lgroup + */ + if (lgrp > lgrp_alloc_max) + return (set_errno(ESRCH)); + + /* + * Check for legal affinity + */ + if (aff != LGRP_AFF_NONE && aff != LGRP_AFF_WEAK && + aff != LGRP_AFF_STRONG) + return (set_errno(EINVAL)); + + /* + * Must be process or LWP ID + */ + if (idtype != P_LWPID && idtype != P_PID) + return (set_errno(EINVAL)); + + /* + * Set given LWP's or process' affinity for specified lgroup + */ + switch (idtype) { + + case P_LWPID: + /* + * Allocate memory for thread's lgroup affinities + * ahead of time w/o holding locks + */ + aff_buf = kmem_zalloc(nlgrpsmax * sizeof (lgrp_affinity_t), + KM_SLEEP); + + p = curproc; + + /* + * Set affinity for thread + */ + mutex_enter(&p->p_lock); + if (id == P_MYID) { /* current thread */ + retval = lgrp_affinity_set_thread(curthread, lgrp, aff, + &aff_buf); + } else if (p->p_tlist == NULL) { + retval = set_errno(ESRCH); + } else { /* other thread */ + int found = 0; + kthread_t *t; + + t = p->p_tlist; + do { + if (t->t_tid == id) { + retval = lgrp_affinity_set_thread(t, + lgrp, aff, &aff_buf); + found = 1; + break; + } + } while ((t = t->t_forw) != p->p_tlist); + if (!found) + retval = set_errno(ESRCH); + } + mutex_exit(&p->p_lock); + + /* + * Free memory for lgroup affinities, + * since thread didn't need it + */ + if (aff_buf) + kmem_free(aff_buf, + nlgrpsmax * sizeof (lgrp_affinity_t)); + + break; + + case P_PID: + + do { + lgrp_affinity_t **aff_buf_array; + int i; + size_t size; + + /* + * Get process + */ + mutex_enter(&pidlock); + + if (id == P_MYID) + p = curproc; + else + p = prfind(id); + + if (p == NULL) { + mutex_exit(&pidlock); + return (set_errno(ESRCH)); + } + + /* + * Get number of threads in process + * + * NOTE: Only care about user processes, + * so p_lwpcnt should be number of threads. + */ + mutex_enter(&p->p_lock); + nthreads = p->p_lwpcnt; + mutex_exit(&p->p_lock); + + mutex_exit(&pidlock); + + if (nthreads < 1) + return (set_errno(ESRCH)); + + /* + * Preallocate memory for lgroup affinities for + * each thread in process now to avoid holding + * any locks. Allocate an array to hold a buffer + * for each thread. + */ + aff_buf_array = kmem_zalloc(nthreads * + sizeof (lgrp_affinity_t *), KM_SLEEP); + + size = nlgrpsmax * sizeof (lgrp_affinity_t); + for (i = 0; i < nthreads; i++) + aff_buf_array[i] = kmem_zalloc(size, KM_SLEEP); + + mutex_enter(&pidlock); + + /* + * Get process again since dropped locks to allocate + * memory (except current process) + */ + if (id != P_MYID) + p = prfind(id); + + /* + * Process went away after we dropped locks and before + * reacquiring them, so drop locks, free memory, and + * return. 
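+			 * (The enclosing do/while re-runs the whole
+			 * sequence whenever p_lwpcnt changed while the
+			 * locks were dropped, so the preallocated
+			 * buffers always match the thread count seen
+			 * under p_lock.)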
+ */ + if (p == NULL) { + mutex_exit(&pidlock); + for (i = 0; i < nthreads; i++) + kmem_free(aff_buf_array[i], size); + kmem_free(aff_buf_array, + nthreads * sizeof (lgrp_affinity_t *)); + return (set_errno(ESRCH)); + } + + mutex_enter(&p->p_lock); + + /* + * See whether number of threads is same + * If not, drop locks, free memory, and try again + */ + if (nthreads != p->p_lwpcnt) { + mutex_exit(&p->p_lock); + mutex_exit(&pidlock); + for (i = 0; i < nthreads; i++) + kmem_free(aff_buf_array[i], size); + kmem_free(aff_buf_array, + nthreads * sizeof (lgrp_affinity_t *)); + continue; + } + + /* + * Set lgroup affinity for threads in process + */ + retval = lgrp_affinity_set_proc(p, lgrp, aff, + aff_buf_array); + + mutex_exit(&p->p_lock); + mutex_exit(&pidlock); + + /* + * Free any leftover memory, since some threads may + * have already allocated memory and set lgroup + * affinities before + */ + for (i = 0; i < nthreads; i++) + if (aff_buf_array[i] != NULL) + kmem_free(aff_buf_array[i], size); + kmem_free(aff_buf_array, + nthreads * sizeof (lgrp_affinity_t *)); + + break; + + } while (nthreads != p->p_lwpcnt); + + break; + + default: + retval = set_errno(EINVAL); + break; + } + + return (retval); +} + + +/* + * Return the latest generation number for the lgroup hierarchy + * with the given view + */ +lgrp_gen_t +lgrp_generation(lgrp_view_t view) +{ + cpupart_t *cpupart; + uint_t gen; + + kpreempt_disable(); + + /* + * Determine generation number for given view + */ + if (view == LGRP_VIEW_OS) + /* + * Return generation number of lgroup hierarchy for OS view + */ + gen = lgrp_gen; + else { + /* + * For caller's view, use generation numbers for lgroup + * hierarchy and caller's pset + * NOTE: Caller needs to check for change in pset ID + */ + cpupart = curthread->t_cpupart; + ASSERT(cpupart); + gen = lgrp_gen + cpupart->cp_gen; + } + + kpreempt_enable(); + + return (gen); +} + + +lgrp_id_t +lgrp_home_thread(kthread_t *t) +{ + lgrp_id_t home; + + ASSERT(t != NULL); + ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)); + + thread_lock(t); + + /* + * Check to see whether caller has permission to set affinity for + * thread + */ + if (t->t_cid == 0 || !hasprocperm(t->t_cred, CRED())) { + thread_unlock(t); + return (set_errno(EPERM)); + } + + home = lgrp_home_id(t); + + thread_unlock(t); + return (home); +} + + +/* + * Get home lgroup of given process or thread + */ +lgrp_id_t +lgrp_home_get(idtype_t idtype, id_t id) +{ + proc_t *p; + lgrp_id_t retval; + kthread_t *t; + + /* + * Get home lgroup of given LWP or process + */ + switch (idtype) { + + case P_LWPID: + p = curproc; + + /* + * Set affinity for thread + */ + mutex_enter(&p->p_lock); + if (id == P_MYID) { /* current thread */ + retval = lgrp_home_thread(curthread); + } else if (p->p_tlist == NULL) { + retval = set_errno(ESRCH); + } else { /* other thread */ + int found = 0; + + t = p->p_tlist; + do { + if (t->t_tid == id) { + retval = lgrp_home_thread(t); + found = 1; + break; + } + } while ((t = t->t_forw) != p->p_tlist); + if (!found) + retval = set_errno(ESRCH); + } + mutex_exit(&p->p_lock); + break; + + case P_PID: + /* + * Get process + */ + mutex_enter(&pidlock); + + if (id == P_MYID) + p = curproc; + else + p = prfind(id); + + if (p == NULL) { + mutex_exit(&pidlock); + return (set_errno(ESRCH)); + } + + mutex_enter(&p->p_lock); + t = p->p_tlist; + if (t == NULL) + retval = set_errno(ESRCH); + else + retval = lgrp_home_thread(t); + mutex_exit(&p->p_lock); + + mutex_exit(&pidlock); + + break; + + default: + retval = set_errno(EINVAL); + break; + } 
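+
+	/* retval is the home lgroup ID, or -1 with the error recorded */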
+ + return (retval); +} + + +/* + * Return latency between "from" and "to" lgroups + * + * This latency number can only be used for relative comparison + * between lgroups on the running system, cannot be used across platforms, + * and may not reflect the actual latency. It is platform and implementation + * specific, so platform gets to decide its value. It would be nice if the + * number was at least proportional to make comparisons more meaningful though. + */ +int +lgrp_latency(lgrp_id_t from, lgrp_id_t to) +{ + lgrp_t *from_lgrp; + int i; + int latency; + int latency_max; + lgrp_t *to_lgrp; + + ASSERT(MUTEX_HELD(&cpu_lock)); + + if (from < 0 || to < 0) + return (set_errno(EINVAL)); + + if (from > lgrp_alloc_max || to > lgrp_alloc_max) + return (set_errno(ESRCH)); + + from_lgrp = lgrp_table[from]; + to_lgrp = lgrp_table[to]; + + if (!LGRP_EXISTS(from_lgrp) || !LGRP_EXISTS(to_lgrp)) { + return (set_errno(ESRCH)); + } + + /* + * Get latency for same lgroup + */ + if (from == to) { + latency = from_lgrp->lgrp_latency; + return (latency); + } + + /* + * Get latency between leaf lgroups + */ + if (from_lgrp->lgrp_childcnt == 0 && to_lgrp->lgrp_childcnt == 0) + return (lgrp_plat_latency(from_lgrp->lgrp_plathand, + to_lgrp->lgrp_plathand)); + + /* + * Determine max latency between resources in two lgroups + */ + latency_max = 0; + for (i = 0; i <= lgrp_alloc_max; i++) { + lgrp_t *from_rsrc; + int j; + lgrp_t *to_rsrc; + + from_rsrc = lgrp_table[i]; + if (!LGRP_EXISTS(from_rsrc) || + !klgrpset_ismember(from_lgrp->lgrp_set[LGRP_RSRC_CPU], i)) + continue; + + for (j = 0; j <= lgrp_alloc_max; j++) { + to_rsrc = lgrp_table[j]; + if (!LGRP_EXISTS(to_rsrc) || + klgrpset_ismember(to_lgrp->lgrp_set[LGRP_RSRC_MEM], + j) == 0) + continue; + latency = lgrp_plat_latency(from_rsrc->lgrp_plathand, + to_rsrc->lgrp_plathand); + if (latency > latency_max) + latency_max = latency; + } + } + return (latency_max); +} + + +/* + * Return lgroup interface version number + * 0 - none + * 1 - original + * 2 - lgrp_latency_cookie() and lgrp_resources() added + */ +int +lgrp_version(int version) +{ + /* + * Return LGRP_VER_NONE when requested version isn't supported + */ + if (version < LGRP_VER_NONE || version > LGRP_VER_CURRENT) + return (LGRP_VER_NONE); + + /* + * Return current version when LGRP_VER_NONE passed in + */ + if (version == LGRP_VER_NONE) + return (LGRP_VER_CURRENT); + + /* + * Otherwise, return supported version. + */ + return (version); +} + + +/* + * Snapshot of lgroup hieararchy + * + * One snapshot is kept and is based on the kernel's native data model, so + * a 32-bit snapshot is kept for the 32-bit kernel and a 64-bit one for the + * 64-bit kernel. If a 32-bit user wants a snapshot from the 64-bit kernel, + * the kernel generates a 32-bit snapshot from the data in its 64-bit snapshot. 
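+ * (The sizing for that case is the _SYSCALL32_IMPL code in
+ * lgrp_snapshot() below, which returns the 32-bit buffer size to
+ * DATAMODEL_ILP32 callers.)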
+ * + * The format is defined by lgroup snapshot header and the layout of + * the snapshot in memory is as follows: + * 1) lgroup snapshot header + * - specifies format of snapshot + * - defined by lgrp_snapshot_header_t + * 2) lgroup info array + * - contains information about each lgroup + * - one element for each lgroup + * - each element is defined by lgrp_info_t + * 3) lgroup CPU ID array + * - contains list (array) of CPU IDs for each lgroup + * - lgrp_info_t points into array and specifies how many CPUs belong to + * given lgroup + * 4) lgroup parents array + * - contains lgroup bitmask of parents for each lgroup + * - bitmask is an array of unsigned longs and its size depends on nlgrpsmax + * 5) lgroup children array + * - contains lgroup bitmask of children for each lgroup + * - bitmask is an array of unsigned longs and its size depends on nlgrpsmax + * 6) lgroup resources array + * - contains lgroup bitmask of resources for each lgroup + * - bitmask is an array of unsigned longs and its size depends on nlgrpsmax + * 7) lgroup latency table + * - contains latency from each lgroup to each of other lgroups + * + * NOTE: Must use nlgrpsmax for per lgroup data structures because lgroups + * may be sparsely allocated. + */ +lgrp_snapshot_header_t *lgrp_snap = NULL; /* lgroup snapshot */ +static kmutex_t lgrp_snap_lock; /* snapshot lock */ + + +/* + * Take a snapshot of lgroup hierarchy and return size of buffer + * needed to hold snapshot + */ +static int +lgrp_snapshot(void) +{ + size_t bitmask_size; + size_t bitmasks_size; + size_t bufsize; + int cpu_index; + size_t cpuids_size; + int i; + int j; + size_t info_size; + size_t lats_size; + ulong_t *lgrp_children; + processorid_t *lgrp_cpuids; + lgrp_info_t *lgrp_info; + int **lgrp_lats; + ulong_t *lgrp_parents; + ulong_t *lgrp_rsets; + ulong_t *lgrpset; + int snap_ncpus; + int snap_nlgrps; + int snap_nlgrpsmax; + size_t snap_hdr_size; +#ifdef _SYSCALL32_IMPL + model_t model = DATAMODEL_NATIVE; + + /* + * Have up-to-date snapshot, so check to see whether caller is 32-bit + * program and need to return size of 32-bit snapshot now. + */ + model = get_udatamodel(); + if (model == DATAMODEL_ILP32 && lgrp_snap && + lgrp_snap->ss_gen == lgrp_gen) { + + snap_nlgrpsmax = lgrp_snap->ss_nlgrps_max; + + /* + * Calculate size of buffer needed for 32-bit snapshot, + * rounding up size of each object to allow for alignment + * of next object in buffer. 
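+	 * (P2ROUNDUP(x, align) rounds x up to the next multiple of the
+	 * power-of-two align; e.g. P2ROUNDUP(13, 8) == 16.)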
+ */ + snap_hdr_size = P2ROUNDUP(sizeof (lgrp_snapshot_header32_t), + sizeof (caddr32_t)); + info_size = + P2ROUNDUP(snap_nlgrpsmax * sizeof (lgrp_info32_t), + sizeof (processorid_t)); + cpuids_size = + P2ROUNDUP(lgrp_snap->ss_ncpus * sizeof (processorid_t), + sizeof (ulong_t)); + + /* + * lgroup bitmasks needed for parents, children, and resources + * for each lgroup and pset lgroup set + */ + bitmask_size = BT_SIZEOFMAP(snap_nlgrpsmax); + bitmasks_size = (((2 + LGRP_RSRC_COUNT) * + snap_nlgrpsmax) + 1) * bitmask_size; + + /* + * Size of latency table and buffer + */ + lats_size = snap_nlgrpsmax * sizeof (caddr32_t) + + snap_nlgrpsmax * snap_nlgrpsmax * sizeof (int); + + bufsize = snap_hdr_size + info_size + cpuids_size + + bitmasks_size + lats_size; + return (bufsize); + } +#endif /* _SYSCALL32_IMPL */ + + /* + * Check whether snapshot is up-to-date + * Free it and take another one if not + */ + if (lgrp_snap) { + if (lgrp_snap->ss_gen == lgrp_gen) + return (lgrp_snap->ss_size); + + kmem_free(lgrp_snap, lgrp_snap->ss_size); + lgrp_snap = NULL; + } + + /* + * Allocate memory for snapshot + * w/o holding cpu_lock while waiting for memory + */ + while (lgrp_snap == NULL) { + int old_generation; + + /* + * Take snapshot of lgroup generation number + * and configuration size dependent information + * NOTE: Only count number of online CPUs, + * since only online CPUs appear in lgroups. + */ + mutex_enter(&cpu_lock); + old_generation = lgrp_gen; + snap_ncpus = ncpus_online; + snap_nlgrps = nlgrps; + snap_nlgrpsmax = nlgrpsmax; + mutex_exit(&cpu_lock); + + /* + * Calculate size of buffer needed for snapshot, + * rounding up size of each object to allow for alignment + * of next object in buffer. + */ + snap_hdr_size = P2ROUNDUP(sizeof (lgrp_snapshot_header_t), + sizeof (void *)); + info_size = P2ROUNDUP(snap_nlgrpsmax * sizeof (lgrp_info_t), + sizeof (processorid_t)); + cpuids_size = P2ROUNDUP(snap_ncpus * sizeof (processorid_t), + sizeof (ulong_t)); + /* + * lgroup bitmasks needed for pset lgroup set and parents, + * children, and resource sets for each lgroup + */ + bitmask_size = BT_SIZEOFMAP(snap_nlgrpsmax); + bitmasks_size = (((2 + LGRP_RSRC_COUNT) * + snap_nlgrpsmax) + 1) * bitmask_size; + + /* + * Size of latency table and buffer + */ + lats_size = snap_nlgrpsmax * sizeof (int *) + + snap_nlgrpsmax * snap_nlgrpsmax * sizeof (int); + + bufsize = snap_hdr_size + info_size + cpuids_size + + bitmasks_size + lats_size; + + /* + * Allocate memory for buffer + */ + lgrp_snap = kmem_zalloc(bufsize, KM_NOSLEEP); + if (lgrp_snap == NULL) + return (set_errno(ENOMEM)); + + /* + * Check whether generation number has changed + */ + mutex_enter(&cpu_lock); + if (lgrp_gen == old_generation) + break; /* hasn't change, so done. */ + + /* + * Generation number changed, so free memory and try again. 
+ */ + mutex_exit(&cpu_lock); + kmem_free(lgrp_snap, bufsize); + lgrp_snap = NULL; + } + + /* + * Fill in lgroup snapshot header + * (including pointers to tables of lgroup info, CPU IDs, and parents + * and children) + */ + lgrp_snap->ss_version = LGRP_VER_CURRENT; + + /* + * XXX For now, liblgrp only needs to know whether the hierarchy + * XXX only has one level or not + */ + if (snap_nlgrps == 1) + lgrp_snap->ss_levels = 1; + else + lgrp_snap->ss_levels = 2; + + lgrp_snap->ss_root = LGRP_ROOTID; + + lgrp_snap->ss_nlgrps = lgrp_snap->ss_nlgrps_os = snap_nlgrps; + lgrp_snap->ss_nlgrps_max = snap_nlgrpsmax; + lgrp_snap->ss_ncpus = snap_ncpus; + lgrp_snap->ss_gen = lgrp_gen; + lgrp_snap->ss_view = LGRP_VIEW_OS; + lgrp_snap->ss_pset = 0; /* NOTE: caller should set if needed */ + lgrp_snap->ss_size = bufsize; + lgrp_snap->ss_magic = (uintptr_t)lgrp_snap; + + lgrp_snap->ss_info = lgrp_info = + (lgrp_info_t *)((uintptr_t)lgrp_snap + snap_hdr_size); + + lgrp_snap->ss_cpuids = lgrp_cpuids = + (processorid_t *)((uintptr_t)lgrp_info + info_size); + + lgrp_snap->ss_lgrpset = lgrpset = + (ulong_t *)((uintptr_t)lgrp_cpuids + cpuids_size); + + lgrp_snap->ss_parents = lgrp_parents = + (ulong_t *)((uintptr_t)lgrpset + bitmask_size); + + lgrp_snap->ss_children = lgrp_children = + (ulong_t *)((uintptr_t)lgrp_parents + (snap_nlgrpsmax * + bitmask_size)); + + lgrp_snap->ss_rsets = lgrp_rsets = + (ulong_t *)((uintptr_t)lgrp_children + (snap_nlgrpsmax * + bitmask_size)); + + lgrp_snap->ss_latencies = lgrp_lats = + (int **)((uintptr_t)lgrp_rsets + (LGRP_RSRC_COUNT * + snap_nlgrpsmax * bitmask_size)); + + /* + * Fill in lgroup information + */ + cpu_index = 0; + for (i = 0; i < snap_nlgrpsmax; i++) { + struct cpu *cp; + int cpu_count; + struct cpu *head; + int k; + lgrp_t *lgrp; + + lgrp = lgrp_table[i]; + if (!LGRP_EXISTS(lgrp)) { + bzero(&lgrp_info[i], sizeof (lgrp_info[i])); + lgrp_info[i].info_lgrpid = LGRP_NONE; + continue; + } + + lgrp_info[i].info_lgrpid = i; + lgrp_info[i].info_latency = lgrp->lgrp_latency; + + /* + * Fill in parents, children, and lgroup resources + */ + lgrp_info[i].info_parents = + (ulong_t *)((uintptr_t)lgrp_parents + (i * bitmask_size)); + + if (lgrp->lgrp_parent) + BT_SET(lgrp_info[i].info_parents, + lgrp->lgrp_parent->lgrp_id); + + lgrp_info[i].info_children = + (ulong_t *)((uintptr_t)lgrp_children + (i * bitmask_size)); + + for (j = 0; j < snap_nlgrpsmax; j++) + if (klgrpset_ismember(lgrp->lgrp_children, j)) + BT_SET(lgrp_info[i].info_children, j); + + lgrp_info[i].info_rset = + (ulong_t *)((uintptr_t)lgrp_rsets + + (i * LGRP_RSRC_COUNT * bitmask_size)); + + for (j = 0; j < LGRP_RSRC_COUNT; j++) { + ulong_t *rset; + + rset = (ulong_t *)((uintptr_t)lgrp_info[i].info_rset + + (j * bitmask_size)); + for (k = 0; k < snap_nlgrpsmax; k++) + if (klgrpset_ismember(lgrp->lgrp_set[j], k)) + BT_SET(rset, k); + } + + /* + * Fill in CPU IDs + */ + cpu_count = 0; + lgrp_info[i].info_cpuids = NULL; + cp = head = lgrp->lgrp_cpu; + if (head != NULL) { + lgrp_info[i].info_cpuids = &lgrp_cpuids[cpu_index]; + do { + lgrp_cpuids[cpu_index] = cp->cpu_id; + cpu_index++; + cpu_count++; + cp = cp->cpu_next_lgrp; + } while (cp != head); + } + ASSERT(cpu_count == lgrp->lgrp_cpucnt); + lgrp_info[i].info_ncpus = cpu_count; + + /* + * Fill in memory sizes for lgroups that directly contain + * memory + */ + if (klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], i)) { + lgrp_info[i].info_mem_free = + lgrp_mem_size(i, LGRP_MEM_SIZE_FREE); + lgrp_info[i].info_mem_install = + lgrp_mem_size(i, LGRP_MEM_SIZE_INSTALL); 
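+			/*
+			 * (SIZE_FREE is currently unallocated memory;
+			 * SIZE_INSTALL is all physically present memory.)
+			 */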
+ } + + /* + * Fill in latency table and buffer + */ + lgrp_lats[i] = (int *)((uintptr_t)lgrp_lats + snap_nlgrpsmax * + sizeof (int *) + i * snap_nlgrpsmax * sizeof (int)); + for (j = 0; j < snap_nlgrpsmax; j++) { + lgrp_t *to; + + to = lgrp_table[j]; + if (!LGRP_EXISTS(to)) + continue; + lgrp_lats[i][j] = lgrp_latency(lgrp->lgrp_id, + to->lgrp_id); + } + } + ASSERT(cpu_index == snap_ncpus); + + + mutex_exit(&cpu_lock); + +#ifdef _SYSCALL32_IMPL + /* + * Check to see whether caller is 32-bit program and need to return + * size of 32-bit snapshot now that snapshot has been taken/updated. + * May not have been able to do this earlier if snapshot was out of + * date or didn't exist yet. + */ + if (model == DATAMODEL_ILP32) { + + snap_nlgrpsmax = lgrp_snap->ss_nlgrps_max; + + /* + * Calculate size of buffer needed for 32-bit snapshot, + * rounding up size of each object to allow for alignment + * of next object in buffer. + */ + snap_hdr_size = P2ROUNDUP(sizeof (lgrp_snapshot_header32_t), + sizeof (caddr32_t)); + info_size = + P2ROUNDUP(snap_nlgrpsmax * sizeof (lgrp_info32_t), + sizeof (processorid_t)); + cpuids_size = + P2ROUNDUP(lgrp_snap->ss_ncpus * sizeof (processorid_t), + sizeof (ulong_t)); + + bitmask_size = BT_SIZEOFMAP(snap_nlgrpsmax); + bitmasks_size = (((2 + LGRP_RSRC_COUNT) * snap_nlgrpsmax) + + 1) * bitmask_size; + + + /* + * Size of latency table and buffer + */ + lats_size = (snap_nlgrpsmax * sizeof (caddr32_t)) + + (snap_nlgrpsmax * snap_nlgrpsmax * sizeof (int)); + + bufsize = snap_hdr_size + info_size + cpuids_size + + bitmasks_size + lats_size; + return (bufsize); + } +#endif /* _SYSCALL32_IMPL */ + + return (lgrp_snap->ss_size); +} + + +/* + * Copy snapshot into given user buffer, fix up any pointers in buffer to point + * into user instead of kernel address space, and return size of buffer + * needed to hold snapshot + */ +static int +lgrp_snapshot_copy(char *buf, size_t bufsize) +{ + size_t bitmask_size; + int cpu_index; + size_t cpuids_size; + int i; + size_t info_size; + lgrp_info_t *lgrp_info; + int retval; + size_t snap_hdr_size; + int snap_ncpus; + int snap_nlgrpsmax; + lgrp_snapshot_header_t *user_snap; + lgrp_info_t *user_info; + lgrp_info_t *user_info_buffer; + processorid_t *user_cpuids; + ulong_t *user_lgrpset; + ulong_t *user_parents; + ulong_t *user_children; + int **user_lats; + int **user_lats_buffer; + ulong_t *user_rsets; + + if (lgrp_snap == NULL) + return (0); + + if (buf == NULL || bufsize <= 0) + return (lgrp_snap->ss_size); + + /* + * User needs to try getting size of buffer again + * because given buffer size is too small. + * The lgroup hierarchy may have changed after they asked for the size + * but before the snapshot was taken. 
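+ *
+ * The expected calling pattern is thus a sizing call followed by a
+ * copy call, retried on EAGAIN. Illustrative user-level sketch only
+ * (real consumers normally go through liblgrp rather than invoking
+ * the lgrpsys trap directly; "lgrpsys" here stands for such a
+ * wrapper):
+ *
+ *	char *buf = NULL;
+ *	long size;
+ *	do {
+ *		size = lgrpsys(LGRP_SYS_SNAPSHOT, 0, NULL);
+ *		buf = realloc(buf, size);
+ *		size = lgrpsys(LGRP_SYS_SNAPSHOT, size, buf);
+ *	} while (size == -1 && errno == EAGAIN);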
+ */
+ if (bufsize < lgrp_snap->ss_size)
+ return (set_errno(EAGAIN));
+
+ snap_ncpus = lgrp_snap->ss_ncpus;
+ snap_nlgrpsmax = lgrp_snap->ss_nlgrps_max;
+
+ /*
+ * Fill in lgrpset now because caller may have changed psets
+ */
+ kpreempt_disable();
+ for (i = 0; i < snap_nlgrpsmax; i++) {
+ if (klgrpset_ismember(curthread->t_cpupart->cp_lgrpset,
+ i)) {
+ BT_SET(lgrp_snap->ss_lgrpset, i);
+ }
+ }
+ kpreempt_enable();
+
+ /*
+ * Copy lgroup snapshot (snapshot header, lgroup info, and CPU IDs)
+ * into user buffer all at once
+ */
+ if (copyout(lgrp_snap, buf, lgrp_snap->ss_size) != 0)
+ return (set_errno(EFAULT));
+
+ /*
+ * Round up sizes of lgroup snapshot header and info for alignment
+ */
+ snap_hdr_size = P2ROUNDUP(sizeof (lgrp_snapshot_header_t),
+ sizeof (void *));
+ info_size = P2ROUNDUP(snap_nlgrpsmax * sizeof (lgrp_info_t),
+ sizeof (processorid_t));
+ cpuids_size = P2ROUNDUP(snap_ncpus * sizeof (processorid_t),
+ sizeof (ulong_t));
+
+ bitmask_size = BT_SIZEOFMAP(snap_nlgrpsmax);
+
+ /*
+ * Calculate pointers into user buffer for lgroup snapshot header,
+ * info, and CPU IDs
+ */
+ user_snap = (lgrp_snapshot_header_t *)buf;
+ user_info = (lgrp_info_t *)((uintptr_t)user_snap + snap_hdr_size);
+ user_cpuids = (processorid_t *)((uintptr_t)user_info + info_size);
+ user_lgrpset = (ulong_t *)((uintptr_t)user_cpuids + cpuids_size);
+ user_parents = (ulong_t *)((uintptr_t)user_lgrpset + bitmask_size);
+ user_children = (ulong_t *)((uintptr_t)user_parents +
+ (snap_nlgrpsmax * bitmask_size));
+ user_rsets = (ulong_t *)((uintptr_t)user_children +
+ (snap_nlgrpsmax * bitmask_size));
+ user_lats = (int **)((uintptr_t)user_rsets +
+ (LGRP_RSRC_COUNT * snap_nlgrpsmax * bitmask_size));
+
+ /*
+ * Copyout magic number (i.e., pointer to beginning of buffer)
+ */
+ if (copyout(&buf, &user_snap->ss_magic, sizeof (buf)) != 0)
+ return (set_errno(EFAULT));
+
+ /*
+ * Fix up pointers in user buffer to point into user buffer
+ * not kernel snapshot
+ */
+ if (copyout(&user_info, &user_snap->ss_info, sizeof (user_info)) != 0)
+ return (set_errno(EFAULT));
+
+ if (copyout(&user_cpuids, &user_snap->ss_cpuids,
+ sizeof (user_cpuids)) != 0)
+ return (set_errno(EFAULT));
+
+ if (copyout(&user_lgrpset, &user_snap->ss_lgrpset,
+ sizeof (user_lgrpset)) != 0)
+ return (set_errno(EFAULT));
+
+ if (copyout(&user_parents, &user_snap->ss_parents,
+ sizeof (user_parents)) != 0)
+ return (set_errno(EFAULT));
+
+ if (copyout(&user_children, &user_snap->ss_children,
+ sizeof (user_children)) != 0)
+ return (set_errno(EFAULT));
+
+ if (copyout(&user_rsets, &user_snap->ss_rsets,
+ sizeof (user_rsets)) != 0)
+ return (set_errno(EFAULT));
+
+ if (copyout(&user_lats, &user_snap->ss_latencies,
+ sizeof (user_lats)) != 0)
+ return (set_errno(EFAULT));
+
+ /*
+ * Make copies of lgroup info and latency table, fix up pointers,
+ * and then copy them into user buffer
+ */
+ user_info_buffer = kmem_zalloc(info_size, KM_NOSLEEP);
+ if (user_info_buffer == NULL)
+ return (set_errno(ENOMEM));
+
+ user_lats_buffer = kmem_zalloc(snap_nlgrpsmax * sizeof (int *),
+ KM_NOSLEEP);
+ if (user_lats_buffer == NULL) {
+ kmem_free(user_info_buffer, info_size);
+ return (set_errno(ENOMEM));
+ }
+
+ lgrp_info = (lgrp_info_t *)((uintptr_t)lgrp_snap + snap_hdr_size);
+ bcopy(lgrp_info, user_info_buffer, info_size);
+
+ cpu_index = 0;
+ for (i = 0; i < snap_nlgrpsmax; i++) {
+ ulong_t *snap_rset;
+
+ /*
+ * Skip non-existent lgroups
+ */
+ if (user_info_buffer[i].info_lgrpid == LGRP_NONE)
+ continue;
+
+ /*
+ * Update free memory size since it changes
frequently + * Only do so for lgroups directly containing memory + * + * NOTE: This must be done before changing the pointers to + * point into user space since we need to dereference + * lgroup resource set + */ + snap_rset = &lgrp_info[i].info_rset[LGRP_RSRC_MEM * + BT_BITOUL(snap_nlgrpsmax)]; + if (BT_TEST(snap_rset, i)) + user_info_buffer[i].info_mem_free = + lgrp_mem_size(i, LGRP_MEM_SIZE_FREE); + + /* + * Fix up pointers to parents, children, resources, and + * latencies + */ + user_info_buffer[i].info_parents = + (ulong_t *)((uintptr_t)user_parents + (i * bitmask_size)); + user_info_buffer[i].info_children = + (ulong_t *)((uintptr_t)user_children + (i * bitmask_size)); + user_info_buffer[i].info_rset = + (ulong_t *)((uintptr_t)user_rsets + + (i * LGRP_RSRC_COUNT * bitmask_size)); + user_lats_buffer[i] = (int *)((uintptr_t)user_lats + + (snap_nlgrpsmax * sizeof (int *)) + (i * snap_nlgrpsmax * + sizeof (int))); + + /* + * Fix up pointer to CPU IDs + */ + if (user_info_buffer[i].info_ncpus == 0) { + user_info_buffer[i].info_cpuids = NULL; + continue; + } + user_info_buffer[i].info_cpuids = &user_cpuids[cpu_index]; + cpu_index += user_info_buffer[i].info_ncpus; + } + ASSERT(cpu_index == snap_ncpus); + + /* + * Copy lgroup info and latency table with pointers fixed up to point + * into user buffer out to user buffer now + */ + retval = lgrp_snap->ss_size; + if (copyout(user_info_buffer, user_info, info_size) != 0) + retval = set_errno(EFAULT); + kmem_free(user_info_buffer, info_size); + + if (copyout(user_lats_buffer, user_lats, snap_nlgrpsmax * + sizeof (int *)) != 0) + retval = set_errno(EFAULT); + kmem_free(user_lats_buffer, snap_nlgrpsmax * sizeof (int *)); + + return (retval); +} + + +#ifdef _SYSCALL32_IMPL +/* + * Make 32-bit copy of snapshot, fix up any pointers in buffer to point + * into user instead of kernel address space, copy 32-bit snapshot into + * given user buffer, and return size of buffer needed to hold snapshot + */ +static int +lgrp_snapshot_copy32(caddr32_t buf, size32_t bufsize) +{ + size32_t bitmask_size; + size32_t bitmasks_size; + size32_t children_size; + int cpu_index; + size32_t cpuids_size; + int i; + int j; + size32_t info_size; + size32_t lats_size; + lgrp_info_t *lgrp_info; + lgrp_snapshot_header32_t *lgrp_snap32; + lgrp_info32_t *lgrp_info32; + processorid_t *lgrp_cpuids32; + caddr32_t *lgrp_lats32; + int **lgrp_lats32_kernel; + uint_t *lgrp_set32; + uint_t *lgrp_parents32; + uint_t *lgrp_children32; + uint_t *lgrp_rsets32; + size32_t parents_size; + size32_t rsets_size; + size32_t set_size; + size32_t snap_hdr_size; + int snap_ncpus; + int snap_nlgrpsmax; + size32_t snap_size; + + if (lgrp_snap == NULL) + return (0); + + snap_ncpus = lgrp_snap->ss_ncpus; + snap_nlgrpsmax = lgrp_snap->ss_nlgrps_max; + + /* + * Calculate size of buffer needed for 32-bit snapshot, + * rounding up size of each object to allow for alignment + * of next object in buffer. 
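+ *
+ * (P2ROUNDUP(x, a) rounds x up to the next multiple of the
+ * power-of-two a, i.e. it is equivalent to ((x + a - 1) & ~(a - 1));
+ * for example P2ROUNDUP(10, 8) == 16, so a 10-byte object would be
+ * padded to 16 bytes before the next array is laid down.)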
+ */ + snap_hdr_size = P2ROUNDUP(sizeof (lgrp_snapshot_header32_t), + sizeof (caddr32_t)); + info_size = P2ROUNDUP(snap_nlgrpsmax * sizeof (lgrp_info32_t), + sizeof (processorid_t)); + cpuids_size = P2ROUNDUP(snap_ncpus * sizeof (processorid_t), + sizeof (ulong_t)); + + bitmask_size = BT_SIZEOFMAP32(snap_nlgrpsmax); + + set_size = bitmask_size; + parents_size = snap_nlgrpsmax * bitmask_size; + children_size = snap_nlgrpsmax * bitmask_size; + rsets_size = P2ROUNDUP(LGRP_RSRC_COUNT * snap_nlgrpsmax * + (int)bitmask_size, sizeof (caddr32_t)); + + bitmasks_size = set_size + parents_size + children_size + rsets_size; + + /* + * Size of latency table and buffer + */ + lats_size = (snap_nlgrpsmax * sizeof (caddr32_t)) + + (snap_nlgrpsmax * snap_nlgrpsmax * sizeof (int)); + + snap_size = snap_hdr_size + info_size + cpuids_size + bitmasks_size + + lats_size; + + if (buf == NULL || bufsize <= 0) { + return (snap_size); + } + + /* + * User needs to try getting size of buffer again + * because given buffer size is too small. + * The lgroup hierarchy may have changed after they asked for the size + * but before the snapshot was taken. + */ + if (bufsize < snap_size) + return (set_errno(EAGAIN)); + + /* + * Make 32-bit copy of snapshot, fix up pointers to point into user + * buffer not kernel, and then copy whole thing into user buffer + */ + lgrp_snap32 = kmem_zalloc(snap_size, KM_NOSLEEP); + if (lgrp_snap32 == NULL) + return (set_errno(ENOMEM)); + + /* + * Calculate pointers into 32-bit copy of snapshot + * for lgroup info, CPU IDs, pset lgroup bitmask, parents, children, + * resources, and latency table and buffer + */ + lgrp_info32 = (lgrp_info32_t *)((uintptr_t)lgrp_snap32 + + snap_hdr_size); + lgrp_cpuids32 = (processorid_t *)((uintptr_t)lgrp_info32 + info_size); + lgrp_set32 = (uint_t *)((uintptr_t)lgrp_cpuids32 + cpuids_size); + lgrp_parents32 = (uint_t *)((uintptr_t)lgrp_set32 + set_size); + lgrp_children32 = (uint_t *)((uintptr_t)lgrp_parents32 + parents_size); + lgrp_rsets32 = (uint_t *)((uintptr_t)lgrp_children32 + children_size); + lgrp_lats32 = (caddr32_t *)((uintptr_t)lgrp_rsets32 + rsets_size); + + /* + * Make temporary lgroup latency table of pointers for kernel to use + * to fill in rows of table with latencies from each lgroup + */ + lgrp_lats32_kernel = kmem_zalloc(snap_nlgrpsmax * sizeof (int *), + KM_NOSLEEP); + if (lgrp_lats32_kernel == NULL) { + kmem_free(lgrp_snap32, snap_size); + return (set_errno(ENOMEM)); + } + + /* + * Fill in 32-bit lgroup snapshot header + * (with pointers into user's buffer for lgroup info, CPU IDs, + * bit masks, and latencies) + */ + lgrp_snap32->ss_version = lgrp_snap->ss_version; + lgrp_snap32->ss_levels = lgrp_snap->ss_levels; + lgrp_snap32->ss_nlgrps = lgrp_snap32->ss_nlgrps_os = + lgrp_snap->ss_nlgrps; + lgrp_snap32->ss_nlgrps_max = snap_nlgrpsmax; + lgrp_snap32->ss_root = lgrp_snap->ss_root; + lgrp_snap32->ss_ncpus = lgrp_snap->ss_ncpus; + lgrp_snap32->ss_gen = lgrp_snap->ss_gen; + lgrp_snap32->ss_view = LGRP_VIEW_OS; + lgrp_snap32->ss_size = snap_size; + lgrp_snap32->ss_magic = buf; + lgrp_snap32->ss_info = buf + snap_hdr_size; + lgrp_snap32->ss_cpuids = lgrp_snap32->ss_info + info_size; + lgrp_snap32->ss_lgrpset = lgrp_snap32->ss_cpuids + cpuids_size; + lgrp_snap32->ss_parents = lgrp_snap32->ss_lgrpset + bitmask_size; + lgrp_snap32->ss_children = lgrp_snap32->ss_parents + + (snap_nlgrpsmax * bitmask_size); + lgrp_snap32->ss_rsets = lgrp_snap32->ss_children + + (snap_nlgrpsmax * bitmask_size); + lgrp_snap32->ss_latencies = lgrp_snap32->ss_rsets + 
+ (LGRP_RSRC_COUNT * snap_nlgrpsmax * bitmask_size);
+
+ /*
+ * Fill in lgrpset now because caller may have changed psets
+ */
+ kpreempt_disable();
+ for (i = 0; i < snap_nlgrpsmax; i++) {
+ if (klgrpset_ismember(curthread->t_cpupart->cp_lgrpset,
+ i)) {
+ BT_SET32(lgrp_set32, i);
+ }
+ }
+ kpreempt_enable();
+
+ /*
+ * Fill in 32-bit copy of lgroup info and fix up pointers
+ * to point into user's buffer instead of kernel's
+ */
+ cpu_index = 0;
+ lgrp_info = lgrp_snap->ss_info;
+ for (i = 0; i < snap_nlgrpsmax; i++) {
+ uint_t *children;
+ uint_t *lgrp_rset;
+ uint_t *parents;
+ ulong_t *snap_rset;
+
+ /*
+ * Skip non-existent lgroups
+ */
+ if (lgrp_info[i].info_lgrpid == LGRP_NONE) {
+ bzero(&lgrp_info32[i], sizeof (lgrp_info32[i]));
+ lgrp_info32[i].info_lgrpid = LGRP_NONE;
+ continue;
+ }
+
+ /*
+ * Fill in parents, children, lgroup resource set, and
+ * latencies from snapshot
+ */
+ parents = (uint_t *)((uintptr_t)lgrp_parents32 +
+ i * bitmask_size);
+ children = (uint_t *)((uintptr_t)lgrp_children32 +
+ i * bitmask_size);
+ snap_rset = (ulong_t *)((uintptr_t)lgrp_snap->ss_rsets +
+ (i * LGRP_RSRC_COUNT * BT_SIZEOFMAP(snap_nlgrpsmax)));
+ lgrp_rset = (uint_t *)((uintptr_t)lgrp_rsets32 +
+ (i * LGRP_RSRC_COUNT * bitmask_size));
+ lgrp_lats32_kernel[i] = (int *)((uintptr_t)lgrp_lats32 +
+ snap_nlgrpsmax * sizeof (caddr32_t) + i * snap_nlgrpsmax *
+ sizeof (int));
+ for (j = 0; j < snap_nlgrpsmax; j++) {
+ int k;
+ uint_t *rset;
+
+ if (BT_TEST(&lgrp_snap->ss_parents[i], j))
+ BT_SET32(parents, j);
+
+ if (BT_TEST(&lgrp_snap->ss_children[i], j))
+ BT_SET32(children, j);
+
+ for (k = 0; k < LGRP_RSRC_COUNT; k++) {
+ rset = (uint_t *)((uintptr_t)lgrp_rset +
+ k * bitmask_size);
+ if (BT_TEST(&snap_rset[k], j))
+ BT_SET32(rset, j);
+ }
+
+ lgrp_lats32_kernel[i][j] =
+ lgrp_snap->ss_latencies[i][j];
+ }
+
+ /*
+ * Fix up pointer to latency buffer
+ */
+ lgrp_lats32[i] = lgrp_snap32->ss_latencies +
+ snap_nlgrpsmax * sizeof (caddr32_t) + i * snap_nlgrpsmax *
+ sizeof (int);
+
+ /*
+ * Fix up pointers for parents, children, and resources
+ */
+ lgrp_info32[i].info_parents = lgrp_snap32->ss_parents +
+ (i * bitmask_size);
+ lgrp_info32[i].info_children = lgrp_snap32->ss_children +
+ (i * bitmask_size);
+ lgrp_info32[i].info_rset = lgrp_snap32->ss_rsets +
+ (i * LGRP_RSRC_COUNT * bitmask_size);
+
+ /*
+ * Fill in memory and CPU info
+ * Only fill in memory for lgroups directly containing memory
+ */
+ snap_rset = &lgrp_info[i].info_rset[LGRP_RSRC_MEM *
+ BT_BITOUL(snap_nlgrpsmax)];
+ if (BT_TEST(snap_rset, i)) {
+ lgrp_info32[i].info_mem_free = lgrp_mem_size(i,
+ LGRP_MEM_SIZE_FREE);
+ lgrp_info32[i].info_mem_install =
+ lgrp_info[i].info_mem_install;
+ }
+
+ lgrp_info32[i].info_ncpus = lgrp_info[i].info_ncpus;
+
+ lgrp_info32[i].info_lgrpid = lgrp_info[i].info_lgrpid;
+ lgrp_info32[i].info_latency = lgrp_info[i].info_latency;
+
+ if (lgrp_info32[i].info_ncpus == 0) {
+ lgrp_info32[i].info_cpuids = 0;
+ continue;
+ }
+
+ /*
+ * Fix up pointer for CPU IDs
+ */
+ lgrp_info32[i].info_cpuids = lgrp_snap32->ss_cpuids +
+ (cpu_index * sizeof (processorid_t));
+ cpu_index += lgrp_info32[i].info_ncpus;
+ }
+ ASSERT(cpu_index == snap_ncpus);
+
+ /*
+ * Copy lgroup CPU IDs into 32-bit snapshot
+ * before copying it out into user's buffer
+ */
+ bcopy(lgrp_snap->ss_cpuids, lgrp_cpuids32, cpuids_size);
+
+ /*
+ * Copy 32-bit lgroup snapshot into user's buffer all at once
+ */
+ if (copyout(lgrp_snap32, (void *)(uintptr_t)buf, snap_size) != 0) {
+ kmem_free(lgrp_snap32, snap_size);
+
kmem_free(lgrp_lats32_kernel, snap_nlgrpsmax * sizeof (int *)); + return (set_errno(EFAULT)); + } + + kmem_free(lgrp_snap32, snap_size); + kmem_free(lgrp_lats32_kernel, snap_nlgrpsmax * sizeof (int *)); + + return (snap_size); +} +#endif /* _SYSCALL32_IMPL */ + + +int +lgrpsys(int subcode, long ia, void *ap) +{ + size_t bufsize; + int latency; + + switch (subcode) { + + case LGRP_SYS_AFFINITY_GET: + return (lgrp_affinity_get((lgrp_affinity_args_t *)ap)); + + case LGRP_SYS_AFFINITY_SET: + return (lgrp_affinity_set((lgrp_affinity_args_t *)ap)); + + case LGRP_SYS_GENERATION: + return (lgrp_generation(ia)); + + case LGRP_SYS_HOME: + return (lgrp_home_get((idtype_t)ia, (id_t)(uintptr_t)ap)); + + case LGRP_SYS_LATENCY: + mutex_enter(&cpu_lock); + latency = lgrp_latency(ia, (lgrp_id_t)(uintptr_t)ap); + mutex_exit(&cpu_lock); + return (latency); + + case LGRP_SYS_MEMINFO: + return (meminfo(ia, (struct meminfo *)ap)); + + case LGRP_SYS_VERSION: + return (lgrp_version(ia)); + + case LGRP_SYS_SNAPSHOT: + mutex_enter(&lgrp_snap_lock); + bufsize = lgrp_snapshot(); + if (ap && ia > 0) { + if (get_udatamodel() == DATAMODEL_NATIVE) + bufsize = lgrp_snapshot_copy(ap, ia); +#ifdef _SYSCALL32_IMPL + else + bufsize = lgrp_snapshot_copy32( + (caddr32_t)(uintptr_t)ap, ia); +#endif /* _SYSCALL32_IMPL */ + } + mutex_exit(&lgrp_snap_lock); + return (bufsize); + + default: + break; + + } + + return (set_errno(EINVAL)); +} diff --git a/usr/src/uts/common/syscall/link.c b/usr/src/uts/common/syscall/link.c new file mode 100644 index 0000000000..a63b04f133 --- /dev/null +++ b/usr/src/uts/common/syscall/link.c @@ -0,0 +1,58 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 1989 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/isa_defs.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/vnode.h> +#include <sys/uio.h> +#include <sys/debug.h> + +/* + * Make a hard link. 
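+ * All of the work is done by vn_link(), which resolves both pathnames
+ * and calls the filesystem's VOP_LINK(); errors (e.g. EXDEV for an
+ * attempted cross-filesystem link) are detected there, not here.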
+ */
+int
+link(char *from, char *to)
+{
+ int error;
+
+ if (error = vn_link(from, to, UIO_USERSPACE))
+ return (set_errno(error));
+ return (0);
+}
diff --git a/usr/src/uts/common/syscall/lseek.c b/usr/src/uts/common/syscall/lseek.c
new file mode 100644
index 0000000000..d03687eb68
--- /dev/null
+++ b/usr/src/uts/common/syscall/lseek.c
@@ -0,0 +1,380 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/isa_defs.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/cred.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/debug.h>
+#include <sys/cmn_err.h>
+#include <sys/filio.h>
+
+/*
+ * These are defined in unistd.h - but we can't include that
+ */
+#define SEEK_SET 0 /* Set file pointer to "offset" */
+#define SEEK_CUR 1 /* Set file pointer to current plus "offset" */
+#define SEEK_END 2 /* Set file pointer to EOF plus "offset" */
+#define SEEK_DATA 3 /* Set file pointer to next data past offset */
+#define SEEK_HOLE 4 /* Set file pointer to next hole past offset */
+
+/*
+ * Seek on a file
+ */
+
+#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
+/*
+ * Workhorse for the 32-bit seek variants: lseek32 and llseek32
+ *
+ * 'max' represents the maximum possible representation of offset
+ * in the data type corresponding to lseek and llseek. It is
+ * MAXOFF32_T for off32_t and MAXOFFSET_T for off64_t.
+ * We return EOVERFLOW if we cannot represent the resulting offset
+ * in the data type.
+ * We provide support for seeking character devices beyond MAXOFF32_T
+ * with lseek. To maintain compatibility in such cases, lseek passes
+ * the arguments carefully to lseek_common when the file is not regular.
+ * (/dev/kmem is a good example of a > 2Gbyte seek!)
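+ *
+ * For example, on a regular file with curoff == 0x7ffffff0, an
+ * lseek(fd, 0x20, SEEK_CUR) must fail with EOVERFLOW: the resulting
+ * offset 0x80000010 does not fit in an off32_t and would otherwise
+ * be silently truncated to a negative value.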
+ */ +static int +lseek32_common(file_t *fp, int stype, offset_t off, offset_t max, + offset_t *retoff) +{ + vnode_t *vp; + struct vattr vattr; + int error; + u_offset_t noff; + offset_t curoff, newoff; + int reg; + + vp = fp->f_vnode; + reg = (vp->v_type == VREG); + + curoff = fp->f_offset; + + switch (stype) { + case SEEK_SET: + noff = (u_offset_t)off; + if (reg && noff > max) { + error = EINVAL; + goto out; + } + break; + + case SEEK_CUR: + if (reg && off > (max - curoff)) { + error = EOVERFLOW; + goto out; + } + noff = (u_offset_t)(off + curoff); + if (reg && noff > max) { + error = EINVAL; + goto out; + } + break; + + case SEEK_END: + vattr.va_mask = AT_SIZE; + if (error = VOP_GETATTR(vp, &vattr, 0, fp->f_cred)) { + goto out; + } + if (reg && (off > (max - (offset_t)vattr.va_size))) { + error = EOVERFLOW; + goto out; + } + noff = (u_offset_t)(off + (offset_t)vattr.va_size); + if (reg && noff > max) { + error = EINVAL; + goto out; + } + break; + + case SEEK_DATA: + /* + * Get and set the file pointer to the offset of the next + * data past "off" + */ + noff = (u_offset_t)off; + error = VOP_IOCTL(vp, _FIO_SEEK_DATA, (intptr_t)(&noff), + FKIOCTL, kcred, NULL); + if (error) { + if (error != ENOTTY) + return (error); + /* + * The ioctl is not supported, check the supplied + * "off" is not past the end of file + */ + vattr.va_mask = AT_SIZE; + error = VOP_GETATTR(vp, &vattr, 0, fp->f_cred); + if (error) + return (error); + if (noff >= (u_offset_t)vattr.va_size) + return (ENXIO); + } + if (reg && (noff > max)) + return (EOVERFLOW); + + fp->f_offset = (offset_t)noff; + (*retoff) = (offset_t)noff; + return (0); + + case SEEK_HOLE: + /* + * Get and set the file pointer to the offset of the next + * hole past "off" + */ + noff = (u_offset_t)off; + error = VOP_IOCTL(vp, _FIO_SEEK_HOLE, (intptr_t)(&noff), + FKIOCTL, kcred, NULL); + if (error) { + if (error != ENOTTY) + return (error); + /* + * ioctl is not supported, if the off is valid return + * the "virtual hole" at the end of the file. + */ + vattr.va_mask = AT_SIZE; + error = VOP_GETATTR(vp, &vattr, 0, fp->f_cred); + if (error) + return (error); + if (off < (offset_t)vattr.va_size) + noff = (u_offset_t)vattr.va_size; + else + return (ENXIO); + } + if (reg && (noff > max)) + return (EOVERFLOW); + + fp->f_offset = (offset_t)noff; + (*retoff) = (offset_t)noff; + return (0); + + default: + error = EINVAL; + goto out; + } + + ASSERT((reg && noff <= max) || !reg); + newoff = (offset_t)noff; + if ((error = VOP_SEEK(vp, curoff, &newoff)) == 0) { + fp->f_offset = newoff; + (*retoff) = newoff; + return (0); + } +out: + return (error); +} + +off32_t +lseek32(int32_t fdes, off32_t off, int32_t stype) +{ + file_t *fp; + int error; + offset_t retoff; + + if ((fp = getf(fdes)) == NULL) + return ((off32_t)set_errno(EBADF)); + + /* + * lseek32 returns EOVERFLOW if we cannot represent the resulting + * offset from seek in a 32-bit off_t. + * The following routines are sensitive to sign extensions and + * calculations and if ever you change this make sure it works for + * special files. + * + * When VREG is not set we do the check for stype != SEEK_SET + * to send the unsigned value to lseek_common and not the sign + * extended value. (The maximum representable value is not + * checked by lseek_common for special files.) 
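+ *
+ * Concretely, lseek(fd, (off32_t)0x90000000, SEEK_SET) on a character
+ * device must arrive here as the unsigned offset 0x90000000 rather
+ * than the sign-extended value 0xffffffff90000000, which is why the
+ * SEEK_SET case below widens the offset through (uint_t).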
+ */ + if (fp->f_vnode->v_type == VREG || stype != SEEK_SET) + error = lseek32_common(fp, stype, (offset_t)off, + (offset_t)MAXOFF32_T, &retoff); + else if (stype == SEEK_SET) + error = lseek32_common(fp, stype, (offset_t)(uint_t)off, + (offset_t)(uint_t)UINT_MAX, &retoff); + + releasef(fdes); + if (!error) + return ((off32_t)retoff); + return ((off32_t)set_errno(error)); +} + +/* + * 64-bit seeks from 32-bit applications + */ +offset_t +llseek32(int32_t fdes, uint32_t off1, uint32_t off2, int stype) +{ + file_t *fp; + int error; + offset_t retoff; +#if defined(_LITTLE_ENDIAN) + offset_t off = ((u_offset_t)off2 << 32) | (u_offset_t)off1; +#else + offset_t off = ((u_offset_t)off1 << 32) | (u_offset_t)off2; +#endif + + if ((fp = getf(fdes)) == NULL) + error = EBADF; + else { + error = lseek32_common(fp, stype, off, MAXOFFSET_T, &retoff); + releasef(fdes); + } + + return (error ? (offset_t)set_errno(error) : retoff); +} +#endif /* _SYSCALL32_IMPL || _ILP32 */ + +#ifdef _LP64 +/* + * Seek on a file. + * + * Life is almost simple again (at least until we do 128-bit files ;-) + * This is both 'lseek' and 'llseek' to a 64-bit application. + */ +off_t +lseek64(int fdes, off_t off, int stype) +{ + file_t *fp; + vnode_t *vp; + struct vattr vattr; + int error; + off_t old_off; + offset_t new_off; + + if ((fp = getf(fdes)) == NULL) + return ((off_t)set_errno(EBADF)); + + vp = fp->f_vnode; + new_off = off; + + switch (stype) { + case SEEK_CUR: + new_off += fp->f_offset; + break; + + case SEEK_END: + vattr.va_mask = AT_SIZE; + if ((error = VOP_GETATTR(vp, &vattr, 0, fp->f_cred)) != 0) + goto lseek64error; + new_off += vattr.va_size; + break; + + case SEEK_SET: + break; + + case SEEK_DATA: + /* + * Get and set the file pointer to the offset of the next + * data past "off" + */ + new_off = (offset_t)off; + error = VOP_IOCTL(vp, _FIO_SEEK_DATA, (intptr_t)(&new_off), + FKIOCTL, kcred, NULL); + if (error) { + if (error != ENOTTY) { + goto lseek64error; + } + /* + * The ioctl is not supported, check the supplied off + * is not past end of file + */ + vattr.va_mask = AT_SIZE; + error = VOP_GETATTR(vp, &vattr, 0, fp->f_cred); + if (error) + goto lseek64error; + if (new_off >= (offset_t)vattr.va_size) { + error = ENXIO; + goto lseek64error; + } + } + fp->f_offset = new_off; + releasef(fdes); + return (new_off); + + case SEEK_HOLE: + /* + * Get and set the file pointer to the offset of the next + * hole past "off" + */ + new_off = off; + error = VOP_IOCTL(vp, _FIO_SEEK_HOLE, (intptr_t)(&new_off), + FKIOCTL, kcred, NULL); + if (error) { + if (error != ENOTTY) + goto lseek64error; + /* + * ioctl is not supported, if the off is valid return + * the "virtual hole" at the end of the file. 
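+ * (A sparse-file reader would typically alternate SEEK_DATA and
+ * SEEK_HOLE; hypothetical user-level sketch, where process_range()
+ * is a caller-supplied consumer:
+ *
+ *	off_t data = lseek(fd, (off_t)0, SEEK_DATA);
+ *	while (data != -1) {
+ *		off_t hole = lseek(fd, data, SEEK_HOLE);
+ *		process_range(fd, data, hole - data);
+ *		data = lseek(fd, hole, SEEK_DATA);
+ *	}
+ *
+ * The loop ends with ENXIO once no data remains past the last hole.)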
+ */ + vattr.va_mask = AT_SIZE; + error = VOP_GETATTR(vp, &vattr, 0, fp->f_cred); + if (error) + goto lseek64error; + if (off < (offset_t)vattr.va_size) { + new_off = (offset_t)vattr.va_size; + } else { + error = ENXIO; + goto lseek64error; + } + } + fp->f_offset = new_off; + releasef(fdes); + return (new_off); + + default: + error = EINVAL; + goto lseek64error; + } + + old_off = fp->f_offset; + if ((error = VOP_SEEK(vp, old_off, &new_off)) == 0) { + fp->f_offset = new_off; + releasef(fdes); + return (new_off); + } + +lseek64error: + releasef(fdes); + return ((off_t)set_errno(error)); +} +#endif /* _LP64 */ diff --git a/usr/src/uts/common/syscall/lwp_create.c b/usr/src/uts/common/syscall/lwp_create.c new file mode 100644 index 0000000000..e0bf63c886 --- /dev/null +++ b/usr/src/uts/common/syscall/lwp_create.c @@ -0,0 +1,212 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/syscall.h> +#include <sys/proc.h> +#include <sys/processor.h> +#include <sys/fault.h> +#include <sys/ucontext.h> +#include <sys/signal.h> +#include <sys/unistd.h> +#include <sys/procfs.h> +#include <sys/prsystm.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/klwp.h> +#include <sys/pool.h> + +/* + * System call to create an lwp. + * + * Notes on the LWP_DETACHED and LWP_DAEMON flags: + * + * A detached lwp (LWP_DETACHED) cannot be the specific target of + * lwp_wait() (it is not joinable), but lwp_wait(0, ...) is required + * to sleep until all non-daemon detached lwps have terminated before + * returning EDEADLK because a detached lwp might create a non-detached lwp + * that could then be returned by lwp_wait(0, ...). See also lwp_detach(). + * + * A daemon lwp (LWP_DAEMON) is a detached lwp that has the additional + * property that it does not affect the termination condition of the + * process: The last non-daemon lwp to call lwp_exit() causes the process + * to exit and lwp_wait(0, ...) does not sleep waiting for daemon lwps + * to terminate. See the block comment before lwp_wait(). 
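+ *
+ * In short, restating the above:
+ *
+ *	(neither flag)	joinable; must be reaped with lwp_wait()
+ *	LWP_DETACHED	not joinable, but lwp_wait(0, ...) still waits
+ *			for it before returning EDEADLK
+ *	LWP_DAEMON	detached, and additionally ignored both by
+ *			lwp_wait(0, ...) and by the process termination
+ *			condition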
+ */ +int +syslwp_create(ucontext_t *ucp, int flags, id_t *new_lwp) +{ + klwp_t *lwp; + proc_t *p = ttoproc(curthread); + kthread_t *t; + ucontext_t uc; +#ifdef _SYSCALL32_IMPL + ucontext32_t uc32; +#endif /* _SYSCALL32_IMPL */ + k_sigset_t sigmask; + int tid; + model_t model = get_udatamodel(); + uintptr_t thrptr = 0; + + if (flags & ~(LWP_DAEMON|LWP_DETACHED|LWP_SUSPENDED)) + return (set_errno(EINVAL)); + + /* + * lwp_create() is disallowed for the /proc agent lwp. + */ + if (curthread == p->p_agenttp) + return (set_errno(ENOTSUP)); + + if (model == DATAMODEL_NATIVE) { + if (copyin(ucp, &uc, sizeof (ucontext_t))) + return (set_errno(EFAULT)); + sigutok(&uc.uc_sigmask, &sigmask); +#if defined(__i386) + /* + * libc stashed thrptr into unused kernel %sp. + * See setup_context() in libc. + */ + thrptr = (uint32_t)uc.uc_mcontext.gregs[ESP]; +#endif + } +#ifdef _SYSCALL32_IMPL + else { + if (copyin(ucp, &uc32, sizeof (ucontext32_t))) + return (set_errno(EFAULT)); + sigutok(&uc32.uc_sigmask, &sigmask); +#if defined(__sparc) + ucontext_32ton(&uc32, &uc, NULL, NULL); +#else /* __amd64 */ + ucontext_32ton(&uc32, &uc); + /* + * libc stashed thrptr into unused kernel %sp. + * See setup_context() in libc. + */ + thrptr = (uint32_t)uc32.uc_mcontext.gregs[ESP]; +#endif + } +#endif /* _SYSCALL32_IMPL */ + + (void) save_syscall_args(); /* save args for tracing first */ + + mutex_enter(&curproc->p_lock); + pool_barrier_enter(); + mutex_exit(&curproc->p_lock); + lwp = lwp_create(lwp_rtt, NULL, NULL, curproc, TS_STOPPED, + curthread->t_pri, &sigmask, curthread->t_cid, 0); + mutex_enter(&curproc->p_lock); + pool_barrier_exit(); + mutex_exit(&curproc->p_lock); + if (lwp == NULL) + return (set_errno(EAGAIN)); + + lwp_load(lwp, uc.uc_mcontext.gregs, thrptr); + + t = lwptot(lwp); + /* + * Copy the new lwp's lwpid into the caller's specified buffer. + */ + if (new_lwp && copyout(&t->t_tid, new_lwp, sizeof (id_t))) { + /* + * caller's buffer is not writable, return + * EFAULT, and terminate new lwp. + */ + mutex_enter(&p->p_lock); + t->t_proc_flag |= TP_EXITLWP; + t->t_sig_check = 1; + t->t_sysnum = 0; + t->t_proc_flag &= ~TP_HOLDLWP; + lwp_create_done(t); + mutex_exit(&p->p_lock); + return (set_errno(EFAULT)); + } + + /* + * clone callers context, if any. must be invoked + * while -not- holding p_lock. + */ + if (curthread->t_ctx) + lwp_createctx(curthread, t); + + /* + * copy current contract templates + */ + lwp_ctmpl_copy(lwp, ttolwp(curthread)); + + mutex_enter(&p->p_lock); + /* + * Copy the syscall arguments to the new lwp's arg area + * for the benefit of debuggers. + */ + t->t_sysnum = SYS_lwp_create; + lwp->lwp_ap = lwp->lwp_arg; + lwp->lwp_arg[0] = (long)ucp; + lwp->lwp_arg[1] = (long)flags; + lwp->lwp_arg[2] = (long)new_lwp; + lwp->lwp_argsaved = 1; + + if (!(flags & (LWP_DETACHED|LWP_DAEMON))) + t->t_proc_flag |= TP_TWAIT; + if (flags & LWP_DAEMON) { + t->t_proc_flag |= TP_DAEMON; + p->p_lwpdaemon++; + } + + tid = (int)t->t_tid; /* for /proc debuggers */ + + /* + * We now set the newly-created lwp running. + * If it is being created as LWP_SUSPENDED, we leave its + * TP_HOLDLWP flag set so it will stop in system call exit. 
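+ * The suspended lwp then remains stopped until a later lwp_continue()
+ * clears TP_HOLDLWP and sets it running.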
+ */ + if (!(flags & LWP_SUSPENDED)) + t->t_proc_flag &= ~TP_HOLDLWP; + lwp_create_done(t); + mutex_exit(&p->p_lock); + + return (tid); +} + +/* + * Exit the calling lwp + */ +void +syslwp_exit() +{ + proc_t *p = ttoproc(curthread); + + mutex_enter(&p->p_lock); + lwp_exit(); + /* NOTREACHED */ +} diff --git a/usr/src/uts/common/syscall/lwp_info.c b/usr/src/uts/common/syscall/lwp_info.c new file mode 100644 index 0000000000..21ac0ca4c3 --- /dev/null +++ b/usr/src/uts/common/syscall/lwp_info.c @@ -0,0 +1,80 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/time.h> +#include <sys/debug.h> +#include <sys/model.h> +#include <sys/msacct.h> + +/* + * Get the time accounting information for the calling LWP. + */ +int +lwp_info(timestruc_t *tvp) +{ + timestruc_t tv[2]; + hrtime_t hrutime, hrstime; + klwp_t *lwp = ttolwp(curthread); + + hrutime = lwp->lwp_mstate.ms_acct[LMS_USER]; + hrstime = lwp->lwp_mstate.ms_acct[LMS_SYSTEM] + + lwp->lwp_mstate.ms_acct[LMS_TRAP]; + scalehrtime(&hrutime); + scalehrtime(&hrstime); + + hrt2ts(hrutime, &tv[0]); + hrt2ts(hrstime, &tv[1]); + + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyout(tv, tvp, sizeof (tv))) + return (set_errno(EFAULT)); + } else { + timestruc32_t tv32[2]; + + if (TIMESPEC_OVERFLOW(&tv[0]) || + TIMESPEC_OVERFLOW(&tv[1])) + return (set_errno(EOVERFLOW)); /* unlikely */ + + TIMESPEC_TO_TIMESPEC32(&tv32[0], &tv[0]); + TIMESPEC_TO_TIMESPEC32(&tv32[1], &tv[1]); + + if (copyout(tv32, tvp, sizeof (tv32))) + return (set_errno(EFAULT)); + } + return (0); +} diff --git a/usr/src/uts/common/syscall/lwp_self.c b/usr/src/uts/common/syscall/lwp_self.c new file mode 100644 index 0000000000..bbd5b40632 --- /dev/null +++ b/usr/src/uts/common/syscall/lwp_self.c @@ -0,0 +1,39 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* Copyright (c) 1994 Sun Microsystems, Inc. */ +/* All Rights Reserved */ + + +#ident "%Z%%M% %I% %E% SMI" /* from SVr4.0 1.78 */ + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/thread.h> + +int +lwp_self() +{ + return (curthread->t_tid); +} diff --git a/usr/src/uts/common/syscall/lwp_sobj.c b/usr/src/uts/common/syscall/lwp_sobj.c new file mode 100644 index 0000000000..5b255912a0 --- /dev/null +++ b/usr/src/uts/common/syscall/lwp_sobj.c @@ -0,0 +1,3119 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/cred.h> +#include <sys/user.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/proc.h> +#include <sys/prsystm.h> +#include <sys/kmem.h> +#include <sys/sobject.h> +#include <sys/fault.h> +#include <sys/procfs.h> +#include <sys/watchpoint.h> +#include <sys/time.h> +#include <sys/cmn_err.h> +#include <sys/machlock.h> +#include <sys/debug.h> +#include <sys/synch.h> +#include <sys/synch32.h> +#include <sys/mman.h> +#include <sys/class.h> +#include <sys/schedctl.h> +#include <sys/sleepq.h> +#include <sys/policy.h> +#include <sys/tnf_probe.h> +#include <sys/lwpchan_impl.h> +#include <sys/turnstile.h> +#include <sys/atomic.h> +#include <sys/lwp_timer_impl.h> +#include <sys/lwp_upimutex_impl.h> +#include <vm/as.h> +#include <sys/sdt.h> + +static kthread_t *lwpsobj_owner(caddr_t); +static void lwp_unsleep(kthread_t *t); +static void lwp_change_pri(kthread_t *t, pri_t pri, pri_t *t_prip); +static void lwp_mutex_cleanup(lwpchan_entry_t *ent, uint16_t lockflg); + +extern int lwp_cond_signal(lwp_cond_t *cv); + +/* + * Maximum number of user prio inheritance locks that can be held by a thread. + * Used to limit kmem for each thread. This is a per-thread limit that + * can be administered on a system wide basis (using /etc/system). 
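+ * For example, an /etc/system line such as
+ *
+ *	set maxnestupimx=4000
+ *
+ * would raise the limit (illustrative value only).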
+ * + * Also, when a limit, say maxlwps is added for numbers of lwps within a + * process, the per-thread limit automatically becomes a process-wide limit + * of maximum number of held upi locks within a process: + * maxheldupimx = maxnestupimx * maxlwps; + */ +static uint32_t maxnestupimx = 2000; + +/* + * The sobj_ops vector exports a set of functions needed when a thread + * is asleep on a synchronization object of this type. + */ +static sobj_ops_t lwp_sobj_ops = { + SOBJ_USER, lwpsobj_owner, lwp_unsleep, lwp_change_pri +}; + +static kthread_t *lwpsobj_pi_owner(upimutex_t *up); + +static sobj_ops_t lwp_sobj_pi_ops = { + SOBJ_USER_PI, lwpsobj_pi_owner, turnstile_unsleep, + turnstile_change_pri +}; + +static sleepq_head_t lwpsleepq[NSLEEPQ]; +upib_t upimutextab[UPIMUTEX_TABSIZE]; + +#define LWPCHAN_LOCK_SHIFT 10 /* 1024 locks for each pool */ +#define LWPCHAN_LOCK_SIZE (1 << LWPCHAN_LOCK_SHIFT) + +/* + * We know that both lc_wchan and lc_wchan0 are addresses that most + * likely are 8-byte aligned, so we shift off the low-order 3 bits. + * 'pool' is either 0 or 1. + */ +#define LWPCHAN_LOCK_HASH(X, pool) \ + (((((X) >> 3) ^ ((X) >> (LWPCHAN_LOCK_SHIFT + 3))) & \ + (LWPCHAN_LOCK_SIZE - 1)) + ((pool)? LWPCHAN_LOCK_SIZE : 0)) + +static kmutex_t lwpchanlock[2 * LWPCHAN_LOCK_SIZE]; + +/* + * Is this a POSIX threads user-level lock requiring priority inheritance? + */ +#define UPIMUTEX(type) ((type) & LOCK_PRIO_INHERIT) + +static sleepq_head_t * +lwpsqhash(lwpchan_t *lwpchan) +{ + uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0; + return (&lwpsleepq[SQHASHINDEX(x)]); +} + +/* + * Lock an lwpchan. + * Keep this in sync with lwpchan_unlock(), below. + */ +static void +lwpchan_lock(lwpchan_t *lwpchan, int pool) +{ + uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0; + mutex_enter(&lwpchanlock[LWPCHAN_LOCK_HASH(x, pool)]); +} + +/* + * Unlock an lwpchan. + * Keep this in sync with lwpchan_lock(), above. + */ +static void +lwpchan_unlock(lwpchan_t *lwpchan, int pool) +{ + uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0; + mutex_exit(&lwpchanlock[LWPCHAN_LOCK_HASH(x, pool)]); +} + +/* + * Delete mappings from the lwpchan cache for pages that are being + * unmapped by as_unmap(). Given a range of addresses, "start" to "end", + * all mappings within the range are deleted from the lwpchan cache. 
+ */ +void +lwpchan_delete_mapping(proc_t *p, caddr_t start, caddr_t end) +{ + lwpchan_data_t *lcp; + lwpchan_hashbucket_t *hashbucket; + lwpchan_hashbucket_t *endbucket; + lwpchan_entry_t *ent; + lwpchan_entry_t **prev; + caddr_t addr; + + mutex_enter(&p->p_lcp_lock); + lcp = p->p_lcp; + hashbucket = lcp->lwpchan_cache; + endbucket = hashbucket + lcp->lwpchan_size; + for (; hashbucket < endbucket; hashbucket++) { + if (hashbucket->lwpchan_chain == NULL) + continue; + mutex_enter(&hashbucket->lwpchan_lock); + prev = &hashbucket->lwpchan_chain; + /* check entire chain */ + while ((ent = *prev) != NULL) { + addr = ent->lwpchan_addr; + if (start <= addr && addr < end) { + *prev = ent->lwpchan_next; + if (ent->lwpchan_pool == LWPCHAN_MPPOOL && + (ent->lwpchan_type & USYNC_PROCESS_ROBUST)) + lwp_mutex_cleanup(ent, LOCK_UNMAPPED); + kmem_free(ent, sizeof (*ent)); + atomic_add_32(&lcp->lwpchan_entries, -1); + } else { + prev = &ent->lwpchan_next; + } + } + mutex_exit(&hashbucket->lwpchan_lock); + } + mutex_exit(&p->p_lcp_lock); +} + +/* + * Given an lwpchan cache pointer and a process virtual address, + * return a pointer to the corresponding lwpchan hash bucket. + */ +static lwpchan_hashbucket_t * +lwpchan_bucket(lwpchan_data_t *lcp, uintptr_t addr) +{ + uint_t i; + + /* + * All user-level sync object addresses are 8-byte aligned. + * Ignore the lowest 3 bits of the address and use the + * higher-order 2*lwpchan_bits bits for the hash index. + */ + addr >>= 3; + i = (addr ^ (addr >> lcp->lwpchan_bits)) & lcp->lwpchan_mask; + return (lcp->lwpchan_cache + i); +} + +/* + * (Re)allocate the per-process lwpchan cache. + */ +static void +lwpchan_alloc_cache(proc_t *p, uint_t bits) +{ + lwpchan_data_t *lcp; + lwpchan_data_t *old_lcp; + lwpchan_hashbucket_t *hashbucket; + lwpchan_hashbucket_t *endbucket; + lwpchan_hashbucket_t *newbucket; + lwpchan_entry_t *ent; + lwpchan_entry_t *next; + uint_t count; + + ASSERT(bits >= LWPCHAN_INITIAL_BITS && bits <= LWPCHAN_MAX_BITS); + + lcp = kmem_alloc(sizeof (lwpchan_data_t), KM_SLEEP); + lcp->lwpchan_bits = bits; + lcp->lwpchan_size = 1 << lcp->lwpchan_bits; + lcp->lwpchan_mask = lcp->lwpchan_size - 1; + lcp->lwpchan_entries = 0; + lcp->lwpchan_cache = kmem_zalloc(lcp->lwpchan_size * + sizeof (lwpchan_hashbucket_t), KM_SLEEP); + lcp->lwpchan_next_data = NULL; + + mutex_enter(&p->p_lcp_lock); + if ((old_lcp = p->p_lcp) != NULL) { + if (old_lcp->lwpchan_bits >= bits) { + /* someone beat us to it */ + mutex_exit(&p->p_lcp_lock); + kmem_free(lcp->lwpchan_cache, lcp->lwpchan_size * + sizeof (lwpchan_hashbucket_t)); + kmem_free(lcp, sizeof (lwpchan_data_t)); + return; + } + /* + * Acquire all of the old hash table locks. + */ + hashbucket = old_lcp->lwpchan_cache; + endbucket = hashbucket + old_lcp->lwpchan_size; + for (; hashbucket < endbucket; hashbucket++) + mutex_enter(&hashbucket->lwpchan_lock); + /* + * Move all of the old hash table entries to the + * new hash table. The new hash table has not yet + * been installed so we don't need any of its locks. + */ + count = 0; + hashbucket = old_lcp->lwpchan_cache; + for (; hashbucket < endbucket; hashbucket++) { + ent = hashbucket->lwpchan_chain; + while (ent != NULL) { + next = ent->lwpchan_next; + newbucket = lwpchan_bucket(lcp, + (uintptr_t)ent->lwpchan_addr); + ent->lwpchan_next = newbucket->lwpchan_chain; + newbucket->lwpchan_chain = ent; + ent = next; + count++; + } + hashbucket->lwpchan_chain = NULL; + } + lcp->lwpchan_entries = count; + } + + /* + * Retire the old hash table. 
We can't actually kmem_free() it + * now because someone may still have a pointer to it. Instead, + * we link it onto the new hash table's list of retired hash tables. + * The new hash table is double the size of the previous one, so + * the total size of all retired hash tables is less than the size + * of the new one. exit() and exec() free the retired hash tables + * (see lwpchan_destroy_cache(), below). + */ + lcp->lwpchan_next_data = old_lcp; + + /* + * As soon as we store the new lcp, future locking operations will + * use it. Therefore, we must ensure that all the state we've just + * established reaches global visibility before the new lcp does. + */ + membar_producer(); + p->p_lcp = lcp; + + if (old_lcp != NULL) { + /* + * Release all of the old hash table locks. + */ + hashbucket = old_lcp->lwpchan_cache; + for (; hashbucket < endbucket; hashbucket++) + mutex_exit(&hashbucket->lwpchan_lock); + } + mutex_exit(&p->p_lcp_lock); +} + +/* + * Deallocate the lwpchan cache, and any dynamically allocated mappings. + * Called when the process exits or execs. All lwps except one have + * exited so we need no locks here. + */ +void +lwpchan_destroy_cache(int exec) +{ + proc_t *p = curproc; + lwpchan_hashbucket_t *hashbucket; + lwpchan_hashbucket_t *endbucket; + lwpchan_data_t *lcp; + lwpchan_entry_t *ent; + lwpchan_entry_t *next; + uint16_t lockflg; + + lcp = p->p_lcp; + p->p_lcp = NULL; + + lockflg = exec? LOCK_UNMAPPED : LOCK_OWNERDEAD; + hashbucket = lcp->lwpchan_cache; + endbucket = hashbucket + lcp->lwpchan_size; + for (; hashbucket < endbucket; hashbucket++) { + ent = hashbucket->lwpchan_chain; + hashbucket->lwpchan_chain = NULL; + while (ent != NULL) { + next = ent->lwpchan_next; + if (ent->lwpchan_pool == LWPCHAN_MPPOOL && + (ent->lwpchan_type & USYNC_PROCESS_ROBUST)) + lwp_mutex_cleanup(ent, lockflg); + kmem_free(ent, sizeof (*ent)); + ent = next; + } + } + + while (lcp != NULL) { + lwpchan_data_t *next_lcp = lcp->lwpchan_next_data; + kmem_free(lcp->lwpchan_cache, lcp->lwpchan_size * + sizeof (lwpchan_hashbucket_t)); + kmem_free(lcp, sizeof (lwpchan_data_t)); + lcp = next_lcp; + } +} + +/* + * Return zero when there is an entry in the lwpchan cache for the + * given process virtual address and non-zero when there is not. + * The returned non-zero value is the current length of the + * hash chain plus one. The caller holds the hash bucket lock. + */ +static uint_t +lwpchan_cache_mapping(caddr_t addr, int type, int pool, lwpchan_t *lwpchan, + lwpchan_hashbucket_t *hashbucket) +{ + lwpchan_entry_t *ent; + uint_t count = 1; + + for (ent = hashbucket->lwpchan_chain; ent; ent = ent->lwpchan_next) { + if (ent->lwpchan_addr == addr) { + if (ent->lwpchan_type != type || + ent->lwpchan_pool != pool) { + /* + * This shouldn't happen, but might if the + * process reuses its memory for different + * types of sync objects. We test first + * to avoid grabbing the memory cache line. + */ + ent->lwpchan_type = (uint16_t)type; + ent->lwpchan_pool = (uint16_t)pool; + } + *lwpchan = ent->lwpchan_lwpchan; + return (0); + } + count++; + } + return (count); +} + +/* + * Return the cached lwpchan mapping if cached, otherwise insert + * a virtual address to lwpchan mapping into the cache. 
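+ * The hash table is reallocated at double its size whenever a lookup
+ * walks a hash chain that has grown too long relative to
+ * lwpchan_bits (see the count check below), so chains stay short as
+ * the number of cached sync objects grows.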
+ */
+static int
+lwpchan_get_mapping(struct as *as, caddr_t addr,
+ int type, lwpchan_t *lwpchan, int pool)
+{
+ proc_t *p = curproc;
+ lwpchan_data_t *lcp;
+ lwpchan_hashbucket_t *hashbucket;
+ lwpchan_entry_t *ent;
+ memid_t memid;
+ uint_t count;
+ uint_t bits;
+
+top:
+ /* initialize the lwpchan cache, if necessary */
+ if ((lcp = p->p_lcp) == NULL) {
+ lwpchan_alloc_cache(p, LWPCHAN_INITIAL_BITS);
+ goto top;
+ }
+ hashbucket = lwpchan_bucket(lcp, (uintptr_t)addr);
+ mutex_enter(&hashbucket->lwpchan_lock);
+ if (lcp != p->p_lcp) {
+ /* someone resized the lwpchan cache; start over */
+ mutex_exit(&hashbucket->lwpchan_lock);
+ goto top;
+ }
+ if (lwpchan_cache_mapping(addr, type, pool, lwpchan, hashbucket) == 0) {
+ /* it's in the cache */
+ mutex_exit(&hashbucket->lwpchan_lock);
+ return (1);
+ }
+ mutex_exit(&hashbucket->lwpchan_lock);
+ if (as_getmemid(as, addr, &memid) != 0)
+ return (0);
+ lwpchan->lc_wchan0 = (caddr_t)(uintptr_t)memid.val[0];
+ lwpchan->lc_wchan = (caddr_t)(uintptr_t)memid.val[1];
+ ent = kmem_alloc(sizeof (lwpchan_entry_t), KM_SLEEP);
+ mutex_enter(&hashbucket->lwpchan_lock);
+ if (lcp != p->p_lcp) {
+ /* someone resized the lwpchan cache; start over */
+ mutex_exit(&hashbucket->lwpchan_lock);
+ kmem_free(ent, sizeof (*ent));
+ goto top;
+ }
+ count = lwpchan_cache_mapping(addr, type, pool, lwpchan, hashbucket);
+ if (count == 0) {
+ /* someone else added this entry to the cache */
+ mutex_exit(&hashbucket->lwpchan_lock);
+ kmem_free(ent, sizeof (*ent));
+ return (1);
+ }
+ if (count > lcp->lwpchan_bits + 2 && /* larger table, longer chains */
+ (bits = lcp->lwpchan_bits) < LWPCHAN_MAX_BITS) {
+ /* hash chain too long; reallocate the hash table */
+ mutex_exit(&hashbucket->lwpchan_lock);
+ kmem_free(ent, sizeof (*ent));
+ lwpchan_alloc_cache(p, bits + 1);
+ goto top;
+ }
+ ent->lwpchan_addr = addr;
+ ent->lwpchan_type = (uint16_t)type;
+ ent->lwpchan_pool = (uint16_t)pool;
+ ent->lwpchan_lwpchan = *lwpchan;
+ ent->lwpchan_next = hashbucket->lwpchan_chain;
+ hashbucket->lwpchan_chain = ent;
+ atomic_add_32(&lcp->lwpchan_entries, 1);
+ mutex_exit(&hashbucket->lwpchan_lock);
+ return (1);
+}
+
+/*
+ * Return a unique pair of identifiers that corresponds to a
+ * synchronization object's virtual address. Process-shared
+ * sync objects usually get vnode/offset from as_getmemid().
+ */
+static int
+get_lwpchan(struct as *as, caddr_t addr, int type, lwpchan_t *lwpchan, int pool)
+{
+ /*
+ * If the lwp synch object is defined to be process-private,
+ * we just make the first field of the lwpchan be 'as' and
+ * the second field be the synch object's virtual address.
+ * (segvn_getmemid() does the same for MAP_PRIVATE mappings.)
+ * The lwpchan cache is used only for process-shared objects.
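+ *
+ * The net effect: a USYNC_PROCESS object mapped at different virtual
+ * addresses in two processes still yields the same lwpchan (its
+ * vnode/offset pair), so both processes block on the same sleep
+ * channel, while a USYNC_THREAD object yields an (as, vaddr) pair
+ * private to the owning process.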
+ */ + if ((type & (USYNC_PROCESS | USYNC_PROCESS_ROBUST)) == 0) { + lwpchan->lc_wchan0 = (caddr_t)as; + lwpchan->lc_wchan = addr; + return (1); + } + /* check the lwpchan cache for mapping */ + return (lwpchan_get_mapping(as, addr, type, lwpchan, pool)); +} + +static void +lwp_block(lwpchan_t *lwpchan) +{ + kthread_t *t = curthread; + klwp_t *lwp = ttolwp(t); + sleepq_head_t *sqh; + + thread_lock(t); + t->t_flag |= T_WAKEABLE; + t->t_lwpchan = *lwpchan; + t->t_sobj_ops = &lwp_sobj_ops; + t->t_release = 0; + sqh = lwpsqhash(lwpchan); + disp_lock_enter_high(&sqh->sq_lock); + CL_SLEEP(t); + DTRACE_SCHED(sleep); + THREAD_SLEEP(t, &sqh->sq_lock); + sleepq_insert(&sqh->sq_queue, t); + thread_unlock(t); + lwp->lwp_asleep = 1; + lwp->lwp_sysabort = 0; + lwp->lwp_ru.nvcsw++; + (void) new_mstate(curthread, LMS_SLEEP); +} + +static kthread_t * +lwpsobj_pi_owner(upimutex_t *up) +{ + return (up->upi_owner); +} + +static struct upimutex * +upi_get(upib_t *upibp, lwpchan_t *lcp) +{ + struct upimutex *upip; + + for (upip = upibp->upib_first; upip != NULL; + upip = upip->upi_nextchain) { + if (upip->upi_lwpchan.lc_wchan0 == lcp->lc_wchan0 && + upip->upi_lwpchan.lc_wchan == lcp->lc_wchan) + break; + } + return (upip); +} + +static void +upi_chain_add(upib_t *upibp, struct upimutex *upimutex) +{ + ASSERT(MUTEX_HELD(&upibp->upib_lock)); + + /* + * Insert upimutex at front of list. Maybe a bit unfair + * but assume that not many lwpchans hash to the same + * upimutextab bucket, i.e. the list of upimutexes from + * upib_first is not too long. + */ + upimutex->upi_nextchain = upibp->upib_first; + upibp->upib_first = upimutex; +} + +static void +upi_chain_del(upib_t *upibp, struct upimutex *upimutex) +{ + struct upimutex **prev; + + ASSERT(MUTEX_HELD(&upibp->upib_lock)); + + prev = &upibp->upib_first; + while (*prev != upimutex) { + prev = &(*prev)->upi_nextchain; + } + *prev = upimutex->upi_nextchain; + upimutex->upi_nextchain = NULL; +} + +/* + * Add upimutex to chain of upimutexes held by curthread. + * Returns number of upimutexes held by curthread. + */ +static uint32_t +upi_mylist_add(struct upimutex *upimutex) +{ + kthread_t *t = curthread; + + /* + * Insert upimutex at front of list of upimutexes owned by t. This + * would match typical LIFO order in which nested locks are acquired + * and released. + */ + upimutex->upi_nextowned = t->t_upimutex; + t->t_upimutex = upimutex; + t->t_nupinest++; + ASSERT(t->t_nupinest > 0); + return (t->t_nupinest); +} + +/* + * Delete upimutex from list of upimutexes owned by curthread. + */ +static void +upi_mylist_del(struct upimutex *upimutex) +{ + kthread_t *t = curthread; + struct upimutex **prev; + + /* + * Since the order in which nested locks are acquired and released, + * is typically LIFO, and typical nesting levels are not too deep, the + * following should not be expensive in the general case. + */ + prev = &t->t_upimutex; + while (*prev != upimutex) { + prev = &(*prev)->upi_nextowned; + } + *prev = upimutex->upi_nextowned; + upimutex->upi_nextowned = NULL; + ASSERT(t->t_nupinest > 0); + t->t_nupinest--; +} + +/* + * Returns true if upimutex is owned. Should be called only when upim points + * to kmem which cannot disappear from underneath. + */ +static int +upi_owned(upimutex_t *upim) +{ + return (upim->upi_owner == curthread); +} + +/* + * Returns pointer to kernel object (upimutex_t *) if lp is owned. 
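+ * Returns NULL if the lock is unowned, owned by another thread, or
+ * if the lwpchan cannot be resolved.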
+ */
+static struct upimutex *
+lwp_upimutex_owned(lwp_mutex_t *lp, uint8_t type)
+{
+	lwpchan_t lwpchan;
+	upib_t *upibp;
+	struct upimutex *upimutex;
+
+	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
+	    &lwpchan, LWPCHAN_MPPOOL))
+		return (NULL);
+
+	upibp = &UPI_CHAIN(lwpchan);
+	mutex_enter(&upibp->upib_lock);
+	upimutex = upi_get(upibp, &lwpchan);
+	if (upimutex == NULL || upimutex->upi_owner != curthread) {
+		mutex_exit(&upibp->upib_lock);
+		return (NULL);
+	}
+	mutex_exit(&upibp->upib_lock);
+	return (upimutex);
+}
+
+/*
+ * Unlocks upimutex, waking up waiters if any. upimutex kmem is freed if
+ * no lock hand-off occurs.
+ */
+static void
+upimutex_unlock(struct upimutex *upimutex, uint16_t flag)
+{
+	turnstile_t *ts;
+	upib_t *upibp;
+	kthread_t *newowner;
+
+	upi_mylist_del(upimutex);
+	upibp = upimutex->upi_upibp;
+	mutex_enter(&upibp->upib_lock);
+	if (upimutex->upi_waiter != 0) { /* if waiters */
+		ts = turnstile_lookup(upimutex);
+		if (ts != NULL && !(flag & LOCK_NOTRECOVERABLE)) {
+			/* hand-off lock to highest prio waiter */
+			newowner = ts->ts_sleepq[TS_WRITER_Q].sq_first;
+			upimutex->upi_owner = newowner;
+			if (ts->ts_waiters == 1)
+				upimutex->upi_waiter = 0;
+			turnstile_wakeup(ts, TS_WRITER_Q, 1, newowner);
+			mutex_exit(&upibp->upib_lock);
+			return;
+		} else if (ts != NULL) {
+			/* LOCK_NOTRECOVERABLE: wakeup all */
+			turnstile_wakeup(ts, TS_WRITER_Q, ts->ts_waiters, NULL);
+		} else {
+			/*
+			 * Misleading w bit. Waiters might have been
+			 * interrupted. No need to clear the w bit (upimutex
+			 * will soon be freed). Re-calculate PI from existing
+			 * waiters.
+			 */
+			turnstile_exit(upimutex);
+			turnstile_pi_recalc();
+		}
+	}
+	/*
+	 * no waiters, or LOCK_NOTRECOVERABLE.
+	 * remove from the bucket chain of upi mutexes.
+	 * de-allocate kernel memory (upimutex).
+	 */
+	upi_chain_del(upimutex->upi_upibp, upimutex);
+	mutex_exit(&upibp->upib_lock);
+	kmem_free(upimutex, sizeof (upimutex_t));
+}
+
+static int
+lwp_upimutex_lock(lwp_mutex_t *lp, uint8_t type, int try, lwp_timer_t *lwptp)
+{
+	label_t ljb;
+	int error = 0;
+	lwpchan_t lwpchan;
+	uint16_t flag;
+	upib_t *upibp;
+	volatile struct upimutex *upimutex = NULL;
+	turnstile_t *ts;
+	uint32_t nupinest;
+	volatile int upilocked = 0;
+
+	if (on_fault(&ljb)) {
+		if (upilocked)
+			upimutex_unlock((upimutex_t *)upimutex, 0);
+		error = EFAULT;
+		goto out;
+	}
+	/*
+	 * The apparent assumption made in implementing other _lwp_* synch
+	 * primitives is that get_lwpchan() does not return a unique cookie
+	 * for the case where 2 processes (one forked from the other) point
+	 * at the same underlying object, which is typed USYNC_PROCESS, but
+	 * mapped MAP_PRIVATE, since the object has not yet been written to
+	 * in the child process.
+	 *
+	 * Since get_lwpchan() has been fixed, it is not necessary to do the
+	 * dummy writes to force a COW fault as in other places (which should
+	 * be fixed).
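+	 *
+	 * For reference, the dummy-write idiom used by those other
+	 * entry points is a read of the type field followed by a
+	 * store of the same value, e.g.:
+	 *
+	 *	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
+	 *	suword8_noerr(&lp->mutex_type, type);
+	 *
+	 * Rewriting the unchanged byte forces a private page copy for
+	 * a MAP_PRIVATE mapping before the lwpchan identity is taken.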
+ */ + if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type, + &lwpchan, LWPCHAN_MPPOOL)) { + error = EFAULT; + goto out; + } + upibp = &UPI_CHAIN(lwpchan); +retry: + mutex_enter(&upibp->upib_lock); + upimutex = upi_get(upibp, &lwpchan); + if (upimutex == NULL) { + /* lock available since lwpchan has no upimutex */ + upimutex = kmem_zalloc(sizeof (upimutex_t), KM_SLEEP); + upi_chain_add(upibp, (upimutex_t *)upimutex); + upimutex->upi_owner = curthread; /* grab lock */ + upimutex->upi_upibp = upibp; + upimutex->upi_vaddr = lp; + upimutex->upi_lwpchan = lwpchan; + mutex_exit(&upibp->upib_lock); + nupinest = upi_mylist_add((upimutex_t *)upimutex); + upilocked = 1; + fuword16_noerr(&lp->mutex_flag, &flag); + if (nupinest > maxnestupimx && + secpolicy_resource(CRED()) != 0) { + upimutex_unlock((upimutex_t *)upimutex, flag); + error = ENOMEM; + goto out; + } + if (flag & LOCK_OWNERDEAD) { + /* + * Return with upimutex held. + */ + error = EOWNERDEAD; + } else if (flag & LOCK_NOTRECOVERABLE) { + /* + * Since the setting of LOCK_NOTRECOVERABLE + * was done under the high-level upi mutex, + * in lwp_upimutex_unlock(), this flag needs to + * be checked while holding the upi mutex. + * If set, this thread should return without + * the lock held, and with the right error + * code. + */ + upimutex_unlock((upimutex_t *)upimutex, flag); + upilocked = 0; + error = ENOTRECOVERABLE; + } + goto out; + } + /* + * If a upimutex object exists, it must have an owner. + * This is due to lock hand-off, and release of upimutex when no + * waiters are present at unlock time, + */ + ASSERT(upimutex->upi_owner != NULL); + if (upimutex->upi_owner == curthread) { + /* + * The user wrapper can check if the mutex type is + * ERRORCHECK: if not, it should stall at user-level. + * If so, it should return the error code. + */ + mutex_exit(&upibp->upib_lock); + error = EDEADLK; + goto out; + } + if (try == UPIMUTEX_TRY) { + mutex_exit(&upibp->upib_lock); + error = EBUSY; + goto out; + } + /* + * Block for the lock. + * Put the lwp in an orderly state for debugging. + * Calling prstop() has to be done here, and not in + * turnstile_block(), since the preceding call to + * turnstile_lookup() raises the PIL to a level + * at which calls to prstop() should not be made. + */ + if ((error = lwptp->lwpt_time_error) != 0) { + /* + * The SUSV3 Posix spec is very clear that we + * should get no error from validating the + * timer until we would actually sleep. + */ + mutex_exit(&upibp->upib_lock); + goto out; + } + prstop(PR_REQUESTED, 0); + if (lwptp->lwpt_tsp != NULL) { + /* + * If we successfully queue the timeout + * (lwp_timer_enqueue() returns zero), + * then don't drop t_delay_lock until we are + * on the sleep queue (in turnstile_block()). + * Otherwise we will get an immediate timeout + * when we attempt to sleep in turnstile_block(). + */ + mutex_enter(&curthread->t_delay_lock); + if (lwp_timer_enqueue(lwptp) != 0) + mutex_exit(&curthread->t_delay_lock); + } + /* + * Now, set the waiter bit and block for the lock in turnstile_block(). + * No need to preserve the previous wbit since a lock try is not + * attempted after setting the wait bit. Wait bit is set under + * the upib_lock, which is not released until the turnstile lock + * is acquired. Say, the upimutex is L: + * + * 1. upib_lock is held so the waiter does not have to retry L after + * setting the wait bit: since the owner has to grab the upib_lock + * to unlock L, it will certainly see the wait bit set. + * 2. 
upib_lock is not released until the turnstile lock is acquired. + * This is the key to preventing a missed wake-up. Otherwise, the + * owner could acquire the upib_lock, and the tc_lock, to call + * turnstile_wakeup(). All this, before the waiter gets tc_lock + * to sleep in turnstile_block(). turnstile_wakeup() will then not + * find this waiter, resulting in the missed wakeup. + * 3. The upib_lock, being a kernel mutex, cannot be released while + * holding the tc_lock (since mutex_exit() could need to acquire + * the same tc_lock)...and so is held when calling turnstile_block(). + * The address of upib_lock is passed to turnstile_block() which + * releases it after releasing all turnstile locks, and before going + * to sleep in swtch(). + * 4. The waiter value cannot be a count of waiters, because a waiter + * can be interrupted. The interrupt occurs under the tc_lock, at + * which point, the upib_lock cannot be locked, to decrement waiter + * count. So, just treat the waiter state as a bit, not a count. + */ + ts = turnstile_lookup((upimutex_t *)upimutex); + upimutex->upi_waiter = 1; + error = turnstile_block(ts, TS_WRITER_Q, (upimutex_t *)upimutex, + &lwp_sobj_pi_ops, &upibp->upib_lock, lwptp); + /* + * Hand-off implies that we wakeup holding the lock, except when: + * - deadlock is detected + * - lock is not recoverable + * - we got an interrupt or timeout + * If we wake up due to an interrupt or timeout, we may + * or may not be holding the lock due to mutex hand-off. + * Use lwp_upimutex_owned() to check if we do hold the lock. + */ + if (error != 0) { + if ((error == EINTR || error == ETIME) && + (upimutex = lwp_upimutex_owned(lp, type))) { + /* + * Unlock and return - the re-startable syscall will + * try the lock again if we got EINTR. + */ + (void) upi_mylist_add((upimutex_t *)upimutex); + upimutex_unlock((upimutex_t *)upimutex, 0); + } + /* + * The only other possible error is EDEADLK. If so, upimutex + * is valid, since its owner is deadlocked with curthread. + */ + ASSERT(error == EINTR || error == ETIME || + (error == EDEADLK && !upi_owned((upimutex_t *)upimutex))); + ASSERT(!lwp_upimutex_owned(lp, type)); + goto out; + } + if (lwp_upimutex_owned(lp, type)) { + ASSERT(lwp_upimutex_owned(lp, type) == upimutex); + nupinest = upi_mylist_add((upimutex_t *)upimutex); + upilocked = 1; + } + /* + * Now, need to read the user-level lp->mutex_flag to do the following: + * + * - if lock is held, check if EOWNERDEAD should be returned + * - if lock isn't held, check if ENOTRECOVERABLE should be returned + * + * Now, either lp->mutex_flag is readable or it's not. If not + * readable, the on_fault path will cause a return with EFAULT as + * it should. If it is readable, the state of the flag encodes the + * robustness state of the lock: + * + * If the upimutex is locked here, the flag's LOCK_OWNERDEAD setting + * will influence the return code appropriately. If the upimutex is + * not locked here, this could be due to a spurious wake-up or a + * NOTRECOVERABLE event. The flag's setting can be used to distinguish + * between these two events. + */ + fuword16_noerr(&lp->mutex_flag, &flag); + if (upilocked) { + /* + * If the thread wakes up from turnstile_block with the lock + * held, the flag could not be set to LOCK_NOTRECOVERABLE, + * since it would not have been handed-off the lock. + * So, no need to check for this case. 
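+		 *
+		 * For orientation, the robustness flags move one way
+		 * here: upi_dead() sets LOCK_OWNERDEAD when an owner
+		 * dies, and lwp_upimutex_unlock() converts that to
+		 * LOCK_NOTRECOVERABLE on the next unlock:
+		 *
+		 *	0 -> LOCK_OWNERDEAD -> LOCK_NOTRECOVERABLE
+		 *
+		 * which is why a LOCK_NOTRECOVERABLE value read under
+		 * the upi mutex can be trusted here.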
+ */ + if (nupinest > maxnestupimx && + secpolicy_resource(CRED()) != 0) { + upimutex_unlock((upimutex_t *)upimutex, flag); + upilocked = 0; + error = ENOMEM; + } else if (flag & LOCK_OWNERDEAD) { + error = EOWNERDEAD; + } + } else { + /* + * Wake-up without the upimutex held. Either this is a + * spurious wake-up (due to signals, forkall(), whatever), or + * it is a LOCK_NOTRECOVERABLE robustness event. The setting + * of the mutex flag can be used to distinguish between the + * two events. + */ + if (flag & LOCK_NOTRECOVERABLE) { + error = ENOTRECOVERABLE; + } else { + /* + * Here, the flag could be set to LOCK_OWNERDEAD or + * not. In both cases, this is a spurious wakeup, + * since the upi lock is not held, but the thread + * has returned from turnstile_block(). + * + * The user flag could be LOCK_OWNERDEAD if, at the + * same time as curthread having been woken up + * spuriously, the owner (say Tdead) has died, marked + * the mutex flag accordingly, and handed off the lock + * to some other waiter (say Tnew). curthread just + * happened to read the flag while Tnew has yet to deal + * with the owner-dead event. + * + * In this event, curthread should retry the lock. + * If Tnew is able to cleanup the lock, curthread + * will eventually get the lock with a zero error code, + * If Tnew is unable to cleanup, its eventual call to + * unlock the lock will result in the mutex flag being + * set to LOCK_NOTRECOVERABLE, and the wake-up of + * all waiters, including curthread, which will then + * eventually return ENOTRECOVERABLE due to the above + * check. + * + * Of course, if the user-flag is not set with + * LOCK_OWNERDEAD, retrying is the thing to do, since + * this is definitely a spurious wakeup. + */ + goto retry; + } + } + +out: + no_fault(); + return (error); +} + + +static int +lwp_upimutex_unlock(lwp_mutex_t *lp, uint8_t type) +{ + label_t ljb; + int error = 0; + lwpchan_t lwpchan; + uint16_t flag; + upib_t *upibp; + volatile struct upimutex *upimutex = NULL; + volatile int upilocked = 0; + + if (on_fault(&ljb)) { + if (upilocked) + upimutex_unlock((upimutex_t *)upimutex, 0); + error = EFAULT; + goto out; + } + if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type, + &lwpchan, LWPCHAN_MPPOOL)) { + error = EFAULT; + goto out; + } + upibp = &UPI_CHAIN(lwpchan); + mutex_enter(&upibp->upib_lock); + upimutex = upi_get(upibp, &lwpchan); + /* + * If the lock is not held, or the owner is not curthread, return + * error. The user-level wrapper can return this error or stall, + * depending on whether mutex is of ERRORCHECK type or not. + */ + if (upimutex == NULL || upimutex->upi_owner != curthread) { + mutex_exit(&upibp->upib_lock); + error = EPERM; + goto out; + } + mutex_exit(&upibp->upib_lock); /* release for user memory access */ + upilocked = 1; + fuword16_noerr(&lp->mutex_flag, &flag); + if (flag & LOCK_OWNERDEAD) { + /* + * transition mutex to the LOCK_NOTRECOVERABLE state. + */ + flag &= ~LOCK_OWNERDEAD; + flag |= LOCK_NOTRECOVERABLE; + suword16_noerr(&lp->mutex_flag, flag); + } + upimutex_unlock((upimutex_t *)upimutex, flag); + upilocked = 0; +out: + no_fault(); + return (error); +} + +/* + * Mark user mutex state, corresponding to kernel upimutex, as LOCK_OWNERDEAD. 
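+ *
+ * From the application's point of view (a hedged sketch; the wrapper
+ * name and recovery helper are illustrative, not part of this file):
+ *
+ *	rc = mutex_lock(mp);
+ *	if (rc == EOWNERDEAD) {
+ *		repair_shared_state();		hypothetical recovery
+ *		rc = 0;				lock is held
+ *	} else if (rc == ENOTRECOVERABLE) {
+ *		abandon or reinitialize the object
+ *	}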
+ */ +static int +upi_dead(upimutex_t *upip) +{ + label_t ljb; + int error = 0; + lwp_mutex_t *lp; + uint16_t flag; + + if (on_fault(&ljb)) { + error = EFAULT; + goto out; + } + + lp = upip->upi_vaddr; + fuword16_noerr(&lp->mutex_flag, &flag); + flag |= LOCK_OWNERDEAD; + suword16_noerr(&lp->mutex_flag, flag); +out: + no_fault(); + return (error); +} + +/* + * Unlock all upimutexes held by curthread, since curthread is dying. + * For each upimutex, attempt to mark its corresponding user mutex object as + * dead. + */ +void +upimutex_cleanup() +{ + kthread_t *t = curthread; + struct upimutex *upip; + + while ((upip = t->t_upimutex) != NULL) { + if (upi_dead(upip) != 0) { + /* + * If the user object associated with this upimutex is + * unmapped, unlock upimutex with the + * LOCK_NOTRECOVERABLE flag, so that all waiters are + * woken up. Since user object is unmapped, it could + * not be marked as dead or notrecoverable. + * The waiters will now all wake up and return + * ENOTRECOVERABLE, since they would find that the lock + * has not been handed-off to them. + * See lwp_upimutex_lock(). + */ + upimutex_unlock(upip, LOCK_NOTRECOVERABLE); + } else { + /* + * The user object has been updated as dead. + * Unlock the upimutex: if no waiters, upip kmem will + * be freed. If there is a waiter, the lock will be + * handed off. If exit() is in progress, each existing + * waiter will successively get the lock, as owners + * die, and each new owner will call this routine as + * it dies. The last owner will free kmem, since + * it will find the upimutex has no waiters. So, + * eventually, the kmem is guaranteed to be freed. + */ + upimutex_unlock(upip, 0); + } + /* + * Note that the call to upimutex_unlock() above will delete + * upimutex from the t_upimutexes chain. And so the + * while loop will eventually terminate. + */ + } +} + +int +lwp_mutex_timedlock(lwp_mutex_t *lp, timespec_t *tsp) +{ + kthread_t *t = curthread; + klwp_t *lwp = ttolwp(t); + proc_t *p = ttoproc(t); + lwp_timer_t lwpt; + caddr_t timedwait; + int error = 0; + int time_error; + clock_t tim = -1; + uchar_t waiters; + volatile int locked = 0; + volatile int watched = 0; + label_t ljb; + volatile uint8_t type = 0; + lwpchan_t lwpchan; + sleepq_head_t *sqh; + static int iswanted(); + uint16_t flag; + int imm_timeout = 0; + + if ((caddr_t)lp >= p->p_as->a_userlimit) + return (set_errno(EFAULT)); + + timedwait = (caddr_t)tsp; + if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 && + lwpt.lwpt_imm_timeout) { + imm_timeout = 1; + timedwait = NULL; + } + + /* + * Although LMS_USER_LOCK implies "asleep waiting for user-mode lock", + * this micro state is really a run state. If the thread indeed blocks, + * this state becomes valid. If not, the state is converted back to + * LMS_SYSTEM. So, it is OK to set the mstate here, instead of just + * when blocking. 
+ */ + (void) new_mstate(t, LMS_USER_LOCK); + if (on_fault(&ljb)) { + if (locked) + lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL); + error = EFAULT; + goto out; + } + fuword8_noerr(&lp->mutex_type, (uint8_t *)&type); + if (UPIMUTEX(type)) { + no_fault(); + error = lwp_upimutex_lock(lp, type, UPIMUTEX_BLOCK, &lwpt); + if ((error == 0 || error == EOWNERDEAD) && + (type & USYNC_PROCESS)) + (void) suword32(&lp->mutex_ownerpid, p->p_pid); + if (tsp && !time_error) /* copyout the residual time left */ + error = lwp_timer_copyout(&lwpt, error); + if (error) + return (set_errno(error)); + return (0); + } + /* + * Force Copy-on-write fault if lwp_mutex_t object is + * defined to be MAP_PRIVATE and it was initialized to + * USYNC_PROCESS. + */ + suword8_noerr(&lp->mutex_type, type); + if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type, + &lwpchan, LWPCHAN_MPPOOL)) { + error = EFAULT; + goto out; + } + lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL); + locked = 1; + fuword8_noerr(&lp->mutex_waiters, &waiters); + suword8_noerr(&lp->mutex_waiters, 1); + if (type & USYNC_PROCESS_ROBUST) { + fuword16_noerr(&lp->mutex_flag, &flag); + if (flag & LOCK_NOTRECOVERABLE) { + lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL); + error = ENOTRECOVERABLE; + goto out; + } + } + + /* + * If watchpoints are set, they need to be restored, since + * atomic accesses of memory such as the call to ulock_try() + * below cannot be watched. + */ + + watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE); + + while (!ulock_try(&lp->mutex_lockw)) { + if (time_error) { + /* + * The SUSV3 Posix spec is very clear that we + * should get no error from validating the + * timer until we would actually sleep. + */ + error = time_error; + break; + } + + if (watched) { + watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE); + watched = 0; + } + + /* + * Put the lwp in an orderly state for debugging. + */ + prstop(PR_REQUESTED, 0); + if (timedwait) { + /* + * If we successfully queue the timeout, + * then don't drop t_delay_lock until + * we are on the sleep queue (below). + */ + mutex_enter(&t->t_delay_lock); + if (lwp_timer_enqueue(&lwpt) != 0) { + mutex_exit(&t->t_delay_lock); + imm_timeout = 1; + timedwait = NULL; + } + } + lwp_block(&lwpchan); + /* + * Nothing should happen to cause the lwp to go to + * sleep again until after it returns from swtch(). + */ + if (timedwait) + mutex_exit(&t->t_delay_lock); + locked = 0; + lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL); + if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) || imm_timeout) + setrun(t); + swtch(); + t->t_flag &= ~T_WAKEABLE; + if (timedwait) + tim = lwp_timer_dequeue(&lwpt); + setallwatch(); + if (ISSIG(t, FORREAL) || lwp->lwp_sysabort || MUSTRETURN(p, t)) + error = EINTR; + else if (imm_timeout || (timedwait && tim == -1)) + error = ETIME; + if (error) { + lwp->lwp_asleep = 0; + lwp->lwp_sysabort = 0; + watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), + S_WRITE); + + /* + * Need to re-compute waiters bit. The waiters field in + * the lock is not reliable. Either of two things could + * have occurred: no lwp may have called lwp_release() + * for me but I have woken up due to a signal or + * timeout. In this case, the waiter bit is incorrect + * since it is still set to 1, set above. + * OR an lwp_release() did occur for some other lwp on + * the same lwpchan. In this case, the waiter bit is + * correct. But which event occurred, one can't tell. + * So, recompute. 
+ */ + lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL); + locked = 1; + sqh = lwpsqhash(&lwpchan); + disp_lock_enter(&sqh->sq_lock); + waiters = iswanted(sqh->sq_queue.sq_first, &lwpchan); + disp_lock_exit(&sqh->sq_lock); + break; + } + lwp->lwp_asleep = 0; + watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), + S_WRITE); + lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL); + locked = 1; + fuword8_noerr(&lp->mutex_waiters, &waiters); + suword8_noerr(&lp->mutex_waiters, 1); + if (type & USYNC_PROCESS_ROBUST) { + fuword16_noerr(&lp->mutex_flag, &flag); + if (flag & LOCK_NOTRECOVERABLE) { + error = ENOTRECOVERABLE; + break; + } + } + } + + if (t->t_mstate == LMS_USER_LOCK) + (void) new_mstate(t, LMS_SYSTEM); + + if (!error && (type & (USYNC_PROCESS | USYNC_PROCESS_ROBUST))) { + suword32_noerr(&lp->mutex_ownerpid, p->p_pid); + if (type & USYNC_PROCESS_ROBUST) { + fuword16_noerr(&lp->mutex_flag, &flag); + if (flag & LOCK_OWNERDEAD) + error = EOWNERDEAD; + else if (flag & LOCK_UNMAPPED) + error = ELOCKUNMAPPED; + } + } + suword8_noerr(&lp->mutex_waiters, waiters); + locked = 0; + lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL); +out: + no_fault(); + if (watched) + watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE); + if (tsp && !time_error) /* copyout the residual time left */ + error = lwp_timer_copyout(&lwpt, error); + if (error) + return (set_errno(error)); + return (0); +} + +/* + * Obsolete lwp_mutex_lock() interface, no longer called from libc. + * libc now calls lwp_mutex_timedlock(lp, NULL). + * This system call trap continues to exist solely for the benefit + * of old statically-linked binaries from Solaris 9 and before. + * It should be removed from the system when we no longer care + * about such applications. + */ +int +lwp_mutex_lock(lwp_mutex_t *lp) +{ + return (lwp_mutex_timedlock(lp, NULL)); +} + +static int +iswanted(kthread_t *t, lwpchan_t *lwpchan) +{ + /* + * The caller holds the dispatcher lock on the sleep queue. + */ + while (t != NULL) { + if (t->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 && + t->t_lwpchan.lc_wchan == lwpchan->lc_wchan) + return (1); + t = t->t_link; + } + return (0); +} + +/* + * Return the highest priority thread sleeping on this lwpchan. + */ +static kthread_t * +lwp_queue_waiter(lwpchan_t *lwpchan) +{ + sleepq_head_t *sqh; + kthread_t *tp; + + sqh = lwpsqhash(lwpchan); + disp_lock_enter(&sqh->sq_lock); /* lock the sleep queue */ + for (tp = sqh->sq_queue.sq_first; tp != NULL; tp = tp->t_link) { + if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 && + tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) + break; + } + disp_lock_exit(&sqh->sq_lock); + return (tp); +} + +static int +lwp_release(lwpchan_t *lwpchan, uchar_t *waiters, int sync_type) +{ + sleepq_head_t *sqh; + kthread_t *tp; + kthread_t **tpp; + + sqh = lwpsqhash(lwpchan); + disp_lock_enter(&sqh->sq_lock); /* lock the sleep queue */ + tpp = &sqh->sq_queue.sq_first; + while ((tp = *tpp) != NULL) { + if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 && + tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) { + /* + * The following is typically false. It could be true + * only if lwp_release() is called from + * lwp_mutex_wakeup() after reading the waiters field + * from memory in which the lwp lock used to be, but has + * since been re-used to hold a lwp cv or lwp semaphore. + * The thread "tp" found to match the lwp lock's wchan + * is actually sleeping for the cv or semaphore which + * now has the same wchan. In this case, lwp_release() + * should return failure. 
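+			 *
+			 * The encoding makes this a one-line test:
+			 * callers pass sync_type == 0 for mutexes and
+			 * sync_type == T_WAITCVSEM for cvs/semaphores,
+			 * and a sleeper's t_flag carries T_WAITCVSEM
+			 * exactly when it blocked on a cv/semaphore,
+			 * so a mismatch identifies a recycled wchan.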
+ */ + if (sync_type != (tp->t_flag & T_WAITCVSEM)) { + ASSERT(sync_type == 0); + /* + * assert that this can happen only for mutexes + * i.e. sync_type == 0, for correctly written + * user programs. + */ + disp_lock_exit(&sqh->sq_lock); + return (0); + } + *waiters = iswanted(tp->t_link, lwpchan); + sleepq_unlink(tpp, tp); + DTRACE_SCHED1(wakeup, kthread_t *, tp); + tp->t_wchan0 = NULL; + tp->t_wchan = NULL; + tp->t_sobj_ops = NULL; + tp->t_release = 1; + THREAD_TRANSITION(tp); /* drops sleepq lock */ + CL_WAKEUP(tp); + thread_unlock(tp); /* drop run queue lock */ + return (1); + } + tpp = &tp->t_link; + } + *waiters = 0; + disp_lock_exit(&sqh->sq_lock); + return (0); +} + +static void +lwp_release_all(lwpchan_t *lwpchan) +{ + sleepq_head_t *sqh; + kthread_t *tp; + kthread_t **tpp; + + sqh = lwpsqhash(lwpchan); + disp_lock_enter(&sqh->sq_lock); /* lock sleep q queue */ + tpp = &sqh->sq_queue.sq_first; + while ((tp = *tpp) != NULL) { + if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 && + tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) { + sleepq_unlink(tpp, tp); + DTRACE_SCHED1(wakeup, kthread_t *, tp); + tp->t_wchan0 = NULL; + tp->t_wchan = NULL; + tp->t_sobj_ops = NULL; + CL_WAKEUP(tp); + thread_unlock_high(tp); /* release run queue lock */ + } else { + tpp = &tp->t_link; + } + } + disp_lock_exit(&sqh->sq_lock); /* drop sleep q lock */ +} + +/* + * unblock a lwp that is trying to acquire this mutex. the blocked + * lwp resumes and retries to acquire the lock. + */ +int +lwp_mutex_wakeup(lwp_mutex_t *lp) +{ + proc_t *p = ttoproc(curthread); + lwpchan_t lwpchan; + uchar_t waiters; + volatile int locked = 0; + volatile int watched = 0; + volatile uint8_t type = 0; + label_t ljb; + int error = 0; + + if ((caddr_t)lp >= p->p_as->a_userlimit) + return (set_errno(EFAULT)); + + watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE); + + if (on_fault(&ljb)) { + if (locked) + lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL); + error = EFAULT; + goto out; + } + /* + * Force Copy-on-write fault if lwp_mutex_t object is + * defined to be MAP_PRIVATE, and type is USYNC_PROCESS + */ + fuword8_noerr(&lp->mutex_type, (uint8_t *)&type); + suword8_noerr(&lp->mutex_type, type); + if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type, + &lwpchan, LWPCHAN_MPPOOL)) { + error = EFAULT; + goto out; + } + lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL); + locked = 1; + /* + * Always wake up an lwp (if any) waiting on lwpchan. The woken lwp will + * re-try the lock in lwp_mutex_timedlock(). The call to lwp_release() + * may fail. If it fails, do not write into the waiter bit. + * The call to lwp_release() might fail due to one of three reasons: + * + * 1. due to the thread which set the waiter bit not actually + * sleeping since it got the lock on the re-try. The waiter + * bit will then be correctly updated by that thread. This + * window may be closed by reading the wait bit again here + * and not calling lwp_release() at all if it is zero. + * 2. the thread which set the waiter bit and went to sleep + * was woken up by a signal. This time, the waiter recomputes + * the wait bit in the return with EINTR code. + * 3. the waiter bit read by lwp_mutex_wakeup() was in + * memory that has been re-used after the lock was dropped. + * In this case, writing into the waiter bit would cause data + * corruption. 
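+	 *
+	 * Hence the guarded store just below:
+	 *
+	 *	if (lwp_release(&lwpchan, &waiters, 0) == 1)
+	 *		suword8_noerr(&lp->mutex_waiters, waiters);
+	 *
+	 * The waiter byte is rewritten only when a sleeper was really
+	 * found and woken (cases 1 and 2); in case 3 lwp_release()
+	 * fails and the re-used memory is left untouched.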
+ */
+	if (lwp_release(&lwpchan, &waiters, 0) == 1) {
+		suword8_noerr(&lp->mutex_waiters, waiters);
+	}
+	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
+out:
+	no_fault();
+	if (watched)
+		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
+	if (error)
+		return (set_errno(error));
+	return (0);
+}
+
+/*
+ * lwp_cond_wait() has four arguments, a pointer to a condition variable,
+ * a pointer to a mutex, a pointer to a timespec for a timed wait and
+ * a flag telling the kernel whether or not to honor the kernel/user
+ * schedctl parking protocol (see schedctl_is_park() in schedctl.c).
+ * The kernel puts the lwp to sleep on a unique pair of caddr_t's called an
+ * lwpchan, returned by get_lwpchan(). If the timespec pointer is non-NULL,
+ * it is used as an in/out parameter. On entry, it contains the relative
+ * time until timeout. On exit, we copyout the residual time left to it.
+ */
+int
+lwp_cond_wait(lwp_cond_t *cv, lwp_mutex_t *mp, timespec_t *tsp, int check_park)
+{
+	kthread_t *t = curthread;
+	klwp_t *lwp = ttolwp(t);
+	proc_t *p = ttoproc(t);
+	lwp_timer_t lwpt;
+	lwpchan_t cv_lwpchan;
+	lwpchan_t m_lwpchan;
+	caddr_t timedwait;
+	volatile uint16_t type = 0;
+	volatile uint8_t mtype = 0;
+	uchar_t waiters;
+	volatile int error;
+	clock_t tim = -1;
+	volatile int locked = 0;
+	volatile int m_locked = 0;
+	volatile int cvwatched = 0;
+	volatile int mpwatched = 0;
+	label_t ljb;
+	volatile int no_lwpchan = 1;
+	int imm_timeout = 0;
+	int imm_unpark = 0;
+
+	if ((caddr_t)cv >= p->p_as->a_userlimit ||
+	    (caddr_t)mp >= p->p_as->a_userlimit)
+		return (set_errno(EFAULT));
+
+	timedwait = (caddr_t)tsp;
+	if ((error = lwp_timer_copyin(&lwpt, tsp)) != 0)
+		return (set_errno(error));
+	if (lwpt.lwpt_imm_timeout) {
+		imm_timeout = 1;
+		timedwait = NULL;
+	}
+
+	(void) new_mstate(t, LMS_USER_LOCK);
+
+	if (on_fault(&ljb)) {
+		if (no_lwpchan) {
+			error = EFAULT;
+			goto out;
+		}
+		if (m_locked) {
+			m_locked = 0;
+			lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
+		}
+		if (locked) {
+			locked = 0;
+			lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
+		}
+		/*
+		 * set up another on_fault() for a possible fault
+		 * on the user lock accessed at "efault"
+		 */
+		if (on_fault(&ljb)) {
+			if (m_locked) {
+				m_locked = 0;
+				lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
+			}
+			goto out;
+		}
+		error = EFAULT;
+		goto efault;
+	}
+
+	/*
+	 * Force Copy-on-write fault if lwp_cond_t and lwp_mutex_t
+	 * objects are defined to be MAP_PRIVATE, and are USYNC_PROCESS.
+	 */
+	fuword8_noerr(&mp->mutex_type, (uint8_t *)&mtype);
+	if (UPIMUTEX(mtype) == 0) {
+		suword8_noerr(&mp->mutex_type, mtype);
+		/* convert user level mutex, "mp", to a unique lwpchan */
+		/* check if mtype is ok to use below, instead of type from cv */
+		if (!get_lwpchan(p->p_as, (caddr_t)mp, mtype,
+		    &m_lwpchan, LWPCHAN_MPPOOL)) {
+			error = EFAULT;
+			goto out;
+		}
+	}
+	fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
+	suword16_noerr(&cv->cond_type, type);
+	/* convert user level condition variable, "cv", to a unique lwpchan */
+	if (!get_lwpchan(p->p_as, (caddr_t)cv, type,
+	    &cv_lwpchan, LWPCHAN_CVPOOL)) {
+		error = EFAULT;
+		goto out;
+	}
+	no_lwpchan = 0;
+	cvwatched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
+	if (UPIMUTEX(mtype) == 0)
+		mpwatched = watch_disable_addr((caddr_t)mp, sizeof (*mp),
+		    S_WRITE);
+
+	/*
+	 * lwpchan_lock ensures that the calling lwp is put to sleep atomically
+	 * with respect to a possible wakeup which is a result of either
+	 * an lwp_cond_signal() or an lwp_cond_broadcast().
+ * + * What's misleading, is that the lwp is put to sleep after the + * condition variable's mutex is released. This is OK as long as + * the release operation is also done while holding lwpchan_lock. + * The lwp is then put to sleep when the possibility of pagefaulting + * or sleeping is completely eliminated. + */ + lwpchan_lock(&cv_lwpchan, LWPCHAN_CVPOOL); + locked = 1; + if (UPIMUTEX(mtype) == 0) { + lwpchan_lock(&m_lwpchan, LWPCHAN_MPPOOL); + m_locked = 1; + suword8_noerr(&cv->cond_waiters_kernel, 1); + /* + * unlock the condition variable's mutex. (pagefaults are + * possible here.) + */ + ulock_clear(&mp->mutex_lockw); + fuword8_noerr(&mp->mutex_waiters, &waiters); + if (waiters != 0) { + /* + * Given the locking of lwpchan_lock around the release + * of the mutex and checking for waiters, the following + * call to lwp_release() can fail ONLY if the lock + * acquirer is interrupted after setting the waiter bit, + * calling lwp_block() and releasing lwpchan_lock. + * In this case, it could get pulled off the lwp sleep + * q (via setrun()) before the following call to + * lwp_release() occurs. In this case, the lock + * requestor will update the waiter bit correctly by + * re-evaluating it. + */ + if (lwp_release(&m_lwpchan, &waiters, 0) > 0) + suword8_noerr(&mp->mutex_waiters, waiters); + } + m_locked = 0; + lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL); + } else { + suword8_noerr(&cv->cond_waiters_kernel, 1); + error = lwp_upimutex_unlock(mp, mtype); + if (error) { /* if the upimutex unlock failed */ + locked = 0; + lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL); + goto out; + } + } + no_fault(); + + if (mpwatched) { + watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE); + mpwatched = 0; + } + if (cvwatched) { + watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE); + cvwatched = 0; + } + + /* + * Put the lwp in an orderly state for debugging. + */ + prstop(PR_REQUESTED, 0); + if (check_park && (!schedctl_is_park() || t->t_unpark)) { + /* + * We received a signal at user-level before calling here + * or another thread wants us to return immediately + * with EINTR. See lwp_unpark(). + */ + imm_unpark = 1; + t->t_unpark = 0; + timedwait = NULL; + } else if (timedwait) { + /* + * If we successfully queue the timeout, + * then don't drop t_delay_lock until + * we are on the sleep queue (below). + */ + mutex_enter(&t->t_delay_lock); + if (lwp_timer_enqueue(&lwpt) != 0) { + mutex_exit(&t->t_delay_lock); + imm_timeout = 1; + timedwait = NULL; + } + } + t->t_flag |= T_WAITCVSEM; + lwp_block(&cv_lwpchan); + /* + * Nothing should happen to cause the lwp to go to sleep + * until after it returns from swtch(). 
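+	 *
+	 * For context, this is what makes the classic user-level
+	 * guarded wait correct (sketch; wrapper names illustrative):
+	 *
+	 *	(void) mutex_lock(mp);
+	 *	while (!condition)
+	 *		(void) cond_wait(cv, mp);	may wake spuriously
+	 *	... condition holds, mp held ...
+	 *	(void) mutex_unlock(mp);
+	 *
+	 * Spurious wake-ups are harmless because the predicate is
+	 * re-tested under the re-acquired mutex.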
+ */ + if (timedwait) + mutex_exit(&t->t_delay_lock); + locked = 0; + lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL); + if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) || + (imm_timeout | imm_unpark)) + setrun(t); + swtch(); + t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE); + if (timedwait) + tim = lwp_timer_dequeue(&lwpt); + if (ISSIG(t, FORREAL) || lwp->lwp_sysabort || + MUSTRETURN(p, t) || imm_unpark) + error = EINTR; + else if (imm_timeout || (timedwait && tim == -1)) + error = ETIME; + lwp->lwp_asleep = 0; + lwp->lwp_sysabort = 0; + setallwatch(); + + if (t->t_mstate == LMS_USER_LOCK) + (void) new_mstate(t, LMS_SYSTEM); + + if (tsp && check_park) /* copyout the residual time left */ + error = lwp_timer_copyout(&lwpt, error); + + /* the mutex is reacquired by the caller on return to user level */ + if (error) { + /* + * If we were concurrently lwp_cond_signal()d and we + * received a UNIX signal or got a timeout, then perform + * another lwp_cond_signal() to avoid consuming the wakeup. + */ + if (t->t_release) + (void) lwp_cond_signal(cv); + return (set_errno(error)); + } + return (0); + +efault: + /* + * make sure that the user level lock is dropped before + * returning to caller, since the caller always re-acquires it. + */ + if (UPIMUTEX(mtype) == 0) { + lwpchan_lock(&m_lwpchan, LWPCHAN_MPPOOL); + m_locked = 1; + ulock_clear(&mp->mutex_lockw); + fuword8_noerr(&mp->mutex_waiters, &waiters); + if (waiters != 0) { + /* + * See comment above on lock clearing and lwp_release() + * success/failure. + */ + if (lwp_release(&m_lwpchan, &waiters, 0) > 0) + suword8_noerr(&mp->mutex_waiters, waiters); + } + m_locked = 0; + lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL); + } else { + (void) lwp_upimutex_unlock(mp, mtype); + } +out: + no_fault(); + if (mpwatched) + watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE); + if (cvwatched) + watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE); + if (t->t_mstate == LMS_USER_LOCK) + (void) new_mstate(t, LMS_SYSTEM); + return (set_errno(error)); +} + +/* + * wakeup one lwp that's blocked on this condition variable. + */ +int +lwp_cond_signal(lwp_cond_t *cv) +{ + proc_t *p = ttoproc(curthread); + lwpchan_t lwpchan; + uchar_t waiters; + volatile uint16_t type = 0; + volatile int locked = 0; + volatile int watched = 0; + label_t ljb; + int error = 0; + + if ((caddr_t)cv >= p->p_as->a_userlimit) + return (set_errno(EFAULT)); + + watched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE); + + if (on_fault(&ljb)) { + if (locked) + lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL); + error = EFAULT; + goto out; + } + /* + * Force Copy-on-write fault if lwp_cond_t object is + * defined to be MAP_PRIVATE, and is USYNC_PROCESS. + */ + fuword16_noerr(&cv->cond_type, (uint16_t *)&type); + suword16_noerr(&cv->cond_type, type); + if (!get_lwpchan(curproc->p_as, (caddr_t)cv, type, + &lwpchan, LWPCHAN_CVPOOL)) { + error = EFAULT; + goto out; + } + lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL); + locked = 1; + fuword8_noerr(&cv->cond_waiters_kernel, &waiters); + if (waiters != 0) { + /* + * The following call to lwp_release() might fail but it is + * OK to write into the waiters bit below, since the memory + * could not have been re-used or unmapped (for correctly + * written user programs) as in the case of lwp_mutex_wakeup(). + * For an incorrect program, we should not care about data + * corruption since this is just one instance of other places + * where corruption can occur for such a program. Of course + * if the memory is unmapped, normal fault recovery occurs. 
+ */ + (void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM); + suword8_noerr(&cv->cond_waiters_kernel, waiters); + } + lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL); +out: + no_fault(); + if (watched) + watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE); + if (error) + return (set_errno(error)); + return (0); +} + +/* + * wakeup every lwp that's blocked on this condition variable. + */ +int +lwp_cond_broadcast(lwp_cond_t *cv) +{ + proc_t *p = ttoproc(curthread); + lwpchan_t lwpchan; + volatile uint16_t type = 0; + volatile int locked = 0; + volatile int watched = 0; + label_t ljb; + uchar_t waiters; + int error = 0; + + if ((caddr_t)cv >= p->p_as->a_userlimit) + return (set_errno(EFAULT)); + + watched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE); + + if (on_fault(&ljb)) { + if (locked) + lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL); + error = EFAULT; + goto out; + } + /* + * Force Copy-on-write fault if lwp_cond_t object is + * defined to be MAP_PRIVATE, and is USYNC_PROCESS. + */ + fuword16_noerr(&cv->cond_type, (uint16_t *)&type); + suword16_noerr(&cv->cond_type, type); + if (!get_lwpchan(curproc->p_as, (caddr_t)cv, type, + &lwpchan, LWPCHAN_CVPOOL)) { + error = EFAULT; + goto out; + } + lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL); + locked = 1; + fuword8_noerr(&cv->cond_waiters_kernel, &waiters); + if (waiters != 0) { + lwp_release_all(&lwpchan); + suword8_noerr(&cv->cond_waiters_kernel, 0); + } + lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL); +out: + no_fault(); + if (watched) + watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE); + if (error) + return (set_errno(error)); + return (0); +} + +int +lwp_sema_trywait(lwp_sema_t *sp) +{ + kthread_t *t = curthread; + proc_t *p = ttoproc(t); + label_t ljb; + volatile int locked = 0; + volatile int watched = 0; + volatile uint16_t type = 0; + int count; + lwpchan_t lwpchan; + uchar_t waiters; + int error = 0; + + if ((caddr_t)sp >= p->p_as->a_userlimit) + return (set_errno(EFAULT)); + + watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE); + + if (on_fault(&ljb)) { + if (locked) + lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL); + error = EFAULT; + goto out; + } + /* + * Force Copy-on-write fault if lwp_sema_t object is + * defined to be MAP_PRIVATE, and is USYNC_PROCESS. + */ + fuword16_noerr((void *)&sp->sema_type, (uint16_t *)&type); + suword16_noerr((void *)&sp->sema_type, type); + if (!get_lwpchan(p->p_as, (caddr_t)sp, type, + &lwpchan, LWPCHAN_CVPOOL)) { + error = EFAULT; + goto out; + } + lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL); + locked = 1; + fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count); + if (count == 0) + error = EBUSY; + else + suword32_noerr((void *)&sp->sema_count, --count); + if (count != 0) { + fuword8_noerr(&sp->sema_waiters, &waiters); + if (waiters != 0) { + (void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM); + suword8_noerr(&sp->sema_waiters, waiters); + } + } + lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL); +out: + no_fault(); + if (watched) + watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE); + if (error) + return (set_errno(error)); + return (0); +} + +/* + * See lwp_cond_wait(), above, for an explanation of the 'check_park' argument. 
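+ *
+ * For orientation, the semaphore operations in this file reduce to
+ * the following (counts live in user memory; sketch only):
+ *
+ *	trywait:	if (count == 0) return EBUSY; else count--;
+ *	timedwait:	while (count == 0) block; count--;
+ *	post:		if (count == _SEM_VALUE_MAX) return EOVERFLOW;
+ *			else count++; wake a waiter if one exists;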
+ */ +int +lwp_sema_timedwait(lwp_sema_t *sp, timespec_t *tsp, int check_park) +{ + kthread_t *t = curthread; + klwp_t *lwp = ttolwp(t); + proc_t *p = ttoproc(t); + lwp_timer_t lwpt; + caddr_t timedwait; + clock_t tim = -1; + label_t ljb; + volatile int locked = 0; + volatile int watched = 0; + volatile uint16_t type = 0; + int count; + lwpchan_t lwpchan; + uchar_t waiters; + int error = 0; + int time_error; + int imm_timeout = 0; + int imm_unpark = 0; + + if ((caddr_t)sp >= p->p_as->a_userlimit) + return (set_errno(EFAULT)); + + timedwait = (caddr_t)tsp; + if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 && + lwpt.lwpt_imm_timeout) { + imm_timeout = 1; + timedwait = NULL; + } + + watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE); + + if (on_fault(&ljb)) { + if (locked) + lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL); + error = EFAULT; + goto out; + } + /* + * Force Copy-on-write fault if lwp_sema_t object is + * defined to be MAP_PRIVATE, and is USYNC_PROCESS. + */ + fuword16_noerr((void *)&sp->sema_type, (uint16_t *)&type); + suword16_noerr((void *)&sp->sema_type, type); + if (!get_lwpchan(p->p_as, (caddr_t)sp, type, + &lwpchan, LWPCHAN_CVPOOL)) { + error = EFAULT; + goto out; + } + lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL); + locked = 1; + fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count); + while (error == 0 && count == 0) { + if (time_error) { + /* + * The SUSV3 Posix spec is very clear that we + * should get no error from validating the + * timer until we would actually sleep. + */ + error = time_error; + break; + } + suword8_noerr(&sp->sema_waiters, 1); + if (watched) + watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE); + /* + * Put the lwp in an orderly state for debugging. + */ + prstop(PR_REQUESTED, 0); + if (check_park && (!schedctl_is_park() || t->t_unpark)) { + /* + * We received a signal at user-level before calling + * here or another thread wants us to return + * immediately with EINTR. See lwp_unpark(). + */ + imm_unpark = 1; + t->t_unpark = 0; + timedwait = NULL; + } else if (timedwait) { + /* + * If we successfully queue the timeout, + * then don't drop t_delay_lock until + * we are on the sleep queue (below). + */ + mutex_enter(&t->t_delay_lock); + if (lwp_timer_enqueue(&lwpt) != 0) { + mutex_exit(&t->t_delay_lock); + imm_timeout = 1; + timedwait = NULL; + } + } + t->t_flag |= T_WAITCVSEM; + lwp_block(&lwpchan); + /* + * Nothing should happen to cause the lwp to sleep + * again until after it returns from swtch(). 
+ */ + if (timedwait) + mutex_exit(&t->t_delay_lock); + locked = 0; + lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL); + if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) || + (imm_timeout | imm_unpark)) + setrun(t); + swtch(); + t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE); + if (timedwait) + tim = lwp_timer_dequeue(&lwpt); + setallwatch(); + if (ISSIG(t, FORREAL) || lwp->lwp_sysabort || + MUSTRETURN(p, t) || imm_unpark) + error = EINTR; + else if (imm_timeout || (timedwait && tim == -1)) + error = ETIME; + lwp->lwp_asleep = 0; + lwp->lwp_sysabort = 0; + watched = watch_disable_addr((caddr_t)sp, + sizeof (*sp), S_WRITE); + lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL); + locked = 1; + fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count); + } + if (error == 0) + suword32_noerr((void *)&sp->sema_count, --count); + if (count != 0) { + (void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM); + suword8_noerr(&sp->sema_waiters, waiters); + } + lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL); +out: + no_fault(); + if (watched) + watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE); + if (tsp && check_park && !time_error) + error = lwp_timer_copyout(&lwpt, error); + if (error) + return (set_errno(error)); + return (0); +} + +/* + * Obsolete lwp_sema_wait() interface, no longer called from libc. + * libc now calls lwp_sema_timedwait(). + * This system call trap exists solely for the benefit of old + * statically linked applications from Solaris 9 and before. + * It should be removed when we no longer care about such applications. + */ +int +lwp_sema_wait(lwp_sema_t *sp) +{ + return (lwp_sema_timedwait(sp, NULL, 0)); +} + +int +lwp_sema_post(lwp_sema_t *sp) +{ + proc_t *p = ttoproc(curthread); + label_t ljb; + volatile int locked = 0; + volatile int watched = 0; + volatile uint16_t type = 0; + int count; + lwpchan_t lwpchan; + uchar_t waiters; + int error = 0; + + if ((caddr_t)sp >= p->p_as->a_userlimit) + return (set_errno(EFAULT)); + + watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE); + + if (on_fault(&ljb)) { + if (locked) + lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL); + error = EFAULT; + goto out; + } + /* + * Force Copy-on-write fault if lwp_sema_t object is + * defined to be MAP_PRIVATE, and is USYNC_PROCESS. + */ + fuword16_noerr(&sp->sema_type, (uint16_t *)&type); + suword16_noerr(&sp->sema_type, type); + if (!get_lwpchan(curproc->p_as, (caddr_t)sp, type, + &lwpchan, LWPCHAN_CVPOOL)) { + error = EFAULT; + goto out; + } + lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL); + locked = 1; + fuword32_noerr(&sp->sema_count, (uint32_t *)&count); + if (count == _SEM_VALUE_MAX) + error = EOVERFLOW; + else + suword32_noerr(&sp->sema_count, ++count); + if (count == 1) { + fuword8_noerr(&sp->sema_waiters, &waiters); + if (waiters) { + (void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM); + suword8_noerr(&sp->sema_waiters, waiters); + } + } + lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL); +out: + no_fault(); + if (watched) + watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE); + if (error) + return (set_errno(error)); + return (0); +} + +#define TRW_WANT_WRITE 0x1 +#define TRW_LOCK_GRANTED 0x2 + +#define READ_LOCK 0 +#define WRITE_LOCK 1 +#define TRY_FLAG 0x10 +#define READ_LOCK_TRY (READ_LOCK | TRY_FLAG) +#define WRITE_LOCK_TRY (WRITE_LOCK | TRY_FLAG) + +/* + * Release one writer or one or more readers. Compute the rwstate word to + * reflect the new state of the queue. For a safe hand-off we copy the new + * rwstate value back to userland before we wake any of the new lock holders. 
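+ *
+ * For reference, the rwstate word combines (exact bit layout left to
+ * the header that defines these):
+ *
+ *	URW_READERS_MASK	count of active readers (low bits)
+ *	URW_WRITE_LOCKED	a writer holds the lock
+ *	URW_HAS_WAITERS		threads may be queued on the lwpchan
+ *	URW_WRITE_WANTED	a queued waiter wants the write lock
+ *
+ * So releasing three queued readers past a still-queued writer
+ * yields rwstate == (3 | URW_HAS_WAITERS | URW_WRITE_WANTED).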
+ * + * Note that sleepq_insert() implements a prioritized FIFO (with writers + * being given precedence over readers of the same priority). + * + * If the first thread is a reader we scan the queue releasing all readers + * until we hit a writer or the end of the queue. If the first thread is a + * writer we still need to check for another writer (i.e. URW_WRITE_WANTED). + */ +void +lwp_rwlock_release(lwpchan_t *lwpchan, lwp_rwlock_t *rw) +{ + sleepq_head_t *sqh; + kthread_t *tp; + kthread_t **tpp; + kthread_t *tpnext; + kthread_t *wakelist = NULL; + uint32_t rwstate = 0; + int wcount = 0; + int rcount = 0; + + sqh = lwpsqhash(lwpchan); + disp_lock_enter(&sqh->sq_lock); + tpp = &sqh->sq_queue.sq_first; + while ((tp = *tpp) != NULL) { + if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 && + tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) { + if (tp->t_writer & TRW_WANT_WRITE) { + if ((wcount++ == 0) && (rcount == 0)) { + rwstate |= URW_WRITE_LOCKED; + + /* Just one writer to wake. */ + sleepq_unlink(tpp, tp); + wakelist = tp; + + /* tpp already set for next thread. */ + continue; + } else { + rwstate |= + (URW_WRITE_WANTED|URW_HAS_WAITERS); + + /* We need look no further. */ + break; + } + } else { + rcount++; + if (wcount == 0) { + rwstate++; + + /* Add reader to wake list. */ + sleepq_unlink(tpp, tp); + tp->t_link = wakelist; + wakelist = tp; + + /* tpp already set for next thread. */ + continue; + } else + rwstate |= URW_HAS_WAITERS; + } + } + tpp = &tp->t_link; + } + + /* Copy the new rwstate back to userland. */ + suword32_noerr(&rw->rwlock_readers, rwstate); + + /* Wake the new lock holder(s) up. */ + tp = wakelist; + while (tp != NULL) { + DTRACE_SCHED1(wakeup, kthread_t *, tp); + tp->t_wchan0 = NULL; + tp->t_wchan = NULL; + tp->t_sobj_ops = NULL; + tp->t_writer |= TRW_LOCK_GRANTED; + tpnext = tp->t_link; + tp->t_link = NULL; + CL_WAKEUP(tp); + thread_unlock_high(tp); + tp = tpnext; + } + + disp_lock_exit(&sqh->sq_lock); +} + +/* + * We enter here holding the user-level mutex, which we must release before + * returning or blocking. Based on lwp_cond_wait(). + */ +static int +lwp_rwlock_lock(lwp_rwlock_t *rw, timespec_t *tsp, int rd_wr) +{ + lwp_mutex_t *mp = NULL; + kthread_t *t = curthread; + kthread_t *tp; + klwp_t *lwp = ttolwp(t); + proc_t *p = ttoproc(t); + lwp_timer_t lwpt; + lwpchan_t lwpchan; + lwpchan_t mlwpchan; + caddr_t timedwait; + volatile uint16_t type = 0; + volatile uint8_t mtype = 0; + uchar_t mwaiters; + volatile int error = 0; + int time_error; + clock_t tim = -1; + volatile int locked = 0; + volatile int mlocked = 0; + volatile int watched = 0; + volatile int mwatched = 0; + label_t ljb; + volatile int no_lwpchan = 1; + int imm_timeout = 0; + int try_flag; + uint32_t rwstate; + int acquired = 0; + + /* We only check rw because the mutex is included in it. */ + if ((caddr_t)rw >= p->p_as->a_userlimit) + return (set_errno(EFAULT)); + + /* We must only report this error if we are about to sleep (later). */ + timedwait = (caddr_t)tsp; + if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 && + lwpt.lwpt_imm_timeout) { + imm_timeout = 1; + timedwait = NULL; + } + + (void) new_mstate(t, LMS_USER_LOCK); + + if (on_fault(&ljb)) { + if (no_lwpchan) { + error = EFAULT; + goto out_nodrop; + } + if (mlocked) { + mlocked = 0; + lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL); + } + if (locked) { + locked = 0; + lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL); + } + /* + * Set up another on_fault() for a possible fault + * on the user lock accessed at "out_drop". 
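+		 *
+		 * Schematically, the on_fault()/no_fault() protocol
+		 * used throughout this file is:
+		 *
+		 *	if (on_fault(&ljb)) {
+		 *		undo partial state, error = EFAULT;
+		 *	}
+		 *	... fuword/suword access to user memory ...
+		 *	no_fault();
+		 *
+		 * A fault inside the guarded region transfers control
+		 * back to the on_fault() call, which then returns
+		 * nonzero, so any lock taken before the fault must be
+		 * dropped in that block.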
+ */
+		if (on_fault(&ljb)) {
+			if (mlocked) {
+				mlocked = 0;
+				lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
+			}
+			error = EFAULT;
+			goto out_nodrop;
+		}
+		error = EFAULT;
+		goto out_nodrop;
+	}
+
+	/* Process rd_wr (including sanity check). */
+	try_flag = (rd_wr & TRY_FLAG);
+	rd_wr &= ~TRY_FLAG;
+	if ((rd_wr != READ_LOCK) && (rd_wr != WRITE_LOCK)) {
+		error = EINVAL;
+		goto out_nodrop;
+	}
+
+	/* We can only continue for simple USYNC_PROCESS locks. */
+	mp = &rw->mutex;
+	fuword8_noerr(&mp->mutex_type, (uint8_t *)&mtype);
+	fuword16_noerr(&rw->rwlock_type, (uint16_t *)&type);
+	if ((mtype != USYNC_PROCESS) || (type != USYNC_PROCESS)) {
+		error = EINVAL;
+		goto out_nodrop;
+	}
+
+	/* Force Copy-on-write fault in case objects are MAP_PRIVATE. */
+	suword8_noerr(&mp->mutex_type, mtype);
+	suword16_noerr(&rw->rwlock_type, type);
+
+	/* Convert user level mutex, "mp", to a unique lwpchan. */
+	if (!get_lwpchan(p->p_as, (caddr_t)mp, mtype,
+	    &mlwpchan, LWPCHAN_MPPOOL)) {
+		error = EFAULT;
+		goto out_nodrop;
+	}
+
+	/* Convert user level rwlock, "rw", to a unique lwpchan. */
+	if (!get_lwpchan(p->p_as, (caddr_t)rw, type,
+	    &lwpchan, LWPCHAN_CVPOOL)) {
+		error = EFAULT;
+		goto out_nodrop;
+	}
+
+	no_lwpchan = 0;
+	watched = watch_disable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
+	mwatched = watch_disable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
+
+	/*
+	 * lwpchan_lock() ensures that the calling LWP is put to sleep
+	 * atomically with respect to a possible wakeup which is a result
+	 * of lwp_rwlock_unlock().
+	 *
+	 * What's misleading is that the LWP is put to sleep after the
+	 * rwlock's mutex is released. This is OK as long as the release
+	 * operation is also done while holding mlwpchan. The LWP is then
+	 * put to sleep when the possibility of pagefaulting or sleeping
+	 * has been completely eliminated.
+	 */
+	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
+	locked = 1;
+	lwpchan_lock(&mlwpchan, LWPCHAN_MPPOOL);
+	mlocked = 1;
+
+	/*
+	 * Fetch the current rwlock state.
+	 *
+	 * The possibility of spurious wake-ups or killed waiters means that
+	 * rwstate's URW_HAS_WAITERS and URW_WRITE_WANTED bits may indicate
+	 * false positives. We only fix these if they are important to us.
+	 *
+	 * Although various error states can be observed here (e.g. the lock
+	 * is not held, but there are waiters) we assume these are application
+	 * errors and so we take no corrective action.
+	 */
+	fuword32_noerr(&rw->rwlock_readers, &rwstate);
+
+	/*
+	 * If the lock is uncontended we can acquire it here. These tests
+	 * should have already been done at user-level; we just need to be
+	 * sure.
+	 */
+	if (rd_wr == READ_LOCK) {
+		if ((rwstate & ~URW_READERS_MASK) == 0) {
+			rwstate++;
+			acquired = 1;
+		}
+	} else if (rwstate == 0) {
+		rwstate = URW_WRITE_LOCKED;
+		acquired = 1;
+	}
+
+	/*
+	 * We can only try harder if the lock isn't held by a writer.
+	 */
+	if (!acquired && !(rwstate & URW_WRITE_LOCKED)) {
+		tp = lwp_queue_waiter(&lwpchan);
+		if (tp == NULL) {
+			/*
+			 * Hmmm, rwstate indicates waiters but there are
+			 * none queued. This could just be the result of a
+			 * spurious wakeup, so let's fix it.
+			 */
+			rwstate &= URW_READERS_MASK;
+
+			/*
+			 * We now have another chance to acquire the lock
+			 * uncontended, but this is the last chance for a
+			 * writer to acquire the lock without blocking.
+ */ + if (rd_wr == READ_LOCK) { + rwstate++; + acquired = 1; + } else if (rwstate == 0) { + rwstate = URW_WRITE_LOCKED; + acquired = 1; + } + } else if (rd_wr == READ_LOCK) { + /* + * This is the last chance for a reader to acquire + * the lock now, but it can only do so if there is + * no writer of equal or greater priority at the + * head of the queue . + * + * It is also just possible that there is a reader + * at the head of the queue. This may be the result + * of a spurious wakeup or an application failure. + * In this case we only acquire the lock if we have + * equal or greater priority. It is not our job to + * release spurious waiters. + */ + pri_t our_pri = DISP_PRIO(t); + pri_t his_pri = DISP_PRIO(tp); + + if ((our_pri > his_pri) || ((our_pri == his_pri) && + !(tp->t_writer & TRW_WANT_WRITE))) { + rwstate++; + acquired = 1; + } + } + } + + if (acquired || try_flag || time_error) { + /* + * We're not going to block this time! + */ + suword32_noerr(&rw->rwlock_readers, rwstate); + lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL); + locked = 0; + + if (acquired) { + /* + * Got the lock! + */ + error = 0; + + } else if (try_flag) { + /* + * We didn't get the lock and we're about to block. + * If we're doing a trylock, return EBUSY instead. + */ + error = EBUSY; + + } else if (time_error) { + /* + * The SUSV3 POSIX spec is very clear that we should + * get no error from validating the timer (above) + * until we would actually sleep. + */ + error = time_error; + } + + goto out_drop; + } + + /* + * We're about to block, so indicate what kind of waiter we are. + */ + t->t_writer = 0; + rwstate |= URW_HAS_WAITERS; + if (rd_wr == WRITE_LOCK) { + t->t_writer = TRW_WANT_WRITE; + rwstate |= URW_WRITE_WANTED; + } + suword32_noerr(&rw->rwlock_readers, rwstate); + + /* + * Unlock the rwlock's mutex (pagefaults are possible here). + */ + ulock_clear(&mp->mutex_lockw); + fuword8_noerr(&mp->mutex_waiters, &mwaiters); + if (mwaiters != 0) { + /* + * Given the locking of mlwpchan around the release of + * the mutex and checking for waiters, the following + * call to lwp_release() can fail ONLY if the lock + * acquirer is interrupted after setting the waiter bit, + * calling lwp_block() and releasing mlwpchan. + * In this case, it could get pulled off the LWP sleep + * queue (via setrun()) before the following call to + * lwp_release() occurs, and the lock requestor will + * update the waiter bit correctly by re-evaluating it. + */ + if (lwp_release(&mlwpchan, &mwaiters, 0) > 0) + suword8_noerr(&mp->mutex_waiters, mwaiters); + } + lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL); + mlocked = 0; + no_fault(); + + if (mwatched) { + watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE); + mwatched = 0; + } + if (watched) { + watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE); + watched = 0; + } + + /* + * Put the LWP in an orderly state for debugging. + */ + prstop(PR_REQUESTED, 0); + if (timedwait) { + /* + * If we successfully queue the timeout, + * then don't drop t_delay_lock until + * we are on the sleep queue (below). + */ + mutex_enter(&t->t_delay_lock); + if (lwp_timer_enqueue(&lwpt) != 0) { + mutex_exit(&t->t_delay_lock); + imm_timeout = 1; + timedwait = NULL; + } + } + t->t_flag |= T_WAITCVSEM; + lwp_block(&lwpchan); + + /* + * Nothing should happen to cause the LWp to go to sleep until after + * it returns from swtch(). 
+ */ + if (timedwait) + mutex_exit(&t->t_delay_lock); + locked = 0; + lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL); + if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t)) + setrun(t); + swtch(); + + /* + * We're back, but we need to work out why. Were we interrupted? Did + * we timeout? Were we granted the lock? + */ + error = EAGAIN; + acquired = (t->t_writer & TRW_LOCK_GRANTED); + t->t_writer = 0; + t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE); + if (timedwait) + tim = lwp_timer_dequeue(&lwpt); + if (ISSIG(t, FORREAL) || lwp->lwp_sysabort || MUSTRETURN(p, t)) + error = EINTR; + else if (imm_timeout || (timedwait && tim == -1)) + error = ETIME; + lwp->lwp_asleep = 0; + lwp->lwp_sysabort = 0; + setallwatch(); + + /* + * If we were granted the lock we don't care about EINTR or ETIME. + */ + if (acquired) + error = 0; + + if (t->t_mstate == LMS_USER_LOCK) + (void) new_mstate(t, LMS_SYSTEM); + + if (error) + return (set_errno(error)); + return (0); + +out_drop: + /* + * Make sure that the user level lock is dropped before returning + * to the caller. + */ + if (!mlocked) { + lwpchan_lock(&mlwpchan, LWPCHAN_MPPOOL); + mlocked = 1; + } + suword32_noerr(&mp->mutex_ownerpid, 0); + ulock_clear(&mp->mutex_lockw); + fuword8_noerr(&mp->mutex_waiters, &mwaiters); + if (mwaiters != 0) { + /* + * See comment above on lock clearing and lwp_release() + * success/failure. + */ + if (lwp_release(&mlwpchan, &mwaiters, 0) > 0) + suword8_noerr(&mp->mutex_waiters, mwaiters); + } + lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL); + mlocked = 0; + +out_nodrop: + no_fault(); + if (mwatched) + watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE); + if (watched) + watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE); + if (t->t_mstate == LMS_USER_LOCK) + (void) new_mstate(t, LMS_SYSTEM); + if (error) + return (set_errno(error)); + return (0); +} + +/* + * We enter here holding the user-level mutex but, unlike lwp_rwlock_lock(), + * we never drop the lock. + */ +static int +lwp_rwlock_unlock(lwp_rwlock_t *rw) +{ + kthread_t *t = curthread; + proc_t *p = ttoproc(t); + lwpchan_t lwpchan; + volatile uint16_t type = 0; + volatile int error = 0; + volatile int locked = 0; + volatile int watched = 0; + label_t ljb; + volatile int no_lwpchan = 1; + uint32_t rwstate; + + /* We only check rw because the mutex is included in it. */ + if ((caddr_t)rw >= p->p_as->a_userlimit) + return (set_errno(EFAULT)); + + if (on_fault(&ljb)) { + if (no_lwpchan) { + error = EFAULT; + goto out_nodrop; + } + if (locked) { + locked = 0; + lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL); + } + error = EFAULT; + goto out_nodrop; + } + + /* We can only continue for simple USYNC_PROCESS locks. */ + fuword16_noerr(&rw->rwlock_type, (uint16_t *)&type); + if (type != USYNC_PROCESS) { + error = EINVAL; + goto out_nodrop; + } + + /* Force Copy-on-write fault incase objects are MAP_PRIVATE. */ + suword16_noerr(&rw->rwlock_type, type); + + /* Convert user level rwlock, "rw", to a unique lwpchan. */ + if (!get_lwpchan(p->p_as, (caddr_t)rw, type, + &lwpchan, LWPCHAN_CVPOOL)) { + error = EFAULT; + goto out_nodrop; + } + + no_lwpchan = 0; + watched = watch_disable_addr((caddr_t)rw, sizeof (*rw), S_WRITE); + + lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL); + locked = 1; + + /* + * We can resolve multiple readers (except the last reader) here. + * For the last reader or a writer we need lwp_rwlock_release(), + * to which we also delegate the task of copying the new rwstate + * back to userland (see the comment there). 
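+	 *
+	 * The decision below, in sketch form:
+	 *
+	 *	if (write-locked or last active reader)
+	 *		lwp_rwlock_release()	hand off, store rwstate
+	 *	else
+	 *		rwstate--, store rwstate	reader fast path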
+ */ + fuword32_noerr(&rw->rwlock_readers, &rwstate); + if (rwstate & URW_WRITE_LOCKED) + lwp_rwlock_release(&lwpchan, rw); + else if ((rwstate & URW_READERS_MASK) > 0) { + rwstate--; + if ((rwstate & URW_READERS_MASK) == 0) + lwp_rwlock_release(&lwpchan, rw); + else + suword32_noerr(&rw->rwlock_readers, rwstate); + } + + lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL); + locked = 0; + error = 0; + +out_nodrop: + no_fault(); + if (watched) + watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE); + if (error) + return (set_errno(error)); + return (0); +} + +int +lwp_rwlock_sys(int subcode, lwp_rwlock_t *rwlp, timespec_t *tsp) +{ + switch (subcode) { + case 0: + return (lwp_rwlock_lock(rwlp, tsp, READ_LOCK)); + case 1: + return (lwp_rwlock_lock(rwlp, tsp, WRITE_LOCK)); + case 2: + return (lwp_rwlock_lock(rwlp, NULL, READ_LOCK_TRY)); + case 3: + return (lwp_rwlock_lock(rwlp, NULL, WRITE_LOCK_TRY)); + case 4: + return (lwp_rwlock_unlock(rwlp)); + } + return (set_errno(EINVAL)); +} + +/* + * Return the owner of the user-level s-object. + * Since we can't really do this, return NULL. + */ +/* ARGSUSED */ +static kthread_t * +lwpsobj_owner(caddr_t sobj) +{ + return ((kthread_t *)NULL); +} + +/* + * Wake up a thread asleep on a user-level synchronization + * object. + */ +static void +lwp_unsleep(kthread_t *t) +{ + ASSERT(THREAD_LOCK_HELD(t)); + if (t->t_wchan0 != NULL) { + sleepq_head_t *sqh; + sleepq_t *sqp = t->t_sleepq; + + if (sqp != NULL) { + sqh = lwpsqhash(&t->t_lwpchan); + ASSERT(&sqh->sq_queue == sqp); + sleepq_unsleep(t); + disp_lock_exit_high(&sqh->sq_lock); + CL_SETRUN(t); + return; + } + } + panic("lwp_unsleep: thread %p not on sleepq", (void *)t); +} + +/* + * Change the priority of a thread asleep on a user-level + * synchronization object. To maintain proper priority order, + * we: + * o dequeue the thread. + * o change its priority. + * o re-enqueue the thread. + * Assumption: the thread is locked on entry. 
+ */
+static void
+lwp_change_pri(kthread_t *t, pri_t pri, pri_t *t_prip)
+{
+ ASSERT(THREAD_LOCK_HELD(t));
+ if (t->t_wchan0 != NULL) {
+ sleepq_t *sqp = t->t_sleepq;
+
+ sleepq_dequeue(t);
+ *t_prip = pri;
+ sleepq_insert(sqp, t);
+ } else
+ panic("lwp_change_pri: %p not on a sleep queue", (void *)t);
+}
+
+/*
+ * Clean up a locked robust mutex
+ */
+static void
+lwp_mutex_cleanup(lwpchan_entry_t *ent, uint16_t lockflg)
+{
+ uint16_t flag;
+ uchar_t waiters;
+ label_t ljb;
+ pid_t owner_pid;
+ lwp_mutex_t *lp;
+ volatile int locked = 0;
+ volatile int watched = 0;
+
+ ASSERT(ent->lwpchan_type & USYNC_PROCESS_ROBUST);
+
+ lp = (lwp_mutex_t *)ent->lwpchan_addr;
+ watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
+ if (on_fault(&ljb)) {
+ if (locked)
+ lwpchan_unlock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
+ goto out;
+ }
+ fuword32_noerr(&lp->mutex_ownerpid, (uint32_t *)&owner_pid);
+ if (owner_pid != curproc->p_pid) {
+ goto out;
+ }
+ lwpchan_lock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
+ locked = 1;
+ fuword16_noerr(&lp->mutex_flag, &flag);
+ if ((flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) == 0) {
+ flag |= lockflg;
+ suword16_noerr(&lp->mutex_flag, flag);
+ }
+ suword32_noerr(&lp->mutex_ownerpid, 0);
+ ulock_clear(&lp->mutex_lockw);
+ fuword8_noerr(&lp->mutex_waiters, &waiters);
+ if (waiters && lwp_release(&ent->lwpchan_lwpchan, &waiters, 0))
+ suword8_noerr(&lp->mutex_waiters, waiters);
+ lwpchan_unlock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
+out:
+ no_fault();
+ if (watched)
+ watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
+}
+
+/*
+ * Register the mutex and initialize the mutex if it is not already initialized
+ */
+int
+lwp_mutex_init(lwp_mutex_t *lp, int type)
+{
+ proc_t *p = curproc;
+ int error = 0;
+ volatile int locked = 0;
+ volatile int watched = 0;
+ label_t ljb;
+ uint16_t flag;
+ lwpchan_t lwpchan;
+ pid_t owner_pid;
+
+ if ((caddr_t)lp >= (caddr_t)USERLIMIT)
+ return (set_errno(EFAULT));
+
+ if (type != USYNC_PROCESS_ROBUST)
+ return (set_errno(EINVAL));
+
+ watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
+
+ if (on_fault(&ljb)) {
+ if (locked)
+ lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
+ error = EFAULT;
+ goto out;
+ }
+ /*
+ * Force Copy-on-write fault if lwp_mutex_t object is
+ * defined to be MAP_PRIVATE and it was initialized to
+ * USYNC_PROCESS.
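+ *
+ * (The store of the type byte back to its existing value is what
+ * forces the fault: suword8_noerr() below writes to the user page,
+ * so a MAP_PRIVATE mapping gets its own private copy before the
+ * address is used to compute the lwpchan.)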
+ */
+ suword8_noerr(&lp->mutex_type, type);
+ if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
+ &lwpchan, LWPCHAN_MPPOOL)) {
+ error = EFAULT;
+ goto out;
+ }
+ lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
+ locked = 1;
+ fuword16_noerr(&lp->mutex_flag, &flag);
+ if (flag & LOCK_INITED) {
+ if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
+ fuword32_noerr(&lp->mutex_ownerpid,
+ (uint32_t *)&owner_pid);
+ if (owner_pid == p->p_pid) {
+ flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
+ suword16_noerr(&lp->mutex_flag, flag);
+ locked = 0;
+ lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
+ goto out;
+ }
+ }
+ error = EBUSY;
+ } else {
+ suword8_noerr(&lp->mutex_waiters, 0);
+ suword8_noerr(&lp->mutex_lockw, 0);
+ suword16_noerr(&lp->mutex_flag, LOCK_INITED);
+ suword32_noerr(&lp->mutex_ownerpid, 0);
+ }
+ locked = 0;
+ lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
+out:
+ no_fault();
+ if (watched)
+ watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
+ if (error)
+ return (set_errno(error));
+ return (0);
+}
+
+int
+lwp_mutex_trylock(lwp_mutex_t *lp)
+{
+ kthread_t *t = curthread;
+ proc_t *p = ttoproc(t);
+ int error = 0;
+ volatile int locked = 0;
+ volatile int watched = 0;
+ label_t ljb;
+ volatile uint8_t type = 0;
+ uint16_t flag;
+ lwpchan_t lwpchan;
+
+ if ((caddr_t)lp >= p->p_as->a_userlimit)
+ return (set_errno(EFAULT));
+
+ (void) new_mstate(t, LMS_USER_LOCK);
+
+ if (on_fault(&ljb)) {
+ if (locked)
+ lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
+ error = EFAULT;
+ goto out;
+ }
+ fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
+ if (UPIMUTEX(type)) {
+ no_fault();
+ error = lwp_upimutex_lock(lp, type, UPIMUTEX_TRY, NULL);
+ if ((error == 0 || error == EOWNERDEAD) &&
+ (type & USYNC_PROCESS))
+ (void) suword32(&lp->mutex_ownerpid, p->p_pid);
+ if (error)
+ return (set_errno(error));
+ return (0);
+ }
+ /*
+ * Force Copy-on-write fault if lwp_mutex_t object is
+ * defined to be MAP_PRIVATE and it was initialized to
+ * USYNC_PROCESS.
+ */
+ suword8_noerr(&lp->mutex_type, type);
+ if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
+ &lwpchan, LWPCHAN_MPPOOL)) {
+ error = EFAULT;
+ goto out;
+ }
+ lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
+ locked = 1;
+ if (type & USYNC_PROCESS_ROBUST) {
+ fuword16_noerr((uint16_t *)(&lp->mutex_flag), &flag);
+ if (flag & LOCK_NOTRECOVERABLE) {
+ lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
+ error = ENOTRECOVERABLE;
+ goto out;
+ }
+ }
+
+ watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
+
+ if (!ulock_try(&lp->mutex_lockw))
+ error = EBUSY;
+ else if (type & (USYNC_PROCESS | USYNC_PROCESS_ROBUST)) {
+ suword32_noerr(&lp->mutex_ownerpid, p->p_pid);
+ if (type & USYNC_PROCESS_ROBUST) {
+ if (flag & LOCK_OWNERDEAD)
+ error = EOWNERDEAD;
+ else if (flag & LOCK_UNMAPPED)
+ error = ELOCKUNMAPPED;
+ }
+ }
+ locked = 0;
+ lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
+out:
+
+ if (t->t_mstate == LMS_USER_LOCK)
+ (void) new_mstate(t, LMS_SYSTEM);
+
+ no_fault();
+ if (watched)
+ watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
+ if (error)
+ return (set_errno(error));
+ return (0);
+}
+
+/*
+ * Unlock the mutex and unblock lwps that are trying to acquire this mutex.
+ * The blocked lwps resume and retry to acquire the lock.
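+ *
+ * A rough sketch of the user-level side (hypothetical code, not the
+ * actual libc implementation) shows why waking is enough:
+ *
+ *	while (!try_lock_byte(&lp->mutex_lockw)) {
+ *		lp->mutex_waiters = 1;
+ *		___lwp_mutex_timedlock(lp, NULL);	(may sleep)
+ *	}
+ *
+ * Ownership is never handed off directly; the woken lwp simply
+ * competes for the lock again.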
+ */ +int +lwp_mutex_unlock(lwp_mutex_t *lp) +{ + proc_t *p = ttoproc(curthread); + lwpchan_t lwpchan; + uchar_t waiters; + volatile int locked = 0; + volatile int watched = 0; + volatile uint8_t type = 0; + label_t ljb; + uint16_t flag; + int error = 0; + + if ((caddr_t)lp >= p->p_as->a_userlimit) + return (set_errno(EFAULT)); + + if (on_fault(&ljb)) { + if (locked) + lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL); + error = EFAULT; + goto out; + } + fuword8_noerr(&lp->mutex_type, (uint8_t *)&type); + if (UPIMUTEX(type)) { + no_fault(); + error = lwp_upimutex_unlock(lp, type); + if (error) + return (set_errno(error)); + return (0); + } + + watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE); + + /* + * Force Copy-on-write fault if lwp_mutex_t object is + * defined to be MAP_PRIVATE, and type is USYNC_PROCESS + */ + suword8_noerr(&lp->mutex_type, type); + if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type, + &lwpchan, LWPCHAN_MPPOOL)) { + error = EFAULT; + goto out; + } + lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL); + locked = 1; + if (type & (USYNC_PROCESS | USYNC_PROCESS_ROBUST)) { + if (type & USYNC_PROCESS_ROBUST) { + fuword16_noerr(&lp->mutex_flag, &flag); + if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) { + flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED); + flag |= LOCK_NOTRECOVERABLE; + suword16_noerr(&lp->mutex_flag, flag); + } + } + suword32_noerr(&lp->mutex_ownerpid, 0); + } + ulock_clear(&lp->mutex_lockw); + /* + * Always wake up an lwp (if any) waiting on lwpchan. The woken lwp will + * re-try the lock in lwp_mutex_timedlock(). The call to lwp_release() + * may fail. If it fails, do not write into the waiter bit. + * The call to lwp_release() might fail due to one of three reasons: + * + * 1. due to the thread which set the waiter bit not actually + * sleeping since it got the lock on the re-try. The waiter + * bit will then be correctly updated by that thread. This + * window may be closed by reading the wait bit again here + * and not calling lwp_release() at all if it is zero. + * 2. the thread which set the waiter bit and went to sleep + * was woken up by a signal. This time, the waiter recomputes + * the wait bit in the return with EINTR code. + * 3. the waiter bit read by lwp_mutex_wakeup() was in + * memory that has been re-used after the lock was dropped. + * In this case, writing into the waiter bit would cause data + * corruption. + */ + fuword8_noerr(&lp->mutex_waiters, &waiters); + if (waiters) { + if ((type & USYNC_PROCESS_ROBUST) && + (flag & LOCK_NOTRECOVERABLE)) { + lwp_release_all(&lwpchan); + suword8_noerr(&lp->mutex_waiters, 0); + } else if (lwp_release(&lwpchan, &waiters, 0) == 1) { + suword8_noerr(&lp->mutex_waiters, waiters); + } + } + + lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL); +out: + no_fault(); + if (watched) + watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE); + if (error) + return (set_errno(error)); + return (0); +} diff --git a/usr/src/uts/common/syscall/lwp_timer.c b/usr/src/uts/common/syscall/lwp_timer.c new file mode 100644 index 0000000000..7c1d862bea --- /dev/null +++ b/usr/src/uts/common/syscall/lwp_timer.c @@ -0,0 +1,216 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/proc.h> +#include <sys/systm.h> +#include <sys/debug.h> +#include <sys/mutex.h> +#include <sys/timer.h> +#include <sys/lwp_timer_impl.h> + +/* + * lwp_timer_timeout() is called from a timeout set up in lwp_cond_wait(), + * lwp_mutex_timedlock(), lwp_sema_timedwait() or lwp_rwlock_lock(). + * + * It recomputes the time remaining until the absolute time when the + * wait is supposed to timeout and either calls realtime_timeout() + * to reschedule itself or calls setrun() on the sleeping thread. + * + * This is done to ensure that the waiting thread does not wake up + * due to timer expiration until the absolute future time of the + * timeout has been reached. Until that time, the thread must + * remain on its sleep queue. + * + * An lwp_timer_t structure is used to pass information + * about the sleeping thread to the timeout function. + */ + +static void +lwp_timer_timeout(void *arg) +{ + lwp_timer_t *lwptp = arg; + kthread_t *t = lwptp->lwpt_thread; + timespec_t now; + + mutex_enter(&t->t_delay_lock); + gethrestime(&now); + /* + * Requeue the timeout if no one has reset the system time + * and if the absolute future time has not been reached. + */ + if (lwptp->lwpt_timecheck == timechanged && + (lwptp->lwpt_rqtime.tv_sec > now.tv_sec || + (lwptp->lwpt_rqtime.tv_sec == now.tv_sec && + lwptp->lwpt_rqtime.tv_nsec > now.tv_nsec))) { + lwptp->lwpt_id = realtime_timeout(lwp_timer_timeout, lwptp, + timespectohz_adj(&lwptp->lwpt_rqtime, now)); + } else { + /* + * Set the thread running only if it is asleep on + * its lwpchan sleep queue (not if it is asleep on + * the t_delay_lock mutex). + */ + thread_lock(t); + if (t->t_state == TS_SLEEP && + (t->t_flag & T_WAKEABLE) && + t->t_wchan0 != NULL) + setrun_locked(t); + thread_unlock(t); + } + mutex_exit(&t->t_delay_lock); +} + +int +lwp_timer_copyin(lwp_timer_t *lwptp, timespec_t *tsp) +{ + timespec_t now; + int error = 0; + + if (tsp == NULL) /* not really an error, just need to bzero() */ + goto err; + lwptp->lwpt_timecheck = timechanged; /* do this before gethrestime() */ + gethrestime(&now); /* do this before copyin() */ + if (curproc->p_model == DATAMODEL_NATIVE) { + if (copyin(tsp, &lwptp->lwpt_rqtime, sizeof (timespec_t))) { + error = EFAULT; + goto err; + } + } else { + timespec32_t ts32; + if (copyin(tsp, &ts32, sizeof (timespec32_t))) { + error = EFAULT; + goto err; + } + TIMESPEC32_TO_TIMESPEC(&lwptp->lwpt_rqtime, &ts32); + } + if (itimerspecfix(&lwptp->lwpt_rqtime)) { + error = EINVAL; + goto err; + } + /* + * Unless the requested timeout is zero, + * get the precise future (absolute) time at + * which we are to time out and return ETIME. + * We must not return ETIME before that time. 
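+ *
+ * For example (illustrative values): if "now" is {1000, 900000000}
+ * and the caller passed a relative timeout of {0, 200000000}, the
+ * timespecadd() below makes lwpt_rqtime the absolute deadline
+ * {1001, 100000000}; ETIME may not be reported before that time.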
+ */ + if (lwptp->lwpt_rqtime.tv_sec == 0 && lwptp->lwpt_rqtime.tv_nsec == 0) { + bzero(lwptp, sizeof (lwp_timer_t)); + lwptp->lwpt_imm_timeout = 1; + } else { + lwptp->lwpt_thread = curthread; + lwptp->lwpt_tsp = tsp; + lwptp->lwpt_time_error = 0; + lwptp->lwpt_id = 0; + lwptp->lwpt_imm_timeout = 0; + timespecadd(&lwptp->lwpt_rqtime, &now); + } + return (0); +err: + bzero(lwptp, sizeof (lwp_timer_t)); + lwptp->lwpt_time_error = error; + return (error); +} + +int +lwp_timer_enqueue(lwp_timer_t *lwptp) +{ + timespec_t now; + + ASSERT(lwptp->lwpt_thread == curthread); + ASSERT(MUTEX_HELD(&curthread->t_delay_lock)); + gethrestime(&now); + if (lwptp->lwpt_timecheck == timechanged && + (lwptp->lwpt_rqtime.tv_sec > now.tv_sec || + (lwptp->lwpt_rqtime.tv_sec == now.tv_sec && + lwptp->lwpt_rqtime.tv_nsec > now.tv_nsec))) { + /* + * Queue the timeout. + */ + lwptp->lwpt_id = realtime_timeout(lwp_timer_timeout, lwptp, + timespectohz_adj(&lwptp->lwpt_rqtime, now)); + return (0); + } + + /* + * Time has already run out or someone reset the system time; + * just cause an immediate timeout. + */ + lwptp->lwpt_imm_timeout = 1; + return (1); +} + +clock_t +lwp_timer_dequeue(lwp_timer_t *lwptp) +{ + kthread_t *t = curthread; + clock_t tim = -1; + timeout_id_t tmp_id; + + mutex_enter(&t->t_delay_lock); + while ((tmp_id = lwptp->lwpt_id) != 0) { + lwptp->lwpt_id = 0; + mutex_exit(&t->t_delay_lock); + tim = untimeout(tmp_id); + mutex_enter(&t->t_delay_lock); + } + mutex_exit(&t->t_delay_lock); + return (tim); +} + +int +lwp_timer_copyout(lwp_timer_t *lwptp, int error) +{ + timespec_t rmtime; + timespec_t now; + + if (lwptp->lwpt_tsp == NULL) /* nothing to do */ + return (error); + + rmtime.tv_sec = rmtime.tv_nsec = 0; + if (error != ETIME) { + gethrestime(&now); + if ((now.tv_sec < lwptp->lwpt_rqtime.tv_sec) || + ((now.tv_sec == lwptp->lwpt_rqtime.tv_sec) && + (now.tv_nsec < lwptp->lwpt_rqtime.tv_nsec))) { + rmtime = lwptp->lwpt_rqtime; + timespecsub(&rmtime, &now); + } + } + if (curproc->p_model == DATAMODEL_NATIVE) { + if (copyout(&rmtime, lwptp->lwpt_tsp, sizeof (timespec_t))) + error = EFAULT; + } else { + timespec32_t rmtime32; + + TIMESPEC_TO_TIMESPEC32(&rmtime32, &rmtime); + if (copyout(&rmtime32, lwptp->lwpt_tsp, sizeof (timespec32_t))) + error = EFAULT; + } + + return (error); +} diff --git a/usr/src/uts/common/syscall/lwpsys.c b/usr/src/uts/common/syscall/lwpsys.c new file mode 100644 index 0000000000..8868468a44 --- /dev/null +++ b/usr/src/uts/common/syscall/lwpsys.c @@ -0,0 +1,563 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/prsystm.h> +#include <sys/cred.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/signal.h> +#include <sys/kmem.h> +#include <sys/unistd.h> +#include <sys/cmn_err.h> +#include <sys/schedctl.h> +#include <sys/debug.h> +#include <sys/contract/process_impl.h> + +kthread_t * +idtot(proc_t *p, id_t lwpid) +{ + lwpdir_t *ldp; + + if ((ldp = lwp_hash_lookup(p, lwpid)) != NULL) + return (ldp->ld_entry->le_thread); + return (NULL); +} + +/* + * Stop an lwp of the current process + */ +int +syslwp_suspend(id_t lwpid) +{ + kthread_t *t; + int error; + proc_t *p = ttoproc(curthread); + + mutex_enter(&p->p_lock); + if ((t = idtot(p, lwpid)) == NULL) + error = ESRCH; + else + error = lwp_suspend(t); + mutex_exit(&p->p_lock); + if (error) + return (set_errno(error)); + return (0); +} + +int +syslwp_continue(id_t lwpid) +{ + kthread_t *t; + proc_t *p = ttoproc(curthread); + + mutex_enter(&p->p_lock); + if ((t = idtot(p, lwpid)) == NULL) { + mutex_exit(&p->p_lock); + return (set_errno(ESRCH)); + } + lwp_continue(t); + mutex_exit(&p->p_lock); + return (0); +} + +int +lwp_kill(id_t lwpid, int sig) +{ + sigqueue_t *sqp; + kthread_t *t; + proc_t *p = ttoproc(curthread); + + if (sig < 0 || sig >= NSIG) + return (set_errno(EINVAL)); + if (sig != 0) + sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); + mutex_enter(&p->p_lock); + if ((t = idtot(p, lwpid)) == NULL) { + mutex_exit(&p->p_lock); + if (sig != 0) + kmem_free(sqp, sizeof (sigqueue_t)); + return (set_errno(ESRCH)); + } + if (sig == 0) { + mutex_exit(&p->p_lock); + return (0); + } + sqp->sq_info.si_signo = sig; + sqp->sq_info.si_code = SI_LWP; + sqp->sq_info.si_pid = p->p_pid; + sqp->sq_info.si_ctid = PRCTID(p); + sqp->sq_info.si_zoneid = getzoneid(); + sqp->sq_info.si_uid = crgetruid(CRED()); + sigaddqa(p, t, sqp); + mutex_exit(&p->p_lock); + return (0); +} + +/* + * This is the specification of lwp_wait() from the _lwp_wait(2) manual page: + * + * The lwp_wait() function blocks the current lwp until the lwp specified + * by 'lwpid' terminates. If the specified lwp terminated prior to the call + * to lwp_wait(), then lwp_wait() returns immediately. If 'lwpid' is zero, + * then lwp_wait() waits for any undetached lwp in the current process. + * If 'lwpid' is not zero, then it must specify an undetached lwp in the + * current process. If 'departed' is not NULL, then it points to a location + * where the id of the exited lwp is stored. + * + * When an lwp exits and there are one or more lwps in the process waiting + * for this specific lwp to exit, then one of the waiting lwps is unblocked + * and it returns from lwp_wait() successfully. Any other lwps waiting for + * this same lwp to exit are also unblocked, however, they return from + * lwp_wait() with the error ESRCH. If there are no lwps in the process + * waiting for this specific lwp to exit but there are one or more lwps + * waiting for any lwp to exit, then one of the waiting lwps is unblocked + * and it returns from lwp_wait() successfully. + * + * If an lwp is waiting for any lwp to exit, it blocks until an undetached + * lwp for which no other lwp is waiting terminates, at which time it returns + * successfully, or until all other lwps in the process are either daemon + * lwps or lwps waiting in lwp_wait(), in which case it returns EDEADLK. 
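+ *
+ * For instance, a caller reaping any exited lwp might do (sketch
+ * based on the _lwp_wait(2) interface described above):
+ *
+ *	id_t departed;
+ *	if (_lwp_wait(0, &departed) == 0)
+ *		(departed now holds the id of the reaped lwp)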
+ */ +int +lwp_wait(id_t lwpid, id_t *departed) +{ + proc_t *p = ttoproc(curthread); + int error = 0; + int daemon = (curthread->t_proc_flag & TP_DAEMON)? 1 : 0; + lwpent_t *target_lep; + lwpdir_t *ldp; + lwpent_t *lep; + + /* + * lwp_wait() is not supported for the /proc agent lwp. + */ + if (curthread == p->p_agenttp) + return (set_errno(ENOTSUP)); + + mutex_enter(&p->p_lock); + prbarrier(p); + + curthread->t_waitfor = lwpid; + p->p_lwpwait++; + p->p_lwpdwait += daemon; + + if (lwpid != 0) { + if ((ldp = lwp_hash_lookup(p, lwpid)) == NULL) + target_lep = NULL; + else { + target_lep = ldp->ld_entry; + target_lep->le_waiters++; + target_lep->le_dwaiters += daemon; + } + } + + while (error == 0) { + kthread_t *t; + id_t tid; + int i; + + if (lwpid != 0) { + /* + * Look for a specific zombie lwp. + */ + if (target_lep == NULL) + error = ESRCH; + else if ((t = target_lep->le_thread) != NULL) { + if (!(t->t_proc_flag & TP_TWAIT)) + error = EINVAL; + } else { + /* + * We found the zombie we are waiting for. + */ + ASSERT(p->p_zombcnt > 0); + p->p_zombcnt--; + p->p_lwpwait--; + p->p_lwpdwait -= daemon; + curthread->t_waitfor = -1; + lwp_hash_out(p, lwpid); + mutex_exit(&p->p_lock); + if (departed != NULL && + copyout(&lwpid, departed, sizeof (id_t))) + return (set_errno(EFAULT)); + return (0); + } + } else { + /* + * Look for any zombie lwp. + */ + int some_non_daemon_will_return = 0; + + /* for each entry in the lwp directory... */ + ldp = p->p_lwpdir; + for (i = 0; i < p->p_lwpdir_sz; i++, ldp++) { + + if ((lep = ldp->ld_entry) == NULL || + lep->le_thread != NULL) + continue; + + /* + * We found a zombie lwp. If there is some + * other thread waiting specifically for the + * zombie we just found, then defer to the other + * waiting thread and continue searching for + * another zombie. Also check to see if there + * is some non-daemon thread sleeping here in + * lwp_wait() that will succeed and return when + * we drop p->p_lock. This is tested below. + */ + tid = lep->le_lwpid; + if (lep->le_waiters != 0) { + if (lep->le_waiters - lep->le_dwaiters) + some_non_daemon_will_return = 1; + continue; + } + + /* + * We found a zombie that no one else + * is specifically waiting for. + */ + ASSERT(p->p_zombcnt > 0); + p->p_zombcnt--; + p->p_lwpwait--; + p->p_lwpdwait -= daemon; + curthread->t_waitfor = -1; + lwp_hash_out(p, tid); + mutex_exit(&p->p_lock); + if (departed != NULL && + copyout(&tid, departed, sizeof (id_t))) + return (set_errno(EFAULT)); + return (0); + } + + /* + * We are waiting for anyone. If all non-daemon lwps + * are waiting here, and if we determined above that + * no non-daemon lwp will return, we have deadlock. + */ + if (!some_non_daemon_will_return && + p->p_lwpcnt == p->p_lwpdaemon + + (p->p_lwpwait - p->p_lwpdwait)) + error = EDEADLK; + } + + if (error == 0 && lwpid != 0) { + /* + * We are waiting for a specific non-zombie lwp. + * Fail if there is a deadlock loop. + */ + for (;;) { + if (t == curthread) { + error = EDEADLK; + break; + } + /* who is he waiting for? */ + if ((tid = t->t_waitfor) == -1) + break; + if (tid == 0) { + /* + * The lwp we are waiting for is + * waiting for anyone (transitively). + * If there are no zombies right now + * and if we would have deadlock due + * to all non-daemon lwps waiting here, + * wake up the lwp that is waiting for + * anyone so it can return EDEADLK. 
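+ *
+ * Concretely: if lwp A blocks in lwp_wait(0) and we then block in
+ * lwp_wait(A), A is (transitively) waiting for us. With no zombies
+ * and every other non-daemon lwp also waiting here, neither wait
+ * can ever succeed, so we broadcast on p_lwpexit and A wakes up to
+ * return EDEADLK.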
+ */ + if (p->p_zombcnt == 0 && + p->p_lwpcnt == p->p_lwpdaemon + + p->p_lwpwait - p->p_lwpdwait) + cv_broadcast(&p->p_lwpexit); + break; + } + if ((ldp = lwp_hash_lookup(p, tid)) == NULL || + (t = ldp->ld_entry->le_thread) == NULL) + break; + } + } + + if (error) + break; + + /* + * Wait for some lwp to terminate. + */ + if (!cv_wait_sig(&p->p_lwpexit, &p->p_lock)) + error = EINTR; + prbarrier(p); + + if (lwpid != 0) { + if ((ldp = lwp_hash_lookup(p, lwpid)) == NULL) + target_lep = NULL; + else + target_lep = ldp->ld_entry; + } + } + + if (lwpid != 0 && target_lep != NULL) { + target_lep->le_waiters--; + target_lep->le_dwaiters -= daemon; + } + p->p_lwpwait--; + p->p_lwpdwait -= daemon; + curthread->t_waitfor = -1; + mutex_exit(&p->p_lock); + return (set_errno(error)); +} + +int +lwp_detach(id_t lwpid) +{ + kthread_t *t; + proc_t *p = ttoproc(curthread); + lwpdir_t *ldp; + int error = 0; + + mutex_enter(&p->p_lock); + prbarrier(p); + if ((ldp = lwp_hash_lookup(p, lwpid)) == NULL) + error = ESRCH; + else if ((t = ldp->ld_entry->le_thread) != NULL) { + if (!(t->t_proc_flag & TP_TWAIT)) + error = EINVAL; + else { + t->t_proc_flag &= ~TP_TWAIT; + cv_broadcast(&p->p_lwpexit); + } + } else { + ASSERT(p->p_zombcnt > 0); + p->p_zombcnt--; + lwp_hash_out(p, lwpid); + } + mutex_exit(&p->p_lock); + + if (error) + return (set_errno(error)); + return (0); +} + +/* + * Unpark the specified lwp. + */ +static int +lwp_unpark(id_t lwpid) +{ + proc_t *p = ttoproc(curthread); + kthread_t *t; + int error = 0; + + mutex_enter(&p->p_lock); + if ((t = idtot(p, lwpid)) == NULL) + error = ESRCH; + else { + mutex_enter(&t->t_delay_lock); + t->t_unpark = 1; + cv_signal(&t->t_delay_cv); + mutex_exit(&t->t_delay_lock); + } + mutex_exit(&p->p_lock); + return (error); +} + +/* + * Sleep until we are set running by lwp_unpark() or until we are + * interrupted by a signal or until we exhaust our timeout. + * timeoutp is an in/out parameter. On entry, it contains the relative + * time until timeout. On exit, we copyout the residual time left to it. + */ +static int +lwp_park(timespec_t *timeoutp, id_t lwpid) +{ + timespec_t rqtime; + timespec_t rmtime; + timespec_t now; + timespec_t *rqtp = NULL; + kthread_t *t = curthread; + int timecheck = 0; + int error = 0; + model_t datamodel = ttoproc(t)->p_model; + + if (lwpid != 0) /* unpark the other lwp, if any */ + (void) lwp_unpark(lwpid); + + if (timeoutp) { + timecheck = timechanged; + gethrestime(&now); + if (datamodel == DATAMODEL_NATIVE) { + if (copyin(timeoutp, &rqtime, sizeof (timespec_t))) { + error = EFAULT; + goto out; + } + } else { + timespec32_t timeout32; + + if (copyin(timeoutp, &timeout32, sizeof (timeout32))) { + error = EFAULT; + goto out; + } + TIMESPEC32_TO_TIMESPEC(&rqtime, &timeout32) + } + + if (itimerspecfix(&rqtime)) { + error = EINVAL; + goto out; + } + /* + * Convert the timespec value into absolute time. 
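+ *
+ * E.g. (illustrative values) a relative timeout of {0, 500000000}
+ * read when "now" is {2000, 700000000} becomes the absolute
+ * deadline {2001, 200000000}, which cv_waituntil_sig() checks
+ * against the clock on each wakeup.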
+ */ + timespecadd(&rqtime, &now); + rqtp = &rqtime; + } + + (void) new_mstate(t, LMS_USER_LOCK); + + mutex_enter(&t->t_delay_lock); + if (!schedctl_is_park()) + error = EINTR; + while (error == 0 && t->t_unpark == 0) { + switch (cv_waituntil_sig(&t->t_delay_cv, + &t->t_delay_lock, rqtp, timecheck)) { + case 0: + error = EINTR; + break; + case -1: + error = ETIME; + break; + } + } + t->t_unpark = 0; + mutex_exit(&t->t_delay_lock); + + if (timeoutp != NULL) { + rmtime.tv_sec = rmtime.tv_nsec = 0; + if (error != ETIME) { + gethrestime(&now); + if ((now.tv_sec < rqtime.tv_sec) || + ((now.tv_sec == rqtime.tv_sec) && + (now.tv_nsec < rqtime.tv_nsec))) { + rmtime = rqtime; + timespecsub(&rmtime, &now); + } + } + if (datamodel == DATAMODEL_NATIVE) { + if (copyout(&rmtime, timeoutp, sizeof (rmtime))) + error = EFAULT; + } else { + timespec32_t rmtime32; + + TIMESPEC_TO_TIMESPEC32(&rmtime32, &rmtime); + if (copyout(&rmtime32, timeoutp, sizeof (rmtime32))) + error = EFAULT; + } + } +out: + schedctl_unpark(); + if (t->t_mstate == LMS_USER_LOCK) + (void) new_mstate(t, LMS_SYSTEM); + return (error); +} + +#define MAXLWPIDS 1024 + +/* + * Unpark all of the specified lwps. + * Do it in chunks of MAXLWPIDS to avoid allocating too much memory. + */ +static int +lwp_unpark_all(id_t *lwpidp, int nids) +{ + proc_t *p = ttoproc(curthread); + kthread_t *t; + int error = 0; + id_t *lwpid; + size_t lwpidsz; + int n; + int i; + + if (nids <= 0) + return (EINVAL); + + lwpidsz = MIN(nids, MAXLWPIDS) * sizeof (id_t); + lwpid = kmem_alloc(lwpidsz, KM_SLEEP); + while (nids > 0) { + n = MIN(nids, MAXLWPIDS); + if (copyin(lwpidp, lwpid, n * sizeof (id_t))) { + error = EFAULT; + break; + } + mutex_enter(&p->p_lock); + for (i = 0; i < n; i++) { + if ((t = idtot(p, lwpid[i])) == NULL) + error = ESRCH; + else { + mutex_enter(&t->t_delay_lock); + t->t_unpark = 1; + cv_signal(&t->t_delay_cv); + mutex_exit(&t->t_delay_lock); + } + } + mutex_exit(&p->p_lock); + lwpidp += n; + nids -= n; + } + kmem_free(lwpid, lwpidsz); + return (error); +} + +/* + * SYS_lwp_park() system call. + */ +int +syslwp_park(int which, uintptr_t arg1, uintptr_t arg2) +{ + int error; + + switch (which) { + case 0: + error = lwp_park((timespec_t *)arg1, (id_t)arg2); + break; + case 1: + error = lwp_unpark((id_t)arg1); + break; + case 2: + error = lwp_unpark_all((id_t *)arg1, (int)arg2); + break; + default: + error = EINVAL; + break; + } + + if (error) + return (set_errno(error)); + return (0); +} diff --git a/usr/src/uts/common/syscall/memcntl.c b/usr/src/uts/common/syscall/memcntl.c new file mode 100644 index 0000000000..5e162c2002 --- /dev/null +++ b/usr/src/uts/common/syscall/memcntl.c @@ -0,0 +1,394 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/bitmap.h>
+#include <sys/sysmacros.h>
+#include <sys/kmem.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/user.h>
+#include <sys/unistd.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/mman.h>
+#include <sys/tuneable.h>
+#include <sys/cmn_err.h>
+#include <sys/cred.h>
+#include <sys/vmsystm.h>
+#include <sys/debug.h>
+#include <sys/policy.h>
+
+#include <vm/as.h>
+#include <vm/seg.h>
+
+static uint_t mem_getpgszc(size_t);
+
+/*
+ * Memory control operations
+ */
+int
+memcntl(caddr_t addr, size_t len, int cmd, caddr_t arg, int attr, int mask)
+{
+ struct as *as = ttoproc(curthread)->p_as;
+ struct proc *p = ttoproc(curthread);
+ size_t pgsz;
+ uint_t szc, oszc, pgcmd;
+ int error = 0;
+ faultcode_t fc;
+ uintptr_t iarg;
+ STRUCT_DECL(memcntl_mha, mha);
+
+ if (mask)
+ return (set_errno(EINVAL));
+ if ((cmd == MC_LOCKAS) || (cmd == MC_UNLOCKAS)) {
+ if ((addr != 0) || (len != 0)) {
+ return (set_errno(EINVAL));
+ }
+ } else if (cmd != MC_HAT_ADVISE) {
+ if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0) {
+ return (set_errno(EINVAL));
+ }
+ /*
+ * We're only concerned with the address range
+ * here, not the protections. The protections
+ * are only used as a "filter" in this code,
+ * they aren't set or modified here.
+ */
+ if (valid_usr_range(addr, len, 0, as,
+ as->a_userlimit) != RANGE_OKAY) {
+ return (set_errno(ENOMEM));
+ }
+ }
+
+ if (cmd == MC_HAT_ADVISE) {
+ if (attr != 0 || mask != 0) {
+ return (set_errno(EINVAL));
+ }
+
+ } else {
+ if ((VALID_ATTR & attr) != attr) {
+ return (set_errno(EINVAL));
+ }
+ if ((attr & SHARED) && (attr & PRIVATE)) {
+ return (set_errno(EINVAL));
+ }
+ if (((cmd == MC_LOCKAS) || (cmd == MC_LOCK) ||
+ (cmd == MC_UNLOCKAS) || (cmd == MC_UNLOCK)) &&
+ (error = secpolicy_lock_memory(CRED())) != 0)
+ return (set_errno(error));
+ }
+ if (attr) {
+ attr |= PROT_USER;
+ }
+
+ switch (cmd) {
+ case MC_SYNC:
+ /*
+ * MS_SYNC used to be defined to be zero but is now non-zero.
+ * For binary compatibility we still accept zero
+ * (the absence of MS_ASYNC) to mean the same thing.
+ */
+ iarg = (uintptr_t)arg;
+ if ((iarg & ~MS_INVALIDATE) == 0)
+ iarg |= MS_SYNC;
+
+ if (((iarg & ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE)) != 0) ||
+ ((iarg & (MS_SYNC|MS_ASYNC)) == (MS_SYNC|MS_ASYNC))) {
+ error = set_errno(EINVAL);
+ } else {
+ error = as_ctl(as, addr, len, cmd, attr, iarg, NULL, 0);
+ if (error) {
+ (void) set_errno(error);
+ }
+ }
+ return (error);
+ case MC_LOCKAS:
+ if ((uintptr_t)arg & ~(MCL_FUTURE|MCL_CURRENT) ||
+ (uintptr_t)arg == 0) {
+ return (set_errno(EINVAL));
+ }
+ break;
+ case MC_LOCK:
+ case MC_UNLOCKAS:
+ case MC_UNLOCK:
+ break;
+ case MC_HAT_ADVISE:
+ /*
+ * Set preferred page size.
+ */
+ STRUCT_INIT(mha, get_udatamodel());
+ if (copyin(arg, STRUCT_BUF(mha), STRUCT_SIZE(mha))) {
+ return (set_errno(EFAULT));
+ }
+
+ pgcmd = STRUCT_FGET(mha, mha_cmd);
+
+ /*
+ * Currently only MHA_MAPSIZE_VA, MHA_MAPSIZE_STACK
+ * and MHA_MAPSIZE_BSSBRK are supported. Only one
+ * command may be specified at a time.
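+ *
+ * A typical caller (illustrative sketch, see also memcntl(2))
+ * requesting 4M pages for a mapped region might issue:
+ *
+ *	struct memcntl_mha mha;
+ *	mha.mha_cmd = MHA_MAPSIZE_VA;
+ *	mha.mha_flags = 0;
+ *	mha.mha_pagesize = 4 * 1024 * 1024;
+ *	(void) memcntl(addr, len, MC_HAT_ADVISE, (caddr_t)&mha, 0, 0);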
+ */
+ if ((~(MHA_MAPSIZE_VA|MHA_MAPSIZE_STACK|MHA_MAPSIZE_BSSBRK) &
+ pgcmd) || pgcmd == 0 || !ISP2(pgcmd) ||
+ STRUCT_FGET(mha, mha_flags))
+ return (set_errno(EINVAL));
+
+ pgsz = STRUCT_FGET(mha, mha_pagesize);
+
+ /*
+ * Call the platform-specific map_pgsz() routine to get the
+ * optimal pgsz if pgsz is 0.
+ *
+ * For stack and heap operations addr and len must be zero.
+ */
+ if ((pgcmd & (MHA_MAPSIZE_BSSBRK|MHA_MAPSIZE_STACK)) != 0) {
+ if (addr != NULL || len != 0) {
+ return (set_errno(EINVAL));
+ }
+
+ /*
+ * Disable autompss for this process unless pgsz == 0,
+ * which means the system should pick. In the
+ * pgsz == 0 case, leave the SAUTOLPG setting alone, as
+ * we don't want to enable it when someone has
+ * disabled automatic large page selection for the
+ * whole system.
+ */
+ mutex_enter(&p->p_lock);
+ if (pgsz != 0) {
+ p->p_flag &= ~SAUTOLPG;
+ }
+ mutex_exit(&p->p_lock);
+
+ as_rangelock(as);
+
+ if (pgsz == 0) {
+ int type;
+
+ if (pgcmd == MHA_MAPSIZE_BSSBRK)
+ type = MAPPGSZ_HEAP;
+ else
+ type = MAPPGSZ_STK;
+
+ pgsz = map_pgsz(type, p, 0, 0, NULL);
+ }
+ } else {
+ /*
+ * Note that we don't disable automatic large page
+ * selection for anon segments based on use of
+ * memcntl().
+ */
+ if (pgsz == 0) {
+ pgsz = map_pgsz(MAPPGSZ_VA, p, addr, len,
+ NULL);
+ }
+
+ /*
+ * addr and len must be preferred page size aligned
+ * and valid for the range specified.
+ */
+ if (!IS_P2ALIGNED(addr, pgsz) ||
+ !IS_P2ALIGNED(len, pgsz)) {
+ return (set_errno(EINVAL));
+ }
+ if (valid_usr_range(addr, len, 0, as,
+ as->a_userlimit) != RANGE_OKAY) {
+ return (set_errno(ENOMEM));
+ }
+ }
+
+ szc = mem_getpgszc(pgsz);
+ if (szc == (uint_t)-1) {
+ if ((pgcmd & (MHA_MAPSIZE_BSSBRK|MHA_MAPSIZE_STACK))
+ != 0) {
+ as_rangeunlock(as);
+ }
+ return (set_errno(EINVAL));
+ }
+
+ /*
+ * For stack and heap operations we first need to pad
+ * out existing range (create new mappings) to the new
+ * preferred page size boundary. Also the start of the
+ * .bss for the heap or user's stack base may not be on
+ * the new preferred page size boundary. For these cases
+ * we align the base of the request on the new preferred
+ * page size.
+ */
+ if (pgcmd & MHA_MAPSIZE_BSSBRK) {
+ if (szc == p->p_brkpageszc) {
+ as_rangeunlock(as);
+ return (0);
+ }
+ if (szc > p->p_brkpageszc) {
+ error = brk_internal(p->p_brkbase
+ + p->p_brksize, szc);
+ if (error) {
+ as_rangeunlock(as);
+ return (set_errno(error));
+ }
+ }
+ oszc = p->p_brkpageszc;
+ p->p_brkpageszc = szc;
+
+ ASSERT(IS_P2ALIGNED(p->p_brkbase + p->p_brksize, pgsz));
+ addr = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase,
+ pgsz);
+ len = (p->p_brkbase + p->p_brksize) - addr;
+ ASSERT(IS_P2ALIGNED(len, pgsz));
+ /*
+ * Perhaps no existing pages to promote.
+ */
+ if (len == 0) {
+ as_rangeunlock(as);
+ return (0);
+ }
+ }
+ /*
+ * The code below, as does grow.c, assumes stacks always grow
+ * downward.
+ */
+ if (pgcmd & MHA_MAPSIZE_STACK) {
+ /*
+ * Some boxes (x86) have a top of stack that
+ * is not large page aligned. Since stacks are
+ * usually small we'll just return and do nothing
+ * for these cases. Preferred page size is advisory,
+ * so no need to return an error.
+ */
+ if (szc == p->p_stkpageszc ||
+ !IS_P2ALIGNED(p->p_usrstack, pgsz)) {
+ as_rangeunlock(as);
+ return (0);
+ }
+
+ if (szc > p->p_stkpageszc) {
+ error = grow_internal(p->p_usrstack
+ - p->p_stksize, szc);
+ if (error) {
+ as_rangeunlock(as);
+ return (set_errno(error));
+ }
+ }
+ oszc = p->p_stkpageszc;
+ p->p_stkpageszc = szc;
+
+ ASSERT(IS_P2ALIGNED(p->p_usrstack, pgsz));
+ addr = p->p_usrstack - p->p_stksize;
+ len = p->p_stksize;
+
+ /*
+ * Perhaps nothing to promote: we wrapped around
+ * or grow did not grow the stack to a large
+ * page boundary.
+ */
+ if (!IS_P2ALIGNED(len, pgsz) || len == 0 ||
+ addr >= p->p_usrstack || (addr + len) < addr) {
+ as_rangeunlock(as);
+ return (0);
+ }
+ }
+ ASSERT(IS_P2ALIGNED(addr, pgsz));
+ ASSERT(IS_P2ALIGNED(len, pgsz));
+ error = as_setpagesize(as, addr, len, szc, B_TRUE);
+
+ /*
+ * On stack or heap failures restore the original
+ * page size code.
+ */
+ if (error) {
+ if ((pgcmd & MHA_MAPSIZE_BSSBRK) != 0) {
+ p->p_brkpageszc = oszc;
+ }
+ if ((pgcmd & MHA_MAPSIZE_STACK) != 0) {
+ p->p_stkpageszc = oszc;
+ }
+ (void) set_errno(error);
+ }
+ if ((pgcmd & (MHA_MAPSIZE_BSSBRK|MHA_MAPSIZE_STACK)) != 0) {
+ as_rangeunlock(as);
+ }
+ return (error);
+ case MC_ADVISE:
+ switch ((uintptr_t)arg) {
+ case MADV_WILLNEED:
+ fc = as_faulta(as, addr, len);
+ if (fc) {
+ if (FC_CODE(fc) == FC_OBJERR)
+ error = set_errno(FC_ERRNO(fc));
+ else if (FC_CODE(fc) == FC_NOMAP)
+ error = set_errno(ENOMEM);
+ else
+ error = set_errno(EINVAL);
+ return (error);
+ }
+ break;
+
+ case MADV_DONTNEED:
+ /*
+ * For now, don't need is turned into an as_ctl(MC_SYNC)
+ * operation flagged for async invalidate.
+ */
+ error = as_ctl(as, addr, len, MC_SYNC, attr,
+ MS_ASYNC | MS_INVALIDATE, NULL, 0);
+ if (error)
+ (void) set_errno(error);
+ return (error);
+
+ default:
+ error = as_ctl(as, addr, len, cmd, attr,
+ (uintptr_t)arg, NULL, 0);
+ if (error)
+ (void) set_errno(error);
+ return (error);
+ }
+ break;
+ default:
+ return (set_errno(EINVAL));
+ }
+
+ error = as_ctl(as, addr, len, cmd, attr, (uintptr_t)arg, NULL, 0);
+
+ if (error)
+ (void) set_errno(error);
+ return (error);
+}
+
+/*
+ * Return page size code for page size passed in. If
+ * matching page size not found return -1.
+ */
+static uint_t
+mem_getpgszc(size_t pgsz) {
+ return ((uint_t)page_user_szc(pgsz));
+}
diff --git a/usr/src/uts/common/syscall/mkdir.c b/usr/src/uts/common/syscall/mkdir.c
new file mode 100644
index 0000000000..fc9262b0a3
--- /dev/null
+++ b/usr/src/uts/common/syscall/mkdir.c
@@ -0,0 +1,67 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 1996 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/isa_defs.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/user.h> +#include <sys/errno.h> +#include <sys/vnode.h> +#include <sys/uio.h> +#include <sys/debug.h> + +/* + * Make a directory. + */ +int +mkdir(char *dname, int dmode) +{ + vnode_t *vp; + struct vattr vattr; + int error; + + vattr.va_type = VDIR; + vattr.va_mode = dmode & PERMMASK; + vattr.va_mask = AT_TYPE|AT_MODE; + error = vn_create(dname, UIO_USERSPACE, &vattr, EXCL, 0, &vp, CRMKDIR, + 0, u.u_cmask); + if (error) + return (set_errno(error)); + VN_RELE(vp); + return (0); +} diff --git a/usr/src/uts/common/syscall/mknod.c b/usr/src/uts/common/syscall/mknod.c new file mode 100644 index 0000000000..26250387e4 --- /dev/null +++ b/usr/src/uts/common/syscall/mknod.c @@ -0,0 +1,108 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/isa_defs.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/user.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/stat.h> +#include <sys/vnode.h> +#include <sys/mode.h> +#include <sys/uio.h> +#include <sys/mkdev.h> +#include <sys/policy.h> +#include <sys/debug.h> + +/* + * Create a special file, a regular file, or a FIFO. + * fname - pathname passed by user + * fmode - mode of pathname + * dev = device number - b/c specials only + */ +int +mknod(char *fname, mode_t fmode, dev_t dev) +{ + vnode_t *vp; + struct vattr vattr; + int error; + enum create why; + + /* + * Zero type is equivalent to a regular file. + */ + if ((fmode & S_IFMT) == 0) + fmode |= S_IFREG; + + /* + * Must be privileged unless making a FIFO node. + */ + if (((fmode & S_IFMT) != S_IFIFO) && secpolicy_sys_devices(CRED()) != 0) + return (set_errno(EPERM)); + /* + * Set up desired attributes and vn_create the file. 
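+ *
+ * For example, an unprivileged process may create a FIFO with
+ *
+ *	mknod("/tmp/fifo", S_IFIFO | 0644, 0);
+ *
+ * (path illustrative), while S_IFCHR/S_IFBLK nodes require the
+ * privilege checked above.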
+ */ + vattr.va_type = IFTOVT(fmode); + vattr.va_mode = fmode & MODEMASK; + vattr.va_mask = AT_TYPE|AT_MODE; + if (vattr.va_type == VCHR || vattr.va_type == VBLK) { + if (get_udatamodel() != DATAMODEL_NATIVE) + dev = expldev(dev); + if (dev == NODEV || (getemajor(dev)) == (major_t)NODEV) + return (set_errno(EINVAL)); + vattr.va_rdev = dev; + vattr.va_mask |= AT_RDEV; + } + why = ((fmode & S_IFMT) == S_IFDIR) ? CRMKDIR : CRMKNOD; + if (error = vn_create(fname, UIO_USERSPACE, &vattr, EXCL, 0, &vp, + why, 0, u.u_cmask)) + return (set_errno(error)); + VN_RELE(vp); + return (0); +} + +#if defined(__i386) || defined(__i386_COMPAT) + +/*ARGSUSED*/ +int +xmknod(int version, char *fname, mode_t fmode, dev_t dev) +{ + return (mknod(fname, fmode, dev)); +} + +#endif diff --git a/usr/src/uts/common/syscall/mount.c b/usr/src/uts/common/syscall/mount.c new file mode 100644 index 0000000000..ef681ea586 --- /dev/null +++ b/usr/src/uts/common/syscall/mount.c @@ -0,0 +1,137 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/t_lock.h> +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/user.h> +#include <sys/fstyp.h> +#include <sys/kmem.h> +#include <sys/systm.h> +#include <sys/mount.h> +#include <sys/vfs.h> +#include <sys/cred.h> +#include <sys/vnode.h> +#include <sys/dnlc.h> +#include <sys/file.h> +#include <sys/time.h> +#include <sys/cmn_err.h> +#include <sys/swap.h> +#include <sys/debug.h> +#include <sys/pathname.h> +#include <sys/cladm.h> + +/* + * System calls. + */ + +/* + * "struct mounta" defined in sys/vfs.h. + */ + +/* ARGSUSED */ +int +mount(long *lp, rval_t *rp) +{ + vnode_t *vp = NULL; + struct vfs *vfsp; /* dummy argument */ + int error; + struct mounta *uap; +#if defined(_LP64) + struct mounta native; + + /* + * Make a struct mounta if we are DATAMODEL_LP64 + */ + uap = &native; + uap->spec = (char *)*lp++; + uap->dir = (char *)*lp++; + uap->flags = (int)*lp++; + uap->fstype = (char *)*lp++; + uap->dataptr = (char *)*lp++; + uap->datalen = (int)*lp++; + uap->optptr = (char *)*lp++; + uap->optlen = (int)*lp++; +#else /* !defined(_LP64) */ + /* + * 32 bit kernels can take a shortcut and just cast + * the args array to the structure. + */ + uap = (struct mounta *)lp; +#endif /* _LP64 */ + /* + * Resolve second path name (mount point). 
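+ *
+ * uap->dir is the second argument of the user-level call (see
+ * mount(2)), e.g. (paths illustrative only):
+ *
+ *	mount("/dev/dsk/c0t0d0s6", "/mnt", MS_DATA, "ufs",
+ *	    NULL, 0, NULL, 0);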
+ */ + if (error = lookupname(uap->dir, UIO_USERSPACE, FOLLOW, NULLVPP, &vp)) + return (set_errno(error)); + + /* + * Some mount flags are disallowed through the system call interface. + */ + uap->flags &= MS_MASK; + + if ((vp->v_flag & VPXFS) && ((uap->flags & MS_GLOBAL) != MS_GLOBAL)) { + /* + * Clustering: if we're doing a mount onto the global + * namespace, and the mount is not a global mount, return + * an error. + */ + error = ENOTSUP; + } else if (uap->flags & MS_GLOBAL) { + /* + * Clustering: global mount specified. + */ + if ((cluster_bootflags & CLUSTER_BOOTED) == 0) { + /* + * If we're not booted as a cluster, + * global mounts are not allowed. + */ + error = ENOTSUP; + } else { + error = domount("pxfs", uap, vp, CRED(), &vfsp); + if (!error) + VFS_RELE(vfsp); + } + } else { + error = domount(NULL, uap, vp, CRED(), &vfsp); + if (!error) + VFS_RELE(vfsp); + } + VN_RELE(vp); + rp->r_val2 = error; + return (error ? set_errno(error) : 0); +} diff --git a/usr/src/uts/common/syscall/nice.c b/usr/src/uts/common/syscall/nice.c new file mode 100644 index 0000000000..55db136f7b --- /dev/null +++ b/usr/src/uts/common/syscall/nice.c @@ -0,0 +1,66 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ +/* Copyright (c) 1994 Sun Microsystems, Inc. */ + + +#pragma ident "%Z%%M% %I% %E% SMI" /* from SVr4.0 1.15 */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/cred.h> +#include <sys/proc.h> +#include <sys/debug.h> +#include <sys/class.h> +#include <sys/mutex.h> + +/* + * We support the nice system call for compatibility although + * the priocntl system call supports a superset of nice's functionality. + * We support nice only for time sharing threads. It will fail + * if called by a thread from another class. + */ + +int +nice(int niceness) +{ + int error = 0; + int err, retval; + kthread_id_t t; + proc_t *p = curproc; + + mutex_enter(&p->p_lock); + t = p->p_tlist; + do { + err = CL_DONICE(t, CRED(), niceness, &retval); + if (error == 0 && err) + error = set_errno(err); + } while ((t = t->t_forw) != p->p_tlist); + mutex_exit(&p->p_lock); + if (error) + return (error); + return (retval); +} diff --git a/usr/src/uts/common/syscall/ntptime.c b/usr/src/uts/common/syscall/ntptime.c new file mode 100644 index 0000000000..7f38b65db4 --- /dev/null +++ b/usr/src/uts/common/syscall/ntptime.c @@ -0,0 +1,218 @@ +/* + * Copyright 1994,1996-2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) David L. 
Mills 1993, 1994
+ *
+ * Permission to use, copy, modify, and distribute this software and its
+ * documentation for any purpose and without fee is hereby granted, provided
+ * that the above copyright notice appears in all copies and that both the
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name University of Delaware not be used in
+ * advertising or publicity pertaining to distribution of the software
+ * without specific, written prior permission. The University of Delaware
+ * makes no representations about the suitability this software for any
+ * purpose. It is provided "as is" without express or implied warranty.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * Modification history kern_ntptime.c
+ *
+ * 24 Sep 94 David L. Mills
+ * Tightened code at exits.
+ *
+ * 24 Mar 94 David L. Mills
+ * Revised syscall interface to include new variables for PPS
+ * time discipline.
+ *
+ * 14 Feb 94 David L. Mills
+ * Added code for external clock
+ *
+ * 28 Nov 93 David L. Mills
+ * Revised frequency scaling to conform with adjusted parameters
+ *
+ * 17 Sep 93 David L. Mills
+ * Created file
+ */
+/*
+ * ntp_gettime(), ntp_adjtime() - precision time interface
+ *
+ * These routines constitute the Network Time Protocol (NTP) interfaces
+ * for user and daemon application programs. The ntp_gettime() routine
+ * provides the time, maximum error (synch distance) and estimated error
+ * (dispersion) to client user application programs. The ntp_adjtime()
+ * routine is used by the NTP daemon to adjust the system clock to an
+ * externally derived time. The time offset and related variables set by
+ * this routine are used by clock() to adjust the phase and
+ * frequency of the phase-lock loop which controls the system clock.
+ */
+#include <sys/param.h>
+#include <sys/user.h>
+#include <sys/vnode.h>
+#include <sys/proc.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/cpuvar.h>
+#include <sys/timer.h>
+#include <sys/debug.h>
+#include <sys/timex.h>
+#include <sys/model.h>
+#include <sys/policy.h>
+
+/*
+ * ntp_gettime() - NTP user application interface
+ */
+int
+ntp_gettime(struct ntptimeval *tp)
+{
+ timestruc_t tod;
+ struct ntptimeval ntv;
+ model_t datamodel = get_udatamodel();
+
+ gethrestime(&tod);
+ if (tod.tv_sec > TIME32_MAX)
+ return (set_errno(EOVERFLOW));
+ ntv.time.tv_sec = tod.tv_sec;
+ ntv.time.tv_usec = tod.tv_nsec / (NANOSEC / MICROSEC);
+ ntv.maxerror = time_maxerror;
+ ntv.esterror = time_esterror;
+
+ if (datamodel == DATAMODEL_NATIVE) {
+ if (copyout(&ntv, tp, sizeof (ntv)))
+ return (set_errno(EFAULT));
+ } else {
+ struct ntptimeval32 ntv32;
+
+ if (TIMEVAL_OVERFLOW(&ntv.time))
+ return (set_errno(EOVERFLOW));
+
+ TIMEVAL_TO_TIMEVAL32(&ntv32.time, &ntv.time);
+
+ ntv32.maxerror = ntv.maxerror;
+ ntv32.esterror = ntv.esterror;
+
+ if (copyout(&ntv32, tp, sizeof (ntv32)))
+ return (set_errno(EFAULT));
+ }
+
+ /*
+ * Status word error decode. If any of these conditions
+ * occur, an error is returned, instead of the status
+ * word. Most applications will care only about the fact
+ * the system clock may not be trusted, not about the
+ * details.
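+ *
+ * (A typical application sketch, illustrative only: treat TIME_ERROR
+ * as "do not trust the clock" and any other return value as the
+ * current clock state:
+ *
+ *	struct ntptimeval ntv;
+ *	if (ntp_gettime(&ntv) == TIME_ERROR)
+ *		(fall back to untrusted-time handling)
+ * )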
+ * + * Hardware or software error + */ + if ((time_status & (STA_UNSYNC | STA_CLOCKERR)) || + /* + * PPS signal lost when either time or frequency + * synchronization requested + */ + (time_status & (STA_PPSFREQ | STA_PPSTIME) && + !(time_status & STA_PPSSIGNAL)) || + + /* + * PPS jitter exceeded when time synchronization + * requested + */ + (time_status & STA_PPSTIME && time_status & STA_PPSJITTER) || + + /* + * PPS wander exceeded or calibration error when + * frequency synchronization requested + */ + (time_status & STA_PPSFREQ && time_status & + (STA_PPSWANDER | STA_PPSERROR))) + return (TIME_ERROR); + + return (time_state); +} + +/* + * ntp_adjtime() - NTP daemon application interface + */ +int +ntp_adjtime(struct timex *tp) +{ + struct timex ntv; + int modes; + + if (copyin(tp, &ntv, sizeof (ntv))) + return (set_errno(EFAULT)); + + /* + * Update selected clock variables - only privileged users can + * change anything. Note that there is no error checking here on + * the assumption privileged users know what they're doing. + */ + modes = ntv.modes; + + if (modes != 0 && secpolicy_settime(CRED()) != 0) + return (set_errno(EPERM)); + + if (ntv.constant < 0 || ntv.constant > 30) + return (set_errno(EINVAL)); + + mutex_enter(&tod_lock); + if (modes & MOD_FREQUENCY) + time_freq = ntv.freq - pps_freq; + if (modes & MOD_MAXERROR) + time_maxerror = ntv.maxerror; + if (modes & MOD_ESTERROR) + time_esterror = ntv.esterror; + if (modes & MOD_STATUS) { + time_status &= STA_RONLY; + time_status |= ntv.status & ~STA_RONLY; + } + if (modes & MOD_TIMECONST) + time_constant = ntv.constant; + + if (modes & MOD_OFFSET) + clock_update(ntv.offset); + + /* + * Retrieve all clock variables + */ + ntv.offset = time_offset / SCALE_UPDATE; + ntv.freq = time_freq + pps_freq; + ntv.maxerror = time_maxerror; + ntv.esterror = time_esterror; + ntv.status = time_status; + ntv.constant = time_constant; + ntv.precision = time_precision; + ntv.tolerance = time_tolerance; + ntv.shift = pps_shift; + ntv.ppsfreq = pps_freq; + ntv.jitter = pps_jitter >> PPS_AVG; + ntv.stabil = pps_stabil; + ntv.calcnt = pps_calcnt; + ntv.errcnt = pps_errcnt; + ntv.jitcnt = pps_jitcnt; + ntv.stbcnt = pps_stbcnt; + mutex_exit(&tod_lock); + + if (copyout(&ntv, tp, sizeof (ntv))) + return (set_errno(EFAULT)); + + /* + * Status word error decode. See comments in + * ntp_gettime() routine. + */ + if ((time_status & (STA_UNSYNC | STA_CLOCKERR)) || + (time_status & (STA_PPSFREQ | STA_PPSTIME) && + !(time_status & STA_PPSSIGNAL)) || + (time_status & STA_PPSTIME && + time_status & STA_PPSJITTER) || + (time_status & STA_PPSFREQ && + time_status & (STA_PPSWANDER | STA_PPSERROR))) + return (TIME_ERROR); + + return (time_state); +} diff --git a/usr/src/uts/common/syscall/open.c b/usr/src/uts/common/syscall/open.c new file mode 100644 index 0000000000..40e7faa097 --- /dev/null +++ b/usr/src/uts/common/syscall/open.c @@ -0,0 +1,305 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/isa_defs.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/user.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/fcntl.h> +#include <sys/stat.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/file.h> +#include <sys/mode.h> +#include <sys/uio.h> +#include <sys/debug.h> +#include <c2/audit.h> + +/* + * Common code for open()/openat() and creat(). Check permissions, allocate + * an open file structure, and call the device open routine (if any). + */ + +static int +copen(int startfd, char *fname, int filemode, int createmode) +{ + struct pathname pn; + vnode_t *vp, *sdvp; + file_t *fp, *startfp; + enum vtype type; + int error; + int fd, dupfd; + vnode_t *startvp; + proc_t *p = curproc; + + if (startfd == AT_FDCWD) { + /* + * Regular open() + */ + startvp = NULL; + } else { + /* + * We're here via openat() + */ + char startchar; + + if (copyin(fname, &startchar, sizeof (char))) + return (set_errno(EFAULT)); + + /* + * if startchar is / then startfd is ignored + */ + if (startchar == '/') + startvp = NULL; + else { + if ((startfp = getf(startfd)) == NULL) + return (set_errno(EBADF)); + startvp = startfp->f_vnode; + VN_HOLD(startvp); + releasef(startfd); + } + } + + if (filemode & FXATTR) { + + /* + * Make sure we have a valid request. + * We must either have a real fd or AT_FDCWD + */ + + if (startfd != AT_FDCWD && startvp == NULL) { + error = EINVAL; + goto out; + } + + if (error = pn_get(fname, UIO_USERSPACE, &pn)) { + goto out; + } + + if (startfd == AT_FDCWD) { + mutex_enter(&p->p_lock); + startvp = PTOU(p)->u_cdir; + VN_HOLD(startvp); + mutex_exit(&p->p_lock); + } + + /* + * Verify permission to put attributes on file + */ + + if ((VOP_ACCESS(startvp, VREAD, 0, CRED()) != 0) && + (VOP_ACCESS(startvp, VWRITE, 0, CRED()) != 0) && + (VOP_ACCESS(startvp, VEXEC, 0, CRED()) != 0)) { + error = EACCES; + pn_free(&pn); + goto out; + } + + if ((startvp->v_vfsp->vfs_flag & VFS_XATTR) != 0) { + error = VOP_LOOKUP(startvp, "", &sdvp, &pn, + LOOKUP_XATTR|CREATE_XATTR_DIR, rootvp, CRED()); + } else { + error = EINVAL; + } + pn_free(&pn); + if (error != 0) + goto out; + + VN_RELE(startvp); + startvp = sdvp; + } + + if ((filemode & (FREAD|FWRITE)) != 0) { + if ((filemode & (FNONBLOCK|FNDELAY)) == (FNONBLOCK|FNDELAY)) + filemode &= ~FNDELAY; + error = falloc((vnode_t *)NULL, filemode, &fp, &fd); + if (error == 0) { +#ifdef C2_AUDIT + if (audit_active) + audit_setfsat_path(1); +#endif /* C2_AUDIT */ + /* + * Last arg is a don't-care term if + * !(filemode & FCREAT). 
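+ *
+ * (Note, for illustration: with FCREAT set, createmode is
+ * masked with MODEMASK here and the process file-creation
+ * mask (u_cmask) is applied further down the vn_openat()
+ * path, so a hypothetical creat("f", 0666) under umask 022
+ * would typically yield a file with mode 0644.)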
+ */ + error = vn_openat(fname, UIO_USERSPACE, filemode, + (int)(createmode & MODEMASK), &vp, CRCREAT, + u.u_cmask, startvp); + + if (startvp != NULL) + VN_RELE(startvp); + if (error == 0) { +#ifdef C2_AUDIT + if (audit_active) + audit_copen(fd, fp, vp); +#endif /* C2_AUDIT */ + if ((vp->v_flag & VDUP) == 0) { + fp->f_vnode = vp; + mutex_exit(&fp->f_tlock); + /* + * We must now fill in the slot + * falloc reserved. + */ + setf(fd, fp); + return (fd); + } else { + /* + * Special handling for /dev/fd. + * Give up the file pointer + * and dup the indicated file descriptor + * (in v_rdev). This is ugly, but I've + * seen worse. + */ + unfalloc(fp); + dupfd = getminor(vp->v_rdev); + type = vp->v_type; + mutex_enter(&vp->v_lock); + vp->v_flag &= ~VDUP; + mutex_exit(&vp->v_lock); + VN_RELE(vp); + if (type != VCHR) + return (set_errno(EINVAL)); + if ((fp = getf(dupfd)) == NULL) { + setf(fd, NULL); + return (set_errno(EBADF)); + } + mutex_enter(&fp->f_tlock); + fp->f_count++; + mutex_exit(&fp->f_tlock); + setf(fd, fp); + releasef(dupfd); + } + return (fd); + } else { + setf(fd, NULL); + unfalloc(fp); + return (set_errno(error)); + } + } + } else { + error = EINVAL; + } +out: + if (startvp != NULL) + VN_RELE(startvp); + return (set_errno(error)); +} + +#define OPENMODE32(fmode) ((int)((fmode)-FOPEN)) +#define CREATMODE32 (FWRITE|FCREAT|FTRUNC) +#define OPENMODE64(fmode) (OPENMODE32(fmode) | FOFFMAX) +#define CREATMODE64 (CREATMODE32 | FOFFMAX) +#ifdef _LP64 +#define OPENMODE(fmode) OPENMODE64(fmode) +#define CREATMODE CREATMODE64 +#else +#define OPENMODE OPENMODE32 +#define CREATMODE CREATMODE32 +#endif + +/* + * Open a file. + */ +int +open(char *fname, int fmode, int cmode) +{ + return (copen(AT_FDCWD, fname, OPENMODE(fmode), cmode)); +} + +/* + * Create a file. + */ +int +creat(char *fname, int cmode) +{ + return (copen(AT_FDCWD, fname, CREATMODE, cmode)); +} + +int +openat(int fd, char *path, int fmode, int cmode) +{ + return (copen(fd, path, OPENMODE(fmode), cmode)); +} + +#if defined(_ILP32) || defined(_SYSCALL32_IMPL) +/* + * Open and Creat for large files in 32-bit environment. Sets the FOFFMAX flag. + */ +int +open64(char *fname, int fmode, int cmode) +{ + return (copen(AT_FDCWD, fname, OPENMODE64(fmode), cmode)); +} + +int +creat64(char *fname, int cmode) +{ + return (copen(AT_FDCWD, fname, CREATMODE64, cmode)); +} + +int +openat64(int fd, char *path, int fmode, int cmode) +{ + return (copen(fd, path, OPENMODE64(fmode), cmode)); +} + +#endif /* _ILP32 || _SYSCALL32_IMPL */ + +#ifdef _SYSCALL32_IMPL +/* + * Open and Creat for 32-bit compatibility on 64-bit kernel + */ +int +open32(char *fname, int fmode, int cmode) +{ + return (copen(AT_FDCWD, fname, OPENMODE32(fmode), cmode)); +} + +int +creat32(char *fname, int cmode) +{ + return (copen(AT_FDCWD, fname, CREATMODE32, cmode)); +} + +int +openat32(int fd, char *path, int fmode, int cmode) +{ + return (copen(fd, path, OPENMODE32(fmode), cmode)); +} +#endif /* _SYSCALL32_IMPL */ diff --git a/usr/src/uts/common/syscall/p_online.c b/usr/src/uts/common/syscall/p_online.c new file mode 100644 index 0000000000..004627569c --- /dev/null +++ b/usr/src/uts/common/syscall/p_online.c @@ -0,0 +1,244 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/var.h> +#include <sys/thread.h> +#include <sys/cpuvar.h> +#include <sys/kstat.h> +#include <sys/uadmin.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/cmn_err.h> +#include <sys/procset.h> +#include <sys/processor.h> +#include <sys/debug.h> +#include <sys/policy.h> + +/* + * CPU state diagram + * + * P_SPARE + * P_POWEROFF <---> P_OFFLINE <---> P_ONLINE <---> P_NOINTR + * P_FAULTED + */ +int +p_online_internal(processorid_t cpun, int new_status, int *old_status) +{ + cpu_t *cp; + int status; + int error = 0; + int flags = 0; + + /* + * Try to get a pointer to the requested CPU structure. + */ + mutex_enter(&cpu_lock); /* protects CPU states */ + if ((cp = cpu_get(cpun)) == NULL) { + error = EINVAL; + goto out; + } + + if (new_status & P_FORCED) + flags = CPU_FORCED; + *old_status = status = cpu_get_state(cp); /* get processor status */ + new_status &= ~P_FORCED; + + /* + * Perform credentials check. + */ + switch (new_status) { + case P_STATUS: + goto out; + case P_ONLINE: + case P_OFFLINE: + case P_NOINTR: + case P_FAULTED: + case P_SPARE: + if (secpolicy_ponline(CRED()) != 0) + error = EPERM; + break; + default: + error = EINVAL; + } + + if (error) + goto out; + + /* + * return 0 if the CPU is already in the desired new state. + */ + if (status == new_status) + goto out; + + switch (new_status) { + case P_ONLINE: + switch (status) { + case P_POWEROFF: + /* + * If CPU is powered off, power it on. + */ + if (error = cpu_poweron(cp)) + break; + ASSERT(cpu_get_state(cp) == P_OFFLINE); + /* FALLTHROUGH */ + case P_OFFLINE: + case P_FAULTED: + case P_SPARE: + /* + * If CPU is in one of the offline states, + * bring it online. + */ + error = cpu_online(cp); + break; + case P_NOINTR: + cpu_intr_enable(cp); + break; + } + break; + + case P_OFFLINE: + switch (status) { + case P_NOINTR: + /* + * Before we take the CPU offline, we first enable I/O + * interrupts. + */ + cpu_intr_enable(cp); + /* FALLTHROUGH */ + case P_ONLINE: + case P_FAULTED: + case P_SPARE: + /* + * CPU is online, or in a special offline state. + * Take it offline. + */ + error = cpu_offline(cp, flags); + break; + case P_POWEROFF: + /* + * If CPU is powered off, power it on. + */ + error = cpu_poweron(cp); + } + break; + + case P_NOINTR: + switch (status) { + case P_POWEROFF: + /* + * if CPU is powered off, power it on. + */ + if (error = cpu_poweron(cp)) + break; + ASSERT(cpu_get_state(cp) == P_OFFLINE); + /* FALLTHROUGH */ + case P_OFFLINE: + case P_FAULTED: + case P_SPARE: + /* + * First, bring the CPU online. + */ + if (error = cpu_online(cp)) + break; + /* FALLTHROUGH */ + case P_ONLINE: + /* + * CPU is now online. Try to disable interrupts. 
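+ *
+ * (cpu_intr_disable() may refuse, for example when no
+ * other online CPU would be left to field I/O
+ * interrupts; any such error is passed back unchanged.)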
+ */ + error = cpu_intr_disable(cp); + } + break; + + case P_FAULTED: + switch (status) { + case P_POWEROFF: + /* + * If CPU is powered off, power it on. + */ + if (error = cpu_poweron(cp)) + break; + ASSERT(cpu_get_state(cp) == P_OFFLINE); + /*FALLTHROUGH*/ + case P_OFFLINE: + case P_SPARE: + case P_ONLINE: + case P_NOINTR: + /* + * Mark this CPU as faulted. + */ + error = cpu_faulted(cp, flags); + } + break; + + case P_SPARE: + switch (status) { + case P_POWEROFF: + /* + * If CPU is powered off, power it on. + */ + if (error = cpu_poweron(cp)) + break; + ASSERT(cpu_get_state(cp) == P_OFFLINE); + /*FALLTHROUGH*/ + case P_OFFLINE: + case P_FAULTED: + case P_ONLINE: + case P_NOINTR: + /* + * Mark this CPU as a spare. + */ + error = cpu_spare(cp, flags); + } + break; + } +out: + mutex_exit(&cpu_lock); + return (error); +} + +/* + * p_online(2) - get/change processor operational status. + * + * As noted in os/cpu.c, the P_ONLINE and other state constants are for use + * only in this system call path and other paths conveying CPU state to + * userland. In general, other kernel consumers should be using the accessor + * functions in uts/common/os/cpu.c. + */ +int +p_online(processorid_t cpun, int new_status) +{ + int ret; + int old_status; + + ret = p_online_internal(cpun, new_status, &old_status); + if (ret != 0) + return (set_errno(ret)); + return (old_status); +} diff --git a/usr/src/uts/common/syscall/pathconf.c b/usr/src/uts/common/syscall/pathconf.c new file mode 100644 index 0000000000..788076d25e --- /dev/null +++ b/usr/src/uts/common/syscall/pathconf.c @@ -0,0 +1,127 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/isa_defs.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/cred.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/pathname.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/file.h> +#include <sys/uio.h> +#include <sys/debug.h> + +/* + * Common code for pathconf(), fpathconf() system calls + */ +static long +cpathconf(register vnode_t *vp, int cmd, struct cred *cr) +{ + int error; + ulong_t val; + + switch (cmd) { + case _PC_2_SYMLINKS: + if (error = VOP_PATHCONF(vp, _PC_SYMLINK_MAX, &val, cr)) + return ((long)set_errno(error)); + return ((long)(val > 0)); + + case _PC_ALLOC_SIZE_MIN: + case _PC_REC_INCR_XFER_SIZE: + case _PC_REC_MAX_XFER_SIZE: + case _PC_REC_MIN_XFER_SIZE: + case _PC_REC_XFER_ALIGN: + return ((long)set_errno(EINVAL)); + + case _PC_ASYNC_IO: + return (1l); + + case _PC_PRIO_IO: + return ((long)set_errno(EINVAL)); + + case _PC_SYNC_IO: + if (!(error = VOP_FSYNC(vp, FSYNC, cr))) + return (1l); + return ((long)set_errno(error)); + + case _PC_XATTR_ENABLED: + return ((vp->v_vfsp->vfs_flag & VFS_XATTR) ? 1 : 0); + + default: + if (error = VOP_PATHCONF(vp, cmd, &val, cr)) + return ((long)set_errno(error)); + return (val); + } + /* NOTREACHED */ +} + +/* fpathconf/pathconf interfaces */ + +long +fpathconf(int fdes, int name) +{ + file_t *fp; + long retval; + + if ((fp = getf(fdes)) == NULL) + return (set_errno(EBADF)); + retval = cpathconf(fp->f_vnode, name, fp->f_cred); + releasef(fdes); + return (retval); +} + +long +pathconf(char *fname, int name) +{ + vnode_t *vp; + long retval; + int error; + +lookup: + if (error = lookupname(fname, UIO_USERSPACE, FOLLOW, NULLVPP, &vp)) { + if (error == ESTALE) + goto lookup; + return ((long)set_errno(error)); + } + + retval = cpathconf(vp, name, CRED()); + VN_RELE(vp); + return (retval); +} diff --git a/usr/src/uts/common/syscall/pause.c b/usr/src/uts/common/syscall/pause.c new file mode 100644 index 0000000000..3c621859e7 --- /dev/null +++ b/usr/src/uts/common/syscall/pause.c @@ -0,0 +1,55 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +/* + * Copyright (c) 1994-2000 by Sun Microsystems, Inc. + * All rights reserved. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/user.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/condvar.h> +#include <sys/debug.h> + +/* + * Indefinite wait. 
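+ *
+ * pause() parks the lwp in cv_wait_sig_swap() until a signal is
+ * taken; there is no success path, so the only return sets EINTR.
+ * (Illustrative userland use: a process whose signal handlers do
+ * all the work can simply loop on pause().)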
+ */ +int +pause() +{ + mutex_enter(&curthread->t_delay_lock); + while (cv_wait_sig_swap(&curthread->t_delay_cv, + &curthread->t_delay_lock)) + ; + mutex_exit(&curthread->t_delay_lock); + return (set_errno(EINTR)); +} diff --git a/usr/src/uts/common/syscall/pgrpsys.c b/usr/src/uts/common/syscall/pgrpsys.c new file mode 100644 index 0000000000..e8be876537 --- /dev/null +++ b/usr/src/uts/common/syscall/pgrpsys.c @@ -0,0 +1,163 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +#ident "%Z%%M% %I% %E% SMI" /* from SVr4.0 1.78 */ + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/proc.h> +#include <sys/session.h> +#include <sys/debug.h> + +/* ARGSUSED */ +int +setpgrp(int flag, int pid, int pgid) +{ + register proc_t *p = ttoproc(curthread); + register int retval = 0; + + switch (flag) { + + case 1: /* setpgrp() */ + mutex_enter(&pidlock); + if (p->p_sessp->s_sidp != p->p_pidp && !pgmembers(p->p_pid)) { + mutex_exit(&pidlock); + sess_create(); + } else + mutex_exit(&pidlock); + return (p->p_sessp->s_sid); + + case 3: /* setsid() */ + mutex_enter(&pidlock); + if (p->p_pgidp == p->p_pidp || pgmembers(p->p_pid)) { + mutex_exit(&pidlock); + return (set_errno(EPERM)); + } + mutex_exit(&pidlock); + sess_create(); + return (p->p_sessp->s_sid); + + case 5: /* setpgid() */ + { + mutex_enter(&pidlock); + if (pid == 0) + pid = p->p_pid; + else if (pid < 0 || pid >= maxpid) { + mutex_exit(&pidlock); + return (set_errno(EINVAL)); + } else if (pid != p->p_pid) { + for (p = p->p_child; /* empty */; p = p->p_sibling) { + if (p == NULL) { + mutex_exit(&pidlock); + return (set_errno(ESRCH)); + } + if (p->p_pid == pid) + break; + } + if (p->p_flag & SEXECED) { + mutex_exit(&pidlock); + return (set_errno(EACCES)); + } + if (p->p_sessp != ttoproc(curthread)->p_sessp) { + mutex_exit(&pidlock); + return (set_errno(EPERM)); + } + } + + if (p->p_sessp->s_sid == pid) { + mutex_exit(&pidlock); + return (set_errno(EPERM)); + } + + if (pgid == 0) + pgid = p->p_pid; + else if (pgid < 0 || pgid >= maxpid) { + mutex_exit(&pidlock); + return (set_errno(EINVAL)); + } + + if (p->p_pgrp == pgid) { + mutex_exit(&pidlock); + break; + } else if (p->p_pid == pgid) { + /* + * We need to protect p_pgidp with p_lock because + * /proc looks at it while holding only p_lock. 
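+ *
+ * (pgexit() drops the old process-group membership and
+ * pgjoin() establishes the new one; doing both under
+ * p_lock keeps the switch atomic for such readers.)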
+ */ + mutex_enter(&p->p_lock); + pgexit(p); + pgjoin(p, p->p_pidp); + mutex_exit(&p->p_lock); + } else { + register proc_t *q; + + if ((q = pgfind(pgid)) == NULL || + q->p_sessp != p->p_sessp) { + mutex_exit(&pidlock); + return (set_errno(EPERM)); + } + /* + * See comment above about p_lock and /proc + */ + mutex_enter(&p->p_lock); + pgexit(p); + pgjoin(p, q->p_pgidp); + mutex_exit(&p->p_lock); + } + mutex_exit(&pidlock); + break; + } + + case 0: /* getpgrp() */ + mutex_enter(&pidlock); + retval = p->p_pgrp; + mutex_exit(&pidlock); + break; + + case 2: /* getsid() */ + case 4: /* getpgid() */ + if (pid < 0 || pid >= maxpid) { + return (set_errno(EINVAL)); + } + mutex_enter(&pidlock); + if (pid != 0 && p->p_pid != pid && + ((p = prfind(pid)) == NULL || p->p_stat == SIDL)) { + mutex_exit(&pidlock); + return (set_errno(ESRCH)); + } + if (flag == 2) + retval = p->p_sessp->s_sid; + else + retval = p->p_pgrp; + mutex_exit(&pidlock); + break; + + } + return (retval); +} diff --git a/usr/src/uts/common/syscall/pipe.c b/usr/src/uts/common/syscall/pipe.c new file mode 100644 index 0000000000..c980270a55 --- /dev/null +++ b/usr/src/uts/common/syscall/pipe.c @@ -0,0 +1,178 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +#pragma ident "%Z%%M% %I% %E% SMI" /* from SVr4.0 1.11 */ + +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/cred.h> +#include <sys/user.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/stream.h> +#include <sys/strsubr.h> +#include <sys/errno.h> +#include <sys/debug.h> +#include <sys/fs/fifonode.h> + +/* + * This is the loadable module wrapper. + */ +#include <sys/modctl.h> +#include <sys/syscall.h> + +char _depends_on[] = "fs/fifofs"; + +longlong_t pipe(); + +static struct sysent pipe_sysent = { + 0, + SE_32RVAL1 | SE_32RVAL2 | SE_NOUNLOAD | SE_ARGC, + (int (*)())pipe +}; + +/* + * Module linkage information for the kernel. + */ +static struct modlsys modlsys = { + &mod_syscallops, "pipe(2) syscall", &pipe_sysent +}; + +#ifdef _SYSCALL32_IMPL +static struct modlsys modlsys32 = { + &mod_syscallops32, "32-bit pipe(2) syscall", &pipe_sysent +}; +#endif + +static struct modlinkage modlinkage = { + MODREV_1, + &modlsys, +#ifdef _SYSCALL32_IMPL + &modlsys32, +#endif + NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_fini(void) +{ + return (EBUSY); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +/* + * pipe(2) system call. 
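+ *
+ * (Userland view, for illustration: on success pipe(fds) fills in
+ * fds[0] and fds[1]; as the falloc() calls below show, both ends
+ * are opened FREAD|FWRITE, i.e. this SVR4-style pipe is
+ * bidirectional.)
+ *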
+ * Create a pipe by connecting two streams together. Associate + * each end of the pipe with a vnode, a file descriptor and + * one of the streams. + */ +longlong_t +pipe() +{ + vnode_t *vp1, *vp2; + struct file *fp1, *fp2; + int error = 0; + int fd1, fd2; + rval_t r; + + /* + * Allocate and initialize two vnodes. + */ + makepipe(&vp1, &vp2); + + /* + * Allocate and initialize two file table entries and two + * file pointers. Each file pointer is open for read and + * write. + */ + if (error = falloc(vp1, FWRITE|FREAD, &fp1, &fd1)) { + VN_RELE(vp1); + VN_RELE(vp2); + return ((longlong_t)set_errno(error)); + } + + if (error = falloc(vp2, FWRITE|FREAD, &fp2, &fd2)) + goto out2; + + /* + * Create two stream heads and attach to each vnode. + */ + if (error = fifo_stropen(&vp1, FWRITE|FREAD, fp1->f_cred, 0, 0)) + goto out; + + if (error = fifo_stropen(&vp2, FWRITE|FREAD, fp2->f_cred, 0, 0)) { + (void) VOP_CLOSE(vp1, FWRITE|FREAD, 1, (offset_t)0, + fp1->f_cred); + goto out; + } + + strmate(vp1, vp2); + + VTOF(vp1)->fn_ino = VTOF(vp2)->fn_ino = fifogetid(); + + /* + * Now fill in the entries that falloc reserved + */ + mutex_exit(&fp1->f_tlock); + mutex_exit(&fp2->f_tlock); + setf(fd1, fp1); + setf(fd2, fp2); + + /* + * Return the file descriptors to the user. They now + * point to two different vnodes which have different + * stream heads. + */ + r.r_val1 = fd1; + r.r_val2 = fd2; + return (r.r_vals); +out: + unfalloc(fp2); + setf(fd2, NULL); +out2: + unfalloc(fp1); + setf(fd1, NULL); + VN_RELE(vp1); + VN_RELE(vp2); + return ((longlong_t)set_errno(error)); +} diff --git a/usr/src/uts/common/syscall/poll.c b/usr/src/uts/common/syscall/poll.c new file mode 100644 index 0000000000..4d3c2f1060 --- /dev/null +++ b/usr/src/uts/common/syscall/poll.c @@ -0,0 +1,2776 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/isa_defs.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/user.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/time.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/mode.h> +#include <sys/proc.h> +#include <sys/uio.h> +#include <sys/poll_impl.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/bitmap.h> +#include <sys/kstat.h> +#include <sys/rctl.h> +#include <sys/port_kernel.h> +#include <sys/schedctl.h> + +#define NPHLOCKS 64 /* Number of locks; must be power of 2 */ +#define PHLOCKADDR(php) &plocks[(((uintptr_t)(php)) >> 8) & (NPHLOCKS - 1)] +#define PHLOCK(php) PHLOCKADDR(php).pp_lock +#define PH_ENTER(php) mutex_enter(PHLOCK(php)) +#define PH_EXIT(php) mutex_exit(PHLOCK(php)) +#define VALID_POLL_EVENTS (POLLIN | POLLPRI | POLLOUT | POLLRDNORM \ + | POLLRDBAND | POLLWRBAND | POLLHUP | POLLERR | POLLNVAL) + +/* + * global counters to collect some stats + */ +static struct { + kstat_named_t polllistmiss; /* failed to find a cached poll list */ + kstat_named_t pollcachehit; /* list matched 100% w/ cached one */ + kstat_named_t pollcachephit; /* list matched < 100% w/ cached one */ + kstat_named_t pollcachemiss; /* every list entry is dif from cache */ +} pollstats = { + { "polllistmiss", KSTAT_DATA_UINT64 }, + { "pollcachehit", KSTAT_DATA_UINT64 }, + { "pollcachephit", KSTAT_DATA_UINT64 }, + { "pollcachemiss", KSTAT_DATA_UINT64 } +}; + +kstat_named_t *pollstats_ptr = (kstat_named_t *)&pollstats; +uint_t pollstats_ndata = sizeof (pollstats) / sizeof (kstat_named_t); + +struct pplock { + kmutex_t pp_lock; + short pp_flag; + kcondvar_t pp_wait_cv; + int32_t pp_pad; /* to a nice round 16 bytes */ +}; + +static struct pplock plocks[NPHLOCKS]; /* Hash array of pollhead locks */ + +#ifdef DEBUG +static int pollchecksanity(pollstate_t *, nfds_t); +static int pollcheckxref(pollstate_t *, int); +static void pollcheckphlist(void); +static int pollcheckrevents(pollstate_t *, int, int, int); +static void checkpolldat(pollstate_t *); +#endif /* DEBUG */ +static int plist_chkdupfd(file_t *, polldat_t *, pollstate_t *, pollfd_t *, int, + int *); + +/* + * Data structure overview: + * The per-thread poll state consists of + * one pollstate_t + * one pollcache_t + * one bitmap with one event bit per fd + * a (two-dimensional) hashed array of polldat_t structures - one entry + * per fd + * + * This conglomerate of data structures interact with + * the pollhead which is used by VOP_POLL and pollwakeup + * (protected by the PHLOCK, cached array of plocks), and + * the fpollinfo list hanging off the fi_list which is used to notify + * poll when a cached fd is closed. This is protected by uf_lock. + * + * Invariants: + * pd_php (pollhead pointer) is set iff (if and only if) the polldat + * is on that pollhead. This is modified atomically under pc_lock. + * + * pd_fp (file_t pointer) is set iff the thread is on the fpollinfo + * list for that open file. + * This is modified atomically under pc_lock. + * + * pd_count is the sum (over all values of i) of pd_ref[i].xf_refcnt. 
+ * Iff pd_ref[i].xf_refcnt >= 1 then + * ps_pcacheset[i].pcs_pollfd[pd_ref[i].xf_position].fd == pd_fd + * Iff pd_ref[i].xf_refcnt > 1 then + * In ps_pcacheset[i].pcs_pollfd between index + * pd_ref[i].xf_position] and the end of the list + * there are xf_refcnt entries with .fd == pd_fd + * + * Locking design: + * Whenever possible the design relies on the fact that the poll cache state + * is per thread thus for both poll and exit it is self-synchronizing. + * Thus the key interactions where other threads access the state are: + * pollwakeup (and polltime), and + * close cleaning up the cached references to an open file + * + * The two key locks in poll proper is ps_lock and pc_lock. + * + * The ps_lock is used for synchronization between poll, (lwp_)exit and close + * to ensure that modifications to pollcacheset structure are serialized. + * This lock is held through most of poll() except where poll sleeps + * since there is little need to handle closes concurrently with the execution + * of poll. + * The pc_lock protects most of the fields in pollcache structure and polldat + * structures (which are accessed by poll, pollwakeup, and polltime) + * with the exception of fields that are only modified when only one thread + * can access this per-thread state. + * Those exceptions occur in poll when first allocating the per-thread state, + * when poll grows the number of polldat (never shrinks), and when + * exit/pollcleanup has ensured that there are no references from either + * pollheads or fpollinfo to the threads poll state. + * + * Poll(2) system call is the only path which ps_lock and pc_lock are both + * held, in that order. It needs ps_lock to synchronize with close and + * lwp_exit; and pc_lock with pollwakeup. + * + * The locking interaction between pc_lock and PHLOCK take into account + * that poll acquires these locks in the order of pc_lock and then PHLOCK + * while pollwakeup does it in the reverse order. Thus pollwakeup implements + * deadlock avoidance by dropping the locks and reacquiring them in the + * reverse order. For this to work pollwakeup needs to prevent the thread + * from exiting and freeing all of the poll related state. Thus is done + * using + * the pc_no_exit lock + * the pc_busy counter + * the pc_busy_cv condition variable + * + * The locking interaction between pc_lock and uf_lock has similar + * issues. Poll holds ps_lock and/or pc_lock across calls to getf/releasef + * which acquire uf_lock. The poll cleanup in close needs to hold uf_lock + * to prevent poll or exit from doing a delfpollinfo after which the thread + * might exit. But the cleanup needs to acquire pc_lock when modifying + * the poll cache state. The solution is to use pc_busy and do the close + * cleanup in two phases: + * First close calls pollblockexit which increments pc_busy. + * This prevents the per-thread poll related state from being freed. + * Then close drops uf_lock and calls pollcacheclean. + * This routine can then acquire pc_lock and remove any references + * to the closing fd (as well as recording that it has been closed + * so that a POLLNVAL can be generated even if the fd is reused before + * poll has been woken up and checked getf() again). + * + * When removing a polled fd from poll cache, the fd is always removed + * from pollhead list first and then from fpollinfo list, i.e., + * pollhead_delete() is called before delfpollinfo(). + * + * + * Locking hierarchy: + * pc_no_exit is a leaf level lock. 
+ * ps_lock is held when acquiring pc_lock (except when pollwakeup + * acquires pc_lock). + * pc_lock might be held when acquiring PHLOCK (pollhead_insert/ + * pollhead_delete) + * pc_lock is always held (but this is not required) + * when acquiring PHLOCK (in polladd/pollhead_delete and pollwakeup called + * from pcache_clean_entry). + * pc_lock is held across addfpollinfo/delfpollinfo which acquire + * uf_lock. + * pc_lock is held across getf/releasef which acquire uf_lock. + * ps_lock might be held across getf/releasef which acquire uf_lock. + * pollwakeup tries to acquire pc_lock while holding PHLOCK + * but drops the locks and reacquire them in reverse order to avoid + * deadlock. + * + * Note also that there is deadlock avoidance support for VOP_POLL routines + * and pollwakeup involving a file system or driver lock. + * See below. + */ + +/* + * Deadlock avoidance support for VOP_POLL() routines. This is + * sometimes necessary to prevent deadlock between polling threads + * (which hold poll locks on entry to xx_poll(), then acquire foo) + * and pollwakeup() threads (which hold foo, then acquire poll locks). + * + * pollunlock(void) releases whatever poll locks the current thread holds, + * returning a cookie for use by pollrelock(); + * + * pollrelock(cookie) reacquires previously dropped poll locks; + * + * polllock(php, mutex) does the common case: pollunlock(), + * acquire the problematic mutex, pollrelock(). + */ +int +pollunlock(void) +{ + pollcache_t *pcp; + int lockstate = 0; + + /* + * t_pollcache is set by /dev/poll and event ports (port_fd.c). + * If the pollrelock/pollunlock is called as a result of poll(2), + * the t_pollcache should be NULL. + */ + if (curthread->t_pollcache == NULL) + pcp = curthread->t_pollstate->ps_pcache; + else + pcp = curthread->t_pollcache; + + if (mutex_owned(&pcp->pc_lock)) { + lockstate = 1; + mutex_exit(&pcp->pc_lock); + } + return (lockstate); +} + +void +pollrelock(int lockstate) +{ + pollcache_t *pcp; + + /* + * t_pollcache is set by /dev/poll and event ports (port_fd.c). + * If the pollrelock/pollunlock is called as a result of poll(2), + * the t_pollcache should be NULL. + */ + if (curthread->t_pollcache == NULL) + pcp = curthread->t_pollstate->ps_pcache; + else + pcp = curthread->t_pollcache; + + if (lockstate > 0) + mutex_enter(&pcp->pc_lock); +} + +/* ARGSUSED */ +void +polllock(pollhead_t *php, kmutex_t *lp) +{ + if (!mutex_tryenter(lp)) { + int lockstate = pollunlock(); + mutex_enter(lp); + pollrelock(lockstate); + } +} + +static int +poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) +{ + kthread_t *t = curthread; + klwp_t *lwp = ttolwp(t); + proc_t *p = ttoproc(t); + int fdcnt = 0; + int rval; + int i; + timespec_t *rqtp = NULL; + int timecheck = 0; + int imm_timeout = 0; + pollfd_t *pollfdp; + pollstate_t *ps; + pollcache_t *pcp; + int error = 0; + nfds_t old_nfds; + int cacheindex = 0; /* which cache set is used */ + + /* + * Determine the precise future time of the requested timeout, if any. + */ + if (tsp != NULL) { + if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) + imm_timeout = 1; + else { + timespec_t now; + timecheck = timechanged; + gethrestime(&now); + rqtp = tsp; + timespecadd(rqtp, &now); + } + } + + /* + * Reset our signal mask, if requested. + */ + if (ksetp != NULL) { + mutex_enter(&p->p_lock); + schedctl_finish_sigblock(t); + lwp->lwp_sigoldmask = t->t_hold; + t->t_hold = *ksetp; + t->t_flag |= T_TOMASK; + /* + * Call cv_timedwait_sig() just to check for signals. 
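+ * Passing lbolt (the current tick) as the deadline means
+ * the timeout has already expired, so the call cannot sleep.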
+ * We will return immediately with either 0 or -1. + */ + if (!cv_timedwait_sig(&t->t_delay_cv, &p->p_lock, lbolt)) { + mutex_exit(&p->p_lock); + error = EINTR; + goto pollout; + } + mutex_exit(&p->p_lock); + } + + /* + * Check to see if this guy just wants to use poll() as a timeout. + * If yes then bypass all the other stuff and make him sleep. + */ + if (nfds == 0) { + /* + * Sleep until we have passed the requested future + * time or until interrupted by a signal. + * Do not check for signals if we have a zero timeout. + */ + if (!imm_timeout) { + mutex_enter(&t->t_delay_lock); + while ((rval = cv_waituntil_sig(&t->t_delay_cv, + &t->t_delay_lock, rqtp, timecheck)) > 0) + continue; + mutex_exit(&t->t_delay_lock); + if (rval == 0) + error = EINTR; + } + goto pollout; + } + + if (nfds > p->p_fno_ctl) { + mutex_enter(&p->p_lock); + (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE], + p->p_rctls, p, RCA_SAFE); + mutex_exit(&p->p_lock); + error = EINVAL; + goto pollout; + } + + /* + * Need to allocate memory for pollstate before anything because + * the mutex and cv are created in this space + */ + if ((ps = t->t_pollstate) == NULL) { + t->t_pollstate = pollstate_create(); + ps = t->t_pollstate; + } + + if (ps->ps_pcache == NULL) + ps->ps_pcache = pcache_alloc(); + pcp = ps->ps_pcache; + + /* + * NOTE: for performance, buffers are saved across poll() calls. + * The theory is that if a process polls heavily, it tends to poll + * on the same set of descriptors. Therefore, we only reallocate + * buffers when nfds changes. There is no hysteresis control, + * because there is no data to suggest that this is necessary; + * the penalty of reallocating is not *that* great in any event. + */ + old_nfds = ps->ps_nfds; + if (nfds != old_nfds) { + + kmem_free(ps->ps_pollfd, old_nfds * sizeof (pollfd_t)); + pollfdp = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP); + ps->ps_pollfd = pollfdp; + ps->ps_nfds = nfds; + } + + pollfdp = ps->ps_pollfd; + if (copyin(fds, pollfdp, nfds * sizeof (pollfd_t))) { + error = EFAULT; + goto pollout; + } + + if (fds == NULL) { + /* + * If the process has page 0 mapped, then the copyin() above + * will succeed even if fds is NULL. However, our cached + * poll lists are keyed by the address of the passed-in fds + * structure, and we use the value NULL to indicate an unused + * poll cache list entry. As such, we elect not to support + * NULL as a valid (user) memory address and fail the poll() + * call. + */ + error = EINVAL; + goto pollout; + } + + /* + * If this thread polls for the first time, allocate ALL poll + * cache data structures and cache the poll fd list. This + * allocation is delayed till now because lwp's polling 0 fd + * (i.e. using poll as timeout()) don't need this memory. + */ + mutex_enter(&ps->ps_lock); + pcp = ps->ps_pcache; + ASSERT(pcp != NULL); + if (pcp->pc_bitmap == NULL) { + pcache_create(pcp, nfds); + /* + * poll and cache this poll fd list in ps_pcacheset[0]. + */ + error = pcacheset_cache_list(ps, fds, &fdcnt, cacheindex); + if (fdcnt || error) { + mutex_exit(&ps->ps_lock); + goto pollout; + } + } else { + pollcacheset_t *pcset = ps->ps_pcacheset; + + /* + * Not first time polling. Select a cached poll list by + * matching user pollfd list buffer address. + */ + for (cacheindex = 0; cacheindex < ps->ps_nsets; cacheindex++) { + if (pcset[cacheindex].pcs_usradr == (uintptr_t)fds) { + if ((++pcset[cacheindex].pcs_count) == 0) { + /* + * counter is wrapping around. 
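+ * (pcs_count is only bookkeeping used when choosing
+ * a cached set to replace, so resetting the counts
+ * on wrap keeps that comparison meaningful.)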
+ */ + pcacheset_reset_count(ps, cacheindex); + } + /* + * examine and resolve possible + * difference of the current poll + * list and previously cached one. + * If there is an error during resolve(), + * the callee will guarantee the consistency + * of cached poll list and cache content. + */ + error = pcacheset_resolve(ps, nfds, &fdcnt, + cacheindex); + if (error) { + mutex_exit(&ps->ps_lock); + goto pollout; + } + break; + } + + /* + * Note that pcs_usradr field of an used entry won't be + * NULL because it stores the address of passed-in fds, + * and NULL fds will not be cached (Then it is either + * the special timeout case when nfds is 0 or it returns + * failure directly). + */ + if (pcset[cacheindex].pcs_usradr == NULL) { + /* + * found an unused entry. Use it to cache + * this poll list. + */ + error = pcacheset_cache_list(ps, fds, &fdcnt, + cacheindex); + if (fdcnt || error) { + mutex_exit(&ps->ps_lock); + goto pollout; + } + break; + } + } + if (cacheindex == ps->ps_nsets) { + /* + * We failed to find a matching cached poll fd list. + * replace an old list. + */ + pollstats.polllistmiss.value.ui64++; + cacheindex = pcacheset_replace(ps); + ASSERT(cacheindex < ps->ps_nsets); + pcset[cacheindex].pcs_usradr = (uintptr_t)fds; + error = pcacheset_resolve(ps, nfds, &fdcnt, cacheindex); + if (error) { + mutex_exit(&ps->ps_lock); + goto pollout; + } + } + } + + /* + * Always scan the bitmap with the lock on the pollcache held. + * This is to make sure that a wakeup does not come undetected. + * If the lock is not held, a pollwakeup could have come for an + * fd we already checked but before this thread sleeps, in which + * case the wakeup is missed. Now we hold the pcache lock and + * check the bitmap again. This will prevent wakeup from happening + * while we hold pcache lock since pollwakeup() will also lock + * the pcache before updating poll bitmap. + */ + mutex_enter(&pcp->pc_lock); + for (;;) { + pcp->pc_flag = 0; + error = pcache_poll(pollfdp, ps, nfds, &fdcnt, cacheindex); + if (fdcnt || error) { + mutex_exit(&pcp->pc_lock); + mutex_exit(&ps->ps_lock); + break; + } + + /* + * If T_POLLWAKE is set, a pollwakeup() was performed on + * one of the file descriptors. This can happen only if + * one of the VOP_POLL() functions dropped pcp->pc_lock. + * The only current cases of this is in procfs (prpoll()) + * and STREAMS (strpoll()). + */ + if (pcp->pc_flag & T_POLLWAKE) + continue; + + /* + * If you get here, the poll of fds was unsuccessful. + * Wait until some fd becomes readable, writable, or gets + * an exception, or until a signal or a timeout occurs. + * Do not check for signals if we have a zero timeout. + */ + mutex_exit(&ps->ps_lock); + if (imm_timeout) + rval = -1; + else + rval = cv_waituntil_sig(&pcp->pc_cv, &pcp->pc_lock, + rqtp, timecheck); + mutex_exit(&pcp->pc_lock); + /* + * If we have received a signal or timed out + * then break out and return. + */ + if (rval <= 0) { + if (rval == 0) + error = EINTR; + break; + } + /* + * We have not received a signal or timed out. + * Continue around and poll fds again. + */ + mutex_enter(&ps->ps_lock); + mutex_enter(&pcp->pc_lock); + } + +pollout: + /* + * If we changed the signal mask but we received + * no signal then restore the signal mask. + * Otherwise psig() will deal with the signal mask. 
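+ * (A nonzero lwp_cursig means a signal is pending; T_TOMASK
+ * is then left set so that the old mask saved in
+ * lwp_sigoldmask is restored once the signal is taken.)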
+ */ + if (ksetp != NULL) { + mutex_enter(&p->p_lock); + if (lwp->lwp_cursig == 0) { + t->t_hold = lwp->lwp_sigoldmask; + t->t_flag &= ~T_TOMASK; + } + mutex_exit(&p->p_lock); + } + + if (error) + return (set_errno(error)); + + /* + * Copy out the events and return the fdcnt to the user. + */ + if (nfds != 0 && + copyout(pollfdp, fds, nfds * sizeof (pollfd_t))) + return (set_errno(EFAULT)); + +#ifdef DEBUG + /* + * Another sanity check: + */ + if (fdcnt) { + int reventcnt = 0; + + for (i = 0; i < nfds; i++) { + if (pollfdp[i].fd < 0) { + ASSERT(pollfdp[i].revents == 0); + continue; + } + if (pollfdp[i].revents) { + reventcnt++; + } + } + ASSERT(fdcnt == reventcnt); + } else { + for (i = 0; i < nfds; i++) { + ASSERT(pollfdp[i].revents == 0); + } + } +#endif /* DEBUG */ + + return (fdcnt); +} + +/* + * This system call trap exists solely for binary compatibility with + * old statically-linked applications. It is not called from libc. + * It should be removed in the next release. + */ +int +poll(pollfd_t *fds, nfds_t nfds, int time_out) +{ + timespec_t ts; + timespec_t *tsp; + + if (time_out < 0) + tsp = NULL; + else { + ts.tv_sec = time_out / MILLISEC; + ts.tv_nsec = (time_out % MILLISEC) * MICROSEC; + tsp = &ts; + } + + return (poll_common(fds, nfds, tsp, NULL)); +} + +/* + * This is the system call trap that poll(), + * select() and pselect() are built upon. + * It is a private interface between libc and the kernel. + */ +int +pollsys(pollfd_t *fds, nfds_t nfds, timespec_t *timeoutp, sigset_t *setp) +{ + timespec_t ts; + timespec_t *tsp; + sigset_t set; + k_sigset_t kset; + k_sigset_t *ksetp; + model_t datamodel = get_udatamodel(); + + if (timeoutp == NULL) + tsp = NULL; + else { + if (datamodel == DATAMODEL_NATIVE) { + if (copyin(timeoutp, &ts, sizeof (ts))) + return (set_errno(EFAULT)); + } else { + timespec32_t ts32; + + if (copyin(timeoutp, &ts32, sizeof (ts32))) + return (set_errno(EFAULT)); + TIMESPEC32_TO_TIMESPEC(&ts, &ts32) + } + + if (itimerspecfix(&ts)) + return (set_errno(EINVAL)); + tsp = &ts; + } + + if (setp == NULL) + ksetp = NULL; + else { + if (copyin(setp, &set, sizeof (set))) + return (set_errno(EFAULT)); + sigutok(&set, &kset); + ksetp = &kset; + } + + return (poll_common(fds, nfds, tsp, ksetp)); +} + +/* + * Clean up any state left around by poll(2). Called when a thread exits. + */ +void +pollcleanup() +{ + pollstate_t *ps = curthread->t_pollstate; + pollcache_t *pcp; + + if (ps == NULL) + return; + pcp = ps->ps_pcache; + /* + * free up all cached poll fds + */ + if (pcp == NULL) { + /* this pollstate is used by /dev/poll */ + goto pollcleanout; + } + + if (pcp->pc_bitmap != NULL) { + ASSERT(MUTEX_NOT_HELD(&ps->ps_lock)); + /* + * a close lwp can race with us when cleaning up a polldat + * entry. We hold the ps_lock when cleaning hash table. + * Since this pollcache is going away anyway, there is no + * need to hold the pc_lock. + */ + mutex_enter(&ps->ps_lock); + pcache_clean(pcp); + mutex_exit(&ps->ps_lock); +#ifdef DEBUG + /* + * At this point, all fds cached by this lwp should be + * cleaned up. There should be no fd in fi_list still + * reference this thread. 
+ */ + checkfpollinfo(); /* sanity check */ + pollcheckphlist(); /* sanity check */ +#endif /* DEBUG */ + } + /* + * Be sure no one is referencing thread before exiting + */ + mutex_enter(&pcp->pc_no_exit); + ASSERT(pcp->pc_busy >= 0); + while (pcp->pc_busy > 0) + cv_wait(&pcp->pc_busy_cv, &pcp->pc_no_exit); + mutex_exit(&pcp->pc_no_exit); +pollcleanout: + pollstate_destroy(ps); + curthread->t_pollstate = NULL; +} + +/* + * pollwakeup() - poke threads waiting in poll() for some event + * on a particular object. + * + * The threads hanging off of the specified pollhead structure are scanned. + * If their event mask matches the specified event(s), then pollnotify() is + * called to poke the thread. + * + * Multiple events may be specified. When POLLHUP or POLLERR are specified, + * all waiting threads are poked. + * + * It is important that pollnotify() not drop the lock protecting the list + * of threads. + */ +void +pollwakeup(pollhead_t *php, short events_arg) +{ + polldat_t *pdp; + int events = (ushort_t)events_arg; + +retry: + PH_ENTER(php); + + /* + * About half of all pollwakeups don't do anything, because the + * pollhead list is empty (i.e, nobody is interested in the event). + * For this common case, we can optimize out locking overhead. + */ + if (php->ph_list == NULL) { + PH_EXIT(php); + return; + } + + for (pdp = php->ph_list; pdp; pdp = pdp->pd_next) { + if ((pdp->pd_events & events) || + (events & (POLLHUP | POLLERR))) { + + pollcache_t *pcp; + + if (pdp->pd_portev != NULL) { + port_kevent_t *pkevp = pdp->pd_portev; + /* + * Object (fd) is associated with an event port, + * => send event notification to the port. + */ + pkevp->portkev_events |= events & + (pdp->pd_events | POLLHUP | POLLERR); + if (pkevp->portkev_flags & PORT_KEV_VALID) { + pkevp->portkev_flags &= ~PORT_KEV_VALID; + (void) port_send_event(pdp->pd_portev); + } + continue; + } + + pcp = pdp->pd_pcache; + + /* + * Try to grab the lock for this thread. If + * we don't get it then we may deadlock so + * back out and restart all over again. Note + * that the failure rate is very very low. + */ + if (mutex_tryenter(&pcp->pc_lock)) { + pollnotify(pcp, pdp->pd_fd); + mutex_exit(&pcp->pc_lock); + } else { + /* + * We are here because: + * 1) This thread has been woke up + * and is trying to get out of poll(). + * 2) Some other thread is also here + * but with a different pollhead lock. + * + * So, we need to drop the lock on pollhead + * because of (1) but we want to prevent + * that thread from doing lwp_exit() or + * devpoll close. We want to ensure that + * the pollcache pointer is still invalid. + * + * Solution: Grab the pcp->pc_no_exit lock, + * increment the pc_busy counter, drop every + * lock in sight. Get out of the way and wait + * for type (2) threads to finish. + */ + + mutex_enter(&pcp->pc_no_exit); + pcp->pc_busy++; /* prevents exit()'s */ + mutex_exit(&pcp->pc_no_exit); + + PH_EXIT(php); + mutex_enter(&pcp->pc_lock); + mutex_exit(&pcp->pc_lock); + mutex_enter(&pcp->pc_no_exit); + pcp->pc_busy--; + if (pcp->pc_busy == 0) { + /* + * Wakeup the thread waiting in + * thread_exit(). + */ + cv_signal(&pcp->pc_busy_cv); + } + mutex_exit(&pcp->pc_no_exit); + goto retry; + } + } + } + PH_EXIT(php); +} + +/* + * This function is called to inform a thread that + * an event being polled for has occurred. + * The pollstate lock on the thread should be held on entry. 
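+ *
+ * It records the fd in the poller's bitmap, sets T_POLLWAKE so an
+ * in-progress scan notices the wakeup, and signals pc_cv to rouse
+ * a sleeping poller.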
+ */ +void +pollnotify(pollcache_t *pcp, int fd) +{ + ASSERT(fd < pcp->pc_mapsize); + ASSERT(MUTEX_HELD(&pcp->pc_lock)); + BT_SET(pcp->pc_bitmap, fd); + pcp->pc_flag |= T_POLLWAKE; + cv_signal(&pcp->pc_cv); +} + +/* + * add a polldat entry to pollhead ph_list. The polldat struct is used + * by pollwakeup to wake sleeping pollers when polled events has happened. + */ +void +pollhead_insert(pollhead_t *php, polldat_t *pdp) +{ + PH_ENTER(php); + ASSERT(pdp->pd_next == NULL); +#ifdef DEBUG + { + /* + * the polldat should not be already on the list + */ + polldat_t *wp; + for (wp = php->ph_list; wp; wp = wp->pd_next) { + ASSERT(wp != pdp); + } + } +#endif /* DEBUG */ + pdp->pd_next = php->ph_list; + php->ph_list = pdp; + PH_EXIT(php); +} + +/* + * Delete the polldat entry from ph_list. + */ +void +pollhead_delete(pollhead_t *php, polldat_t *pdp) +{ + polldat_t *wp; + polldat_t **wpp; + + PH_ENTER(php); + for (wpp = &php->ph_list; (wp = *wpp) != NULL; wpp = &wp->pd_next) { + if (wp == pdp) { + *wpp = pdp->pd_next; + pdp->pd_next = NULL; + break; + } + } +#ifdef DEBUG + /* assert that pdp is no longer in the list */ + for (wp = *wpp; wp; wp = wp->pd_next) { + ASSERT(wp != pdp); + } +#endif /* DEBUG */ + PH_EXIT(php); +} + +/* + * walk through the poll fd lists to see if they are identical. This is an + * expensive operation and should not be done more than once for each poll() + * call. + * + * As an optimization (i.e., not having to go through the lists more than + * once), this routine also clear the revents field of pollfd in 'current'. + * Zeroing out the revents field of each entry in current poll list is + * required by poll man page. + * + * Since the events field of cached list has illegal poll events filtered + * out, the current list applies the same filtering before comparison. + * + * The routine stops when it detects a meaningful difference, or when it + * exhausts the lists. + */ +int +pcacheset_cmp(pollfd_t *current, pollfd_t *cached, pollfd_t *newlist, int n) +{ + int ix; + + for (ix = 0; ix < n; ix++) { + if (current[ix].fd == cached[ix].fd) { + /* + * Filter out invalid poll events while we are in + * inside the loop. + */ + if (current[ix].events & ~VALID_POLL_EVENTS) { + current[ix].events &= VALID_POLL_EVENTS; + if (newlist != NULL) + newlist[ix].events = current[ix].events; + } + if (current[ix].events == cached[ix].events) { + current[ix].revents = 0; + continue; + } + } + if ((current[ix].fd < 0) && (cached[ix].fd < 0)) { + current[ix].revents = 0; + continue; + } + return (ix); + } + return (ix); +} + +/* + * This routine returns a pointer to a cached poll fd entry, or NULL if it + * does not find it in the hash table. + */ +polldat_t * +pcache_lookup_fd(pollcache_t *pcp, int fd) +{ + int hashindex; + polldat_t *pdp; + + hashindex = POLLHASH(pcp->pc_hashsize, fd); + pdp = pcp->pc_hash[hashindex]; + while (pdp != NULL) { + if (pdp->pd_fd == fd) + break; + pdp = pdp->pd_hashnext; + } + return (pdp); +} + +polldat_t * +pcache_alloc_fd(int nsets) +{ + polldat_t *pdp; + + pdp = kmem_zalloc(sizeof (polldat_t), KM_SLEEP); + if (nsets > 0) { + pdp->pd_ref = kmem_zalloc(sizeof (xref_t) * nsets, KM_SLEEP); + pdp->pd_nsets = nsets; + } + return (pdp); +} + +/* + * This routine inserts a polldat into the pollcache's hash table. It + * may be necessary to grow the size of the hash table. 
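+ * The growth trigger below keeps the average hash chain at or
+ * under POLLHASHTHRESHOLD entries per bucket.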
+ */ +void +pcache_insert_fd(pollcache_t *pcp, polldat_t *pdp, nfds_t nfds) +{ + int hashindex; + int fd; + + if ((pcp->pc_fdcount > pcp->pc_hashsize * POLLHASHTHRESHOLD) || + (nfds > pcp->pc_hashsize * POLLHASHTHRESHOLD)) { + pcache_grow_hashtbl(pcp, nfds); + } + fd = pdp->pd_fd; + hashindex = POLLHASH(pcp->pc_hashsize, fd); + pdp->pd_hashnext = pcp->pc_hash[hashindex]; + pcp->pc_hash[hashindex] = pdp; + pcp->pc_fdcount++; + +#ifdef DEBUG + { + /* + * same fd should not appear on a hash list twice + */ + polldat_t *pdp1; + for (pdp1 = pdp->pd_hashnext; pdp1; pdp1 = pdp1->pd_hashnext) { + ASSERT(pdp->pd_fd != pdp1->pd_fd); + } + } +#endif /* DEBUG */ +} + +/* + * Grow the hash table -- either double the table size or round it to the + * nearest multiples of POLLHASHCHUNKSZ, whichever is bigger. Rehash all the + * elements on the hash table. + */ +void +pcache_grow_hashtbl(pollcache_t *pcp, nfds_t nfds) +{ + int oldsize; + polldat_t **oldtbl; + polldat_t *pdp, *pdp1; + int i; +#ifdef DEBUG + int count = 0; +#endif + + ASSERT(pcp->pc_hashsize % POLLHASHCHUNKSZ == 0); + oldsize = pcp->pc_hashsize; + oldtbl = pcp->pc_hash; + if (nfds > pcp->pc_hashsize * POLLHASHINC) { + pcp->pc_hashsize = (nfds + POLLHASHCHUNKSZ - 1) & + ~(POLLHASHCHUNKSZ - 1); + } else { + pcp->pc_hashsize = pcp->pc_hashsize * POLLHASHINC; + } + pcp->pc_hash = kmem_zalloc(pcp->pc_hashsize * sizeof (polldat_t *), + KM_SLEEP); + /* + * rehash existing elements + */ + pcp->pc_fdcount = 0; + for (i = 0; i < oldsize; i++) { + pdp = oldtbl[i]; + while (pdp != NULL) { + pdp1 = pdp->pd_hashnext; + pcache_insert_fd(pcp, pdp, nfds); + pdp = pdp1; +#ifdef DEBUG + count++; +#endif + } + } + kmem_free(oldtbl, oldsize * sizeof (polldat_t *)); + ASSERT(pcp->pc_fdcount == count); +} + +void +pcache_grow_map(pollcache_t *pcp, int fd) +{ + int newsize; + ulong_t *newmap; + + /* + * grow to nearest multiple of POLLMAPCHUNK, assuming POLLMAPCHUNK is + * power of 2. + */ + newsize = (fd + POLLMAPCHUNK) & ~(POLLMAPCHUNK - 1); + newmap = kmem_zalloc((newsize / BT_NBIPUL) * sizeof (ulong_t), + KM_SLEEP); + /* + * don't want pollwakeup to set a bit while growing the bitmap. + */ + ASSERT(mutex_owned(&pcp->pc_lock) == 0); + mutex_enter(&pcp->pc_lock); + bcopy(pcp->pc_bitmap, newmap, + (pcp->pc_mapsize / BT_NBIPUL) * sizeof (ulong_t)); + kmem_free(pcp->pc_bitmap, + (pcp->pc_mapsize /BT_NBIPUL) * sizeof (ulong_t)); + pcp->pc_bitmap = newmap; + pcp->pc_mapsize = newsize; + mutex_exit(&pcp->pc_lock); +} + +/* + * remove all the reference from pollhead list and fpollinfo lists. + */ +void +pcache_clean(pollcache_t *pcp) +{ + int i; + polldat_t **hashtbl; + polldat_t *pdp; + + ASSERT(MUTEX_HELD(&curthread->t_pollstate->ps_lock)); + hashtbl = pcp->pc_hash; + for (i = 0; i < pcp->pc_hashsize; i++) { + for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) { + if (pdp->pd_php != NULL) { + pollhead_delete(pdp->pd_php, pdp); + pdp->pd_php = NULL; + } + if (pdp->pd_fp != NULL) { + delfpollinfo(pdp->pd_fd); + pdp->pd_fp = NULL; + } + } + } +} + +void +pcacheset_invalidate(pollstate_t *ps, polldat_t *pdp) +{ + int i; + int fd = pdp->pd_fd; + + /* + * we come here because an earlier close() on this cached poll fd. 
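+ * The caller only invokes this after the close-time cleanup
+ * has cleared pd_fp, which the first ASSERT below verifies.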
+ */ + ASSERT(pdp->pd_fp == NULL); + ASSERT(MUTEX_HELD(&ps->ps_lock)); + pdp->pd_events = 0; + for (i = 0; i < ps->ps_nsets; i++) { + xref_t *refp; + pollcacheset_t *pcsp; + + ASSERT(pdp->pd_ref != NULL); + refp = &pdp->pd_ref[i]; + if (refp->xf_refcnt) { + ASSERT(refp->xf_position >= 0); + pcsp = &ps->ps_pcacheset[i]; + if (refp->xf_refcnt == 1) { + pcsp->pcs_pollfd[refp->xf_position].fd = -1; + refp->xf_refcnt = 0; + pdp->pd_count--; + } else if (refp->xf_refcnt > 1) { + int j; + + /* + * turn off every appearance in pcs_pollfd list + */ + for (j = refp->xf_position; + j < pcsp->pcs_nfds; j++) { + if (pcsp->pcs_pollfd[j].fd == fd) { + pcsp->pcs_pollfd[j].fd = -1; + refp->xf_refcnt--; + pdp->pd_count--; + } + } + } + ASSERT(refp->xf_refcnt == 0); + refp->xf_position = POLLPOSINVAL; + } + } + ASSERT(pdp->pd_count == 0); +} + +/* + * Insert poll fd into the pollcache, and add poll registration. + * This routine is called after getf() and before releasef(). So the vnode + * can not disappear even if we block here. + * If there is an error, the polled fd is not cached. + */ +int +pcache_insert(pollstate_t *ps, file_t *fp, pollfd_t *pollfdp, int *fdcntp, + ssize_t pos, int which) +{ + pollcache_t *pcp = ps->ps_pcache; + polldat_t *pdp; + int error; + int fd; + pollhead_t *memphp = NULL; + xref_t *refp; + int newpollfd = 0; + + ASSERT(MUTEX_HELD(&ps->ps_lock)); + /* + * The poll caching uses the existing VOP_POLL interface. If there + * is no polled events, we want the polled device to set its "some + * one is sleeping in poll" flag. When the polled events happen + * later, the driver will call pollwakeup(). We achieve this by + * always passing 0 in the third parameter ("anyyet") when calling + * VOP_POLL. This parameter is not looked at by drivers when the + * polled events exist. If a driver chooses to ignore this parameter + * and call pollwakeup whenever the polled events happen, that will + * be OK too. + */ + ASSERT(curthread->t_pollcache == NULL); + error = VOP_POLL(fp->f_vnode, pollfdp->events, 0, &pollfdp->revents, + &memphp); + if (error) { + return (error); + } + if (pollfdp->revents) { + (*fdcntp)++; + } + /* + * polling the underlying device succeeded. Now we can cache it. + * A close can't come in here because we have not done a releasef() + * yet. + */ + fd = pollfdp->fd; + pdp = pcache_lookup_fd(pcp, fd); + if (pdp == NULL) { + ASSERT(ps->ps_nsets > 0); + pdp = pcache_alloc_fd(ps->ps_nsets); + newpollfd = 1; + } + /* + * If this entry was used to cache a poll fd which was closed, and + * this entry has not been cleaned, do it now. + */ + if ((pdp->pd_count > 0) && (pdp->pd_fp == NULL)) { + pcacheset_invalidate(ps, pdp); + ASSERT(pdp->pd_next == NULL); + } + if (pdp->pd_count == 0) { + pdp->pd_fd = fd; + pdp->pd_fp = fp; + addfpollinfo(fd); + pdp->pd_thread = curthread; + pdp->pd_pcache = pcp; + /* + * the entry is never used or cleared by removing a cached + * pollfd (pcache_delete_fd). So all the fields should be clear. + */ + ASSERT(pdp->pd_next == NULL); + } + + /* + * A polled fd is considered cached. So there should be a fpollinfo + * entry on uf_fpollinfo list. + */ + ASSERT(infpollinfo(fd)); + /* + * If there is an inconsistency, we want to know it here. + */ + ASSERT(pdp->pd_fp == fp); + + /* + * XXX pd_events is a union of all polled events on this fd, possibly + * by different threads. Unless this is a new first poll(), pd_events + * never shrinks. If an event is no longer polled by a process, there + * is no way to cancel that event. 
In that case, poll degrade to its + * old form -- polling on this fd every time poll() is called. The + * assumption is an app always polls the same type of events. + */ + pdp->pd_events |= pollfdp->events; + + pdp->pd_count++; + /* + * There is not much special handling for multiple appearances of + * same fd other than xf_position always recording the first + * appearance in poll list. If this is called from pcacheset_cache_list, + * a VOP_POLL is called on every pollfd entry; therefore each + * revents and fdcnt should be set correctly. If this is called from + * pcacheset_resolve, we don't care about fdcnt here. Pollreadmap will + * pick up the right count and handle revents field of each pollfd + * entry. + */ + ASSERT(pdp->pd_ref != NULL); + refp = &pdp->pd_ref[which]; + if (refp->xf_refcnt == 0) { + refp->xf_position = pos; + } else { + /* + * xf_position records the fd's first appearance in poll list + */ + if (pos < refp->xf_position) { + refp->xf_position = pos; + } + } + ASSERT(pollfdp->fd == ps->ps_pollfd[refp->xf_position].fd); + refp->xf_refcnt++; + if (fd >= pcp->pc_mapsize) { + pcache_grow_map(pcp, fd); + } + if (fd > pcp->pc_mapend) { + pcp->pc_mapend = fd; + } + if (newpollfd != 0) { + pcache_insert_fd(ps->ps_pcache, pdp, ps->ps_nfds); + } + if (memphp) { + if (pdp->pd_php == NULL) { + pollhead_insert(memphp, pdp); + pdp->pd_php = memphp; + } else { + if (memphp != pdp->pd_php) { + /* + * layered devices (e.g. console driver) + * may change the vnode and thus the pollhead + * pointer out from underneath us. + */ + pollhead_delete(pdp->pd_php, pdp); + pollhead_insert(memphp, pdp); + pdp->pd_php = memphp; + } + } + } + /* + * Since there is a considerable window between VOP_POLL and when + * we actually put the polldat struct on the pollhead list, we could + * miss a pollwakeup. In the case of polling additional events, we + * don't update the events until after VOP_POLL. So we could miss + * pollwakeup there too. So we always set the bit here just to be + * safe. The real performance gain is in subsequent pcache_poll. + */ + mutex_enter(&pcp->pc_lock); + BT_SET(pcp->pc_bitmap, fd); + mutex_exit(&pcp->pc_lock); + return (0); +} + +/* + * The entry is not really deleted. The fields are cleared so that the + * entry is no longer useful, but it will remain in the hash table for reuse + * later. It will be freed when the polling lwp exits. + */ +int +pcache_delete_fd(pollstate_t *ps, int fd, size_t pos, int which, uint_t cevent) +{ + pollcache_t *pcp = ps->ps_pcache; + polldat_t *pdp; + xref_t *refp; + + ASSERT(fd < pcp->pc_mapsize); + ASSERT(MUTEX_HELD(&ps->ps_lock)); + + pdp = pcache_lookup_fd(pcp, fd); + ASSERT(pdp != NULL); + ASSERT(pdp->pd_count > 0); + ASSERT(pdp->pd_ref != NULL); + refp = &pdp->pd_ref[which]; + if (pdp->pd_count == 1) { + pdp->pd_events = 0; + refp->xf_position = POLLPOSINVAL; + ASSERT(refp->xf_refcnt == 1); + refp->xf_refcnt = 0; + if (pdp->pd_php) { + /* + * It is possible for a wakeup thread to get ahead + * of the following pollhead_delete and set the bit in + * bitmap. It is OK because the bit will be cleared + * here anyway. + */ + pollhead_delete(pdp->pd_php, pdp); + pdp->pd_php = NULL; + } + pdp->pd_count = 0; + if (pdp->pd_fp != NULL) { + pdp->pd_fp = NULL; + delfpollinfo(fd); + } + mutex_enter(&pcp->pc_lock); + BT_CLEAR(pcp->pc_bitmap, fd); + mutex_exit(&pcp->pc_lock); + return (0); + } + if ((cevent & POLLCLOSED) == POLLCLOSED) { + /* + * fd cached here has been closed. This is the first + * pcache_delete_fd called after the close. 
Clean up the + * entire entry. + */ + pcacheset_invalidate(ps, pdp); + ASSERT(pdp->pd_php == NULL); + mutex_enter(&pcp->pc_lock); + BT_CLEAR(pcp->pc_bitmap, fd); + mutex_exit(&pcp->pc_lock); + return (0); + } +#ifdef DEBUG + if (getf(fd) != NULL) { + ASSERT(infpollinfo(fd)); + releasef(fd); + } +#endif /* DEBUG */ + pdp->pd_count--; + ASSERT(refp->xf_refcnt > 0); + if (--refp->xf_refcnt == 0) { + refp->xf_position = POLLPOSINVAL; + } else { + ASSERT(pos >= refp->xf_position); + if (pos == refp->xf_position) { + /* + * The xref position is no longer valid. + * Reset it to a special value and let + * caller know it needs to updatexref() + * with a new xf_position value. + */ + refp->xf_position = POLLPOSTRANS; + return (1); + } + } + return (0); +} + +void +pcache_update_xref(pollcache_t *pcp, int fd, ssize_t pos, int which) +{ + polldat_t *pdp; + + pdp = pcache_lookup_fd(pcp, fd); + ASSERT(pdp != NULL); + ASSERT(pdp->pd_ref != NULL); + pdp->pd_ref[which].xf_position = pos; +} + +#ifdef DEBUG +/* + * For each polled fd, it's either in the bitmap or cached in + * pcache hash table. If this routine returns 0, something is wrong. + */ +static int +pollchecksanity(pollstate_t *ps, nfds_t nfds) +{ + int i; + int fd; + pollcache_t *pcp = ps->ps_pcache; + polldat_t *pdp; + pollfd_t *pollfdp = ps->ps_pollfd; + file_t *fp; + + ASSERT(MUTEX_HELD(&ps->ps_lock)); + for (i = 0; i < nfds; i++) { + fd = pollfdp[i].fd; + if (fd < 0) { + ASSERT(pollfdp[i].revents == 0); + continue; + } + if (pollfdp[i].revents == POLLNVAL) + continue; + if ((fp = getf(fd)) == NULL) + continue; + pdp = pcache_lookup_fd(pcp, fd); + ASSERT(pdp != NULL); + ASSERT(infpollinfo(fd)); + ASSERT(pdp->pd_fp == fp); + releasef(fd); + if (BT_TEST(pcp->pc_bitmap, fd)) + continue; + if (pdp->pd_php == NULL) + return (0); + } + return (1); +} +#endif /* DEBUG */ + +/* + * resolve the difference between the current poll list and a cached one. + */ +int +pcacheset_resolve(pollstate_t *ps, nfds_t nfds, int *fdcntp, int which) +{ + int i; + pollcache_t *pcp = ps->ps_pcache; + pollfd_t *newlist = NULL; + pollfd_t *current = ps->ps_pollfd; + pollfd_t *cached; + pollcacheset_t *pcsp; + int common; + int count = 0; + int offset; + int remain; + int fd; + file_t *fp; + int fdcnt = 0; + int cnt = 0; + nfds_t old_nfds; + int error = 0; + int mismatch = 0; + + ASSERT(MUTEX_HELD(&ps->ps_lock)); +#ifdef DEBUG + checkpolldat(ps); +#endif + pcsp = &ps->ps_pcacheset[which]; + old_nfds = pcsp->pcs_nfds; + common = (nfds > old_nfds) ? old_nfds : nfds; + if (nfds != old_nfds) { + /* + * the length of poll list has changed. allocate a new + * pollfd list. + */ + newlist = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP); + bcopy(current, newlist, sizeof (pollfd_t) * nfds); + } + /* + * Compare the overlapping part of the current fd list with the + * cached one. Whenever a difference is found, resolve it. + * The comparison is done on the current poll list and the + * cached list. But we may be setting up the newlist to be the + * cached list for next poll. + */ + cached = pcsp->pcs_pollfd; + remain = common; + + while (count < common) { + int tmpfd; + pollfd_t *np; + + np = (newlist != NULL) ? &newlist[count] : NULL; + offset = pcacheset_cmp(¤t[count], &cached[count], np, + remain); + /* + * Collect stats. If lists are completed the first time, + * it's a hit. Otherwise, it's a partial hit or miss. 
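pcacheset_cmp itself is defined elsewhere; conceptually it reports how far the current and the cached poll lists agree before the first difference. An illustrative user-land analog, not the actual routine:

	#include <poll.h>
	#include <stddef.h>

	/*
	 * Count how many leading entries of the current and cached poll lists
	 * match on both fd and events. A return value equal to 'common' on the
	 * first comparison corresponds to the cache-hit case counted above.
	 */
	static size_t
	pollfd_match_prefix(const struct pollfd *cur, const struct pollfd *cached,
	    size_t common)
	{
		size_t i;

		for (i = 0; i < common; i++) {
			if (cur[i].fd != cached[i].fd ||
			    cur[i].events != cached[i].events)
				break;
		}
		return (i);
	}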
+ */ + if ((count == 0) && (offset == common)) { + pollstats.pollcachehit.value.ui64++; + } else { + mismatch++; + } + count += offset; + if (offset < remain) { + ASSERT(count < common); + ASSERT((current[count].fd != cached[count].fd) || + (current[count].events != cached[count].events)); + /* + * Filter out invalid events. + */ + if (current[count].events & ~VALID_POLL_EVENTS) { + if (newlist != NULL) { + newlist[count].events = + current[count].events &= + VALID_POLL_EVENTS; + } else { + current[count].events &= + VALID_POLL_EVENTS; + } + } + /* + * when resolving a difference, we always remove the + * fd from cache before inserting one into cache. + */ + if (cached[count].fd >= 0) { + tmpfd = cached[count].fd; + if (pcache_delete_fd(ps, tmpfd, count, which, + (uint_t)cached[count].events)) { + /* + * This should be rare but needed for + * correctness. + * + * The first appearance in cached list + * is being "turned off". The same fd + * appear more than once in the cached + * poll list. Find the next one on the + * list and update the cached + * xf_position field. + */ + for (i = count + 1; i < old_nfds; i++) { + if (cached[i].fd == tmpfd) { + pcache_update_xref(pcp, + tmpfd, (ssize_t)i, + which); + break; + } + } + ASSERT(i <= old_nfds); + } + /* + * In case a new cache list is allocated, + * need to keep both cache lists in sync + * b/c the new one can be freed if we have + * an error later. + */ + cached[count].fd = -1; + if (newlist != NULL) { + newlist[count].fd = -1; + } + } + if ((tmpfd = current[count].fd) >= 0) { + /* + * add to the cached fd tbl and bitmap. + */ + if ((fp = getf(tmpfd)) == NULL) { + current[count].revents = POLLNVAL; + if (newlist != NULL) { + newlist[count].fd = -1; + } + cached[count].fd = -1; + fdcnt++; + } else { + /* + * Here we don't care about the + * fdcnt. We will examine the bitmap + * later and pick up the correct + * fdcnt there. So we never bother + * to check value of 'cnt'. + */ + error = pcache_insert(ps, fp, + ¤t[count], &cnt, + (ssize_t)count, which); + /* + * if no error, we want to do releasef + * after we updated cache poll list + * entry so that close() won't race + * us. + */ + if (error) { + /* + * If we encountered an error, + * we have invalidated an + * entry in cached poll list + * (in pcache_delete_fd() above) + * but failed to add one here. + * This is OK b/c what's in the + * cached list is consistent + * with content of cache. + * It will not have any ill + * effect on next poll(). + */ + releasef(tmpfd); + if (newlist != NULL) { + kmem_free(newlist, + nfds * + sizeof (pollfd_t)); + } + return (error); + } + /* + * If we have allocated a new(temp) + * cache list, we need to keep both + * in sync b/c the new one can be freed + * if we have an error later. 
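The POLLNVAL handling above matches what user code observes: poll(2) reports a closed or otherwise invalid fd in that entry's revents rather than failing the whole call. A small user-land demonstration:

	#include <poll.h>
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		int fds[2];
		struct pollfd pfd;

		(void) pipe(fds);
		pfd.fd = fds[0];
		pfd.events = POLLIN;
		(void) close(fds[0]);		/* the fd is now stale */

		/* poll() flags the closed fd instead of failing outright. */
		if (poll(&pfd, 1, 0) == 1 && (pfd.revents & POLLNVAL))
			printf("stale fd reported as POLLNVAL\n");
		return (0);
	}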
+ */ + if (newlist != NULL) { + newlist[count].fd = + current[count].fd; + newlist[count].events = + current[count].events; + } + cached[count].fd = current[count].fd; + cached[count].events = + current[count].events; + releasef(tmpfd); + } + } else { + current[count].revents = 0; + } + count++; + remain = common - count; + } + } + if (mismatch != 0) { + if (mismatch == common) { + pollstats.pollcachemiss.value.ui64++; + } else { + pollstats.pollcachephit.value.ui64++; + } + } + /* + * take care of the non overlapping part of a list + */ + if (nfds > old_nfds) { + ASSERT(newlist != NULL); + for (i = old_nfds; i < nfds; i++) { + /* filter out invalid events */ + if (current[i].events & ~VALID_POLL_EVENTS) { + newlist[i].events = current[i].events = + current[i].events & VALID_POLL_EVENTS; + } + if ((fd = current[i].fd) < 0) { + current[i].revents = 0; + continue; + } + /* + * add to the cached fd tbl and bitmap. + */ + if ((fp = getf(fd)) == NULL) { + current[i].revents = POLLNVAL; + newlist[i].fd = -1; + fdcnt++; + continue; + } + /* + * Here we don't care about the + * fdcnt. We will examine the bitmap + * later and pick up the correct + * fdcnt there. So we never bother to + * check 'cnt'. + */ + error = pcache_insert(ps, fp, ¤t[i], &cnt, + (ssize_t)i, which); + releasef(fd); + if (error) { + /* + * Here we are half way through adding newly + * polled fd. Undo enough to keep the cache + * list consistent with the cache content. + */ + pcacheset_remove_list(ps, current, old_nfds, + i, which, 0); + kmem_free(newlist, nfds * sizeof (pollfd_t)); + return (error); + } + } + } + if (old_nfds > nfds) { + /* + * remove the fd's which are no longer polled. + */ + pcacheset_remove_list(ps, pcsp->pcs_pollfd, nfds, old_nfds, + which, 1); + } + /* + * set difference resolved. update nfds and cachedlist + * in pollstate struct. + */ + if (newlist != NULL) { + kmem_free(pcsp->pcs_pollfd, old_nfds * sizeof (pollfd_t)); + /* + * By now, the pollfd.revents field should + * all be zeroed. + */ + pcsp->pcs_pollfd = newlist; + pcsp->pcs_nfds = nfds; + } + ASSERT(*fdcntp == 0); + *fdcntp = fdcnt; + /* + * By now for every fd in pollfdp, one of the following should be + * true. Otherwise we will miss a polled event. + * + * 1. the bit corresponding to the fd in bitmap is set. So VOP_POLL + * will be called on this fd in next poll. + * 2. the fd is cached in the pcache (i.e. pd_php is set). So + * pollnotify will happen. + */ + ASSERT(pollchecksanity(ps, nfds)); + /* + * make sure cross reference between cached poll lists and cached + * poll fds are correct. + */ + ASSERT(pollcheckxref(ps, which)); + /* + * ensure each polldat in pollcache reference a polled fd in + * pollcacheset. + */ +#ifdef DEBUG + checkpolldat(ps); +#endif + return (0); +} + +#ifdef DEBUG +static int +pollscanrevents(pollcache_t *pcp, pollfd_t *pollfdp, nfds_t nfds) +{ + int i; + int reventcnt = 0; + + for (i = 0; i < nfds; i++) { + if (pollfdp[i].fd < 0) { + ASSERT(pollfdp[i].revents == 0); + continue; + } + if (pollfdp[i].revents) { + reventcnt++; + } + if (pollfdp[i].revents && (pollfdp[i].revents != POLLNVAL)) { + ASSERT(BT_TEST(pcp->pc_bitmap, pollfdp[i].fd)); + } + } + return (reventcnt); +} +#endif /* DEBUG */ + +/* + * read the bitmap and poll on fds corresponding to the '1' bits. The ps_lock + * is held upon entry. 
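The bitmap operations used below (BT_SET, BT_TEST, bt_getlowbit) behave like the following user-land sketch; the real kernel versions scan a word at a time, while the loop here is the naive bit-by-bit equivalent:

	#include <limits.h>
	#include <stdio.h>

	#define	NBIPUL		(sizeof (unsigned long) * CHAR_BIT)
	#define	BT_SET(map, b)	((map)[(b) / NBIPUL] |= 1UL << ((b) % NBIPUL))
	#define	BT_TEST(map, b)	(((map)[(b) / NBIPUL] >> ((b) % NBIPUL)) & 1UL)

	/* Return the lowest set bit in [begin, end], or -1 if none. */
	static int
	getlowbit(const unsigned long *map, int begin, int end)
	{
		int b;

		for (b = begin; b <= end; b++)
			if (BT_TEST(map, b))
				return (b);
		return (-1);
	}

	int
	main(void)
	{
		unsigned long map[4] = { 0 };	/* at least 128 bits on any platform */

		BT_SET(map, 5);
		BT_SET(map, 70);
		printf("%d %d\n", getlowbit(map, 0, 127), getlowbit(map, 6, 127));
		return (0);	/* prints "5 70" */
	}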
+ */ +int +pcache_poll(pollfd_t *pollfdp, pollstate_t *ps, nfds_t nfds, int *fdcntp, + int which) +{ + int i; + pollcache_t *pcp; + int fd; + int begin, end, done; + pollhead_t *php; + int fdcnt; + int error = 0; + file_t *fp; + polldat_t *pdp; + xref_t *refp; + int entry; + + pcp = ps->ps_pcache; + ASSERT(MUTEX_HELD(&ps->ps_lock)); + ASSERT(MUTEX_HELD(&pcp->pc_lock)); +retry: + done = 0; + begin = 0; + fdcnt = 0; + end = pcp->pc_mapend; + while ((fdcnt < nfds) && !done) { + php = NULL; + /* + * only poll fds which may have events + */ + fd = bt_getlowbit(pcp->pc_bitmap, begin, end); + ASSERT(fd <= end); + if (fd >= 0) { + ASSERT(pollcheckrevents(ps, begin, fd, which)); + /* + * adjust map pointers for next round + */ + if (fd == end) { + done = 1; + } else { + begin = fd + 1; + } + /* + * A bitmap caches poll state information of + * multiple poll lists. Call VOP_POLL only if + * the bit corresponds to an fd in this poll + * list. + */ + pdp = pcache_lookup_fd(pcp, fd); + ASSERT(pdp != NULL); + ASSERT(pdp->pd_ref != NULL); + refp = &pdp->pd_ref[which]; + if (refp->xf_refcnt == 0) + continue; + entry = refp->xf_position; + ASSERT((entry >= 0) && (entry < nfds)); + ASSERT(pollfdp[entry].fd == fd); + /* + * we are in this routine implies that we have + * successfully polled this fd in the past. + * Check to see this fd is closed while we are + * blocked in poll. This ensures that we don't + * miss a close on the fd in the case this fd is + * reused. + */ + if (pdp->pd_fp == NULL) { + ASSERT(pdp->pd_count > 0); + pollfdp[entry].revents = POLLNVAL; + fdcnt++; + if (refp->xf_refcnt > 1) { + /* + * this fd appeared multiple time + * in the poll list. Find all of them. + */ + for (i = entry + 1; i < nfds; i++) { + if (pollfdp[i].fd == fd) { + pollfdp[i].revents = + POLLNVAL; + fdcnt++; + } + } + } + pcacheset_invalidate(ps, pdp); + continue; + } + /* + * We can be here polling a device that is being + * closed (i.e. the file pointer is set to NULL, + * but pollcacheclean has not happened yet). + */ + if ((fp = getf(fd)) == NULL) { + pollfdp[entry].revents = POLLNVAL; + fdcnt++; + if (refp->xf_refcnt > 1) { + /* + * this fd appeared multiple time + * in the poll list. Find all of them. + */ + for (i = entry + 1; i < nfds; i++) { + if (pollfdp[i].fd == fd) { + pollfdp[i].revents = + POLLNVAL; + fdcnt++; + } + } + } + continue; + } + ASSERT(pdp->pd_fp == fp); + ASSERT(infpollinfo(fd)); + /* + * Since we no longer hold poll head lock across + * VOP_POLL, pollunlock logic can be simplifed. + */ + ASSERT(pdp->pd_php == NULL || + MUTEX_NOT_HELD(PHLOCK(pdp->pd_php))); + /* + * underlying file systems may set a "pollpending" + * flag when it sees the poll may block. Pollwakeup() + * is called by wakeup thread if pollpending is set. + * Pass a 0 fdcnt so that the underlying file system + * will set the "pollpending" flag set when there is + * no polled events. + * + * Use pollfdp[].events for actual polling because + * the pd_events is union of all cached poll events + * on this fd. The events parameter also affects + * how the polled device sets the "poll pending" + * flag. + */ + ASSERT(curthread->t_pollcache == NULL); + error = VOP_POLL(fp->f_vnode, pollfdp[entry].events, 0, + &pollfdp[entry].revents, &php); + /* + * releasef after completely done with this cached + * poll entry. To prevent close() coming in to clear + * this entry. + */ + if (error) { + releasef(fd); + break; + } + /* + * layered devices (e.g. 
console driver) + * may change the vnode and thus the pollhead + * pointer out from underneath us. + */ + if (php != NULL && pdp->pd_php != NULL && + php != pdp->pd_php) { + releasef(fd); + pollhead_delete(pdp->pd_php, pdp); + pdp->pd_php = php; + pollhead_insert(php, pdp); + /* + * We could have missed a wakeup on the new + * target device. Make sure the new target + * gets polled once. + */ + BT_SET(pcp->pc_bitmap, fd); + goto retry; + } + + if (pollfdp[entry].revents) { + ASSERT(refp->xf_refcnt >= 1); + fdcnt++; + if (refp->xf_refcnt > 1) { + /* + * this fd appeared multiple time + * in the poll list. This is rare but + * we have to look at all of them for + * correctness. + */ + error = plist_chkdupfd(fp, pdp, ps, + pollfdp, entry, &fdcnt); + if (error > 0) { + releasef(fd); + break; + } + if (error < 0) { + goto retry; + } + } + releasef(fd); + } else { + /* + * VOP_POLL didn't return any revents. We can + * clear the bit in bitmap only if we have the + * pollhead ptr cached and no other cached + * entry is polling different events on this fd. + * VOP_POLL may have dropped the ps_lock. Make + * sure pollwakeup has not happened before clear + * the bit. + */ + if ((pdp->pd_php != NULL) && + (pollfdp[entry].events == pdp->pd_events) && + ((pcp->pc_flag & T_POLLWAKE) == 0)) { + BT_CLEAR(pcp->pc_bitmap, fd); + } + /* + * if the fd can be cached now but not before, + * do it now. + */ + if ((pdp->pd_php == NULL) && (php != NULL)) { + pdp->pd_php = php; + pollhead_insert(php, pdp); + /* + * We are inserting a polldat struct for + * the first time. We may have missed a + * wakeup on this device. Re-poll once. + * This should be a rare event. + */ + releasef(fd); + goto retry; + } + if (refp->xf_refcnt > 1) { + /* + * this fd appeared multiple time + * in the poll list. This is rare but + * we have to look at all of them for + * correctness. + */ + error = plist_chkdupfd(fp, pdp, ps, + pollfdp, entry, &fdcnt); + if (error > 0) { + releasef(fd); + break; + } + if (error < 0) { + goto retry; + } + } + releasef(fd); + } + } else { + done = 1; + ASSERT(pollcheckrevents(ps, begin, end + 1, which)); + } + } + if (!error) { + ASSERT(*fdcntp + fdcnt == pollscanrevents(pcp, pollfdp, nfds)); + *fdcntp += fdcnt; + } + return (error); +} + +/* + * Going through the poll list without much locking. Poll all fds and + * cache all valid fds in the pollcache. + */ +int +pcacheset_cache_list(pollstate_t *ps, pollfd_t *fds, int *fdcntp, int which) +{ + pollfd_t *pollfdp = ps->ps_pollfd; + pollcacheset_t *pcacheset = ps->ps_pcacheset; + pollfd_t *newfdlist; + int i; + int fd; + file_t *fp; + int error = 0; + + ASSERT(MUTEX_HELD(&ps->ps_lock)); + ASSERT(which < ps->ps_nsets); + ASSERT(pcacheset != NULL); + ASSERT(pcacheset[which].pcs_pollfd == NULL); + newfdlist = kmem_alloc(ps->ps_nfds * sizeof (pollfd_t), KM_SLEEP); + /* + * cache the new poll list in pollcachset. + */ + bcopy(pollfdp, newfdlist, sizeof (pollfd_t) * ps->ps_nfds); + + pcacheset[which].pcs_pollfd = newfdlist; + pcacheset[which].pcs_nfds = ps->ps_nfds; + pcacheset[which].pcs_usradr = (uintptr_t)fds; + + /* + * We have saved a copy of current poll fd list in one pollcacheset. + * The 'revents' field of the new list is not yet set to 0. Loop + * through the new list just to do that is expensive. We do that + * while polling the list. + */ + for (i = 0; i < ps->ps_nfds; i++) { + fd = pollfdp[i].fd; + /* + * We also filter out the illegal poll events in the event + * field for the cached poll list/set. 
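The filtering is a simple mask, applied to the live entry and its cached copy together so the two lists never disagree. A sketch, with a hypothetical stand-in for the kernel's VALID_POLL_EVENTS mask:

	#include <poll.h>

	/* Hypothetical subset of the bits a caller may legally request. */
	#define	VALID_EVENTS	(POLLIN | POLLPRI | POLLOUT | POLLRDNORM | \
				POLLRDBAND | POLLWRBAND)

	/*
	 * Strip unsupported event bits, updating the live entry and its
	 * cached copy in lockstep.
	 */
	static void
	sanitize_entry(struct pollfd *cur, struct pollfd *cached)
	{
		cur->events &= VALID_EVENTS;
		if (cached != NULL)
			cached->events = cur->events;
	}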
+ */
+		if (pollfdp[i].events & ~VALID_POLL_EVENTS) {
+			newfdlist[i].events = pollfdp[i].events =
+			    pollfdp[i].events & VALID_POLL_EVENTS;
+		}
+		if (fd < 0) {
+			pollfdp[i].revents = 0;
+			continue;
+		}
+		if ((fp = getf(fd)) == NULL) {
+			pollfdp[i].revents = POLLNVAL;
+			/*
+			 * invalidate this cache entry in the cached poll list
+			 */
+			newfdlist[i].fd = -1;
+			(*fdcntp)++;
+			continue;
+		}
+		/*
+		 * cache this fd.
+		 */
+		error = pcache_insert(ps, fp, &pollfdp[i], fdcntp, (ssize_t)i,
+		    which);
+		releasef(fd);
+		if (error) {
+			/*
+			 * Here we are half way through caching a new
+			 * poll list. Undo everything.
+			 */
+			pcacheset_remove_list(ps, pollfdp, 0, i, which, 0);
+			kmem_free(newfdlist, ps->ps_nfds * sizeof (pollfd_t));
+			pcacheset[which].pcs_pollfd = NULL;
+			pcacheset[which].pcs_usradr = NULL;
+			break;
+		}
+	}
+	return (error);
+}
+
+/*
+ * Called by pollcacheclean() to set the fp to NULL. It also sets the polled
+ * events in pcacheset entries to the special event 'POLLCLOSED'. Do a
+ * pollwakeup to wake any sleeping poller, then remove the polldat from the
+ * driver. The routine is called with ps_pcachelock held.
+ */
+void
+pcache_clean_entry(pollstate_t *ps, int fd)
+{
+	pollcache_t	*pcp;
+	polldat_t	*pdp;
+	int		i;
+
+	ASSERT(ps != NULL);
+	ASSERT(MUTEX_HELD(&ps->ps_lock));
+	pcp = ps->ps_pcache;
+	ASSERT(pcp);
+	pdp = pcache_lookup_fd(pcp, fd);
+	ASSERT(pdp != NULL);
+	/*
+	 * The corresponding fpollinfo in fi_list has been removed by
+	 * a close on this fd. Reset the cached fp ptr here.
+	 */
+	pdp->pd_fp = NULL;
+	/*
+	 * XXX - This routine also touches data in pcacheset struct.
+	 *
+	 * Set the event in cached poll lists to POLLCLOSED. This invalidates
+	 * the cached poll fd entry in that poll list, which will force a
+	 * removal of this cached entry in the next poll(). The cleanup is
+	 * done at the removal time.
+	 */
+	ASSERT(pdp->pd_ref != NULL);
+	for (i = 0; i < ps->ps_nsets; i++) {
+		xref_t *refp;
+		pollcacheset_t *pcsp;
+
+		refp = &pdp->pd_ref[i];
+		if (refp->xf_refcnt) {
+			ASSERT(refp->xf_position >= 0);
+			pcsp = &ps->ps_pcacheset[i];
+			if (refp->xf_refcnt == 1) {
+				pcsp->pcs_pollfd[refp->xf_position].events =
+				    (short)POLLCLOSED;
+			}
+			if (refp->xf_refcnt > 1) {
+				int	j;
+				/*
+				 * mark every matching entry in pcs_pollfd
+				 */
+				for (j = refp->xf_position;
+				    j < pcsp->pcs_nfds; j++) {
+					if (pcsp->pcs_pollfd[j].fd == fd) {
+						pcsp->pcs_pollfd[j].events =
+						    (short)POLLCLOSED;
+					}
+				}
+			}
+		}
+	}
+	if (pdp->pd_php) {
+		pollwakeup(pdp->pd_php, POLLHUP);
+		pollhead_delete(pdp->pd_php, pdp);
+		pdp->pd_php = NULL;
+	}
+}
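The pollstate created below is a lazily allocated per-thread structure that lives until the thread exits. A user-land analog using POSIX thread-specific data (all names hypothetical):

	#include <pthread.h>
	#include <stdlib.h>

	/* User-land analog of t_pollstate: one lazily created struct per thread. */
	typedef struct pollstate { int nsets; } pollstate_t;

	static pthread_key_t ps_key;
	static pthread_once_t ps_once = PTHREAD_ONCE_INIT;

	static void
	ps_destroy(void *p)
	{
		free(p);		/* runs automatically at thread exit */
	}

	static void
	ps_keycreate(void)
	{
		(void) pthread_key_create(&ps_key, ps_destroy);
	}

	static pollstate_t *
	pollstate_get(void)
	{
		pollstate_t *ps;

		(void) pthread_once(&ps_once, ps_keycreate);
		if ((ps = pthread_getspecific(ps_key)) == NULL) {
			ps = calloc(1, sizeof (*ps));	/* first poll by this thread */
			(void) pthread_setspecific(ps_key, ps);
		}
		return (ps);
	}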
+/*
+ * This is the first time this thread has ever polled,
+ * so we have to create its pollstate structure.
+ * This will persist for the life of the thread,
+ * until it calls pollcleanup().
+ */
+pollstate_t *
+pollstate_create(void)
+{
+	pollstate_t *ps;
+
+	ps = kmem_zalloc(sizeof (pollstate_t), KM_SLEEP);
+	ps->ps_nsets = POLLFDSETS;
+	ps->ps_pcacheset = pcacheset_create(ps->ps_nsets);
+	return (ps);
+}
+
+void
+pollstate_destroy(pollstate_t *ps)
+{
+	if (ps->ps_pollfd != NULL) {
+		kmem_free(ps->ps_pollfd, ps->ps_nfds * sizeof (pollfd_t));
+		ps->ps_pollfd = NULL;
+	}
+	if (ps->ps_pcache != NULL) {
+		pcache_destroy(ps->ps_pcache);
+		ps->ps_pcache = NULL;
+	}
+	pcacheset_destroy(ps->ps_pcacheset, ps->ps_nsets);
+	ps->ps_pcacheset = NULL;
+	if (ps->ps_dpbuf != NULL) {
+		kmem_free(ps->ps_dpbuf, ps->ps_dpbufsize * sizeof (pollfd_t));
+		ps->ps_dpbuf = NULL;
+	}
+	mutex_destroy(&ps->ps_lock);
+	kmem_free(ps, sizeof (pollstate_t));
+}
+
+/*
+ * We are holding the appropriate uf_lock entering this routine.
+ * Bump up the ps_busy count to prevent the thread from exiting.
+ */
+void
+pollblockexit(fpollinfo_t *fpip)
+{
+	for (; fpip; fpip = fpip->fp_next) {
+		pollcache_t *pcp = fpip->fp_thread->t_pollstate->ps_pcache;
+
+		mutex_enter(&pcp->pc_no_exit);
+		pcp->pc_busy++;	/* prevents exit()'s */
+		mutex_exit(&pcp->pc_no_exit);
+	}
+}
+
+/*
+ * Complete phase 2 of cached poll fd cleanup. Call pcache_clean_entry to mark
+ * the pcacheset events field POLLCLOSED to force the next poll() to remove
+ * this cache entry. We can't clean up the polldat entry here because an lwp
+ * blocked in poll() needs the info to return. Wake up anyone blocked in
+ * poll and let the exiting lwp go. No lock is held upon entry, so it's OK
+ * for pcache_clean_entry to call pollwakeup().
+ */
+void
+pollcacheclean(fpollinfo_t *fip, int fd)
+{
+	struct fpollinfo	*fpip, *fpip2;
+
+	fpip = fip;
+	while (fpip) {
+		pollstate_t *ps = fpip->fp_thread->t_pollstate;
+		pollcache_t *pcp = ps->ps_pcache;
+
+		mutex_enter(&ps->ps_lock);
+		pcache_clean_entry(ps, fd);
+		mutex_exit(&ps->ps_lock);
+		mutex_enter(&pcp->pc_no_exit);
+		pcp->pc_busy--;
+		if (pcp->pc_busy == 0) {
+			/*
+			 * Wakeup the thread waiting in
+			 * thread_exit().
+			 */
+			cv_signal(&pcp->pc_busy_cv);
+		}
+		mutex_exit(&pcp->pc_no_exit);
+
+		fpip2 = fpip;
+		fpip = fpip->fp_next;
+		kmem_free(fpip2, sizeof (fpollinfo_t));
+	}
+}
+
+/*
+ * One of the cache line counters is wrapping around. Reset all cache line
+ * counters to zero except one. This is simplistic, but probably works
+ * effectively.
+ */
+void
+pcacheset_reset_count(pollstate_t *ps, int index)
+{
+	int	i;
+
+	ASSERT(MUTEX_HELD(&ps->ps_lock));
+	for (i = 0; i < ps->ps_nsets; i++) {
+		if (ps->ps_pcacheset[i].pcs_pollfd != NULL) {
+			ps->ps_pcacheset[i].pcs_count = 0;
+		}
+	}
+	ps->ps_pcacheset[index].pcs_count = 1;
+}
+
+/*
+ * This routine implements the poll cache list replacement policy.
+ * It currently chooses the "least used" entry.
+ */
+int
+pcacheset_replace(pollstate_t *ps)
+{
+	int	i;
+	int	index = 0;
+
+	ASSERT(MUTEX_HELD(&ps->ps_lock));
+	for (i = 1; i < ps->ps_nsets; i++) {
+		if (ps->ps_pcacheset[index].pcs_count >
+		    ps->ps_pcacheset[i].pcs_count) {
+			index = i;
+		}
+	}
+	ps->ps_pcacheset[index].pcs_count = 0;
+	return (index);
+}
+
+/*
+ * This routine is called by strclose to remove remaining polldat structs on
+ * the pollhead list of the device being closed. There are two reasons why
+ * the polldat structures still remain on the pollhead list:
+ *
+ * (1) The layered device (e.g. the console driver).
+ * In this case, the existence of a polldat implies that the thread putting
+ * the polldat on this list has not exited yet.
Before the thread exits, it + * will have to hold this pollhead lock to remove the polldat. So holding the + * pollhead lock here effectively prevents the thread which put the polldat + * on this list from exiting. + * + * (2) /dev/poll. + * When a polled fd is cached in /dev/poll, its polldat will remain on the + * pollhead list if the process has not done a POLLREMOVE before closing the + * polled fd. We just unlink it here. + */ +void +pollhead_clean(pollhead_t *php) +{ + polldat_t *pdp; + + /* + * In case(1), while we must prevent the thread in question from + * exiting, we must also obey the proper locking order, i.e. + * (ps_lock -> phlock). + */ + PH_ENTER(php); + while (php->ph_list != NULL) { + pollstate_t *ps; + pollcache_t *pcp; + + pdp = php->ph_list; + ASSERT(pdp->pd_php == php); + if (pdp->pd_thread == NULL) { + /* + * This is case(2). Since the ph_lock is sufficient + * to synchronize this lwp with any other /dev/poll + * lwp, just unlink the polldat. + */ + php->ph_list = pdp->pd_next; + pdp->pd_php = NULL; + pdp->pd_next = NULL; + continue; + } + ps = pdp->pd_thread->t_pollstate; + ASSERT(ps != NULL); + pcp = pdp->pd_pcache; + ASSERT(pcp != NULL); + mutex_enter(&pcp->pc_no_exit); + pcp->pc_busy++; /* prevents exit()'s */ + mutex_exit(&pcp->pc_no_exit); + /* + * Now get the locks in proper order to avoid deadlock. + */ + PH_EXIT(php); + mutex_enter(&ps->ps_lock); + /* + * while we dropped the pollhead lock, the element could be + * taken off the list already. + */ + PH_ENTER(php); + if (pdp->pd_php == php) { + ASSERT(pdp == php->ph_list); + php->ph_list = pdp->pd_next; + pdp->pd_php = NULL; + pdp->pd_next = NULL; + } + PH_EXIT(php); + mutex_exit(&ps->ps_lock); + mutex_enter(&pcp->pc_no_exit); + pcp->pc_busy--; + if (pcp->pc_busy == 0) { + /* + * Wakeup the thread waiting in + * thread_exit(). + */ + cv_signal(&pcp->pc_busy_cv); + } + mutex_exit(&pcp->pc_no_exit); + PH_ENTER(php); + } + PH_EXIT(php); +} + +/* + * The remove_list is called to cleanup a partially cached 'current' list or + * to remove a partial list which is no longer cached. The flag value of 1 + * indicates the second case. + */ +void +pcacheset_remove_list(pollstate_t *ps, pollfd_t *pollfdp, int start, int end, + int cacheindex, int flag) +{ + int i; + + ASSERT(MUTEX_HELD(&ps->ps_lock)); + for (i = start; i < end; i++) { + if ((pollfdp[i].fd >= 0) && + (flag || !(pollfdp[i].revents & POLLNVAL))) { + if (pcache_delete_fd(ps, pollfdp[i].fd, i, cacheindex, + (uint_t)pollfdp[i].events)) { + int j; + int fd = pollfdp[i].fd; + + for (j = i + 1; j < end; j++) { + if (pollfdp[j].fd == fd) { + pcache_update_xref( + ps->ps_pcache, fd, + (ssize_t)j, cacheindex); + break; + } + } + ASSERT(j <= end); + } + } + } +} + +#ifdef DEBUG + +#include<sys/strsubr.h> +/* + * make sure curthread is not on anyone's pollhead list any more. + */ +static void +pollcheckphlist() +{ + int i; + file_t *fp; + uf_entry_t *ufp; + uf_info_t *fip = P_FINFO(curproc); + struct stdata *stp; + polldat_t *pdp; + + mutex_enter(&fip->fi_lock); + for (i = 0; i < fip->fi_nfiles; i++) { + UF_ENTER(ufp, fip, i); + if ((fp = ufp->uf_file) != NULL) { + if ((stp = fp->f_vnode->v_stream) != NULL) { + PH_ENTER(&stp->sd_pollist); + pdp = stp->sd_pollist.ph_list; + while (pdp) { + ASSERT(pdp->pd_thread != curthread); + pdp = pdp->pd_next; + } + PH_EXIT(&stp->sd_pollist); + } + } + UF_EXIT(ufp); + } + mutex_exit(&fip->fi_lock); +} + +/* + * for resolved set poll list, the xref info in the pcache should be + * consistent with this poll list. 
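The invariant being verified is that xf_refcnt equals the number of appearances of the fd in the list from its first recorded position onward. Expressed as a plain helper:

	#include <poll.h>
	#include <stddef.h>

	/*
	 * Count appearances of fd in list[first..nfds-1]. For a consistent
	 * cache, this must equal the recorded xf_refcnt, with 'first' being
	 * the earliest appearance (the stored xf_position).
	 */
	static int
	count_refs(const struct pollfd *list, size_t nfds, size_t first, int fd)
	{
		size_t j;
		int count = 0;

		for (j = first; j < nfds; j++)
			if (list[j].fd == fd)
				count++;
		return (count);
	}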
+ */ +static int +pollcheckxref(pollstate_t *ps, int cacheindex) +{ + pollfd_t *pollfdp = ps->ps_pcacheset[cacheindex].pcs_pollfd; + pollcache_t *pcp = ps->ps_pcache; + polldat_t *pdp; + int i; + xref_t *refp; + + for (i = 0; i < ps->ps_pcacheset[cacheindex].pcs_nfds; i++) { + if (pollfdp[i].fd < 0) { + continue; + } + pdp = pcache_lookup_fd(pcp, pollfdp[i].fd); + ASSERT(pdp != NULL); + ASSERT(pdp->pd_ref != NULL); + refp = &pdp->pd_ref[cacheindex]; + if (refp->xf_position >= 0) { + ASSERT(refp->xf_refcnt >= 1); + ASSERT(pollfdp[refp->xf_position].fd == pdp->pd_fd); + if (refp->xf_refcnt > 1) { + int j; + int count = 0; + + for (j = refp->xf_position; + j < ps->ps_pcacheset[cacheindex].pcs_nfds; + j++) { + if (pollfdp[j].fd == pdp->pd_fd) { + count++; + } + } + ASSERT(count == refp->xf_refcnt); + } + } + } + return (1); +} + +/* + * For every cached pollfd, its polldat struct should be consistent with + * what is in the pcacheset lists. + */ +static void +checkpolldat(pollstate_t *ps) +{ + pollcache_t *pcp = ps->ps_pcache; + polldat_t **hashtbl; + int i; + + hashtbl = pcp->pc_hash; + for (i = 0; i < pcp->pc_hashsize; i++) { + polldat_t *pdp; + + for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) { + ASSERT(pdp->pd_ref != NULL); + if (pdp->pd_count > 0) { + xref_t *refp; + int j; + pollcacheset_t *pcsp; + pollfd_t *pollfd; + + for (j = 0; j < ps->ps_nsets; j++) { + refp = &pdp->pd_ref[j]; + if (refp->xf_refcnt > 0) { + pcsp = &ps->ps_pcacheset[j]; + ASSERT(refp->xf_position < pcsp->pcs_nfds); + pollfd = pcsp->pcs_pollfd; + ASSERT(pdp->pd_fd == pollfd[refp->xf_position].fd); + } + } + } + } + } +} + +/* + * every wfd element on ph_list must have a corresponding fpollinfo on the + * uf_fpollinfo list. This is a variation of infpollinfo() w/o holding locks. + */ +void +checkwfdlist(vnode_t *vp, fpollinfo_t *fpip) +{ + stdata_t *stp; + polldat_t *pdp; + fpollinfo_t *fpip2; + + if ((stp = vp->v_stream) == NULL) { + return; + } + PH_ENTER(&stp->sd_pollist); + for (pdp = stp->sd_pollist.ph_list; pdp; pdp = pdp->pd_next) { + if (pdp->pd_thread->t_procp == curthread->t_procp) { + for (fpip2 = fpip; fpip2; fpip2 = fpip2->fp_next) { + if (pdp->pd_thread == fpip2->fp_thread) { + break; + } + } + ASSERT(fpip2 != NULL); + } + } + PH_EXIT(&stp->sd_pollist); +} + +/* + * For each cached fd whose bit is not set in bitmap, its revents field in + * current poll list should be 0. 
+ */ +static int +pollcheckrevents(pollstate_t *ps, int begin, int end, int cacheindex) +{ + pollcache_t *pcp = ps->ps_pcache; + pollfd_t *pollfdp = ps->ps_pollfd; + int i; + + for (i = begin; i < end; i++) { + polldat_t *pdp; + + ASSERT(!BT_TEST(pcp->pc_bitmap, i)); + pdp = pcache_lookup_fd(pcp, i); + if (pdp && pdp->pd_fp != NULL) { + xref_t *refp; + int entry; + + ASSERT(pdp->pd_ref != NULL); + refp = &pdp->pd_ref[cacheindex]; + if (refp->xf_refcnt == 0) { + continue; + } + entry = refp->xf_position; + ASSERT(entry >= 0); + ASSERT(pollfdp[entry].revents == 0); + if (refp->xf_refcnt > 1) { + int j; + + for (j = entry + 1; j < ps->ps_nfds; j++) { + if (pollfdp[j].fd == i) { + ASSERT(pollfdp[j].revents == 0); + } + } + } + } + } + return (1); +} + +#endif /* DEBUG */ + +pollcache_t * +pcache_alloc() +{ + return (kmem_zalloc(sizeof (pollcache_t), KM_SLEEP)); +} + +void +pcache_create(pollcache_t *pcp, nfds_t nfds) +{ + size_t mapsize; + + /* + * allocate enough bits for the poll fd list + */ + if ((mapsize = POLLMAPCHUNK) <= nfds) { + mapsize = (nfds + POLLMAPCHUNK - 1) & ~(POLLMAPCHUNK - 1); + } + pcp->pc_bitmap = kmem_zalloc((mapsize / BT_NBIPUL) * sizeof (ulong_t), + KM_SLEEP); + pcp->pc_mapsize = mapsize; + /* + * The hash size is at least POLLHASHCHUNKSZ. If user polls a large + * number of fd to start with, allocate a bigger hash table (to the + * nearest multiple of POLLHASHCHUNKSZ) because dynamically growing a + * hash table is expensive. + */ + if (nfds < POLLHASHCHUNKSZ) { + pcp->pc_hashsize = POLLHASHCHUNKSZ; + } else { + pcp->pc_hashsize = (nfds + POLLHASHCHUNKSZ - 1) & + ~(POLLHASHCHUNKSZ - 1); + } + pcp->pc_hash = kmem_zalloc(pcp->pc_hashsize * sizeof (polldat_t *), + KM_SLEEP); +} + +void +pcache_destroy(pollcache_t *pcp) +{ + polldat_t **hashtbl; + int i; + + hashtbl = pcp->pc_hash; + for (i = 0; i < pcp->pc_hashsize; i++) { + if (hashtbl[i] != NULL) { + polldat_t *pdp, *pdp2; + + pdp = hashtbl[i]; + while (pdp != NULL) { + pdp2 = pdp->pd_hashnext; + if (pdp->pd_ref != NULL) { + kmem_free(pdp->pd_ref, sizeof (xref_t) * + pdp->pd_nsets); + } + kmem_free(pdp, sizeof (polldat_t)); + pdp = pdp2; + pcp->pc_fdcount--; + } + } + } + ASSERT(pcp->pc_fdcount == 0); + kmem_free(pcp->pc_hash, sizeof (polldat_t *) * pcp->pc_hashsize); + kmem_free(pcp->pc_bitmap, + sizeof (ulong_t) * (pcp->pc_mapsize/BT_NBIPUL)); + mutex_destroy(&pcp->pc_no_exit); + mutex_destroy(&pcp->pc_lock); + cv_destroy(&pcp->pc_cv); + cv_destroy(&pcp->pc_busy_cv); + kmem_free(pcp, sizeof (pollcache_t)); +} + +pollcacheset_t * +pcacheset_create(int nsets) +{ + return (kmem_zalloc(sizeof (pollcacheset_t) * nsets, KM_SLEEP)); +} + +void +pcacheset_destroy(pollcacheset_t *pcsp, int nsets) +{ + int i; + + for (i = 0; i < nsets; i++) { + if (pcsp[i].pcs_pollfd != NULL) { + kmem_free(pcsp[i].pcs_pollfd, pcsp[i].pcs_nfds * + sizeof (pollfd_t)); + } + } + kmem_free(pcsp, sizeof (pollcacheset_t) * nsets); +} + +/* + * Check each duplicated poll fd in the poll list. It may be necessary to + * VOP_POLL the same fd again using different poll events. getf() has been + * done by caller. This routine returns 0 if it can sucessfully process the + * entire poll fd list. It returns -1 if underlying vnode has changed during + * a VOP_POLL, in which case the caller has to repoll. It returns a positive + * value if VOP_POLL failed. 
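Callers of this 0 / -1 / positive-errno convention retry on the negative case and propagate the positive one. A schematic caller (check_dup_fds is a hypothetical stub standing in for the real routine):

	/* Hypothetical stand-in for a routine using the convention above. */
	static int check_dup_fds(void) { return (0); }

	static int
	poll_with_retry(void)
	{
		int error;

	retry:
		error = check_dup_fds();
		if (error < 0)
			goto retry;	/* state changed underneath us: repoll */
		return (error);		/* 0 on success, positive errno on failure */
	}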
+ */ +static int +plist_chkdupfd(file_t *fp, polldat_t *pdp, pollstate_t *psp, pollfd_t *pollfdp, + int entry, int *fdcntp) +{ + int i; + int fd; + nfds_t nfds = psp->ps_nfds; + + fd = pollfdp[entry].fd; + for (i = entry + 1; i < nfds; i++) { + if (pollfdp[i].fd == fd) { + if (pollfdp[i].events == pollfdp[entry].events) { + if ((pollfdp[i].revents = + pollfdp[entry].revents) != 0) { + (*fdcntp)++; + } + } else { + + int error; + pollhead_t *php; + pollcache_t *pcp = psp->ps_pcache; + + /* + * the events are different. VOP_POLL on this + * fd so that we don't miss any revents. + */ + php = NULL; + ASSERT(curthread->t_pollcache == NULL); + error = VOP_POLL(fp->f_vnode, + pollfdp[i].events, 0, + &pollfdp[i].revents, &php); + if (error) { + return (error); + } + /* + * layered devices(e.g. console driver) + * may change the vnode and thus the pollhead + * pointer out from underneath us. + */ + if (php != NULL && pdp->pd_php != NULL && + php != pdp->pd_php) { + pollhead_delete(pdp->pd_php, pdp); + pdp->pd_php = php; + pollhead_insert(php, pdp); + /* + * We could have missed a wakeup on the + * new target device. Make sure the new + * target gets polled once. + */ + BT_SET(pcp->pc_bitmap, fd); + return (-1); + } + if (pollfdp[i].revents) { + (*fdcntp)++; + } + } + } + } + return (0); +} diff --git a/usr/src/uts/common/syscall/ppriv.c b/usr/src/uts/common/syscall/ppriv.c new file mode 100644 index 0000000000..817c4fc83b --- /dev/null +++ b/usr/src/uts/common/syscall/ppriv.c @@ -0,0 +1,333 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/cred_impl.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/debug.h> +#include <sys/priv_impl.h> +#include <sys/policy.h> +#include <sys/ddi.h> +#include <sys/thread.h> +#include <c2/audit.h> + +/* + * System call support for manipulating privileges. 
+ * + * + * setppriv(2) - set process privilege set + * getppriv(2) - get process privilege set + * getprivimplinfo(2) - get process privilege implementation information + * setpflags(2) - set process (privilege) flags + * getpflags(2) - get process (privilege) flags + */ + +/* + * setppriv (priv_op_t, priv_ptype_t, priv_set_t) + */ +static int +setppriv(priv_op_t op, priv_ptype_t type, priv_set_t *in_pset) +{ + priv_set_t pset, *target; + cred_t *cr, *pcr; + proc_t *p; + boolean_t donocd; + + if (!PRIV_VALIDSET(type) || !PRIV_VALIDOP(op)) + return (set_errno(EINVAL)); + + if (copyin(in_pset, &pset, sizeof (priv_set_t))) + return (set_errno(EFAULT)); + + p = ttoproc(curthread); + cr = cralloc(); + mutex_enter(&p->p_crlock); + + pcr = p->p_cred; + +#ifdef C2_AUDIT + if (audit_active) + audit_setppriv(op, type, &pset, pcr); +#endif + + /* + * Filter out unallowed request (bad op and bad type) + */ + switch (op) { + case PRIV_ON: + case PRIV_SET: + /* + * Turning on privileges; the limit set cannot grow, + * other sets can but only as long as they remain subsets + * of P. Only immediately after exec holds that P <= L. + */ + if (((type == PRIV_LIMIT && + !priv_issubset(&pset, &CR_LPRIV(pcr))) || + !priv_issubset(&pset, &CR_OPPRIV(pcr))) && + !priv_issubset(&pset, priv_getset(pcr, type))) { + mutex_exit(&p->p_crlock); + crfree(cr); + return (set_errno(EPERM)); + } + break; + + case PRIV_OFF: + /* PRIV_OFF is always allowed */ + break; + } + + /* + * OK! everything is cool. + * Do cred COW. + */ + crcopy_to(pcr, cr); + + /* + * If we change the effective, permitted or limit set, we attain + * "privilege awareness". + */ + if (type != PRIV_INHERITABLE) + priv_set_PA(cr); + + target = &(CR_PRIVS(cr)->crprivs[type]); + + switch (op) { + case PRIV_ON: + priv_union(&pset, target); + break; + case PRIV_OFF: + priv_inverse(&pset); + priv_intersect(target, &pset); + + /* + * Fall-thru to set target and change other process + * privilege sets. + */ + /*FALLTHRU*/ + + case PRIV_SET: + *target = pset; + + /* + * Take privileges no longer permitted out + * of other effective sets as well. + * Limit set is enforced at exec() time. + */ + if (type == PRIV_PERMITTED) + priv_intersect(&pset, &CR_EPRIV(cr)); + break; + } + + /* + * When we give up privileges not in the inheritable set, + * set SNOCD if not already set; first we compute the + * privileges removed from P using Diff = (~P') & P + * and then we check whether the removed privileges are + * a subset of I. If we retain uid 0, all privileges + * are required anyway so don't set SNOCD. 
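The set algebra used here is plain bit-vector arithmetic. A self-contained sketch of the Diff = (~P') & P computation and the subset test, with a hypothetical set width:

	#include <stdbool.h>
	#include <stdint.h>

	#define	PRIV_WORDS	4	/* hypothetical width of a privilege set */

	typedef struct priv_set { uint32_t w[PRIV_WORDS]; } priv_set_t;

	static void
	priv_inverse(priv_set_t *s)
	{
		for (int i = 0; i < PRIV_WORDS; i++)
			s->w[i] = ~s->w[i];
	}

	static void
	priv_intersect(const priv_set_t *a, priv_set_t *b)	/* b &= a */
	{
		for (int i = 0; i < PRIV_WORDS; i++)
			b->w[i] &= a->w[i];
	}

	static bool
	priv_issubset(const priv_set_t *a, const priv_set_t *b)	/* a <= b? */
	{
		for (int i = 0; i < PRIV_WORDS; i++)
			if (a->w[i] & ~b->w[i])
				return (false);
		return (true);
	}

	/*
	 * Privileges dropped from P: diff = (~P') & P, then test diff <= I
	 * to decide whether SNOCD must be set.
	 */
	static bool
	dropped_outside_inheritable(const priv_set_t *newp,
	    const priv_set_t *oldp, const priv_set_t *inh)
	{
		priv_set_t diff = *newp;

		priv_inverse(&diff);		/* ~P' */
		priv_intersect(oldp, &diff);	/* (~P') & P */
		return (!priv_issubset(&diff, inh));
	}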
+ */ + if (type == PRIV_PERMITTED && (p->p_flag & SNOCD) == 0 && + cr->cr_uid != 0 && cr->cr_ruid != 0 && cr->cr_suid != 0) { + priv_set_t diff = CR_OPPRIV(cr); + priv_inverse(&diff); + priv_intersect(&CR_OPPRIV(pcr), &diff); + donocd = !priv_issubset(&diff, &CR_IPRIV(cr)); + } else { + donocd = B_FALSE; + } + + p->p_cred = cr; + mutex_exit(&p->p_crlock); + + if (donocd) { + mutex_enter(&p->p_lock); + p->p_flag |= SNOCD; + mutex_exit(&p->p_lock); + } + + crset(p, cr); /* broadcast to process threads */ + + return (0); +} + +/* + * getppriv (priv_ptype_t, priv_set_t *) + */ +static int +getppriv(priv_ptype_t type, priv_set_t *pset) +{ + if (!PRIV_VALIDSET(type)) + return (set_errno(EINVAL)); + + if (copyout(priv_getset(CRED(), type), pset, sizeof (priv_set_t)) != 0) + return (set_errno(EFAULT)); + + return (0); +} + +static int +getprivimplinfo(void *buf, size_t bufsize) +{ + int err; + + err = copyout(priv_hold_implinfo(), buf, min(bufsize, privinfosize)); + + priv_release_implinfo(); + + if (err) + return (set_errno(EFAULT)); + + return (0); +} + +/* + * Set privilege flags + * + * For now we cheat: the flags are actually bit masks so we can simplify + * some; we do make sure that the arguments are valid, though. + */ + +static int +setpflags(uint_t flag, uint_t val) +{ + cred_t *cr, *pcr; + proc_t *p = curproc; + uint_t newflags; + + if (val > 1 || (flag != PRIV_DEBUG && flag != PRIV_AWARE && + flag != __PROC_PROTECT)) { + return (set_errno(EINVAL)); + } + + if (flag == __PROC_PROTECT) { + mutex_enter(&p->p_lock); + if (val == 0) + p->p_flag &= ~SNOCD; + else + p->p_flag |= SNOCD; + mutex_exit(&p->p_lock); + return (0); + } + + cr = cralloc(); + + mutex_enter(&p->p_crlock); + + pcr = p->p_cred; + + newflags = CR_FLAGS(pcr); + + if (val != 0) + newflags |= flag; + else + newflags &= ~flag; + + /* No change */ + if (CR_FLAGS(pcr) == newflags) { + mutex_exit(&p->p_crlock); + crfree(cr); + return (0); + } + + /* Trying to unset PA; if we can't, return an error */ + if (flag == PRIV_AWARE && val == 0 && !priv_can_clear_PA(pcr)) { + mutex_exit(&p->p_crlock); + crfree(cr); + return (set_errno(EPERM)); + } + + /* Committed to changing the flag */ + crcopy_to(pcr, cr); + if (flag == PRIV_AWARE) { + if (val != 0) + priv_set_PA(cr); + else + priv_adjust_PA(cr); + } else { + CR_FLAGS(cr) = newflags; + } + + p->p_cred = cr; + + mutex_exit(&p->p_crlock); + + crset(p, cr); + + return (0); +} + +/* + * Getpflags. Currently only implements single bit flags. 
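Both setppriv() and setpflags() above follow the same copy-on-write pattern: allocate a fresh cred before taking p_crlock, copy the current one, modify the copy, and swap the pointer so readers always see a complete snapshot. A user-land analog (refcount handling abbreviated; a real version would free the old copy once its count reaches zero):

	#include <pthread.h>
	#include <stdlib.h>

	/* Minimal copy-on-write update of a shared, refcounted snapshot. */
	typedef struct cred {
		int refcnt;
		unsigned flags;
	} cred_t;

	static cred_t *shared_cred;	/* assumed initialized elsewhere */
	static pthread_mutex_t cred_lock = PTHREAD_MUTEX_INITIALIZER;

	static void
	set_flag(unsigned flag)
	{
		cred_t *nc = malloc(sizeof (*nc));	/* allocate before locking */

		pthread_mutex_lock(&cred_lock);
		*nc = *shared_cred;		/* private copy of the current cred */
		nc->refcnt = 1;
		nc->flags |= flag;
		shared_cred->refcnt--;		/* old copy may still have readers */
		shared_cred = nc;		/* swap in the new snapshot */
		pthread_mutex_unlock(&cred_lock);
	}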
+ */ +static uint_t +getpflags(uint_t flag) +{ + if (flag != PRIV_DEBUG && flag != PRIV_AWARE) + return (set_errno(EINVAL)); + + return ((CR_FLAGS(CRED()) & flag) != 0); +} + +/* + * Privilege system call entry point + */ +int +privsys(int code, priv_op_t op, priv_ptype_t type, void *buf, size_t bufsize) +{ + switch (code) { + case PRIVSYS_SETPPRIV: + if (bufsize < sizeof (priv_set_t)) + return (set_errno(ENOMEM)); + return (setppriv(op, type, buf)); + case PRIVSYS_GETPPRIV: + if (bufsize < sizeof (priv_set_t)) + return (set_errno(ENOMEM)); + return (getppriv(type, buf)); + case PRIVSYS_GETIMPLINFO: + return (getprivimplinfo(buf, bufsize)); + case PRIVSYS_SETPFLAGS: + return (setpflags((uint_t)op, (uint_t)type)); + case PRIVSYS_GETPFLAGS: + return ((int)getpflags((uint_t)op)); + + } + return (set_errno(EINVAL)); +} + +#ifdef _SYSCALL32_IMPL +int +privsys32(int code, priv_op_t op, priv_ptype_t type, caddr32_t *buf, + size32_t bufsize) +{ + return (privsys(code, op, type, (void *)buf, (size_t)bufsize)); +} +#endif diff --git a/usr/src/uts/common/syscall/processor_bind.c b/usr/src/uts/common/syscall/processor_bind.c new file mode 100644 index 0000000000..10ca1178d5 --- /dev/null +++ b/usr/src/uts/common/syscall/processor_bind.c @@ -0,0 +1,375 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/var.h> +#include <sys/thread.h> +#include <sys/cpuvar.h> +#include <sys/kstat.h> +#include <sys/uadmin.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/cmn_err.h> +#include <sys/procset.h> +#include <sys/processor.h> +#include <sys/debug.h> +#include <sys/task.h> +#include <sys/project.h> +#include <sys/zone.h> +#include <sys/contract_impl.h> +#include <sys/contract/process_impl.h> + +/* + * Bind all the threads of a process to a CPU. + */ +static int +cpu_bind_process(proc_t *pp, processorid_t bind, processorid_t *obind, + int *error) +{ + kthread_t *tp; + kthread_t *fp; + int err = 0; + int i; + + ASSERT(MUTEX_HELD(&pidlock)); + + /* skip kernel processes */ + if (pp->p_flag & SSYS) { + *obind = PBIND_NONE; + return (0); + } + + mutex_enter(&pp->p_lock); + tp = pp->p_tlist; + if (tp != NULL) { + fp = tp; + do { + i = cpu_bind_thread(tp, bind, obind, error); + if (err == 0) + err = i; + } while ((tp = tp->t_forw) != fp); + } + + mutex_exit(&pp->p_lock); + return (err); +} + +/* + * Bind all the processes of a task to a CPU. 
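cpu_bind_process() above walks the circular t_forw list and remembers only the first error while still visiting every thread. The traversal pattern in isolation:

	/* Circular thread-list walk preserving the first error. */
	typedef struct kthread {
		struct kthread *t_forw;	/* circular list link */
	} kthread_t;

	/* Hypothetical per-thread operation. */
	static int bind_one(kthread_t *tp) { (void) tp; return (0); }

	static int
	bind_all(kthread_t *list)
	{
		kthread_t *tp = list;
		int err = 0;
		int i;

		if (tp == NULL)
			return (0);
		do {
			i = bind_one(tp);
			if (err == 0)	/* keep the first failure, keep walking */
				err = i;
		} while ((tp = tp->t_forw) != list);
		return (err);
	}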
+ */ +static int +cpu_bind_task(task_t *tk, processorid_t bind, processorid_t *obind, + int *error) +{ + proc_t *p; + int err = 0; + int i; + + ASSERT(MUTEX_HELD(&pidlock)); + + if ((p = tk->tk_memb_list) == NULL) + return (ESRCH); + + do { + i = cpu_bind_process(p, bind, obind, error); + if (err == 0) + err = i; + } while ((p = p->p_tasknext) != tk->tk_memb_list); + + return (err); +} + +/* + * Bind all the processes in a project to a CPU. + */ +static int +cpu_bind_project(kproject_t *kpj, processorid_t bind, processorid_t *obind, + int *error) +{ + proc_t *p; + int err = 0; + int i; + + ASSERT(MUTEX_HELD(&pidlock)); + + for (p = practive; p != NULL; p = p->p_next) { + if (p->p_tlist == NULL) + continue; + if (p->p_task->tk_proj == kpj) { + i = cpu_bind_process(p, bind, obind, error); + if (err == 0) + err = i; + } + } + return (err); +} + +/* + * Bind all the processes in a zone to a CPU. + */ +int +cpu_bind_zone(zone_t *zptr, processorid_t bind, processorid_t *obind, + int *error) +{ + proc_t *p; + int err = 0; + int i; + + ASSERT(MUTEX_HELD(&pidlock)); + + for (p = practive; p != NULL; p = p->p_next) { + if (p->p_tlist == NULL) + continue; + if (p->p_zone == zptr) { + i = cpu_bind_process(p, bind, obind, error); + if (err == 0) + err = i; + } + } + return (err); +} + +/* + * Bind all the processes in a process contract to a CPU. + */ +int +cpu_bind_contract(cont_process_t *ctp, processorid_t bind, processorid_t *obind, + int *error) +{ + proc_t *p; + int err = 0; + int i; + + ASSERT(MUTEX_HELD(&pidlock)); + + for (p = practive; p != NULL; p = p->p_next) { + if (p->p_tlist == NULL) + continue; + if (p->p_ct_process == ctp) { + i = cpu_bind_process(p, bind, obind, error); + if (err == 0) + err = i; + } + } + return (err); +} + +/* + * processor_bind(2) - Processor binding interfaces. + */ +int +processor_bind(idtype_t idtype, id_t id, processorid_t bind, + processorid_t *obindp) +{ + processorid_t obind = PBIND_NONE; + int ret = 0; + int err = 0; + cpu_t *cp; + kthread_id_t tp; + proc_t *pp; + task_t *tk; + kproject_t *kpj; + zone_t *zptr; + contract_t *ct; + + /* + * Since we might be making a binding to a processor, hold the + * cpu_lock so that the processor cannot be taken offline while + * we do this. + */ + mutex_enter(&cpu_lock); + + /* + * Check to be sure binding processor ID is valid. + */ + switch (bind) { + default: + if ((cp = cpu_get(bind)) == NULL || + (cp->cpu_flags & (CPU_QUIESCED | CPU_OFFLINE))) + ret = EINVAL; + else if ((cp->cpu_flags & CPU_READY) == 0) + ret = EIO; + break; + + case PBIND_NONE: + case PBIND_QUERY: + break; + } + + if (ret) { + mutex_exit(&cpu_lock); + return (set_errno(ret)); + } + + switch (idtype) { + case P_LWPID: + pp = curproc; + mutex_enter(&pp->p_lock); + if (id == P_MYID) { + ret = cpu_bind_thread(curthread, bind, &obind, &err); + } else { + int found = 0; + + tp = pp->p_tlist; + do { + if (tp->t_tid == id) { + ret = cpu_bind_thread(tp, + bind, &obind, &err); + found = 1; + break; + } + } while ((tp = tp->t_forw) != pp->p_tlist); + if (!found) + ret = ESRCH; + } + mutex_exit(&pp->p_lock); + break; + + case P_PID: + /* + * Note. Cannot use dotoprocs here because it doesn't find + * system class processes, which are legal to query. 
+ */ + mutex_enter(&pidlock); + if (id == P_MYID) { + ret = cpu_bind_process(curproc, bind, &obind, &err); + } else if ((pp = prfind(id)) != NULL) { + ret = cpu_bind_process(pp, bind, &obind, &err); + } else { + ret = ESRCH; + } + mutex_exit(&pidlock); + break; + + case P_TASKID: + mutex_enter(&pidlock); + if (id == P_MYID) { + proc_t *p = curproc; + id = p->p_task->tk_tkid; + } + + if ((tk = task_hold_by_id(id)) != NULL) { + ret = cpu_bind_task(tk, bind, &obind, &err); + mutex_exit(&pidlock); + task_rele(tk); + } else { + mutex_exit(&pidlock); + ret = ESRCH; + } + break; + + case P_PROJID: + if (id == P_MYID) + id = curprojid(); + if ((kpj = project_hold_by_id(id, getzoneid(), + PROJECT_HOLD_FIND)) == NULL) { + ret = ESRCH; + } else { + mutex_enter(&pidlock); + ret = cpu_bind_project(kpj, bind, &obind, &err); + mutex_exit(&pidlock); + project_rele(kpj); + } + break; + + case P_ZONEID: + if (id == P_MYID) + id = getzoneid(); + + if ((zptr = zone_find_by_id(id)) == NULL) { + ret = ESRCH; + } else { + mutex_enter(&pidlock); + ret = cpu_bind_zone(zptr, bind, &obind, &err); + mutex_exit(&pidlock); + zone_rele(zptr); + } + break; + + case P_CTID: + if (id == P_MYID) + id = PRCTID(curproc); + + if ((ct = contract_type_ptr(process_type, id, + curproc->p_zone->zone_uniqid)) == NULL) { + ret = ESRCH; + } else { + mutex_enter(&pidlock); + ret = cpu_bind_contract(ct->ct_data, + bind, &obind, &err); + mutex_exit(&pidlock); + contract_rele(ct); + } + break; + + case P_CPUID: + if (id == P_MYID || bind != PBIND_NONE || cpu_get(id) == NULL) + ret = EINVAL; + else + ret = cpu_unbind(id); + break; + + case P_ALL: + if (id == P_MYID || bind != PBIND_NONE) { + ret = EINVAL; + } else { + int i; + cpu_t *cp = cpu_list; + do { + if ((cp->cpu_flags & CPU_EXISTS) == 0) + continue; + i = cpu_unbind(cp->cpu_id); + if (ret == 0) + ret = i; + } while ((cp = cp->cpu_next) != cpu_list); + } + break; + + default: + /* + * Spec says this is invalid, even though we could + * handle other idtypes. + */ + ret = EINVAL; + break; + } + mutex_exit(&cpu_lock); + + /* + * If no search error occurred, see if any permissions errors did. + */ + if (ret == 0) + ret = err; + + if (ret == 0 && obindp != NULL) + if (copyout((caddr_t)&obind, (caddr_t)obindp, + sizeof (obind)) == -1) + ret = EFAULT; + return (ret ? set_errno(ret) : 0); /* return success or failure */ +} diff --git a/usr/src/uts/common/syscall/processor_info.c b/usr/src/uts/common/syscall/processor_info.c new file mode 100644 index 0000000000..d080f08e02 --- /dev/null +++ b/usr/src/uts/common/syscall/processor_info.c @@ -0,0 +1,71 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1992, 1994, 1998 by Sun Microsystems, Inc. + * All rights reserved. 
+ */ + +#ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/var.h> +#include <sys/thread.h> +#include <sys/cpuvar.h> +#include <sys/kstat.h> +#include <sys/uadmin.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/cmn_err.h> +#include <sys/procset.h> +#include <sys/processor.h> +#include <sys/debug.h> + +/* + * processor_info(2) - return information on a processor. + */ +int +processor_info(processorid_t cpun, processor_info_t *infop) +{ + cpu_t *cp; + processor_info_t temp; + + mutex_enter(&cpu_lock); + if ((cp = cpu_get(cpun)) == NULL) { + mutex_exit(&cpu_lock); + return (set_errno(EINVAL)); + } + bcopy(&cp->cpu_type_info, &temp, sizeof (temp)); + mutex_exit(&cpu_lock); + + /* + * The spec indicates that the rest of the information is meaningless + * if the CPU is offline, but if presented by the machine-dependent + * layer, it is probably still accurate. It seems OK to copy it all in + * either case. + */ + if (copyout((caddr_t)&temp, (caddr_t)infop, + sizeof (processor_info_t))) + return (set_errno(EFAULT)); + + return (0); +} diff --git a/usr/src/uts/common/syscall/profil.c b/usr/src/uts/common/syscall/profil.c new file mode 100644 index 0000000000..e74ea39824 --- /dev/null +++ b/usr/src/uts/common/syscall/profil.c @@ -0,0 +1,95 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1998, Sun Microsystems, Inc. + * All rights reserved. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +#ident "%Z%%M% %I% %E% SMI" /* from SVr4.0 1.78 */ + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/debug.h> + +/* + * Profiling. 
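processor_info() above copies cpu_type_info into a local buffer while holding cpu_lock and only calls copyout() after dropping it, so the lock is never held across a fault on user memory. The same pattern in user-land form:

	#include <pthread.h>
	#include <string.h>

	/*
	 * Snapshot shared data under the lock, then do the slow consumer
	 * work (here, the final copy) after releasing it.
	 */
	typedef struct info { char name[32]; long clock; } info_t;

	static info_t shared_info;
	static pthread_mutex_t info_lock = PTHREAD_MUTEX_INITIALIZER;

	static void
	get_info(info_t *out)
	{
		info_t temp;

		pthread_mutex_lock(&info_lock);
		memcpy(&temp, &shared_info, sizeof (temp));	/* snapshot */
		pthread_mutex_unlock(&info_lock);

		*out = temp;	/* consumer work happens without the lock */
	}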
+ */ +int +profil(unsigned short *bufbase, size_t bufsize, u_long pcoffset, u_int pcscale) +{ + struct proc *p = ttoproc(curthread); + + if (pcscale == 1) + pcscale = 0; + + mutex_enter(&p->p_pflock); + p->p_prof.pr_base = bufbase; + p->p_prof.pr_size = bufsize; + p->p_prof.pr_off = pcoffset; + p->p_prof.pr_scale = pcscale; + + /* pcsample and profil are mutually exclusive */ + p->p_prof.pr_samples = 0; + + mutex_exit(&p->p_pflock); + mutex_enter(&p->p_lock); + set_proc_post_sys(p); /* activate post_syscall profiling code */ + mutex_exit(&p->p_lock); + return (0); +} + + +/* + * PC Sampling + */ +long +pcsample(void *buf, long nsamples) +{ + struct proc *p = ttoproc(curthread); + long count = 0; + + if (nsamples < 0 || + ((get_udatamodel() != DATAMODEL_NATIVE) && (nsamples > INT32_MAX))) + return (set_errno(EINVAL)); + + mutex_enter(&p->p_pflock); + p->p_prof.pr_base = buf; + p->p_prof.pr_size = nsamples; + p->p_prof.pr_scale = 1; + count = p->p_prof.pr_samples; + p->p_prof.pr_samples = 0; + mutex_exit(&p->p_pflock); + + mutex_enter(&p->p_lock); + set_proc_post_sys(p); /* activate post_syscall profiling code */ + mutex_exit(&p->p_lock); + + return (count); +} diff --git a/usr/src/uts/common/syscall/pset.c b/usr/src/uts/common/syscall/pset.c new file mode 100644 index 0000000000..73b45c88be --- /dev/null +++ b/usr/src/uts/common/syscall/pset.c @@ -0,0 +1,797 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/cmn_err.h> +#include <sys/cpuvar.h> +#include <sys/thread.h> +#include <sys/disp.h> +#include <sys/kmem.h> +#include <sys/debug.h> +#include <sys/sysmacros.h> +#include <sys/cpupart.h> +#include <sys/pset.h> +#include <sys/modctl.h> +#include <sys/syscall.h> +#include <sys/task.h> +#include <sys/loadavg.h> +#include <sys/fss.h> +#include <sys/pool.h> +#include <sys/pool_pset.h> +#include <sys/policy.h> +#include <sys/zone.h> +#include <sys/contract/process_impl.h> + +static int pset(int, long, long, long, long); + +static struct sysent pset_sysent = { + 5, + SE_ARGC | SE_NOUNLOAD, + (int (*)())pset, +}; + +static struct modlsys modlsys = { + &mod_syscallops, "processor sets", &pset_sysent +}; + +#ifdef _SYSCALL32_IMPL +static struct modlsys modlsys32 = { + &mod_syscallops32, "32-bit pset(2) syscall", &pset_sysent +}; +#endif + +static struct modlinkage modlinkage = { + MODREV_1, + &modlsys, +#ifdef _SYSCALL32_IMPL + &modlsys32, +#endif + NULL +}; + +#define PSET_BADATTR(attr) ((~PSET_NOESCAPE) & (attr)) + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +static int +pset_create(psetid_t *psetp) +{ + psetid_t newpset; + int error; + + if (secpolicy_pset(CRED()) != 0) + return (set_errno(EPERM)); + + pool_lock(); + if (pool_state == POOL_ENABLED) { + pool_unlock(); + return (set_errno(ENOTSUP)); + } + error = cpupart_create(&newpset); + if (error) { + pool_unlock(); + return (set_errno(error)); + } + if (copyout(&newpset, psetp, sizeof (psetid_t)) != 0) { + (void) cpupart_destroy(newpset); + pool_unlock(); + return (set_errno(EFAULT)); + } + pool_unlock(); + return (error); +} + +static int +pset_destroy(psetid_t pset) +{ + int error; + + if (secpolicy_pset(CRED()) != 0) + return (set_errno(EPERM)); + + pool_lock(); + if (pool_state == POOL_ENABLED) { + pool_unlock(); + return (set_errno(ENOTSUP)); + } + error = cpupart_destroy(pset); + pool_unlock(); + if (error) + return (set_errno(error)); + else + return (0); +} + +static int +pset_assign(psetid_t pset, processorid_t cpuid, psetid_t *opset, int forced) +{ + psetid_t oldpset; + int error = 0; + cpu_t *cp; + + if (pset != PS_QUERY && secpolicy_pset(CRED()) != 0) + return (set_errno(EPERM)); + + pool_lock(); + if (pset != PS_QUERY && pool_state == POOL_ENABLED) { + pool_unlock(); + return (set_errno(ENOTSUP)); + } + + mutex_enter(&cpu_lock); + if ((cp = cpu_get(cpuid)) == NULL) { + mutex_exit(&cpu_lock); + pool_unlock(); + return (set_errno(EINVAL)); + } + + oldpset = cpupart_query_cpu(cp); + + if (pset != PS_QUERY) + error = cpupart_attach_cpu(pset, cp, forced); + mutex_exit(&cpu_lock); + pool_unlock(); + + if (error) + return (set_errno(error)); + + if (opset != NULL) + if (copyout(&oldpset, opset, sizeof (psetid_t)) != 0) + return (set_errno(EFAULT)); + + return (0); +} + +static int +pset_info(psetid_t pset, int *typep, uint_t *numcpusp, + processorid_t *cpulistp) +{ + int pset_type; + uint_t user_ncpus = 0, real_ncpus, copy_ncpus; + processorid_t *pset_cpus = NULL; + int error = 0; + + if (numcpusp != NULL) { + if (copyin(numcpusp, &user_ncpus, sizeof (uint_t)) != 0) + return (set_errno(EFAULT)); + } + + if (user_ncpus > max_ncpus) /* sanity check */ + user_ncpus = max_ncpus; + if (user_ncpus != 0 && cpulistp != NULL) + pset_cpus = kmem_alloc(sizeof (processorid_t) * user_ncpus, + KM_SLEEP); + + real_ncpus = user_ncpus; + if 
((error = cpupart_get_cpus(&pset, pset_cpus, &real_ncpus)) != 0) + goto out; + + /* + * Now copyout the information about this processor set. + */ + + /* + * Get number of cpus to copy back. If the user didn't pass in + * a big enough buffer, only copy back as many cpus as fits in + * the buffer but copy back the real number of cpus. + */ + + if (user_ncpus != 0 && cpulistp != NULL) { + copy_ncpus = MIN(real_ncpus, user_ncpus); + if (copyout(pset_cpus, cpulistp, + sizeof (processorid_t) * copy_ncpus) != 0) { + error = EFAULT; + goto out; + } + } + if (pset_cpus != NULL) + kmem_free(pset_cpus, sizeof (processorid_t) * user_ncpus); + if (typep != NULL) { + if (pset == PS_NONE) + pset_type = PS_NONE; + else + pset_type = PS_PRIVATE; + if (copyout(&pset_type, typep, sizeof (int)) != 0) + return (set_errno(EFAULT)); + } + if (numcpusp != NULL) + if (copyout(&real_ncpus, numcpusp, sizeof (uint_t)) != 0) + return (set_errno(EFAULT)); + return (0); + +out: + if (pset_cpus != NULL) + kmem_free(pset_cpus, sizeof (processorid_t) * user_ncpus); + return (set_errno(error)); +} + +static int +pset_bind_thread(kthread_t *tp, psetid_t pset, psetid_t *oldpset, void *projbuf, + void *zonebuf) +{ + int error = 0; + + ASSERT(pool_lock_held()); + ASSERT(MUTEX_HELD(&cpu_lock)); + ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock)); + + *oldpset = tp->t_bind_pset; + if (pset != PS_QUERY) { + /* + * Must have the same UID as the target process or + * have PRIV_PROC_OWNER privilege. + */ + if (!hasprocperm(tp->t_cred, CRED())) + return (EPERM); + /* + * Unbinding of an unbound thread should always succeed. + */ + if (*oldpset == PS_NONE && pset == PS_NONE) + return (0); + /* + * Only privileged processes can move threads from psets with + * PSET_NOESCAPE attribute. + */ + if ((tp->t_cpupart->cp_attr & PSET_NOESCAPE) && + secpolicy_pset(CRED()) != 0) + return (EPERM); + if ((error = cpupart_bind_thread(tp, pset, 0, + projbuf, zonebuf)) == 0) + tp->t_bind_pset = pset; + } + return (error); +} + +static int +pset_bind_process(proc_t *pp, psetid_t pset, psetid_t *oldpset, void *projbuf, + void *zonebuf) +{ + int error = 0; + kthread_t *tp; + + /* skip kernel processes */ + if (pset != PS_QUERY && pp->p_flag & SSYS) { + *oldpset = PS_NONE; + return (0); + } + + mutex_enter(&pp->p_lock); + tp = pp->p_tlist; + if (tp != NULL) { + do { + int rval; + + rval = pset_bind_thread(tp, pset, oldpset, projbuf, + zonebuf); + if (error == 0) + error = rval; + } while ((tp = tp->t_forw) != pp->p_tlist); + } else + error = ESRCH; + mutex_exit(&pp->p_lock); + + return (error); +} + +static int +pset_bind_task(task_t *tk, psetid_t pset, psetid_t *oldpset, void *projbuf, + void *zonebuf) +{ + int error = 0; + proc_t *pp; + + ASSERT(MUTEX_HELD(&pidlock)); + + if ((pp = tk->tk_memb_list) == NULL) { + return (ESRCH); + } + + do { + int rval; + + rval = pset_bind_process(pp, pset, oldpset, projbuf, zonebuf); + if (error == 0) + error = rval; + } while ((pp = pp->p_tasknext) != tk->tk_memb_list); + + return (error); +} + +static int +pset_bind_project(kproject_t *kpj, psetid_t pset, psetid_t *oldpset, + void *projbuf, void *zonebuf) +{ + int error = 0; + proc_t *pp; + + ASSERT(MUTEX_HELD(&pidlock)); + + for (pp = practive; pp != NULL; pp = pp->p_next) { + if (pp->p_tlist == NULL) + continue; + if (pp->p_task->tk_proj == kpj) { + int rval; + + rval = pset_bind_process(pp, pset, oldpset, projbuf, + zonebuf); + if (error == 0) + error = rval; + } + } + + return (error); +} + +static int +pset_bind_zone(zone_t *zptr, psetid_t pset, psetid_t *oldpset, void 
*projbuf, + void *zonebuf) +{ + int error = 0; + proc_t *pp; + + ASSERT(MUTEX_HELD(&pidlock)); + + for (pp = practive; pp != NULL; pp = pp->p_next) { + if (pp->p_zone == zptr) { + int rval; + + rval = pset_bind_process(pp, pset, oldpset, projbuf, + zonebuf); + if (error == 0) + error = rval; + } + } + + return (error); +} + +/* + * Unbind all threads from the specified processor set, or from all + * processor sets. + */ +static int +pset_unbind(psetid_t pset, void *projbuf, void *zonebuf, idtype_t idtype) +{ + psetid_t olbind; + kthread_t *tp; + int error = 0; + int rval; + proc_t *pp; + + ASSERT(MUTEX_HELD(&cpu_lock)); + + if (idtype == P_PSETID && cpupart_find(pset) == NULL) + return (EINVAL); + + mutex_enter(&pidlock); + for (pp = practive; pp != NULL; pp = pp->p_next) { + mutex_enter(&pp->p_lock); + tp = pp->p_tlist; + /* + * Skip zombies and kernel processes, and processes in + * other zones, if called from a non-global zone. + */ + if (tp == NULL || (pp->p_flag & SSYS) || + !HASZONEACCESS(curproc, pp->p_zone->zone_id)) { + mutex_exit(&pp->p_lock); + continue; + } + do { + if ((idtype == P_PSETID && tp->t_bind_pset != pset) || + (idtype == P_ALL && tp->t_bind_pset == PS_NONE)) + continue; + rval = pset_bind_thread(tp, PS_NONE, &olbind, + projbuf, zonebuf); + if (error == 0) + error = rval; + } while ((tp = tp->t_forw) != pp->p_tlist); + mutex_exit(&pp->p_lock); + } + mutex_exit(&pidlock); + return (error); +} + +static int +pset_bind_contract(cont_process_t *ctp, psetid_t pset, psetid_t *oldpset, + void *projbuf, void *zonebuf) +{ + int error = 0; + proc_t *pp; + + ASSERT(MUTEX_HELD(&pidlock)); + + for (pp = practive; pp != NULL; pp = pp->p_next) { + if (pp->p_ct_process == ctp) { + int rval; + + rval = pset_bind_process(pp, pset, oldpset, projbuf, + zonebuf); + if (error == 0) + error = rval; + } + } + + return (error); +} + +static int +pset_bind(psetid_t pset, idtype_t idtype, id_t id, psetid_t *opset) +{ + kthread_t *tp; + proc_t *pp; + task_t *tk; + kproject_t *kpj; + contract_t *ct; + zone_t *zptr; + psetid_t oldpset; + int error = 0; + void *projbuf, *zonebuf; + + pool_lock(); + if (pset != PS_QUERY) { + /* + * Check if the set actually exists before checking + * permissions. This is the historical error + * precedence. Note that if pset was PS_MYID, the + * cpupart_get_cpus call will change it to the + * processor set id of the caller (or PS_NONE if the + * caller is not bound to a processor set). + */ + if (pool_state == POOL_ENABLED) { + pool_unlock(); + return (set_errno(ENOTSUP)); + } + if (cpupart_get_cpus(&pset, NULL, NULL) != 0) { + pool_unlock(); + return (set_errno(EINVAL)); + } else if (pset != PS_NONE && secpolicy_pset(CRED()) != 0) { + pool_unlock(); + return (set_errno(EPERM)); + } + } + + /* + * Pre-allocate enough buffers for FSS for all active projects + * and for all active zones on the system. Unused buffers will + * be freed later by fss_freebuf(). 
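+ * + * The buffers are taken out here, up front, so that the binding paths + * below (which run under pidlock and each target process's p_lock) do + * not have to call into the allocator. The usual userland entry point + * into this code is pset_bind(2), e.g. (an illustrative sketch only): + * + *	psetid_t opset; + *	(void) pset_bind(pset, P_PID, getpid(), &opset);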
+ */ + mutex_enter(&cpu_lock); + projbuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_PROJ); + zonebuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_ZONE); + + switch (idtype) { + case P_LWPID: + pp = curproc; + mutex_enter(&pidlock); + mutex_enter(&pp->p_lock); + if (id == P_MYID) { + tp = curthread; + } else { + if ((tp = idtot(pp, id)) == NULL) { + mutex_exit(&pp->p_lock); + mutex_exit(&pidlock); + error = ESRCH; + break; + } + } + error = pset_bind_thread(tp, pset, &oldpset, projbuf, zonebuf); + mutex_exit(&pp->p_lock); + mutex_exit(&pidlock); + break; + + case P_PID: + mutex_enter(&pidlock); + if (id == P_MYID) { + pp = curproc; + } else if ((pp = prfind(id)) == NULL) { + mutex_exit(&pidlock); + error = ESRCH; + break; + } + error = pset_bind_process(pp, pset, &oldpset, projbuf, zonebuf); + mutex_exit(&pidlock); + break; + + case P_TASKID: + mutex_enter(&pidlock); + if (id == P_MYID) + id = curproc->p_task->tk_tkid; + if ((tk = task_hold_by_id(id)) == NULL) { + mutex_exit(&pidlock); + error = ESRCH; + break; + } + error = pset_bind_task(tk, pset, &oldpset, projbuf, zonebuf); + mutex_exit(&pidlock); + task_rele(tk); + break; + + case P_PROJID: + if (id == P_MYID) + id = curprojid(); + if ((kpj = project_hold_by_id(id, getzoneid(), + PROJECT_HOLD_FIND)) == NULL) { + error = ESRCH; + break; + } + mutex_enter(&pidlock); + error = pset_bind_project(kpj, pset, &oldpset, projbuf, + zonebuf); + mutex_exit(&pidlock); + project_rele(kpj); + break; + + case P_ZONEID: + if (id == P_MYID) + id = getzoneid(); + if ((zptr = zone_find_by_id(id)) == NULL) { + error = ESRCH; + break; + } + mutex_enter(&pidlock); + error = pset_bind_zone(zptr, pset, &oldpset, projbuf, zonebuf); + mutex_exit(&pidlock); + zone_rele(zptr); + break; + + case P_CTID: + if (id == P_MYID) + id = PRCTID(curproc); + if ((ct = contract_type_ptr(process_type, id, + curproc->p_zone->zone_uniqid)) == NULL) { + error = ESRCH; + break; + } + mutex_enter(&pidlock); + error = pset_bind_contract(ct->ct_data, pset, &oldpset, projbuf, + zonebuf); + mutex_exit(&pidlock); + contract_rele(ct); + break; + + case P_PSETID: + if (id == P_MYID || pset != PS_NONE || !INGLOBALZONE(curproc)) { + error = EINVAL; + break; + } + error = pset_unbind(id, projbuf, zonebuf, idtype); + break; + + case P_ALL: + if (id == P_MYID || pset != PS_NONE || !INGLOBALZONE(curproc)) { + error = EINVAL; + break; + } + error = pset_unbind(PS_NONE, projbuf, zonebuf, idtype); + break; + + default: + error = EINVAL; + break; + } + + fss_freebuf(projbuf, FSS_ALLOC_PROJ); + fss_freebuf(zonebuf, FSS_ALLOC_ZONE); + mutex_exit(&cpu_lock); + pool_unlock(); + + if (error != 0) + return (set_errno(error)); + if (opset != NULL) { + if (copyout(&oldpset, opset, sizeof (psetid_t)) != 0) + return (set_errno(EFAULT)); + } + return (0); +} + +/* + * Report load average statistics for the specified processor set. + */ +static int +pset_getloadavg(psetid_t pset, int *buf, int nelem) +{ + int *loadbuf; + int error = 0; + + if (nelem < 0) + return (set_errno(EINVAL)); + + /* + * We keep the same number of load average statistics for processor + * sets as we do for the system as a whole. 
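+ * (LOADAVG_NSTATS covers the 1-, 5- and 15-minute averages. The values + * returned here are the kernel's scaled integer averages; the + * pset_getloadavg(3C) library wrapper is what converts them to the + * floating-point form most callers expect.)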
+ */ + if (nelem > LOADAVG_NSTATS) + nelem = LOADAVG_NSTATS; + + loadbuf = kmem_alloc(nelem * sizeof (int), KM_SLEEP); + + mutex_enter(&cpu_lock); + error = cpupart_get_loadavg(pset, loadbuf, nelem); + mutex_exit(&cpu_lock); + if (!error && nelem && copyout(loadbuf, buf, nelem * sizeof (int)) != 0) + error = EFAULT; + + kmem_free(loadbuf, nelem * sizeof (int)); + + if (error) + return (set_errno(error)); + else + return (0); +} + + +/* + * Return list of active processor sets, up to a maximum indicated by + * numpsets. The total number of processor sets is stored in the + * location pointed to by numpsets. + */ +static int +pset_list(psetid_t *psetlist, uint_t *numpsets) +{ + uint_t user_npsets = 0; + uint_t real_npsets; + psetid_t *psets = NULL; + int error = 0; + + if (numpsets != NULL) { + if (copyin(numpsets, &user_npsets, sizeof (uint_t)) != 0) + return (set_errno(EFAULT)); + } + + /* + * Get the list of all processor sets. First we need to find + * out how many there are, so we can allocate a large enough + * buffer. + */ + mutex_enter(&cpu_lock); + if (!INGLOBALZONE(curproc) && pool_pset_enabled()) { + psetid_t psetid = zone_pset_get(curproc->p_zone); + + if (psetid == PS_NONE) { + real_npsets = 0; + } else { + real_npsets = 1; + psets = kmem_alloc(real_npsets * sizeof (psetid_t), + KM_SLEEP); + psets[0] = psetid; + } + } else { + real_npsets = cpupart_list(0, NULL, CP_ALL); + if (real_npsets) { + psets = kmem_alloc(real_npsets * sizeof (psetid_t), + KM_SLEEP); + (void) cpupart_list(psets, real_npsets, CP_ALL); + } + } + mutex_exit(&cpu_lock); + + if (user_npsets > real_npsets) + user_npsets = real_npsets; + + if (numpsets != NULL) { + if (copyout(&real_npsets, numpsets, sizeof (uint_t)) != 0) + error = EFAULT; + else if (psetlist != NULL && user_npsets != 0) { + if (copyout(psets, psetlist, + user_npsets * sizeof (psetid_t)) != 0) + error = EFAULT; + } + } + + if (real_npsets) + kmem_free(psets, real_npsets * sizeof (psetid_t)); + + if (error) + return (set_errno(error)); + else + return (0); +} + +static int +pset_setattr(psetid_t pset, uint_t attr) +{ + int error; + + if (secpolicy_pset(CRED()) != 0) + return (set_errno(EPERM)); + pool_lock(); + if (pool_state == POOL_ENABLED) { + pool_unlock(); + return (set_errno(ENOTSUP)); + } + if (pset == PS_QUERY || PSET_BADATTR(attr)) { + pool_unlock(); + return (set_errno(EINVAL)); + } + if ((error = cpupart_setattr(pset, attr)) != 0) { + pool_unlock(); + return (set_errno(error)); + } + pool_unlock(); + return (0); +} + +static int +pset_getattr(psetid_t pset, uint_t *attrp) +{ + int error = 0; + uint_t attr; + + if (pset == PS_QUERY) + return (set_errno(EINVAL)); + if ((error = cpupart_getattr(pset, &attr)) != 0) + return (set_errno(error)); + if (copyout(&attr, attrp, sizeof (uint_t)) != 0) + return (set_errno(EFAULT)); + return (0); +} + +static int +pset(int subcode, long arg1, long arg2, long arg3, long arg4) +{ + switch (subcode) { + case PSET_CREATE: + return (pset_create((psetid_t *)arg1)); + case PSET_DESTROY: + return (pset_destroy((psetid_t)arg1)); + case PSET_ASSIGN: + return (pset_assign((psetid_t)arg1, + (processorid_t)arg2, (psetid_t *)arg3, 0)); + case PSET_INFO: + return (pset_info((psetid_t)arg1, (int *)arg2, + (uint_t *)arg3, (processorid_t *)arg4)); + case PSET_BIND: + return (pset_bind((psetid_t)arg1, (idtype_t)arg2, + (id_t)arg3, (psetid_t *)arg4)); + case PSET_GETLOADAVG: + return (pset_getloadavg((psetid_t)arg1, (int *)arg2, + (int)arg3)); + case PSET_LIST: + return (pset_list((psetid_t *)arg1, (uint_t *)arg2)); + 
case PSET_SETATTR: + return (pset_setattr((psetid_t)arg1, (uint_t)arg2)); + case PSET_GETATTR: + return (pset_getattr((psetid_t)arg1, (uint_t *)arg2)); + case PSET_ASSIGN_FORCED: + return (pset_assign((psetid_t)arg1, + (processorid_t)arg2, (psetid_t *)arg3, 1)); + default: + return (set_errno(EINVAL)); + } +} diff --git a/usr/src/uts/common/syscall/rctlsys.c b/usr/src/uts/common/syscall/rctlsys.c new file mode 100644 index 0000000000..03617b5d44 --- /dev/null +++ b/usr/src/uts/common/syscall/rctlsys.c @@ -0,0 +1,871 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> + +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/errno.h> +#include <sys/rctl.h> +#include <sys/rctl_impl.h> +#include <sys/strlog.h> +#include <sys/syslog.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/policy.h> +#include <sys/proc.h> +#include <sys/task.h> + +/* + * setrctl(2), getrctl(2), and private rctlsys(2*) system calls + * + * Resource control block (rctlblk_ptr_t, rctl_opaque_t) + * The resource control system call interfaces present the resource control + * values and flags via the resource control block abstraction, made manifest + * via an opaque data type with strict type definitions. Keeping the formal + * definitions in the rcontrol block allows us to be clever in the kernel, + * combining attributes where appropriate in the current implementation while + * preserving binary compatibility in the face of implementation changes. + */ + +#define RBX_TO_BLK 0x1 +#define RBX_FROM_BLK 0x2 +#define RBX_VAL 0x4 +#define RBX_CTL 0x8 + +static void +rctlsys_rblk_xfrm(rctl_opaque_t *blk, rctl_dict_entry_t *rde, + rctl_val_t *val, int flags) +{ + if (flags & RBX_FROM_BLK) { + if (flags & RBX_VAL) { + /* + * Firing time cannot be set. + */ + val->rcv_privilege = blk->rcq_privilege; + val->rcv_value = blk->rcq_value; + val->rcv_flagaction = blk->rcq_local_flagaction; + val->rcv_action_signal = blk->rcq_local_signal; + val->rcv_action_recip_pid = + blk->rcq_local_recipient_pid; + } + if (flags & RBX_CTL) { + rde->rcd_flagaction = blk->rcq_global_flagaction; + rde->rcd_syslog_level = blk->rcq_global_syslog_level; + + /* + * Because the strlog() interface supports fewer options + * than are made available via the syslog() interface to + * userland, we map the syslog level down to a smaller + * set of distinct logging behaviours. 
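+ * The net effect of the switch below is: + * + *	LOG_EMERG, LOG_ALERT, LOG_CRIT	SL_CONSOLE | SL_ERROR | SL_WARN + *	LOG_ERR				SL_ERROR | SL_WARN + *	LOG_WARNING			SL_WARN + *	LOG_NOTICE			SL_CONSOLE | SL_NOTE + *	LOG_INFO, LOG_DEBUG, default	SL_NOTE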
+ */ + rde->rcd_strlog_flags = 0; + switch (blk->rcq_global_syslog_level) { + case LOG_EMERG: + case LOG_ALERT: + case LOG_CRIT: + rde->rcd_strlog_flags |= SL_CONSOLE; + /*FALLTHROUGH*/ + case LOG_ERR: + rde->rcd_strlog_flags |= SL_ERROR; + /*FALLTHROUGH*/ + case LOG_WARNING: + rde->rcd_strlog_flags |= SL_WARN; + break; + case LOG_NOTICE: + rde->rcd_strlog_flags |= SL_CONSOLE; + /*FALLTHROUGH*/ + case LOG_INFO: /* informational */ + case LOG_DEBUG: /* debug-level messages */ + default: + rde->rcd_strlog_flags |= SL_NOTE; + break; + } + } + } else { + bzero(blk, sizeof (rctl_opaque_t)); + if (flags & RBX_VAL) { + blk->rcq_privilege = val->rcv_privilege; + blk->rcq_value = val->rcv_value; + blk->rcq_enforced_value = rctl_model_value(rde, + curproc, val->rcv_value); + blk->rcq_local_flagaction = val->rcv_flagaction; + blk->rcq_local_signal = val->rcv_action_signal; + blk->rcq_firing_time = val->rcv_firing_time; + blk->rcq_local_recipient_pid = + val->rcv_action_recip_pid; + } + if (flags & RBX_CTL) { + blk->rcq_global_flagaction = rde->rcd_flagaction; + blk->rcq_global_syslog_level = rde->rcd_syslog_level; + } + } +} + +/* + * int rctl_invalid_value(rctl_dict_entry_t *, rctl_val_t *) + * + * Overview + * Perform basic validation of proposed new resource control value against the + * global properties set on the control. Any system call operation presented + * with an invalid resource control value should return -1 and set errno to + * EINVAL. + * + * Return values + * 0 if valid, 1 if invalid. + * + * Caller's context + * No restriction on context. + */ +int +rctl_invalid_value(rctl_dict_entry_t *rde, rctl_val_t *rval) +{ + rctl_val_t *sys_rval; + + if (rval->rcv_privilege != RCPRIV_BASIC && + rval->rcv_privilege != RCPRIV_PRIVILEGED && + rval->rcv_privilege != RCPRIV_SYSTEM) + return (1); + + if (rval->rcv_flagaction & ~RCTL_LOCAL_MASK) + return (1); + + if (rval->rcv_privilege == RCPRIV_BASIC && + (rde->rcd_flagaction & RCTL_GLOBAL_NOBASIC) != 0) + return (1); + + if ((rval->rcv_flagaction & RCTL_LOCAL_DENY) == 0 && + (rde->rcd_flagaction & RCTL_GLOBAL_DENY_ALWAYS) != 0) + return (1); + + if ((rval->rcv_flagaction & RCTL_LOCAL_DENY) && + (rde->rcd_flagaction & RCTL_GLOBAL_DENY_NEVER)) + return (1); + + if ((rval->rcv_flagaction & RCTL_LOCAL_SIGNAL) && + (rde->rcd_flagaction & RCTL_GLOBAL_SIGNAL_NEVER)) + return (1); + + if ((rval->rcv_flagaction & RCTL_LOCAL_SIGNAL) && + rval->rcv_action_signal == 0) + return (1); + + if (rval->rcv_action_signal == SIGXCPU && + (rde->rcd_flagaction & RCTL_GLOBAL_CPU_TIME) == 0) + return (1); + else if (rval->rcv_action_signal == SIGXFSZ && + (rde->rcd_flagaction & RCTL_GLOBAL_FILE_SIZE) == 0) + return (1); + else if (rval->rcv_action_signal != SIGHUP && + rval->rcv_action_signal != SIGABRT && + rval->rcv_action_signal != SIGKILL && + rval->rcv_action_signal != SIGTERM && + rval->rcv_action_signal != SIGSTOP && + rval->rcv_action_signal != SIGXCPU && + rval->rcv_action_signal != SIGXFSZ && + rval->rcv_action_signal != SIGXRES && + rval->rcv_action_signal != 0) /* That is, no signal is ok. 
*/ + return (1); + + sys_rval = rde->rcd_default_value; + while (sys_rval->rcv_privilege != RCPRIV_SYSTEM) + sys_rval = sys_rval->rcv_next; + + if (rval->rcv_value > sys_rval->rcv_value) + return (1); + + return (0); +} + +/* + * static long rctlsys_get(char *name, rctl_opaque_t *old_rblk, + * rctl_opaque_t *new_rblk, int flags) + * + * Overview + * rctlsys_get() is the implementation of the core logic of getrctl(2), the + * public system call for fetching resource control values. Two mutually + * exclusive flag values are supported: RCTL_FIRST and RCTL_NEXT. When + * RCTL_FIRST is presented, the value of old_rblk is ignored, and the first + * value in the resource control value sequence for the named control is + * transformed and placed in the user memory location at new_rblk. In the + * RCTL_NEXT case, the value of old_rblk is examined, and the next value in + * the sequence is transformed and placed at new_rblk. + */ +static long +rctlsys_get(char *name, rctl_opaque_t *old_rblk, rctl_opaque_t *new_rblk, + int flags) +{ + rctl_val_t *nval; + rctl_opaque_t *nblk; + rctl_hndl_t hndl; + char *kname; + size_t klen; + rctl_dict_entry_t *krde; + int ret; + int action = flags & (~RCTLSYS_ACTION_MASK); + + if (flags & (~RCTLSYS_MASK)) + return (set_errno(EINVAL)); + + if (action != RCTL_FIRST && action != RCTL_NEXT && + action != RCTL_USAGE) + return (set_errno(EINVAL)); + + if (new_rblk == NULL || name == NULL) + return (set_errno(EFAULT)); + + kname = kmem_alloc(MAXPATHLEN, KM_SLEEP); + krde = kmem_alloc(sizeof (rctl_dict_entry_t), KM_SLEEP); + + if (copyinstr(name, kname, MAXPATHLEN, &klen) != 0) { + kmem_free(kname, MAXPATHLEN); + kmem_free(krde, sizeof (rctl_dict_entry_t)); + return (set_errno(EFAULT)); + } + + if ((hndl = rctl_hndl_lookup(kname)) == -1) { + kmem_free(kname, MAXPATHLEN); + kmem_free(krde, sizeof (rctl_dict_entry_t)); + return (set_errno(EINVAL)); + } + + if (rctl_global_get(kname, krde) == -1) { + kmem_free(kname, MAXPATHLEN); + kmem_free(krde, sizeof (rctl_dict_entry_t)); + return (set_errno(ESRCH)); + } + + kmem_free(kname, MAXPATHLEN); + + nval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP); + + if (action == RCTL_USAGE) { + kmem_cache_free(rctl_val_cache, nval); + kmem_free(krde, sizeof (rctl_dict_entry_t)); + return (set_errno(ENOTSUP)); + } else if (action == RCTL_FIRST) { + + mutex_enter(&curproc->p_lock); + if (ret = rctl_local_get(hndl, NULL, nval, curproc)) { + mutex_exit(&curproc->p_lock); + kmem_cache_free(rctl_val_cache, nval); + kmem_free(krde, sizeof (rctl_dict_entry_t)); + return (set_errno(ret)); + } + mutex_exit(&curproc->p_lock); + } else { + /* + * RCTL_NEXT + */ + rctl_val_t *oval; + rctl_opaque_t *oblk; + + oblk = kmem_alloc(sizeof (rctl_opaque_t), KM_SLEEP); + + if (copyin(old_rblk, oblk, sizeof (rctl_opaque_t)) == -1) { + kmem_cache_free(rctl_val_cache, nval); + kmem_free(oblk, sizeof (rctl_opaque_t)); + kmem_free(krde, sizeof (rctl_dict_entry_t)); + return (set_errno(EFAULT)); + } + + oval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP); + + rctlsys_rblk_xfrm(oblk, NULL, oval, RBX_FROM_BLK | RBX_VAL); + mutex_enter(&curproc->p_lock); + ret = rctl_local_get(hndl, oval, nval, curproc); + mutex_exit(&curproc->p_lock); + + kmem_cache_free(rctl_val_cache, oval); + kmem_free(oblk, sizeof (rctl_opaque_t)); + + if (ret != 0) { + kmem_cache_free(rctl_val_cache, nval); + kmem_free(krde, sizeof (rctl_dict_entry_t)); + return (set_errno(ret)); + } + } + + nblk = kmem_alloc(sizeof (rctl_opaque_t), KM_SLEEP); + + rctlsys_rblk_xfrm(nblk, krde, nval, RBX_TO_BLK | 
RBX_VAL | RBX_CTL); + + kmem_free(krde, sizeof (rctl_dict_entry_t)); + kmem_cache_free(rctl_val_cache, nval); + + if (copyout(nblk, new_rblk, sizeof (rctl_opaque_t)) == -1) { + kmem_free(nblk, sizeof (rctl_opaque_t)); + return (set_errno(EFAULT)); + } + + kmem_free(nblk, sizeof (rctl_opaque_t)); + + return (0); +} + +/* + * static long rctlsys_set(char *name, rctl_opaque_t *old_rblk, + * rctl_opaque_t *new_rblk, int flags) + * + * Overview + * rctlsys_set() is the implementation of the core logic of setrctl(2), which + * allows the establishment of resource control values. Flags may take on any + * of three exclusive values: RCTL_INSERT, RCTL_DELETE, and RCTL_REPLACE. + * RCTL_INSERT ignores old_rblk and inserts the value in the appropriate + * position in the ordered sequence of resource control values. RCTL_DELETE + * ignores old_rblk and deletes the first resource control value matching + * (value, priority) in the given resource block. If no matching value is + * found, -1 is returned and errno is set to ENOENT. Finally, in the case of + * RCTL_REPLACE, old_rblk is used to match (value, priority); the matching + * resource control value in the sequence is replaced with the contents of + * new_rblk. Again, if no match is found, -1 is returned and errno is set to + * ENOENT. + * + * rctlsys_set() causes a cursor test, which can reactivate resource controls + * that have previously fired. + */ +static long +rctlsys_set(char *name, rctl_opaque_t *old_rblk, rctl_opaque_t *new_rblk, + int flags) +{ + rctl_val_t *nval; + rctl_dict_entry_t *rde; + rctl_opaque_t *nblk; + rctl_hndl_t hndl; + char *kname; + size_t klen; + long ret = 0; + proc_t *pp = NULL; + pid_t pid; + int action = flags & (~RCTLSYS_ACTION_MASK); + rctl_val_t *oval; + rctl_val_t *rval1; + rctl_val_t *rval2; + rctl_val_t *tval; + rctl_opaque_t *oblk; + + if (flags & (~RCTLSYS_MASK)) + return (set_errno(EINVAL)); + + if (action != RCTL_INSERT && + action != RCTL_DELETE && + action != RCTL_REPLACE) + return (set_errno(EINVAL)); + + if (new_rblk == NULL || name == NULL) + return (set_errno(EFAULT)); + + kname = kmem_alloc(MAXPATHLEN, KM_SLEEP); + if (copyinstr(name, kname, MAXPATHLEN, &klen) != 0) { + kmem_free(kname, MAXPATHLEN); + return (set_errno(EFAULT)); + } + + if ((hndl = rctl_hndl_lookup(kname)) == -1) { + kmem_free(kname, MAXPATHLEN); + return (set_errno(EINVAL)); + } + + kmem_free(kname, MAXPATHLEN); + + rde = rctl_dict_lookup_hndl(hndl); + + nblk = kmem_alloc(sizeof (rctl_opaque_t), KM_SLEEP); + + if (copyin(new_rblk, nblk, sizeof (rctl_opaque_t)) == -1) { + kmem_free(nblk, sizeof (rctl_opaque_t)); + return (set_errno(EFAULT)); + } + + nval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP); + + rctlsys_rblk_xfrm(nblk, NULL, nval, RBX_FROM_BLK | RBX_VAL); + + if (rctl_invalid_value(rde, nval)) { + kmem_free(nblk, sizeof (rctl_opaque_t)); + kmem_cache_free(rctl_val_cache, nval); + return (set_errno(EINVAL)); + } + + /* allocate what we might need before potentially grabbing p_lock */ + oblk = kmem_alloc(sizeof (rctl_opaque_t), KM_SLEEP); + oval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP); + rval1 = kmem_cache_alloc(rctl_val_cache, KM_SLEEP); + rval2 = kmem_cache_alloc(rctl_val_cache, KM_SLEEP); + + if (nval->rcv_privilege == RCPRIV_BASIC) { + if (flags & RCTL_USE_RECIPIENT_PID) { + pid = nval->rcv_action_recip_pid; + + /* case for manipulating rctl values on other procs */ + if (pid != curproc->p_pid) { + /* cannot be other pid on process rctls */ + if (rde->rcd_entity == RCENTITY_PROCESS) { + ret = set_errno(EINVAL); + goto 
rctlsys_out; + } + /* + * must have privilege to manipulate controls + * on other processes + */ + if (secpolicy_rctlsys(CRED(), B_FALSE) != 0) { + ret = set_errno(EACCES); + goto rctlsys_out; + } + + pid = nval->rcv_action_recip_pid; + mutex_enter(&pidlock); + pp = prfind(pid); + if (!pp) { + mutex_exit(&pidlock); + ret = set_errno(ESRCH); + goto rctlsys_out; + } + + /* + * idle or zombie procs have either not yet + * set up their rctls or have already done + * their rctl_set_tearoff's. + */ + if (pp->p_stat == SZOMB || + pp->p_stat == SIDL) { + mutex_exit(&pidlock); + ret = set_errno(ESRCH); + goto rctlsys_out; + } + + /* + * hold this pp's p_lock to ensure that + * it does not do its rctl_set_tearoff. + * If we did not do this, we could + * potentially add rctls to the entity + * with a recipient that is a process + * that has exited. + */ + mutex_enter(&pp->p_lock); + mutex_exit(&pidlock); + + /* + * We know that curproc's task, project, + * and zone pointers will not change + * because functions that change them + * call holdlwps(SHOLDFORK1) first. + */ + + /* + * verify that the found pp is in the + * current task. If it is, then it + * is also within the current project + * and zone. + */ + if (rde->rcd_entity == RCENTITY_TASK && + pp->p_task != curproc->p_task) { + ret = set_errno(ESRCH); + goto rctlsys_out; + } + + ASSERT(pp->p_task->tk_proj == + curproc->p_task->tk_proj); + ASSERT(pp->p_zone == curproc->p_zone); + + + nval->rcv_action_recipient = pp; + nval->rcv_action_recip_pid = pid; + + } else { + /* for manipulating rctl values on this proc */ + mutex_enter(&curproc->p_lock); + pp = curproc; + nval->rcv_action_recipient = curproc; + nval->rcv_action_recip_pid = curproc->p_pid; + } + + } else { + /* RCTL_USE_RECIPIENT_PID not set, use this proc */ + mutex_enter(&curproc->p_lock); + pp = curproc; + nval->rcv_action_recipient = curproc; + nval->rcv_action_recip_pid = curproc->p_pid; + } + + } else { + /* privileged controls have no recipient pid */ + mutex_enter(&curproc->p_lock); + pp = curproc; + nval->rcv_action_recipient = NULL; + nval->rcv_action_recip_pid = -1; + } + + nval->rcv_firing_time = 0; + + if (action == RCTL_REPLACE) { + + if (copyin(old_rblk, oblk, sizeof (rctl_opaque_t)) == -1) { + ret = set_errno(EFAULT); + goto rctlsys_out; + } + + rctlsys_rblk_xfrm(oblk, NULL, oval, RBX_FROM_BLK | RBX_VAL); + + if (rctl_invalid_value(rde, oval)) { + ret = set_errno(EINVAL); + goto rctlsys_out; + } + + if (oval->rcv_privilege == RCPRIV_BASIC) { + if (!(flags & RCTL_USE_RECIPIENT_PID)) { + oval->rcv_action_recipient = curproc; + oval->rcv_action_recip_pid = curproc->p_pid; + } + } else { + oval->rcv_action_recipient = NULL; + oval->rcv_action_recip_pid = -1; + } + + /* + * Find the real value we're attempting to replace on the + * sequence, rather than trusting the one delivered from + * userland. + */ + if (ret = rctl_local_get(hndl, NULL, rval1, pp)) { + (void) set_errno(ret); + goto rctlsys_out; + } + + do { + if (rval1->rcv_privilege == RCPRIV_SYSTEM || + rctl_val_cmp(oval, rval1, 0) == 0) + break; + + tval = rval1; + rval1 = rval2; + rval2 = tval; + } while (rctl_local_get(hndl, rval2, rval1, pp) == 0); + + if (rval1->rcv_privilege == RCPRIV_SYSTEM) { + if (rctl_val_cmp(oval, rval1, 1) == 0) + ret = set_errno(EPERM); + else + ret = set_errno(ESRCH); + + goto rctlsys_out; + } + + bcopy(rval1, oval, sizeof (rctl_val_t)); + + /* + * System controls are immutable. 
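+ * They are the kernel's own absolute maxima for each control; + * getrctl(2) can report them, but an attempt to supply a system value + * as either the old or the new value of a replace fails with EPERM + * below.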
+ */ + if (nval->rcv_privilege == RCPRIV_SYSTEM) { + ret = set_errno(EPERM); + goto rctlsys_out; + } + + /* + * Only privileged processes in the global zone can modify + * privileged rctls of type RCENTITY_ZONE; replacing privileged + * controls with basic ones are not allowed either. Lowering a + * lowerable one might be OK for privileged processes in a + * non-global zone, but lowerable rctls probably don't make + * sense for zones (hence, not modifiable from within a zone). + */ + if (rde->rcd_entity == RCENTITY_ZONE && + (nval->rcv_privilege == RCPRIV_PRIVILEGED || + oval->rcv_privilege == RCPRIV_PRIVILEGED) && + secpolicy_rctlsys(CRED(), B_TRUE) != 0) { + ret = set_errno(EACCES); + goto rctlsys_out; + } + + /* + * Must be privileged to replace a privileged control with + * a basic one. + */ + if (oval->rcv_privilege == RCPRIV_PRIVILEGED && + nval->rcv_privilege != RCPRIV_PRIVILEGED && + secpolicy_rctlsys(CRED(), B_FALSE) != 0) { + ret = set_errno(EACCES); + goto rctlsys_out; + } + + /* + * Must have lowerable global property for non-privileged + * to lower the value of a privileged control; otherwise must + * have sufficient privileges to modify privileged controls + * at all. + */ + if (oval->rcv_privilege == RCPRIV_PRIVILEGED && + nval->rcv_privilege == RCPRIV_PRIVILEGED && + ((((rde->rcd_flagaction & RCTL_GLOBAL_LOWERABLE) == 0) || + oval->rcv_flagaction != nval->rcv_flagaction || + oval->rcv_action_signal != nval->rcv_action_signal || + oval->rcv_value < nval->rcv_value)) && + secpolicy_rctlsys(CRED(), B_FALSE) != 0) { + ret = set_errno(EACCES); + goto rctlsys_out; + } + + if (ret = rctl_local_replace(hndl, oval, nval, pp)) { + (void) set_errno(ret); + goto rctlsys_out; + } + + /* ensure that nval is not freed */ + nval = NULL; + + } else if (action == RCTL_INSERT) { + /* + * System controls are immutable. + */ + if (nval->rcv_privilege == RCPRIV_SYSTEM) { + ret = set_errno(EPERM); + goto rctlsys_out; + } + + /* + * Only privileged processes in the global zone may add + * privileged zone.* rctls. Only privileged processes + * may add other privileged rctls. + */ + if (nval->rcv_privilege == RCPRIV_PRIVILEGED) { + if ((rde->rcd_entity == RCENTITY_ZONE && + secpolicy_rctlsys(CRED(), B_TRUE) != 0) || + (rde->rcd_entity != RCENTITY_ZONE && + secpolicy_rctlsys(CRED(), B_FALSE) != 0)) { + ret = set_errno(EACCES); + goto rctlsys_out; + } + } + + /* + * Only one basic control is allowed per rctl. + * If a basic control is being inserted, delete + * any other basic control. 
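+ * Only the caller's own basic value is purged here (note the + * rcv_action_recipient == curproc test below), so basic values that + * other processes have established on the same entity are preserved.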
+ */ + if ((nval->rcv_privilege == RCPRIV_BASIC) && + (rctl_local_get(hndl, NULL, rval1, pp) == 0)) { + do { + if (rval1->rcv_privilege == RCPRIV_BASIC && + rval1->rcv_action_recipient == curproc) { + (void) rctl_local_delete(hndl, rval1, + pp); + if (rctl_local_get(hndl, NULL, rval1, + pp) != 0) + break; + } + + tval = rval1; + rval1 = rval2; + rval2 = tval; + } while (rctl_local_get(hndl, rval2, rval1, pp) + == 0); + } + + + if (ret = rctl_local_insert(hndl, nval, pp)) { + (void) set_errno(ret); + goto rctlsys_out; + } + + /* ensure that nval is not freed */ + nval = NULL; + + } else { + /* + * RCTL_DELETE + */ + if (nval->rcv_privilege == RCPRIV_SYSTEM) { + ret = set_errno(EPERM); + goto rctlsys_out; + } + + if (nval->rcv_privilege == RCPRIV_PRIVILEGED) { + if ((rde->rcd_entity == RCENTITY_ZONE && + secpolicy_rctlsys(CRED(), B_TRUE) != 0) || + (rde->rcd_entity != RCENTITY_ZONE && + secpolicy_rctlsys(CRED(), B_FALSE) != 0)) { + ret = set_errno(EACCES); + goto rctlsys_out; + } + } + + if (ret = rctl_local_delete(hndl, nval, pp)) { + (void) set_errno(ret); + goto rctlsys_out; + } + } + +rctlsys_out: + + if (pp) + mutex_exit(&pp->p_lock); + + kmem_free(nblk, sizeof (rctl_opaque_t)); + kmem_free(oblk, sizeof (rctl_opaque_t)); + + /* only free nval if we did not rctl_local_insert it */ + if (nval) + kmem_cache_free(rctl_val_cache, nval); + + kmem_cache_free(rctl_val_cache, oval); + kmem_cache_free(rctl_val_cache, rval1); + kmem_cache_free(rctl_val_cache, rval2); + + return (ret); +} + +static long +rctlsys_lst(char *ubuf, size_t ubufsz) +{ + char *kbuf; + size_t kbufsz; + + kbufsz = rctl_build_name_buf(&kbuf); + + if (kbufsz <= ubufsz && + copyout(kbuf, ubuf, kbufsz) != 0) { + kmem_free(kbuf, kbufsz); + return (set_errno(EFAULT)); + } + + kmem_free(kbuf, kbufsz); + + return (kbufsz); +} + +static long +rctlsys_ctl(char *name, rctl_opaque_t *rblk, int flags) +{ + rctl_dict_entry_t *krde; + rctl_opaque_t *krblk; + char *kname; + size_t klen; + + kname = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + if (name == NULL || copyinstr(name, kname, MAXPATHLEN, &klen) != 0) { + kmem_free(kname, MAXPATHLEN); + return (set_errno(EFAULT)); + } + + switch (flags) { + case RCTLCTL_GET: + krde = kmem_alloc(sizeof (rctl_dict_entry_t), KM_SLEEP); + krblk = kmem_zalloc(sizeof (rctl_opaque_t), KM_SLEEP); + + if (rctl_global_get(kname, krde) == -1) { + kmem_free(krde, sizeof (rctl_dict_entry_t)); + kmem_free(krblk, sizeof (rctl_opaque_t)); + kmem_free(kname, MAXPATHLEN); + return (set_errno(ESRCH)); + } + + rctlsys_rblk_xfrm(krblk, krde, NULL, RBX_TO_BLK | RBX_CTL); + + if (copyout(krblk, rblk, sizeof (rctl_opaque_t)) != 0) { + kmem_free(krde, sizeof (rctl_dict_entry_t)); + kmem_free(krblk, sizeof (rctl_opaque_t)); + kmem_free(kname, MAXPATHLEN); + return (set_errno(EFAULT)); + } + + kmem_free(krde, sizeof (rctl_dict_entry_t)); + kmem_free(krblk, sizeof (rctl_opaque_t)); + kmem_free(kname, MAXPATHLEN); + break; + case RCTLCTL_SET: + if (secpolicy_rctlsys(CRED(), B_TRUE) != 0) { + kmem_free(kname, MAXPATHLEN); + return (set_errno(EPERM)); + } + + krde = kmem_alloc(sizeof (rctl_dict_entry_t), KM_SLEEP); + krblk = kmem_zalloc(sizeof (rctl_opaque_t), KM_SLEEP); + + if (rctl_global_get(kname, krde) == -1) { + kmem_free(krde, sizeof (rctl_dict_entry_t)); + kmem_free(krblk, sizeof (rctl_opaque_t)); + kmem_free(kname, MAXPATHLEN); + return (set_errno(ESRCH)); + } + + if (copyin(rblk, krblk, sizeof (rctl_opaque_t)) != 0) { + kmem_free(krde, sizeof (rctl_dict_entry_t)); + kmem_free(krblk, sizeof (rctl_opaque_t)); + kmem_free(kname, 
MAXPATHLEN); + return (set_errno(EFAULT)); + } + + rctlsys_rblk_xfrm(krblk, krde, NULL, RBX_FROM_BLK | RBX_CTL); + + if (rctl_global_set(kname, krde) == -1) { + kmem_free(krde, sizeof (rctl_dict_entry_t)); + kmem_free(krblk, sizeof (rctl_opaque_t)); + kmem_free(kname, MAXPATHLEN); + return (set_errno(ESRCH)); + } + + kmem_free(krde, sizeof (rctl_dict_entry_t)); + kmem_free(krblk, sizeof (rctl_opaque_t)); + kmem_free(kname, MAXPATHLEN); + + break; + default: + kmem_free(kname, MAXPATHLEN); + return (set_errno(EINVAL)); + } + + return (0); +} + +long +rctlsys(int code, char *name, void *obuf, void *nbuf, size_t obufsz, int flags) +{ + switch (code) { + case 0: + return (rctlsys_get(name, obuf, nbuf, flags)); + + case 1: + return (rctlsys_set(name, obuf, nbuf, flags)); + + case 2: + /* + * Private call for rctl_walk(3C). + */ + return (rctlsys_lst(obuf, obufsz)); + + case 3: + /* + * Private code for rctladm(1M): "rctlctl". + */ + return (rctlsys_ctl(name, obuf, flags)); + + default: + return (set_errno(EINVAL)); + } +} diff --git a/usr/src/uts/common/syscall/readlink.c b/usr/src/uts/common/syscall/readlink.c new file mode 100644 index 0000000000..a1e8475787 --- /dev/null +++ b/usr/src/uts/common/syscall/readlink.c @@ -0,0 +1,119 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/isa_defs.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/cred.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/pathname.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/uio.h> +#include <sys/ioreq.h> +#include <sys/debug.h> + +/* + * Read the contents of a symbolic link. + */ +ssize_t +readlink(char *name, char *buf, size_t count) +{ + vnode_t *vp; + struct iovec aiov; + struct uio auio; + int error; + struct vattr vattr; + ssize_t cnt; + + if ((cnt = (ssize_t)count) < 0) + return (set_errno(EINVAL)); + +lookup: + if (error = lookupname(name, UIO_USERSPACE, NO_FOLLOW, NULLVPP, &vp)) { + if (error == ESTALE) + goto lookup; + return (set_errno(error)); + } + + if (vp->v_type != VLNK) { + /* + * Ask the underlying filesystem if it wants this + * object to look like a symlink at user-level. 
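+ * (A filesystem may report a va_type of VLNK through VOP_GETATTR even + * though v_type on the vnode is not VLNK.) As readlink(2) specifies, + * the link contents copied out are not null-terminated; the caller + * learns the byte count from the return value, e.g. (an illustrative + * userland sketch only): + * + *	char tgt[PATH_MAX]; + *	ssize_t n = readlink("/a/symlink", tgt, sizeof (tgt));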
+ */ + vattr.va_mask = AT_TYPE; + error = VOP_GETATTR(vp, &vattr, 0, CRED()); + if (error || vattr.va_type != VLNK) { + VN_RELE(vp); + if (error == ESTALE) + goto lookup; + return (set_errno(EINVAL)); + } + } + aiov.iov_base = buf; + aiov.iov_len = cnt; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_loffset = 0; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_extflg = UIO_COPY_CACHED; + auio.uio_resid = cnt; + error = VOP_READLINK(vp, &auio, CRED()); + VN_RELE(vp); + if (error) { + if (error == ESTALE) + goto lookup; + return (set_errno(error)); + } + return ((ssize_t)(cnt - auio.uio_resid)); +} + +#ifdef _SYSCALL32_IMPL +/* + * readlink32() intentionally returns a ssize_t rather than ssize32_t; + * see the comments above read32 for details. + */ + +ssize_t +readlink32(caddr32_t name, caddr32_t buf, size32_t count) +{ + return ((ssize32_t)readlink((char *)(uintptr_t)name, + (char *)(uintptr_t)buf, (ssize32_t)count)); +} + +#endif /* _SYSCALL32_IMPL */ diff --git a/usr/src/uts/common/syscall/rename.c b/usr/src/uts/common/syscall/rename.c new file mode 100644 index 0000000000..4d8d5270ed --- /dev/null +++ b/usr/src/uts/common/syscall/rename.c @@ -0,0 +1,139 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2001 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/isa_defs.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/vnode.h> +#include <sys/uio.h> +#include <sys/debug.h> +#include <sys/file.h> +#include <sys/fcntl.h> + +/* + * Rename or move an existing file. 
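+ * Both path names are resolved on behalf of the caller and, as + * rename(2) requires, an existing destination is replaced atomically. + * renameat(), below, generalizes this to paths interpreted relative to + * directory file descriptors, with AT_FDCWD standing for the current + * working directory.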
+ */ +int +rename(char *from, char *to) +{ + int error; + + if (error = vn_rename(from, to, UIO_USERSPACE)) + return (set_errno(error)); + return (0); +} + +/* + * Rename a file relative to a given directory + */ +int +renameat(int fromfd, char *old, int tofd, char *new) +{ + file_t *fromfp; + file_t *tofp; + vnode_t *fromvp, *tovp; + int error; + proc_t *p = curproc; + char oldstart, newstart; + + tovp = fromvp = NULL; + + if ((fromfd == AT_FDCWD && old == NULL) || + (tofd == AT_FDCWD && new == NULL)) + return (set_errno(EFAULT)); + + if (fromfd == AT_FDCWD || tofd == AT_FDCWD) { + mutex_enter(&p->p_lock); + if (fromfd == AT_FDCWD) { + fromvp = PTOU(p)->u_cdir; + VN_HOLD(fromvp); + } + if (tofd == AT_FDCWD) { + tovp = PTOU(p)->u_cdir; + VN_HOLD(tovp); + } + mutex_exit(&p->p_lock); + } + + if (copyin(old, &oldstart, sizeof (char))) + return (set_errno(EFAULT)); + + if (copyin(new, &newstart, sizeof (char))) + return (set_errno(EFAULT)); + + if (fromvp == NULL) { + if (oldstart != '/') { + if ((fromfp = getf(fromfd)) == NULL) { + if (tovp != NULL) + VN_RELE(tovp); + return (set_errno(EBADF)); + } + fromvp = fromfp->f_vnode; + VN_HOLD(fromvp); + releasef(fromfd); + } else { + fromvp = NULL; + } + } + + if (tovp == NULL) { + if (newstart != '/') { + if ((tofp = getf(tofd)) == NULL) { + if (fromvp != NULL) + VN_RELE(fromvp); + return (set_errno(EBADF)); + } + tovp = tofp->f_vnode; + VN_HOLD(tovp); + releasef(tofd); + } else { + tovp = NULL; + } + } + + error = vn_renameat(fromvp, old, tovp, new, UIO_USERSPACE); + + if (fromvp != NULL) + VN_RELE(fromvp); + if (tovp != NULL) + VN_RELE(tovp); + if (error != 0) + return (set_errno(error)); + return (error); +} diff --git a/usr/src/uts/common/syscall/resolvepath.c b/usr/src/uts/common/syscall/resolvepath.c new file mode 100644 index 0000000000..e6cb678761 --- /dev/null +++ b/usr/src/uts/common/syscall/resolvepath.c @@ -0,0 +1,60 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#ident "%Z%%M% %I% %E% SMI" + +/* + * Copyright 1997 by Sun Microsystems, Inc. + * All rights reserved. 
+ */ + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/pathname.h> + +int +resolvepath(char *path, char *buf, size_t count) +{ + struct pathname lookpn; + struct pathname resolvepn; + int error; + + if (count == 0) + return (0); + if (error = pn_get(path, UIO_USERSPACE, &lookpn)) + return (set_errno(error)); + pn_alloc(&resolvepn); + error = lookuppn(&lookpn, &resolvepn, FOLLOW, NULL, NULL); + if (error == 0) { + if (count > resolvepn.pn_pathlen) + count = resolvepn.pn_pathlen; + if (copyout(resolvepn.pn_path, buf, count)) + error = EFAULT; + } + pn_free(&resolvepn); + pn_free(&lookpn); + + if (error) + return (set_errno(error)); + return ((int)count); +} diff --git a/usr/src/uts/common/syscall/rlimit.c b/usr/src/uts/common/syscall/rlimit.c new file mode 100644 index 0000000000..eac3584764 --- /dev/null +++ b/usr/src/uts/common/syscall/rlimit.c @@ -0,0 +1,487 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/inttypes.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/tuneable.h> +#include <sys/user.h> +#include <sys/errno.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/proc.h> +#include <sys/resource.h> +#include <sys/ulimit.h> +#include <sys/debug.h> +#include <sys/rctl.h> + +#include <vm/as.h> + +/* + * Perhaps ulimit could be moved into a user library, as calls to + * getrlimit and setrlimit, were it not for binary compatibility + * restrictions. + */ +long +ulimit(int cmd, long arg) +{ + proc_t *p = curproc; + long retval; + + switch (cmd) { + + case UL_GFILLIM: /* Return current file size limit. */ + { + rlim64_t filesize; + + mutex_enter(&p->p_lock); + filesize = rctl_enforced_value(rctlproc_legacy[RLIMIT_FSIZE], + p->p_rctls, p); + mutex_exit(&p->p_lock); + + if (get_udatamodel() == DATAMODEL_ILP32) { + /* + * File size is returned in blocks for ulimit. + * This function is deprecated and therefore LFS API + * didn't define the behaviour of ulimit. + * Here we return maximum value of file size possible + * so that applications that do not check errors + * continue to work. + */ + if (filesize > MAXOFF32_T) + filesize = MAXOFF32_T; + retval = ((int)filesize >> SCTRSHFT); + } else + retval = filesize >> SCTRSHFT; + break; + } + + case UL_SFILLIM: /* Set new file size limit. 
*/ + { + int error = 0; + rlim64_t lim = (rlim64_t)arg; + struct rlimit64 rl64; + rctl_alloc_gp_t *gp = rctl_rlimit_set_prealloc(1); + + if (lim >= (((rlim64_t)MAXOFFSET_T) >> SCTRSHFT)) + lim = (rlim64_t)RLIM64_INFINITY; + else + lim <<= SCTRSHFT; + + rl64.rlim_max = rl64.rlim_cur = lim; + mutex_enter(&p->p_lock); + if (error = rctl_rlimit_set(rctlproc_legacy[RLIMIT_FSIZE], p, + &rl64, gp, RCTL_LOCAL_DENY | RCTL_LOCAL_SIGNAL, SIGXFSZ, + CRED())) { + mutex_exit(&p->p_lock); + rctl_prealloc_destroy(gp); + return (set_errno(error)); + } + mutex_exit(&p->p_lock); + rctl_prealloc_destroy(gp); + retval = arg; + break; + } + + case UL_GMEMLIM: /* Return maximum possible break value. */ + { + struct seg *seg; + struct seg *nextseg; + struct as *as = p->p_as; + caddr_t brkend; + caddr_t brkbase; + size_t size; + rlim64_t size_ctl; + rlim64_t vmem_ctl; + + /* + * Find the segment with a virtual address + * greater than the end of the current break. + */ + nextseg = NULL; + mutex_enter(&p->p_lock); + brkbase = (caddr_t)p->p_brkbase; + brkend = (caddr_t)p->p_brkbase + p->p_brksize; + mutex_exit(&p->p_lock); + + /* + * Since we can't return less than the current break, + * initialize the return value to the current break + */ + retval = (long)brkend; + + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + for (seg = as_findseg(as, brkend, 0); seg != NULL; + seg = AS_SEGNEXT(as, seg)) { + if (seg->s_base >= brkend) { + nextseg = seg; + break; + } + } + + mutex_enter(&p->p_lock); + size_ctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_DATA], + p->p_rctls, p); + vmem_ctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_VMEM], + p->p_rctls, p); + mutex_exit(&p->p_lock); + + /* + * First, calculate the maximum break value based on + * the user's RLIMIT_DATA, but also taking into account + * that this value cannot be greater than as->a_userlimit. + * We also take care to make sure that we don't overflow + * in the calculation. + */ + /* + * Since we are casting the RLIMIT_DATA value to a + * ulong (a 32-bit value in the 32-bit kernel) we have + * to pass this assertion. + */ + ASSERT32((size_t)size_ctl <= UINT32_MAX); + + size = (size_t)size_ctl; + if (as->a_userlimit - brkbase > size) + retval = MAX((size_t)retval, (size_t)(brkbase + size)); + /* don't return less than current */ + else + retval = (long)as->a_userlimit; + + /* + * The max break cannot extend into the next segment + */ + if (nextseg != NULL) + retval = MIN((uintptr_t)retval, + (uintptr_t)nextseg->s_base); + + /* + * Handle the case where there is a limit on RLIMIT_VMEM + */ + if (vmem_ctl < UINT64_MAX) { + /* calculate brkend based on the end of page */ + caddr_t brkendpg = (caddr_t)roundup((uintptr_t)brkend, + PAGESIZE); + /* + * Large Files: The following assertion has to pass + * through to ensure the correctness of the cast. 
+ */ + ASSERT32(vmem_ctl <= UINT32_MAX); + + size = (size_t)(vmem_ctl & PAGEMASK); + + if (as->a_size < size) + size -= as->a_size; + else + size = 0; + /* + * Take care to not overflow the calculation + */ + if (as->a_userlimit - brkendpg > size) + retval = MIN((size_t)retval, + (size_t)(brkendpg + size)); + } + + AS_LOCK_EXIT(as, &as->a_lock); + + /* truncate to same boundary as sbrk */ + + switch (get_udatamodel()) { + default: + case DATAMODEL_ILP32: + retval = retval & ~(8-1); + break; + case DATAMODEL_LP64: + retval = retval & ~(16-1); + break; + } + break; + } + + case UL_GDESLIM: /* Return approximate number of open files */ + { + rlim64_t fdno_ctl; + + mutex_enter(&curproc->p_lock); + fdno_ctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_NOFILE], + curproc->p_rctls, curproc); + ASSERT(fdno_ctl <= INT_MAX); + retval = (rlim_t)fdno_ctl; + mutex_exit(&curproc->p_lock); + break; + } + + default: + return (set_errno(EINVAL)); + + } + return (retval); +} + +#ifdef _SYSCALL32_IMPL + +int +ulimit32(int cmd, int arg) +{ + return ((int)ulimit(cmd, (long)arg)); +} + +#endif /* _SYSCALL32_IMPL */ + +#if defined(_ILP32) || defined(_SYSCALL32_IMPL) + +/* + * Large Files: getrlimit returns RLIM_SAVED_CUR or RLIM_SAVED_MAX when + * rlim_cur or rlim_max is not representable in 32-bit rlim_t. These + * values are just tokens which will be used in setrlimit to set the + * correct limits. The current limits are saved in the saved_rlimit members + * in user structures when the token is returned. setrlimit restores + * the limit values to these saved values when the token is passed. + * Consider the following common scenario of the apps: + * + * limit = getrlimit(); + * savedlimit = limit; + * limit = limit1; + * setrlimit(limit) + * // execute all processes in the new rlimit state. + * setrlimit(savedlimit) // restore the old values. + * + * Most apps don't check error returns from getrlimit or setrlimit + * and this is why we return tokens when the correct value + * cannot be represented in rlim_t. For more discussion refer to + * the LFS API document. + * + * In the 64-bit kernel, all existing resource limits are treated in this + * manner. In the 32-bit kernel, CPU time is treated equivalently to the + * file size limit above; the VM-related limits are not. The macro, + * RLIM_SAVED(x), returns true if the resource limit should be handled in + * this way on the current kernel. + */ +int +getrlimit32(int resource, struct rlimit32 *rlp) +{ + struct rlimit32 rlim32; + struct rlimit64 rlim64; + struct proc *p = curproc; + struct user *up = PTOU(p); + int savecur = 0; + int savemax = 0; + + if (resource < 0 || resource >= RLIM_NLIMITS) + return (set_errno(EINVAL)); + + mutex_enter(&p->p_lock); + (void) rctl_rlimit_get(rctlproc_legacy[resource], p, &rlim64); + mutex_exit(&p->p_lock); + + if (rlim64.rlim_max > (rlim64_t)UINT32_MAX) { + + if (rlim64.rlim_max == RLIM64_INFINITY) + rlim32.rlim_max = RLIM32_INFINITY; + else { + savemax = 1; + rlim32.rlim_max = RLIM32_SAVED_MAX; + /*CONSTCOND*/ + ASSERT(RLIM_SAVED(resource)); + } + + if (rlim64.rlim_cur == RLIM64_INFINITY) + rlim32.rlim_cur = RLIM32_INFINITY; + else if (rlim64.rlim_cur == rlim64.rlim_max) { + savecur = 1; + rlim32.rlim_cur = RLIM32_SAVED_MAX; + /*CONSTCOND*/ + ASSERT(RLIM_SAVED(resource)); + } else if (rlim64.rlim_cur > (rlim64_t)UINT32_MAX) { + savecur = 1; + rlim32.rlim_cur = RLIM32_SAVED_CUR; + /*CONSTCOND*/ + ASSERT(RLIM_SAVED(resource)); + } else + rlim32.rlim_cur = rlim64.rlim_cur; + + /* + * save the current limits in user structure. 
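+ * A later setrlimit32() that hands the RLIM32_SAVED_CUR or + * RLIM32_SAVED_MAX tokens back to the kernel will then restore these + * untruncated 64-bit values rather than the tokens themselves.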
+ */ + /*CONSTCOND*/ + if (RLIM_SAVED(resource)) { + mutex_enter(&p->p_lock); + if (savemax) + up->u_saved_rlimit[resource].rlim_max = + rlim64.rlim_max; + if (savecur) + up->u_saved_rlimit[resource].rlim_cur = + rlim64.rlim_cur; + mutex_exit(&p->p_lock); + } + } else { + ASSERT(rlim64.rlim_cur <= (rlim64_t)UINT32_MAX); + rlim32.rlim_max = rlim64.rlim_max; + rlim32.rlim_cur = rlim64.rlim_cur; + } + + if (copyout(&rlim32, rlp, sizeof (rlim32))) + return (set_errno(EFAULT)); + + return (0); +} + +/* + * See comments above getrlimit32(). When the tokens are passed in the + * rlimit structure the values are considered equal to the values + * stored in saved_rlimit members of user structure. + * When the user passes RLIM_INFINITY to set the resource limit to + * unlimited internally understand this value as RLIM64_INFINITY and + * let rlimit() do the job. + */ +int +setrlimit32(int resource, struct rlimit32 *rlp) +{ + struct rlimit32 rlim32; + struct rlimit64 rlim64; + struct rlimit64 saved_rlim; + int error; + struct proc *p = ttoproc(curthread); + struct user *up = PTOU(p); + rctl_alloc_gp_t *gp; + + if (resource < 0 || resource >= RLIM_NLIMITS) + return (set_errno(EINVAL)); + if (copyin(rlp, &rlim32, sizeof (rlim32))) + return (set_errno(EFAULT)); + + gp = rctl_rlimit_set_prealloc(1); + + /* + * Disallow resource limit tunnelling + */ + /*CONSTCOND*/ + if (RLIM_SAVED(resource)) { + mutex_enter(&p->p_lock); + saved_rlim = up->u_saved_rlimit[resource]; + mutex_exit(&p->p_lock); + } else { + saved_rlim.rlim_max = (rlim64_t)rlim32.rlim_max; + saved_rlim.rlim_cur = (rlim64_t)rlim32.rlim_cur; + } + + switch (rlim32.rlim_cur) { + case RLIM32_INFINITY: + rlim64.rlim_cur = RLIM64_INFINITY; + break; + case RLIM32_SAVED_CUR: + rlim64.rlim_cur = saved_rlim.rlim_cur; + break; + case RLIM32_SAVED_MAX: + rlim64.rlim_cur = saved_rlim.rlim_max; + break; + default: + rlim64.rlim_cur = (rlim64_t)rlim32.rlim_cur; + break; + } + + switch (rlim32.rlim_max) { + case RLIM32_INFINITY: + rlim64.rlim_max = RLIM64_INFINITY; + break; + case RLIM32_SAVED_MAX: + rlim64.rlim_max = saved_rlim.rlim_max; + break; + case RLIM32_SAVED_CUR: + rlim64.rlim_max = saved_rlim.rlim_cur; + break; + default: + rlim64.rlim_max = (rlim64_t)rlim32.rlim_max; + break; + } + + mutex_enter(&p->p_lock); + if (error = rctl_rlimit_set(rctlproc_legacy[resource], p, &rlim64, gp, + rctlproc_flags[resource], rctlproc_signals[resource], CRED())) { + mutex_exit(&p->p_lock); + rctl_prealloc_destroy(gp); + return (set_errno(error)); + } + mutex_exit(&p->p_lock); + rctl_prealloc_destroy(gp); + + return (0); +} + +#endif /* _ILP32 && _SYSCALL32_IMPL */ + +int +getrlimit64(int resource, struct rlimit64 *rlp) +{ + struct rlimit64 rlim64; + struct proc *p = ttoproc(curthread); + + if (resource < 0 || resource >= RLIM_NLIMITS) + return (set_errno(EINVAL)); + + mutex_enter(&p->p_lock); + (void) rctl_rlimit_get(rctlproc_legacy[resource], p, &rlim64); + mutex_exit(&p->p_lock); + + if (copyout(&rlim64, rlp, sizeof (rlim64))) + return (set_errno(EFAULT)); + return (0); +} + +int +setrlimit64(int resource, struct rlimit64 *rlp) +{ + struct rlimit64 rlim64; + struct proc *p = ttoproc(curthread); + int error; + rctl_alloc_gp_t *gp; + + if (resource < 0 || resource >= RLIM_NLIMITS) + return (set_errno(EINVAL)); + if (copyin(rlp, &rlim64, sizeof (rlim64))) + return (set_errno(EFAULT)); + + gp = rctl_rlimit_set_prealloc(1); + + mutex_enter(&p->p_lock); + if (error = rctl_rlimit_set(rctlproc_legacy[resource], p, &rlim64, gp, + rctlproc_flags[resource], 
rctlproc_signals[resource], CRED())) { + mutex_exit(&p->p_lock); + rctl_prealloc_destroy(gp); + return (set_errno(error)); + } + mutex_exit(&p->p_lock); + rctl_prealloc_destroy(gp); + return (0); + +} diff --git a/usr/src/uts/common/syscall/rmdir.c b/usr/src/uts/common/syscall/rmdir.c new file mode 100644 index 0000000000..0a0ad7e2cd --- /dev/null +++ b/usr/src/uts/common/syscall/rmdir.c @@ -0,0 +1,60 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 1989 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/isa_defs.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/vnode.h> +#include <sys/uio.h> +#include <sys/filio.h> + +#include <sys/debug.h> + +/* + * Remove a directory. + */ +int +rmdir(char *dname) +{ + int error; + + if (error = vn_remove(dname, UIO_USERSPACE, RMDIRECTORY)) + return (set_errno(error)); + return (0); +} diff --git a/usr/src/uts/common/syscall/rusagesys.c b/usr/src/uts/common/syscall/rusagesys.c new file mode 100644 index 0000000000..5b66f2fa41 --- /dev/null +++ b/usr/src/uts/common/syscall/rusagesys.c @@ -0,0 +1,294 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
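The file that follows implements the kernel side of getrusage(). As a hedged user-level preview of the three flavors implemented below (RUSAGE_SELF aggregating every LWP, RUSAGE_CHILDREN reading the totals of waited-for children; a Solaris-specific RUSAGE_LWP also exists):

#include <sys/resource.h>
#include <stdio.h>

int
main(void)
{
	struct rusage ru;

	if (getrusage(RUSAGE_SELF, &ru) == 0)
		printf("self: %ld.%06ld user, %ld majflt\n",
		    (long)ru.ru_utime.tv_sec, (long)ru.ru_utime.tv_usec,
		    ru.ru_majflt);
	if (getrusage(RUSAGE_CHILDREN, &ru) == 0)
		printf("children: %ld voluntary ctx switches\n", ru.ru_nvcsw);
	return (0);
}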
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Implement fast getrusage call + */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/time.h> +#include <sys/errno.h> +#include <sys/resource.h> + +static int +getrusage(void *user_rusage) +{ + struct rusage r; + kthread_t *t = curthread; + proc_t *p = ttoproc(t); + hrtime_t snsecs, unsecs; + klwp_t *lwp; + + r.ru_maxrss = 0; /* always 0 */ + r.ru_ixrss = 0; /* always 0 */ + r.ru_idrss = 0; /* always 0 */ + r.ru_isrss = 0; /* always 0 */ + + r.ru_utime.tv_sec = 0; + r.ru_utime.tv_usec = 0; + r.ru_stime.tv_sec = 0; + r.ru_stime.tv_usec = 0; + + mutex_enter(&p->p_lock); + + if (p->p_defunct > 0) { + r.ru_majflt = p->p_ru.majflt; + r.ru_minflt = p->p_ru.minflt; + r.ru_nswap = p->p_ru.nswap; + r.ru_inblock = p->p_ru.inblock; + r.ru_oublock = p->p_ru.oublock; + r.ru_msgsnd = p->p_ru.msgsnd; + r.ru_msgrcv = p->p_ru.msgrcv; + r.ru_nsignals = p->p_ru.nsignals; + r.ru_nvcsw = p->p_ru.nvcsw; + r.ru_nivcsw = p->p_ru.nivcsw; + } + + unsecs = mstate_aggr_state(p, LMS_USER); + snsecs = mstate_aggr_state(p, LMS_SYSTEM); + + do { + if (t->t_proc_flag & TP_LWPEXIT) + continue; + + lwp = ttolwp(t); + + r.ru_majflt += lwp->lwp_ru.majflt; + r.ru_minflt += lwp->lwp_ru.minflt; + r.ru_nswap += lwp->lwp_ru.nswap; + r.ru_inblock += lwp->lwp_ru.inblock; + r.ru_oublock += lwp->lwp_ru.oublock; + r.ru_msgsnd += lwp->lwp_ru.msgsnd; + r.ru_msgrcv += lwp->lwp_ru.msgrcv; + r.ru_nsignals += lwp->lwp_ru.nsignals; + r.ru_nvcsw += lwp->lwp_ru.nvcsw; + r.ru_nivcsw += lwp->lwp_ru.nivcsw; + + } while ((t = t->t_forw) != curthread); + + mutex_exit(&p->p_lock); + + hrt2tv(unsecs, &r.ru_utime); + hrt2tv(snsecs, &r.ru_stime); + +#ifdef _SYSCALL32_IMPL + if (get_udatamodel() == DATAMODEL_ILP32) { + struct rusage32 r32; + + r32.ru_maxrss = 0; /* always 0 */ + r32.ru_ixrss = 0; /* always 0 */ + r32.ru_idrss = 0; /* always 0 */ + r32.ru_isrss = 0; /* always 0 */ + + r32.ru_utime.tv_sec = r.ru_utime.tv_sec; + r32.ru_utime.tv_usec = r.ru_utime.tv_usec; + r32.ru_stime.tv_sec = r.ru_stime.tv_sec; + r32.ru_stime.tv_usec = r.ru_stime.tv_usec; + + r32.ru_majflt = (int32_t)r.ru_majflt; + r32.ru_minflt = (int32_t)r.ru_minflt; + r32.ru_nswap = (int32_t)r.ru_nswap; + r32.ru_inblock = (int32_t)r.ru_inblock; + r32.ru_oublock = (int32_t)r.ru_oublock; + r32.ru_msgsnd = (int32_t)r.ru_msgsnd; + r32.ru_msgrcv = (int32_t)r.ru_msgrcv; + r32.ru_nsignals = (int32_t)r.ru_nsignals; + r32.ru_nvcsw = (int32_t)r.ru_nvcsw; + r32.ru_nivcsw = (int32_t)r.ru_nivcsw; + if (copyout(&r32, user_rusage, sizeof (r32)) != 0) + return (set_errno(EFAULT)); + } else +#endif /* _SYSCALL32_IMPL */ + + if (copyout(&r, user_rusage, sizeof (r)) != 0) + return (set_errno(EFAULT)); + + return (0); +} + +static int +getrusage_chld(void *user_rusage) +{ + struct rusage r; + kthread_t *t = curthread; + proc_t *p = ttoproc(t); + + hrtime_t snsecs, unsecs; + + r.ru_maxrss = 0; /* always 0 */ + r.ru_ixrss = 0; /* always 0 */ + r.ru_idrss = 0; /* always 0 */ + r.ru_isrss = 0; /* always 0 */ + + mutex_enter(&p->p_lock); + + unsecs = p->p_cacct[LMS_USER]; + snsecs = p->p_cacct[LMS_SYSTEM] + p->p_cacct[LMS_TRAP]; + r.ru_utime.tv_sec = 0; + r.ru_utime.tv_usec = 0; + r.ru_stime.tv_sec = 0; + r.ru_stime.tv_usec = 0; + + r.ru_majflt = p->p_cru.majflt; + r.ru_minflt = p->p_cru.minflt; + r.ru_nswap = p->p_cru.nswap; + r.ru_inblock = p->p_cru.inblock; + r.ru_oublock = p->p_cru.oublock; + r.ru_msgsnd = p->p_cru.msgsnd; + r.ru_msgrcv = p->p_cru.msgrcv; + r.ru_nsignals = p->p_cru.nsignals; + r.ru_nvcsw = p->p_cru.nvcsw; + r.ru_nivcsw = 
p->p_cru.nivcsw; + + mutex_exit(&p->p_lock); + + hrt2tv(unsecs, &r.ru_utime); + hrt2tv(snsecs, &r.ru_stime); +#ifdef _SYSCALL32_IMPL + if (get_udatamodel() == DATAMODEL_ILP32) { + struct rusage32 r32; + + r32.ru_maxrss = 0; /* always 0 */ + r32.ru_ixrss = 0; /* always 0 */ + r32.ru_idrss = 0; /* always 0 */ + r32.ru_isrss = 0; /* always 0 */ + + r32.ru_utime.tv_sec = r.ru_utime.tv_sec; + r32.ru_utime.tv_usec = r.ru_utime.tv_usec; + r32.ru_stime.tv_sec = r.ru_stime.tv_sec; + r32.ru_stime.tv_usec = r.ru_stime.tv_usec; + + r32.ru_majflt = (int32_t)r.ru_majflt; + r32.ru_minflt = (int32_t)r.ru_minflt; + r32.ru_nswap = (int32_t)r.ru_nswap; + r32.ru_inblock = (int32_t)r.ru_inblock; + r32.ru_oublock = (int32_t)r.ru_oublock; + r32.ru_msgsnd = (int32_t)r.ru_msgsnd; + r32.ru_msgrcv = (int32_t)r.ru_msgrcv; + r32.ru_nsignals = (int32_t)r.ru_nsignals; + r32.ru_nvcsw = (int32_t)r.ru_nvcsw; + r32.ru_nivcsw = (int32_t)r.ru_nivcsw; + if (copyout(&r32, user_rusage, sizeof (r32)) != 0) + return (set_errno(EFAULT)); + } else +#endif /* _SYSCALL32_IMPL */ + + if (copyout(&r, user_rusage, sizeof (r)) != 0) + return (set_errno(EFAULT)); + + return (0); +} + +static int +getrusage_lwp(void *user_rusage) +{ + struct rusage r; + kthread_t *t = curthread; + klwp_t *lwp; + hrtime_t snsecs, unsecs; + struct mstate *ms; + + r.ru_maxrss = 0; /* always 0 */ + r.ru_ixrss = 0; /* always 0 */ + r.ru_idrss = 0; /* always 0 */ + r.ru_isrss = 0; /* always 0 */ + r.ru_utime.tv_sec = 0; + r.ru_utime.tv_usec = 0; + r.ru_stime.tv_sec = 0; + r.ru_stime.tv_usec = 0; + + lwp = ttolwp(t); + ms = &lwp->lwp_mstate; + unsecs = ms->ms_acct[LMS_USER]; + snsecs = ms->ms_acct[LMS_SYSTEM] + ms->ms_acct[LMS_TRAP]; + scalehrtime(&unsecs); + scalehrtime(&snsecs); + r.ru_majflt = lwp->lwp_ru.majflt; + r.ru_minflt = lwp->lwp_ru.minflt; + r.ru_nswap = lwp->lwp_ru.nswap; + r.ru_inblock = lwp->lwp_ru.inblock; + r.ru_oublock = lwp->lwp_ru.oublock; + r.ru_msgsnd = lwp->lwp_ru.msgsnd; + r.ru_msgrcv = lwp->lwp_ru.msgrcv; + r.ru_nsignals = lwp->lwp_ru.nsignals; + r.ru_nvcsw = lwp->lwp_ru.nvcsw; + r.ru_nivcsw = lwp->lwp_ru.nivcsw; + + hrt2tv(unsecs, &r.ru_utime); + hrt2tv(snsecs, &r.ru_stime); +#ifdef _SYSCALL32_IMPL + if (get_udatamodel() == DATAMODEL_ILP32) { + struct rusage32 r32; + + r32.ru_maxrss = 0; /* always 0 */ + r32.ru_ixrss = 0; /* always 0 */ + r32.ru_idrss = 0; /* always 0 */ + r32.ru_isrss = 0; /* always 0 */ + + r32.ru_utime.tv_sec = r.ru_utime.tv_sec; + r32.ru_utime.tv_usec = r.ru_utime.tv_usec; + r32.ru_stime.tv_sec = r.ru_stime.tv_sec; + r32.ru_stime.tv_usec = r.ru_stime.tv_usec; + + r32.ru_majflt = (int32_t)r.ru_majflt; + r32.ru_minflt = (int32_t)r.ru_minflt; + r32.ru_nswap = (int32_t)r.ru_nswap; + r32.ru_inblock = (int32_t)r.ru_inblock; + r32.ru_oublock = (int32_t)r.ru_oublock; + r32.ru_msgsnd = (int32_t)r.ru_msgsnd; + r32.ru_msgrcv = (int32_t)r.ru_msgrcv; + r32.ru_nsignals = (int32_t)r.ru_nsignals; + r32.ru_nvcsw = (int32_t)r.ru_nvcsw; + r32.ru_nivcsw = (int32_t)r.ru_nivcsw; + if (copyout(&r32, user_rusage, sizeof (r32)) != 0) + return (set_errno(EFAULT)); + } else +#endif /* _SYSCALL32_IMPL */ + + if (copyout(&r, user_rusage, sizeof (r)) != 0) + return (set_errno(EFAULT)); + + return (0); +} + +int +rusagesys(int code, void * arg) +{ + switch (code) { + + case _RUSAGESYS_GETRUSAGE: + return (getrusage(arg)); + case _RUSAGESYS_GETRUSAGE_CHLD: + return (getrusage_chld(arg)); + case _RUSAGESYS_GETRUSAGE_LWP: + return (getrusage_lwp(arg)); + default: + return (set_errno(EINVAL)); + } +} diff --git a/usr/src/uts/common/syscall/rw.c 
b/usr/src/uts/common/syscall/rw.c new file mode 100644 index 0000000000..d2f35e2051 --- /dev/null +++ b/usr/src/uts/common/syscall/rw.c @@ -0,0 +1,1223 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/isa_defs.h> +#include <sys/types.h> +#include <sys/inttypes.h> +#include <sys/sysmacros.h> +#include <sys/cred.h> +#include <sys/user.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/proc.h> +#include <sys/cpuvar.h> +#include <sys/uio.h> +#include <sys/ioreq.h> +#include <sys/debug.h> +#include <sys/rctl.h> +#include <sys/nbmlock.h> + +#define COPYOUT_MIN_SIZE (1<<17) /* 128K */ + +static size_t copyout_min_size = COPYOUT_MIN_SIZE; + +/* + * read, write, pread, pwrite, readv, and writev syscalls. + * + * 64-bit open: all open's are large file opens. + * Large Files: the behaviour of read depends on whether the fd + * corresponds to large open or not. + * 32-bit open: FOFFMAX flag not set. + * read until MAXOFF32_T - 1 and read at MAXOFF32_T returns + * EOVERFLOW if count is non-zero and if size of file + * is > MAXOFF32_T. If size of file is <= MAXOFF32_T read + * at >= MAXOFF32_T returns EOF. + */ + +/* + * Native system call + */ +ssize_t +read(int fdes, void *cbuf, size_t count) +{ + struct uio auio; + struct iovec aiov; + file_t *fp; + register vnode_t *vp; + struct cpu *cp; + int fflag, ioflag, rwflag; + ssize_t cnt, bcount; + int error = 0; + u_offset_t fileoff; + int in_crit = 0; + + if ((cnt = (ssize_t)count) < 0) + return (set_errno(EINVAL)); + if ((fp = getf(fdes)) == NULL) + return (set_errno(EBADF)); + if (((fflag = fp->f_flag) & FREAD) == 0) { + error = EBADF; + goto out; + } + vp = fp->f_vnode; + + if (vp->v_type == VREG && cnt == 0) { + goto out; + } + + rwflag = 0; + aiov.iov_base = cbuf; + aiov.iov_len = cnt; + + /* + * We have to enter the critical region before calling VOP_RWLOCK + * to avoid a deadlock with write() calls. 
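The large-file behaviour spelled out in the comment block above can be observed directly. A sketch, assuming a 32-bit application compiled without large-file support (no O_LARGEFILE, so FOFFMAX stays clear) and a file larger than MAXOFF32_T bytes; the command-line argument is illustrative:

#include <sys/types.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <stdio.h>

int
main(int argc, char **argv)
{
	char buf[512];
	int fd;

	if (argc != 2 || (fd = open(argv[1], O_RDONLY)) == -1)
		return (1);
	/* position at MAXOFF32_T; a read here on a >2GB file overflows */
	if (lseek(fd, 0x7fffffffL, SEEK_SET) == -1)
		perror("lseek");
	else if (read(fd, buf, sizeof (buf)) == -1 && errno == EOVERFLOW)
		printf("EOVERFLOW: file not representable in off_t\n");
	(void) close(fd);
	return (0);
}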
+ */ + if (nbl_need_check(vp)) { + int svmand; + + nbl_start_crit(vp, RW_READER); + in_crit = 1; + error = nbl_svmand(vp, fp->f_cred, &svmand); + if (error != 0) + goto out; + if (nbl_conflict(vp, NBL_READ, fp->f_offset, cnt, svmand)) { + error = EACCES; + goto out; + } + } + + (void) VOP_RWLOCK(vp, rwflag, NULL); + + /* + * We do the following checks inside VOP_RWLOCK so as to + * prevent file size from changing while these checks are + * being done. Also, we load fp's offset to the local + * variable fileoff because we can have a parallel lseek + * going on (f_offset is not protected by any lock) which + * could change f_offset. We need to see the value only + * once here and take a decision. Seeing it more than once + * can lead to incorrect functionality. + */ + + fileoff = (u_offset_t)fp->f_offset; + if (fileoff >= OFFSET_MAX(fp) && (vp->v_type == VREG)) { + struct vattr va; + va.va_mask = AT_SIZE; + if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred))) { + VOP_RWUNLOCK(vp, rwflag, NULL); + goto out; + } + if (fileoff >= va.va_size) { + cnt = 0; + VOP_RWUNLOCK(vp, rwflag, NULL); + goto out; + } else { + error = EOVERFLOW; + VOP_RWUNLOCK(vp, rwflag, NULL); + goto out; + } + } + if ((vp->v_type == VREG) && + (fileoff + cnt > OFFSET_MAX(fp))) { + cnt = (ssize_t)(OFFSET_MAX(fp) - fileoff); + } + auio.uio_loffset = fileoff; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_resid = bcount = cnt; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_llimit = MAXOFFSET_T; + auio.uio_fmode = fflag; + /* + * Only use bypass caches when the count is large enough + */ + if (bcount < copyout_min_size) + auio.uio_extflg = UIO_COPY_CACHED; + else + auio.uio_extflg = UIO_COPY_DEFAULT; + + ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); + + /* If read sync is not asked for, filter sync flags */ + if ((ioflag & FRSYNC) == 0) + ioflag &= ~(FSYNC|FDSYNC); + error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL); + cnt -= auio.uio_resid; + CPU_STATS_ENTER_K(); + cp = CPU; + CPU_STATS_ADDQ(cp, sys, sysread, 1); + CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)cnt); + CPU_STATS_EXIT_K(); + ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt; + + if (vp->v_type == VFIFO) /* Backward compatibility */ + fp->f_offset = cnt; + else if (((fp->f_flag & FAPPEND) == 0) || + (vp->v_type != VREG) || (bcount != 0)) /* POSIX */ + fp->f_offset = auio.uio_loffset; + VOP_RWUNLOCK(vp, rwflag, NULL); + + if (error == EINTR && cnt != 0) + error = 0; +out: + if (in_crit) + nbl_end_crit(vp); + releasef(fdes); + if (error) + return (set_errno(error)); + return (cnt); +} + +/* + * Native system call + */ +ssize_t +write(int fdes, void *cbuf, size_t count) +{ + struct uio auio; + struct iovec aiov; + file_t *fp; + register vnode_t *vp; + struct cpu *cp; + int fflag, ioflag, rwflag; + ssize_t cnt, bcount; + int error = 0; + u_offset_t fileoff; + int in_crit = 0; + + if ((cnt = (ssize_t)count) < 0) + return (set_errno(EINVAL)); + if ((fp = getf(fdes)) == NULL) + return (set_errno(EBADF)); + if (((fflag = fp->f_flag) & FWRITE) == 0) { + error = EBADF; + goto out; + } + vp = fp->f_vnode; + + if (vp->v_type == VREG && cnt == 0) { + goto out; + } + + rwflag = 1; + aiov.iov_base = cbuf; + aiov.iov_len = cnt; + + /* + * We have to enter the critical region before calling VOP_RWLOCK + * to avoid a deadlock with ufs. 
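Note the EINTR handling at the end of read() above: if any bytes were transferred before the interrupt, the error is suppressed and the caller simply sees a short count. A caller can therefore retry a bare EINTR knowing no data was lost; readn below is a hypothetical user-level helper showing the conventional loop:

#include <unistd.h>
#include <errno.h>

/* Read exactly len bytes unless EOF or a hard error intervenes. */
ssize_t
readn(int fd, void *buf, size_t len)
{
	char *p = buf;
	size_t left = len;

	while (left > 0) {
		ssize_t n = read(fd, p, left);

		if (n == 0)			/* EOF */
			break;
		if (n == -1) {
			if (errno == EINTR)	/* nothing transferred yet */
				continue;
			return (-1);
		}
		p += n;
		left -= n;
	}
	return ((ssize_t)(len - left));
}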
+ */ + if (nbl_need_check(vp)) { + int svmand; + + nbl_start_crit(vp, RW_READER); + in_crit = 1; + error = nbl_svmand(vp, fp->f_cred, &svmand); + if (error != 0) + goto out; + if (nbl_conflict(vp, NBL_WRITE, fp->f_offset, cnt, svmand)) { + error = EACCES; + goto out; + } + } + + (void) VOP_RWLOCK(vp, rwflag, NULL); + + fileoff = fp->f_offset; + if (vp->v_type == VREG) { + + /* + * We raise psignal if write for >0 bytes causes + * it to exceed the ulimit. + */ + if (fileoff >= curproc->p_fsz_ctl) { + VOP_RWUNLOCK(vp, rwflag, NULL); + + mutex_enter(&curproc->p_lock); + (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], + curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO); + mutex_exit(&curproc->p_lock); + + error = EFBIG; + goto out; + } + /* + * We return EFBIG if write is done at an offset + * greater than the offset maximum for this file structure. + */ + + if (fileoff >= OFFSET_MAX(fp)) { + VOP_RWUNLOCK(vp, rwflag, NULL); + error = EFBIG; + goto out; + } + /* + * Limit the bytes to be written upto offset maximum for + * this open file structure. + */ + if (fileoff + cnt > OFFSET_MAX(fp)) + cnt = (ssize_t)(OFFSET_MAX(fp) - fileoff); + } + auio.uio_loffset = fileoff; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_resid = bcount = cnt; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_llimit = curproc->p_fsz_ctl; + auio.uio_fmode = fflag; + auio.uio_extflg = UIO_COPY_DEFAULT; + + ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); + + error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL); + cnt -= auio.uio_resid; + CPU_STATS_ENTER_K(); + cp = CPU; + CPU_STATS_ADDQ(cp, sys, syswrite, 1); + CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)cnt); + CPU_STATS_EXIT_K(); + ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt; + + if (vp->v_type == VFIFO) /* Backward compatibility */ + fp->f_offset = cnt; + else if (((fp->f_flag & FAPPEND) == 0) || + (vp->v_type != VREG) || (bcount != 0)) /* POSIX */ + fp->f_offset = auio.uio_loffset; + VOP_RWUNLOCK(vp, rwflag, NULL); + + if (error == EINTR && cnt != 0) + error = 0; +out: + if (in_crit) + nbl_end_crit(vp); + releasef(fdes); + if (error) + return (set_errno(error)); + return (cnt); +} + +ssize_t +pread(int fdes, void *cbuf, size_t count, off_t offset) +{ + struct uio auio; + struct iovec aiov; + file_t *fp; + register vnode_t *vp; + struct cpu *cp; + int fflag, ioflag, rwflag; + ssize_t bcount; + int error = 0; + u_offset_t fileoff = (u_offset_t)(ulong_t)offset; +#ifdef _SYSCALL32_IMPL + u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 ? + MAXOFF32_T : MAXOFFSET_T; +#else + const u_offset_t maxoff = MAXOFF32_T; +#endif + int in_crit = 0; + + if ((bcount = (ssize_t)count) < 0) + return (set_errno(EINVAL)); + + if ((fp = getf(fdes)) == NULL) + return (set_errno(EBADF)); + if (((fflag = fp->f_flag) & (FREAD)) == 0) { + error = EBADF; + goto out; + } + + rwflag = 0; + vp = fp->f_vnode; + + if (vp->v_type == VREG) { + + if (bcount == 0) + goto out; + + /* + * Return EINVAL if an invalid offset comes to pread. + * Negative offset from user will cause this error. + */ + + if (fileoff > maxoff) { + error = EINVAL; + goto out; + } + /* + * Limit offset such that we don't read or write + * a file beyond the maximum offset representable in + * an off_t structure. + */ + if (fileoff + bcount > maxoff) + bcount = (ssize_t)((offset_t)maxoff - fileoff); + } else if (vp->v_type == VFIFO) { + error = ESPIPE; + goto out; + } + + /* + * We have to enter the critical region before calling VOP_RWLOCK + * to avoid a deadlock with ufs. 
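pread() refuses objects that have no seekable offset, as the VFIFO check above shows; a small demonstration:

#include <unistd.h>
#include <errno.h>
#include <stdio.h>

int
main(void)
{
	int fds[2];
	char c;

	if (pipe(fds) == 0) {
		/* pread has no meaningful offset on a pipe */
		if (pread(fds[0], &c, 1, 0) == -1 && errno == ESPIPE)
			printf("pread on a pipe: ESPIPE\n");
	}
	return (0);
}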
+ */ + if (nbl_need_check(vp)) { + int svmand; + + nbl_start_crit(vp, RW_READER); + in_crit = 1; + error = nbl_svmand(vp, fp->f_cred, &svmand); + if (error != 0) + goto out; + if (nbl_conflict(vp, NBL_READ, fileoff, bcount, svmand)) { + error = EACCES; + goto out; + } + } + + aiov.iov_base = cbuf; + aiov.iov_len = bcount; + (void) VOP_RWLOCK(vp, rwflag, NULL); + if (vp->v_type == VREG && fileoff == (u_offset_t)maxoff) { + struct vattr va; + va.va_mask = AT_SIZE; + if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred))) { + VOP_RWUNLOCK(vp, rwflag, NULL); + goto out; + } + VOP_RWUNLOCK(vp, rwflag, NULL); + + /* + * We have to return EOF if fileoff is >= file size. + */ + if (fileoff >= va.va_size) { + bcount = 0; + goto out; + } + + /* + * File is greater than or equal to maxoff and therefore + * we return EOVERFLOW. + */ + error = EOVERFLOW; + goto out; + } + auio.uio_loffset = fileoff; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_resid = bcount; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_llimit = MAXOFFSET_T; + auio.uio_fmode = fflag; + auio.uio_extflg = UIO_COPY_CACHED; + + ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); + + /* If read sync is not asked for, filter sync flags */ + if ((ioflag & FRSYNC) == 0) + ioflag &= ~(FSYNC|FDSYNC); + error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL); + bcount -= auio.uio_resid; + CPU_STATS_ENTER_K(); + cp = CPU; + CPU_STATS_ADDQ(cp, sys, sysread, 1); + CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)bcount); + CPU_STATS_EXIT_K(); + ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount; + VOP_RWUNLOCK(vp, rwflag, NULL); + + if (error == EINTR && bcount != 0) + error = 0; +out: + if (in_crit) + nbl_end_crit(vp); + releasef(fdes); + if (error) + return (set_errno(error)); + return (bcount); +} + +ssize_t +pwrite(int fdes, void *cbuf, size_t count, off_t offset) +{ + struct uio auio; + struct iovec aiov; + file_t *fp; + register vnode_t *vp; + struct cpu *cp; + int fflag, ioflag, rwflag; + ssize_t bcount; + int error = 0; + u_offset_t fileoff = (u_offset_t)(ulong_t)offset; +#ifdef _SYSCALL32_IMPL + u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 ? + MAXOFF32_T : MAXOFFSET_T; +#else + const u_offset_t maxoff = MAXOFF32_T; +#endif + int in_crit = 0; + + if ((bcount = (ssize_t)count) < 0) + return (set_errno(EINVAL)); + if ((fp = getf(fdes)) == NULL) + return (set_errno(EBADF)); + if (((fflag = fp->f_flag) & (FWRITE)) == 0) { + error = EBADF; + goto out; + } + + rwflag = 1; + vp = fp->f_vnode; + + if (vp->v_type == VREG) { + + if (bcount == 0) + goto out; + + /* + * return EINVAL for offsets that cannot be + * represented in an off_t. + */ + if (fileoff > maxoff) { + error = EINVAL; + goto out; + } + /* + * Take appropriate action if we are trying to write above the + * resource limit. + */ + if (fileoff >= curproc->p_fsz_ctl) { + mutex_enter(&curproc->p_lock); + (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], + curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO); + mutex_exit(&curproc->p_lock); + + error = EFBIG; + goto out; + } + /* + * Don't allow pwrite to cause file sizes to exceed + * maxoff. + */ + if (fileoff == maxoff) { + error = EFBIG; + goto out; + } + if (fileoff + count > maxoff) + bcount = (ssize_t)((u_offset_t)maxoff - fileoff); + } else if (vp->v_type == VFIFO) { + error = ESPIPE; + goto out; + } + + /* + * We have to enter the critical region before calling VOP_RWLOCK + * to avoid a deadlock with ufs. 
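The RLIMIT_FSIZE enforcement above (rctl_action() posting SIGXFSZ, then EFBIG) is visible from user level. A sketch; the file name is illustrative, and SIGXFSZ is ignored so the process survives to observe the errno:

#include <sys/resource.h>
#include <signal.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <stdio.h>

int
main(void)
{
	struct rlimit rl = { 4096, 4096 };	/* 4K file size cap */
	char buf[1] = { 0 };
	int fd;

	(void) signal(SIGXFSZ, SIG_IGN);	/* default would kill us */
	(void) setrlimit(RLIMIT_FSIZE, &rl);
	fd = open("/tmp/fszdemo", O_CREAT | O_WRONLY, 0600);
	if (fd == -1)
		return (1);
	if (pwrite(fd, buf, 1, 8192) == -1 && errno == EFBIG)
		printf("write above RLIMIT_FSIZE: EFBIG (SIGXFSZ ignored)\n");
	(void) close(fd);
	return (0);
}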
+ */ + if (nbl_need_check(vp)) { + int svmand; + + nbl_start_crit(vp, RW_READER); + in_crit = 1; + error = nbl_svmand(vp, fp->f_cred, &svmand); + if (error != 0) + goto out; + if (nbl_conflict(vp, NBL_WRITE, fileoff, bcount, svmand)) { + error = EACCES; + goto out; + } + } + + aiov.iov_base = cbuf; + aiov.iov_len = bcount; + (void) VOP_RWLOCK(vp, rwflag, NULL); + auio.uio_loffset = fileoff; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_resid = bcount; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_llimit = curproc->p_fsz_ctl; + auio.uio_fmode = fflag; + auio.uio_extflg = UIO_COPY_CACHED; + + ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); + + error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL); + bcount -= auio.uio_resid; + CPU_STATS_ENTER_K(); + cp = CPU; + CPU_STATS_ADDQ(cp, sys, syswrite, 1); + CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)bcount); + CPU_STATS_EXIT_K(); + ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount; + VOP_RWUNLOCK(vp, rwflag, NULL); + + if (error == EINTR && bcount != 0) + error = 0; +out: + if (in_crit) + nbl_end_crit(vp); + releasef(fdes); + if (error) + return (set_errno(error)); + return (bcount); +} + +/* + * XXX -- The SVID refers to IOV_MAX, but doesn't define it. Grrrr.... + * XXX -- However, SVVS expects readv() and writev() to fail if + * XXX -- iovcnt > 16 (yes, it's hard-coded in the SVVS source), + * XXX -- so I guess that's the "interface". + */ +#define DEF_IOV_MAX 16 + +ssize_t +readv(int fdes, struct iovec *iovp, int iovcnt) +{ + struct uio auio; + struct iovec aiov[DEF_IOV_MAX]; + file_t *fp; + register vnode_t *vp; + struct cpu *cp; + int fflag, ioflag, rwflag; + ssize_t count, bcount; + int error = 0; + int i; + u_offset_t fileoff; + int in_crit = 0; + + if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX) + return (set_errno(EINVAL)); + +#ifdef _SYSCALL32_IMPL + /* + * 32-bit callers need to have their iovec expanded, + * while ensuring that they can't move more than 2Gbytes + * of data in a single call. + */ + if (get_udatamodel() == DATAMODEL_ILP32) { + struct iovec32 aiov32[DEF_IOV_MAX]; + ssize32_t count32; + + if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32))) + return (set_errno(EFAULT)); + + count32 = 0; + for (i = 0; i < iovcnt; i++) { + ssize32_t iovlen32 = aiov32[i].iov_len; + count32 += iovlen32; + if (iovlen32 < 0 || count32 < 0) + return (set_errno(EINVAL)); + aiov[i].iov_len = iovlen32; + aiov[i].iov_base = + (caddr_t)(uintptr_t)aiov32[i].iov_base; + } + } else +#endif + if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec))) + return (set_errno(EFAULT)); + + count = 0; + for (i = 0; i < iovcnt; i++) { + ssize_t iovlen = aiov[i].iov_len; + count += iovlen; + if (iovlen < 0 || count < 0) + return (set_errno(EINVAL)); + } + if ((fp = getf(fdes)) == NULL) + return (set_errno(EBADF)); + if (((fflag = fp->f_flag) & FREAD) == 0) { + error = EBADF; + goto out; + } + vp = fp->f_vnode; + if (vp->v_type == VREG && count == 0) { + goto out; + } + + rwflag = 0; + + /* + * We have to enter the critical region before calling VOP_RWLOCK + * to avoid a deadlock with ufs. + */ + if (nbl_need_check(vp)) { + int svmand; + + nbl_start_crit(vp, RW_READER); + in_crit = 1; + error = nbl_svmand(vp, fp->f_cred, &svmand); + if (error != 0) + goto out; + if (nbl_conflict(vp, NBL_READ, fp->f_offset, count, svmand)) { + error = EACCES; + goto out; + } + } + + (void) VOP_RWLOCK(vp, rwflag, NULL); + fileoff = fp->f_offset; + + /* + * Behaviour is same as read. Please see comments in read. 
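As the DEF_IOV_MAX note above says, more than 16 vectors draws EINVAL, as does a combined length whose signed total overflows. A minimal scatter-read sketch:

#include <sys/uio.h>
#include <unistd.h>
#include <errno.h>
#include <stdio.h>

int
main(void)
{
	char hdr[16], body[4096];
	struct iovec iov[2] = {
		{ hdr, sizeof (hdr) },
		{ body, sizeof (body) }
	};
	ssize_t n = readv(0, iov, 2);	/* scatter stdin into two buffers */

	if (n == -1 && errno == EINVAL)
		fprintf(stderr, "readv: bad iovcnt or length overflow\n");
	else
		printf("read %ld bytes across 2 buffers\n", (long)n);
	return (0);
}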
+ */ + + if ((vp->v_type == VREG) && (fileoff >= OFFSET_MAX(fp))) { + struct vattr va; + va.va_mask = AT_SIZE; + if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred))) { + VOP_RWUNLOCK(vp, rwflag, NULL); + goto out; + } + if (fileoff >= va.va_size) { + VOP_RWUNLOCK(vp, rwflag, NULL); + count = 0; + goto out; + } else { + VOP_RWUNLOCK(vp, rwflag, NULL); + error = EOVERFLOW; + goto out; + } + } + if ((vp->v_type == VREG) && (fileoff + count > OFFSET_MAX(fp))) { + count = (ssize_t)(OFFSET_MAX(fp) - fileoff); + } + auio.uio_loffset = fileoff; + auio.uio_iov = aiov; + auio.uio_iovcnt = iovcnt; + auio.uio_resid = bcount = count; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_llimit = MAXOFFSET_T; + auio.uio_fmode = fflag; + if (bcount < copyout_min_size) + auio.uio_extflg = UIO_COPY_CACHED; + else + auio.uio_extflg = UIO_COPY_DEFAULT; + + + ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); + + /* If read sync is not asked for, filter sync flags */ + if ((ioflag & FRSYNC) == 0) + ioflag &= ~(FSYNC|FDSYNC); + error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL); + count -= auio.uio_resid; + CPU_STATS_ENTER_K(); + cp = CPU; + CPU_STATS_ADDQ(cp, sys, sysread, 1); + CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)count); + CPU_STATS_EXIT_K(); + ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count; + + if (vp->v_type == VFIFO) /* Backward compatibility */ + fp->f_offset = count; + else if (((fp->f_flag & FAPPEND) == 0) || + (vp->v_type != VREG) || (bcount != 0)) /* POSIX */ + fp->f_offset = auio.uio_loffset; + + VOP_RWUNLOCK(vp, rwflag, NULL); + + if (error == EINTR && count != 0) + error = 0; +out: + if (in_crit) + nbl_end_crit(vp); + releasef(fdes); + if (error) + return (set_errno(error)); + return (count); +} + +ssize_t +writev(int fdes, struct iovec *iovp, int iovcnt) +{ + struct uio auio; + struct iovec aiov[DEF_IOV_MAX]; + file_t *fp; + register vnode_t *vp; + struct cpu *cp; + int fflag, ioflag, rwflag; + ssize_t count, bcount; + int error = 0; + int i; + u_offset_t fileoff; + int in_crit = 0; + + if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX) + return (set_errno(EINVAL)); + +#ifdef _SYSCALL32_IMPL + /* + * 32-bit callers need to have their iovec expanded, + * while ensuring that they can't move more than 2Gbytes + * of data in a single call. + */ + if (get_udatamodel() == DATAMODEL_ILP32) { + struct iovec32 aiov32[DEF_IOV_MAX]; + ssize32_t count32; + + if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32))) + return (set_errno(EFAULT)); + + count32 = 0; + for (i = 0; i < iovcnt; i++) { + ssize32_t iovlen = aiov32[i].iov_len; + count32 += iovlen; + if (iovlen < 0 || count32 < 0) + return (set_errno(EINVAL)); + aiov[i].iov_len = iovlen; + aiov[i].iov_base = + (caddr_t)(uintptr_t)aiov32[i].iov_base; + } + } else +#endif + if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec))) + return (set_errno(EFAULT)); + + count = 0; + for (i = 0; i < iovcnt; i++) { + ssize_t iovlen = aiov[i].iov_len; + count += iovlen; + if (iovlen < 0 || count < 0) + return (set_errno(EINVAL)); + } + if ((fp = getf(fdes)) == NULL) + return (set_errno(EBADF)); + if (((fflag = fp->f_flag) & FWRITE) == 0) { + error = EBADF; + goto out; + } + vp = fp->f_vnode; + if (vp->v_type == VREG && count == 0) { + goto out; + } + + rwflag = 1; + + /* + * We have to enter the critical region before calling VOP_RWLOCK + * to avoid a deadlock with ufs. 
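The count32/count accumulation above detects an over-large total by watching the signed sum go negative. iov_total_ok below is a hypothetical, portable restatement of the same check that avoids relying on signed wraparound:

#include <sys/uio.h>
#include <limits.h>

static int
iov_total_ok(const struct iovec *iov, int iovcnt, size_t *totalp)
{
	size_t total = 0;
	int i;

	for (i = 0; i < iovcnt; i++) {
		if (iov[i].iov_len > SSIZE_MAX - total)
			return (0);		/* would overflow ssize_t */
		total += iov[i].iov_len;
	}
	*totalp = total;
	return (1);
}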
+ */ + if (nbl_need_check(vp)) { + int svmand; + + nbl_start_crit(vp, RW_READER); + in_crit = 1; + error = nbl_svmand(vp, fp->f_cred, &svmand); + if (error != 0) + goto out; + if (nbl_conflict(vp, NBL_WRITE, fp->f_offset, count, svmand)) { + error = EACCES; + goto out; + } + } + + (void) VOP_RWLOCK(vp, rwflag, NULL); + + fileoff = fp->f_offset; + + /* + * Behaviour is same as write. Please see comments for write. + */ + + if (vp->v_type == VREG) { + if (fileoff >= curproc->p_fsz_ctl) { + VOP_RWUNLOCK(vp, rwflag, NULL); + mutex_enter(&curproc->p_lock); + (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], + curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO); + mutex_exit(&curproc->p_lock); + error = EFBIG; + goto out; + } + if (fileoff >= OFFSET_MAX(fp)) { + VOP_RWUNLOCK(vp, rwflag, NULL); + error = EFBIG; + goto out; + } + if (fileoff + count > OFFSET_MAX(fp)) + count = (ssize_t)(OFFSET_MAX(fp) - fileoff); + } + auio.uio_loffset = fileoff; + auio.uio_iov = aiov; + auio.uio_iovcnt = iovcnt; + auio.uio_resid = bcount = count; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_llimit = curproc->p_fsz_ctl; + auio.uio_fmode = fflag; + auio.uio_extflg = UIO_COPY_DEFAULT; + + ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); + + error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL); + count -= auio.uio_resid; + CPU_STATS_ENTER_K(); + cp = CPU; + CPU_STATS_ADDQ(cp, sys, syswrite, 1); + CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)count); + CPU_STATS_EXIT_K(); + ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count; + + if (vp->v_type == VFIFO) /* Backward compatibility */ + fp->f_offset = count; + else if (((fp->f_flag & FAPPEND) == 0) || + (vp->v_type != VREG) || (bcount != 0)) /* POSIX */ + fp->f_offset = auio.uio_loffset; + VOP_RWUNLOCK(vp, rwflag, NULL); + + if (error == EINTR && count != 0) + error = 0; +out: + if (in_crit) + nbl_end_crit(vp); + releasef(fdes); + if (error) + return (set_errno(error)); + return (count); +} + +#if defined(_SYSCALL32_IMPL) || defined(_ILP32) + +/* + * This syscall supplies 64-bit file offsets to 32-bit applications only. + */ +ssize32_t +pread64(int fdes, void *cbuf, size32_t count, uint32_t offset_1, + uint32_t offset_2) +{ + struct uio auio; + struct iovec aiov; + file_t *fp; + register vnode_t *vp; + struct cpu *cp; + int fflag, ioflag, rwflag; + ssize_t bcount; + int error = 0; + u_offset_t fileoff; + int in_crit = 0; + +#if defined(_LITTLE_ENDIAN) + fileoff = ((u_offset_t)offset_2 << 32) | (u_offset_t)offset_1; +#else + fileoff = ((u_offset_t)offset_1 << 32) | (u_offset_t)offset_2; +#endif + + if ((bcount = (ssize_t)count) < 0 || bcount > INT32_MAX) + return (set_errno(EINVAL)); + + if ((fp = getf(fdes)) == NULL) + return (set_errno(EBADF)); + if (((fflag = fp->f_flag) & (FREAD)) == 0) { + error = EBADF; + goto out; + } + + rwflag = 0; + vp = fp->f_vnode; + + if (vp->v_type == VREG) { + + if (bcount == 0) + goto out; + + /* + * Same as pread. See comments in pread. + */ + + if (fileoff > MAXOFFSET_T) { + error = EINVAL; + goto out; + } + if (fileoff + bcount > MAXOFFSET_T) + bcount = (ssize_t)(MAXOFFSET_T - fileoff); + } else if (vp->v_type == VFIFO) { + error = ESPIPE; + goto out; + } + + /* + * We have to enter the critical region before calling VOP_RWLOCK + * to avoid a deadlock with ufs. 
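pread64 above (and pwrite64 below) receives the 64-bit offset as two 32-bit argument slots, with the byte order of the ABI deciding which slot carries the low half, hence the _LITTLE_ENDIAN conditionals. offset_from_halves is a hypothetical sketch of the reassembly just shown:

#include <sys/types.h>

static uint64_t
offset_from_halves(uint32_t lo, uint32_t hi)
{
	/* on little-endian, offset_1 is lo; on big-endian, offset_2 is */
	return (((uint64_t)hi << 32) | (uint64_t)lo);
}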
+ */ + if (nbl_need_check(vp)) { + int svmand; + + nbl_start_crit(vp, RW_READER); + in_crit = 1; + error = nbl_svmand(vp, fp->f_cred, &svmand); + if (error != 0) + goto out; + if (nbl_conflict(vp, NBL_READ, fileoff, bcount, svmand)) { + error = EACCES; + goto out; + } + } + + aiov.iov_base = cbuf; + aiov.iov_len = bcount; + (void) VOP_RWLOCK(vp, rwflag, NULL); + auio.uio_loffset = fileoff; + + /* + * Note: File size can never be greater than MAXOFFSET_T. + * If ever we start supporting 128 bit files the code + * similar to the one in pread at this place should be here. + * Here we avoid the unnecessary VOP_GETATTR() when we + * know that fileoff == MAXOFFSET_T implies that it is always + * greater than or equal to file size. + */ + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_resid = bcount; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_llimit = MAXOFFSET_T; + auio.uio_fmode = fflag; + auio.uio_extflg = UIO_COPY_CACHED; + + ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); + + /* If read sync is not asked for, filter sync flags */ + if ((ioflag & FRSYNC) == 0) + ioflag &= ~(FSYNC|FDSYNC); + error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL); + bcount -= auio.uio_resid; + CPU_STATS_ENTER_K(); + cp = CPU; + CPU_STATS_ADDQ(cp, sys, sysread, 1); + CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)bcount); + CPU_STATS_EXIT_K(); + ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount; + VOP_RWUNLOCK(vp, rwflag, NULL); + + if (error == EINTR && bcount != 0) + error = 0; +out: + if (in_crit) + nbl_end_crit(vp); + releasef(fdes); + if (error) + return (set_errno(error)); + return (bcount); +} + +/* + * This syscall supplies 64-bit file offsets to 32-bit applications only. + */ +ssize32_t +pwrite64(int fdes, void *cbuf, size32_t count, uint32_t offset_1, + uint32_t offset_2) +{ + struct uio auio; + struct iovec aiov; + file_t *fp; + register vnode_t *vp; + struct cpu *cp; + int fflag, ioflag, rwflag; + ssize_t bcount; + int error = 0; + u_offset_t fileoff; + int in_crit = 0; + +#if defined(_LITTLE_ENDIAN) + fileoff = ((u_offset_t)offset_2 << 32) | (u_offset_t)offset_1; +#else + fileoff = ((u_offset_t)offset_1 << 32) | (u_offset_t)offset_2; +#endif + + if ((bcount = (ssize_t)count) < 0 || bcount > INT32_MAX) + return (set_errno(EINVAL)); + if ((fp = getf(fdes)) == NULL) + return (set_errno(EBADF)); + if (((fflag = fp->f_flag) & (FWRITE)) == 0) { + error = EBADF; + goto out; + } + + rwflag = 1; + vp = fp->f_vnode; + + if (vp->v_type == VREG) { + + if (bcount == 0) + goto out; + + /* + * See comments in pwrite. + */ + if (fileoff > MAXOFFSET_T) { + error = EINVAL; + goto out; + } + if (fileoff >= curproc->p_fsz_ctl) { + mutex_enter(&curproc->p_lock); + (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], + curproc->p_rctls, curproc, RCA_SAFE); + mutex_exit(&curproc->p_lock); + error = EFBIG; + goto out; + } + if (fileoff == MAXOFFSET_T) { + error = EFBIG; + goto out; + } + if (fileoff + bcount > MAXOFFSET_T) + bcount = (ssize_t)((u_offset_t)MAXOFFSET_T - fileoff); + } else if (vp->v_type == VFIFO) { + error = ESPIPE; + goto out; + } + + /* + * We have to enter the critical region before calling VOP_RWLOCK + * to avoid a deadlock with ufs. 
+ */ + if (nbl_need_check(vp)) { + int svmand; + + nbl_start_crit(vp, RW_READER); + in_crit = 1; + error = nbl_svmand(vp, fp->f_cred, &svmand); + if (error != 0) + goto out; + if (nbl_conflict(vp, NBL_WRITE, fileoff, bcount, svmand)) { + error = EACCES; + goto out; + } + } + + aiov.iov_base = cbuf; + aiov.iov_len = bcount; + (void) VOP_RWLOCK(vp, rwflag, NULL); + auio.uio_loffset = fileoff; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_resid = bcount; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_llimit = curproc->p_fsz_ctl; + auio.uio_fmode = fflag; + auio.uio_extflg = UIO_COPY_CACHED; + + ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); + + error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL); + bcount -= auio.uio_resid; + CPU_STATS_ENTER_K(); + cp = CPU; + CPU_STATS_ADDQ(cp, sys, syswrite, 1); + CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)bcount); + CPU_STATS_EXIT_K(); + ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount; + VOP_RWUNLOCK(vp, rwflag, NULL); + + if (error == EINTR && bcount != 0) + error = 0; +out: + if (in_crit) + nbl_end_crit(vp); + releasef(fdes); + if (error) + return (set_errno(error)); + return (bcount); +} + +#endif /* _SYSCALL32_IMPL || _ILP32 */ + +#ifdef _SYSCALL32_IMPL +/* + * Tail-call elimination of xxx32() down to xxx() + * + * A number of xxx32 system calls take a len (or count) argument and + * return a number in the range [0,len] or -1 on error. + * Given an ssize32_t input len, the downcall xxx() will return + * a 64-bit value that is -1 or in the range [0,len] which actually + * is a proper return value for the xxx32 call. So even if the xxx32 + * calls can be considered as returning a ssize32_t, they are currently + * declared as returning a ssize_t as this enables tail-call elimination. + * + * The cast of len (or count) to ssize32_t is needed to ensure we pass + * down negative input values as such and let the downcall handle error + * reporting. Functions covered by this comments are: + * + * rw.c: read32, write32, pread32, pwrite32, readv32, writev32. + * socksyscall.c: recv32, recvfrom32, send32, sendto32. + * readlink.c: readlink32. + */ + +ssize_t +read32(int32_t fdes, caddr32_t cbuf, size32_t count) +{ + return (read(fdes, + (void *)(uintptr_t)cbuf, (ssize32_t)count)); +} + +ssize_t +write32(int32_t fdes, caddr32_t cbuf, size32_t count) +{ + return (write(fdes, + (void *)(uintptr_t)cbuf, (ssize32_t)count)); +} + +ssize_t +pread32(int32_t fdes, caddr32_t cbuf, size32_t count, off32_t offset) +{ + return (pread(fdes, + (void *)(uintptr_t)cbuf, (ssize32_t)count, + (off_t)(uint32_t)offset)); +} + +ssize_t +pwrite32(int32_t fdes, caddr32_t cbuf, size32_t count, off32_t offset) +{ + return (pwrite(fdes, + (void *)(uintptr_t)cbuf, (ssize32_t)count, + (off_t)(uint32_t)offset)); +} + +ssize_t +readv32(int32_t fdes, caddr32_t iovp, int32_t iovcnt) +{ + return (readv(fdes, (void *)(uintptr_t)iovp, iovcnt)); +} + +ssize_t +writev32(int32_t fdes, caddr32_t iovp, int32_t iovcnt) +{ + return (writev(fdes, (void *)(uintptr_t)iovp, iovcnt)); +} + +#endif /* _SYSCALL32_IMPL */ diff --git a/usr/src/uts/common/syscall/sem.c b/usr/src/uts/common/syscall/sem.c new file mode 100644 index 0000000000..5498418a27 --- /dev/null +++ b/usr/src/uts/common/syscall/sem.c @@ -0,0 +1,1208 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. 
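The (ssize32_t) casts in the wrappers above exist to keep negative 32-bit counts negative on the way down, as the comment explains. A user-level illustration of the difference between zero- and sign-extension:

#include <sys/types.h>
#include <stdio.h>

int
main(void)
{
	uint32_t count32 = 0xffffffffU;	/* what a buggy 32-bit app passed */

	/* zero-extended: looks like a huge positive request */
	printf("as size_t:  %llu\n", (unsigned long long)(size_t)count32);
	/* sign-extended via int32_t (i.e. ssize32_t): stays negative */
	printf("as ssize_t: %lld\n", (long long)(ssize_t)(int32_t)count32);
	return (0);
}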
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Inter-Process Communication Semaphore Facility. + * + * See os/ipc.c for a description of common IPC functionality. + * + * Resource controls + * ----------------- + * + * Control: project.max-sem-ids (rc_project_semmni) + * Description: Maximum number of semaphore ids allowed a project. + * + * When semget() is used to allocate a semaphore set, one id is + * allocated. If the id allocation doesn't succeed, semget() fails + * and errno is set to ENOSPC. Upon successful semctl(, IPC_RMID) + * the id is deallocated. + * + * Control: process.max-sem-nsems (rc_process_semmsl) + * Description: Maximum number of semaphores allowed per semaphore set. + * + * When semget() is used to allocate a semaphore set, the size of the + * set is compared with this limit. If the number of semaphores + * exceeds the limit, semget() fails and errno is set to EINVAL. + * + * Control: process.max-sem-ops (rc_process_semopm) + * Description: Maximum number of semaphore operations allowed per + * semop call. + * + * When semget() successfully allocates a semaphore set, the minimum + * enforced value of this limit is used to initialize the + * "system-imposed maximum" number of operations a semop() call for + * this set can perform. + * + * Undo structures + * --------------- + * + * Removing the undo structure tunables involved a serious redesign of + * how they were implemented. There is now one undo structure for + * every process/semaphore array combination (lazily allocated, of + * course), and each is equal in size to the semaphore it corresponds + * to. To avoid scalability and performance problems, the undo + * structures are stored in two places: a per-process AVL tree sorted + * by ksemid pointer (p_semacct, protected by p_lock) and an unsorted + * per-semaphore linked list (sem_undos, protected by the semaphore's + * ID lock). The former is used by semop, where a lookup is performed + * once and cached if SEM_UNDO is specified for any of the operations, + * and at process exit where the undoable operations are rolled back. + * The latter is used when removing the semaphore, so the undo + * structures can be removed from the appropriate processes' trees. + * + * The undo structure itself contains pointers to the ksemid and proc + * to which it corresponds, a list node, an AVL node, and an array of + * adjust-on-exit (AOE) values. When an undo structure is allocated it + * is immediately added to both the process's tree and the semaphore's + * list. Lastly, the reference count on the semaphore is increased. 
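From user level, the undo machinery described in the comment below is driven by the SEM_UNDO flag to semop(); a minimal sketch:

#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/sem.h>
#include <stdio.h>

int
main(void)
{
	int semid = semget(IPC_PRIVATE, 1, 0600);
	struct sembuf op = { 0, 1, SEM_UNDO };	/* "V" with undo */

	if (semid == -1)
		return (1);
	if (semop(semid, &op, 1) == 0)
		printf("semval now %d\n", semctl(semid, 0, GETVAL));
	/*
	 * At exit, semexit() applies the recorded adjust-on-exit
	 * value (-1), returning the semaphore to 0 before the set
	 * is removed here.
	 */
	(void) semctl(semid, 0, IPC_RMID);
	return (0);
}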
+ * + * Avoiding a lock ordering violation between p_lock and the ID lock, + * wont to occur when there is a race between a process exiting and the + * removal of a semaphore, mandates the delicate dance that exists + * between semexit and sem_rmid. + * + * sem_rmid, holding the ID lock, iterates through all undo structures + * and for each takes the appropriate process's p_lock and checks to + * see if p_semacct is NULL. If it is, it skips that undo structure + * and continues to the next. Otherwise, it removes the undo structure + * from both the AVL tree and the semaphore's list, and releases the + * hold that the undo structure had on the semaphore. + * + * The important other half of this is semexit, which will immediately + * take p_lock, obtain the AVL pointer, clear p_semacct, and drop + * p_lock. From this point on it is semexit's responsibility to clean + * up all undo structures found in the tree -- a coexecuting sem_rmid + * will see the NULL p_semacct and skip that undo structure. It walks + * the AVL tree (using avl_destroy_nodes) and for each undo structure + * takes the appropriate semaphore's ID lock (always legal since the + * undo structure has a hold on the semaphore), updates all semaphores + * with non-zero AOE values, and removes the structure from the + * semaphore's list. It then drops the structure's reference on the + * semaphore, drops the ID lock, and frees the undo structure. + */ + +#include <sys/types.h> +#include <sys/t_lock.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/cred.h> +#include <sys/vmem.h> +#include <sys/kmem.h> +#include <sys/errno.h> +#include <sys/time.h> +#include <sys/ipc.h> +#include <sys/ipc_impl.h> +#include <sys/sem.h> +#include <sys/sem_impl.h> +#include <sys/user.h> +#include <sys/proc.h> +#include <sys/cpuvar.h> +#include <sys/debug.h> +#include <sys/var.h> +#include <sys/cmn_err.h> +#include <sys/modctl.h> +#include <sys/syscall.h> +#include <sys/avl.h> +#include <sys/list.h> +#include <sys/zone.h> + +#include <c2/audit.h> + +extern rctl_hndl_t rc_project_semmni; +extern rctl_hndl_t rc_process_semmsl; +extern rctl_hndl_t rc_process_semopm; +static ipc_service_t *sem_svc; +static zone_key_t sem_zone_key; + +/* + * The following tunables are obsolete. Though for compatibility we + * still read and interpret seminfo_semmsl, seminfo_semopm and + * seminfo_semmni (see os/project.c and os/rctl_proc.c), the preferred + * mechanism for administrating the IPC Semaphore facility is through + * the resource controls described at the top of this file. + */ +int seminfo_semaem = 16384; /* (obsolete) */ +int seminfo_semmap = 10; /* (obsolete) */ +int seminfo_semmni = 10; /* (obsolete) */ +int seminfo_semmns = 60; /* (obsolete) */ +int seminfo_semmnu = 30; /* (obsolete) */ +int seminfo_semmsl = 25; /* (obsolete) */ +int seminfo_semopm = 10; /* (obsolete) */ +int seminfo_semume = 10; /* (obsolete) */ +int seminfo_semusz = 96; /* (obsolete) */ +int seminfo_semvmx = 32767; /* (obsolete) */ + +#define SEM_MAXUCOPS 4096 /* max # of unchecked ops per semop call */ +#define SEM_UNDOSZ(n) (sizeof (struct sem_undo) + (n - 1) * sizeof (int)) + +static int semsys(int opcode, uintptr_t a0, uintptr_t a1, + uintptr_t a2, uintptr_t a3); +static void sem_dtor(kipc_perm_t *); +static void sem_rmid(kipc_perm_t *); +static void sem_remove_zone(zoneid_t, void *); + +static struct sysent ipcsem_sysent = { + 5, + SE_NOUNLOAD | SE_ARGC | SE_32RVAL1, + semsys +}; + +/* + * Module linkage information for the kernel. 
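The semexit half of the dance described above hinges on clearing p_semacct while holding p_lock, so that exactly one side claims the undo structures. Reduced to a generic kernel-style sketch (types are real kernel types, the names are hypothetical, and this is not the literal illumos code):

#include <sys/mutex.h>
#include <sys/avl.h>

typedef struct owner {
	kmutex_t o_lock;
	avl_tree_t *o_tree;	/* accounting structures, or NULL */
} owner_t;

static avl_tree_t *
claim_tree(owner_t *op)
{
	avl_tree_t *tree;

	mutex_enter(&op->o_lock);
	tree = op->o_tree;	/* NULL if the other side claimed it */
	op->o_tree = NULL;	/* racing cleanup now skips these nodes */
	mutex_exit(&op->o_lock);
	return (tree);
}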
+ */ +static struct modlsys modlsys = { + &mod_syscallops, "System V semaphore facility", &ipcsem_sysent +}; + +#ifdef _SYSCALL32_IMPL +static struct modlsys modlsys32 = { + &mod_syscallops32, "32-bit System V semaphore facility", &ipcsem_sysent +}; +#endif + +static struct modlinkage modlinkage = { + MODREV_1, + &modlsys, +#ifdef _SYSCALL32_IMPL + &modlsys32, +#endif + NULL +}; + + +int +_init(void) +{ + int result; + + sem_svc = ipcs_create("semids", rc_project_semmni, sizeof (ksemid_t), + sem_dtor, sem_rmid, AT_IPC_SEM, + offsetof(kproject_data_t, kpd_semmni)); + zone_key_create(&sem_zone_key, NULL, sem_remove_zone, NULL); + + if ((result = mod_install(&modlinkage)) == 0) + return (0); + + (void) zone_key_delete(sem_zone_key); + ipcs_destroy(sem_svc); + + return (result); +} + +int +_fini(void) +{ + return (EBUSY); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +static void +sem_dtor(kipc_perm_t *perm) +{ + ksemid_t *sp = (ksemid_t *)perm; + + kmem_free(sp->sem_base, + P2ROUNDUP(sp->sem_nsems * sizeof (struct sem), 64)); + list_destroy(&sp->sem_undos); +} + +/* + * sem_undo_add - Create or update adjust on exit entry. + */ +static int +sem_undo_add(short val, ushort_t num, struct sem_undo *undo) +{ + int newval = undo->un_aoe[num] - val; + + if (newval > USHRT_MAX || newval < -USHRT_MAX) + return (ERANGE); + undo->un_aoe[num] = newval; + + return (0); +} + +/* + * sem_undo_clear - clears all undo entries for specified semaphores + * + * Used when semaphores are reset by SETVAL or SETALL. + */ +static void +sem_undo_clear(ksemid_t *sp, ushort_t low, ushort_t high) +{ + struct sem_undo *undo; + int i; + + ASSERT(low <= high); + ASSERT(high < sp->sem_nsems); + + for (undo = list_head(&sp->sem_undos); undo; + undo = list_next(&sp->sem_undos, undo)) + for (i = low; i <= high; i++) + undo->un_aoe[i] = 0; +} + +/* + * sem_rollback - roll back work done so far if unable to complete operation + */ +static void +sem_rollback(ksemid_t *sp, struct sembuf *op, int n, struct sem_undo *undo) +{ + struct sem *semp; /* semaphore ptr */ + + for (op += n - 1; n--; op--) { + if (op->sem_op == 0) + continue; + semp = &sp->sem_base[op->sem_num]; + semp->semval -= op->sem_op; + if (op->sem_flg & SEM_UNDO) { + ASSERT(undo != NULL); + (void) sem_undo_add(-op->sem_op, op->sem_num, undo); + } + } +} + +static void +sem_rmid(kipc_perm_t *perm) +{ + ksemid_t *sp = (ksemid_t *)perm; + struct sem *semp; + struct sem_undo *undo; + size_t size = SEM_UNDOSZ(sp->sem_nsems); + int i; + + /*LINTED*/ + while (undo = list_head(&sp->sem_undos)) { + list_remove(&sp->sem_undos, undo); + mutex_enter(&undo->un_proc->p_lock); + if (undo->un_proc->p_semacct == NULL) { + mutex_exit(&undo->un_proc->p_lock); + continue; + } + avl_remove(undo->un_proc->p_semacct, undo); + mutex_exit(&undo->un_proc->p_lock); + kmem_free(undo, size); + ipc_rele_locked(sem_svc, (kipc_perm_t *)sp); + } + + for (i = 0; i < sp->sem_nsems; i++) { + semp = &sp->sem_base[i]; + semp->semval = semp->sempid = 0; + if (semp->semncnt) { + cv_broadcast(&semp->semncnt_cv); + semp->semncnt = 0; + } + if (semp->semzcnt) { + cv_broadcast(&semp->semzcnt_cv); + semp->semzcnt = 0; + } + } +} + +/* + * semctl - Semctl system call. 
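sem_undo_add() above subtracts each undoable sem_op from the adjust-on-exit slot, so the slot always holds exactly the adjustment that cancels the process's net effect. A worked user-level example of that arithmetic:

#include <stdio.h>

int
main(void)
{
	short history[] = { 3, -1, 2 };	/* SEM_UNDO ops applied over time */
	int aoe = 0, i;

	for (i = 0; i < 3; i++)
		aoe -= history[i];	/* mirrors sem_undo_add() */
	/* net effect on semval was +4; exit applies aoe to cancel it */
	printf("adjust-on-exit: %d\n", aoe);	/* prints -4 */
	return (0);
}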
+ */ +static int +semctl(int semid, uint_t semnum, int cmd, uintptr_t arg) +{ + ksemid_t *sp; /* ptr to semaphore header */ + struct sem *p; /* ptr to semaphore */ + unsigned int i; /* loop control */ + ushort_t *vals, *vp; + size_t vsize = 0; + int error = 0; + int retval = 0; + struct cred *cr; + kmutex_t *lock; + model_t mdl = get_udatamodel(); + STRUCT_DECL(semid_ds, sid); + struct semid_ds64 ds64; + + STRUCT_INIT(sid, mdl); + cr = CRED(); + + /* + * Perform pre- or non-lookup actions (e.g. copyins, RMID). + */ + switch (cmd) { + case IPC_SET: + if (copyin((void *)arg, STRUCT_BUF(sid), STRUCT_SIZE(sid))) + return (set_errno(EFAULT)); + break; + + case IPC_SET64: + if (copyin((void *)arg, &ds64, sizeof (struct semid_ds64))) + return (set_errno(EFAULT)); + break; + + case SETALL: + if ((lock = ipc_lookup(sem_svc, semid, + (kipc_perm_t **)&sp)) == NULL) + return (set_errno(EINVAL)); + vsize = sp->sem_nsems * sizeof (*vals); + mutex_exit(lock); + + /* allocate space to hold all semaphore values */ + vals = kmem_alloc(vsize, KM_SLEEP); + + if (copyin((void *)arg, vals, vsize)) { + kmem_free(vals, vsize); + return (set_errno(EFAULT)); + } + break; + + case IPC_RMID: + if (error = ipc_rmid(sem_svc, semid, cr)) + return (set_errno(error)); + return (0); + } + + if ((lock = ipc_lookup(sem_svc, semid, (kipc_perm_t **)&sp)) == NULL) { + if (vsize != 0) + kmem_free(vals, vsize); + return (set_errno(EINVAL)); + } + switch (cmd) { + /* Set ownership and permissions. */ + case IPC_SET: + + if (error = ipcperm_set(sem_svc, cr, &sp->sem_perm, + &STRUCT_BUF(sid)->sem_perm, mdl)) { + mutex_exit(lock); + return (set_errno(error)); + } + sp->sem_ctime = gethrestime_sec(); + mutex_exit(lock); + return (0); + + /* Get semaphore data structure. */ + case IPC_STAT: + + if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) { + mutex_exit(lock); + return (set_errno(error)); + } + + ipcperm_stat(&STRUCT_BUF(sid)->sem_perm, &sp->sem_perm, mdl); + STRUCT_FSETP(sid, sem_base, NULL); /* kernel addr */ + STRUCT_FSET(sid, sem_nsems, sp->sem_nsems); + STRUCT_FSET(sid, sem_otime, sp->sem_otime); + STRUCT_FSET(sid, sem_ctime, sp->sem_ctime); + STRUCT_FSET(sid, sem_binary, sp->sem_binary); + mutex_exit(lock); + + if (copyout(STRUCT_BUF(sid), (void *)arg, STRUCT_SIZE(sid))) + return (set_errno(EFAULT)); + return (0); + + case IPC_SET64: + + if (error = ipcperm_set64(sem_svc, cr, &sp->sem_perm, + &ds64.semx_perm)) { + mutex_exit(lock); + return (set_errno(error)); + } + sp->sem_ctime = gethrestime_sec(); + mutex_exit(lock); + return (0); + + case IPC_STAT64: + + ipcperm_stat64(&ds64.semx_perm, &sp->sem_perm); + ds64.semx_nsems = sp->sem_nsems; + ds64.semx_otime = sp->sem_otime; + ds64.semx_ctime = sp->sem_ctime; + + mutex_exit(lock); + if (copyout(&ds64, (void *)arg, sizeof (struct semid_ds64))) + return (set_errno(EFAULT)); + + return (0); + + /* Get # of processes sleeping for greater semval. */ + case GETNCNT: + if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) { + mutex_exit(lock); + return (set_errno(error)); + } + if (semnum >= sp->sem_nsems) { + mutex_exit(lock); + return (set_errno(EINVAL)); + } + retval = sp->sem_base[semnum].semncnt; + mutex_exit(lock); + return (retval); + + /* Get pid of last process to operate on semaphore. 
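A user-level sketch of the semctl() commands handled in the switch above, assuming Solaris's varargs semctl(2) (the fourth argument is passed directly; no union semun is required, unlike some other systems):

#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/sem.h>
#include <stdio.h>

int
main(void)
{
	int semid = semget(IPC_PRIVATE, 3, 0600);
	ushort_t vals[3] = { 1, 0, 2 };
	struct semid_ds ds;

	if (semid == -1)
		return (1);
	(void) semctl(semid, 0, SETALL, vals);	/* set all three semvals */
	(void) semctl(semid, 0, IPC_STAT, &ds);
	printf("nsems=%u, val[2]=%d\n", (uint_t)ds.sem_nsems,
	    semctl(semid, 2, GETVAL));
	(void) semctl(semid, 0, IPC_RMID);
	return (0);
}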
*/ + case GETPID: + if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) { + mutex_exit(lock); + return (set_errno(error)); + } + if (semnum >= sp->sem_nsems) { + mutex_exit(lock); + return (set_errno(EINVAL)); + } + retval = sp->sem_base[semnum].sempid; + mutex_exit(lock); + return (retval); + + /* Get semval of one semaphore. */ + case GETVAL: + if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) { + mutex_exit(lock); + return (set_errno(error)); + } + if (semnum >= sp->sem_nsems) { + mutex_exit(lock); + return (set_errno(EINVAL)); + } + retval = sp->sem_base[semnum].semval; + mutex_exit(lock); + return (retval); + + /* Get all semvals in set. */ + case GETALL: + if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) { + mutex_exit(lock); + return (set_errno(error)); + } + + /* allocate space to hold all semaphore values */ + vsize = sp->sem_nsems * sizeof (*vals); + vals = vp = kmem_alloc(vsize, KM_SLEEP); + + for (i = sp->sem_nsems, p = sp->sem_base; i--; p++, vp++) + bcopy(&p->semval, vp, sizeof (p->semval)); + + mutex_exit(lock); + + if (copyout((void *)vals, (void *)arg, vsize)) { + kmem_free(vals, vsize); + return (set_errno(EFAULT)); + } + + kmem_free(vals, vsize); + return (0); + + /* Get # of processes sleeping for semval to become zero. */ + case GETZCNT: + if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) { + mutex_exit(lock); + return (set_errno(error)); + } + if (semnum >= sp->sem_nsems) { + mutex_exit(lock); + return (set_errno(EINVAL)); + } + retval = sp->sem_base[semnum].semzcnt; + mutex_exit(lock); + return (retval); + + /* Set semval of one semaphore. */ + case SETVAL: + if (error = ipcperm_access(&sp->sem_perm, SEM_A, cr)) { + mutex_exit(lock); + return (set_errno(error)); + } + if (semnum >= sp->sem_nsems) { + mutex_exit(lock); + return (set_errno(EINVAL)); + } + if ((uint_t)arg > USHRT_MAX) { + mutex_exit(lock); + return (set_errno(ERANGE)); + } + p = &sp->sem_base[semnum]; + if ((p->semval = (ushort_t)arg) != 0) { + if (p->semncnt) { + cv_broadcast(&p->semncnt_cv); + } + } else if (p->semzcnt) { + cv_broadcast(&p->semzcnt_cv); + } + p->sempid = curproc->p_pid; + sem_undo_clear(sp, (ushort_t)semnum, (ushort_t)semnum); + mutex_exit(lock); + return (0); + + /* Set semvals of all semaphores in set. */ + case SETALL: + /* Check if semaphore set has been deleted and reallocated. */ + if (sp->sem_nsems * sizeof (*vals) != vsize) { + error = set_errno(EINVAL); + goto seterr; + } + if (error = ipcperm_access(&sp->sem_perm, SEM_A, cr)) { + error = set_errno(error); + goto seterr; + } + sem_undo_clear(sp, 0, sp->sem_nsems - 1); + for (i = 0, p = sp->sem_base; i < sp->sem_nsems; + (p++)->sempid = curproc->p_pid) { + if ((p->semval = vals[i++]) != 0) { + if (p->semncnt) { + cv_broadcast(&p->semncnt_cv); + } + } else if (p->semzcnt) { + cv_broadcast(&p->semzcnt_cv); + } + } +seterr: + mutex_exit(lock); + kmem_free(vals, vsize); + return (error); + + default: + mutex_exit(lock); + return (set_errno(EINVAL)); + } + + /* NOTREACHED */ +} + +/* + * semexit - Called by exit() to clean up on process exit. 
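+ *
+ * The adjust-on-exit values applied here were accumulated by
+ * semop() operations carrying the SEM_UNDO flag. A hedged
+ * userland sketch (illustrative only, not part of this file):
+ *
+ *	struct sembuf p = { 0, -1, SEM_UNDO };
+ *
+ *	(void) semop(semid, &p, 1);
+ *	exit(0);	(semexit() re-adds the 1 taken above)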
+ */ +void +semexit(proc_t *pp) +{ + avl_tree_t *tree; + struct sem_undo *undo; + void *cookie = NULL; + + mutex_enter(&pp->p_lock); + tree = pp->p_semacct; + pp->p_semacct = NULL; + mutex_exit(&pp->p_lock); + + while (undo = avl_destroy_nodes(tree, &cookie)) { + ksemid_t *sp = undo->un_sp; + size_t size = SEM_UNDOSZ(sp->sem_nsems); + int i; + + (void) ipc_lock(sem_svc, sp->sem_perm.ipc_id); + if (!IPC_FREE(&sp->sem_perm)) { + for (i = 0; i < sp->sem_nsems; i++) { + int adj = undo->un_aoe[i]; + if (adj) { + struct sem *semp = &sp->sem_base[i]; + int v = (int)semp->semval + adj; + + if (v < 0 || v > USHRT_MAX) + continue; + semp->semval = (ushort_t)v; + if (v == 0 && semp->semzcnt) + cv_broadcast(&semp->semzcnt_cv); + if (adj > 0 && semp->semncnt) + cv_broadcast(&semp->semncnt_cv); + } + } + list_remove(&sp->sem_undos, undo); + } + ipc_rele(sem_svc, (kipc_perm_t *)sp); + kmem_free(undo, size); + } + + avl_destroy(tree); + kmem_free(tree, sizeof (avl_tree_t)); +} + +/* + * Remove all semaphores associated with a given zone. Called by + * zone_shutdown when the zone is halted. + */ +/*ARGSUSED1*/ +static void +sem_remove_zone(zoneid_t zoneid, void *arg) +{ + ipc_remove_zone(sem_svc, zoneid); +} + +/* + * semget - Semget system call. + */ +static int +semget(key_t key, int nsems, int semflg) +{ + ksemid_t *sp; + kmutex_t *lock; + int id, error; + proc_t *pp = curproc; + +top: + if (error = ipc_get(sem_svc, key, semflg, (kipc_perm_t **)&sp, &lock)) + return (set_errno(error)); + + if (!IPC_FREE(&sp->sem_perm)) { + /* + * A semaphore with the requested key exists. + */ + if (!((nsems >= 0) && (nsems <= sp->sem_nsems))) { + mutex_exit(lock); + return (set_errno(EINVAL)); + } + } else { + /* + * This is a new semaphore set. Finish initialization. + */ + if (nsems <= 0 || (rctl_test(rc_process_semmsl, pp->p_rctls, pp, + nsems, RCA_SAFE) & RCT_DENY)) { + mutex_exit(lock); + mutex_exit(&pp->p_lock); + ipc_cleanup(sem_svc, (kipc_perm_t *)sp); + return (set_errno(EINVAL)); + } + mutex_exit(lock); + mutex_exit(&pp->p_lock); + + /* + * We round the allocation up to coherency granularity + * so that multiple semaphore allocations won't result + * in the false sharing of their sem structures. + */ + sp->sem_base = + kmem_zalloc(P2ROUNDUP(nsems * sizeof (struct sem), 64), + KM_SLEEP); + sp->sem_binary = (nsems == 1); + sp->sem_nsems = (ushort_t)nsems; + sp->sem_ctime = gethrestime_sec(); + sp->sem_otime = 0; + list_create(&sp->sem_undos, sizeof (struct sem_undo), + offsetof(struct sem_undo, un_list)); + + if (error = ipc_commit_begin(sem_svc, key, semflg, + (kipc_perm_t *)sp)) { + if (error == EAGAIN) + goto top; + return (set_errno(error)); + } + sp->sem_maxops = + rctl_enforced_value(rc_process_semopm, pp->p_rctls, pp); + if (rctl_test(rc_process_semmsl, pp->p_rctls, pp, nsems, + RCA_SAFE) & RCT_DENY) { + ipc_cleanup(sem_svc, (kipc_perm_t *)sp); + return (set_errno(EINVAL)); + } + lock = ipc_commit_end(sem_svc, &sp->sem_perm); + } +#ifdef C2_AUDIT + if (audit_active) + audit_ipcget(AT_IPC_SEM, (void *)sp); +#endif + id = sp->sem_perm.ipc_id; + mutex_exit(lock); + return (id); +} + +/* + * semids system call. + */ +static int +semids(int *buf, uint_t nids, uint_t *pnids) +{ + int error; + + if (error = ipc_ids(sem_svc, buf, nids, pnids)) + return (set_errno(error)); + + return (0); +} + + +/* + * Helper function for semop - copies in the provided timespec and + * computes the absolute future time after which we must return. 
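+ *
+ * Only the semtimedop() flavor reaches this path; the timeout is
+ * relative and is converted to an absolute deadline below. A
+ * hedged userland sketch (illustrative only):
+ *
+ *	struct sembuf p = { 0, -1, 0 };
+ *	struct timespec to = { 5, 0 };
+ *
+ *	if (semtimedop(semid, &p, 1, &to) == -1 && errno == EAGAIN)
+ *		(the wait timed out after roughly five seconds)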
+ */ +static int +compute_timeout(timespec_t **tsp, timespec_t *ts, timespec_t *now, + timespec_t *timeout) +{ + model_t datamodel = get_udatamodel(); + + if (datamodel == DATAMODEL_NATIVE) { + if (copyin(timeout, ts, sizeof (timespec_t))) + return (EFAULT); + } else { + timespec32_t ts32; + + if (copyin(timeout, &ts32, sizeof (timespec32_t))) + return (EFAULT); + TIMESPEC32_TO_TIMESPEC(ts, &ts32) + } + + if (itimerspecfix(ts)) + return (EINVAL); + + /* + * Convert the timespec value into absolute time. + */ + timespecadd(ts, now); + *tsp = ts; + + return (0); +} + +/* + * Undo structure comparator. We sort based on ksemid_t pointer. + */ +static int +sem_undo_compar(const void *x, const void *y) +{ + struct sem_undo *undo1 = (struct sem_undo *)x; + struct sem_undo *undo2 = (struct sem_undo *)y; + + if (undo1->un_sp < undo2->un_sp) + return (-1); + if (undo1->un_sp > undo2->un_sp) + return (1); + return (0); +} + +/* + * Helper function for semop - creates an undo structure and adds it to + * the process's avl tree and the semaphore's list. + */ +static int +sem_undo_alloc(proc_t *pp, ksemid_t *sp, kmutex_t **lock, + struct sem_undo *template, struct sem_undo **un) +{ + size_t size; + struct sem_undo *undo; + avl_tree_t *tree = NULL; + avl_index_t where; + + mutex_exit(*lock); + + size = SEM_UNDOSZ(sp->sem_nsems); + undo = kmem_zalloc(size, KM_SLEEP); + undo->un_proc = pp; + undo->un_sp = sp; + + if (pp->p_semacct == NULL) + tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); + + *lock = ipc_lock(sem_svc, sp->sem_perm.ipc_id); + if (IPC_FREE(&sp->sem_perm)) { + kmem_free(undo, size); + if (tree) + kmem_free(tree, sizeof (avl_tree_t)); + return (EIDRM); + } + + mutex_enter(&pp->p_lock); + if (tree) { + if (pp->p_semacct == NULL) { + avl_create(tree, sem_undo_compar, + sizeof (struct sem_undo), + offsetof(struct sem_undo, un_avl)); + pp->p_semacct = tree; + } else { + kmem_free(tree, sizeof (avl_tree_t)); + } + } + + if (*un = avl_find(pp->p_semacct, template, &where)) { + mutex_exit(&pp->p_lock); + kmem_free(undo, size); + } else { + *un = undo; + avl_insert(pp->p_semacct, undo, where); + mutex_exit(&pp->p_lock); + list_insert_head(&sp->sem_undos, undo); + ipc_hold(sem_svc, (kipc_perm_t *)sp); + } + + + return (0); +} + +/* + * semop - Semop system call. + */ +static int +semop(int semid, struct sembuf *sops, size_t nsops, timespec_t *timeout) +{ + ksemid_t *sp = NULL; + kmutex_t *lock; + struct sembuf *op; /* ptr to operation */ + int i; /* loop control */ + struct sem *semp; /* ptr to semaphore */ + int error = 0; + struct sembuf *uops; /* ptr to copy of user ops */ + struct sembuf x_sem; /* avoid kmem_alloc's */ + timespec_t now, ts, *tsp = NULL; + int timecheck = 0; + int cvres, needundo, mode; + struct sem_undo *undo; + proc_t *pp = curproc; + int held = 0; + + CPU_STATS_ADDQ(CPU, sys, sema, 1); /* bump semaphore op count */ + + /* + * To avoid the cost of copying in 'timeout' in the common + * case, we could only grab the time here and defer the copyin + * and associated computations until we are about to block. + * + * The down side to this is that we would then have to spin + * some goto top nonsense to avoid the copyin behind the semid + * lock. As a common use of timed semaphores is as an explicit + * blocking mechanism, this could incur a greater penalty. 
+ * + * If we eventually decide that this would be a wise route to + * take, the deferrable functionality is completely contained + * in 'compute_timeout', and the interface is defined such that + * we can legally not validate 'timeout' if it is unused. + */ + if (timeout != NULL) { + timecheck = timechanged; + gethrestime(&now); + if (error = compute_timeout(&tsp, &ts, &now, timeout)) + return (set_errno(error)); + } + + /* + * Allocate space to hold the vector of semaphore ops. If + * there is only 1 operation we use a preallocated buffer on + * the stack for speed. + * + * Since we don't want to allow the user to allocate an + * arbitrary amount of kernel memory, we need to check against + * the number of operations allowed by the semaphore. We only + * bother doing this if the number of operations is larger than + * SEM_MAXUCOPS. + */ + if (nsops == 1) + uops = &x_sem; + else if (nsops == 0) + return (0); + else if (nsops <= SEM_MAXUCOPS) + uops = kmem_alloc(nsops * sizeof (*uops), KM_SLEEP); + + if (nsops > SEM_MAXUCOPS) { + if ((lock = ipc_lookup(sem_svc, semid, + (kipc_perm_t **)&sp)) == NULL) + return (set_errno(EFAULT)); + + if (nsops > sp->sem_maxops) { + mutex_exit(lock); + return (set_errno(E2BIG)); + } + held = 1; + ipc_hold(sem_svc, (kipc_perm_t *)sp); + mutex_exit(lock); + + uops = kmem_alloc(nsops * sizeof (*uops), KM_SLEEP); + if (copyin(sops, uops, nsops * sizeof (*op))) { + error = EFAULT; + (void) ipc_lock(sem_svc, sp->sem_perm.ipc_id); + goto semoperr; + } + + lock = ipc_lock(sem_svc, sp->sem_perm.ipc_id); + if (IPC_FREE(&sp->sem_perm)) { + error = EIDRM; + goto semoperr; + } + } else { + /* + * This could be interleaved with the above code, but + * keeping them separate improves readability. + */ + if (copyin(sops, uops, nsops * sizeof (*op))) { + error = EFAULT; + goto semoperr_unlocked; + } + + if ((lock = ipc_lookup(sem_svc, semid, + (kipc_perm_t **)&sp)) == NULL) { + error = EINVAL; + goto semoperr_unlocked; + } + + if (nsops > sp->sem_maxops) { + error = E2BIG; + goto semoperr; + } + } + + /* + * Scan all operations. Verify that sem #s are in range and + * this process is allowed the requested operations. If any + * operations are marked SEM_UNDO, find (or allocate) the undo + * structure for this process and semaphore. + */ + needundo = 0; + mode = 0; + for (i = 0, op = uops; i++ < nsops; op++) { + mode |= op->sem_op ? SEM_A : SEM_R; + if (op->sem_num >= sp->sem_nsems) { + error = EFBIG; + goto semoperr; + } + if ((op->sem_flg & SEM_UNDO) && op->sem_op) + needundo = 1; + } + if (error = ipcperm_access(&sp->sem_perm, mode, CRED())) + goto semoperr; + + if (needundo) { + struct sem_undo template; + + template.un_sp = sp; + mutex_enter(&pp->p_lock); + if (pp->p_semacct) + undo = avl_find(pp->p_semacct, &template, NULL); + else + undo = NULL; + mutex_exit(&pp->p_lock); + if (undo == NULL) { + if (error = sem_undo_alloc(pp, sp, &lock, &template, + &undo)) + goto semoperr; + + /* sem_undo_alloc unlocks the semaphore */ + if (error = ipcperm_access(&sp->sem_perm, mode, CRED())) + goto semoperr; + } + } + +check: + /* + * Loop waiting for the operations to be satisfied atomically. + * Actually, do the operations and undo them if a wait is needed + * or an error is detected. + */ + for (i = 0; i < nsops; i++) { + op = &uops[i]; + semp = &sp->sem_base[op->sem_num]; + + /* + * Raise the semaphore (i.e. 
sema_v) + */ + if (op->sem_op > 0) { + if (op->sem_op + (int)semp->semval > USHRT_MAX || + ((op->sem_flg & SEM_UNDO) && + (error = sem_undo_add(op->sem_op, op->sem_num, + undo)))) { + if (i) + sem_rollback(sp, uops, i, undo); + if (error == 0) + error = ERANGE; + goto semoperr; + } + semp->semval += op->sem_op; + /* + * If we are only incrementing the semaphore value + * by one on a binary semaphore, we can cv_signal. + */ + if (semp->semncnt) { + if (op->sem_op == 1 && sp->sem_binary) + cv_signal(&semp->semncnt_cv); + else + cv_broadcast(&semp->semncnt_cv); + } + if (semp->semzcnt && !semp->semval) + cv_broadcast(&semp->semzcnt_cv); + continue; + } + + /* + * Lower the semaphore (i.e. sema_p) + */ + if (op->sem_op < 0) { + if (semp->semval >= (unsigned)(-op->sem_op)) { + if ((op->sem_flg & SEM_UNDO) && + (error = sem_undo_add(op->sem_op, + op->sem_num, undo))) { + if (i) + sem_rollback(sp, uops, i, undo); + goto semoperr; + } + semp->semval += op->sem_op; + if (semp->semzcnt && !semp->semval) + cv_broadcast(&semp->semzcnt_cv); + continue; + } + if (i) + sem_rollback(sp, uops, i, undo); + if (op->sem_flg & IPC_NOWAIT) { + error = EAGAIN; + goto semoperr; + } + + /* + * Mark the semaphore set as not a binary type + * if we are decrementing the value by more than 1. + * + * V operations will resort to cv_broadcast + * for this set because there are too many weird + * cases that have to be caught. + */ + if (op->sem_op < -1) + sp->sem_binary = 0; + if (!held) { + held = 1; + ipc_hold(sem_svc, (kipc_perm_t *)sp); + } + semp->semncnt++; + cvres = cv_waituntil_sig(&semp->semncnt_cv, lock, + tsp, timecheck); + lock = ipc_relock(sem_svc, sp->sem_perm.ipc_id, lock); + + if (!IPC_FREE(&sp->sem_perm)) { + ASSERT(semp->semncnt != 0); + semp->semncnt--; + if (cvres > 0) /* normal wakeup */ + goto check; + } + + /* EINTR or EAGAIN overrides EIDRM */ + if (cvres == 0) + error = EINTR; + else if (cvres < 0) + error = EAGAIN; + else + error = EIDRM; + goto semoperr; + } + + /* + * Wait for zero value + */ + if (semp->semval) { + if (i) + sem_rollback(sp, uops, i, undo); + if (op->sem_flg & IPC_NOWAIT) { + error = EAGAIN; + goto semoperr; + } + + if (!held) { + held = 1; + ipc_hold(sem_svc, (kipc_perm_t *)sp); + } + semp->semzcnt++; + cvres = cv_waituntil_sig(&semp->semzcnt_cv, lock, + tsp, timecheck); + lock = ipc_relock(sem_svc, sp->sem_perm.ipc_id, lock); + + /* + * Don't touch semp if the semaphores have been removed. + */ + if (!IPC_FREE(&sp->sem_perm)) { + ASSERT(semp->semzcnt != 0); + semp->semzcnt--; + if (cvres > 0) /* normal wakeup */ + goto check; + } + + /* EINTR or EAGAIN overrides EIDRM */ + if (cvres == 0) + error = EINTR; + else if (cvres < 0) + error = EAGAIN; + else + error = EIDRM; + goto semoperr; + } + } + + /* All operations succeeded. Update sempid for accessed semaphores. 
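+	 * The whole vector was applied atomically. A hedged userland
+	 * sketch (illustrative only) of an all-or-nothing acquisition
+	 * of two resources:
+	 *
+	 *	struct sembuf ops[2] = { { 0, -1, 0 }, { 1, -1, 0 } };
+	 *
+	 *	(void) semop(semid, ops, 2);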
*/ + for (i = 0, op = uops; i++ < nsops; + sp->sem_base[(op++)->sem_num].sempid = pp->p_pid) + ; + sp->sem_otime = gethrestime_sec(); + if (held) + ipc_rele(sem_svc, (kipc_perm_t *)sp); + else + mutex_exit(lock); + + /* Before leaving, deallocate the buffer that held the user semops */ + if (nsops != 1) + kmem_free(uops, sizeof (*uops) * nsops); + return (0); + + /* + * Error return labels + */ +semoperr: + if (held) + ipc_rele(sem_svc, (kipc_perm_t *)sp); + else + mutex_exit(lock); + +semoperr_unlocked: + + /* Before leaving, deallocate the buffer that held the user semops */ + if (nsops != 1) + kmem_free(uops, sizeof (*uops) * nsops); + return (set_errno(error)); +} + +/* + * semsys - System entry point for semctl, semget, and semop system calls. + */ +static int +semsys(int opcode, uintptr_t a1, uintptr_t a2, uintptr_t a3, uintptr_t a4) +{ + int error; + + switch (opcode) { + case SEMCTL: + error = semctl((int)a1, (uint_t)a2, (int)a3, a4); + break; + case SEMGET: + error = semget((key_t)a1, (int)a2, (int)a3); + break; + case SEMOP: + error = semop((int)a1, (struct sembuf *)a2, (size_t)a3, 0); + break; + case SEMIDS: + error = semids((int *)a1, (uint_t)a2, (uint_t *)a3); + break; + case SEMTIMEDOP: + error = semop((int)a1, (struct sembuf *)a2, (size_t)a3, + (timespec_t *)a4); + break; + default: + error = set_errno(EINVAL); + break; + } + return (error); +} diff --git a/usr/src/uts/common/syscall/sendfile.c b/usr/src/uts/common/syscall/sendfile.c new file mode 100644 index 0000000000..2f504af827 --- /dev/null +++ b/usr/src/uts/common/syscall/sendfile.c @@ -0,0 +1,1186 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/t_lock.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/buf.h> +#include <sys/conf.h> +#include <sys/cred.h> +#include <sys/kmem.h> +#include <sys/sysmacros.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/debug.h> +#include <sys/errno.h> +#include <sys/time.h> +#include <sys/file.h> +#include <sys/open.h> +#include <sys/user.h> +#include <sys/termios.h> +#include <sys/stream.h> +#include <sys/strsubr.h> +#include <sys/esunddi.h> +#include <sys/flock.h> +#include <sys/modctl.h> +#include <sys/cmn_err.h> +#include <sys/vmsystm.h> + +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <netinet/in.h> +#include <sys/sendfile.h> +#include <sys/un.h> +#include <inet/nca/ncadoorhdr.h> +#include <inet/nca/ncaio.h> +#include <sys/tihdr.h> +#include <sys/atomic.h> + +#include <inet/common.h> +#include <inet/ip.h> +#include <inet/ip6.h> +#include <inet/tcp.h> + +extern int nca_sendfilev(file_t *, struct sendfilevec *, int, ssize_t *); +extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *, + ssize32_t *); +extern void nl7c_sendfilev(struct sonode *, u_offset_t, struct sendfilevec *, + int); + +#define SEND_MAX_CHUNK 16 + +#if defined(_SYSCALL32_IMPL) || defined(_ILP32) +/* + * 64 bit offsets for 32 bit applications only running either on + * 64 bit kernel or 32 bit kernel. For 32 bit apps, we can't transfer + * more than 2GB of data. + */ +int +sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv, + int copy_cnt, ssize32_t *count) +{ + struct vnode *vp; + ushort_t fflag; + int ioflag; + size32_t cnt; + ssize32_t sfv_len; + ssize32_t tmpcount; + u_offset_t sfv_off; + struct uio auio; + struct iovec aiov; + int i, error; + + fflag = fp->f_flag; + vp = fp->f_vnode; + for (i = 0; i < copy_cnt; i++) { + + if (ISSIG(curthread, JUSTLOOKING)) + return (EINTR); + + /* + * Do similar checks as "write" as we are writing + * sfv_len bytes into "vp". 
+		 */
+		sfv_len = (ssize32_t)sfv->sfv_len;
+
+		if (sfv_len == 0)
+			continue;
+
+		if (sfv_len < 0)
+			return (EINVAL);
+
+		if (vp->v_type == VREG) {
+			if (*fileoff >= curproc->p_fsz_ctl) {
+				mutex_enter(&curproc->p_lock);
+				(void) rctl_action(
+				    rctlproc_legacy[RLIMIT_FSIZE],
+				    curproc->p_rctls, curproc, RCA_SAFE);
+				mutex_exit(&curproc->p_lock);
+				return (EFBIG);
+			}
+
+			if (*fileoff >= OFFSET_MAX(fp))
+				return (EFBIG);
+
+			if (*fileoff + sfv_len > OFFSET_MAX(fp))
+				return (EINVAL);
+		}
+
+		tmpcount = *count + sfv_len;
+		if (tmpcount < 0)
+			return (EINVAL);
+
+		sfv_off = sfv->sfv_off;
+
+		auio.uio_extflg = UIO_COPY_DEFAULT;
+		if (sfv->sfv_fd == SFV_FD_SELF) {
+			aiov.iov_len = sfv_len;
+			aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
+			auio.uio_loffset = *fileoff;
+			auio.uio_iovcnt = 1;
+			auio.uio_resid = sfv_len;
+			auio.uio_iov = &aiov;
+			auio.uio_segflg = UIO_USERSPACE;
+			auio.uio_llimit = curproc->p_fsz_ctl;
+			auio.uio_fmode = fflag;
+			ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
+			while (sfv_len > 0) {
+				error = VOP_WRITE(vp, &auio, ioflag,
+				    fp->f_cred, NULL);
+				cnt = sfv_len - auio.uio_resid;
+				sfv_len -= cnt;
+				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
+				if (vp->v_type == VREG)
+					*fileoff += cnt;
+				*count += cnt;
+				if (error != 0)
+					return (error);
+			}
+		} else {
+			file_t *ffp;
+			vnode_t *readvp;
+			int readflg = 0;
+			size_t size;
+			caddr_t ptr;
+
+			if ((ffp = getf(sfv->sfv_fd)) == NULL)
+				return (EBADF);
+
+			if ((ffp->f_flag & FREAD) == 0) {
+				releasef(sfv->sfv_fd);
+				return (EBADF);
+			}
+
+			readvp = ffp->f_vnode;
+			if (readvp->v_type != VREG) {
+				releasef(sfv->sfv_fd);
+				return (EINVAL);
+			}
+
+			/*
+			 * No point reading and writing to same vp,
+			 * as long as both are regular files. readvp is not
+			 * locked; but since we got it from an open file the
+			 * contents will be valid during the time of access.
+			 */
+			if (VN_CMP(vp, readvp)) {
+				releasef(sfv->sfv_fd);
+				return (EINVAL);
+			}
+
+			/*
+			 * Note: we assume readvp != vp. "vp" is already
+			 * locked, and "readvp" must not be.
+			 */
+			(void) VOP_RWLOCK(readvp, readflg, NULL);
+
+			/*
+			 * Same checks as in pread64.
+			 */
+			if (sfv_off > MAXOFFSET_T) {
+				VOP_RWUNLOCK(readvp, readflg, NULL);
+				releasef(sfv->sfv_fd);
+				return (EINVAL);
+			}
+
+			if (sfv_off + sfv_len > MAXOFFSET_T)
+				sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);
+
+			/* Find the native blocksize to transfer data */
+			size = MIN(vp->v_vfsp->vfs_bsize,
+			    readvp->v_vfsp->vfs_bsize);
+			size = sfv_len < size ? sfv_len : size;
+			ptr = kmem_alloc(size, KM_SLEEP);
+
+			while (sfv_len > 0) {
+				size_t iov_len;
+
+				iov_len = MIN(size, sfv_len);
+				aiov.iov_base = ptr;
+				aiov.iov_len = iov_len;
+				auio.uio_loffset = sfv_off;
+				auio.uio_iov = &aiov;
+				auio.uio_iovcnt = 1;
+				auio.uio_resid = iov_len;
+				auio.uio_segflg = UIO_SYSSPACE;
+				auio.uio_llimit = MAXOFFSET_T;
+				auio.uio_fmode = ffp->f_flag;
+				ioflag = auio.uio_fmode &
+				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
+
+				/*
+				 * If read sync is not asked for,
+				 * filter sync flags
+				 */
+				if ((ioflag & FRSYNC) == 0)
+					ioflag &= ~(FSYNC|FDSYNC);
+				error = VOP_READ(readvp, &auio, ioflag,
+				    fp->f_cred, NULL);
+				if (error) {
+					kmem_free(ptr, size);
+					VOP_RWUNLOCK(readvp, readflg, NULL);
+					releasef(sfv->sfv_fd);
+					return (error);
+				}
+
+				/*
+				 * Check how much data was really read.
+				 * Decrement the 'len' and increment the
+				 * 'off' appropriately.
+				 */
+				cnt = iov_len - auio.uio_resid;
+				if (cnt == 0) {
+					/*
+					 * If we were reading a pipe (currently
+					 * not implemented), we may now lose
+					 * data.
+ */ + kmem_free(ptr, size); + VOP_RWUNLOCK(readvp, readflg, NULL); + releasef(sfv->sfv_fd); + return (EINVAL); + } + sfv_len -= cnt; + sfv_off += cnt; + + aiov.iov_base = ptr; + aiov.iov_len = cnt; + auio.uio_loffset = *fileoff; + auio.uio_resid = cnt; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_llimit = curproc->p_fsz_ctl; + auio.uio_fmode = fflag; + ioflag = auio.uio_fmode & + (FAPPEND|FSYNC|FDSYNC|FRSYNC); + error = VOP_WRITE(vp, &auio, ioflag, + fp->f_cred, NULL); + + /* + * Check how much data was written. Increment + * the 'len' and decrement the 'off' if all + * the data was not written. + */ + cnt -= auio.uio_resid; + sfv_len += auio.uio_resid; + sfv_off -= auio.uio_resid; + ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt; + if (vp->v_type == VREG) + *fileoff += cnt; + *count += cnt; + if (error != 0) { + kmem_free(ptr, size); + VOP_RWUNLOCK(readvp, readflg, NULL); + releasef(sfv->sfv_fd); + return (error); + } + } + VOP_RWUNLOCK(readvp, readflg, NULL); + releasef(sfv->sfv_fd); + kmem_free(ptr, size); + } + sfv++; + } + return (0); +} + +ssize32_t +sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt, + size32_t *xferred, int fildes) +{ + int rwflag; + u_offset_t fileoff; + int copy_cnt; + const struct ksendfilevec64 *copy_vec; + struct ksendfilevec64 sfv[SEND_MAX_CHUNK]; + struct vnode *vp; + int error; + ssize32_t count = 0; + int osfvcnt; + + rwflag = 1; + vp = fp->f_vnode; + (void) VOP_RWLOCK(vp, rwflag, NULL); + + copy_vec = vec; + fileoff = fp->f_offset; + osfvcnt = sfvcnt; + + do { + copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK); + if (copyin(copy_vec, sfv, copy_cnt * + sizeof (struct ksendfilevec64))) { + error = EFAULT; + break; + } + + /* + * Optimize the single regular file over + * the socket case. + */ + if (vp->v_type == VSOCK && osfvcnt == 1 && + sfv->sfv_fd != SFV_FD_SELF) { + file_t *rfp; + vnode_t *rvp; + + if ((rfp = getf(sfv->sfv_fd)) == NULL) { + error = EBADF; + break; + } + if ((rfp->f_flag & FREAD) == 0) { + releasef(sfv->sfv_fd); + error = EBADF; + break; + } + rvp = rfp->f_vnode; + if (rvp->v_type == VREG) { + error = sosendfile64(fp, rfp, sfv, &count); + break; + } + releasef(sfv->sfv_fd); + } + error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count); + if (error != 0) + break; + + copy_vec += copy_cnt; + sfvcnt -= copy_cnt; + } while (sfvcnt > 0); + + if (vp->v_type == VREG) + fp->f_offset += count; + + VOP_RWUNLOCK(vp, rwflag, NULL); + if (copyout(&count, xferred, sizeof (count))) + error = EFAULT; + releasef(fildes); + if (error != 0) + return (set_errno(error)); + return (count); +} +#endif + +int +sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, + int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count) +{ + struct vnode *vp; + struct uio auio; + struct iovec aiov; + ushort_t fflag; + int ioflag; + int i, error; + size_t cnt; + ssize_t sfv_len; + u_offset_t sfv_off; +#ifdef _SYSCALL32_IMPL + model_t model = get_udatamodel(); + u_offset_t maxoff = (model == DATAMODEL_ILP32) ? 
+ MAXOFF32_T : MAXOFFSET_T; +#else + const u_offset_t maxoff = MAXOFF32_T; +#endif + mblk_t *dmp = NULL; + int wroff; + int buf_left = 0; + size_t iov_len; + mblk_t *head, *tmp; + size_t size = total_size; + + fflag = fp->f_flag; + vp = fp->f_vnode; + + ASSERT(vp->v_type == VSOCK); + ASSERT(maxblk > 0); + + wroff = (int)vp->v_stream->sd_wroff; + buf_left = MIN(total_size, maxblk); + head = dmp = allocb(buf_left + wroff, BPRI_HI); + if (head == NULL) + return (ENOMEM); + head->b_wptr = head->b_rptr = head->b_rptr + wroff; + + auio.uio_extflg = UIO_COPY_DEFAULT; + for (i = 0; i < copy_cnt; i++) { + if (ISSIG(curthread, JUSTLOOKING)) + return (EINTR); + + /* + * Do similar checks as "write" as we are writing + * sfv_len bytes into "vp". + */ + sfv_len = (ssize_t)sfv->sfv_len; + + if (sfv_len == 0) { + sfv++; + continue; + } + + /* Make sure sfv_len is not negative */ +#ifdef _SYSCALL32_IMPL + if (model == DATAMODEL_ILP32) { + if ((ssize32_t)sfv_len < 0) + return (EINVAL); + } else +#endif + if (sfv_len < 0) + return (EINVAL); + + /* Check for overflow */ +#ifdef _SYSCALL32_IMPL + if (model == DATAMODEL_ILP32) { + if (((ssize32_t)(*count + sfv_len)) < 0) + return (EINVAL); + } else +#endif + if ((*count + sfv_len) < 0) + return (EINVAL); + + sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off; + + if (sfv->sfv_fd == SFV_FD_SELF) { + while (sfv_len > 0) { + if (buf_left == 0) { + tmp = dmp; + buf_left = MIN(total_size, maxblk); + iov_len = MIN(buf_left, sfv_len); + dmp = allocb(buf_left + wroff, BPRI_HI); + if (dmp == NULL) { + freemsg(head); + return (ENOMEM); + } + dmp->b_wptr = dmp->b_rptr = + dmp->b_rptr + wroff; + tmp->b_cont = dmp; + } else { + iov_len = MIN(buf_left, sfv_len); + } + + aiov.iov_len = iov_len; + aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; + auio.uio_loffset = *fileoff; + auio.uio_iovcnt = 1; + auio.uio_resid = iov_len; + auio.uio_iov = &aiov; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_llimit = curproc->p_fsz_ctl; + auio.uio_fmode = fflag; + + buf_left -= iov_len; + total_size -= iov_len; + sfv_len -= iov_len; + sfv_off += iov_len; + + error = uiomove((caddr_t)dmp->b_wptr, + iov_len, UIO_WRITE, &auio); + if (error != 0) { + freemsg(head); + return (error); + } + dmp->b_wptr += iov_len; + } + } else { + file_t *ffp; + vnode_t *readvp; + int readflg = 0; + + if ((ffp = getf(sfv->sfv_fd)) == NULL) { + freemsg(head); + return (EBADF); + } + + if ((ffp->f_flag & FREAD) == 0) { + releasef(sfv->sfv_fd); + freemsg(head); + return (EACCES); + } + + readvp = ffp->f_vnode; + if (readvp->v_type != VREG) { + releasef(sfv->sfv_fd); + freemsg(head); + return (EINVAL); + } + + /* + * No point reading and writing to same vp, + * as long as both are regular files. readvp is not + * locked; but since we got it from an open file the + * contents will be valid during the time of access. + */ + + if (VN_CMP(vp, readvp)) { + releasef(sfv->sfv_fd); + freemsg(head); + return (EINVAL); + } + + /* + * Note: we assume readvp != vp. "vp" is already + * locked, and "readvp" must not be. 
+			 */
+
+			(void) VOP_RWLOCK(readvp, readflg, NULL);
+
+			/* Same checks as in pread */
+			if (sfv_off > maxoff) {
+				VOP_RWUNLOCK(readvp, readflg, NULL);
+				releasef(sfv->sfv_fd);
+				freemsg(head);
+				return (EINVAL);
+			}
+			if (sfv_off + sfv_len > maxoff) {
+				sfv_len = (ssize_t)((offset_t)maxoff -
+				    sfv_off);
+			}
+
+			while (sfv_len > 0) {
+				if (buf_left == 0) {
+					tmp = dmp;
+					buf_left = MIN(total_size, maxblk);
+					iov_len = MIN(buf_left, sfv_len);
+					dmp = allocb(buf_left + wroff, BPRI_HI);
+					if (dmp == NULL) {
+						VOP_RWUNLOCK(readvp, readflg,
+						    NULL);
+						releasef(sfv->sfv_fd);
+						freemsg(head);
+						return (ENOMEM);
+					}
+					dmp->b_wptr = dmp->b_rptr =
+					    dmp->b_rptr + wroff;
+					tmp->b_cont = dmp;
+				} else {
+					iov_len = MIN(buf_left, sfv_len);
+				}
+				aiov.iov_base = (caddr_t)dmp->b_wptr;
+				aiov.iov_len = iov_len;
+				auio.uio_loffset = sfv_off;
+				auio.uio_iov = &aiov;
+				auio.uio_iovcnt = 1;
+				auio.uio_resid = iov_len;
+				auio.uio_segflg = UIO_SYSSPACE;
+				auio.uio_llimit = MAXOFFSET_T;
+				auio.uio_fmode = ffp->f_flag;
+				ioflag = auio.uio_fmode &
+				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
+
+				/*
+				 * If read sync is not asked for,
+				 * filter sync flags
+				 */
+				if ((ioflag & FRSYNC) == 0)
+					ioflag &= ~(FSYNC|FDSYNC);
+				error = VOP_READ(readvp, &auio, ioflag,
+				    fp->f_cred, NULL);
+				if (error != 0) {
+					/*
+					 * If we were reading a pipe (currently
+					 * not implemented), we may now lose
+					 * data.
+					 */
+					VOP_RWUNLOCK(readvp, readflg, NULL);
+					releasef(sfv->sfv_fd);
+					freemsg(head);
+					return (error);
+				}
+
+				/*
+				 * Check how much data was really read.
+				 * Decrement the 'len' and increment the
+				 * 'off' appropriately.
+				 */
+				cnt = iov_len - auio.uio_resid;
+				if (cnt == 0) {
+					VOP_RWUNLOCK(readvp, readflg, NULL);
+					releasef(sfv->sfv_fd);
+					freemsg(head);
+					return (EINVAL);
+				}
+				sfv_len -= cnt;
+				sfv_off += cnt;
+				total_size -= cnt;
+				buf_left -= cnt;
+
+				dmp->b_wptr += cnt;
+			}
+			VOP_RWUNLOCK(readvp, readflg, NULL);
+			releasef(sfv->sfv_fd);
+		}
+		sfv++;
+	}
+
+	ASSERT(total_size == 0);
+	error = kstrwritemp(vp, head, fflag);
+	if (error != 0) {
+		freemsg(head);
+		return (error);
+	}
+	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size;
+	*count += size;
+
+	return (0);
+}
+
+
+int
+sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
+	int copy_cnt, ssize_t *count)
+{
+	struct vnode *vp;
+	struct uio auio;
+	struct iovec aiov;
+	ushort_t fflag;
+	int ioflag;
+	int i, error;
+	size_t cnt;
+	ssize_t sfv_len;
+	u_offset_t sfv_off;
+#ifdef _SYSCALL32_IMPL
+	model_t model = get_udatamodel();
+	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
+	    MAXOFF32_T : MAXOFFSET_T;
+#else
+	const u_offset_t maxoff = MAXOFF32_T;
+#endif
+	mblk_t *dmp;
+
+	fflag = fp->f_flag;
+	vp = fp->f_vnode;
+
+	auio.uio_extflg = UIO_COPY_DEFAULT;
+	for (i = 0; i < copy_cnt; i++) {
+		if (ISSIG(curthread, JUSTLOOKING))
+			return (EINTR);
+
+		/*
+		 * Do similar checks as "write" as we are writing
+		 * sfv_len bytes into "vp".
+ */ + sfv_len = (ssize_t)sfv->sfv_len; + + if (sfv_len == 0) { + sfv++; + continue; + } + + /* Make sure sfv_len is not negative */ +#ifdef _SYSCALL32_IMPL + if (model == DATAMODEL_ILP32) { + if ((ssize32_t)sfv_len < 0) + return (EINVAL); + } else +#endif + if (sfv_len < 0) + return (EINVAL); + + if (vp->v_type == VREG) { + if (*fileoff >= curproc->p_fsz_ctl) { + mutex_enter(&curproc->p_lock); + (void) rctl_action( + rctlproc_legacy[RLIMIT_FSIZE], + curproc->p_rctls, curproc, RCA_SAFE); + mutex_exit(&curproc->p_lock); + + return (EFBIG); + } + + if (*fileoff >= maxoff) + return (EFBIG); + + if (*fileoff + sfv_len > maxoff) + return (EINVAL); + } + + /* Check for overflow */ +#ifdef _SYSCALL32_IMPL + if (model == DATAMODEL_ILP32) { + if (((ssize32_t)(*count + sfv_len)) < 0) + return (EINVAL); + } else +#endif + if ((*count + sfv_len) < 0) + return (EINVAL); + + sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off; + + if (sfv->sfv_fd == SFV_FD_SELF) { + aiov.iov_len = sfv_len; + aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; + auio.uio_loffset = *fileoff; + auio.uio_iovcnt = 1; + auio.uio_resid = sfv_len; + auio.uio_iov = &aiov; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_llimit = curproc->p_fsz_ctl; + auio.uio_fmode = fflag; + + if (vp->v_type == VSOCK) { + + /* + * Optimize for the socket case + */ + int wroff = (int)vp->v_stream->sd_wroff; + + dmp = allocb(sfv_len + wroff, BPRI_HI); + if (dmp == NULL) + return (ENOMEM); + dmp->b_wptr = dmp->b_rptr = dmp->b_rptr + wroff; + error = uiomove((caddr_t)dmp->b_wptr, + sfv_len, UIO_WRITE, &auio); + if (error != 0) { + freeb(dmp); + return (error); + } + dmp->b_wptr += sfv_len; + error = kstrwritemp(vp, dmp, fflag); + if (error != 0) { + freeb(dmp); + return (error); + } + ttolwp(curthread)->lwp_ru.ioch += + (ulong_t)sfv_len; + *count += sfv_len; + } else { + ioflag = auio.uio_fmode & + (FAPPEND|FSYNC|FDSYNC|FRSYNC); + while (sfv_len > 0) { + error = VOP_WRITE(vp, &auio, ioflag, + fp->f_cred, NULL); + cnt = sfv_len - auio.uio_resid; + sfv_len -= cnt; + ttolwp(curthread)->lwp_ru.ioch += + (ulong_t)cnt; + *fileoff += cnt; + *count += cnt; + if (error != 0) + return (error); + } + } + } else { + file_t *ffp; + vnode_t *readvp; + int readflg = 0; + size_t size; + caddr_t ptr; + + if ((ffp = getf(sfv->sfv_fd)) == NULL) + return (EBADF); + + if ((ffp->f_flag & FREAD) == 0) { + releasef(sfv->sfv_fd); + return (EBADF); + } + + readvp = ffp->f_vnode; + if (readvp->v_type != VREG) { + releasef(sfv->sfv_fd); + return (EINVAL); + } + + /* + * No point reading and writing to same vp, + * as long as both are regular files. readvp is not + * locked; but since we got it from an open file the + * contents will be valid during the time of access. + */ + if (VN_CMP(vp, readvp)) { + releasef(sfv->sfv_fd); + return (EINVAL); + } + + /* + * Note: we assume readvp != vp. "vp" is already + * locked, and "readvp" must not be. + */ + (void) VOP_RWLOCK(readvp, readflg, NULL); + + /* Same checks as in pread */ + if (sfv_off > maxoff) { + VOP_RWUNLOCK(readvp, readflg, NULL); + releasef(sfv->sfv_fd); + return (EINVAL); + } + if (sfv_off + sfv_len > maxoff) { + sfv_len = (ssize_t)((offset_t)maxoff - + sfv_off); + } + /* Find the native blocksize to transfer data */ + size = MIN(vp->v_vfsp->vfs_bsize, + readvp->v_vfsp->vfs_bsize); + size = sfv_len < size ? 
sfv_len : size; + + while (sfv_len > 0) { + size_t iov_len; + + iov_len = MIN(size, sfv_len); + + dmp = allocb(iov_len, BPRI_HI); + if (dmp == NULL) { + VOP_RWUNLOCK(readvp, readflg, NULL); + releasef(sfv->sfv_fd); + return (ENOMEM); + } + ptr = (caddr_t)dmp->b_rptr; + + aiov.iov_base = ptr; + aiov.iov_len = iov_len; + auio.uio_loffset = sfv_off; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_resid = iov_len; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_llimit = MAXOFFSET_T; + auio.uio_fmode = ffp->f_flag; + ioflag = auio.uio_fmode & + (FAPPEND|FSYNC|FDSYNC|FRSYNC); + + /* + * If read sync is not asked for, + * filter sync flags + */ + if ((ioflag & FRSYNC) == 0) + ioflag &= ~(FSYNC|FDSYNC); + error = VOP_READ(readvp, &auio, ioflag, + fp->f_cred, NULL); + if (error != 0) { + /* + * If we were reading a pipe (currently + * not implemented), we may now lose + * data. + */ + freeb(dmp); + VOP_RWUNLOCK(readvp, readflg, NULL); + releasef(sfv->sfv_fd); + return (error); + } + + /* + * Check how much data was really read. + * Decrement the 'len' and increment the + * 'off' appropriately. + */ + cnt = iov_len - auio.uio_resid; + if (cnt == 0) { + freeb(dmp); + VOP_RWUNLOCK(readvp, readflg, NULL); + releasef(sfv->sfv_fd); + return (EINVAL); + } + sfv_len -= cnt; + sfv_off += cnt; + + if (vp->v_type == VSOCK) { + dmp->b_wptr = dmp->b_rptr + cnt; + + error = kstrwritemp(vp, dmp, fflag); + if (error != 0) { + freeb(dmp); + VOP_RWUNLOCK(readvp, readflg, + NULL); + releasef(sfv->sfv_fd); + return (error); + } + + ttolwp(curthread)->lwp_ru.ioch += + (ulong_t)cnt; + *count += cnt; + } else { + + aiov.iov_base = ptr; + aiov.iov_len = cnt; + auio.uio_loffset = *fileoff; + auio.uio_resid = cnt; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_llimit = curproc->p_fsz_ctl; + auio.uio_fmode = fflag; + ioflag = auio.uio_fmode & + (FAPPEND|FSYNC|FDSYNC|FRSYNC); + error = VOP_WRITE(vp, &auio, ioflag, + fp->f_cred, NULL); + + /* + * Check how much data was written. + * Increment the 'len' and decrement the + * 'off' if all the data was not + * written. + */ + cnt -= auio.uio_resid; + sfv_len += auio.uio_resid; + sfv_off -= auio.uio_resid; + ttolwp(curthread)->lwp_ru.ioch += + (ulong_t)cnt; + *fileoff += cnt; + *count += cnt; + freeb(dmp); + if (error != 0) { + VOP_RWUNLOCK(readvp, readflg, + NULL); + releasef(sfv->sfv_fd); + return (error); + } + } + } + VOP_RWUNLOCK(readvp, readflg, NULL); + releasef(sfv->sfv_fd); + } + sfv++; + } + return (0); +} + +ssize_t +sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt, + size_t *xferred) +{ + int error; + file_t *fp; + struct vnode *vp; + struct sonode *so; + u_offset_t fileoff; + int copy_cnt; + const struct sendfilevec *copy_vec; + struct sendfilevec sfv[SEND_MAX_CHUNK]; + ssize_t count = 0; +#ifdef _SYSCALL32_IMPL + struct ksendfilevec32 sfv32[SEND_MAX_CHUNK]; +#endif + ssize_t total_size = 0; + int i; + boolean_t is_sock = B_FALSE; + int maxblk = 0; + + if (sfvcnt <= 0) + return (set_errno(EINVAL)); + + if ((fp = getf(fildes)) == NULL) + return (set_errno(EBADF)); + + if (((fp->f_flag) & FWRITE) == 0) { + error = EBADF; + goto err; + } + + fileoff = fp->f_offset; + vp = fp->f_vnode; + + switch (vp->v_type) { + case VSOCK: + so = VTOSO(vp); + /* sendfile not supported for SCTP */ + if (so->so_protocol == IPPROTO_SCTP) { + error = EPROTONOSUPPORT; + goto err; + } + is_sock = B_TRUE; + switch (so->so_family) { + case AF_NCA: + case AF_INET: + case AF_INET6: + /* + * Make similar checks done in SOP_WRITE(). 
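+			 * As with write(2) on a socket, a peer that is shut
+			 * down for writing surfaces to a hedged userland
+			 * caller as SIGPIPE plus EPIPE, e.g.:
+			 *
+			 *	(void) signal(SIGPIPE, SIG_IGN);
+			 *	if (sendfilev(sock, &v, 1, &xfer) == -1 &&
+			 *	    errno == EPIPE)
+			 *		(the connection is no longer writable)
+			 *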
+			 */
+			if (so->so_state & SS_CANTSENDMORE) {
+				tsignal(curthread, SIGPIPE);
+				error = EPIPE;
+				goto err;
+			}
+			if (so->so_type != SOCK_STREAM) {
+				error = EOPNOTSUPP;
+				goto err;
+			}
+
+			if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
+			    (SS_ISCONNECTED|SS_ISBOUND)) {
+				error = ENOTCONN;
+				goto err;
+			}
+
+			if ((so->so_state & SS_TCP_FAST_ACCEPT) &&
+			    (so->so_priv != NULL)) {
+				maxblk = ((tcp_t *)so->so_priv)->tcp_mss;
+			} else {
+				maxblk = (int)vp->v_stream->sd_maxblk;
+			}
+			break;
+		default:
+			error = EAFNOSUPPORT;
+			goto err;
+		}
+		break;
+	case VREG:
+		break;
+	default:
+		error = EINVAL;
+		goto err;
+	}
+
+	switch (opcode) {
+	case SENDFILEV:
+		break;
+#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
+	case SENDFILEV64:
+		return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt,
+		    (size32_t *)xferred, fildes));
+#endif
+	default:
+		error = ENOSYS;
+		goto err;
+	}
+
+	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
+	copy_vec = vec;
+
+	do {
+		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
+#ifdef _SYSCALL32_IMPL
+		/* 32-bit callers need to have their iovec expanded. */
+		if (get_udatamodel() == DATAMODEL_ILP32) {
+			if (copyin(copy_vec, sfv32,
+			    copy_cnt * sizeof (ksendfilevec32_t))) {
+				error = EFAULT;
+				break;
+			}
+
+			for (i = 0; i < copy_cnt; i++) {
+				sfv[i].sfv_fd = sfv32[i].sfv_fd;
+				sfv[i].sfv_off =
+				    (off_t)(uint32_t)sfv32[i].sfv_off;
+				sfv[i].sfv_len = (size_t)sfv32[i].sfv_len;
+				total_size += sfv[i].sfv_len;
+				sfv[i].sfv_flag = sfv32[i].sfv_flag;
+			}
+		} else {
+#endif
+			if (copyin(copy_vec, sfv,
+			    copy_cnt * sizeof (sendfilevec_t))) {
+				error = EFAULT;
+				break;
+			}
+
+			for (i = 0; i < copy_cnt; i++) {
+				total_size += sfv[i].sfv_len;
+			}
+#ifdef _SYSCALL32_IMPL
+		}
+#endif
+
+		/*
+		 * The choice between sendvec_small_chunk and
+		 * sendvec_chunk depends on multiple things:
+		 *
+		 * i) latency is important for smaller files. So if the
+		 * data is smaller than 'tcp_slow_start_initial' times
+		 * maxblk, then use sendvec_small_chunk which creates
+		 * maxblk size mblks and chains them together and sends
+		 * them to TCP in one shot. It also leaves 'wroff' size
+		 * space for the headers in each mblk.
+		 *
+		 * ii) for a total size bigger than 'tcp_slow_start_initial'
+		 * times maxblk, it's probably real file data which is
+		 * dominating. So it's better to use sendvec_chunk because
+		 * performance goes to the dogs if we don't do pagesize
+		 * reads. sendvec_chunk will do pagesize reads and write
+		 * them in pagesize mblks to TCP.
+		 *
+		 * Side Notes: A write to a file has not been optimized.
+		 * Future zero copy code will plug into sendvec_chunk
+		 * only because doing zero copy for files smaller than
+		 * pagesize is useless.
+		 *
+		 * Note, if the socket has NL7C enabled then call NL7C's
+		 * sendfilev() function to give NL7C a chance to copy
+		 * the vec for caching, then continue processing as
+		 * normal.
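+		 *
+		 * For reference, a hedged userland sketch of the interface
+		 * this function implements (illustrative only; 'sock' is a
+		 * connected TCP socket, 'fd' a regular file, and 'sb' came
+		 * from a prior fstat(fd)):
+		 *
+		 *	struct sendfilevec v;
+		 *	size_t xfer;
+		 *
+		 *	v.sfv_fd = fd;
+		 *	v.sfv_flag = 0;
+		 *	v.sfv_off = 0;
+		 *	v.sfv_len = sb.st_size;
+		 *	if (sendfilev(sock, &v, 1, &xfer) == -1)
+		 *		(on a mid-transfer error, 'xfer' still
+		 *		reports the bytes already sent)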
+ */ + if (is_sock) { + switch (so->so_family) { + case AF_INET: + case AF_INET6: + if (so->so_nl7c_flags != 0) { + nl7c_sendfilev(so, fileoff, + sfv, copy_cnt); + } + if (total_size <= (4 * maxblk)) + error = sendvec_small_chunk(fp, + &fileoff, sfv, copy_cnt, + total_size, maxblk, &count); + else + error = sendvec_chunk(fp, &fileoff, + sfv, copy_cnt, &count); + break; + case AF_NCA: + error = nca_sendfilev(fp, sfv, copy_cnt, + &count); + break; + } + } else { + ASSERT(vp->v_type == VREG); + error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt, + &count); + } + + +#ifdef _SYSCALL32_IMPL + if (get_udatamodel() == DATAMODEL_ILP32) + copy_vec = (const struct sendfilevec *)((char *)copy_vec + + (copy_cnt * sizeof (ksendfilevec32_t))); + else +#endif + copy_vec += copy_cnt; + sfvcnt -= copy_cnt; + } while (sfvcnt > 0); + + if (vp->v_type == VREG) + fp->f_offset += count; + + + VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); + +#ifdef _SYSCALL32_IMPL + if (get_udatamodel() == DATAMODEL_ILP32) { + ssize32_t count32 = (ssize32_t)count; + if (copyout(&count32, xferred, sizeof (count32))) + error = EFAULT; + releasef(fildes); + if (error != 0) + return (set_errno(error)); + return (count32); + } +#endif + if (copyout(&count, xferred, sizeof (count))) + error = EFAULT; + releasef(fildes); + if (error != 0) + return (set_errno(error)); + return (count); +err: + ASSERT(error != 0); + releasef(fildes); + return (set_errno(error)); +} diff --git a/usr/src/uts/common/syscall/sigaction.c b/usr/src/uts/common/syscall/sigaction.c new file mode 100644 index 0000000000..8a38de67b8 --- /dev/null +++ b/usr/src/uts/common/syscall/sigaction.c @@ -0,0 +1,231 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/user.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/fault.h> +#include <sys/signal.h> +#include <sys/siginfo.h> +#include <sys/debug.h> + +int +sigaction(int sig, struct sigaction *actp, struct sigaction *oactp) +{ + struct sigaction act; + struct sigaction oact; + k_sigset_t set; + proc_t *p; + int sigcld_look = 0; + + if (sig <= 0 || sig >= NSIG || + (actp != NULL && sigismember(&cantmask, sig))) + return (set_errno(EINVAL)); + + /* + * act and oact might be the same address, so copyin act first. 
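+	 * The aliasing arises from userland calls such as the hedged
+	 * sketch below, where one buffer both supplies the new action
+	 * and receives the old one ('handler' assumed defined):
+	 *
+	 *	struct sigaction sa;
+	 *
+	 *	sa.sa_handler = handler;
+	 *	sigemptyset(&sa.sa_mask);
+	 *	sa.sa_flags = SA_RESTART;
+	 *	(void) sigaction(SIGINT, &sa, &sa);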
+ */ + if (actp) { +#if defined(__sparc) + void (*handler)(); +#endif + if (copyin(actp, &act, sizeof (act))) + return (set_errno(EFAULT)); +#if defined(__sparc) + /* + * Check alignment of handler + */ + handler = act.sa_handler; + if (handler != SIG_IGN && handler != SIG_DFL && + ((uintptr_t)handler & 0x3) != 0) + return (set_errno(EINVAL)); +#endif + } + + p = curproc; + mutex_enter(&p->p_lock); + + if (oactp) { + int flags; + void (*disp)(); + + disp = u.u_signal[sig - 1]; + + flags = 0; + if (disp != SIG_DFL && disp != SIG_IGN) { + set = u.u_sigmask[sig-1]; + if (sigismember(&p->p_siginfo, sig)) + flags |= SA_SIGINFO; + if (sigismember(&u.u_sigrestart, sig)) + flags |= SA_RESTART; + if (sigismember(&u.u_sigonstack, sig)) + flags |= SA_ONSTACK; + if (sigismember(&u.u_sigresethand, sig)) + flags |= SA_RESETHAND; + if (sigismember(&u.u_signodefer, sig)) + flags |= SA_NODEFER; + } else + sigemptyset(&set); + + if (sig == SIGCLD) { + if (p->p_flag & SNOWAIT) + flags |= SA_NOCLDWAIT; + if (!(p->p_flag & SJCTL)) + flags |= SA_NOCLDSTOP; + } + + oact.sa_handler = disp; + oact.sa_flags = flags; + sigktou(&set, &oact.sa_mask); + } + + if (actp) { + if (sig == SIGCLD && + act.sa_handler != SIG_IGN && + act.sa_handler != SIG_DFL) + sigcld_look = 1; + + sigutok(&act.sa_mask, &set); + setsigact(sig, act.sa_handler, set, act.sa_flags); + } + + mutex_exit(&p->p_lock); + + if (sigcld_look) + sigcld_repost(); + + if (oactp && + copyout(&oact, oactp, sizeof (oact))) + return (set_errno(EFAULT)); + + return (0); +} + +#ifdef _SYSCALL32_IMPL + +int +sigaction32(int sig, struct sigaction32 *actp, struct sigaction32 *oactp) +{ + struct sigaction32 act32; + struct sigaction32 oact32; + k_sigset_t set; + proc_t *p; + int sigcld_look = 0; + + if (sig <= 0 || sig >= NSIG || + (actp != NULL && sigismember(&cantmask, sig))) + return (set_errno(EINVAL)); + + /* + * act and oact might be the same address, so copyin act first. 
+ */ + if (actp) { +#if defined(__sparc) + void (*handler)(); +#endif + if (copyin(actp, &act32, sizeof (act32))) + return (set_errno(EFAULT)); +#if defined(__sparc) + /* + * Check alignment of handler + */ + handler = (void (*)())act32.sa_handler; + if (handler != SIG_IGN && handler != SIG_DFL && + ((uintptr_t)handler & 0x3) != 0) + return (set_errno(EINVAL)); +#endif + } + + p = curproc; + mutex_enter(&p->p_lock); + + if (oactp) { + int flags; + void (*disp)(); + + disp = u.u_signal[sig - 1]; + + flags = 0; + if (disp != SIG_DFL && disp != SIG_IGN) { + set = u.u_sigmask[sig-1]; + if (sigismember(&p->p_siginfo, sig)) + flags |= SA_SIGINFO; + if (sigismember(&u.u_sigrestart, sig)) + flags |= SA_RESTART; + if (sigismember(&u.u_sigonstack, sig)) + flags |= SA_ONSTACK; + if (sigismember(&u.u_sigresethand, sig)) + flags |= SA_RESETHAND; + if (sigismember(&u.u_signodefer, sig)) + flags |= SA_NODEFER; + } else + sigemptyset(&set); + + if (sig == SIGCLD) { + if (p->p_flag & SNOWAIT) + flags |= SA_NOCLDWAIT; + if (!(p->p_flag & SJCTL)) + flags |= SA_NOCLDSTOP; + } + + oact32.sa_handler = (caddr32_t)(uintptr_t)disp; + oact32.sa_flags = flags; + sigktou(&set, &oact32.sa_mask); + } + + if (actp) { + if (sig == SIGCLD && + act32.sa_handler != (caddr32_t)SIG_IGN && + act32.sa_handler != (caddr32_t)SIG_DFL) + sigcld_look = 1; + + sigutok(&act32.sa_mask, &set); + setsigact(sig, (void (*)())(uintptr_t)act32.sa_handler, set, + act32.sa_flags); + } + + mutex_exit(&p->p_lock); + + if (sigcld_look) + sigcld_repost(); + + if (oactp && + copyout(&oact32, oactp, sizeof (oact32))) + return (set_errno(EFAULT)); + + return (0); +} +#endif /* _SYSCALL32_IMPL */ diff --git a/usr/src/uts/common/syscall/sigaltstack.c b/usr/src/uts/common/syscall/sigaltstack.c new file mode 100644 index 0000000000..4c310390b3 --- /dev/null +++ b/usr/src/uts/common/syscall/sigaltstack.c @@ -0,0 +1,121 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ + + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/fault.h> +#include <sys/signal.h> +#include <sys/siginfo.h> +#include <sys/debug.h> + +int +sigaltstack(struct sigaltstack *ssp, struct sigaltstack *oss) +{ + klwp_t *lwp = ttolwp(curthread); + struct sigaltstack ss; + + /* + * User's oss and ss might be the same address, so copyin first and + * save before copying out. 
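+	 * A hedged userland sketch of the common use (illustrative
+	 * only); the kernel below insists on ss_size >= MINSIGSTKSZ
+	 * when enabling, and rejects flags other than SS_DISABLE:
+	 *
+	 *	stack_t ss;
+	 *
+	 *	ss.ss_sp = malloc(SIGSTKSZ);
+	 *	ss.ss_size = SIGSTKSZ;
+	 *	ss.ss_flags = 0;
+	 *	(void) sigaltstack(&ss, NULL);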
+ */ + if (ssp) { + if (lwp->lwp_sigaltstack.ss_flags & SS_ONSTACK) + return (set_errno(EPERM)); + if (copyin(ssp, &ss, sizeof (ss))) + return (set_errno(EFAULT)); + if (ss.ss_flags & ~SS_DISABLE) + return (set_errno(EINVAL)); + if (!(ss.ss_flags & SS_DISABLE) && ss.ss_size < MINSIGSTKSZ) + return (set_errno(ENOMEM)); + } + + if (oss) { + if (copyout(&lwp->lwp_sigaltstack, + oss, sizeof (struct sigaltstack))) + return (set_errno(EFAULT)); + } + + if (ssp) + lwp->lwp_sigaltstack = ss; + + return (0); +} + +#ifdef _LP64 +int +sigaltstack32(struct sigaltstack32 *ssp, struct sigaltstack32 *oss) +{ + klwp_t *lwp = ttolwp(curthread); + struct sigaltstack *ss; + struct sigaltstack32 ss32, oss32; + + /* + * User's oss and ss might be the same address, so copyin first and + * save before copying out. + */ + if (ssp) { + if (lwp->lwp_sigaltstack.ss_flags & SS_ONSTACK) + return (set_errno(EPERM)); + if (copyin(ssp, &ss32, sizeof (ss32))) + return (set_errno(EFAULT)); + if (ss32.ss_flags & ~SS_DISABLE) + return (set_errno(EINVAL)); + if (!(ss32.ss_flags & SS_DISABLE) && ss32.ss_size < MINSIGSTKSZ) + return (set_errno(ENOMEM)); + } + + if (oss) { + /* + * copy to ILP32 struct before copyout. + */ + ss = &lwp->lwp_sigaltstack; + oss32.ss_sp = (caddr32_t)(uintptr_t)ss->ss_sp; + oss32.ss_size = (size32_t)ss->ss_size; + oss32.ss_flags = ss->ss_flags; + + if (copyout(&oss32, oss, sizeof (oss32))) + return (set_errno(EFAULT)); + } + + if (ssp) { + ss = &lwp->lwp_sigaltstack; + ss->ss_sp = (void *)(uintptr_t)ss32.ss_sp; + ss->ss_size = (size_t)ss32.ss_size; + ss->ss_flags = ss32.ss_flags; + } + + return (0); +} +#endif /* _LP64 */ diff --git a/usr/src/uts/common/syscall/signotify.c b/usr/src/uts/common/syscall/signotify.c new file mode 100644 index 0000000000..0c32a0cd5e --- /dev/null +++ b/usr/src/uts/common/syscall/signotify.c @@ -0,0 +1,226 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/procset.h> +#include <sys/fault.h> +#include <sys/signal.h> +#include <sys/siginfo.h> +#include <vm/as.h> +#include <sys/debug.h> +#include <sys/contract/process_impl.h> + +/*ARGSUSED*/ +static int +copyin_siginfo(model_t datamodel, void *uaddr, k_siginfo_t *ksip) +{ +#ifdef _SYSCALL32_IMPL + int ret; + + if (datamodel == DATAMODEL_NATIVE) { +#endif + return (copyin(uaddr, ksip, sizeof (k_siginfo_t))); +#ifdef _SYSCALL32_IMPL + } else { + siginfo32_t si32; + + if (ret = copyin(uaddr, &si32, sizeof (si32))) + return (ret); + + siginfo_32tok(&si32, ksip); + } + + return (0); +#endif +} + +/* + * To find secured 64 bit id for signotify() call + * This depends upon as_getmemid() which returns + * unique vnode/offset for a user virtual address. + */ +static u_longlong_t +get_sigid(proc_t *p, caddr_t addr) +{ + u_longlong_t snid = 0; + memid_t memid; + quad_t *tquad = (quad_t *)&snid; + + if (!as_getmemid(p->p_as, addr, &memid)) { + tquad->val[0] = (int)memid.val[0]; + tquad->val[1] = (int)memid.val[1]; + } + return (snid); +} + +#define SIGN_PTR(p, n) &((signotifyq_t *)(&p->p_signhdr[1]))[n]; + +int +signotify(int cmd, siginfo_t *siginfo, signotify_id_t *sn_id) +{ + k_siginfo_t info; + signotify_id_t id; + proc_t *p; + proc_t *cp = curproc; + signotifyq_t *snqp; + struct cred *cr; + sigqueue_t *sqp; + sigqhdr_t *sqh; + u_longlong_t sid; + model_t datamodel = get_udatamodel(); + + if (copyin(sn_id, &id, sizeof (signotify_id_t))) + return (set_errno(EFAULT)); + + if (id.sn_index >= _SIGNOTIFY_MAX || id.sn_index < 0) + return (set_errno(EINVAL)); + + switch (cmd) { + case SN_PROC: + /* get snid for the given user address of signotifyid_t */ + sid = get_sigid(cp, (caddr_t)sn_id); + + if (id.sn_pid > 0) { + mutex_enter(&pidlock); + if ((p = prfind(id.sn_pid)) != NULL) { + mutex_enter(&p->p_lock); + if (p->p_signhdr != NULL) { + snqp = SIGN_PTR(p, id.sn_index); + if (snqp->sn_snid == sid) { + mutex_exit(&p->p_lock); + mutex_exit(&pidlock); + return (set_errno(EBUSY)); + } + } + mutex_exit(&p->p_lock); + } + mutex_exit(&pidlock); + } + + if (copyin_siginfo(datamodel, siginfo, &info)) + return (set_errno(EFAULT)); + + /* The si_code value must indicate the signal will be queued */ + if (!sigwillqueue(info.si_signo, info.si_code)) + return (set_errno(EINVAL)); + + if (cp->p_signhdr == NULL) { + /* Allocate signotify pool first time */ + sqh = sigqhdralloc(sizeof (signotifyq_t), + _SIGNOTIFY_MAX); + mutex_enter(&cp->p_lock); + if (cp->p_signhdr == NULL) { + /* hang the pool head on proc */ + cp->p_signhdr = sqh; + } else { + /* another lwp allocated the pool, free ours */ + sigqhdrfree(sqh); + } + } else { + mutex_enter(&cp->p_lock); + } + + sqp = sigqalloc(cp->p_signhdr); + if (sqp == NULL) { + mutex_exit(&cp->p_lock); + return (set_errno(EAGAIN)); + } + cr = CRED(); + sqp->sq_info = info; + sqp->sq_info.si_pid = cp->p_pid; + sqp->sq_info.si_ctid = PRCTID(cp); + sqp->sq_info.si_zoneid = getzoneid(); + sqp->sq_info.si_uid = crgetruid(cr); + + /* fill the signotifyq_t fields */ + ((signotifyq_t *)sqp)->sn_snid = sid; + + mutex_exit(&cp->p_lock); + + /* complete the signotify_id_t fields */ + id.sn_index = (signotifyq_t *)sqp - SIGN_PTR(cp, 0); + id.sn_pid = cp->p_pid; + + break; + + case SN_CANCEL: + case SN_SEND: + + mutex_enter(&pidlock); + if ((id.sn_pid <= 0) || ((p = prfind(id.sn_pid)) == NULL)) { 
+ mutex_exit(&pidlock); + return (set_errno(EINVAL)); + } + mutex_enter(&p->p_lock); + mutex_exit(&pidlock); + + if (p->p_signhdr == NULL) { + mutex_exit(&p->p_lock); + return (set_errno(EINVAL)); + } + + snqp = SIGN_PTR(p, id.sn_index); + + if (snqp->sn_snid == 0) { + mutex_exit(&p->p_lock); + return (set_errno(EINVAL)); + } + + if (snqp->sn_snid != get_sigid(cp, (caddr_t)sn_id)) { + mutex_exit(&p->p_lock); + return (set_errno(EINVAL)); + } + + snqp->sn_snid = 0; + + /* cmd == SN_CANCEL or signo == 0 (SIGEV_NONE) */ + if (((sigqueue_t *)snqp)->sq_info.si_signo <= 0) + cmd = SN_CANCEL; + + sigqsend(cmd, p, 0, (sigqueue_t *)snqp); + mutex_exit(&p->p_lock); + + id.sn_pid = 0; + id.sn_index = 0; + + break; + + default : + return (set_errno(EINVAL)); + } + + if (copyout(&id, sn_id, sizeof (signotify_id_t))) + return (set_errno(EFAULT)); + + return (0); +} diff --git a/usr/src/uts/common/syscall/sigpending.c b/usr/src/uts/common/syscall/sigpending.c new file mode 100644 index 0000000000..5801d09d26 --- /dev/null +++ b/usr/src/uts/common/syscall/sigpending.c @@ -0,0 +1,72 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 1994-2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/fault.h> +#include <sys/signal.h> +#include <sys/schedctl.h> +#include <sys/debug.h> + +int +sigpending(int flag, sigset_t *setp) +{ + sigset_t set; + k_sigset_t kset; + proc_t *p; + + switch (flag) { + case 1: /* sigpending */ + p = ttoproc(curthread); + mutex_enter(&p->p_lock); + schedctl_finish_sigblock(curthread); + kset = p->p_sig; + sigorset(&kset, &curthread->t_sig); + sigandset(&kset, &curthread->t_hold); + mutex_exit(&p->p_lock); + break; + case 2: /* sigfillset */ + kset = fillset; + break; + default: + return (set_errno(EINVAL)); + } + + sigktou(&kset, &set); + if (copyout((caddr_t)&set, (caddr_t)setp, sizeof (sigset_t))) + return (set_errno(EFAULT)); + return (0); +} diff --git a/usr/src/uts/common/syscall/sigprocmask.c b/usr/src/uts/common/syscall/sigprocmask.c new file mode 100644 index 0000000000..8f7cf6113d --- /dev/null +++ b/usr/src/uts/common/syscall/sigprocmask.c @@ -0,0 +1,127 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ + + +/* + * Copyright 1994-2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/fault.h> +#include <sys/signal.h> +#include <sys/schedctl.h> +#include <sys/debug.h> + +int64_t +lwp_sigmask(int how, uint_t bits0, uint_t bits1) +{ + kthread_t *t = curthread; + proc_t *p = ttoproc(t); + rval_t rv; + + mutex_enter(&p->p_lock); + schedctl_finish_sigblock(t); + + bits0 &= (FILLSET0 & ~CANTMASK0); + bits1 &= (FILLSET1 & ~CANTMASK1); + + rv.r_val1 = t->t_hold.__sigbits[0]; + rv.r_val2 = t->t_hold.__sigbits[1]; + + switch (how) { + case SIG_BLOCK: + t->t_hold.__sigbits[0] |= bits0; + t->t_hold.__sigbits[1] |= bits1; + break; + case SIG_UNBLOCK: + t->t_hold.__sigbits[0] &= ~bits0; + t->t_hold.__sigbits[1] &= ~bits1; + if (sigcheck(p, t)) + t->t_sig_check = 1; + break; + case SIG_SETMASK: + t->t_hold.__sigbits[0] = bits0; + t->t_hold.__sigbits[1] = bits1; + if (sigcheck(p, t)) + t->t_sig_check = 1; + break; + } + + mutex_exit(&p->p_lock); + return (rv.r_vals); +} + +/* + * This system call is no longer called from libc. + * It exists solely for the benefit of statically-linked + * binaries from the past. It should be eliminated. + */ +int +sigprocmask(int how, sigset_t *setp, sigset_t *osetp) +{ + sigset_t set; + k_sigset_t kset; + rval_t rv; + + /* + * User's oset and set might be the same address, so copyin first and + * save before copying out. + */ + if (setp) { + switch (how) { + case SIG_BLOCK: + case SIG_UNBLOCK: + case SIG_SETMASK: + break; + default: + return (set_errno(EINVAL)); + } + if (copyin((caddr_t)setp, (caddr_t)&set, sizeof (sigset_t))) + return (set_errno(EFAULT)); + sigutok(&set, &kset); + } else { + /* none of SIG_BLOCK, SIG_UNBLOCK, SIG_SETMASK equals 0 */ + how = 0; + sigemptyset(&kset); + } + + rv.r_vals = lwp_sigmask(how, kset.__sigbits[0], kset.__sigbits[1]); + + if (osetp) { + kset.__sigbits[0] = rv.r_val1; + kset.__sigbits[1] = rv.r_val2; + sigktou(&kset, &set); + if (copyout((caddr_t)&set, (caddr_t)osetp, sizeof (sigset_t))) + return (set_errno(EFAULT)); + } + + return (0); +} diff --git a/usr/src/uts/common/syscall/sigqueue.c b/usr/src/uts/common/syscall/sigqueue.c new file mode 100644 index 0000000000..38c5b91202 --- /dev/null +++ b/usr/src/uts/common/syscall/sigqueue.c @@ -0,0 +1,185 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 1998-2000 by Sun Microsystems, Inc.
+ * All rights reserved.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+
+
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/procset.h>
+#include <sys/fault.h>
+#include <sys/signal.h>
+#include <sys/siginfo.h>
+#include <sys/debug.h>
+
+static int
+sigqkill(pid_t pid, int signo, sigsend_t *sigsend)
+{
+ register proc_t *p;
+ int error;
+
+ if (signo < 0 || signo >= NSIG)
+ return (set_errno(EINVAL));
+
+ if (pid == -1) {
+ procset_t set;
+
+ setprocset(&set, POP_AND, P_ALL, P_MYID, P_ALL, P_MYID);
+ error = sigsendset(&set, sigsend);
+ } else if (pid > 0) {
+ mutex_enter(&pidlock);
+ if ((p = prfind(pid)) == NULL || p->p_stat == SIDL)
+ error = ESRCH;
+ else {
+ error = sigsendproc(p, sigsend);
+ if (error == 0 && sigsend->perm == 0)
+ error = EPERM;
+ }
+ mutex_exit(&pidlock);
+ } else {
+ int nfound = 0;
+ pid_t pgid;
+
+ if (pid == 0)
+ pgid = ttoproc(curthread)->p_pgrp;
+ else
+ pgid = -pid;
+
+ error = 0;
+ mutex_enter(&pidlock);
+ for (p = pgfind(pgid); p && !error; p = p->p_pglink) {
+ if (p->p_stat != SIDL) {
+ nfound++;
+ error = sigsendproc(p, sigsend);
+ }
+ }
+ mutex_exit(&pidlock);
+ if (nfound == 0)
+ error = ESRCH;
+ else if (error == 0 && sigsend->perm == 0)
+ error = EPERM;
+ }
+
+ if (error)
+ return (set_errno(error));
+ return (0);
+}
+
+
+/*
+ * For implementations that don't require binary compatibility,
+ * the kill system call may be made into a library call to the
+ * sigsend system call.
+ */
+int
+kill(pid_t pid, int sig)
+{
+ sigsend_t v;
+
+ bzero(&v, sizeof (v));
+ v.sig = sig;
+ v.checkperm = 1;
+ v.sicode = SI_USER;
+
+ return (sigqkill(pid, sig, &v));
+}
+
+/*
+ * The handling of small unions, like the sigval argument to sigqueue,
+ * is architecture dependent. We have adopted the convention that the
+ * value itself is passed in the storage which crosses the kernel
+ * protection boundary. This procedure will accept a scalar argument,
+ * and store it in the appropriate value member of the sigsend_t structure.
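+ *
+ * As an illustrative sketch only (user-level code, with an invented
+ * pid variable), a caller queues a scalar through the libc wrapper
+ * roughly like this:
+ *
+ *   union sigval sv;
+ *   sv.sival_int = 42;
+ *   (void) sigqueue(pid, SIGRTMIN, sv);
+ *
+ * The wrapper passes the union's storage across the boundary, and
+ * the code below captures it in the value member of the sigsend_t.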
+ */ +int +sigqueue(pid_t pid, int signo, /* union sigval */ void *value, int si_code) +{ + sigsend_t v; + sigqhdr_t *sqh; + proc_t *p = curproc; + + /* The si_code value must indicate the signal will be queued */ + if (pid <= 0 || !sigwillqueue(signo, si_code)) + return (set_errno(EINVAL)); + + if (p->p_sigqhdr == NULL) { + /* Allocate sigqueue pool first time */ + sqh = sigqhdralloc(sizeof (sigqueue_t), _SIGQUEUE_MAX); + mutex_enter(&p->p_lock); + if (p->p_sigqhdr == NULL) { + /* hang the pool head on proc */ + p->p_sigqhdr = sqh; + } else { + /* another lwp allocated the pool, free ours */ + sigqhdrfree(sqh); + } + mutex_exit(&p->p_lock); + } + + bzero(&v, sizeof (v)); + v.sig = signo; + v.checkperm = 1; + v.sicode = si_code; + v.value.sival_ptr = value; + + return (sigqkill(pid, signo, &v)); +} + +#ifdef _SYSCALL32_IMPL +/* + * sigqueue32 - System call entry point for 32-bit callers on LP64 kernel, + * needed to handle the 32-bit sigvals as correctly as we can. We always + * assume that a 32-bit caller is passing an int. A 64-bit recipient + * that expects an int will therefore get it correctly. A 32-bit + * recipient will also get it correctly since siginfo_kto32() uses + * sival_int in the conversion. Since a 32-bit pointer has the same + * size and address in the sigval, it also converts correctly so that + * two 32-bit apps can exchange a pointer value. However, this means + * that a pointer sent by a 32-bit caller will be seen in the upper half + * by a 64-bit recipient, and only the upper half of a 64-bit pointer will + * be seen by a 32-bit recipient. This is the best solution that does + * not require severe hacking of the sigval union. Anyways, what it + * means to be sending pointers between processes with dissimilar + * models is unclear. + */ +int +sigqueue32(pid_t pid, int signo, /* union sigval32 */ caddr32_t value, + int si_code) +{ + union sigval sv; + + bzero(&sv, sizeof (sv)); + sv.sival_int = (int)value; + return (sigqueue(pid, signo, sv.sival_ptr, si_code)); +} +#endif diff --git a/usr/src/uts/common/syscall/sigsendset.c b/usr/src/uts/common/syscall/sigsendset.c new file mode 100644 index 0000000000..fa40d76502 --- /dev/null +++ b/usr/src/uts/common/syscall/sigsendset.c @@ -0,0 +1,67 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* Copyright (c) 1994 Sun Microsystems, Inc. */ +/* All Rights Reserved */ + + +/* + * Copyright (c) 1998 by Sun Microsystems, Inc. + * All rights reserved. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/fault.h> +#include <sys/procset.h> +#include <sys/signal.h> +#include <sys/siginfo.h> +#include <sys/debug.h> + +int +sigsendsys(procset_t *psp, int sig) +{ + int error; + procset_t set; + sigsend_t v; + + + if (sig < 0 || sig >= NSIG) + return (set_errno(EINVAL)); + + bzero(&v, sizeof (v)); + v.sig = sig; + v.checkperm = 1; + v.sicode = SI_USER; + + if (copyin((caddr_t)psp, (caddr_t)&set, sizeof (procset_t))) + return (set_errno(EFAULT)); + if (error = sigsendset(&set, &v)) + return (set_errno(error)); + return (0); +} diff --git a/usr/src/uts/common/syscall/sigsuspend.c b/usr/src/uts/common/syscall/sigsuspend.c new file mode 100644 index 0000000000..819bf787fc --- /dev/null +++ b/usr/src/uts/common/syscall/sigsuspend.c @@ -0,0 +1,66 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +/* + * Copyright 1994-2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/fault.h> +#include <sys/procset.h> +#include <sys/signal.h> +#include <sys/schedctl.h> +#include <sys/debug.h> + +int +sigsuspend(sigset_t *setp) +{ + sigset_t set; + k_sigset_t kset; + proc_t *p = curproc; + + if (copyin((caddr_t)setp, (caddr_t)&set, sizeof (sigset_t))) + return (set_errno(EFAULT)); + sigutok(&set, &kset); + mutex_enter(&p->p_lock); + schedctl_finish_sigblock(curthread); + ttolwp(curthread)->lwp_sigoldmask = curthread->t_hold; + curthread->t_hold = kset; + curthread->t_sig_check = 1; /* so post-syscall will re-evaluate */ + curthread->t_flag |= T_TOMASK; + /* pause() */ + while (cv_wait_sig_swap(&curthread->t_delay_cv, &p->p_lock)) + ; + mutex_exit(&p->p_lock); + return (set_errno(EINTR)); +} diff --git a/usr/src/uts/common/syscall/sigtimedwait.c b/usr/src/uts/common/syscall/sigtimedwait.c new file mode 100644 index 0000000000..ad4d79b763 --- /dev/null +++ b/usr/src/uts/common/syscall/sigtimedwait.c @@ -0,0 +1,207 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/bitmap.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/user.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/fault.h>
+#include <sys/procset.h>
+#include <sys/signal.h>
+#include <sys/siginfo.h>
+#include <sys/time.h>
+#include <sys/kmem.h>
+#include <sys/schedctl.h>
+#include <sys/debug.h>
+#include <sys/condvar_impl.h>
+#include <sys/model.h>
+#include <sys/sdt.h>
+#include <sys/zone.h>
+
+static int
+copyout_siginfo(model_t datamodel, k_siginfo_t *ksip, void *uaddr)
+{
+ zoneid_t zoneid = getzoneid();
+
+ if (datamodel == DATAMODEL_NATIVE) {
+ if (SI_FROMUSER(ksip) && zoneid != GLOBAL_ZONEID &&
+ zoneid != ksip->si_zoneid) {
+ k_siginfo_t sani_sip = *ksip;
+ sani_sip.si_pid = curproc->p_zone->zone_zsched->p_pid;
+ sani_sip.si_uid = 0;
+ sani_sip.si_ctid = -1;
+ sani_sip.si_zoneid = zoneid;
+ if (copyout(&sani_sip, uaddr, sizeof (sani_sip)))
+ return (set_errno(EFAULT));
+ } else {
+ if (copyout(ksip, uaddr, sizeof (*ksip)))
+ return (set_errno(EFAULT));
+ }
+ }
+#ifdef _SYSCALL32_IMPL
+ else {
+ siginfo32_t si32;
+
+ siginfo_kto32(ksip, &si32);
+ if (SI_FROMUSER(ksip) && zoneid != GLOBAL_ZONEID &&
+ zoneid != ksip->si_zoneid) {
+ si32.si_pid = curproc->p_zone->zone_zsched->p_pid;
+ si32.si_uid = 0;
+ si32.si_ctid = -1;
+ si32.si_zoneid = zoneid;
+ }
+ if (copyout(&si32, uaddr, sizeof (si32)))
+ return (set_errno(EFAULT));
+ }
+#endif
+ return (ksip->si_signo);
+}
+
+/*
+ * Wait until a signal within the specified set is posted, or until the
+ * optional time interval 'timeout' expires. The signal is caught but
+ * not delivered. The value of the signal is returned to the caller.
+ */
+int
+sigtimedwait(sigset_t *setp, siginfo_t *siginfop, timespec_t *timeoutp)
+{
+ sigset_t set;
+ k_sigset_t kset;
+ k_sigset_t oldmask;
+ kthread_t *t = curthread;
+ klwp_t *lwp = ttolwp(t);
+ proc_t *p = ttoproc(t);
+ timespec_t sig_timeout;
+ timespec_t *rqtp = NULL;
+ int timecheck = 0;
+ int ret;
+ int error = 0;
+ k_siginfo_t info, *infop;
+ model_t datamodel = get_udatamodel();
+
+ if (timeoutp) {
+ timespec_t now;
+
+ timecheck = timechanged;
+ gethrestime(&now);
+ if (datamodel == DATAMODEL_NATIVE) {
+ if (copyin(timeoutp, &sig_timeout,
+ sizeof (sig_timeout)))
+ return (set_errno(EFAULT));
+ } else {
+ timespec32_t timeout32;
+
+ if (copyin(timeoutp, &timeout32, sizeof (timeout32)))
+ return (set_errno(EFAULT));
+ TIMESPEC32_TO_TIMESPEC(&sig_timeout, &timeout32)
+ }
+
+ if (itimerspecfix(&sig_timeout))
+ return (set_errno(EINVAL));
+ /*
+ * Convert the timespec value into absolute time.
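+ *
+ * For example (illustrative numbers only): with now = {100, 500000000}
+ * and a relative timeout of {1, 700000000}, timespecadd() leaves the
+ * absolute deadline {102, 200000000} in sig_timeout, since the
+ * nanosecond fields sum to 1200000000 and normalize to one carried
+ * second plus 200000000 nsec.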
+ */ + timespecadd(&sig_timeout, &now); + rqtp = &sig_timeout; + } + if (copyin(setp, &set, sizeof (set))) + return (set_errno(EFAULT)); + sigutok(&set, &kset); + if (sigisempty(&kset)) + return (set_errno(EINVAL)); + + mutex_enter(&p->p_lock); + /* + * set the thread's signal mask to unmask + * those signals in the specified set. + */ + schedctl_finish_sigblock(t); + oldmask = t->t_hold; + sigdiffset(&t->t_hold, &kset); + + /* + * Wait until we take a signal or until + * the absolute future time is passed. + */ + while ((ret = cv_waituntil_sig(&t->t_delay_cv, &p->p_lock, + rqtp, timecheck)) > 0) + continue; + if (ret == -1) + error = EAGAIN; + + /* + * Restore thread's signal mask to its previous value. + */ + t->t_hold = oldmask; + t->t_sig_check = 1; /* so post_syscall sees new t_hold mask */ + + if (error) { + mutex_exit(&p->p_lock); + return (set_errno(error)); /* timer expired */ + } + /* + * Don't bother with signal if it is not in request set. + */ + if (lwp->lwp_cursig == 0 || !sigismember(&kset, lwp->lwp_cursig)) { + mutex_exit(&p->p_lock); + /* + * lwp_cursig is zero if pokelwps() awakened cv_wait_sig(). + * This happens if some other thread in this process called + * forkall() or exit(). + */ + return (set_errno(EINTR)); + } + + if (lwp->lwp_curinfo) + infop = &lwp->lwp_curinfo->sq_info; + else { + infop = &info; + bzero(infop, sizeof (info)); + infop->si_signo = lwp->lwp_cursig; + infop->si_code = SI_NOINFO; + } + + lwp->lwp_ru.nsignals++; + ret = lwp->lwp_cursig; + DTRACE_PROC2(signal__clear, int, ret, ksiginfo_t *, infop); + lwp->lwp_cursig = 0; + lwp->lwp_extsig = 0; + mutex_exit(&p->p_lock); + + if (siginfop) + ret = copyout_siginfo(datamodel, infop, siginfop); + if (lwp->lwp_curinfo) { + siginfofree(lwp->lwp_curinfo); + lwp->lwp_curinfo = NULL; + } + return (ret); +} diff --git a/usr/src/uts/common/syscall/ssig.c b/usr/src/uts/common/syscall/ssig.c new file mode 100644 index 0000000000..e0998f474b --- /dev/null +++ b/usr/src/uts/common/syscall/ssig.c @@ -0,0 +1,169 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/user.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/fault.h> +#include <sys/procset.h> +#include <sys/signal.h> +#include <sys/schedctl.h> +#include <sys/debug.h> + + +/* + * ssig() is the old common entry for signal, sigset, sighold, + * sigrelse, sigignore and sigpause. 
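+ *
+ * The operation is selected by flag bits OR-ed into the signo
+ * argument (SIGHOLD, SIGRELSE, SIGIGNORE, SIGPAUSE, SIGDEFER, or
+ * none of them for plain signal()); illustratively, the old libc
+ * sighold(sig) would reduce to something like ssig(sig | SIGHOLD, f),
+ * with the handler argument unused on that path.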
+ *
+ * All of these interfaces have been reimplemented in libc using
+ * calls to sigaction, sigsuspend and sigprocmask.
+ *
+ * This kernel interface is no longer called by any application
+ * that is dynamically linked with libc. It exists solely for
+ * the benefit of really old statically-linked applications.
+ * It should be removed from the system.
+ */
+
+int
+ssig(int signo, void (*func)())
+{
+ int sig;
+ struct proc *p;
+ int flags;
+ int retval = 0;
+ int sigcld_look = 0;
+
+ sig = signo & SIGNO_MASK;
+
+ if (sig <= 0 || sig >= NSIG || sigismember(&cantmask, sig))
+ return (set_errno(EINVAL));
+
+ p = ttoproc(curthread);
+ mutex_enter(&p->p_lock);
+ schedctl_finish_sigblock(curthread);
+ switch (signo & ~SIGNO_MASK) {
+
+ case SIGHOLD: /* sighold */
+ sigaddset(&curthread->t_hold, sig);
+ mutex_exit(&p->p_lock);
+ return (0);
+
+ case SIGRELSE: /* sigrelse */
+ sigdelset(&curthread->t_hold, sig);
+ curthread->t_sig_check = 1; /* so ISSIG will see release */
+ mutex_exit(&p->p_lock);
+ return (0);
+
+ case SIGPAUSE: /* sigpause */
+ sigdelset(&curthread->t_hold, sig);
+ curthread->t_sig_check = 1; /* so ISSIG will see release */
+ /* pause() */
+ while (cv_wait_sig_swap(&curthread->t_delay_cv, &p->p_lock))
+ ;
+ mutex_exit(&p->p_lock);
+ return (set_errno(EINTR));
+
+ case SIGIGNORE: /* sigignore */
+ sigdelset(&curthread->t_hold, sig);
+ curthread->t_sig_check = 1; /* so ISSIG will see release */
+ func = SIG_IGN;
+ flags = 0;
+ break;
+
+ case SIGDEFER: /* sigset */
+ if (sigismember(&curthread->t_hold, sig))
+ retval = (int)SIG_HOLD;
+ else
+ retval = (int)(uintptr_t)u.u_signal[sig-1];
+ if (func == SIG_HOLD) {
+ sigaddset(&curthread->t_hold, sig);
+ mutex_exit(&p->p_lock);
+ return (retval);
+ }
+
+#if defined(__sparc)
+ /*
+ * Check alignment of handler
+ */
+ if (func != SIG_IGN && func != SIG_DFL &&
+ ((uintptr_t)func & 0x3) != 0) {
+ mutex_exit(&p->p_lock);
+ return (set_errno(EINVAL));
+ }
+#endif
+ sigdelset(&curthread->t_hold, sig);
+ curthread->t_sig_check = 1; /* so post_syscall sees it */
+ flags = 0;
+ break;
+
+ case 0: /* signal */
+#if defined(__sparc)
+ /*
+ * Check alignment of handler
+ */
+ if (func != SIG_IGN && func != SIG_DFL &&
+ ((uintptr_t)func & 0x3) != 0) {
+ mutex_exit(&p->p_lock);
+ return (set_errno(EINVAL));
+ }
+#endif
+ retval = (int)(uintptr_t)u.u_signal[sig-1];
+ flags = SA_RESETHAND|SA_NODEFER;
+ break;
+
+ default: /* error */
+ mutex_exit(&p->p_lock);
+ return (set_errno(EINVAL));
+ }
+
+ if (sigismember(&stopdefault, sig))
+ flags |= SA_RESTART;
+ else if (sig == SIGCLD) {
+ flags |= SA_NOCLDSTOP;
+ if (func == SIG_IGN)
+ flags |= SA_NOCLDWAIT;
+ else if (func != SIG_DFL)
+ sigcld_look = 1;
+ }
+
+ setsigact(sig, func, nullsmask, flags);
+ mutex_exit(&p->p_lock);
+
+ if (sigcld_look)
+ sigcld_repost();
+
+ return (retval);
+}
diff --git a/usr/src/uts/common/syscall/stat.c b/usr/src/uts/common/syscall/stat.c
new file mode 100644
index 0000000000..b9505ebaa1
--- /dev/null
+++ b/usr/src/uts/common/syscall/stat.c
@@ -0,0 +1,675 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Get file attribute information through a file name or a file descriptor. + */ + +#include <sys/param.h> +#include <sys/isa_defs.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/cred.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/fcntl.h> +#include <sys/pathname.h> +#include <sys/stat.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/mode.h> +#include <sys/file.h> +#include <sys/proc.h> +#include <sys/uio.h> +#include <sys/ioreq.h> +#include <sys/debug.h> +#include <sys/cmn_err.h> +#include <c2/audit.h> + +/* + * Get the vp to be stated and the cred to be used for the call + * to VOP_GETATTR + */ + +/* + * nmflag has the following values + * + * 1 - Always do lookup. i.e. stat, lstat. + * 2 - Name is optional i.e. fstatat + * 0 - Don't lookup name, vp is in file_p. i.e. fstat + * + */ +static int +cstatat_getvp(int fd, char *name, int nmflag, + int follow, vnode_t **vp, cred_t **cred) +{ + vnode_t *startvp; + file_t *fp; + int error; + cred_t *cr; + + *vp = NULL; + + /* + * Only return EFAULT for fstatat when fd == AT_FDCWD && name == NULL + */ + + if (fd == AT_FDCWD) { + if (name != NULL || nmflag != 2) { + startvp = NULL; + cr = CRED(); + crhold(cr); + } else + return (EFAULT); + } else { + char startchar; + + if (nmflag == 1 || (nmflag == 2 && name != NULL)) { + if (copyin(name, &startchar, sizeof (char))) + return (EFAULT); + } else { + startchar = '\0'; + } + if (startchar != '/' || nmflag == 0) { + if ((fp = getf(fd)) == NULL) { + return (EBADF); + } + startvp = fp->f_vnode; + cr = fp->f_cred; + crhold(cr); + VN_HOLD(startvp); + releasef(fd); + } else { + startvp = NULL; + cr = CRED(); + crhold(cr); + } + } + *cred = cr; + +#ifdef C2_AUDIT + if (audit_active) + audit_setfsat_path(1); +#endif /* C2_AUDIT */ + + + if (nmflag == 1 || (nmflag == 2 && name != NULL)) { +lookup: + if (error = lookupnameat(name, UIO_USERSPACE, follow, NULLVPP, + vp, startvp)) { + if (error == ESTALE) + goto lookup; + if (startvp != NULL) + VN_RELE(startvp); + crfree(cr); + return (error); + } + if (startvp != NULL) + VN_RELE(startvp); + } else { + *vp = startvp; + } + + return (0); +} + +/* + * Native syscall interfaces: + * + * N-bit kernel, N-bit applications, N-bit file offsets + */ + +static int cstatat(int, char *, int, struct stat *, int, int); +static int cstat(vnode_t *vp, struct stat *, int, cred_t *); + +int +stat(char *fname, struct stat *sb) +{ + return (cstatat(AT_FDCWD, fname, 1, sb, 0, ATTR_REAL)); +} + +int +lstat(char *fname, struct stat *sb) +{ + return (cstatat(AT_FDCWD, fname, 1, sb, AT_SYMLINK_NOFOLLOW, 0)); +} + +/* + * fstat can and should be fast, do an inline implementation here. 
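+ *
+ * The body is a macro (FSTAT_BODY, below) so that fstat, fstat32,
+ * fstat64 and fstat64_32 can each expand the same fast path with
+ * their own stat structure type and per-type copy routine.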
+ */
+#define FSTAT_BODY(fd, sb, statfn) \
+ { \
+ file_t *fp; \
+ int error; \
+ \
+ if ((fp = getf(fd)) == NULL) \
+ return (set_errno(EBADF)); \
+ if (audit_active) \
+ audit_setfsat_path(1); \
+ error = statfn(fp->f_vnode, sb, 0, fp->f_cred); \
+ releasef(fd); \
+ if (error) \
+ return (set_errno(error)); \
+ return (0); \
+ }
+
+int
+fstat(int fd, struct stat *sb)
+{
+ FSTAT_BODY(fd, sb, cstat)
+}
+
+int
+fstatat(int fd, char *name, struct stat *sb, int flags)
+{
+ return (cstatat(fd, name, 2, sb, flags, 0));
+}
+
+#if defined(__i386) || defined(__i386_COMPAT)
+
+/*
+ * Handle all the "extended" stat operations in the same way;
+ * validate the version, then call the real handler.
+ */
+
+#define XSTAT_BODY(ver, f, s, fn) \
+ return (ver != _STAT_VER ? set_errno(EINVAL) : fn(f, s));
+
+#endif /* __i386 || __i386_COMPAT */
+
+#if defined(__i386)
+
+/*
+ * Syscalls for i386 applications that issue {,l,f}xstat() directly
+ */
+int
+xstat(int version, char *fname, struct stat *sb)
+{
+ XSTAT_BODY(version, fname, sb, stat)
+}
+
+int
+lxstat(int version, char *fname, struct stat *sb)
+{
+ XSTAT_BODY(version, fname, sb, lstat)
+}
+
+int
+fxstat(int version, int fd, struct stat *sb)
+{
+ XSTAT_BODY(version, fd, sb, fstat)
+}
+
+#endif /* __i386 */
+
+/*
+ * Common code for stat(), lstat(), and fstat().
+ * (32-bit kernel, 32-bit applications, 32-bit files)
+ * (64-bit kernel, 64-bit applications, 64-bit files)
+ */
+static int
+cstat(vnode_t *vp, struct stat *ubp, int flag, cred_t *cr)
+{
+ struct vfssw *vswp;
+ struct stat sb;
+ vattr_t vattr;
+ int error;
+
+ vattr.va_mask = AT_STAT | AT_NBLOCKS | AT_BLKSIZE | AT_SIZE;
+ if ((error = VOP_GETATTR(vp, &vattr, flag, cr)) != 0)
+ return (error);
+#ifdef _ILP32
+ /*
+ * (32-bit kernel, 32-bit applications, 32-bit files)
+ * NOTE: 32-bit kernel maintains a 64-bit unsigned va_size.
+ *
+ * st_size of devices (VBLK and VCHR special files) is a special case.
+ * POSIX does not define size behavior for special files, so the
+ * following Solaris-specific behavior is not a violation. Solaris
+ * returns the size of the device.
+ *
+ * For compatibility with 32-bit programs that happen to do stat() on
+ * a device node (mknod) bigger than 2GB, we suppress the large file
+ * EOVERFLOW and instead return the value MAXOFF32_T (LONG_MAX).
+ *
+ * 32-bit applications that care about the size of devices should be
+ * built 64-bit or use a large file interface (lfcompile(5) or lf64(5)).
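+ *
+ * Illustratively, stat() by a 32-bit program on the block device of
+ * a 3GB disk slice reports st_size == MAXOFF32_T (2147483647) rather
+ * than failing with EOVERFLOW.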
+ */ + if ((vattr.va_size > MAXOFF32_T) && + ((vp->v_type == VBLK) || (vp->v_type == VCHR))) { + /* OVERFLOW | UNKNOWN_SIZE */ + vattr.va_size = MAXOFF32_T; + } +#endif /* _ILP32 */ + if (vattr.va_size > MAXOFF_T || vattr.va_nblocks > LONG_MAX || + vattr.va_nodeid > ULONG_MAX) + return (EOVERFLOW); + + bzero(&sb, sizeof (sb)); + sb.st_dev = vattr.va_fsid; + sb.st_ino = (ino_t)vattr.va_nodeid; + sb.st_mode = VTTOIF(vattr.va_type) | vattr.va_mode; + sb.st_nlink = vattr.va_nlink; + sb.st_uid = vattr.va_uid; + sb.st_gid = vattr.va_gid; + sb.st_rdev = vattr.va_rdev; + sb.st_size = (off_t)vattr.va_size; + sb.st_atim = vattr.va_atime; + sb.st_mtim = vattr.va_mtime; + sb.st_ctim = vattr.va_ctime; + sb.st_blksize = vattr.va_blksize; + sb.st_blocks = (blkcnt_t)vattr.va_nblocks; + if (vp->v_vfsp != NULL) { + vswp = &vfssw[vp->v_vfsp->vfs_fstype]; + if (vswp->vsw_name && *vswp->vsw_name) + (void) strcpy(sb.st_fstype, vswp->vsw_name); + } + if (copyout(&sb, ubp, sizeof (sb))) + return (EFAULT); + return (0); +} + +static int +cstatat(int fd, char *name, int nmflag, struct stat *sb, int follow, int flags) +{ + vnode_t *vp; + int error; + cred_t *cred; + int link_follow; + + link_follow = (follow == AT_SYMLINK_NOFOLLOW) ? NO_FOLLOW : FOLLOW; +lookup: + if (error = cstatat_getvp(fd, name, nmflag, link_follow, &vp, &cred)) + return (set_errno(error)); + error = cstat(vp, sb, flags, cred); + crfree(cred); + VN_RELE(vp); +out: + if (error != 0) { + if (error == ESTALE && + (nmflag == 1 || (nmflag == 2 && name != NULL))) + goto lookup; + return (set_errno(error)); + } + return (0); +} + +#if defined(_SYSCALL32_IMPL) + +/* + * 64-bit kernel, 32-bit applications, 32-bit file offsets + */ +static int cstatat32(int, char *, int, struct stat32 *, int, int); +static int cstat32(vnode_t *, struct stat32 *, int, cred_t *); + +int +stat32(char *fname, struct stat32 *sb) +{ + return (cstatat32(AT_FDCWD, fname, 1, sb, 0, ATTR_REAL)); +} + +int +lstat32(char *fname, struct stat32 *sb) +{ + return (cstatat32(AT_FDCWD, fname, 1, sb, AT_SYMLINK_NOFOLLOW, 0)); +} + +int +fstat32(int fd, struct stat32 *sb) +{ + FSTAT_BODY(fd, sb, cstat32) +} + +int +fstatat32(int fd, char *name, struct stat32 *sb, int flag) +{ + return (cstatat32(fd, name, 2, sb, flag, 0)); +} + +#if defined(__i386_COMPAT) + +/* + * Syscalls for i386 applications that issue {,l,f}xstat() directly + */ +int +xstat32(int version, char *fname, struct stat32 *sb) +{ + XSTAT_BODY(version, fname, sb, stat32) +} + +int +lxstat32(int version, char *fname, struct stat32 *sb) +{ + XSTAT_BODY(version, fname, sb, lstat32) +} + +int +fxstat32(int version, int fd, struct stat32 *sb) +{ + XSTAT_BODY(version, fd, sb, fstat32) +} + +#endif /* __i386_COMPAT */ + +static int +cstat32(vnode_t *vp, struct stat32 *ubp, int flag, struct cred *cr) +{ + struct vfssw *vswp; + struct stat32 sb; + vattr_t vattr; + int error; + dev32_t st_dev, st_rdev; + + vattr.va_mask = AT_STAT | AT_NBLOCKS | AT_BLKSIZE | AT_SIZE; + if (error = VOP_GETATTR(vp, &vattr, flag, cr)) + return (error); + + /* devices are a special case, see comments in cstat */ + if ((vattr.va_size > MAXOFF32_T) && + ((vp->v_type == VBLK) || (vp->v_type == VCHR))) { + /* OVERFLOW | UNKNOWN_SIZE */ + vattr.va_size = MAXOFF32_T; + } + + /* check for large values */ + if (!cmpldev(&st_dev, vattr.va_fsid) || + !cmpldev(&st_rdev, vattr.va_rdev) || + vattr.va_size > MAXOFF32_T || + vattr.va_nblocks > INT32_MAX || + vattr.va_nodeid > UINT32_MAX || + TIMESPEC_OVERFLOW(&(vattr.va_atime)) || + TIMESPEC_OVERFLOW(&(vattr.va_mtime)) || + 
TIMESPEC_OVERFLOW(&(vattr.va_ctime))) + return (EOVERFLOW); + + bzero(&sb, sizeof (sb)); + sb.st_dev = st_dev; + sb.st_ino = (ino32_t)vattr.va_nodeid; + sb.st_mode = VTTOIF(vattr.va_type) | vattr.va_mode; + sb.st_nlink = vattr.va_nlink; + sb.st_uid = vattr.va_uid; + sb.st_gid = vattr.va_gid; + sb.st_rdev = st_rdev; + sb.st_size = (off32_t)vattr.va_size; + TIMESPEC_TO_TIMESPEC32(&(sb.st_atim), &(vattr.va_atime)); + TIMESPEC_TO_TIMESPEC32(&(sb.st_mtim), &(vattr.va_mtime)); + TIMESPEC_TO_TIMESPEC32(&(sb.st_ctim), &(vattr.va_ctime)); + sb.st_blksize = vattr.va_blksize; + sb.st_blocks = (blkcnt32_t)vattr.va_nblocks; + if (vp->v_vfsp != NULL) { + vswp = &vfssw[vp->v_vfsp->vfs_fstype]; + if (vswp->vsw_name && *vswp->vsw_name) + (void) strcpy(sb.st_fstype, vswp->vsw_name); + } + if (copyout(&sb, ubp, sizeof (sb))) + return (EFAULT); + return (0); +} + +static int +cstatat32(int fd, char *name, int nmflag, struct stat32 *sb, + int follow, int flags) +{ + vnode_t *vp; + int error; + cred_t *cred; + int link_follow; + + link_follow = (follow == AT_SYMLINK_NOFOLLOW) ? NO_FOLLOW : FOLLOW; +lookup: + if (error = cstatat_getvp(fd, name, nmflag, link_follow, &vp, &cred)) + return (set_errno(error)); + error = cstat32(vp, sb, flags, cred); + crfree(cred); + VN_RELE(vp); +out: + if (error != 0) { + if (error == ESTALE && + (nmflag == 1 || (nmflag == 2 && name != NULL))) + goto lookup; + return (set_errno(error)); + } + return (0); +} + +#endif /* _SYSCALL32_IMPL */ + +#if defined(_ILP32) + +/* + * 32-bit kernel, 32-bit applications, 64-bit file offsets. + * + * These routines are implemented differently on 64-bit kernels. + */ +static int cstatat64(int, char *, int, struct stat64 *, int, int); +static int cstat64(vnode_t *, struct stat64 *, int, cred_t *); + +int +stat64(char *fname, struct stat64 *sb) +{ + return (cstatat64(AT_FDCWD, fname, 1, sb, 0, ATTR_REAL)); +} + +int +lstat64(char *fname, struct stat64 *sb) +{ + return (cstatat64(AT_FDCWD, fname, 1, sb, AT_SYMLINK_NOFOLLOW, 0)); +} + +int +fstat64(int fd, struct stat64 *sb) +{ + FSTAT_BODY(fd, sb, cstat64) +} + +int +fstatat64(int fd, char *name, struct stat64 *sb, int flags) +{ + return (cstatat64(fd, name, 2, sb, flags, 0)); +} + +static int +cstat64(vnode_t *vp, struct stat64 *ubp, int flag, cred_t *cr) +{ + struct vfssw *vswp; + struct stat64 lsb; + vattr_t vattr; + int error; + + vattr.va_mask = AT_STAT | AT_NBLOCKS | AT_BLKSIZE | AT_SIZE; + if (error = VOP_GETATTR(vp, &vattr, flag, cr)) + return (error); + + bzero(&lsb, sizeof (lsb)); + lsb.st_dev = vattr.va_fsid; + lsb.st_ino = vattr.va_nodeid; + lsb.st_mode = VTTOIF(vattr.va_type) | vattr.va_mode; + lsb.st_nlink = vattr.va_nlink; + lsb.st_uid = vattr.va_uid; + lsb.st_gid = vattr.va_gid; + lsb.st_rdev = vattr.va_rdev; + lsb.st_size = vattr.va_size; + lsb.st_atim = vattr.va_atime; + lsb.st_mtim = vattr.va_mtime; + lsb.st_ctim = vattr.va_ctime; + lsb.st_blksize = vattr.va_blksize; + lsb.st_blocks = vattr.va_nblocks; + if (vp->v_vfsp != NULL) { + vswp = &vfssw[vp->v_vfsp->vfs_fstype]; + if (vswp->vsw_name && *vswp->vsw_name) + (void) strcpy(lsb.st_fstype, vswp->vsw_name); + } + if (copyout(&lsb, ubp, sizeof (lsb))) + return (EFAULT); + return (0); +} + +static int +cstatat64(int fd, char *name, int nmflag, struct stat64 *sb, + int follow, int flags) +{ + vnode_t *vp; + int error; + cred_t *cred; + int link_follow; + + link_follow = (follow == AT_SYMLINK_NOFOLLOW) ? 
NO_FOLLOW : FOLLOW; +lookup: + if (error = cstatat_getvp(fd, name, nmflag, link_follow, &vp, &cred)) + return (set_errno(error)); + error = cstat64(vp, sb, flags, cred); + crfree(cred); + VN_RELE(vp); +out: + if (error != 0) { + if (error == ESTALE && + (nmflag == 1 || (nmflag == 2 && name != NULL))) + goto lookup; + return (set_errno(error)); + } + return (0); +} + +#endif /* _ILP32 */ + +#if defined(_SYSCALL32_IMPL) + +/* + * 64-bit kernel, 32-bit applications, 64-bit file offsets. + * + * We'd really like to call the "native" stat calls for these ones, + * but the problem is that the 64-bit ABI defines the 'stat64' structure + * differently from the way the 32-bit ABI defines it. + */ + +static int cstatat64_32(int, char *, int, struct stat64_32 *, int, int); +static int cstat64_32(vnode_t *, struct stat64_32 *, int, cred_t *); + +int +stat64_32(char *fname, struct stat64_32 *sb) +{ + return (cstatat64_32(AT_FDCWD, fname, 1, sb, 0, ATTR_REAL)); +} + +int +lstat64_32(char *fname, struct stat64_32 *sb) +{ + return (cstatat64_32(AT_FDCWD, fname, 1, sb, AT_SYMLINK_NOFOLLOW, 0)); +} + +int +fstat64_32(int fd, struct stat64_32 *sb) +{ + FSTAT_BODY(fd, sb, cstat64_32) +} + +int +fstatat64_32(int fd, char *name, struct stat64_32 *sb, int flag) +{ + return (cstatat64_32(fd, name, 2, sb, flag, 0)); +} + +static int +cstat64_32(vnode_t *vp, struct stat64_32 *ubp, int flag, cred_t *cr) +{ + struct vfssw *vswp; + struct stat64_32 lsb; + vattr_t vattr; + int error; + dev32_t st_dev, st_rdev; + + vattr.va_mask = AT_STAT | AT_NBLOCKS | AT_BLKSIZE | AT_SIZE; + if (error = VOP_GETATTR(vp, &vattr, flag, cr)) + return (error); + + if (!cmpldev(&st_dev, vattr.va_fsid) || + !cmpldev(&st_rdev, vattr.va_rdev) || + TIMESPEC_OVERFLOW(&(vattr.va_atime)) || + TIMESPEC_OVERFLOW(&(vattr.va_mtime)) || + TIMESPEC_OVERFLOW(&(vattr.va_ctime))) + return (EOVERFLOW); + + bzero(&lsb, sizeof (lsb)); + lsb.st_dev = st_dev; + lsb.st_ino = vattr.va_nodeid; + lsb.st_mode = VTTOIF(vattr.va_type) | vattr.va_mode; + lsb.st_nlink = vattr.va_nlink; + lsb.st_uid = vattr.va_uid; + lsb.st_gid = vattr.va_gid; + lsb.st_rdev = st_rdev; + lsb.st_size = vattr.va_size; + TIMESPEC_TO_TIMESPEC32(&(lsb.st_atim), &(vattr.va_atime)); + TIMESPEC_TO_TIMESPEC32(&(lsb.st_mtim), &(vattr.va_mtime)); + TIMESPEC_TO_TIMESPEC32(&(lsb.st_ctim), &(vattr.va_ctime)); + lsb.st_blksize = vattr.va_blksize; + lsb.st_blocks = vattr.va_nblocks; + if (vp->v_vfsp != NULL) { + vswp = &vfssw[vp->v_vfsp->vfs_fstype]; + if (vswp->vsw_name && *vswp->vsw_name) + (void) strcpy(lsb.st_fstype, vswp->vsw_name); + } + if (copyout(&lsb, ubp, sizeof (lsb))) + return (EFAULT); + return (0); +} + +static int +cstatat64_32(int fd, char *name, int nmflag, struct stat64_32 *sb, + int follow, int flags) +{ + vnode_t *vp; + int error; + cred_t *cred; + int link_follow; + + link_follow = (follow == AT_SYMLINK_NOFOLLOW) ? 
NO_FOLLOW : FOLLOW; +lookup: + if (error = cstatat_getvp(fd, name, nmflag, link_follow, &vp, &cred)) + return (set_errno(error)); + error = cstat64_32(vp, sb, flags, cred); + crfree(cred); + VN_RELE(vp); +out: + if (error != 0) { + if (error == ESTALE && + (nmflag == 1 || (nmflag == 2 && name != NULL))) + goto lookup; + return (set_errno(error)); + } + return (0); +} + +#endif /* _SYSCALL32_IMPL */ diff --git a/usr/src/uts/common/syscall/statfs.c b/usr/src/uts/common/syscall/statfs.c new file mode 100644 index 0000000000..5d8c2cd395 --- /dev/null +++ b/usr/src/uts/common/syscall/statfs.c @@ -0,0 +1,164 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2001 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/inttypes.h> +#include <sys/types.h> +#include <sys/t_lock.h> +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/fstyp.h> +#include <sys/systm.h> +#include <sys/vfs.h> +#include <sys/statfs.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/pathname.h> + +#include <vm/page.h> + +#if defined(_SYSCALL32_IMPL) || defined(_ILP32) + +/* + * statfs(2) and fstatfs(2) have been replaced by statvfs(2) and + * fstatvfs(2) and will be removed from the system in a near-future + * release. + * + * Supported here purely for 32-bit compatibility. + */ + +static int cstatfs(struct vfs *, struct statfs32 *, int); + +int +statfs32(char *fname, struct statfs32 *sbp, int32_t len, int32_t fstyp) +{ + vnode_t *vp; + int error; + +lookup: + if (error = lookupname(fname, UIO_USERSPACE, FOLLOW, NULLVPP, &vp)) { + if (error == ESTALE) + goto lookup; + return (set_errno(error)); + } + if (fstyp != 0) + error = EINVAL; + else + error = cstatfs(vp->v_vfsp, sbp, len); + VN_RELE(vp); + if (error) { + if (error == ESTALE) + goto lookup; + return (set_errno(error)); + } + return (0); +} + +int +fstatfs32(int32_t fdes, struct statfs32 *sbp, int32_t len, int32_t fstyp) +{ + struct file *fp; + int error; + + if (fstyp != 0) + return (set_errno(EINVAL)); + if ((fp = getf(fdes)) == NULL) + return (set_errno(EBADF)); + error = cstatfs(fp->f_vnode->v_vfsp, sbp, len); + releasef(fdes); + if (error) + return (set_errno(error)); + return (0); +} + +/* + * Common routine for fstatfs and statfs. 
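+ *
+ * The trickiest part of the mapping below is the block counts:
+ * statvfs reports f_blocks and f_bfree in f_frsize units, while the
+ * legacy structure is filled in 512-byte units via the
+ * (f_frsize / 512) scaling. Illustratively, f_frsize == 8192 and
+ * f_blocks == 1000 come out as a legacy f_blocks of 16000.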
+ */ +static int +cstatfs(struct vfs *vfsp, struct statfs32 *sbp, int len) +{ + struct statfs32 sfs; + struct statvfs64 svfs; + int error, i; + char *cp, *cp2; + struct vfssw *vswp; + + if (len < 0 || len > sizeof (struct statfs)) + return (EINVAL); + if (error = VFS_STATVFS(vfsp, &svfs)) + return (error); + + if (svfs.f_blocks > UINT32_MAX || svfs.f_bfree > UINT32_MAX || + svfs.f_files > UINT32_MAX || svfs.f_ffree > UINT32_MAX) + return (EOVERFLOW); + /* + * Map statvfs fields into the old statfs structure. + */ + bzero(&sfs, sizeof (sfs)); + sfs.f_bsize = svfs.f_bsize; + sfs.f_frsize = (svfs.f_frsize == svfs.f_bsize) ? 0 : svfs.f_frsize; + sfs.f_blocks = svfs.f_blocks * (svfs.f_frsize / 512); + sfs.f_bfree = svfs.f_bfree * (svfs.f_frsize / 512); + sfs.f_files = svfs.f_files; + sfs.f_ffree = svfs.f_ffree; + + cp = svfs.f_fstr; + cp2 = sfs.f_fname; + i = 0; + while (i++ < sizeof (sfs.f_fname)) + if (*cp != '\0') + *cp2++ = *cp++; + else + *cp2++ = '\0'; + while (*cp != '\0' && + i++ < (sizeof (svfs.f_fstr) - sizeof (sfs.f_fpack))) + cp++; + (void) strncpy(sfs.f_fpack, cp + 1, sizeof (sfs.f_fpack)); + if ((vswp = vfs_getvfssw(svfs.f_basetype)) == NULL) + sfs.f_fstyp = 0; + else { + sfs.f_fstyp = vswp - vfssw; + vfs_unrefvfssw(vswp); + } + + if (copyout(&sfs, sbp, len)) + return (EFAULT); + + return (0); +} + +#endif /* _SYSCALL32_IMPL || _ILP32 */ diff --git a/usr/src/uts/common/syscall/statvfs.c b/usr/src/uts/common/syscall/statvfs.c new file mode 100644 index 0000000000..dc0a98153a --- /dev/null +++ b/usr/src/uts/common/syscall/statvfs.c @@ -0,0 +1,366 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2001 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Get file system statistics (statvfs and fstatvfs). 
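+ *
+ * A hedged user-level sketch of the interface served here (the path
+ * and variable names are invented): space available to unprivileged
+ * callers is
+ *
+ *   struct statvfs vb;
+ *   if (statvfs("/export", &vb) == 0)
+ *     bytes = (unsigned long long)vb.f_bavail * vb.f_frsize;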
+ */
+
+#include <sys/types.h>
+#include <sys/inttypes.h>
+#include <sys/t_lock.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/fstyp.h>
+#include <sys/systm.h>
+#include <sys/vfs.h>
+#include <sys/statvfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/cmn_err.h>
+#include <sys/debug.h>
+#include <sys/pathname.h>
+
+#include <vm/page.h>
+
+#define STATVFSCOPY(dst, src) \
+ (dst)->f_bsize = (src)->f_bsize; \
+ (dst)->f_frsize = (src)->f_frsize; \
+ (dst)->f_blocks = (src)->f_blocks; \
+ (dst)->f_bfree = (src)->f_bfree; \
+ (dst)->f_bavail = (src)->f_bavail; \
+ (dst)->f_files = (src)->f_files; \
+ (dst)->f_ffree = (src)->f_ffree; \
+ (dst)->f_favail = (src)->f_favail; \
+ (dst)->f_fsid = (src)->f_fsid; \
+ bcopy((src)->f_basetype, (dst)->f_basetype, \
+ sizeof ((dst)->f_basetype)); \
+ (dst)->f_flag = (src)->f_flag; \
+ (dst)->f_namemax = (src)->f_namemax; \
+ bcopy((src)->f_fstr, (dst)->f_fstr, \
+ sizeof ((dst)->f_fstr))
+
+/*
+ * Common routines for statvfs and fstatvfs.
+ */
+
+static int
+cstatvfs32(struct vfs *vfsp, struct statvfs32 *ubp)
+{
+ struct statvfs64 ds64;
+ struct statvfs32 ds32;
+ int error;
+
+#if !defined(lint)
+ ASSERT32(sizeof (struct statvfs) == sizeof (struct statvfs32));
+ ASSERT32(sizeof (struct statvfs64) == sizeof (struct statvfs64_32));
+#endif
+
+ bzero(&ds64, sizeof (ds64));
+ if ((error = VFS_STATVFS(vfsp, &ds64)) != 0)
+ return (error);
+
+ /*
+ * VFS_STATVFS can return data that is incompatible with the space
+ * available in the 32-bit statvfs structure. Check here to see if
+ * it will fit into the 32-bit structure; if not, return EOVERFLOW.
+ *
+ * The check for -1 is because some file systems return -1 in the
+ * fields that are irrelevant or nonessential, and we do not want
+ * to return EOVERFLOW for them. For example: df is expected to
+ * show -1 in the output for some of these fields on NFS-mounted
+ * filesystems.
+ */
+ if (ds64.f_files == (fsfilcnt64_t)-1)
+ ds64.f_files = UINT32_MAX;
+ if (ds64.f_ffree == (fsfilcnt64_t)-1)
+ ds64.f_ffree = UINT32_MAX;
+ if (ds64.f_favail == (fsfilcnt64_t)-1)
+ ds64.f_favail = UINT32_MAX;
+ if (ds64.f_bavail == (fsblkcnt64_t)-1)
+ ds64.f_bavail = UINT32_MAX;
+ if (ds64.f_bfree == (fsblkcnt64_t)-1)
+ ds64.f_bfree = UINT32_MAX;
+
+ if (ds64.f_blocks > UINT32_MAX || ds64.f_bfree > UINT32_MAX ||
+ ds64.f_bavail > UINT32_MAX || ds64.f_files > UINT32_MAX ||
+ ds64.f_ffree > UINT32_MAX || ds64.f_favail > UINT32_MAX)
+ return (EOVERFLOW);
+#ifdef _LP64
+ /*
+ * On the 64-bit kernel, even these fields grow to 64-bit
+ * quantities in the statvfs64 structure.
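+ *
+ * Illustratively, an f_namemax value wider than 32 bits (other than
+ * the (ulong_t)-1 "unknown" convention clamped below) makes the call
+ * fail with EOVERFLOW rather than silently truncate.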
+ */ + if (ds64.f_namemax == (ulong_t)-1l) + ds64.f_namemax = UINT32_MAX; + + if (ds64.f_bsize > UINT32_MAX || ds64.f_frsize > UINT32_MAX || + ds64.f_fsid > UINT32_MAX || ds64.f_flag > UINT32_MAX || + ds64.f_namemax > UINT32_MAX) + return (EOVERFLOW); +#endif + + bzero(&ds32, sizeof (ds32)); + STATVFSCOPY(&ds32, &ds64); + if (copyout(&ds32, ubp, sizeof (ds32)) != 0) + return (EFAULT); + return (0); +} + +static int +cstatvfs64(struct vfs *vfsp, struct statvfs64 *ubp) +{ + struct statvfs64 ds64; + int error; + +#if !defined(lint) + ASSERT64(sizeof (struct statvfs) == sizeof (struct statvfs64)); +#endif + bzero(&ds64, sizeof (ds64)); + if ((error = VFS_STATVFS(vfsp, &ds64)) != 0) + return (error); + if (copyout(&ds64, ubp, sizeof (ds64)) != 0) + return (EFAULT); + return (0); +} + +/* + * Native system calls + */ +int +statvfs(char *fname, struct statvfs *sbp) +{ + vnode_t *vp; + int error; + +lookup: + if (error = lookupname(fname, UIO_USERSPACE, FOLLOW, NULLVPP, &vp)) { + if (error == ESTALE) + goto lookup; + return (set_errno(error)); + } +#ifdef _LP64 + error = cstatvfs64(vp->v_vfsp, (struct statvfs64 *)sbp); +#else + error = cstatvfs32(vp->v_vfsp, (struct statvfs32 *)sbp); +#endif + VN_RELE(vp); + if (error) { + if (error == ESTALE) + goto lookup; + return (set_errno(error)); + } + return (0); +} + +int +fstatvfs(int fdes, struct statvfs *sbp) +{ + struct file *fp; + int error; + + if ((fp = getf(fdes)) == NULL) + return (set_errno(EBADF)); +#ifdef _LP64 + error = cstatvfs64(fp->f_vnode->v_vfsp, (struct statvfs64 *)sbp); +#else + error = cstatvfs32(fp->f_vnode->v_vfsp, (struct statvfs32 *)sbp); +#endif + releasef(fdes); + if (error) + return (set_errno(error)); + return (0); +} + +#if defined(_ILP32) + +/* + * Large File system calls. + * + * (We deliberately don't have special "large file" system calls in the + * 64-bit kernel -- we just use the native versions, since they're just + * as functional.) + */ +int +statvfs64(char *fname, struct statvfs64 *sbp) +{ + vnode_t *vp; + int error; + +lookup: + if (error = lookupname(fname, UIO_USERSPACE, FOLLOW, NULLVPP, &vp)) { + if (error == ESTALE) + goto lookup; + return (set_errno(error)); + } + error = cstatvfs64(vp->v_vfsp, sbp); + VN_RELE(vp); + if (error) { + if (error == ESTALE) + goto lookup; + return (set_errno(error)); + } + return (0); +} + +int +fstatvfs64(int fdes, struct statvfs64 *sbp) +{ + struct file *fp; + int error; + + if ((fp = getf(fdes)) == NULL) + return (set_errno(EBADF)); + error = cstatvfs64(fp->f_vnode->v_vfsp, sbp); + releasef(fdes); + if (error) + return (set_errno(error)); + return (0); +} + +#endif /* _ILP32 */ + +#ifdef _SYSCALL32_IMPL + +static int +cstatvfs64_32(struct vfs *vfsp, struct statvfs64_32 *ubp) +{ + struct statvfs64 ds64; + struct statvfs64_32 ds64_32; + int error; + + bzero(&ds64, sizeof (ds64)); + if ((error = VFS_STATVFS(vfsp, &ds64)) != 0) + return (error); + + /* + * On the 64-bit kernel, even these fields grow to 64-bit + * quantities in the statvfs64 structure. 
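+ *
+ * (These are the same width checks made in cstatvfs32() above; see
+ * that function for an illustrative overflow case.)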
+ */ + if (ds64.f_namemax == (ulong_t)-1l) + ds64.f_namemax = UINT32_MAX; + + if (ds64.f_bsize > UINT32_MAX || ds64.f_frsize > UINT32_MAX || + ds64.f_fsid > UINT32_MAX || ds64.f_flag > UINT32_MAX || + ds64.f_namemax > UINT32_MAX) + return (EOVERFLOW); + + STATVFSCOPY(&ds64_32, &ds64); + if (copyout(&ds64_32, ubp, sizeof (ds64_32)) != 0) + return (EFAULT); + return (0); +} + +/* + * ILP32 "small file" system calls on LP64 kernel + */ +int +statvfs32(char *fname, struct statvfs32 *sbp) +{ + vnode_t *vp; + int error; + +lookup: + if (error = lookupname(fname, UIO_USERSPACE, FOLLOW, NULLVPP, &vp)) { + if (error == ESTALE) + goto lookup; + return (set_errno(error)); + } + error = cstatvfs32(vp->v_vfsp, sbp); + VN_RELE(vp); + if (error) { + if (error == ESTALE) + goto lookup; + return (set_errno(error)); + } + return (0); +} + +int +fstatvfs32(int fdes, struct statvfs32 *sbp) +{ + struct file *fp; + int error; + + if ((fp = getf(fdes)) == NULL) + return (set_errno(EBADF)); + error = cstatvfs32(fp->f_vnode->v_vfsp, sbp); + releasef(fdes); + if (error) + return (set_errno(error)); + return (0); +} + +/* + * ILP32 Large File system calls on LP64 kernel + */ +int +statvfs64_32(char *fname, struct statvfs64_32 *sbp) +{ + vnode_t *vp; + int error; + +lookup: + if (error = lookupname(fname, UIO_USERSPACE, FOLLOW, NULLVPP, &vp)) { + if (error == ESTALE) + goto lookup; + return (set_errno(error)); + } + error = cstatvfs64_32(vp->v_vfsp, sbp); + VN_RELE(vp); + if (error) { + if (error == ESTALE) + goto lookup; + return (set_errno(error)); + } + return (0); +} + +int +fstatvfs64_32(int fdes, struct statvfs64_32 *sbp) +{ + struct file *fp; + int error; + + if ((fp = getf(fdes)) == NULL) + return (set_errno(EBADF)); + error = cstatvfs64_32(fp->f_vnode->v_vfsp, sbp); + releasef(fdes); + if (error) + return (set_errno(error)); + return (0); +} + +#endif /* _SYSCALL32_IMPL */ diff --git a/usr/src/uts/common/syscall/strcalls.c b/usr/src/uts/common/syscall/strcalls.c new file mode 100644 index 0000000000..bdde97a39d --- /dev/null +++ b/usr/src/uts/common/syscall/strcalls.c @@ -0,0 +1,537 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/proc.h> +#include <sys/stropts.h> +#include <sys/stream.h> +#include <sys/strsubr.h> +#include <sys/fs/fifonode.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/debug.h> + +/* + * STREAMS system calls. + */ + +int getmsg(int fdes, struct strbuf *ctl, struct strbuf *data, int *flagsp); +int putmsg(int fdes, struct strbuf *ctl, struct strbuf *data, int flags); +int getpmsg(int fdes, struct strbuf *ctl, struct strbuf *data, int *prip, + int *flagsp); +int putpmsg(int fdes, struct strbuf *ctl, struct strbuf *data, int pri, + int flags); + +static int msgio(int fdes, struct strbuf *ctl, struct strbuf *data, int *rval, + int mode, unsigned char *prip, int *flagsp); + +int +getmsg(int fdes, struct strbuf *ctl, struct strbuf *data, int *flagsp) +{ + int error; + int localflags; + int realflags = 0; + unsigned char pri = 0; + int rv = 0; + + /* + * Convert between old flags (localflags) and new flags (realflags). + */ + if (copyin(flagsp, &localflags, sizeof (*flagsp))) + return (set_errno(EFAULT)); + switch (localflags) { + case 0: + realflags = MSG_ANY; + break; + + case RS_HIPRI: + realflags = MSG_HIPRI; + break; + + default: + return (set_errno(EINVAL)); + } + + if ((error = msgio(fdes, ctl, data, &rv, FREAD, &pri, + &realflags)) == 0) { + /* + * massage realflags based on localflags. + */ + if (realflags == MSG_HIPRI) + localflags = RS_HIPRI; + else + localflags = 0; + if (copyout(&localflags, flagsp, sizeof (*flagsp))) + error = EFAULT; + } + if (error != 0) + return (set_errno(error)); + return (rv); +} + +int +putmsg(int fdes, struct strbuf *ctl, struct strbuf *data, int flags) +{ + unsigned char pri = 0; + int realflags; + int error; + int rv = 0; + + switch (flags) { + case RS_HIPRI: + realflags = MSG_HIPRI; + break; + case (RS_HIPRI|MSG_XPG4): + realflags = MSG_HIPRI|MSG_XPG4; + break; + case MSG_XPG4: + realflags = MSG_BAND|MSG_XPG4; + break; + case 0: + realflags = MSG_BAND; + break; + + default: + return (set_errno(EINVAL)); + } + error = msgio(fdes, ctl, data, &rv, FWRITE, &pri, &realflags); + if (error != 0) + return (set_errno(error)); + return (rv); +} + + +int +getpmsg(int fdes, struct strbuf *ctl, struct strbuf *data, int *prip, + int *flagsp) +{ + int error; + int flags; + int intpri; + unsigned char pri; + int rv = 0; + + if (copyin(flagsp, &flags, sizeof (flags))) + return (set_errno(EFAULT)); + if (copyin(prip, &intpri, sizeof (intpri))) + return (set_errno(EFAULT)); + if ((intpri > 255) || (intpri < 0)) + return (set_errno(EINVAL)); + pri = (unsigned char)intpri; + error = msgio(fdes, ctl, data, &rv, FREAD, &pri, &flags); + if (error != 0) + return (set_errno(error)); + if (copyout(&flags, flagsp, sizeof (flags))) + return (set_errno(EFAULT)); + intpri = (int)pri; + if (copyout(&intpri, prip, sizeof (intpri))) + return (set_errno(EFAULT)); + return (rv); +} + +int +putpmsg(int fdes, struct strbuf *ctl, struct strbuf *data, int intpri, + int flags) +{ + unsigned char pri; + int rv = 0; + int error; + + if ((intpri > 255) || (intpri < 0)) + return (set_errno(EINVAL)); + pri = (unsigned char)intpri; + error = msgio(fdes, ctl, data, &rv, FWRITE, &pri, &flags); + if (error != 0) + return (set_errno(error)); + return (rv); +} + +/* + * Common code 
for getmsg and putmsg calls: check permissions, + * copy in args, do preliminary setup, and switch to + * appropriate stream routine. + */ +static int +msgio(int fdes, struct strbuf *ctl, struct strbuf *data, int *rval, + int mode, unsigned char *prip, int *flagsp) +{ + file_t *fp; + vnode_t *vp; + struct strbuf msgctl, msgdata; + int error; + int flag; + klwp_t *lwp = ttolwp(curthread); + rval_t rv; + + if ((fp = getf(fdes)) == NULL) + return (EBADF); + if ((fp->f_flag & mode) == 0) { + releasef(fdes); + return (EBADF); + } + vp = fp->f_vnode; + if (vp->v_type == VFIFO) { + if (vp->v_stream) { + /* + * must use sd_vnode, could be named pipe + */ + (void) fifo_vfastoff(vp->v_stream->sd_vnode); + } else { + releasef(fdes); + return (ENOSTR); + } + } else if ((vp->v_type != VCHR && vp->v_type != VSOCK) || + vp->v_stream == NULL) { + releasef(fdes); + return (ENOSTR); + } + if ((ctl != NULL) && + copyin(ctl, &msgctl, sizeof (struct strbuf))) { + releasef(fdes); + return (EFAULT); + } + if ((data != NULL) && + copyin(data, &msgdata, sizeof (struct strbuf))) { + releasef(fdes); + return (EFAULT); + } + + if (mode == FREAD) { + if (ctl == NULL) + msgctl.maxlen = -1; + if (data == NULL) + msgdata.maxlen = -1; + flag = fp->f_flag; + rv.r_val1 = 0; + if (vp->v_type == VSOCK) { + error = sock_getmsg(vp, &msgctl, &msgdata, prip, + flagsp, flag, &rv); + } else { + error = strgetmsg(vp, &msgctl, &msgdata, prip, + flagsp, flag, &rv); + } + *rval = rv.r_val1; + if (error != 0) { + releasef(fdes); + return (error); + } + if (lwp != NULL) + lwp->lwp_ru.msgrcv++; + if (((ctl != NULL) && + copyout(&msgctl, ctl, sizeof (struct strbuf))) || + ((data != NULL) && + copyout(&msgdata, data, sizeof (struct strbuf)))) { + releasef(fdes); + return (EFAULT); + } + releasef(fdes); + return (0); + } + + /* + * FWRITE case + */ + if (ctl == NULL) + msgctl.len = -1; + if (data == NULL) + msgdata.len = -1; + flag = fp->f_flag; + if (vp->v_type == VSOCK) { + error = sock_putmsg(vp, &msgctl, &msgdata, *prip, *flagsp, + flag); + } else { + error = strputmsg(vp, &msgctl, &msgdata, *prip, *flagsp, flag); + } + releasef(fdes); + if (error == 0 && lwp != NULL) + lwp->lwp_ru.msgsnd++; + return (error); +} + + +#if defined(_LP64) && defined(_SYSCALL32) + +static int msgio32(int fdes, struct strbuf32 *ctl, struct strbuf32 *data, + int *rval, int mode, unsigned char *prip, int *flagsp); + +int +getmsg32(int fdes, struct strbuf32 *ctl, struct strbuf32 *data, int32_t *flagsp) +{ + int error; + int32_t localflags; + int realflags = 0; + unsigned char pri = 0; + int rv = 0; + + /* + * Convert between old flags (localflags) and new flags (realflags). + */ + if (copyin(flagsp, &localflags, sizeof (*flagsp))) + return (set_errno(EFAULT)); + switch (localflags) { + case 0: + realflags = MSG_ANY; + break; + + case RS_HIPRI: + realflags = MSG_HIPRI; + break; + + default: + return (set_errno(EINVAL)); + } + + if ((error = msgio32(fdes, ctl, data, &rv, FREAD, &pri, + &realflags)) == 0) { + /* + * massage realflags based on localflags. 
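+ * (RS_HIPRI is the only flag the old interface can report
+ * back; any other result maps to 0.)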
+ */ + if (realflags == MSG_HIPRI) + localflags = RS_HIPRI; + else + localflags = 0; + if (copyout(&localflags, flagsp, sizeof (*flagsp))) + error = EFAULT; + } + if (error != 0) + return (set_errno(error)); + return (rv); +} + +int +putmsg32(int fdes, struct strbuf32 *ctl, struct strbuf32 *data, int32_t flags) +{ + unsigned char pri = 0; + int realflags; + int error; + int rv = 0; + + switch (flags) { + case RS_HIPRI: + realflags = MSG_HIPRI; + break; + case (RS_HIPRI|MSG_XPG4): + realflags = MSG_HIPRI|MSG_XPG4; + break; + case MSG_XPG4: + realflags = MSG_BAND|MSG_XPG4; + break; + case 0: + realflags = MSG_BAND; + break; + + default: + return (set_errno(EINVAL)); + } + error = msgio32(fdes, ctl, data, &rv, FWRITE, &pri, &realflags); + if (error != 0) + return (set_errno(error)); + return (rv); +} + + +int +getpmsg32(int fdes, struct strbuf32 *ctl, struct strbuf32 *data, int32_t *prip, + int32_t *flagsp) +{ + int error; + int32_t flags; + int32_t intpri; + unsigned char pri; + int rv = 0; + + if (copyin(flagsp, &flags, sizeof (*flagsp))) + return (set_errno(EFAULT)); + if (copyin(prip, &intpri, sizeof (intpri))) + return (set_errno(EFAULT)); + if ((intpri > 255) || (intpri < 0)) + return (set_errno(EINVAL)); + pri = (unsigned char)intpri; + error = msgio32(fdes, ctl, data, &rv, FREAD, &pri, &flags); + if (error != 0) + return (set_errno(error)); + if (copyout(&flags, flagsp, sizeof (flags))) + return (set_errno(EFAULT)); + intpri = (int)pri; + if (copyout(&intpri, prip, sizeof (intpri))) + return (set_errno(EFAULT)); + return (rv); +} + +int +putpmsg32(int fdes, struct strbuf32 *ctl, struct strbuf32 *data, int32_t intpri, + int32_t flags) +{ + unsigned char pri; + int rv = 0; + int error; + + if ((intpri > 255) || (intpri < 0)) + return (set_errno(EINVAL)); + pri = (unsigned char)intpri; + error = msgio32(fdes, ctl, data, &rv, FWRITE, &pri, &flags); + if (error != 0) + return (set_errno(error)); + return (rv); +} + +/* + * Common code for getmsg and putmsg calls: check permissions, + * copy in args, do preliminary setup, and switch to + * appropriate stream routine. 
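+ *
+ * This is the _SYSCALL32_IMPL variant of msgio(): the ILP32
+ * strbuf32 arguments are converted to native strbufs on the
+ * way in and converted back on the way out.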
+ */ +static int +msgio32(int fdes, struct strbuf32 *ctl, struct strbuf32 *data, int *rval, + int mode, unsigned char *prip, int *flagsp) +{ + file_t *fp; + vnode_t *vp; + struct strbuf32 msgctl32, msgdata32; + struct strbuf msgctl, msgdata; + int error; + int flag; + klwp_t *lwp = ttolwp(curthread); + rval_t rv; + + if ((fp = getf(fdes)) == NULL) + return (EBADF); + if ((fp->f_flag & mode) == 0) { + releasef(fdes); + return (EBADF); + } + vp = fp->f_vnode; + if (vp->v_type == VFIFO) { + if (vp->v_stream) { + /* + * must use sd_vnode, could be named pipe + */ + (void) fifo_vfastoff(vp->v_stream->sd_vnode); + } else { + releasef(fdes); + return (ENOSTR); + } + } else if ((vp->v_type != VCHR && vp->v_type != VSOCK) || + vp->v_stream == NULL) { + releasef(fdes); + return (ENOSTR); + } + if (ctl != NULL) { + if (copyin(ctl, &msgctl32, sizeof (msgctl32))) { + releasef(fdes); + return (EFAULT); + } + msgctl.len = msgctl32.len; + msgctl.maxlen = msgctl32.maxlen; + msgctl.buf = (caddr_t)(uintptr_t)msgctl32.buf; + } + if (data != NULL) { + if (copyin(data, &msgdata32, sizeof (msgdata32))) { + releasef(fdes); + return (EFAULT); + } + msgdata.len = msgdata32.len; + msgdata.maxlen = msgdata32.maxlen; + msgdata.buf = (caddr_t)(uintptr_t)msgdata32.buf; + } + + if (mode == FREAD) { + if (ctl == NULL) + msgctl.maxlen = -1; + if (data == NULL) + msgdata.maxlen = -1; + flag = fp->f_flag; + rv.r_val1 = 0; + if (vp->v_type == VSOCK) { + error = sock_getmsg(vp, &msgctl, &msgdata, prip, + flagsp, flag, &rv); + } else { + error = strgetmsg(vp, &msgctl, &msgdata, prip, + flagsp, flag, &rv); + } + *rval = rv.r_val1; + if (error != 0) { + releasef(fdes); + return (error); + } + if (lwp != NULL) + lwp->lwp_ru.msgrcv++; + if (ctl != NULL) { + /* XX64 - range check */ + msgctl32.len = msgctl.len; + msgctl32.maxlen = msgctl.maxlen; + msgctl32.buf = (caddr32_t)(uintptr_t)msgctl.buf; + if (copyout(&msgctl32, ctl, sizeof (msgctl32))) { + releasef(fdes); + return (EFAULT); + } + } + if (data != NULL) { + /* XX64 - range check */ + msgdata32.len = msgdata.len; + msgdata32.maxlen = msgdata.maxlen; + msgdata32.buf = (caddr32_t)(uintptr_t)msgdata.buf; + if (copyout(&msgdata32, data, sizeof (msgdata32))) { + releasef(fdes); + return (EFAULT); + } + } + releasef(fdes); + return (0); + } + + /* + * FWRITE case + */ + if (ctl == NULL) + msgctl.len = -1; + if (data == NULL) + msgdata.len = -1; + flag = fp->f_flag; + if (vp->v_type == VSOCK) { + error = sock_putmsg(vp, &msgctl, &msgdata, *prip, *flagsp, + flag); + } else { + error = strputmsg(vp, &msgctl, &msgdata, *prip, *flagsp, flag); + } + releasef(fdes); + if (error == 0 && lwp != NULL) + lwp->lwp_ru.msgsnd++; + return (error); +} + +#endif /* _LP64 && _SYSCALL32 */ diff --git a/usr/src/uts/common/syscall/symlink.c b/usr/src/uts/common/syscall/symlink.c new file mode 100644 index 0000000000..2ce51d24cd --- /dev/null +++ b/usr/src/uts/common/syscall/symlink.c @@ -0,0 +1,102 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/isa_defs.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/pathname.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/proc.h> +#include <sys/uio.h> +#include <sys/debug.h> +#include <c2/audit.h> + +/* + * Create a symbolic link. Similar to link or rename except target + * name is passed as string argument, not converted to vnode reference. + */ +int +symlink(char *target, char *linkname) +{ + vnode_t *dvp; + struct vattr vattr; + struct pathname lpn; + char *tbuf; + size_t tlen; + int error; + +top: + if (error = pn_get(linkname, UIO_USERSPACE, &lpn)) + return (set_errno(error)); + if (error = lookuppn(&lpn, NULL, NO_FOLLOW, &dvp, NULLVPP)) { + pn_free(&lpn); + if (error == ESTALE) + goto top; + return (set_errno(error)); + } + if (vn_is_readonly(dvp)) + error = EROFS; + else if (pn_fixslash(&lpn)) + error = ENOTDIR; + else { + tbuf = kmem_alloc(MAXPATHLEN, KM_SLEEP); + if ((error = copyinstr(target, tbuf, MAXPATHLEN, &tlen)) == 0) { + vattr.va_type = VLNK; + vattr.va_mode = 0777; + vattr.va_mask = AT_TYPE|AT_MODE; + error = VOP_SYMLINK(dvp, lpn.pn_path, &vattr, + tbuf, CRED()); +#ifdef C2_AUDIT + if (audit_active) + audit_symlink_create(dvp, lpn.pn_path, + tbuf, error); +#endif /* C2_AUDIT */ + } + kmem_free(tbuf, MAXPATHLEN); + } + pn_free(&lpn); + VN_RELE(dvp); + if (error) { + if (error == ESTALE) + goto top; + return (set_errno(error)); + } + return (0); +} diff --git a/usr/src/uts/common/syscall/sync.c b/usr/src/uts/common/syscall/sync.c new file mode 100644 index 0000000000..14945aa1d8 --- /dev/null +++ b/usr/src/uts/common/syscall/sync.c @@ -0,0 +1,41 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* Copyright (c) 1994 Sun Microsystems, Inc. 
*/ +/* All Rights Reserved */ + + + +#pragma ident "%Z%%M% %I% %E% SMI" /* SVr4 1.42 */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/vfs.h> + +int +syssync() +{ + vfs_sync(0); + return (0); +} diff --git a/usr/src/uts/common/syscall/sysconfig.c b/usr/src/uts/common/syscall/sysconfig.c new file mode 100644 index 0000000000..222fcd5739 --- /dev/null +++ b/usr/src/uts/common/syscall/sysconfig.c @@ -0,0 +1,171 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/tuneable.h> +#include <sys/errno.h> +#include <sys/var.h> +#include <sys/signal.h> +#include <sys/time.h> +#include <sys/sysconfig.h> +#include <sys/resource.h> +#include <sys/ulimit.h> +#include <sys/unistd.h> +#include <sys/debug.h> +#include <sys/cpuvar.h> +#include <sys/mman.h> +#include <sys/timer.h> +#include <sys/zone.h> + +long +sysconfig(int which) +{ + switch (which) { + + /* + * if it is not handled in mach_sysconfig either + * it must be EINVAL. + */ + default: + return (mach_sysconfig(which)); /* `uname -i`/os */ + + case _CONFIG_CLK_TCK: + return ((long)hz); /* clock frequency per second */ + + case _CONFIG_PROF_TCK: + return ((long)hz); /* profiling clock freq per sec */ + + case _CONFIG_NGROUPS: + /* + * Maximum number of supplementary groups. + */ + return (ngroups_max); + + case _CONFIG_OPEN_FILES: + /* + * Maximum number of open files (soft limit). + */ + { + rlim64_t fd_ctl; + mutex_enter(&curproc->p_lock); + fd_ctl = rctl_enforced_value( + rctlproc_legacy[RLIMIT_NOFILE], curproc->p_rctls, + curproc); + mutex_exit(&curproc->p_lock); + return ((ulong_t)fd_ctl); + } + + case _CONFIG_CHILD_MAX: + /* + * Maximum number of processes. 
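+ * (libc reports this value via sysconf(_SC_CHILD_MAX);
+ * v_maxup is the limit on processes for a single user.)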
+ */ + return (v.v_maxup); + + case _CONFIG_POSIX_VER: + return (_POSIX_VERSION); /* current POSIX version */ + + case _CONFIG_PAGESIZE: + return (PAGESIZE); + + case _CONFIG_XOPEN_VER: + return (_XOPEN_VERSION); /* current XOPEN version */ + + case _CONFIG_NPROC_CONF: + return (zone_ncpus_get(curproc->p_zone)); + + case _CONFIG_NPROC_ONLN: + return (zone_ncpus_online_get(curproc->p_zone)); + + case _CONFIG_NPROC_MAX: + return (max_ncpus); + + case _CONFIG_STACK_PROT: + return (curproc->p_stkprot & ~PROT_USER); + + case _CONFIG_AIO_LISTIO_MAX: + return (_AIO_LISTIO_MAX); + + case _CONFIG_AIO_MAX: + return (_AIO_MAX); + + case _CONFIG_AIO_PRIO_DELTA_MAX: + return (0); + + case _CONFIG_DELAYTIMER_MAX: + return (INT_MAX); + + case _CONFIG_MQ_OPEN_MAX: + return (_MQ_OPEN_MAX); + + case _CONFIG_MQ_PRIO_MAX: + return (_MQ_PRIO_MAX); + + case _CONFIG_RTSIG_MAX: + return (_SIGRTMAX - _SIGRTMIN + 1); + + case _CONFIG_SEM_NSEMS_MAX: + return (_SEM_NSEMS_MAX); + + case _CONFIG_SEM_VALUE_MAX: + return (_SEM_VALUE_MAX); + + case _CONFIG_SIGQUEUE_MAX: + return (_SIGQUEUE_MAX); + + case _CONFIG_SIGRT_MIN: + return (_SIGRTMIN); + + case _CONFIG_SIGRT_MAX: + return (_SIGRTMAX); + + case _CONFIG_TIMER_MAX: + return (_TIMER_MAX); + + case _CONFIG_PHYS_PAGES: + return (physinstalled); + + case _CONFIG_AVPHYS_PAGES: + return (freemem); + + case _CONFIG_MAXPID: + return (maxpid); + + case _CONFIG_CPUID_MAX: + return (max_cpuid); + + case _CONFIG_SYMLOOP_MAX: + return (MAXSYMLINKS); + } +} diff --git a/usr/src/uts/common/syscall/sysfs.c b/usr/src/uts/common/syscall/sysfs.c new file mode 100644 index 0000000000..58b760bf29 --- /dev/null +++ b/usr/src/uts/common/syscall/sysfs.c @@ -0,0 +1,137 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 1993 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/t_lock.h> +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/fstyp.h> +#include <sys/systm.h> +#include <sys/mount.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/cmn_err.h> +#include <sys/buf.h> +#include <sys/debug.h> +#include <sys/pathname.h> + +/* + * System call to map fstype numbers to names, and vice versa. 
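+ *
+ * An illustrative userland sketch (error handling omitted;
+ * the values returned depend on the configured vfssw table):
+ *
+ *   char name[FSTYPSZ];
+ *   int ind = sysfs(GETFSIND, "ufs");     fstype index for "ufs"
+ *   (void) sysfs(GETFSTYP, ind, name);    name for that index
+ *   int n = sysfs(GETNFSTYP);             number of configured types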
+ */ + +static int sysfsind(char *); +static int sysfstyp(int, char *); + +int +sysfs(int opcode, long a1, long a2) +{ + int error; + + switch (opcode) { + case GETFSIND: + error = sysfsind((char *)a1); + break; + case GETFSTYP: + error = sysfstyp((int)a1, (char *)a2); + break; + case GETNFSTYP: + /* + * Return number of fstypes configured in the system. + */ + return (nfstype - 1); + default: + error = set_errno(EINVAL); + } + + return (error); +} + +static int +sysfsind(char *fsname) +{ + /* + * Translate fs identifier to an index into the vfssw structure. + */ + struct vfssw *vswp; + char fsbuf[FSTYPSZ]; + int retval; + size_t len = 0; + + retval = copyinstr(fsname, fsbuf, FSTYPSZ, &len); + if (retval == ENOENT) /* XXX */ + retval = EINVAL; /* XXX */ + if (len == 1) /* Includes null byte */ + retval = EINVAL; + if (retval) + return (set_errno(retval)); + /* + * Search the vfssw table for the fs identifier + * and return the index. + */ + if ((vswp = vfs_getvfssw(fsbuf)) != NULL) { + retval = vswp - vfssw; + vfs_unrefvfssw(vswp); + return (retval); + } + + return (set_errno(EINVAL)); +} + +static int +sysfstyp(int index, char *cbuf) +{ + /* + * Translate fstype index into an fs identifier. + */ + char *src; + struct vfssw *vswp; + char *osrc; + int error = 0; + + if (index <= 0 || index >= nfstype) + return (set_errno(EINVAL)); + RLOCK_VFSSW(); + vswp = &vfssw[index]; + + osrc = src = vswp->vsw_name; + while (*src++) + ; + + if (copyout(osrc, cbuf, src - osrc)) + error = set_errno(EFAULT); + RUNLOCK_VFSSW(); + return (error); +} diff --git a/usr/src/uts/common/syscall/systeminfo.c b/usr/src/uts/common/syscall/systeminfo.c new file mode 100644 index 0000000000..91c8e73ee4 --- /dev/null +++ b/usr/src/uts/common/syscall/systeminfo.c @@ -0,0 +1,329 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All rights reserved. 
*/ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/tuneable.h> +#include <sys/errno.h> +#include <sys/cred.h> +#include <sys/utsname.h> +#include <sys/systeminfo.h> +#include <sys/unistd.h> +#include <sys/debug.h> +#include <sys/bootconf.h> +#include <sys/socket.h> +#include <sys/policy.h> +#include <net/if.h> +#include <sys/sunddi.h> +#include <sys/promif.h> +#include <sys/zone.h> +#include <sys/model.h> + +static void get_netif_name(char *, char *); + +long +systeminfo(int command, char *buf, long count) +{ + int error = 0; + long strcnt, getcnt; + char *kstr; + + if (count < 0 && command != SI_SET_HOSTNAME && + command != SI_SET_SRPC_DOMAIN) + return (set_errno(EINVAL)); + + /* + * Deal with the common "get a string" case first. + */ + switch (command) { + case SI_SYSNAME: + kstr = utsname.sysname; + break; + case SI_HOSTNAME: + kstr = uts_nodename(); + break; + case SI_RELEASE: + kstr = utsname.release; + break; + case SI_VERSION: + kstr = utsname.version; + break; + case SI_MACHINE: + kstr = utsname.machine; + break; +#ifdef _LP64 + case SI_ARCHITECTURE_64: + case SI_ARCHITECTURE_K: + kstr = architecture; + break; + case SI_ARCHITECTURE_32: + case SI_ARCHITECTURE: + kstr = architecture_32; + break; + case SI_ARCHITECTURE_NATIVE: + kstr = get_udatamodel() == DATAMODEL_NATIVE ? + architecture : architecture_32; + break; +#else + case SI_ARCHITECTURE_K: + case SI_ARCHITECTURE_32: + case SI_ARCHITECTURE: + case SI_ARCHITECTURE_NATIVE: + kstr = architecture; + break; +#endif + case SI_HW_SERIAL: + kstr = hw_serial; + break; + case SI_HW_PROVIDER: + kstr = hw_provider; + break; + case SI_SRPC_DOMAIN: + kstr = curproc->p_zone->zone_domain; + break; + case SI_PLATFORM: + kstr = platform; + break; + case SI_ISALIST: + kstr = isa_list; + break; + default: + kstr = NULL; + break; + } + + if (kstr != NULL) { + if ((strcnt = strlen(kstr)) >= count) { + getcnt = count - 1; + if (subyte(buf + count - 1, 0) < 0) + return (set_errno(EFAULT)); + } else + getcnt = strcnt + 1; + if (copyout(kstr, buf, getcnt)) + return (set_errno(EFAULT)); + return (strcnt + 1); + } + + switch (command) { + case SI_DHCP_CACHE: + { + char *tmp; + + if (dhcack == NULL) { + tmp = ""; + strcnt = 0; + } else { + /* + * If the interface name has not yet been resolved + * (first IFNAMSIZ bytes of dhcack[]) and a valid + * netdev_path[] was stashed by loadrootmodules in + * swapgeneric.c, resolve the interface name now. + */ + if (dhcack[0] == '\0' && + netdev_path != NULL && netdev_path[0] != '\0') { + get_netif_name(netdev_path, dhcack); + } + + tmp = dhcack; + strcnt = IFNAMSIZ + strlen(&tmp[IFNAMSIZ]); + } + + getcnt = (strcnt >= count) ? count : strcnt + 1; + + if (copyout(tmp, buf, getcnt)) { + error = EFAULT; + break; + } + + if (strcnt >= count && subyte((buf + count - 1), 0) < 0) { + error = EFAULT; + break; + } + + return (strcnt + 1); + } + + case SI_SET_HOSTNAME: + { + size_t len; + char name[SYS_NMLN]; + char *name_to_use; + + if ((error = secpolicy_systeminfo(CRED())) != 0) + break; + + name_to_use = uts_nodename(); + if ((error = copyinstr(buf, name, SYS_NMLN, &len)) != 0) + break; + + /* + * Must be non-NULL string and string + * must be less than SYS_NMLN chars. + */ + if (len < 2 || (len == SYS_NMLN && name[SYS_NMLN-1] != '\0')) { + error = EINVAL; + break; + } + + /* + * Copy the name into the relevant zone's nodename. 
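+ * (uts_nodename() returns utsname.nodename only for the
+ * global zone, so a non-global zone sets its own nodename
+ * rather than the system-wide one.)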
+ */ + (void) strcpy(name_to_use, name); + + /* + * Notify other interested parties that the nodename was set + */ + if (name_to_use == utsname.nodename) /* global zone nodename */ + nodename_set(); + + return (len); + } + + case SI_SET_SRPC_DOMAIN: + { + char name[SYS_NMLN]; + size_t len; + + if ((error = secpolicy_systeminfo(CRED())) != 0) + break; + if ((error = copyinstr(buf, name, SYS_NMLN, &len)) != 0) + break; + /* + * If string passed in is longer than length + * allowed for domain name, fail. + */ + if (len == SYS_NMLN && name[SYS_NMLN-1] != '\0') { + error = EINVAL; + break; + } + + (void) strcpy(curproc->p_zone->zone_domain, name); + return (len); + } + + default: + error = EINVAL; + break; + } + + return (set_errno(error)); +} + +/* + * i_path_find_node: Internal routine used by path_to_devinfo + * to locate a given nodeid in the device tree. + */ +struct i_path_findnode { + dnode_t nodeid; + dev_info_t *dip; +}; + +static int +i_path_find_node(dev_info_t *dev, void *arg) +{ + struct i_path_findnode *f = (struct i_path_findnode *)arg; + + + if (ddi_get_nodeid(dev) == (int)f->nodeid) { + f->dip = dev; + return (DDI_WALK_TERMINATE); + } + return (DDI_WALK_CONTINUE); +} + +/* + * Return the devinfo node to a boot device + */ +static dev_info_t * +path_to_devinfo(char *path) +{ + struct i_path_findnode fn; + extern dev_info_t *top_devinfo; + + /* + * Get the nodeid of the given pathname, if such a mapping exists. + */ + fn.dip = NULL; + fn.nodeid = prom_finddevice(path); + if (fn.nodeid != OBP_BADNODE) { + /* + * Find the nodeid in our copy of the device tree and return + * whatever name we used to bind this node to a driver. + */ + ddi_walk_devs(top_devinfo, i_path_find_node, (void *)(&fn)); + } + + return (fn.dip); +} + +/* + * Determine the network interface name from the device path argument. + */ +static void +get_netif_name(char *devname, char *ifname) +{ + dev_info_t *dip; + major_t ndev; + char *name; + int unit; + + dip = path_to_devinfo(devname); + if (dip == NULL) { + cmn_err(CE_WARN, "get_netif_name: " + "can't bind driver for '%s'\n", devname); + return; + } + + ndev = ddi_driver_major(dip); + if (ndev == -1) { + cmn_err(CE_WARN, "get_netif_name: " + "no driver bound to '%s'\n", devname); + return; + } + + name = ddi_major_to_name(ndev); + if (name == NULL) { + cmn_err(CE_WARN, "get_netif_name: " + "no name for major number %d\n", ndev); + return; + } + + unit = i_ddi_devi_get_ppa(dip); + if (unit < 0) { + cmn_err(CE_WARN, "get_netif_name: " + "illegal unit number %d\n", unit); + return; + } + + (void) snprintf(ifname, IFNAMSIZ, "%s%d", name, unit); +} diff --git a/usr/src/uts/common/syscall/tasksys.c b/usr/src/uts/common/syscall/tasksys.c new file mode 100644 index 0000000000..10b7e95c76 --- /dev/null +++ b/usr/src/uts/common/syscall/tasksys.c @@ -0,0 +1,266 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * System calls for creating and inquiring about tasks and projects + */ + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/task.h> +#include <sys/systm.h> +#include <sys/project.h> +#include <sys/cpuvar.h> +#include <sys/policy.h> +#include <sys/zone.h> + +/* + * Limit projlist to 256k projects. + */ +#define MAX_PROJLIST_BUFSIZE 1048576 + +typedef struct projlist_walk { + projid_t *pw_buf; + size_t pw_bufsz; +} projlist_walk_t; + + +/* + * taskid_t tasksys_settaskid(projid_t projid, uint_t flags); + * + * Overview + * Place the calling process in a new task if sufficiently privileged. If the + * present task is finalized, the process may not create a new task. + * + * Return values + * 0 on success, errno on failure. + */ +static long +tasksys_settaskid(projid_t projid, uint_t flags) +{ + proc_t *p = ttoproc(curthread); + kproject_t *oldpj; + kproject_t *kpj; + task_t *tk, *oldtk; + rctl_entity_p_t e; + zone_t *zone; + int rctlfail = 0; + + if (secpolicy_tasksys(CRED()) != 0) + return (set_errno(EPERM)); + + if (projid < 0 || projid > MAXPROJID) + return (set_errno(EINVAL)); + + if (flags & ~TASK_FINAL) + return (set_errno(EINVAL)); + + mutex_enter(&pidlock); + if (p->p_task->tk_flags & TASK_FINAL) { + mutex_exit(&pidlock); + return (set_errno(EACCES)); + } + mutex_exit(&pidlock); + + /* + * Try to stop all other lwps in the process while we're changing + * our project. This way, curthread doesn't need to grab its own + * thread_lock to find its project ID (see curprojid()). If this + * is the /proc agent lwp, we know that the other lwps are already + * held. If we failed to hold all lwps, bail out and return EINTR. + */ + if (curthread != p->p_agenttp && !holdlwps(SHOLDFORK1)) + return (set_errno(EINTR)); + /* + * Put a hold on our new project and make sure that nobody is + * trying to bind it to a pool while we're joining. + */ + kpj = project_hold_by_id(projid, getzoneid(), PROJECT_HOLD_INSERT); + e.rcep_p.proj = kpj; + e.rcep_t = RCENTITY_PROJECT; + + mutex_enter(&p->p_lock); + oldpj = p->p_task->tk_proj; + zone = p->p_zone; + + mutex_enter(&zone->zone_nlwps_lock); + + if (kpj->kpj_nlwps + p->p_lwpcnt > kpj->kpj_nlwps_ctl) + if (rctl_test_entity(rc_project_nlwps, kpj->kpj_rctls, p, &e, + p->p_lwpcnt, 0) & RCT_DENY) + rctlfail = 1; + + if (kpj->kpj_ntasks + 1 > kpj->kpj_ntasks_ctl) + if (rctl_test_entity(rc_project_ntasks, kpj->kpj_rctls, p, &e, + 1, 0) & RCT_DENY) + rctlfail = 1; + + if (rctlfail) { + mutex_exit(&zone->zone_nlwps_lock); + if (curthread != p->p_agenttp) + continuelwps(p); + mutex_exit(&p->p_lock); + return (set_errno(EAGAIN)); + } + kpj->kpj_nlwps += p->p_lwpcnt; + kpj->kpj_ntasks++; + + oldpj->kpj_nlwps -= p->p_lwpcnt; + + mutex_exit(&zone->zone_nlwps_lock); + mutex_exit(&p->p_lock); + + mutex_enter(&kpj->kpj_poolbind); + tk = task_create(projid, curproc->p_zone); + mutex_enter(&cpu_lock); + /* + * Returns with p_lock held. 
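+ * (Hence the continuelwps() call and the mutex_exit() of
+ * p_lock that immediately follow task_join() below.)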
+ */ + oldtk = task_join(tk, flags); + if (curthread != p->p_agenttp) + continuelwps(p); + mutex_exit(&p->p_lock); + mutex_exit(&cpu_lock); + mutex_exit(&kpj->kpj_poolbind); + task_rele(oldtk); + project_rele(kpj); + return (tk->tk_tkid); +} + +/* + * taskid_t tasksys_gettaskid(void); + * + * Overview + * Return the current task ID for this process. + * + * Return value + * The ID for the task to which the current process belongs. + */ +static long +tasksys_gettaskid() +{ + long ret; + proc_t *p = ttoproc(curthread); + + mutex_enter(&pidlock); + ret = p->p_task->tk_tkid; + mutex_exit(&pidlock); + return (ret); +} + +/* + * projid_t tasksys_getprojid(void); + * + * Overview + * Return the current project ID for this process. + * + * Return value + * The ID for the project to which the current process belongs. + */ +static long +tasksys_getprojid() +{ + long ret; + proc_t *p = ttoproc(curthread); + + mutex_enter(&pidlock); + ret = p->p_task->tk_proj->kpj_id; + mutex_exit(&pidlock); + return (ret); +} + +static int +tasksys_projlist_cb(kproject_t *kp, void *buf) +{ + projlist_walk_t *pw = (projlist_walk_t *)buf; + + if (pw && pw->pw_bufsz >= sizeof (projid_t)) { + *pw->pw_buf = kp->kpj_id; + pw->pw_buf++; + pw->pw_bufsz -= sizeof (projid_t); + } + + return (0); +} + +/* + * long tasksys_projlist(void *buf, size_t bufsz) + * + * Overview + * Return a buffer containing the project IDs of all currently active projects + * in the current zone. + * + * Return values + * The minimum size of a buffer sufficiently large to contain all of the + * active project IDs, or -1 if an error occurs during copyout. + */ +static long +tasksys_projlist(void *buf, size_t bufsz) +{ + long ret = 0; + projlist_walk_t pw; + void *kbuf; + + if (buf == NULL || bufsz == 0) + return (project_walk_all(getzoneid(), tasksys_projlist_cb, + NULL)); + + if (bufsz > MAX_PROJLIST_BUFSIZE) + return (set_errno(ENOMEM)); + + kbuf = pw.pw_buf = kmem_zalloc(bufsz, KM_SLEEP); + pw.pw_bufsz = bufsz; + + ret = project_walk_all(getzoneid(), tasksys_projlist_cb, &pw); + + if (copyout(kbuf, buf, bufsz) == -1) + ret = set_errno(EFAULT); + + kmem_free(kbuf, bufsz); + return (ret); +} + +long +tasksys(int code, projid_t projid, uint_t flags, void *projidbuf, size_t pbufsz) +{ + switch (code) { + case 0: + return (tasksys_settaskid(projid, flags)); + case 1: + return (tasksys_gettaskid()); + case 2: + return (tasksys_getprojid()); + case 3: + return (tasksys_projlist(projidbuf, pbufsz)); + default: + return (set_errno(EINVAL)); + } +} diff --git a/usr/src/uts/common/syscall/time.c b/usr/src/uts/common/syscall/time.c new file mode 100644 index 0000000000..ccca2f5847 --- /dev/null +++ b/usr/src/uts/common/syscall/time.c @@ -0,0 +1,80 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 1994-2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All rights reserved. */ + + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/tuneable.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/time.h> +#include <sys/debug.h> +#include <sys/policy.h> + +time_t +gtime(void) +{ + return (gethrestime_sec()); +} + +int +stime(time_t time) +{ + timestruc_t ts; + + if (secpolicy_settime(CRED()) != 0) + return (set_errno(EPERM)); + + if (time < 0) + return (set_errno(EINVAL)); + + ts.tv_sec = time; + ts.tv_nsec = 0; + mutex_enter(&tod_lock); + tod_set(ts); + set_hrestime(&ts); + mutex_exit(&tod_lock); + + return (0); +} + +#if defined(_SYSCALL32_IMPL) +int +stime32(time32_t time) +{ + if (time < 0) + return (set_errno(EINVAL)); + + return (stime((time_t)time)); +} +#endif diff --git a/usr/src/uts/common/syscall/times.c b/usr/src/uts/common/syscall/times.c new file mode 100644 index 0000000000..cefa942d57 --- /dev/null +++ b/usr/src/uts/common/syscall/times.c @@ -0,0 +1,103 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/tuneable.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/time.h> +#include <sys/times.h> +#include <sys/debug.h> +#include <sys/msacct.h> + +/* + * Return system and user times. + */ + +clock_t +times(struct tms *tp) +{ + proc_t *p = ttoproc(curthread); + struct tms p_time; + clock_t ret_lbolt; + + mutex_enter(&p->p_lock); + p_time.tms_utime = (clock_t)NSEC_TO_TICK( + mstate_aggr_state(p, LMS_USER)); + p_time.tms_stime = (clock_t)NSEC_TO_TICK( + mstate_aggr_state(p, LMS_SYSTEM)); + p_time.tms_cutime = p->p_cutime; + p_time.tms_cstime = p->p_cstime; + mutex_exit(&p->p_lock); + + if (copyout(&p_time, tp, sizeof (p_time))) + return (set_errno(EFAULT)); + + ret_lbolt = lbolt; + + return (ret_lbolt == -1 ? 
0 : ret_lbolt); +} + +#ifdef _SYSCALL32_IMPL + +/* + * We deliberately -don't- return EOVERFLOW on type overflow, + * since the 32-bit kernel simply wraps 'em around. + */ +clock32_t +times32(struct tms32 *tp) +{ + proc_t *p = ttoproc(curthread); + struct tms32 p_time; + clock32_t ret_lbolt; + + mutex_enter(&p->p_lock); + p_time.tms_utime = (clock32_t)NSEC_TO_TICK( + mstate_aggr_state(p, LMS_USER)); + p_time.tms_stime = (clock32_t)NSEC_TO_TICK( + mstate_aggr_state(p, LMS_SYSTEM)); + p_time.tms_cutime = (clock32_t)p->p_cutime; + p_time.tms_cstime = (clock32_t)p->p_cstime; + mutex_exit(&p->p_lock); + + if (copyout(&p_time, tp, sizeof (p_time))) + return (set_errno(EFAULT)); + + ret_lbolt = (clock32_t)lbolt; + + return (ret_lbolt == (clock32_t)-1 ? 0 : ret_lbolt); +} + +#endif /* _SYSCALL32_IMPL */ diff --git a/usr/src/uts/common/syscall/uadmin.c b/usr/src/uts/common/syscall/uadmin.c new file mode 100644 index 0000000000..31a3ff0a10 --- /dev/null +++ b/usr/src/uts/common/syscall/uadmin.c @@ -0,0 +1,373 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/swap.h> +#include <sys/file.h> +#include <sys/proc.h> +#include <sys/var.h> +#include <sys/uadmin.h> +#include <sys/signal.h> +#include <sys/time.h> +#include <vm/seg_kmem.h> +#include <sys/modctl.h> +#include <sys/callb.h> +#include <sys/dumphdr.h> +#include <sys/debug.h> +#include <sys/ftrace.h> +#include <sys/cmn_err.h> +#include <sys/panic.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/policy.h> +#include <sys/zone.h> + +/* + * Administrivia system call. We provide this in two flavors: one for calling + * from the system call path (uadmin), and the other for calling from elsewhere + * within the kernel (kadmin). Callers must beware that certain uadmin cmd + * values (specifically A_SWAPCTL) are only supported by uadmin and not kadmin. + */ + +extern ksema_t fsflush_sema; +kmutex_t ualock; + + +/* + * Kill all user processes in said zone. A special argument of ALL_ZONES is + * passed in when the system as a whole is shutting down. The lack of per-zone + * process lists is likely to make the following a performance bottleneck on a + * system with many zones. + */ +void +killall(zoneid_t zoneid) +{ + proc_t *p; + + ASSERT(zoneid != GLOBAL_ZONEID); + /* + * Kill all processes except kernel daemons and ourself. 
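+ * (Kernel daemons are identified below by p_exec == NULLVP
+ * or p_as == &kas and are skipped.)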
+ * Make a first pass to stop all processes so they won't
+ * be trying to restart children as we kill them.
+ */
+ mutex_enter(&pidlock);
+ for (p = practive; p != NULL; p = p->p_next) {
+ if ((zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid) &&
+ p->p_exec != NULLVP && /* kernel daemons */
+ p->p_as != &kas &&
+ p->p_stat != SZOMB) {
+ mutex_enter(&p->p_lock);
+ p->p_flag |= SNOWAIT;
+ sigtoproc(p, NULL, SIGSTOP);
+ mutex_exit(&p->p_lock);
+ }
+ }
+ p = practive;
+ while (p != NULL) {
+ if ((zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid) &&
+ p->p_exec != NULLVP && /* kernel daemons */
+ p->p_as != &kas &&
+ p->p_stat != SIDL &&
+ p->p_stat != SZOMB) {
+ mutex_enter(&p->p_lock);
+ if (sigismember(&p->p_sig, SIGKILL)) {
+ mutex_exit(&p->p_lock);
+ p = p->p_next;
+ } else {
+ sigtoproc(p, NULL, SIGKILL);
+ mutex_exit(&p->p_lock);
+ (void) cv_timedwait(&p->p_srwchan_cv,
+ &pidlock, lbolt + hz);
+ p = practive;
+ }
+ } else {
+ p = p->p_next;
+ }
+ }
+ mutex_exit(&pidlock);
+}
+
+int
+kadmin(int cmd, int fcn, void *mdep, cred_t *credp)
+{
+ int error = 0;
+ int locked = 0;
+ char *buf;
+ size_t buflen = 0;
+
+ /*
+ * We might be called directly by the kernel's fault-handling code, so
+ * we can't assert that the caller is in the global zone.
+ */
+
+ /*
+ * Make sure that cmd is one of the valid <sys/uadmin.h> command codes
+ * and that we have appropriate privileges for this action.
+ */
+ switch (cmd) {
+ case A_FTRACE:
+ case A_SHUTDOWN:
+ case A_REBOOT:
+ case A_REMOUNT:
+ case A_FREEZE:
+ case A_DUMP:
+ if (secpolicy_sys_config(credp, B_FALSE) != 0)
+ return (EPERM);
+ break;
+
+ default:
+ return (EINVAL);
+ }
+
+ /*
+ * Serialize these operations on ualock. If it is held, just return
+ * as if successful since the system will soon reset or remount.
+ */
+ if (cmd == A_SHUTDOWN || cmd == A_REBOOT || cmd == A_REMOUNT) {
+ if (!mutex_tryenter(&ualock))
+ return (0);
+ locked = 1;
+ }
+
+ switch (cmd) {
+ case A_SHUTDOWN:
+ {
+ proc_t *p = ttoproc(curthread);
+
+ /*
+ * Release (almost) all of our own resources if we are called
+ * from a user context; however, if we are calling kadmin() from
+ * a kernel context then we do not release these resources.
+ */
+ if (ttoproc(curthread) != &p0) {
+ if ((error = exitlwps(0)) != 0)
+ return (error);
+ mutex_enter(&p->p_lock);
+ p->p_flag |= SNOWAIT;
+ sigfillset(&p->p_ignore);
+ curthread->t_lwp->lwp_cursig = 0;
+ curthread->t_lwp->lwp_extsig = 0;
+ if (p->p_exec) {
+ vnode_t *exec_vp = p->p_exec;
+ p->p_exec = NULLVP;
+ mutex_exit(&p->p_lock);
+ VN_RELE(exec_vp);
+ } else {
+ mutex_exit(&p->p_lock);
+ }
+
+ pollcleanup();
+ closeall(P_FINFO(curproc));
+ relvm();
+
+ } else {
+ /*
+ * Reset t_cred if not set because much of the
+ * filesystem code depends on CRED() being valid.
+ */
+ if (curthread->t_cred == NULL)
+ curthread->t_cred = kcred;
+ }
+
+ /*
+ * Communicate that init shouldn't be restarted.
+ */
+ zone_shutdown_global();
+
+ killall(ALL_ZONES);
+ /*
+ * If we are calling kadmin() from a kernel context then we
+ * do not release these resources.
+ */
+ if (ttoproc(curthread) != &p0) {
+ VN_RELE(u.u_cdir);
+ if (u.u_rdir)
+ VN_RELE(u.u_rdir);
+ if (u.u_cwd)
+ refstr_rele(u.u_cwd);
+
+ u.u_cdir = rootdir;
+ u.u_rdir = NULL;
+ u.u_cwd = NULL;
+ }
+
+ /*
+ * Allow the reboot/halt/poweroff code a chance to do
+ * anything it needs to whilst we still have filesystems
+ * mounted, like loading any modules necessary for later
+ * performing the actual poweroff. 
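+ * (mdpreboot() is handed the same cmd/fcn/mdep triple that
+ * mdboot() receives later in the A_REBOOT path.)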
+ */
+ if ((mdep != NULL) && (*(char *)mdep == '/')) {
+ buf = i_convert_boot_device_name(mdep, NULL, &buflen);
+ mdpreboot(cmd, fcn, buf);
+ } else
+ mdpreboot(cmd, fcn, mdep);
+
+ /*
+ * Allow fsflush to finish running and then prevent it
+ * from ever running again so that vfs_unmountall() and
+ * vfs_syncall() can acquire the vfs locks they need.
+ */
+ sema_p(&fsflush_sema);
+ (void) callb_execute_class(CB_CL_UADMIN_PRE_VFS, NULL);
+
+ vfs_unmountall();
+ (void) VFS_MOUNTROOT(rootvfs, ROOT_UNMOUNT);
+ vfs_syncall();
+
+ (void) callb_execute_class(CB_CL_UADMIN_POST_VFS, NULL);
+ dump_ereports();
+ dump_messages();
+
+ /* FALLTHROUGH */
+ }
+
+ case A_REBOOT:
+ if ((mdep != NULL) && (*(char *)mdep == '/')) {
+ buf = i_convert_boot_device_name(mdep, NULL, &buflen);
+ mdboot(cmd, fcn, buf);
+ } else
+ mdboot(cmd, fcn, mdep);
+ /* no return expected */
+ break;
+
+ case A_REMOUNT:
+ (void) VFS_MOUNTROOT(rootvfs, ROOT_REMOUNT);
+ break;
+
+ case A_FREEZE:
+ {
+ /* XXX: declare in some header file */
+ extern int cpr(int);
+
+ if (modload("misc", "cpr") == -1)
+ return (ENOTSUP);
+ error = cpr(fcn);
+ break;
+ }
+
+ case A_FTRACE:
+ {
+ switch (fcn) {
+ case AD_FTRACE_START:
+ (void) FTRACE_START();
+ break;
+ case AD_FTRACE_STOP:
+ (void) FTRACE_STOP();
+ break;
+ default:
+ error = EINVAL;
+ }
+ break;
+ }
+
+ case A_DUMP:
+ {
+ if (fcn == AD_NOSYNC) {
+ in_sync = 1;
+ break;
+ }
+
+ panic_bootfcn = fcn;
+ panic_forced = 1;
+
+ if ((mdep != NULL) && (*(char *)mdep == '/')) {
+ panic_bootstr = i_convert_boot_device_name(mdep,
+ NULL, &buflen);
+ } else
+ panic_bootstr = mdep;
+
+ panic("forced crash dump initiated at user request");
+ /*NOTREACHED*/
+ }
+
+ default:
+ error = EINVAL;
+ }
+
+ if (locked)
+ mutex_exit(&ualock);
+
+ return (error);
+}
+
+int
+uadmin(int cmd, int fcn, uintptr_t mdep)
+{
+ int error = 0, rv = 0;
+ size_t nbytes = 0;
+ char buf[257];
+ cred_t *credp = CRED();
+
+ /*
+ * The swapctl system call doesn't have its own entry point: it uses
+ * uadmin as a wrapper so we just call it directly from here.
+ */
+ if (cmd == A_SWAPCTL) {
+ if (get_udatamodel() == DATAMODEL_NATIVE)
+ error = swapctl(fcn, (void *)mdep, &rv);
+#if defined(_SYSCALL32_IMPL)
+ else
+ error = swapctl32(fcn, (void *)mdep, &rv);
+#endif /* _SYSCALL32_IMPL */
+ return (error ? set_errno(error) : rv);
+ }
+
+ /*
+ * Handle zones.
+ */
+ if (getzoneid() != GLOBAL_ZONEID) {
+ error = zone_uadmin(cmd, fcn, credp);
+ return (error ? set_errno(error) : 0);
+ }
+
+ /*
+ * Certain subcommands interpret a non-NULL mdep value as a pointer to
+ * a boot string. Attempt to copy it in now, or reset mdep to NULL.
+ */
+ if (cmd == A_SHUTDOWN || cmd == A_REBOOT || cmd == A_DUMP) {
+ if (mdep != NULL && copyinstr((const char *)mdep, buf,
+ sizeof (buf) - 1, &nbytes) == 0) {
+ buf[nbytes] = '\0';
+ mdep = (uintptr_t)buf;
+ } else
+ mdep = NULL;
+ }
+
+ if ((error = kadmin(cmd, fcn, (void *)mdep, credp)) != 0)
+ return (set_errno(error));
+
+ return (0);
+}
diff --git a/usr/src/uts/common/syscall/ucredsys.c b/usr/src/uts/common/syscall/ucredsys.c
new file mode 100644
index 0000000000..16e4ce82b8
--- /dev/null
+++ b/usr/src/uts/common/syscall/ucredsys.c
@@ -0,0 +1,208 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/ucred.h> +#include <sys/file.h> +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/stream.h> +#include <sys/strsun.h> +#include <sys/stropts.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/cmn_err.h> +#include <sys/socket.h> +#include <sys/strsubr.h> +#include <c2/audit.h> + +/* + * Getpeerucred system call implementation. + */ +static int +getpeerucred(int fd, void *buf) +{ + file_t *fp; + struct ucred_s *uc; + vnode_t *vp; + k_peercred_t kpc; + int err; + int32_t rval; + + kpc.pc_cr = NULL; + kpc.pc_cpid = -1; + + if ((fp = getf(fd)) == NULL) + return (set_errno(EBADF)); + + vp = fp->f_vnode; + + switch (vp->v_type) { + case VFIFO: + case VSOCK: + err = VOP_IOCTL(vp, _I_GETPEERCRED, (intptr_t)&kpc, + FKIOCTL, CRED(), &rval); + break; + case VCHR: { + struct strioctl strioc; + + if (vp->v_stream == NULL) { + err = ENOTSUP; + break; + } + strioc.ic_cmd = _I_GETPEERCRED; + strioc.ic_timout = INFTIM; + strioc.ic_len = (int)sizeof (k_peercred_t); + strioc.ic_dp = (char *)&kpc; + + err = strdoioctl(vp->v_stream, &strioc, FNATIVE|FKIOCTL, + STR_NOSIG|K_TO_K, CRED(), &rval); + + /* + * Map all unexpected error codes to ENOTSUP. + */ + switch (err) { + case 0: + case ENOTSUP: + case ENOTCONN: + case ENOMEM: + break; + default: + err = ENOTSUP; + break; + } + break; + } + default: + err = ENOTSUP; + break; + } + releasef(fd); + + /* + * If someone gave us a credential, err will be 0. + */ + if (kpc.pc_cr != NULL) { + ASSERT(err == 0); + + uc = cred2ucred(kpc.pc_cr, kpc.pc_cpid, NULL); + + crfree(kpc.pc_cr); + + err = copyout(uc, buf, uc->uc_size); + + kmem_free(uc, uc->uc_size); + + if (err != 0) + return (set_errno(EFAULT)); + + return (0); + } + return (set_errno(err)); +} + +static int +ucred_get(pid_t pid, void *ubuf) +{ + proc_t *p; + cred_t *pcr; + int err; + struct ucred_s *uc; + + if (pid == P_MYID || pid == curproc->p_pid) { + pcr = CRED(); + crhold(pcr); + pid = curproc->p_pid; + } else { + cred_t *updcred = NULL; + + if (pid < 0) + return (set_errno(EINVAL)); + + if (audit_active) + updcred = cralloc(); + + mutex_enter(&pidlock); + p = prfind(pid); + + if (p == NULL) { + mutex_exit(&pidlock); + if (updcred != NULL) + crfree(updcred); + return (set_errno(ESRCH)); + } + + /* + * Assure that audit data in cred is up-to-date. + * updcred will be used or freed. 
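+ * (That is, audit_update_context() takes ownership of
+ * updcred, so no explicit crfree() is needed on this path.)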
+ */ + if (audit_active) + audit_update_context(p, updcred); + + err = priv_proc_cred_perm(CRED(), p, &pcr, VREAD); + mutex_exit(&pidlock); + + if (err != 0) + return (set_errno(err)); + } + + uc = cred2ucred(pcr, pid, NULL); + + crfree(pcr); + + err = copyout(uc, ubuf, uc->uc_size); + + kmem_free(uc, uc->uc_size); + + if (err) + return (set_errno(EFAULT)); + + return (0); +} + +int +ucredsys(int code, int obj, void *buf) +{ + switch (code) { + case UCREDSYS_UCREDGET: + return (ucred_get((pid_t)obj, buf)); + case UCREDSYS_GETPEERUCRED: + return (getpeerucred(obj, buf)); + default: + return (set_errno(EINVAL)); + } +} + +#ifdef _SYSCALL32_IMPL +int +ucredsys32(int arg1, int arg2, caddr32_t arg3) +{ + return (ucredsys(arg1, arg2, (void *)(uintptr_t)arg3)); +} +#endif diff --git a/usr/src/uts/common/syscall/uid.c b/usr/src/uts/common/syscall/uid.c new file mode 100644 index 0000000000..65bcabcaf0 --- /dev/null +++ b/usr/src/uts/common/syscall/uid.c @@ -0,0 +1,323 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/tuneable.h> +#include <sys/cred_impl.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/signal.h> +#include <sys/debug.h> +#include <sys/policy.h> +#include <sys/zone.h> + +int +setuid(uid_t uid) +{ + register proc_t *p; + int error; + int do_nocd = 0; + int uidchge = 0; + cred_t *cr, *newcr; + uid_t oldruid = uid; + zoneid_t zoneid = getzoneid(); + + if (uid < 0 || uid > MAXUID) + return (set_errno(EINVAL)); + + /* + * Need to pre-allocate the new cred structure before grabbing + * the p_crlock mutex. + */ + newcr = cralloc(); + + p = ttoproc(curthread); + +retry: + mutex_enter(&p->p_crlock); + cr = p->p_cred; + + if ((uid == cr->cr_ruid || uid == cr->cr_suid) && + secpolicy_allow_setid(cr, uid, B_TRUE) != 0) { + error = 0; + crcopy_to(cr, newcr); + p->p_cred = newcr; + newcr->cr_uid = uid; + } else if ((error = secpolicy_allow_setid(cr, uid, B_FALSE)) == 0) { + if (!uidchge && uid != cr->cr_ruid) { + /* + * The ruid of the process is going to change. In order + * to avoid a race condition involving the + * process-count associated with the newly given ruid, + * we increment the count before assigning the + * credential to the process. + * To do that, we'll have to take pidlock, so we first + * release p_crlock. 
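+ * (The matching upcount_dec() of the old ruid happens near
+ * the end of this function, once p_crlock has been dropped.)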
+ */ + mutex_exit(&p->p_crlock); + uidchge = 1; + mutex_enter(&pidlock); + upcount_inc(uid, zoneid); + mutex_exit(&pidlock); + /* + * As we released p_crlock we can't rely on the cr + * we read. So retry the whole thing. + */ + goto retry; + } + /* + * A privileged process that gives up its privilege + * must be marked to produce no core dump. + */ + if (cr->cr_uid != uid || + cr->cr_ruid != uid || + cr->cr_suid != uid) + do_nocd = 1; + oldruid = cr->cr_ruid; + crcopy_to(cr, newcr); + p->p_cred = newcr; + newcr->cr_ruid = uid; + newcr->cr_suid = uid; + newcr->cr_uid = uid; + ASSERT(uid != oldruid ? uidchge : 1); + } else + crfree(newcr); + + mutex_exit(&p->p_crlock); + + /* + * We decrement the number of processes associated with the oldruid + * to match the increment above, even if the ruid of the process + * did not change or an error occurred (oldruid == uid). + */ + if (uidchge) { + mutex_enter(&pidlock); + upcount_dec(oldruid, zoneid); + mutex_exit(&pidlock); + } + + if (error == 0) { + if (do_nocd) { + mutex_enter(&p->p_lock); + p->p_flag |= SNOCD; + mutex_exit(&p->p_lock); + } + crset(p, newcr); /* broadcast to process threads */ + return (0); + } + return (set_errno(error)); +} + +int64_t +getuid(void) +{ + rval_t r; + cred_t *cr; + + cr = curthread->t_cred; + r.r_val1 = cr->cr_ruid; + r.r_val2 = cr->cr_uid; + return (r.r_vals); +} + +int +seteuid(uid_t uid) +{ + register proc_t *p; + int error = EPERM; + int do_nocd = 0; + cred_t *cr, *newcr; + + if (uid < 0 || uid > MAXUID) + return (set_errno(EINVAL)); + + /* + * Need to pre-allocate the new cred structure before grabbing + * the p_crlock mutex. + */ + newcr = cralloc(); + p = ttoproc(curthread); + mutex_enter(&p->p_crlock); + cr = p->p_cred; + + if (uid == cr->cr_ruid || uid == cr->cr_uid || uid == cr->cr_suid || + (error = secpolicy_allow_setid(cr, uid, B_FALSE)) == 0) { + /* + * A privileged process that makes itself look like a + * set-uid process must be marked to produce no core dump, + * if the effective uid did changed. + */ + if (cr->cr_uid != uid && error == 0) + do_nocd = 1; + error = 0; + crcopy_to(cr, newcr); + p->p_cred = newcr; + newcr->cr_uid = uid; + } else + crfree(newcr); + + mutex_exit(&p->p_crlock); + + if (error == 0) { + if (do_nocd) { + mutex_enter(&p->p_lock); + p->p_flag |= SNOCD; + mutex_exit(&p->p_lock); + } + crset(p, newcr); /* broadcast to process threads */ + return (0); + } + return (set_errno(error)); +} + +/* + * Buy-back from SunOS 4.x + * + * Like setuid() and seteuid() combined -except- that non-root users + * can change cr_ruid to cr_uid, and the semantics of cr_suid are + * subtly different. + */ +int +setreuid(uid_t ruid, uid_t euid) +{ + proc_t *p; + int error = 0; + int do_nocd = 0; + int uidchge = 0; + uid_t oldruid = ruid; + cred_t *cr, *newcr; + zoneid_t zoneid = getzoneid(); + + if ((ruid != -1 && (ruid < 0 || ruid > MAXUID)) || + (euid != -1 && (euid < 0 || euid > MAXUID))) + return (set_errno(EINVAL)); + + /* + * Need to pre-allocate the new cred structure before grabbing + * the p_crlock mutex. 
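+ * (Same pattern as setuid() and seteuid() above: cralloc() may
+ * sleep for memory, so it is called before taking p_crlock
+ * rather than while holding it.)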
+ */ + newcr = cralloc(); + + p = ttoproc(curthread); + +retry: + mutex_enter(&p->p_crlock); + cr = p->p_cred; + + if (ruid != -1 && ruid != cr->cr_ruid && ruid != cr->cr_uid && + secpolicy_allow_setid(cr, ruid, B_FALSE) != 0) { + error = EPERM; + } else if (euid != -1 && + euid != cr->cr_ruid && euid != cr->cr_uid && + euid != cr->cr_suid && secpolicy_allow_setid(cr, euid, B_FALSE)) { + error = EPERM; + } else { + if (!uidchge && ruid != -1 && cr->cr_ruid != ruid) { + /* + * The ruid of the process is going to change. In order + * to avoid a race condition involving the + * process-count associated with the newly given ruid, + * we increment the count before assigning the + * credential to the process. + * To do that, we'll have to take pidlock, so we first + * release p_crlock. + */ + mutex_exit(&p->p_crlock); + uidchge = 1; + mutex_enter(&pidlock); + upcount_inc(ruid, zoneid); + mutex_exit(&pidlock); + /* + * As we released p_crlock we can't rely on the cr + * we read. So retry the whole thing. + */ + goto retry; + } + crhold(cr); + crcopy_to(cr, newcr); + p->p_cred = newcr; + + if (euid != -1) + newcr->cr_uid = euid; + if (ruid != -1) { + oldruid = newcr->cr_ruid; + newcr->cr_ruid = ruid; + ASSERT(ruid != oldruid ? uidchge : 1); + } + /* + * "If the real uid is being changed, or the effective uid is + * being changed to a value not equal to the real uid, the + * saved uid is set to the new effective uid." + */ + if (ruid != -1 || + (euid != -1 && newcr->cr_uid != newcr->cr_ruid)) + newcr->cr_suid = newcr->cr_uid; + /* + * A process that gives up its privilege + * must be marked to produce no core dump. + */ + if ((cr->cr_uid != newcr->cr_uid || + cr->cr_ruid != newcr->cr_ruid || + cr->cr_suid != newcr->cr_suid)) + do_nocd = 1; + + crfree(cr); + } + mutex_exit(&p->p_crlock); + + /* + * We decrement the number of processes associated with the oldruid + * to match the increment above, even if the ruid of the process + * did not change or an error occurred (oldruid == uid). + */ + if (uidchge) { + ASSERT(oldruid != -1 && ruid != -1); + mutex_enter(&pidlock); + upcount_dec(oldruid, zoneid); + mutex_exit(&pidlock); + } + + if (error == 0) { + if (do_nocd) { + mutex_enter(&p->p_lock); + p->p_flag |= SNOCD; + mutex_exit(&p->p_lock); + } + crset(p, newcr); /* broadcast to process threads */ + return (0); + } + crfree(newcr); + return (set_errno(error)); +} diff --git a/usr/src/uts/common/syscall/umask.c b/usr/src/uts/common/syscall/umask.c new file mode 100644 index 0000000000..e80d1de9a6 --- /dev/null +++ b/usr/src/uts/common/syscall/umask.c @@ -0,0 +1,49 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +#ident "%Z%%M% %I% %E% SMI" /* from SVr4.0 1.78 */ + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/user.h> +#include <sys/errno.h> +#include <sys/vnode.h> +#include <sys/debug.h> + +/* + * Mode mask for creation of files. + */ + +int +umask(int mask) +{ + register mode_t t; + + t = u.u_cmask; + u.u_cmask = (mode_t)(mask & PERMMASK); + return ((int)t); +} diff --git a/usr/src/uts/common/syscall/umount.c b/usr/src/uts/common/syscall/umount.c new file mode 100644 index 0000000000..f5fb881f5d --- /dev/null +++ b/usr/src/uts/common/syscall/umount.c @@ -0,0 +1,188 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/t_lock.h> +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/fstyp.h> +#include <sys/kmem.h> +#include <sys/systm.h> +#include <sys/mount.h> +#include <sys/vfs.h> +#include <sys/cred.h> +#include <sys/vnode.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/pathname.h> +#include <sys/policy.h> +#include <sys/zone.h> + + +/* + * New umount() system call (for force unmount flag and perhaps others later). + */ +int +umount2(char *pathp, int flag) +{ + struct pathname pn; + struct vfs *vfsp; + int error; + + /* + * Some flags are disallowed through the system call interface. + */ + flag &= MS_UMOUNT_MASK; + + /* + * Lookup user-supplied name by trying to match it against the + * mount points recorded at mount time. If no match is found + * (which can happen if the path to the mount point is specified + * differently between mount & umount, or if a block device were + * passed to umount) then we fall back to calling lookupname() + * to find the vfs. Doing it this way prevents calling lookupname() + * in most cases and that allows forcible umount to work even if + * lookupname() would hang (i.e. because an NFS server is dead). 
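+ *
+ * For example (an illustrative call, not code from this file),
+ * a forced unmount of a file system whose server has vanished:
+ *
+ *	umount2("/net/dead-server/export", MS_FORCE);
+ *
+ * can be satisfied by the vfs_mntpoint2vfsp() name match alone,
+ * without any blocking lookup on the dead file system.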
+ */
+
+ if (error = pn_get(pathp, UIO_USERSPACE, &pn))
+ return (set_errno(error));
+
+ /*
+ * Only a privileged user is allowed to bypass the security
+ * checks done by lookupname() and use the results from
+ * vfs_mntpoint2vfsp() instead. It could be argued that the
+ * proper check is FILE_DAC_SEARCH but we put it all
+ * under the mount privilege. Also, make sure the caller
+ * isn't in an environment with an alternate root (to the zone's root)
+ * directory, i.e. chroot(2).
+ */
+ if (secpolicy_fs_unmount(CRED(), NULL) != 0 ||
+ (PTOU(curproc)->u_rdir != NULL &&
+ PTOU(curproc)->u_rdir != curproc->p_zone->zone_rootvp) ||
+ (vfsp = vfs_mntpoint2vfsp(pn.pn_path)) == NULL) {
+ vnode_t *fsrootvp;
+
+ /* fall back to lookupname() on path given to us */
+ if (error = lookupname(pn.pn_path, UIO_SYSSPACE, FOLLOW,
+ NULLVPP, &fsrootvp)) {
+ pn_free(&pn);
+ return (set_errno(error));
+ }
+ /*
+ * Find the vfs to be unmounted. The caller may have specified
+ * either the directory mount point (preferred) or else (for a
+ * disk-based file system) the block device which was mounted.
+ * Check to see which it is; if it's the device, search the VFS
+ * list to find the associated vfs entry.
+ */
+ if (fsrootvp->v_flag & VROOT) {
+ vfsp = fsrootvp->v_vfsp;
+ VFS_HOLD(vfsp);
+ } else if (fsrootvp->v_type == VBLK)
+ vfsp = vfs_dev2vfsp(fsrootvp->v_rdev);
+ else
+ vfsp = NULL;
+
+ VN_RELE(fsrootvp);
+
+ if (vfsp == NULL) {
+ pn_free(&pn);
+ return (set_errno(EINVAL));
+ }
+ }
+ pn_free(&pn);
+
+ /*
+ * Protect the call to vn_vfswlock() with the vfs reflock. This
+ * ensures vfs_vnodecovered will either be NULL (because someone
+ * beat us to the umount) or valid (because vfs_lock() prevents
+ * another umount from getting through here until we've called
+ * vn_vfswlock() on the covered vnode).
+ *
+ * At one point, we did the non-blocking version (vfs_lock()),
+ * and if it failed, bailed out with EBUSY. However, dounmount()
+ * calls vfs_lock_wait() and we drop the vfs lock before calling
+ * dounmount(), so there's no difference between waiting here
+ * for the lock or waiting there, because dounmount() grabs it
+ * as soon as we drop it below. Not returning with EBUSY at this
+ * point reduces the number of spurious unmount failures that happen
+ * as a side-effect of fsflush() and other mount and unmount
+ * operations that might be going on simultaneously.
+ */
+ vfs_lock_wait(vfsp);
+
+ /*
+ * Call vn_vfswlock() on the covered vnode so that dounmount()
+ * can do its thing. It will call the corresponding vn_vfsunlock().
+ * Note that vfsp->vfs_vnodecovered can be NULL here, either because
+ * someone did umount on "/" or because someone beat us to the umount
+ * before we did the vfs_lock() above. In these cases, vn_vfswlock()
+ * returns EBUSY and we just pass that up. Also note that we're
+ * looking at a vnode without doing a VN_HOLD() on it. This is
+ * safe because it can't go away while something is mounted on it
+ * and we're locking out other umounts at this point.
+ */
+ if (vn_vfswlock(vfsp->vfs_vnodecovered)) {
+ vfs_unlock(vfsp);
+ VFS_RELE(vfsp);
+ return (set_errno(EBUSY));
+ }
+
+ /*
+ * Now that the VVFSLOCK in the covered vnode is protecting this
+ * path, we don't need the vfs reflock or the hold on the vfs anymore.
+ */
+ vfs_unlock(vfsp);
+ VFS_RELE(vfsp);
+
+ /*
+ * Perform the unmount.
+ */
+ if ((error = dounmount(vfsp, flag, CRED())) != 0)
+ return (set_errno(error));
+ return (0);
+}
+
+/*
+ * Old umount() system call for compatibility.
+ * Changes due to support for forced unmount. + */ +int +umount(char *pathp) +{ + return (umount2(pathp, 0)); +} diff --git a/usr/src/uts/common/syscall/uname.c b/usr/src/uts/common/syscall/uname.c new file mode 100644 index 0000000000..9a5a2608f7 --- /dev/null +++ b/usr/src/uts/common/syscall/uname.c @@ -0,0 +1,62 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/utsname.h> +#include <sys/debug.h> + +int +uname(struct utsname *buf) +{ + char *name_to_use = uts_nodename(); + + if (copyout(utsname.sysname, buf->sysname, strlen(utsname.sysname)+1)) { + return (set_errno(EFAULT)); + } + if (copyout(name_to_use, buf->nodename, strlen(name_to_use)+1)) { + return (set_errno(EFAULT)); + } + if (copyout(utsname.release, buf->release, strlen(utsname.release)+1)) { + return (set_errno(EFAULT)); + } + if (copyout(utsname.version, buf->version, strlen(utsname.version)+1)) { + return (set_errno(EFAULT)); + } + if (copyout(utsname.machine, buf->machine, strlen(utsname.machine)+1)) { + return (set_errno(EFAULT)); + } + return (1); /* XXX why 1 and not 0? 1003.1 says "non-negative" */ +} diff --git a/usr/src/uts/common/syscall/unlink.c b/usr/src/uts/common/syscall/unlink.c new file mode 100644 index 0000000000..d4b84c0272 --- /dev/null +++ b/usr/src/uts/common/syscall/unlink.c @@ -0,0 +1,111 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2001 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/isa_defs.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/vnode.h>
+#include <sys/uio.h>
+#include <sys/debug.h>
+#include <sys/file.h>
+#include <sys/fcntl.h>
+#include <c2/audit.h>
+
+/*
+ * Unlink (i.e. delete) a file.
+ */
+int
+unlink(char *fname)
+{
+ int error;
+
+ if (error = vn_remove(fname, UIO_USERSPACE, RMFILE))
+ return (set_errno(error));
+ return (0);
+}
+
+/*
+ * Unlink a file from a directory
+ */
+int
+unlinkat(int fd, char *name, int flags)
+{
+ file_t *dirfp;
+ vnode_t *dirvp;
+ int error;
+ char startchar;
+
+ if (fd == AT_FDCWD && name == NULL)
+ return (set_errno(EFAULT));
+
+ if (name != NULL) {
+ if (copyin(name, &startchar, sizeof (char)))
+ return (set_errno(EFAULT));
+ } else
+ startchar = '\0';
+
+ if (fd == AT_FDCWD) {
+ dirvp = NULL;
+ } else {
+ if (startchar != '/') {
+ if ((dirfp = getf(fd)) == NULL) {
+ return (set_errno(EBADF));
+ }
+ dirvp = dirfp->f_vnode;
+ VN_HOLD(dirvp);
+ releasef(fd);
+ } else {
+ dirvp = NULL;
+ }
+ }
+
+#ifdef C2_AUDIT
+ if (audit_active)
+ audit_setfsat_path(1);
+#endif /* C2_AUDIT */
+
+ error = vn_removeat(dirvp, name,
+ UIO_USERSPACE, (flags == AT_REMOVEDIR) ? RMDIRECTORY : RMFILE);
+ if (dirvp != NULL)
+ VN_RELE(dirvp);
+
+ if (error != 0)
+ return (set_errno(error));
+ return (0);
+}
diff --git a/usr/src/uts/common/syscall/utime.c b/usr/src/uts/common/syscall/utime.c
new file mode 100644
index 0000000000..b37681fe4c
--- /dev/null
+++ b/usr/src/uts/common/syscall/utime.c
@@ -0,0 +1,230 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/isa_defs.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/time.h> +#include <sys/debug.h> +#include <sys/model.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/pathname.h> +#include <c2/audit.h> + +extern int namesetattr(char *, enum symfollow, vattr_t *, int); +extern int fdsetattr(int, vattr_t *); + +static int +cfutimesat(int fd, char *fname, int nmflag, vattr_t *vap, int flags) +{ + + file_t *fp; + vnode_t *startvp, *vp; + int error; + char startchar; + + if (fd == AT_FDCWD && fname == NULL) + return (set_errno(EFAULT)); + + if (nmflag == 1 || (nmflag == 2 && fname != NULL)) { + if (copyin(fname, &startchar, sizeof (char))) + return (set_errno(EFAULT)); + } else + startchar = '\0'; + + if (fd == AT_FDCWD) + startvp = NULL; + else { + + /* + * is this absolute path? + */ + if (startchar != '/') { + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } + startvp = fp->f_vnode; + VN_HOLD(startvp); + releasef(fd); + } else { + startvp = NULL; + } + } + +#ifdef C2_AUDIT + if (audit_active) + audit_setfsat_path(1); +#endif /* C2_AUDIT */ + + if ((nmflag == 1) || ((nmflag == 2) && (fname != NULL))) { + if (error = lookupnameat(fname, UIO_USERSPACE, FOLLOW, + NULLVPP, &vp, startvp)) { + if (startvp != NULL) + VN_RELE(startvp); + return (set_errno(error)); + } + } else { + vp = startvp; + VN_HOLD(vp); + } + + if (startvp != NULL) { + VN_RELE(startvp); + } + + if (vn_is_readonly(vp)) { + error = EROFS; + } else { + error = VOP_SETATTR(vp, vap, flags, CRED(), NULL); + } + + VN_RELE(vp); + if (error != 0) + return (set_errno(error)); + else + return (0); +} + +static int +get_utimesvattr(struct timeval *tvptr, struct vattr *vattr, int *flags) +{ + struct timeval tv[2]; + + *flags = 0; + + if (tvptr != NULL) { + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyin(tvptr, tv, sizeof (tv))) + return (EFAULT); + } else { + struct timeval32 tv32[2]; + + if (copyin(tvptr, tv32, sizeof (tv32))) + return (EFAULT); + + TIMEVAL32_TO_TIMEVAL(&tv[0], &tv32[0]); + TIMEVAL32_TO_TIMEVAL(&tv[1], &tv32[1]); + } + + if (tv[0].tv_usec < 0 || tv[0].tv_usec >= 1000000 || + tv[1].tv_usec < 0 || tv[1].tv_usec >= 1000000) + return (EINVAL); + + vattr->va_atime.tv_sec = tv[0].tv_sec; + vattr->va_atime.tv_nsec = tv[0].tv_usec * 1000; + vattr->va_mtime.tv_sec = tv[1].tv_sec; + vattr->va_mtime.tv_nsec = tv[1].tv_usec * 1000; + *flags |= ATTR_UTIME; + } else { + gethrestime(&vattr->va_atime); + vattr->va_mtime = vattr->va_atime; + } + vattr->va_mask = AT_ATIME | AT_MTIME; + + return (0); +} +int +futimesat(int fd, char *fname, struct timeval *tvptr) +{ + struct vattr vattr; + int flags = 0; + int error; + + if ((error = get_utimesvattr(tvptr, &vattr, &flags)) != 0) + return (set_errno(error)); + + return (cfutimesat(fd, fname, 2, &vattr, flags)); +} +/* + * Set access/modify times on named file. 
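+ *
+ * A hedged userland sketch (struct utimbuf from <utime.h> is the
+ * two-time_t layout this handler copies in):
+ *
+ *	struct utimbuf tb;
+ *	tb.actime = tb.modtime = time(NULL);
+ *	(void) utime("/tmp/f", &tb);
+ *
+ * Passing a NULL pointer instead stamps both times with "now".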
+ */ +int +utime(char *fname, time_t *tptr) +{ + time_t tv[2]; + struct vattr vattr; + int flags = 0; + + if (tptr != NULL) { + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyin(tptr, tv, sizeof (tv))) + return (set_errno(EFAULT)); + } else { + time32_t tv32[2]; + + if (copyin(tptr, &tv32, sizeof (tv32))) + return (set_errno(EFAULT)); + + tv[0] = (time_t)tv32[0]; + tv[1] = (time_t)tv32[1]; + } + + vattr.va_atime.tv_sec = tv[0]; + vattr.va_atime.tv_nsec = 0; + vattr.va_mtime.tv_sec = tv[1]; + vattr.va_mtime.tv_nsec = 0; + flags |= ATTR_UTIME; + } else { + gethrestime(&vattr.va_atime); + vattr.va_mtime = vattr.va_atime; + } + + vattr.va_mask = AT_ATIME|AT_MTIME; + return (cfutimesat(AT_FDCWD, fname, 1, &vattr, flags)); +} + +/* + * SunOS4.1 Buyback: + * Set access/modify time on named file, with hi res timer + */ +int +utimes(char *fname, struct timeval *tvptr) +{ + struct vattr vattr; + int flags = 0; + int error; + + if ((error = get_utimesvattr(tvptr, &vattr, &flags)) != 0) + return (set_errno(error)); + + return (cfutimesat(AT_FDCWD, fname, 1, &vattr, flags)); +} diff --git a/usr/src/uts/common/syscall/utssys.c b/usr/src/uts/common/syscall/utssys.c new file mode 100644 index 0000000000..380df8e8fc --- /dev/null +++ b/usr/src/uts/common/syscall/utssys.c @@ -0,0 +1,954 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/inttypes.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/user.h> +#include <sys/errno.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/proc.h> +#include <sys/session.h> +#include <sys/var.h> +#include <sys/utsname.h> +#include <sys/utssys.h> +#include <sys/ustat.h> +#include <sys/statvfs.h> +#include <sys/kmem.h> +#include <sys/debug.h> +#include <sys/pathname.h> +#include <sys/modctl.h> +#include <sys/fs/snode.h> +#include <sys/sunldi_impl.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/cmn_err.h> +#include <sys/ddipropdefs.h> +#include <sys/ddi_impldefs.h> +#include <sys/modctl.h> +#include <sys/flock.h> +#include <sys/share.h> +#include <vm/as.h> +#include <vm/seg.h> +#include <vm/seg_vn.h> +#include <util/qsort.h> +#include <sys/zone.h> + +/* + * utssys() + */ +static int uts_fusers(char *, int, intptr_t); +static int _statvfs64_by_dev(dev_t, struct statvfs64 *); + +#if defined(_ILP32) || defined(_SYSCALL32_IMPL) + +static int utssys_uname32(caddr_t, rval_t *); +static int utssys_ustat32(dev_t, struct ustat32 *); + +int64_t +utssys32(void *buf, int arg, int type, void *outbp) +{ + int error; + rval_t rv; + + rv.r_vals = 0; + + switch (type) { + case UTS_UNAME: + /* + * This is an obsolete way to get the utsname structure + * (it only gives you the first 8 characters of each field!) + * uname(2) is the preferred and better interface. + */ + error = utssys_uname32(buf, &rv); + break; + case UTS_USTAT: + error = utssys_ustat32(expldev((dev32_t)arg), buf); + break; + case UTS_FUSERS: + error = uts_fusers(buf, arg, (intptr_t)outbp); + break; + default: + error = EINVAL; + break; + } + + return (error == 0 ? rv.r_vals : (int64_t)set_errno(error)); +} + +static int +utssys_uname32(caddr_t buf, rval_t *rvp) +{ + if (copyout(utsname.sysname, buf, 8)) + return (EFAULT); + buf += 8; + if (subyte(buf, 0) < 0) + return (EFAULT); + buf++; + if (copyout(uts_nodename(), buf, 8)) + return (EFAULT); + buf += 8; + if (subyte(buf, 0) < 0) + return (EFAULT); + buf++; + if (copyout(utsname.release, buf, 8)) + return (EFAULT); + buf += 8; + if (subyte(buf, 0) < 0) + return (EFAULT); + buf++; + if (copyout(utsname.version, buf, 8)) + return (EFAULT); + buf += 8; + if (subyte(buf, 0) < 0) + return (EFAULT); + buf++; + if (copyout(utsname.machine, buf, 8)) + return (EFAULT); + buf += 8; + if (subyte(buf, 0) < 0) + return (EFAULT); + rvp->r_val1 = 1; + return (0); +} + +static int +utssys_ustat32(dev_t dev, struct ustat32 *cbuf) +{ + struct ustat32 ust32; + struct statvfs64 stvfs; + fsblkcnt64_t fsbc64; + char *cp, *cp2; + int i, error; + + if ((error = _statvfs64_by_dev(dev, &stvfs)) != 0) + return (error); + + fsbc64 = stvfs.f_bfree * (stvfs.f_frsize / 512); + /* + * Check to see if the number of free blocks can be expressed + * in 31 bits or whether the number of free files is more than + * can be expressed in 32 bits and is not -1 (UINT64_MAX). NFS + * Version 2 does not support the number of free files and + * hence will return -1. -1, when translated from a 32 bit + * quantity to an unsigned 64 bit quantity, turns into UINT64_MAX. 
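+ *
+ * (Concretely: in 512-byte units, INT32_MAX is just under 1 TB
+ * of free space, so a ustat() of a sufficiently large file
+ * system fails here with EOVERFLOW rather than silently
+ * truncating the count.)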
+ */ + if (fsbc64 > INT32_MAX || + (stvfs.f_ffree > UINT32_MAX && stvfs.f_ffree != UINT64_MAX)) + return (EOVERFLOW); + + ust32.f_tfree = (daddr32_t)fsbc64; + ust32.f_tinode = (ino32_t)stvfs.f_ffree; + + cp = stvfs.f_fstr; + cp2 = ust32.f_fname; + i = 0; + while (i++ < sizeof (ust32.f_fname)) + if (*cp != '\0') + *cp2++ = *cp++; + else + *cp2++ = '\0'; + while (*cp != '\0' && + (i++ < sizeof (stvfs.f_fstr) - sizeof (ust32.f_fpack))) + cp++; + (void) strncpy(ust32.f_fpack, cp + 1, sizeof (ust32.f_fpack)); + + if (copyout(&ust32, cbuf, sizeof (ust32))) + return (EFAULT); + return (0); +} + +#endif /* _ILP32 || _SYSCALL32_IMPL */ + +#ifdef _LP64 + +static int uts_ustat64(dev_t, struct ustat *); + +int64_t +utssys64(void *buf, long arg, int type, void *outbp) +{ + int error; + rval_t rv; + + rv.r_vals = 0; + + switch (type) { + case UTS_USTAT: + error = uts_ustat64((dev_t)arg, buf); + break; + case UTS_FUSERS: + error = uts_fusers(buf, (int)arg, (intptr_t)outbp); + break; + default: + error = EINVAL; + break; + } + + return (error == 0 ? rv.r_vals : (int64_t)set_errno(error)); +} + +static int +uts_ustat64(dev_t dev, struct ustat *cbuf) +{ + struct ustat ust; + struct statvfs64 stvfs; + fsblkcnt64_t fsbc64; + char *cp, *cp2; + int i, error; + + if ((error = _statvfs64_by_dev(dev, &stvfs)) != 0) + return (error); + + fsbc64 = stvfs.f_bfree * (stvfs.f_frsize / 512); + ust.f_tfree = (daddr_t)fsbc64; + ust.f_tinode = (ino_t)stvfs.f_ffree; + + cp = stvfs.f_fstr; + cp2 = ust.f_fname; + i = 0; + while (i++ < sizeof (ust.f_fname)) + if (*cp != '\0') + *cp2++ = *cp++; + else + *cp2++ = '\0'; + while (*cp != '\0' && + (i++ < sizeof (stvfs.f_fstr) - sizeof (ust.f_fpack))) + cp++; + (void) strncpy(ust.f_fpack, cp + 1, sizeof (ust.f_fpack)); + + if (copyout(&ust, cbuf, sizeof (ust))) + return (EFAULT); + return (0); +} + +#endif /* _LP64 */ + +/* + * Utility routine for the ustat implementations. + * (If it wasn't for the 'find-by-dev_t' semantic of ustat(2), we could push + * this all out into userland, sigh.) + */ +static int +_statvfs64_by_dev(dev_t dev, struct statvfs64 *svp) +{ + vfs_t *vfsp; + int error; + + if ((vfsp = vfs_dev2vfsp(dev)) == NULL) { + /* + * See if it's the root of our zone. + */ + vfsp = curproc->p_zone->zone_rootvp->v_vfsp; + if (vfsp->vfs_dev == dev) { + VFS_HOLD(vfsp); + } else { + vfsp = NULL; + } + } + if (vfsp == NULL) + return (EINVAL); + error = VFS_STATVFS(vfsp, svp); + VFS_RELE(vfsp); + return (error); +} + +/* + * Check if this pid has an NBMAND lock or share reservation + * on this vp. llp is a snapshoted list of all NBMAND locks + * set by this pid. Return 1 if there is an NBMAND lock else + * return 0. + */ +static int +proc_has_nbmand_on_vp(vnode_t *vp, pid_t pid, locklist_t *llp) +{ + /* + * Any NBMAND lock held by the process on this vp? + */ + while (llp) { + if (llp->ll_vp == vp) { + return (1); + } + llp = llp->ll_next; + } + /* + * Any NBMAND share reservation on the vp for this process? 
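+ *
+ * (NBMAND refers to non-blocking mandatory locking, enabled by
+ * the "nbmand" mount option; conflicting opens and I/O fail
+ * rather than block, which is why such holders are called out
+ * specially in fuser-style reporting.)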
+ */
+ return (proc_has_nbmand_share_on_vp(vp, pid));
+}
+
+static fu_data_t *
+dofusers(vnode_t *fvp, int flags)
+{
+ fu_data_t *fu_data;
+ proc_t *prp;
+ vfs_t *cvfsp;
+ pid_t npids, pidx, *pidlist;
+ int v_proc = v.v_proc; /* max # of procs */
+ int pcnt = 0;
+ int contained = (flags & F_CONTAINED);
+ int nbmandonly = (flags & F_NBMANDLIST);
+ int dip_usage = (flags & F_DEVINFO);
+ int fvp_isdev = vn_matchops(fvp, spec_getvnodeops());
+ zone_t *zone = curproc->p_zone;
+ int inglobal = INGLOBALZONE(curproc);
+
+ /* get a pointer to the file system containing this vnode */
+ cvfsp = fvp->v_vfsp;
+ ASSERT(cvfsp);
+
+ /* allocate the data structure to return our results in */
+ fu_data = kmem_alloc(fu_data_size(v_proc), KM_SLEEP);
+ fu_data->fud_user_max = v_proc;
+ fu_data->fud_user_count = 0;
+
+ /* get a snapshot of all the pids we're going to check out */
+ pidlist = kmem_alloc(v_proc * sizeof (pid_t), KM_SLEEP);
+ mutex_enter(&pidlock);
+ for (npids = 0, prp = practive; prp != NULL; prp = prp->p_next) {
+ if (inglobal || prp->p_zone == zone)
+ pidlist[npids++] = prp->p_pid;
+ }
+ mutex_exit(&pidlock);
+
+ /* grab each process and check its file usage */
+ for (pidx = 0; pidx < npids; pidx++) {
+ locklist_t *llp = NULL;
+ uf_info_t *fip;
+ vnode_t *vp;
+ user_t *up;
+ sess_t *sp;
+ uid_t uid;
+ pid_t pid = pidlist[pidx];
+ int i, use_flag = 0;
+
+ /*
+ * grab prp->p_lock using sprlock();
+ * if sprlock() fails, the process does not exist anymore
+ */
+ prp = sprlock(pid);
+ if (prp == NULL)
+ continue;
+
+ /* get the process's credential info in case we need it */
+ mutex_enter(&prp->p_crlock);
+ uid = crgetruid(prp->p_cred);
+ mutex_exit(&prp->p_crlock);
+
+ /*
+ * it's safe to drop p_lock here because we
+ * called sprlock() before and it set the SPRLOCK
+ * flag for the process so it won't go away.
+ */
+ mutex_exit(&prp->p_lock);
+
+ /*
+ * now we want to walk a process's open file descriptors;
+ * to do this we need to grab the fip->fi_lock. (you
+ * can't hold p_lock when grabbing the fip->fi_lock.)
+ */
+ fip = P_FINFO(prp);
+ mutex_enter(&fip->fi_lock);
+
+ /*
+ * Snapshot nbmand locks for pid
+ */
+ llp = flk_active_nbmand_locks(prp->p_pid);
+ for (i = 0; i < fip->fi_nfiles; i++) {
+ uf_entry_t *ufp;
+ file_t *fp;
+
+ UF_ENTER(ufp, fip, i);
+ if (((fp = ufp->uf_file) == NULL) ||
+ ((vp = fp->f_vnode) == NULL)) {
+ UF_EXIT(ufp);
+ continue;
+ }
+
+ /*
+ * if the target file (fvp) is not a device
+ * and corresponds to the root of a filesystem
+ * (cvfsp), then check if it contains the file
+ * in use by this process (vp).
+ */
+ if (contained && (vp->v_vfsp == cvfsp))
+ use_flag |= F_OPEN;
+
+ /*
+ * if the target file (fvp) is not a device,
+ * then check if it matches the file in use
+ * by this process (vp).
+ */
+ if (!fvp_isdev && VN_CMP(fvp, vp))
+ use_flag |= F_OPEN;
+
+ /*
+ * if the target file (fvp) is a device,
+ * then check if the current file in use
+ * by this process (vp) maps to the same device
+ * minor node.
+ */
+ if (fvp_isdev &&
+ vn_matchops(vp, spec_getvnodeops()) &&
+ (fvp->v_rdev == vp->v_rdev))
+ use_flag |= F_OPEN;
+
+ /*
+ * if the target file (fvp) is a device,
+ * and we're checking for device instance
+ * usage, then check if the current file in use
+ * by this process (vp) maps to the same device
+ * instance.
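+ *
+ * (To summarize the four F_OPEN checks in this loop: for a
+ * non-device target, a file matches on VN_CMP() identity or,
+ * with F_CONTAINED, on residing in cvfsp; for a device target,
+ * it matches on the same minor node (v_rdev) or, with
+ * F_DEVINFO, on the same device instance (s_dip).)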
+ */
+ if (dip_usage &&
+ vn_matchops(vp, spec_getvnodeops()) &&
+ (VTOCS(fvp)->s_dip == VTOCS(vp)->s_dip))
+ use_flag |= F_OPEN;
+
+ /*
+ * if the current file in use by this process (vp)
+ * doesn't match what we're looking for, move on
+ * to the next file in the process.
+ */
+ if ((use_flag & F_OPEN) == 0) {
+ UF_EXIT(ufp);
+ continue;
+ }
+
+ if (proc_has_nbmand_on_vp(vp, prp->p_pid, llp)) {
+ /* A nbmand found so we're done. */
+ use_flag |= F_NBM;
+ UF_EXIT(ufp);
+ break;
+ }
+ UF_EXIT(ufp);
+ }
+ if (llp)
+ flk_free_locklist(llp);
+
+ mutex_exit(&fip->fi_lock);
+
+ /*
+ * If nbmand usage tracking is desired and no nbmand was
+ * found for this process, then no need to do further
+ * usage tracking for this process.
+ */
+ if (nbmandonly && (!(use_flag & F_NBM))) {
+ /*
+ * grab the process lock again, clear the SPRLOCK
+ * flag, release the process, and continue.
+ */
+ mutex_enter(&prp->p_lock);
+ sprunlock(prp);
+ continue;
+ }
+
+ /*
+ * All other types of usage.
+ * For the next few checks we need to hold p_lock.
+ */
+ mutex_enter(&prp->p_lock);
+ up = PTOU(prp);
+ if (fvp_isdev) {
+ /*
+ * if the target file (fvp) is a device,
+ * then check if it matches the process's tty
+ *
+ * we grab s_lock to protect ourselves against
+ * freectty() freeing the vnode out from under us.
+ */
+ sp = prp->p_sessp;
+ mutex_enter(&sp->s_lock);
+ vp = prp->p_sessp->s_vp;
+ if (vp != NULL) {
+ if (fvp->v_rdev == vp->v_rdev)
+ use_flag |= F_TTY;
+
+ if (dip_usage &&
+ (VTOCS(fvp)->s_dip == VTOCS(vp)->s_dip))
+ use_flag |= F_TTY;
+ }
+ mutex_exit(&sp->s_lock);
+ } else {
+ /* check the process's current working directory */
+ if (up->u_cdir &&
+ (VN_CMP(fvp, up->u_cdir) ||
+ (contained && (up->u_cdir->v_vfsp == cvfsp))))
+ use_flag |= F_CDIR;
+
+ /* check the process's root directory */
+ if (up->u_rdir &&
+ (VN_CMP(fvp, up->u_rdir) ||
+ (contained && (up->u_rdir->v_vfsp == cvfsp))))
+ use_flag |= F_RDIR;
+
+ /* check the program text vnode */
+ if (prp->p_exec &&
+ (VN_CMP(fvp, prp->p_exec) ||
+ (contained && (prp->p_exec->v_vfsp == cvfsp))))
+ use_flag |= F_TEXT;
+ }
+
+ /* Now we can drop p_lock again */
+ mutex_exit(&prp->p_lock);
+
+ /*
+ * now we want to walk a process's memory mappings.
+ * to do this we need to grab the prp->p_as lock. (you
+ * can't hold p_lock when grabbing the prp->p_as lock.)
+ */
+ if (prp->p_as != &kas) {
+ struct seg *seg;
+ struct as *as = prp->p_as;
+
+ AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
+ for (seg = AS_SEGFIRST(as); seg;
+ seg = AS_SEGNEXT(as, seg)) {
+ /*
+ * if we can't get a backing vnode for this
+ * segment then skip it
+ */
+ vp = NULL;
+ if ((SEGOP_GETVP(seg, seg->s_base, &vp)) ||
+ (vp == NULL))
+ continue;
+
+ /*
+ * if the target file (fvp) is not a device
+ * and corresponds to the root of a filesystem
+ * (cvfsp), then check if it contains the
+ * vnode backing this segment (vp).
+ */
+ if (contained && (vp->v_vfsp == cvfsp)) {
+ use_flag |= F_MAP;
+ break;
+ }
+
+ /*
+ * if the target file (fvp) is not a device,
+ * check if it matches the vnode backing
+ * this segment (vp).
+ */
+ if (!fvp_isdev && VN_CMP(fvp, vp)) {
+ use_flag |= F_MAP;
+ break;
+ }
+
+ /*
+ * if the target file (fvp) isn't a device,
+ * or the vnode backing this segment (vp)
+ * isn't a device, then continue.
+ */
+ if (!fvp_isdev ||
+ !vn_matchops(vp, spec_getvnodeops()))
+ continue;
+
+ /*
+ * check if the vnode backing this segment
+ * (vp) maps to the same device minor node
+ * as the target device (fvp)
+ */
+ if (fvp->v_rdev == vp->v_rdev) {
+ use_flag |= F_MAP;
+ break;
+ }
+
+ /*
+ * if we're checking for device instance
+ * usage, then check if the vnode backing
+ * this segment (vp) maps to the same device
+ * instance as the target device (fvp).
+ */
+ if (dip_usage &&
+ (VTOCS(fvp)->s_dip == VTOCS(vp)->s_dip)) {
+ use_flag |= F_MAP;
+ break;
+ }
+ }
+ AS_LOCK_EXIT(as, &as->a_lock);
+ }
+
+ if (use_flag) {
+ ASSERT(pcnt < fu_data->fud_user_max);
+ fu_data->fud_user[pcnt].fu_flags = use_flag;
+ fu_data->fud_user[pcnt].fu_pid = pid;
+ fu_data->fud_user[pcnt].fu_uid = uid;
+ pcnt++;
+ }
+
+ /*
+ * grab the process lock again, clear the SPRLOCK
+ * flag, release the process, and continue.
+ */
+ mutex_enter(&prp->p_lock);
+ sprunlock(prp);
+ }
+
+ kmem_free(pidlist, v_proc * sizeof (pid_t));
+
+ fu_data->fud_user_count = pcnt;
+ return (fu_data);
+}
+
+typedef struct dofkusers_arg {
+ vnode_t *fvp;
+ int flags;
+ int *error;
+ fu_data_t *fu_data;
+} dofkusers_arg_t;
+
+static int
+dofkusers_walker(const ldi_usage_t *ldi_usage, void *arg)
+{
+ dofkusers_arg_t *dofkusers_arg = (dofkusers_arg_t *)arg;
+
+ vnode_t *fvp = dofkusers_arg->fvp;
+ int flags = dofkusers_arg->flags;
+ int *error = dofkusers_arg->error;
+ fu_data_t *fu_data = dofkusers_arg->fu_data;
+
+ modid_t modid;
+ minor_t minor;
+ int instance;
+ int dip_usage = (flags & F_DEVINFO);
+
+ ASSERT(*error == 0);
+ ASSERT(vn_matchops(fvp, spec_getvnodeops()));
+
+ /*
+ * check if the dev_t of the target device matches the dev_t
+ * of the device we're trying to find usage info for.
+ */
+ if (fvp->v_rdev != ldi_usage->tgt_devt) {
+
+ /*
+ * if the dev_ts don't match and we're not trying
+ * to find usage information for device instances
+ * then return
+ */
+ if (!dip_usage)
+ return (LDI_USAGE_CONTINUE);
+
+
+ /*
+ * we're trying to find usage information for a
+ * device instance instead of just a minor node.
+ *
+ * check if the dip for the target device matches the
+ * dip of the device we're trying to find usage info for.
+ */
+ if (VTOCS(fvp)->s_dip != ldi_usage->tgt_dip)
+ return (LDI_USAGE_CONTINUE);
+ }
+
+ if (fu_data->fud_user_count >= fu_data->fud_user_max) {
+ *error = E2BIG;
+ return (LDI_USAGE_TERMINATE);
+ }
+
+ /* get the device vnode user information */
+ modid = ldi_usage->src_modid;
+ ASSERT(modid != -1);
+
+ minor = instance = -1;
+ if (ldi_usage->src_dip != NULL) {
+ instance = DEVI(ldi_usage->src_dip)->devi_instance;
+ }
+ if (ldi_usage->src_devt != DDI_DEV_T_NONE) {
+ minor = getminor(ldi_usage->src_devt);
+ }
+
+ /* set the device vnode user information */
+ fu_data->fud_user[fu_data->fud_user_count].fu_flags = F_KERNEL;
+ fu_data->fud_user[fu_data->fud_user_count].fu_modid = modid;
+ fu_data->fud_user[fu_data->fud_user_count].fu_instance = instance;
+ fu_data->fud_user[fu_data->fud_user_count].fu_minor = minor;
+
+ fu_data->fud_user_count++;
+
+ return (LDI_USAGE_CONTINUE);
+}
+
+int
+f_user_cmp(const void *arg1, const void *arg2)
+{
+ f_user_t *f_user1 = (f_user_t *)arg1;
+ f_user_t *f_user2 = (f_user_t *)arg2;
+
+ /*
+ * we should only be called for f_user_t entries that represent
+ * a kernel file consumer
+ */
+ ASSERT(f_user1->fu_flags & F_KERNEL);
+ ASSERT(f_user2->fu_flags & F_KERNEL);
+
+ if (f_user1->fu_modid != f_user2->fu_modid)
+ return ((f_user1->fu_modid < f_user2->fu_modid) ? -1 : 1);
+
+ if (f_user1->fu_instance != f_user2->fu_instance)
+ return ((f_user1->fu_instance < f_user2->fu_instance) ? -1 : 1);
+
+ if (f_user1->fu_minor != f_user2->fu_minor)
+ return ((f_user1->fu_minor < f_user2->fu_minor) ? -1 : 1);
+
+ return (0);
+}
+
+static fu_data_t *
+dofkusers(vnode_t *fvp, int flags, int *error)
+{
+ dofkusers_arg_t dofkusers_arg;
+ fu_data_t *fu_data;
+ int user_max, i;
+
+ /*
+ * we only keep track of kernel device consumers, so if the
+ * target vnode isn't a device then there's nothing to do here
+ */
+ if (!vn_matchops(fvp, spec_getvnodeops()))
+ return (NULL);
+
+ /* allocate the data structure to return our results in */
+ user_max = ldi_usage_count();
+ fu_data = kmem_alloc(fu_data_size(user_max), KM_SLEEP);
+ fu_data->fud_user_max = user_max;
+ fu_data->fud_user_count = 0;
+
+ /* invoke the callback to collect device usage information */
+ dofkusers_arg.fvp = fvp;
+ dofkusers_arg.flags = flags;
+ dofkusers_arg.error = error;
+ dofkusers_arg.fu_data = fu_data;
+ ldi_usage_walker(&dofkusers_arg, dofkusers_walker);
+
+ /* check for errors */
+ if (*error != 0)
+ return (fu_data);
+
+ /* if there aren't any file consumers then return */
+ if (fu_data->fud_user_count == 0)
+ return (fu_data);
+
+ /*
+ * since we ignore the spec_type of the target we're trying to
+ * access, it's possible that we could have duplicate entries in
+ * the list of consumers.
+ *
+ * we don't want to check for duplicates in the callback because
+ * we're holding locks in the ldi when the callback is invoked.
+ *
+ * so here we need to go through the array of file consumers
+ * and remove duplicate entries.
+ */
+
+ /* first sort the array of file consumers */
+ qsort((caddr_t)fu_data->fud_user, fu_data->fud_user_count,
+ sizeof (f_user_t), f_user_cmp);
+
+ /* then remove any duplicate entries */
+ i = 1;
+ while (i < fu_data->fud_user_count) {
+
+ if (f_user_cmp(&fu_data->fud_user[i],
+ &fu_data->fud_user[i - 1]) != 0) {
+ /*
+ * the current element is unique, move onto
+ * the next one
+ */
+ i++;
+ continue;
+ }
+
+ /*
+ * this entry is a duplicate so if it's not the last
+ * entry in the array then remove it.
+ */
+ fu_data->fud_user_count--;
+ if (i == fu_data->fud_user_count)
+ break;
+
+ bcopy(&fu_data->fud_user[i + 1], &fu_data->fud_user[i],
+ sizeof (f_user_t) * (fu_data->fud_user_count - i));
+ }
+
+ return (fu_data);
+}
+
+/*
+ * Determine the ways in which processes and the kernel are using a named
+ * file or mounted file system (path). Normally return 0. In case of an
+ * error, the appropriate errno will be returned.
+ *
+ * Upon success, uts_fusers will also copyout the file usage information
+ * in the form of an array of f_user_t's that are contained within an
+ * fu_data_t pointed to by userbp.
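+ *
+ * The result buffer is a fu_data_t header, whose fud_user_max
+ * field the caller initializes to its capacity, followed by up
+ * to that many f_user_t entries. A hedged caller sketch; NUSERS
+ * and the raw utssys() invocation are illustrative only, with
+ * fuser(1M) being the real consumer:
+ *
+ *	struct {
+ *		fu_data_t hdr;
+ *		f_user_t slot[NUSERS];
+ *	} ubuf;
+ *	ubuf.hdr.fud_user_max = NUSERS;
+ *	if (utssys(path, flags, UTS_FUSERS, &ubuf) == 0)
+ *		nusers = ubuf.hdr.fud_user_count;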
+ */
+static int
+uts_fusers(char *path, int flags, intptr_t userbp)
+{
+ fu_data_t *fu_data = NULL, *fuk_data = NULL;
+ fu_data_t fu_header;
+ vnode_t *fvp = NULL;
+ size_t bcount;
+ int error = 0;
+ int total_max, total_out;
+ int contained = (flags & F_CONTAINED);
+ int dip_usage = (flags & F_DEVINFO);
+ int fvp_isdev;
+
+
+ /* figure out how many f_user_t's we can safely copy out */
+ if (copyin((const void *)userbp, &total_max, sizeof (total_max)))
+ return (EFAULT);
+
+ /*
+ * check if we only want a count of how many kernel device
+ * consumers exist
+ */
+ if (flags & F_KINFO_COUNT) {
+ fu_header.fud_user_max = total_max;
+ fu_header.fud_user_count = ldi_usage_count();
+ bcount = fu_data_size(0);
+ if (copyout(&fu_header, (void *)userbp, bcount))
+ return (EFAULT);
+ return (0);
+ }
+
+ /* get the vnode for the file we want to look up usage for */
+ error = lookupname(path, UIO_USERSPACE, FOLLOW, NULLVPP, &fvp);
+ if (error != 0)
+ return (error);
+ ASSERT(fvp);
+ fvp_isdev = vn_matchops(fvp, spec_getvnodeops());
+
+ /*
+ * if we want to report usage for all files contained within a
+ * file system then the target file better correspond to the
+ * root node of a mounted file system, or the root of a zone.
+ */
+ if (contained && !(fvp->v_flag & VROOT) &&
+ fvp != curproc->p_zone->zone_rootvp) {
+ error = EINVAL;
+ goto out;
+ }
+
+ /*
+ * if we want to report usage for all files contained within a
+ * file system then the target file better not be a device.
+ */
+ if (contained && fvp_isdev) {
+ error = EINVAL;
+ goto out;
+ }
+
+ /*
+ * if we want to report usage for a device instance then the
+ * target file better correspond to a device
+ */
+ if (dip_usage && !fvp_isdev) {
+ error = EINVAL;
+ goto out;
+ }
+
+ /*
+ * if the target vnode isn't a device and it has a reference count
+ * of one then no one else is going to have it open so we don't
+ * have any work to do.
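+ * (A v_count of one means the hold from our own lookupname()
+ * above is the only reference, so no process can have the file
+ * open, mapped, or held as a working directory.)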
+ */ + if (!fvp_isdev && (fvp->v_count == 1)) { + goto out; + } + + /* look up usage information for this vnode */ + fu_data = dofusers(fvp, flags); + fuk_data = dofkusers(fvp, flags, &error); + if (error != 0) + goto out; + + /* get a count of the number of f_user_t's we need to copy out */ + total_out = 0; + if (fu_data) + total_out += fu_data->fud_user_count; + if (fuk_data) + total_out += fuk_data->fud_user_count; + + /* check if there is enough space to copyout all results */ + if (total_out > total_max) { + error = E2BIG; + goto out; + } + + /* copyout file usage info counts */ + fu_header.fud_user_max = total_max; + fu_header.fud_user_count = total_out; + bcount = fu_data_size(0); + if (copyout(&fu_header, (void *)userbp, bcount)) { + error = EFAULT; + goto out; + } + + /* copyout userland process file usage info */ + if ((fu_data != NULL) && (fu_data->fud_user_count > 0)) { + userbp += bcount; + bcount = fu_data->fud_user_count * sizeof (f_user_t); + if (copyout(fu_data->fud_user, (void *)userbp, bcount)) { + error = EFAULT; + goto out; + } + } + + /* copyout kernel file usage info */ + if ((fuk_data != NULL) && (fuk_data->fud_user_count > 0)) { + userbp += bcount; + bcount = fuk_data->fud_user_count * sizeof (f_user_t); + if (copyout(fuk_data->fud_user, (void *)userbp, bcount)) { + error = EFAULT; + goto out; + } + } + +out: + /* release the vnode that we were looking up usage for */ + VN_RELE(fvp); + + /* release any allocated memory */ + if (fu_data) + kmem_free(fu_data, fu_data_size(fu_data->fud_user_max)); + if (fuk_data) + kmem_free(fuk_data, fu_data_size(fuk_data->fud_user_max)); + + return (error); +} diff --git a/usr/src/uts/common/syscall/yield.c b/usr/src/uts/common/syscall/yield.c new file mode 100644 index 0000000000..45133df129 --- /dev/null +++ b/usr/src/uts/common/syscall/yield.c @@ -0,0 +1,61 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 1996-2002 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/thread.h> +#include <sys/disp.h> +#include <sys/debug.h> +#include <sys/cpuvar.h> + + +/* + * The calling LWP is preempted in favor of some other LWP. + */ +int +yield() +{ + kthread_t *t = curthread; + klwp_t *lwp = ttolwp(t); + + thread_lock(t); + lwp->lwp_ru.nvcsw++; + THREAD_TRANSITION(t); + CL_YIELD(t); /* does setbackdq */ + thread_unlock_nopreempt(t); + swtch(); /* clears cpu_runrun and cpu_kprunrun */ + + return (0); +} |